
    ޺i5,                         % S r SSKrSSKJrJrJr  SrSrSrSr	Sr
S	rS
rS\-   r\\S'   S\S\S\4S jrS\S\S-  4S jrS\S\S-  4S jrS\S\S-  4S jrS\S\4S jrSS\S\S\4S jjrg)a  Stage 1a+: UTF-16/UTF-32 detection for data without BOM.

This stage runs after BOM detection but before binary detection.
UTF-16 and UTF-32 encoded text contains characteristic null-byte patterns
that would otherwise cause binary detection to reject the data.

Note: ``from __future__ import annotations`` is intentionally omitted because
this module is compiled with mypyc, which does not support PEP 563 string
annotations.
    N)ASCII_TEXT_BYTESDETERMINISTIC_CONFIDENCEDetectionResulti      
   gQ?      ?gffffff?g333333?    _NULL_SEPARATOR_ALLOWEDdata	null_fracreturnc                 N    U[         :  a  gU R                  S[        5      (       + $ )uc  Return True if the data looks like ASCII with null byte separators.

:param data: The raw byte sample to examine.
:param null_frac: The positional null fraction for this UTF-16 candidate
    (i.e. fraction of null bytes in even positions for BE, or odd positions
    for LE) — not the total null fraction across all bytes.

Checks two conditions:
1. The positional null fraction is below ``_NULL_SEPARATOR_MAX_FRACTION``
2. Every non-null byte is printable ASCII or common whitespace

When both conditions are met, the nulls are likely field separators
(e.g. ``find -print0``), not UTF-16 encoding artifacts.
FN)_NULL_SEPARATOR_MAX_FRACTION	translater
   )r   r   s     W/var/www/piano.thomer.com/venv/lib/python3.13/site-packages/chardet/pipeline/utf1632.py_is_null_separator_patternr   6   s%     00~~d$;<<<    c                 r    U S[          n[        U5      [        :  a  g[        U5      nUb  U$ [	        U5      $ )a   Detect UTF-32 or UTF-16 encoding from null-byte patterns.

UTF-32 is checked before UTF-16 since UTF-32 patterns are more specific.

:param data: The raw byte data to examine.
:returns: A :class:`DetectionResult` if a strong pattern is found, or ``None``.
N)_SAMPLE_SIZElen_MIN_BYTES_UTF16_check_utf32_check_utf16)r   sampleresults      r   detect_utf1632_patternsr   J   sD     -< F
6{%% &!F r   c           	        ^  [        T 5      [        T 5      S-  -
  nU[        :  a  gT SU m US-  n[        U 4S j[        S[        T 5      S5       5       5      n[        U 4S j[        S[        T 5      S5       5       5      nX2:X  a:  XB-  S:  a2   T R	                  S5      n[        U5      (       a  [        S[        SS9$  [        U 4S	 j[        S
[        T 5      S5       5       5      n[        U 4S j[        S[        T 5      S5       5       5      nXb:X  a;  Xr-  S:  a3   T R	                  S5      n[        U5      (       a  [        S[        SS9$  gg! [         a     Nf = f! [         a     gf = f)av  Check for UTF-32 encoding based on 4-byte unit structure.

For valid Unicode (U+0000 to U+10FFFF = 0x0010FFFF):
- UTF-32-BE: the first byte of each 4-byte unit is always 0x00
- UTF-32-LE: the last byte of each 4-byte unit is always 0x00

For BMP characters (U+0000 to U+FFFF), additionally:
- UTF-32-BE: the second byte is also 0x00
- UTF-32-LE: the third byte is also 0x00
   Nc              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr      N .0ir   s     r   	<genexpr>_check_utf32.<locals>.<genexpr>t        J#9aT!W\#9   	r   c              3   D   >#    U  H  nTUS -      S:X  d  M  S v   M     g7f)r!   r   Nr"   r#   s     r   r&   r'   v   s$     O$:qd1q5kQ>N$:s    	 r   z	utf-32-beencoding
confidencelanguagec              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr    r"   r#   s     r   r&   r'      s     I"8QDGqLqq"8r)      c              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr    r"   r#   s     r   r&   r'      r(   r)      z	utf-32-le)	r   _MIN_BYTES_UTF32sumrangedecode_looks_like_textr   r   UnicodeDecodeError)r   trimmed_len	num_unitsbe_first_nullbe_second_nulltextle_last_nullle_third_nulls   `       r   r   r   `   sv    d)s4y1}-K%%Dq I J5CIq#9JJMOE!SY$:OON!n&@3&F		;;{+D%%&(7!  & I%3t9a"8IILJ5CIq#9JJM ]%>%D		;;{+D%%&(7!  & ) " 		" " 		s$   /E "/E% 
E"!E"%
E21E2c                 V  ^  [        [        T 5      [        5      nXS-  -  nU[        :  a  gUS-  n[	        U 4S j[        SUS5       5       5      n[	        U 4S j[        SUS5       5       5      nX2-  nXB-  n/ nU[        :  a'  [        T SU U5      (       d  UR                  SU45        U[        :  a'  [        T SU U5      (       d  UR                  SU45        U(       d  g[        U5      S:X  a>  US   S   n T SU R                  U5      n	[        U	5      (       a  [        U[        SS	9$  gSn
S
nU H0  u  p T SU R                  U5      n	[        U	5      nX:  d  M,  UnUn
M2     U
b  U[        :  a  [        U
[        SS	9$ g! [         a     gf = f! [         a     Mn  f = f)a  Check for UTF-16 via null-byte patterns in alternating positions.

UTF-16 encodes each BMP character as two bytes.  For characters whose
code-point high byte is 0x00 (Latin, digits, basic punctuation, many
control structures), one of the two bytes in each unit will be a null.
Even for non-Latin scripts (Arabic, CJK, Cyrillic, etc.) a significant
fraction of code units still contain at least one null byte.

Non-UTF-16 single-byte encodings never contain null bytes, so even a
small null-byte fraction in alternating positions is a strong signal.

When both endiannesses show null-byte patterns (e.g., Latin text where
every other byte is null), we disambiguate by decoding both ways and
comparing text-quality scores.
r2   Nc              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr    r"   r#   s     r   r&   _check_utf16.<locals>.<genexpr>        K#:ad1gl#:r)   r   c              3   >   >#    U  H  nTU   S :X  d  M  Sv   M     g7fr    r"   r#   s     r   r&   rB      rC   r)   r!   z	utf-16-lez	utf-16-ber+         )minr   r   r   r4   r5   _UTF16_MIN_NULL_FRACTIONr   appendr6   r7   r   r   r8   _text_quality_MIN_TEXT_QUALITY)r   
sample_lenr:   be_null_countle_null_countbe_fracle_frac
candidatesr,   r=   best_encodingbest_quality_qualitys   `             r   r   r      s     SY-Jq. J$$aI K5J#:KKMK5J#:KKM'G'G*,J**3M[j74 4 	;01**3M[j74 4 	;01 :!a=#		$++H5D%%&%7!  &  !%ML!	$++H5D  %!"L$M "  \5F%F"/
 	
 5 " 		 " 		s$   >2F
 ?F

FF
F('F(r=   c                 j    U (       d  gU SS n[        S U 5       5      nU[        U5      -  [        :  $ )z9Quick check: is decoded text mostly printable characters.FN  c              3   ^   #    U  H#  oR                  5       (       d  US ;   d  M  Sv   M%     g7f)
	r!   N)isprintable)r$   cs     r   r&   #_looks_like_text.<locals>.<genexpr>   s      Jv!AMAAvs   -	-)r4   r   _MIN_PRINTABLE_FRACTION)r=   r   	printables      r   r7   r7      s8    $3ZFJvJJIs6{"%<<<r   limitc                    U SU n[        U5      nUS:X  a  gSnSnSnSnSnU Hr  n	[        R                  " U	5      n
U
S   S:X  a  US-  n[        U	5      S:  a  US-  nM=  M?  U
S   S:X  a  US-  nMO  U
S:X  d  U	S	;   a  US-  nMb  U
S   S
:X  d  Mm  US-  nMt     Xs-  S:  a  gXS-  S:  a  gXC-  nXU-  S-  -  nUS:  a  US:  a  US-  nU$ )u  Score how much *text* looks like real human-readable content.

Returns a score in the range [-1.0, ~1.6).  Higher values indicate
more natural text.  The practical maximum is 1.5 for all-ASCII-letter
input (1.6 approaches as sample size grows with all ASCII letters plus
whitespace).  A score of -1.0 means the content is almost certainly not
valid text (too many control characters or combining marks).

Scoring factors:

* Base score: ratio of Unicode letters (category ``L*``) to sample length.
* ASCII bonus: additional 0.5x weight for ASCII letters.  This is the
  primary signal for disambiguating endianness — correct decoding of
  Latin-heavy text produces ASCII letters, wrong decoding produces CJK.
* Space bonus: +0.1 when the sample contains at least one whitespace
  character and is longer than 20 characters.
* Rejection: returns -1.0 if >10% control characters or >20% combining
  marks (category ``M*``).
Nr   rE   Lr!      MZsrX   Cg?g?r      )r   unicodedatacategoryord)r=   r^   r   nlettersmarksspacescontrolsascii_lettersrZ   catscores               r   rI   rI      s   ( &5\FFAAvGEFHM""1%q6S=qLG1v|" Vs]QJED[AMaKFVs]MH  |cy3KE	a3&&E2v&1*Lr   )rV   )__doc__rf   chardet.pipeliner   r   r   r   r3   r   rG   rJ   r\   r   r
   bytes__annotations__floatboolr   r   r   r   strr7   intrI   r"   r   r   <module>ry      s   	  X X             $  ")+;!;  ;=U =u = =( %  Od,B  ,5u 54!7 5pQu Q4!7 Qh=3 =4 =9 9C 9% 9r   