
    ޺ic                        % S r SSKJr  SSKrSSKJr  SSKJr  SSKJ	r	J
r
JrJr  SSKJrJrJrJr  SSKJr  SS	KJr  SS
KJr  SSKJr  SSKJr  SSKJr  SSKJr  SSK J!r!J"r"J#r#  SSK$J%r%  SSK&J'r'  SSK(J)r)  SSK*J+r+J,r,  \" S\SS9r-\" SSSS9r.Sr/\0" 1 Sk5      r1S\2S'   \0" 1 Sk5      r3S\2S'   \0" 1 Sk5      r4S\2S'   \0" 1 Sk5      r5S\2S '   \3\4\5S!.r6S"\2S#'   \0" 1 S$k5      r7S\2S%'           S:S& jr8S;S' jr9S(r:S)r;S*r<S+r=S,r>        S<S- jr?          S=S. jr@      S>S/ jrA      S>S0 jrBS1rCS?S2 jrD      S>S3 jrE      S>S4 jrF\4SSS5S6S7.               S@S8 jjjrG\4SSS5S6S7.               S@S9 jjjrHg)Au@   Pipeline orchestrator — runs all detection stages in sequence.    )annotationsN)DEFAULT_MAX_BYTES)EncodingEra)BigramProfilehas_model_variantsinfer_languagescore_best_language)DETERMINISTIC_CONFIDENCE
HIGH_BYTESDetectionResultPipelineContext)detect_ascii)	is_binary)
detect_bom)resolve_confusion_groups)detect_escape_encoding)detect_markup_charset)score_candidates)compute_lead_byte_diversitycompute_multibyte_byte_coveragecompute_structural_score)detect_utf8)detect_utf1632_patterns)filter_by_validity)EncodingInfoget_candidatesencoding
confidencelanguage        g333333?>   cp1252	iso8859-1
iso8859-15frozenset[str]_COMMON_LATIN_ENCODINGS>.                                                                                                                                             zfrozenset[int]_ISO_8859_10_DISTINGUISHING>   r'   r(   r*   r+   r,   r-   r/   r0   r1   r3      r4   r5   r6   r7   r8      r:   r;   r<   r=   r>   r?   r@   rA      rI         rR      _ISO_8859_14_DISTINGUISHING>   rX      rY   rZ      r[   _WINDOWS_1254_DISTINGUISHING)z
iso8859-10z
iso8859-14cp1254zdict[str, frozenset[int]]_DEMOTION_CANDIDATES>                           r'   r(   r+   r8   _KOI8_T_DISTINGUISHINGc                t    X;  a(  [         R                  " U SU < S3[        SS9  [        /$ [	        U SSS9/$ )zReturn a low-confidence result for *encoding*, or ``encoding=None`` if filtered out.

``stacklevel=5`` targets the public caller:
detect() -> run_pipeline() -> _run_pipeline_core() -> _make_fallback_or_none().
 zL is excluded by include_encodings/exclude_encodings; returning encoding=None   )
stacklevelg?Nr   )warningswarnUserWarning_NONE_RESULTr   )r   allowed
param_names      \/var/www/piano.thomer.com/venv/lib/python3.13/site-packages/chardet/pipeline/orchestrator.py_make_fallback_or_nonerv      sO     l!H< (K L		
 ~X$NOO    c                j   ^ [         R                  U 5      mTc  g[        U4S jU 5       5      (       + $ )aa  Return True if encoding is a demotion candidate with no distinguishing bytes.

Checks whether any non-ASCII byte in *data* falls in the set of byte
values that decode differently under the given encoding vs iso-8859-1.
If none do, the data is equally valid under both encodings and there is
no byte-level evidence for preferring the candidate encoding.
Fc              3  <   >#    U  H  oS :  d  M
  UT;   v   M     g7f   N ).0bdistinguishings     ru   	<genexpr>!_should_demote.<locals>.<genexpr>   s     A1D&1&s   	)ra   getany)r   datar   s     @ru   _should_demoter      s2     *--h7NAAAAArw   g?   gffffff?      c                D   / nU GH  nUR                   (       a  [        XU5      nXRR                  UR                  '   U[        :  a  ME  UR
                  c0  [        U 5      [        U R                  S[        5      5      -
  Ul        UR
                  [        :  a  M  [        XX"R
                  S9nXbR                  UR                  '   U[        :  a  M  UR
                  [        :  a  [        XU5      nU[        :  a  M  UR!                  U5        GM     [#        U5      $ )a  Eliminate CJK multi-byte candidates that lack genuine multi-byte structure.

Four checks are applied in order to each multi-byte candidate:

1. **Structural pair ratio** (valid_pairs / lead_bytes) must be
   >= ``_CJK_MIN_MB_RATIO``.  Catches files with many orphan lead bytes.

2. **Minimum non-ASCII byte count**: the data must contain at least
   ``_CJK_MIN_NON_ASCII`` bytes > 0x7F.  Tiny files with 1-5 high bytes
   can accidentally form perfect pairs and score 1.0 structurally.

3. **Byte coverage** (non-ASCII bytes in valid multi-byte sequences /
   total non-ASCII bytes) must be >= ``_CJK_MIN_BYTE_COVERAGE``.  Latin
   text has many high bytes that are NOT consumed by multi-byte pairs;
   genuine CJK text has nearly all high bytes accounted for.

4. **Lead byte diversity**: the number of distinct lead byte values in
   valid pairs must be >= ``_CJK_MIN_LEAD_DIVERSITY``.  Genuine CJK text
   draws from a wide repertoire of lead bytes; European false positives
   cluster in a narrow band (e.g. 0xC0-0xDF for accented Latin).

Returns the filtered candidate list.  Structural scores are cached in
``ctx.mb_scores`` for reuse in Stage 2b.
N)non_ascii_count)is_multibyter   	mb_scoresname_CJK_MIN_MB_RATIOr   len	translater   _CJK_MIN_NON_ASCIIr   mb_coverage_CJK_MIN_BYTE_COVERAGE_CJK_DIVERSITY_MIN_NON_ASCIIr   _CJK_MIN_LEAD_DIVERSITYappendtuple)r   valid_candidatesctxgatedencmb_scorebyte_coveragelead_diversitys           ru   _gate_cjk_candidatesr      s    : !#E/3?H&.MM#((#++""*&)$i#dnnT:6V2W&W#""%77;30C0CM )6OOCHH%55""&BB!<T!L!$;;S)  * <rw   c           	     L  ^ U Vs0 s H"  oDR                   (       d  M  UR                  U_M$     snm[        U4S jU 5       5      n[        S U 5       5      n[        [	        U / UQUQ75      5      n/ nU H  n	U	R
                  (       a&  UR                  R                  U	R
                  S5      OSn
U
S:  a@  UR                  [        U	R
                  U	R                  SU
-   -  U	R                  S95        M  UR                  U	5        M     UR                  S SS	9  U$ s  snf )
a]  Score structurally-valid CJK candidates using statistical bigrams.

When multiple CJK encodings score equally high structurally, statistical
scoring differentiates them (e.g. euc-jp vs big5 for Japanese data).
Single-byte candidates are also scored and included so that the caller
can compare CJK vs single-byte confidence.

Multi-byte candidates with high byte coverage (>= 0.95) receive a
confidence boost proportional to coverage.  When nearly all non-ASCII
bytes form valid multi-byte pairs, the structural evidence is strong
and should increase the candidate's ranking relative to single-byte
alternatives whose bigram models may score higher on small samples.

Note: boosted confidence values may exceed 1.0 and are used only for
relative ranking among candidates.  ``run_pipeline`` clamps all
confidence values to [0.0, 1.0] before returning to callers.
c              3  B   >#    U  H  u  pUT;   d  M  TU   v   M     g 7fNr|   )r}   r   _sc
enc_lookups      ru   r   /_score_structural_candidates.<locals>.<genexpr>G  s&      *;YTtz?Q
4*;s   c              3  J   #    U  H  oR                   (       a  M  Uv   M     g 7fr   )r   )r}   es     ru   r   r   J  s     J#3a>>#3s   #	#r!   gffffff?   r   c                    U R                   $ r   )r   xs    ru   <lambda>._score_structural_candidates.<locals>.<lambda>[  s    q||rw   Tkeyreverse)r   r   r   listr   r   r   r   r   r   r   r    sort)r   structural_scoresr   r   r   valid_mbsingle_byteresultsboostedrcoverager   s              @ru   _score_structural_candidatesr   -  s   0 ,++a~~		++J  *; H J#3JJK#D*CH*C{*CDEG &(G;<::3??&&qzz373tNNZZ ||q8|<ZZ NN1  LL+TL:N1+s
   D!D!c                   [        U5      S:  a  US   R                  b  [        US   R                  U 5      (       a  US   R                  nUSS  Hl  nUR                  [        ;   d  M  U Vs/ s H  oDR                  U:w  d  M  XCLd  M  UPM     nnU Vs/ s H  oDR                  U:X  d  M  UPM     nnU/UQUQs  $    U$ s  snf s  snf )a  Demote niche Latin encodings when no distinguishing bytes are present.

Some bigram models (e.g. iso-8859-10, iso-8859-14, windows-1254) can win
on data that contains only bytes shared with common Western Latin
encodings.  When there is no byte-level evidence for the winning
encoding, promote the first common Western Latin candidate to the top and
push the demoted encoding to last.
r   r   N)r   r   r   r&   )r   r   demoted_encodingr   r   othersdemoted_entriess          ru   _demote_niche_latinr   _  s     	GqAJ+71:..55"1:..Azz44&&!**8H*HAQZAw   /6"XgGW9W1g"X5F5_55  N #Ys   /CCCC	/C	c                   U(       a  US   R                   S:w  a  U$ [        S [        U5       5       S5      nUc  U$ [        S U  5       5      (       a/  X   n[        U5       VVs/ s H  u  pEXB:w  d  M  UPM     nnnU/UQ$ U$ s  snnf )ag  Promote KOI8-T over KOI8-R when Tajik-specific bytes are present.

KOI8-T and KOI8-R share the entire 0xC0-0xFF Cyrillic letter block,
making statistical discrimination difficult.  However, KOI8-T maps 12
bytes in 0x80-0xBF to Tajik-specific Cyrillic letters where KOI8-R has
box-drawing characters.  If any of these bytes appear, KOI8-T is the
better match.
r   zkoi8-rc              3  N   #    U  H  u  pUR                   S :X  d  M  Uv   M     g7f)zkoi8-tN)r   )r}   ir   s      ru   r   !_promote_koi8t.<locals>.<genexpr>  s!     Q$6DA!**:Paa$6s   %	%Nc              3  B   #    U  H  oS :  d  M
  U[         ;   v   M     g7frz   )rj   )r}   r~   s     ru   r   r     s     
A1D&1&&s   	)r   next	enumerater   )r   r   	koi8t_idxkoi8t_resultr   r   r   s          ru   _promote_koi8tr   {  s     gaj))X5QIg$6QSWXI

A
AAA) )' 2E 2an! 2E&v&&N Fs   'B6Bi   c                |    US:X  a  U $  U R                  USS9R                  SSS9$ ! [        [        4 a     gf = f)a<  Decode data from encoding and re-encode as UTF-8 for language scoring.

Returns None if the encoding is unknown. For UTF-8, returns data as-is.
Uses ``errors="ignore"`` because the data already passed byte-validity
filtering for the detected encoding; any residual invalid bytes are
irrelevant for language scoring.
utf-8ignore)errorssurrogatepassN)decodeencodeLookupError	TypeError)r   r   s     ru   _to_utf8r     sY     7{{8H{5<<O = 
 	
 # s   ( ;;c           	     b   / nSnSnU GH!  nUR                   c  UR                  b  [        UR                  5      nUcE  U (       a>  [        UR                  5      (       a$  Uc  [	        U 5      n[        XR                  US9u  pvUc^  U (       aW  [        S5      (       aG  [        XR                  5      nU(       a+  Ub  UR                  S:w  a  [	        U5      n[        USUS9u  pvUb1  UR                  [        UR                  UR                  US95        GM  UR                  U5        GM$     U$ )a  Fill in language for results missing it.

Tier 1: single-language encodings via hardcoded map (instant).
Tier 2: multi-language encodings via statistical bigram scoring (lazy).
Tier 3: decode to UTF-8, score against UTF-8 language models (universal fallback).
N)profiler   r   )
r    r   r   r   r   r	   r   r   r   r   )	r   r   filledr   utf8_profileresultlang_	utf8_datas	            ru   _fill_languager     s    %'F$(G)-L??"v'B!&//2D|);FOO)L)L?+D1G-dOOWU|);G)D)D$T??;	#+v'/I'4Y'?1!7LGA #!'#)#4#4!% f7 8 Mrw   c                D    [        X5      n[        X5      n[        X5      $ )zGApply confusion resolution, niche Latin demotion, and KOI8-T promotion.)r   r   r   )r   r   s     ru   _postprocess_resultsr     s#    
 't5G!$0G$((rw   r"   r   include_encodingsexclude_encodingsno_match_encodingempty_input_encodingc                  [        5       nU SU n [        XU5      n[        S U 5       5      n	U (       d  [        XiS5      $ [	        U 5      n
U
b  U
R
                  U	;   a  U
/$ [        U 5      nUb  UR
                  U	;   a  U/$ [        U 5      nUb   UR
                  b  UR
                  U	;   a  U/$ [        U 5      n[        U 5      nUc  Uc  [        XS9(       a  [        /$ [        U 5      nUb  UR
                  U	;   a  U/$ Ub  UR
                  U	;   a  U/$ Ub  UR
                  U	;   a  U/$ [        X5      nU(       d  [        XYS5      $ [        U UU5      nU(       d  [        XYS5      $ / nU Hp  nUR                  (       d  M  UR                   R#                  UR$                  5      nUc  ['        U UU5      nUS:  d  MS  UR)                  UR$                  U45        Mr     U(       a=  UR+                  S SS	9  US
   u  nnU[,        :  a  [/        U UUU5      n[1        U U5      $ [3        [5        U [7        U5      5      5      nU(       d  [        XYS5      $ [1        U U5      $ )zBCore pipeline logic. Returns list of results sorted by confidence.Nc              3  8   #    U  H  oR                   v   M     g 7fr   )r   )r}   r   s     ru   r   %_run_pipeline_core.<locals>.<genexpr>  s     'GJSJs   r   )	max_bytesr   r!   c                    U S   $ )Nr   r|   r   s    ru   r   $_run_pipeline_core.<locals>.<lambda>T  s    QqTrw   Tr   r   )r   r   	frozensetrv   r   r   r   r   r   r   r   _BINARY_RESULTr   r   r   r   r   r   r   r   r   r    _STRUCTURAL_CONFIDENCE_THRESHOLDr   r   r   r   r   )r   encoding_erar   r   r   r   r   r   
candidatesrs   
bom_resultutf1632_resultescape_resultutf8_precheckascii_precheckmarkup_resultr   r   r   scorer   
best_scorer   s                          ru   _run_pipeline_corer     s    
C
D
  ARSJ''GJ'GGG% +A
 	
 D!J*"5"5"@|
 -T2N!n&=&=&H
 +40M!"".""g-  %M "$'N
 	"d0
 *$/M ]%;%;w%F !n&=&=&H  ]%;%;w%F *$;%&7BUVV ,D2BCH%&7BUVV 24MM%%chh/E}0sC@s{!((#((E):;   >4@)!,:992')93G (g66 #D%0@*ABCG%&7BUVVg..rw   c          
     (   [        U UUUUUUS9n[        U S[         U5      nU(       d  Sn[        U5      eU V	s/ s HJ  n	U	R                  S:  a5  [        U	R                  [        U	R                  S5      U	R                  5      OU	PML     sn	$ s  sn	f )a1  Run the full detection pipeline.

:param data: The raw byte data to analyze.
:param encoding_era: Filter candidates to a specific era of encodings.
:param max_bytes: Maximum number of bytes to process.
:param include_encodings: If not ``None``, only return these encodings.
:param exclude_encodings: If not ``None``, never return these encodings.
:param no_match_encoding: Encoding returned when no candidate survives.
:param empty_input_encoding: Encoding returned for empty input.
:returns: A list of :class:`DetectionResult` sorted by confidence descending.
r   Nz/pipeline must always return at least one resultg      ?)	r   r   _LANG_SCORE_MAX_BYTESRuntimeErrorr   r   r   minr    )
r   r   r   r   r   r   r   r   msgr   s
             ru   run_pipeliner   d  s    * !+++1G T"8#897CG?3 	 A <<# 	

Cc$:AJJG	 	  s   AB)r   strrs   r%   rt   r   returnlist[DetectionResult])r   r   r   bytesr   bool)r   r  r   tuple[EncodingInfo, ...]r   r   r   r  )
r   r  r   zlist[tuple[str, float]]r   r  r   r   r   r   )r   r  r   r   r   r   )r   r  r   r   r   zbytes | None)r   r  r   r   r   intr   frozenset[str] | Noner   r  r   r   r   r   r   r   )I__doc__
__future__r   ro   chardet._utilsr   chardet.enumsr   chardet.modelsr   r   r   r	   chardet.pipeliner
   r   r   r   chardet.pipeline.asciir   chardet.pipeline.binaryr   chardet.pipeline.bomr   chardet.pipeline.confusionr   chardet.pipeline.escaper   chardet.pipeline.markupr   chardet.pipeline.statisticalr   chardet.pipeline.structuralr   r   r   chardet.pipeline.utf8r   chardet.pipeline.utf1632r   chardet.pipeline.validityr   chardet.registryr   r   r   rr   r   r   r&   __annotations__rU   r\   r_   ra   rj   rv   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r|   rw   ru   <module>r     s6   F "  , %   0 - + ? : 9 9 
 . < 8 9 6 tL $(   +4+   /8/1/ ^ 1n /8 "/ ^ "T 09(0 n  .-*3 /  *3L*  
PPP P 	P*B$          " 3
3.3 
3 	3l/
/./ // 
	/
 /d
" 8
" :  $(
(/((V)
)") ) '/
 04/3% '/
// /
 -/ -/ / / /J ',
 04/3% ',
,, ,
 -, -, , , ,rw   