
    ޺i%                       % S r SSKJr  SSKrSSKrSSKrSSKrSSKJ	r	J
r
JrJr  SSKJr  SSKJr  \\\\4   \\\   \\\\\4   4   4   4   r0 SS_SS	_S
S_SS_SS_SS_SS_SS_SS_SS_SS_SS_SS_S S!_S"S#_S$S%_S&S'_S(S)S*S+S,S-S.S/S0S1S2S3S4S5.ErS6\S7'   \R3                  5        V Vs0 s H  u  pX_M	     snn rS8\S9'   SCS: jr\R8                  SDS; j5       r0 SS_S	S_SS_SS_SS_SS_SS_SS_SS_SS_S!S_S#S_S%S_S'S_S(S_S*S_S)S_SSSSSS
SSSSSSSS<.ErS8\S='               SES> jr        SFS? jr           SGS@ jr!        SHSA jr"      SISB jr#gs  snn f )Jay  Confusion group resolution for similar single-byte encodings.

At runtime, loads pre-computed distinguishing byte maps from confusion.bin
and uses them to resolve statistical scoring ties between similar encodings.

Build-time computation (``compute_confusion_groups``, ``compute_distinguishing_maps``,
``serialize_confusion_data``) lives in ``scripts/confusion_training.py``.
    )annotationsN)NON_ASCII_BIGRAM_WEIGHTBigramProfileget_enc_indexscore_with_profile)DetectionResult)lookup_encodingLu   Ll   Lt   Lm   Lo   Mn   Mc   Me   Nd	   Nl
   No   Pc   Pd   Ps   Pe   Pi   PfPoSmScSkSoZsZlZpCcCfCsCoCn)                                       zdict[int, str]_INT_TO_CATEGORYzdict[str, int]_CATEGORY_TO_INTc                   0 nSn[         R                  " SX5      u  nUS-  n[        U5       GH  n[         R                  " SX5      u  nUS-  nXX%-    R                  S5      nX%-  n[         R                  " SX5      u  nUS-  nXX'-    R                  S5      nX'-  n[         R                  " SX5      u  n	US-  n/ n
0 n[        U	5       Ha  n[         R                  " SX5      u  pnUS-  nU
R	                  U5        [
        R                  US	5      [
        R                  US	5      4X'   Mc     [        U
5      U4XU4'   GM     U$ )
zLoad confusion group data from raw bytes.

:param data: The raw binary content of a confusion.bin file.
:returns: A :data:`DistinguishingMaps` dictionary keyed by encoding pairs.
r   z!Hr   z!Br   zutf-8z!BBBr   r7   )structunpack_fromrangedecodeappendrE   get	frozenset)dataresultoffset	num_pairs_
name_a_lenname_a
name_b_lenname_b	num_diffsdiff_bytes_list
categoriesbv	cat_a_int	cat_b_ints                  Y/var/www/piano.thomer.com/venv/lib/python3.13/site-packages/chardet/pipeline/confusion.py%deserialize_confusion_data_from_bytesr_   H   sa    "$FF%%dD9LY
aKF9**4>!v23::7C**4>!v23::7C))$=!%'13
y!A'-'9'9&$'O$B9aKF""2& $$Y5 $$Y5JN	 " %.o$>
#K 1 4 M    c                    [         R                  R                  S5      R                  S5      n U R	                  5       nU(       d  [
        R                  " S[        SS9  0 $  [        U5      n0 nUR                  5        H5  u  u  pgn[        U5      =(       d    Un	[        U5      =(       d    Un
XX4'   M7     U$ ! [        R                  [        4 a  nSU 3n[        U5      UeSnAff = f)zLoad confusion group data from the bundled confusion.bin file.

:returns: A :data:`DistinguishingMaps` dictionary keyed by encoding pairs.
zchardet.modelszconfusion.binuZ   chardet confusion.bin is empty — confusion resolution disabled; reinstall chardet to fixr   )
stacklevelzcorrupt confusion.bin: N)	importlib	resourcesfilesjoinpath
read_byteswarningswarnRuntimeWarningr_   rH   errorUnicodeDecodeError
ValueErroritemsr	   )refrawraw_mapsemsg
normalizedabvaluenorm_anorm_bs              r^   load_confusion_datarz   p   s     


#
#$4
5
>
>
OC
..
C'		
 	%8=
 &(J!) #(q #(q',F#$ *  LL,- %'s+o1$%s   #B; ;C+C&&C+)r.   r/   r0   r1   r2   r4   r3   r6   r5   r7   r   r   r   _CATEGORY_PREFERENCEc                   SnSn[        U 5      U-  nU(       d  gU HQ  nXH   u  p[        R                  U	S5      n[        R                  U
S5      nX:  a	  X[U-
  -  nMC  X:  d  MJ  XlU-
  -  nMS     XV:  a  U$ Xe:  a  U$ g)at  Resolve between two encodings using Unicode category voting.

For each distinguishing byte present in the data, compare the Unicode
general category under each encoding. The encoding whose interpretation
has the higher category preference score gets a vote. The encoding with
more votes wins.

:param data: The raw byte data to examine.
:param enc_a: First encoding name.
:param enc_b: Second encoding name.
:param diff_bytes: Byte values where the two encodings differ.
:param categories: Mapping of byte value to ``(cat_a, cat_b)`` Unicode
    general category pairs.
:returns: The winning encoding name, or ``None`` if tied.
r   N)rN   r{   rM   )rO   enc_aenc_b
diff_bytesrZ   votes_avotes_brelevantr[   cat_acat_bpref_apref_bs                r^   resolve_by_category_votingr      s    , GG+H!~%))%3%))%3?&G_&G  r`   c                `   ^  UR                  U5      nU(       d  g[        U 4S jU 5       5      $ )zDReturn the best bigram score across all language variants for *enc*.g        c              3  B   >#    U  H  u  pn[        TX#5      v   M     g 7f)N)r   ).0rS   model	model_keyprofiles       r^   	<genexpr>&_best_variant_score.<locals>.<genexpr>   s'      #+Ai 	7E55#+s   )rM   max)r   indexencvariantss   `   r^   _best_variant_scorer      s2     yy~H #+  r`   c                   [        U 5      S:  a  g0 n[        [        U 5      S-
  5       HM  nX   nXS-      nXc;  a  Xs;  a  M  US-  U-  nUS:  d  US:  a  [        OSn	UR                  US5      U	-   XH'   MO     U(       d  g[        R
                  " U5      n
[        5       n[        XU5      n[        XU5      nX:  a  U$ X:  a  U$ g)a  Resolve between two encodings by re-scoring only distinguishing bigrams.

Builds a focused bigram profile containing only bigrams where at least one
byte is a distinguishing byte, then scores both encodings against their
best language model.

:param data: The raw byte data to examine.
:param enc_a: First encoding name.
:param enc_b: Second encoding name.
:param diff_bytes: Byte values where the two encodings differ.
:returns: The winning encoding name, or ``None`` if tied.
r   Nr   r      r   )lenrJ   r   rM   r   from_weighted_freqr   r   )rO   r}   r~   r   freqib1b2idxweightr   r   best_abest_bs                 r^   resolve_by_bigram_rescorer      s    $ 4y1}D3t9q=!Wa%[B$8Qw"n-/$Y"t)(!HHS!$v-	 " ..t4GOE 7F 7Fr`   c                ,    X4U ;   a  X4$ X!4U ;   a  X!4$ g)zEFind the canonical key for a pair of encodings in the confusion maps.N )mapsr}   r~   s      r^   _find_pair_keyr     s+     	~~~~r`   c                Z   [        U5      S:  a  U$ US   nUS   nUR                  b  UR                  c  U$ [        5       n[        XBR                  UR                  5      nUc  U$ XE   u  pgUu  p[	        XXU5      n
[        XX5      nUb  UOU
nUb  XR                  :X  a  U$ X2/USS Q$ )a  Resolve confusion between similar encodings in the top results.

Compares the top two results. If they form a known confusion pair,
it determines which encoding should win by checking the
resolve_by_bigram_rescore and resolve_by_category_voting tie-breakers
and giving precedence to bigram re-scoring when they disagree.

:param data: The raw byte data to examine.
:param results: Detection results sorted by confidence descending.
:returns: A reordered list of :class:`DetectionResult` with the winner first.
r   r   r   N)r   encodingrz   r   r   r   )rO   resultstopsecondr   pair_keyr   rZ   r}   r~   
cat_winnerbigram_winnerwinners                r^   resolve_confusion_groupsr   '  s     7|a
!*CQZF
||v6 DdLL&//BH!^JLE+DJWJ-d5MM+7]ZF~<</&'!"+&&r`   )rO   bytesreturnDistinguishingMaps)r   r   )rO   r   r}   strr~   r   r   frozenset[int]rZ   zdict[int, tuple[str, str]]r   
str | None)r   r   r   z2dict[str, list[tuple[str | None, bytearray, str]]]r   r   r   float)
rO   r   r}   r   r~   r   r   r   r   r   )r   r   r}   r   r~   r   r   ztuple[str, str] | None)rO   r   r   list[DetectionResult]r   r   )$__doc__
__future__r   	functoolsimportlib.resourcesrc   rH   rh   chardet.modelsr   r   r   r   chardet.pipeliner   chardet.registryr	   dicttupler   rN   intr   rE   __annotations__rn   rF   r_   cacherz   r{   r   r   r   r   r   )kvs   00r^   <module>r      s   #      - , 	#s(O	)C.$sE#s(O34
457 $t$t$ t$ t	$
 t$ t$ t$ t$ t$ t$ $ $ $ $ $  !$" #$$ 	=$ . D 6F5K5K5M#N5MTQAD5M#N . N%P  >("("( 	"( 	!	(
 	!( 	!( 	!( 	!( 	!( 	!( 	!( 	!( 	!( 	!( 	!(  	!!(" 	!#($ 











=( n D'
'' ' 	'
 +' 'T= 
 	,
,, , 	,
 ,^



 
 	
&'
&'"&' &'E $Os   -E-