o
    zUi7                     @   s
  d Z ddlZddlZddlZddlZddlZddlmZ ddlm	Z	 ddl
m
Z
 dd Zdd	 Zd
d Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zdd Zh d Zd0d"d#Zd$d% Zd1d'd(Zd)d* Zd+d, Zd-d. Zed/kre  dS dS )2u   
Moodhoney Library Scanner

Scans local music folders, reads metadata (ID3/Vorbis/M4A), and fuzzy-matches
against Bill's AOTY JSON lists (1980–2025). Outputs a match report and a
structured library-audit.json.

Usage:
    python3 scripts/moodhoney_library_scanner.py ~/Music /Volumes/Drive/Albums

    Options:
        --threshold 80      Fuzzy match threshold (0-100, default 80)
        --output FILE       Output JSON path (default: library-audit.json)
        --verbose           Print every match attempt
    N)Path)defaultdict)datetimec                 C   s  i }zAt | d0}|d}t|dk s|dd dkr*t| W  d   W S |d }|dd }|d d> |d	 d
> B |d d> B |d B }||}d}|t|d k r3|dkr|||d  jddd}	|	d  srn|dkr||d |d  }
|
d d> |
d	 d
> B |
d d> B |
d B }ntd||d |d  d }|d7 }n@|dkr|||d  jddd}	|	d  sno||d  d> ||d  d> B ||d  B }|d7 }dddd}||	|	}	nnG|dks|| t|krn:||||  }||7 }|	dv r+t	|}|	dkr||d< n|	dkr"||d< n	|	dkr+||d< |t|d k sZW d   n	1 s>w   Y  W n
 t
yN   Y nw |sVt| S |S )zIRead ID3v2 tags from an MP3 file. Returns dict with artist, album, title.rb
   N   s   ID3   r                     asciiignoreerrors   >I      TIT2TPE1TALB)TT2TP1TAL)r   r   r   titleartistalbum)openreadlen
read_id3v1decodeisalphastructunpackget_decode_id3_text	Exception)filepathresultfheaderversion_major
size_bytessizedataposframe_idfs
frame_sizeid_map
frame_datatext r:   R/sessions/determined-charming-fermi/mnt/clawd/scripts/moodhoney_library_scanner.py
read_id3v2   sj   
,
. 
,





6r<   c                 C   s   | sdS | d }| dd }z6|dkr| ddW S |dkr)| ddW S |dkr6| d	dW S |d
krC| ddW S W n	 tyM   Y nw |j ddddS )zDecode ID3v2 text frame data. r   r
   Nlatin-1 zutf-16r   z	utf-16-ber   utf-8r   r   )r$   stripr*   )r2   encodingpayloadr:   r:   r;   r)   a   s$   r)   c                 C   s  i }zt | dr}|dd |d}|dd dkr&|W  d   W S |dd jd	d
dd }|dd jd	d
dd }|dd jd	d
dd }|r\||d< |rb||d< |rq||d< W d   W |S W d   W |S 1 s}w   Y  W |S  ty   Y |S w )z2Read ID3v1 tags from the last 128 bytes of an MP3.r   ir      Nr   s   TAG!   r>   r   r   r?   ?   ]   r   r   r   )r    seekr!   r$   rA   r*   )r+   r,   r-   tagr   r   r   r:   r:   r;   r#   v   s2   
   r#   c           	      C   s  i }zt | dr}|d}|dkr|W  d   W S 	 |d}t|dk r)n6|d d@ dk}|d d@ }|d	 d
> |d d> B |d B }|dkrU||}t|}n||d	 |r^nqW d   W |S W d   W |S W d   W |S 1 s}w   Y  W |S  ty   Y |S w )z&Read Vorbis comments from a FLAC file.r   r   s   fLaCNTr   rD      r
   r   r   r   r   )r    r!   r"   _parse_vorbis_commentrH   r*   )	r+   r,   r-   magicblock_headeris_last
block_type
block_size
block_datar:   r:   r;   read_flac_tags   sJ   

 
rR   c                 C   s   i }z@t | d0}|d}d}||}|dkr#|W  d   W S t||t| d }W d   W |S 1 s;w   Y  W |S  tyL   Y |S w )z&Read Vorbis comments from an OGG file.r   i   s   vorbisN)r    r!   findrK   r"   r*   )r+   r,   r-   r2   markeridxr:   r:   r;   read_ogg_tags   s$   

rW   c           
      C   sX  i }zd}t d| ||d  d }|d| 7 }t d| ||d  d }|d7 }t|D ]o}|d t| kr= W |S t d| ||d  d }|d7 }|| t| kr[ W |S | |||  jddd}||7 }d|v r|dd\}}	|  }|d	kr|	 |d
< q/|dkr|	 |d< q/|dv r|	 |d< q/W |S  ty   Y |S w )zParse a Vorbis comment block.r   z<Ir   r@   r   r   =r
   TITLEr   ARTISTr   )ALBUM
ALBUMTITLEr   )	r&   r'   ranger"   r$   splitupperrA   r*   )
r2   r,   r3   
vendor_lencomment_count_clencommentkeyvalr:   r:   r;   rK      sD   rK   c                    s.  i }zt | d}|d}W d   n1 sw   Y  d fdd	 ddd	d
}| D ][\}} |g d}|du r>q.|\}}	 |d|d |	}
|
du rPq.|
\}} ||||}|du r`q.|\}} |d||}|du rpq.|\}}||d | jddd }|r|||< q.W |S  ty   Y |S w )z1Read metadata from M4A/AAC (MP4 container) files.r   i    Nr   c                    s  |d u rt | }|}t|tr|d n|}t|tr |dd  ng }||d k rtd| ||d  d }| |d |d  }|dk rG	 d S t|| |}	z|d}
W n ty`   d}
Y nw |
|krv|rp | ||d |	S |d |	fS ||7 }||d k s(d S )Nr   r
   r   r   r   r   r=   )r"   
isinstancelistr&   r'   minr$   r*   )r2   pathstartendr3   target	remainingr1   	atom_type
actual_endname	find_atomr:   r;   rs      s0   z read_m4a_tags.<locals>.find_atomr   r   r   )u   ©namu   ©ARTu   ©alb)moovudtametailstr   r2   r   r@   r   r   )r   N)r    r!   itemsr$   rA   r*   )r+   r,   r-   r2   tag_map	atom_namefieldloc
meta_startmeta_endrw   
ilst_startilst_endrm   t_startt_enddata_locd_startd_endr9   r:   rr   r;   read_m4a_tags   sJ   r   c                 C   s  i }t | }|jj}|j}|dri S td|}|r^|d |d< |d |d< t	dd|d  |d< td	|}|rM|d |d
< ntd|}|r^|d |d
< |s|jjj}|r|dvrh d}	d|vr||	vr||d< ||d< td|}|r|d |d
< |
ds|r||	vr|dvr||d< td|}
|
r|
d |d< td|}|r|d |d
< |S )a  Extract artist/album/title from folder structure and filename.

    Handles common patterns:
        Artist - Album/Artist - Album - 01 Title.ext
        Artist - Album/01 Title.ext
        Artist - Album/01 - Title.ext
        Artist/Album/01 Title.ext
    ._z^(.+?)\s*-\s*(.+)$r
   r   r   r   z\s*\((?:pre-order|\d+)\)\s*$r=   z^.+?-\s*.+?-\s*\d+\s+(.+)$r   z^\d+\s*[-.]?\s*(.+)$)r=   /.>   MusicmediaDJMOODiTunesz - z^.+?-\s*(.+?)\s*-\s*\d+)r   parentrq   stem
startswithrematchgrouprA   subr(   )r+   r,   pr   fnamefolder_matchtitle_matchtitle_match2grandparentdrive_namesfile_album_matchr:   r:   r;   parse_from_path&  sJ   	


r   c                 C   s   t | j }t | jdri S i }|dkrt| }n|dkr%t| }n|dv r.t| }n|dv r6t| }|	dr@|	dsYt
| }dD ]}|	|sX|	|rX|| ||< qF|S )	zHRead metadata from any supported audio file. Falls back to path parsing.r   .mp3.flac).ogg.oga).m4a.aac.mp4.alacr   r   )r   r   r   )r   suffixlowerrq   r   r<   rR   rW   r   r(   r   )r+   extr,   	path_metare   r:   r:   r;   read_metadatal  s&   


r   c                 C   sf   |    } tjdd| tjd} tjdd| tjd} tdd| } tdd| } tdd|  } | S )	z&Normalize a string for fuzzy matching.z\s*\[(album|artist)\d+\]r=   flagszc\s*[\(\[](deluxe|remaster|remastered|expanded|anniversary|bonus|special)\s*(edition|version)?[\)\]]z^the\s+z[^\w\s]z\s+ )r   rA   r   r   
IGNORECASE)sr:   r:   r;   	normalize  s   r   c                 C   s&  | s|sdS | r
|sdS t | t |}}t||dkr*t||t|| dk r*dS tt|d }dg|d  }td|d D ]=}||d< td|d D ]*}| |d  ||d  kr]dnd}t||d  d || d ||d  | ||< qM||}}q@|| }	t||}
ttd|	|
  d S )zRCalculate similarity ratio (0-100) between two strings using Levenshtein distance.d   r   皙?r
   )r"   maxri   rh   r]   intround)s1s2len1len2prevcurrijcostdistancemax_lenr:   r:   r;   levenshtein_ratio  s$   & 2
r   c           
      C   sH   t | }t |}t |}t |}t||}t||}	t|d |	d  S )zDCalculate combined match score between AOTY entry and file metadata.g333333?r   )r   r   r   )
aoty_artist
aoty_albumfile_artist
file_albumna_aotyna_filenb_aotynb_fileartist_scorealbum_scorer:   r:   r;   match_score  s   

r   c                 C   s  g }t j| ddd}t j| ddddddd}t }||fD ]}t j|s(qtt |D ]}|ds7q/t	d|}|rD|
d	nd
}||v rO||krOq/t j||}	zt|	d}
t|
}W d   n1 slw   Y  W n	 ty{   Y q/w |dg }|d|}|D ]?}t|dd }t|dd }|dd}|sqtjdd|tjd }|||pd|||||d || qq/q|S )z<Load all AOTY JSON files and return a list of album entries.importsaotysyrrosisarchiverawz2023-11.jsonz(\d{4})r
   unknownrNrx   	list_namer   r=   r   orderi  z\s*\[(Album|Artist)\d+\]r   z	(unknown))r   r   	raw_titleyearrankrh   source_file)osrj   joinsetisdirsortedlistdirendswithr   searchr   r    jsonloadr*   r(   strrA   r   r   appendadd)
clawd_rootalbumsprimary_dirarchive_dir
seen_yearsdir_pathr   
year_matchr   r+   r-   r2   rx   r   itemr   r   r   clean_titler:   r:   r;   load_aoty_data  sZ   
	+r   >   .aif.wav.aiffr   r   r   r   r   r   r   r   Fc                 C   s  g }d}d}| D ]}t j|}t j|s!td| tjd qtd| tjd d}t |D ]\}}}	dd |D |dd< |	D ]}
t|
j	
 }|tvrRqDt j||
}zEt|}|dsh|d	r|||dd
|d	d
|dd
tt j|d dd |d7 }n|d7 }|rtd| tjd W qD ty } z|d7 }|rtd| d| tjd W Y d}~qDd}~ww q2td| dtjd qtdt| d| d| dtjd |S )z5Scan folders for audio files and read their metadata.r   z  WARNING: Folder not found: filez  Scanning: c                 S   s   g | ]	}| d s|qS )r   )r   ).0dr:   r:   r;   
<listcomp>  s    z&scan_music_folders.<locals>.<listcomp>Nr   r   r=   r   i   r
   )rj   r   r   r   size_mbz    No tags: z    Error: : z
    Found z tagged audio filesz	  Total:  files, z skipped (no tags), z errors)r   rj   
expanduserr   printsysstderrwalkr   r   r   AUDIO_EXTENSIONSr   r   r(   r   r   getsizer*   r"   )foldersverbosefilesskippedr   foldercountrootdirs	filenamesr   r   fpathrv   er:   r:   r;   scan_music_folders  sT   



&r  c                 C   s<   t t}| D ]}t|d t|d f}|| | q|S )zAGroup files by normalized (artist, album) for efficient matching.r   r   )r   rh   r   r   )r  indexr-   re   r:   r:   r;   build_file_album_indexC  s
   r  P   c                 C   s  t |}t| }g }g }g }| D ]}	t|	d }
t|	d }d}d}|D ]\}}t|	d |	d ||}||kr?|}||f}q&||kr|r|| }i |	|t|ttdd |D d|d d d	}t|d
krq|| n|| |rt	d| d|	d  d|	d  d|	d  d|	d  dt| dt
jd q||	 |rt	d| d|	d  d|	d  d|	d  d|	d  dt
jd q|||fS )z&Match AOTY albums against local files.r   r   r   Nc                 s       | ]}|d  V  qdS )r   Nr:   )r   r-   r:   r:   r;   	<genexpr>h      z match_library.<locals>.<genexpr>r
   rj   )r   matched_filestotal_size_mbsample_pathr   z	  MATCH (z%):     — z [r    #r   u   ] → z filesr   z	  MISS  (])r  rh   keysr   r   r"   r   sumr   r   r   r   )aoty_albumsr  	thresholdr  
file_index	file_keysmatchedpartialmissingr   nanb
best_scorebest_key	fk_artistfk_albumscorer  entryr:   r:   r;   match_libraryL  sJ   

H
>
r-  c                 C   s2   zt | }|d d  dW S  ty   Y dS w )Nr   r   r   )r   
ValueError)year_stryr:   r:   r;   	decade_of~  s   r1  c                    sN  t | }t |}t |}t |}|| }tttt | D ]}	t|	d   d7  < q|| D ]}
 t|
d   d7  < q1|d }t|d d d}dd |D }d	d |D }tt}| D ]}	||	d
   d7  < qat| dd ddd }tdd || D fdd|D }ttdd || D d d}g }|	d |	d |	dt
 d  |	d |	d |	d|d |	d|dd|d |  d |	d |dd|d |  d |	d!|dd|d |  d |	d |	d"|dd#t| d$ |	d%| d& |	d |	d' |	d( |	d' t D ]@}| } |d}|dkrM|d | nd}d)|d*  d+d|d*    }|	d|d,d-|d.d/|d0d| d| d1 q6|	d |r|	d' |	d2 |	d' t|d3d dD ]}
|	d|
d  d4|
d
  d5|
d6   q|	d |r|	d' |	d7 |	d' |dd D ]\}}|	d| d| d8 q|	d |	d' |	d9 |	d' |	d |dkr|	d:| d; |	d<t| d= |	d> n-|d?kr4|	d:| d@ |	d<t| dA |	dB n|	d:| dC |	dD |	d |	d' |	dE |	d' t|| dFd dddG }|D ](}
|	d|
d  dH|
dI dJdK|
d
  d5|
d6  d|
dL  dM|
dN  dO qf|	d |	d' |	dP |	d' t|dQd dddG }|D ]}
|	d|
d  dH|
dI  dK|
d
  d5|
d6   q|	d |	d dR|}t
  |||||t||dS fdTdUt D t|dVd dt|dWd dt|dXd dt|dYd dt|dZd dt|d[	}||fS )\z5Generate a human-readable report and structured JSON.r   r
   r   g      @<   r   c                 S   s   g | ]
}|d  dkr|qS )r   r
   r:   r   mr:   r:   r;   r         z#generate_report.<locals>.<listcomp>c                 S   s   g | ]
}|d  dkr|qS )r   r   r:   r3  r:   r:   r;   r     r5  r   c                 S   s
   | d  S )Nr
   r:   xr:   r:   r;   <lambda>  s   
 z!generate_report.<locals>.<lambda>)re   N   c                 s   r  )r   Nr:   r3  r:   r:   r;   r    r  z"generate_report.<locals>.<genexpr>c                    s    g | ]\}}| vr||fqS r:   r:   )r   ac)matched_artistsr:   r;   r     s     c                 s   s    | ]	}| d dV  qdS )r  r   Nr(   r3  r:   r:   r;   r    s    i   z<============================================================u#     MOODHONEY RADIO — LIBRARY AUDITz  z%Y-%m-%d %H:%Mr=   z  Total AOTY albums:     ,z  Matched locally:       z (r   z%)z  Partial match:         z  Missing:               z  Estimated library:     ~z tracks / ~z hoursz  Library size on disk:  ~z GBu   ────────────────────────────────────────────────────────────z  DECADE BREAKDOWNu   █r   u   ░z>7sz:  z>3r   z<3%z%  MISSING #1 PICKS (highest priority)c                 S      | d S Nr   r:   r6  r:   r:   r;   r8        z #1: r  r   z5  MISSING TOP ARTISTS (appear most across your lists)z appearances)u&     YOUR LAUNCH LIBRARY — WHAT'S READYz  You have u*    albums matched — that's a solid launch.z  ~z hours of non-repeating music.zB  That's enough for several days of radio before anything repeats.2   u,    albums matched — a decent starting point.z2 hours of music. The station will feel a bit tightz/  at first but grows quickly as you add albums.u(    albums matched — enough to test with.zA  You'll want to add more before sharing with the WhatsApp group.z  TOP MATCHED ALBUMS (by rank)c                 S      | d | d fS Nr   r   r:   r6  r:   r:   r;   r8           r  r   z>2r   r  r   r   z% match)z8  NEXT ALBUMS TO BUY (Top 5 per year, currently missing)c                 S   rD  rE  r:   r6  r:   r:   r;   r8    rF  
)total_aoty_albumsr"  r#  r$  estimated_tracksestimated_hourslibrary_size_gbc                    s$   i | ]}||   |d dqS )r   )totalr"  r=  )r   decade)decade_matcheddecade_totalr:   r;   
<dictcomp>  s    
z#generate_report.<locals>.<dictcomp>c                 S   rD  Nr   r   r:   r6  r:   r:   r;   r8    rF  c                 S   rD  rR  r:   r6  r:   r:   r;   r8    rF  c                 S   rD  rR  r:   r6  r:   r:   r;   r8    rF  c                 S   r@  rA  r:   r6  r:   r:   r;   r8    rB  c                 S   rD  rE  r:   r6  r:   r:   r;   r8    rF  )		generatedsummarydecade_breakdownmatched_albumspartial_albumsmissing_albumsmissing_number_onesbuy_list_top5artist_frequency)r"   r   r   r1  r   r   rx   r   r  r   r   nowstrftimer  r(   r   	isoformatdict)r  r"  r#  r$  rM  	n_matched	n_partial	n_missingn_availabler:  r4  
est_tracks	est_hoursmissing_no1missing_top5artist_countstop_artistsmissing_top_artiststotal_size_gblinesrN  dtdmpctbarr   r	  best_matchesbuy_listreport_textauditr:   )rO  rP  r<  r;   generate_report  s    



"""




4



,















N



6



	
ru  c                  C   sh  t jdd  } d}d }d}g }d}|t| k ry| | dkr2|d t| k r2t| |d  }|d7 }nA| | dkrK|d t| k rK| |d  }|d7 }n(| | dkrXd	}|d7 }n| | d
krhtt t d n|| |  |d7 }|t| k s|stdt jd tdt jd tdt jd t d t	j
t	j
t}t	j
|}|st	j
|d}tdt jd tdt jd tdt jd t|}tdt| dttdd |D  dt jd i }	g }
|D ]"}t|d t|d f}||	vs|d |	| d k r||	|< qt|	 }
tdt|
 t jd tdt jd t||d}|s7tdt jd t d td| d t jd t|
|||d!\}}}t|
|||\}}td" t| t|d#}tj||dtd$ W d    n	1 sxw   Y  td%| t jd |d&d'}t|d#}|| W d    n	1 sw   Y  td(| t jd d S ))Nr
   r  Fr   z--thresholdr   z--outputz	--verboseTz--helpzAUsage: python3 moodhoney_library_scanner.py FOLDER1 [FOLDER2 ...]r   zL       python3 moodhoney_library_scanner.py ~/Music /Volumes/External/Albumsz2       python3 moodhoney_library_scanner.py --helpzlibrary-audit.jsonu   
🎵 Moodhoney Library Scannerux   ────────────────────────────────────────u   
📋 Loading AOTY lists...z	  Loaded z albums across c                 s   r  )r   Nr:   )r   r:  r:   r:   r;   r  L  r  zmain.<locals>.<genexpr>z yearsr   r   r   z  Unique albums (after dedup): u   
🔍 Scanning music folders...)r  z1
  No audio files found. Check your folder paths.u.   
🎯 Matching against AOTY lists (threshold: z%)...)r  r  rH  w)indentdefaultu   
📁 Full audit saved to: r   z-report.txtu   📄 Report saved to: )r   argvr"   r   r   __doc__exitr   r   r   rj   dirnameabspath__file__r   r   r   r   rh   valuesr  r-  ru  r    r   dumpr   replacewrite)argsr  output_pathr  r  r   
script_dirr   r  seenunique_albumsr:  re   r  r"  r#  r$  rs  rt  r-   report_pathr:   r:   r;   main  s   



2 
r  __main__)F)r  F) rz  r   r   r   r&   r   pathlibr   collectionsr   r   r<   r)   r#   rR   rW   rK   r   r   r   r   r   r   r   r  r  r  r-  r1  ru  r  __name__r:   r:   r:   r;   <module>   sB   B"IF"@
5
	2 Y
