#!/usr/bin/env python3
"""
Moodhoney Library Scanner

Scans local music folders, reads metadata (ID3/Vorbis/M4A), and fuzzy-matches
against Bill's AOTY JSON lists (1980–2025). Outputs a match report and a
structured library-audit.json.

Usage:
    python3 scripts/moodhoney_library_scanner.py ~/Music /Volumes/Drive/Albums

    Options:
        --threshold 80      Fuzzy match threshold (0-100, default 80)
        --output FILE       Output JSON path (default: library-audit.json)
        --verbose           Print every match attempt
"""

import json
import os
import re
import struct
import sys
from pathlib import Path
from collections import defaultdict
from datetime import datetime

# ─────────────────────────────────────────────
# Pure-Python audio metadata readers
# ─────────────────────────────────────────────

def read_id3v2(filepath):
    """Read ID3v2 tags from an MP3 file. Returns dict with artist, album, title."""
    result = {}
    try:
        with open(filepath, 'rb') as f:
            header = f.read(10)
            if len(header) < 10 or header[:3] != b'ID3':
                return read_id3v1(filepath)

            version_major = header[3]
            # Size is synchsafe integer
            size_bytes = header[6:10]
            size = (size_bytes[0] << 21) | (size_bytes[1] << 14) | (size_bytes[2] << 7) | size_bytes[3]

            data = f.read(size)
            pos = 0

            while pos < len(data) - 10:
                if version_major >= 3:
                    # ID3v2.3/v2.4: 4-byte frame ID, 4-byte size, 2-byte flags
                    frame_id = data[pos:pos+4].decode('ascii', errors='ignore')
                    if not frame_id[0].isalpha():
                        break
                    if version_major == 4:
                        # v2.4 uses synchsafe integer for frame size
                        fs = data[pos+4:pos+8]
                        frame_size = (fs[0] << 21) | (fs[1] << 14) | (fs[2] << 7) | fs[3]
                    else:
                        frame_size = struct.unpack('>I', data[pos+4:pos+8])[0]
                    pos += 10
                elif version_major == 2:
                    # ID3v2.2: 3-byte frame ID, 3-byte size
                    frame_id = data[pos:pos+3].decode('ascii', errors='ignore')
                    if not frame_id[0].isalpha():
                        break
                    frame_size = (data[pos+3] << 16) | (data[pos+4] << 8) | data[pos+5]
                    pos += 6
                    # Map v2.2 frame IDs to v2.3+
                    id_map = {'TT2': 'TIT2', 'TP1': 'TPE1', 'TAL': 'TALB'}
                    frame_id = id_map.get(frame_id, frame_id)
                else:
                    break

                if frame_size <= 0 or pos + frame_size > len(data):
                    break

                frame_data = data[pos:pos+frame_size]
                pos += frame_size

                if frame_id in ('TIT2', 'TPE1', 'TALB'):
                    text = _decode_id3_text(frame_data)
                    if frame_id == 'TIT2':
                        result['title'] = text
                    elif frame_id == 'TPE1':
                        result['artist'] = text
                    elif frame_id == 'TALB':
                        result['album'] = text

    except Exception:
        pass

    if not result:
        return read_id3v1(filepath)
    return result


def _decode_id3_text(data):
    """Decode ID3v2 text frame data."""
    if not data:
        return ''
    encoding = data[0]
    payload = data[1:]
    try:
        if encoding == 0:
            return payload.decode('latin-1').strip('\x00')
        elif encoding == 1:
            # UTF-16 with BOM
            return payload.decode('utf-16').strip('\x00')
        elif encoding == 2:
            return payload.decode('utf-16-be').strip('\x00')
        elif encoding == 3:
            return payload.decode('utf-8').strip('\x00')
    except Exception:
        pass
    return payload.decode('latin-1', errors='ignore').strip('\x00')


def read_id3v1(filepath):
    """Read an ID3v1 tag from the trailing 128 bytes of an MP3.

    Layout: 'TAG' + 30-byte title + 30-byte artist + 30-byte album.
    Files without a tag (or shorter than 128 bytes) yield an empty dict.
    """
    fields = {}
    try:
        with open(filepath, 'rb') as fh:
            fh.seek(-128, os.SEEK_END)
            raw = fh.read(128)
    except Exception:
        return fields
    if raw[:3] != b'TAG':
        return fields
    spans = (('title', 3, 33), ('artist', 33, 63), ('album', 63, 93))
    for key, lo, hi in spans:
        value = raw[lo:hi].decode('latin-1', errors='ignore').strip('\x00').strip()
        if value:
            fields[key] = value
    return fields


def read_flac_tags(filepath):
    """Read Vorbis comments from a FLAC file's VORBIS_COMMENT block.

    Walks the metadata block chain after the 'fLaC' magic: each block
    header is 1 byte (high bit = last-block flag, low 7 bits = type)
    followed by a 24-bit big-endian size.  Block type 4 holds the tags.
    """
    tags = {}
    try:
        with open(filepath, 'rb') as fh:
            if fh.read(4) != b'fLaC':
                return tags
            while True:
                hdr = fh.read(4)
                if len(hdr) < 4:
                    break
                last_block = bool(hdr[0] & 0x80)
                block_kind = hdr[0] & 0x7F
                block_len = int.from_bytes(hdr[1:4], 'big')
                if block_kind == 4:  # VORBIS_COMMENT
                    tags = _parse_vorbis_comment(fh.read(block_len))
                    break
                fh.seek(block_len, 1)
                if last_block:
                    break
    except Exception:
        pass
    return tags


def read_ogg_tags(filepath):
    """Read Vorbis comments from an OGG file (best effort).

    Rather than walking OGG pages properly, scans the first 64KB for
    the comment-header packet marker; good enough for typical files
    where the comment header sits near the start of the stream.
    """
    tags = {}
    try:
        with open(filepath, 'rb') as fh:
            head = fh.read(65536)
        marker = b'\x03vorbis'
        offset = head.find(marker)
        if offset != -1:
            tags = _parse_vorbis_comment(head[offset + len(marker):])
    except Exception:
        pass
    return tags


def _parse_vorbis_comment(data):
    """Parse a Vorbis comment block."""
    result = {}
    try:
        pos = 0
        vendor_len = struct.unpack('<I', data[pos:pos+4])[0]
        pos += 4 + vendor_len
        comment_count = struct.unpack('<I', data[pos:pos+4])[0]
        pos += 4

        for _ in range(comment_count):
            if pos + 4 > len(data):
                break
            clen = struct.unpack('<I', data[pos:pos+4])[0]
            pos += 4
            if pos + clen > len(data):
                break
            comment = data[pos:pos+clen].decode('utf-8', errors='ignore')
            pos += clen

            if '=' in comment:
                key, val = comment.split('=', 1)
                key = key.upper().strip()
                if key == 'TITLE':
                    result['title'] = val.strip()
                elif key == 'ARTIST':
                    result['artist'] = val.strip()
                elif key in ('ALBUM', 'ALBUMTITLE'):
                    result['album'] = val.strip()
    except Exception:
        pass
    return result


def read_m4a_tags(filepath):
    """Read title/artist/album from an M4A/AAC (MP4 container) file.

    Walks the atom tree moov.udta.meta.ilst and extracts the iTunes
    text atoms (0xA9+'nam'/'ART'/'alb').  Returns a dict with any of
    'title', 'artist', 'album'.
    """
    result = {}
    try:
        with open(filepath, 'rb') as f:
            data = f.read(2 * 1024 * 1024)  # metadata usually sits near the start

        def find_atom(data, path, start=0, end=None):
            """Locate a (possibly nested) atom; returns (content_start, content_end) or None."""
            if end is None:
                end = len(data)
            pos = start
            target = path[0] if isinstance(path, list) else path
            remaining = path[1:] if isinstance(path, list) else []

            while pos < end - 8:
                size = struct.unpack('>I', data[pos:pos+4])[0]
                atom_type = data[pos+4:pos+8]
                if size < 8:
                    break
                actual_end = min(pos + size, end)

                # latin-1 maps every byte 1:1, so iTunes atom names that
                # start with 0xA9 ('©nam', '©ART', '©alb') survive the
                # decode.  ascii would raise on 0xA9, the name would be
                # lost, and those atoms could never match their target.
                name = atom_type.decode('latin-1')

                if name == target:
                    if remaining:
                        return find_atom(data, remaining, pos + 8, actual_end)
                    return (pos + 8, actual_end)

                pos += size
            return None

        # iTunes metadata atoms
        tag_map = {
            '\xa9nam': 'title',
            '\xa9ART': 'artist',
            '\xa9alb': 'album',
        }

        # Resolve the shared moov.udta.meta.ilst container once, not per tag
        loc = find_atom(data, ['moov', 'udta', 'meta'])
        if loc is None:
            return result
        meta_start, meta_end = loc
        # 'meta' carries 4 bytes of version/flags before its children
        ilst = find_atom(data, 'ilst', meta_start + 4, meta_end)
        if ilst is None:
            return result
        ilst_start, ilst_end = ilst

        for atom_name, field in tag_map.items():
            target = find_atom(data, atom_name, ilst_start, ilst_end)
            if target is None:
                continue
            t_start, t_end = target
            # Inside each tag atom, the text lives in a 'data' sub-atom
            data_loc = find_atom(data, 'data', t_start, t_end)
            if data_loc is None:
                continue
            d_start, d_end = data_loc
            # 'data' payload: 4 bytes type/flags + 4 bytes locale, then text
            text = data[d_start+8:d_end].decode('utf-8', errors='ignore').strip()
            if text:
                result[field] = text

    except Exception:
        pass
    return result


def parse_from_path(filepath):
    """Extract artist/album/title from folder structure and filename.

    Handles common patterns:
        Artist - Album/Artist - Album - 01 Title.ext
        Artist - Album/01 Title.ext
        Artist - Album/01 - Title.ext
        Artist/Album/01 Title.ext
        Artist/track.ext   (bare artist folder at the drive root)

    Returns a dict with any of 'artist', 'album', 'title'.
    """
    result = {}
    p = Path(filepath)
    parent = p.parent.name
    fname = p.stem  # filename without extension

    # Folder names that are drive/library roots, never artist names.
    # Defined up front: Pattern 3 consults it even when the Pattern 2
    # guard (which previously defined it) is false — referencing it
    # there used to raise NameError for files one level below the root.
    drive_names = {'DJMOOD', 'Music', 'iTunes', 'media'}

    # Skip macOS resource fork files
    if fname.startswith('._'):
        return {}

    # Pattern 1: "Artist - Album" folder name (most common on DJMOOD)
    folder_match = re.match(r'^(.+?)\s*-\s*(.+)$', parent)
    if folder_match:
        result['artist'] = folder_match.group(1).strip()
        result['album'] = folder_match.group(2).strip()
        # Clean up album: remove "(pre-order)", "(1)", etc.
        result['album'] = re.sub(r'\s*\((?:pre-order|\d+)\)\s*$', '', result['album']).strip()

        # Try to extract track title from filename
        # Pattern: "Artist feat. X - Album - 01 Title"
        title_match = re.match(r'^.+?-\s*.+?-\s*\d+\s+(.+)$', fname)
        if title_match:
            result['title'] = title_match.group(1).strip()
        else:
            # Pattern: "01 Title" or "01 - Title"
            title_match2 = re.match(r'^\d+\s*[-.]?\s*(.+)$', fname)
            if title_match2:
                result['title'] = title_match2.group(1).strip()

    if not folder_match:
        # Pattern 2: "Artist/Album/track" (two-level folder structure)
        grandparent = p.parent.parent.name
        if grandparent and grandparent not in ('', '/', '.'):
            # Check if grandparent looks like an artist name (no " - " in it)
            # and it's not the drive/root name
            if ' - ' not in grandparent and grandparent not in drive_names:
                result['artist'] = grandparent
                result['album'] = parent
                # Title from filename
                title_match = re.match(r'^\d+\s*[-.]?\s*(.+)$', fname)
                if title_match:
                    result['title'] = title_match.group(1).strip()

        # Pattern 3: Bare artist folder at drive root (e.g., DJMOOD/Caribou/track.wav)
        # The folder is just an artist name, no album subfolder
        if not result.get('artist'):
            # Use folder name as artist, try to get album from filename
            if parent and parent not in drive_names and parent not in ('', '/', '.'):
                result['artist'] = parent
                # Try filename patterns like "Artist - Album - Track" or just use artist as album
                file_album_match = re.match(r'^.+?-\s*(.+?)\s*-\s*\d+', fname)
                if file_album_match:
                    result['album'] = file_album_match.group(1).strip()
                # Title from filename
                title_match = re.match(r'^\d+\s*[-.]?\s*(.+)$', fname)
                if title_match:
                    result['title'] = title_match.group(1).strip()

    return result


def read_metadata(filepath):
    """Read artist/album/title metadata for a single audio file.

    Dispatches to the tag reader matching the extension, then fills any
    still-missing fields from the folder/filename structure — the only
    source for WAV/AIFF, which get no tag reader here.
    """
    path = Path(filepath)

    # macOS resource-fork sidecar files carry no real audio metadata
    if path.name.startswith('._'):
        return {}

    readers = {
        '.mp3': read_id3v2,
        '.flac': read_flac_tags,
        '.ogg': read_ogg_tags,
        '.oga': read_ogg_tags,
        '.m4a': read_m4a_tags,
        '.aac': read_m4a_tags,
        '.mp4': read_m4a_tags,
        '.alac': read_m4a_tags,
    }
    reader = readers.get(path.suffix.lower())
    meta = reader(filepath) if reader else {}

    # Tag data wins; path-derived values only fill the gaps
    if not meta.get('artist') or not meta.get('album'):
        from_path = parse_from_path(filepath)
        for field in ('artist', 'album', 'title'):
            if not meta.get(field) and from_path.get(field):
                meta[field] = from_path[field]

    return meta


# ─────────────────────────────────────────────
# Fuzzy matching
# ─────────────────────────────────────────────

def normalize(s):
    """Canonicalize a string for fuzzy comparison.

    Lowercases, strips AOTY bracket IDs ('[Album2854]'), drops
    edition/remaster noise, removes a leading 'the', strips
    punctuation, and collapses whitespace runs.
    """
    text = s.strip().lower()
    cleanups = (
        (r'\s*\[(album|artist)\d+\]', ''),  # AOTY bracket IDs
        (r'\s*[\(\[](deluxe|remaster|remastered|expanded|anniversary|bonus|special)\s*(edition|version)?[\)\]]', ''),
        (r'^the\s+', ''),                   # leading article
        (r'[^\w\s]', ''),                   # punctuation
        (r'\s+', ' '),                      # collapse whitespace
    )
    for pattern, replacement in cleanups:
        text = re.sub(pattern, replacement, text, flags=re.IGNORECASE)
    return text.strip()


def levenshtein_ratio(s1, s2):
    """Similarity score 0-100 based on Levenshtein edit distance.

    Two empty strings score 100; exactly one empty scores 0.  As a
    cheap pre-filter, pairs whose length ratio falls below 0.4 score
    0 without running the full dynamic program.
    """
    if not s1 and not s2:
        return 100
    if not s1 or not s2:
        return 0

    n, m = len(s1), len(s2)
    if min(n, m) / max(n, m) < 0.4:
        return 0

    # Rolling-row dynamic program for edit distance
    previous = list(range(m + 1))
    for i, ch1 in enumerate(s1, start=1):
        current = [i]
        for j, ch2 in enumerate(s2, start=1):
            substitute = previous[j - 1] + (ch1 != ch2)
            current.append(min(current[j - 1] + 1, previous[j] + 1, substitute))
        previous = current

    distance = previous[m]
    return int(round((1 - distance / max(n, m)) * 100))


def match_score(aoty_artist, aoty_album, file_artist, file_album):
    """Weighted 0-100 similarity between an AOTY entry and file metadata.

    Artist similarity counts 60%, album 40% — the right artist with a
    slightly-off album title is a stronger signal than the reverse.
    """
    artist_sim = levenshtein_ratio(normalize(aoty_artist), normalize(file_artist))
    album_sim = levenshtein_ratio(normalize(aoty_album), normalize(file_album))
    return int(artist_sim * 0.6 + album_sim * 0.4)


# ─────────────────────────────────────────────
# AOTY data loader
# ─────────────────────────────────────────────

def load_aoty_data(clawd_root):
    """Load every AOTY list JSON from the known locations.

    Looks in imports/aoty/syrrosis first, then the archived copy; a
    year already loaded from the primary directory is skipped in the
    archive.  Returns a flat list of album dicts (artist, album,
    raw_title, year, rank, list, source_file).
    """
    entries = []

    primary_dir = os.path.join(clawd_root, 'imports', 'aoty', 'syrrosis')
    archive_dir = os.path.join(
        clawd_root, 'archive', 'raw', 'imports', '2023-11', 'imports', 'aoty', 'syrrosis')

    years_loaded = set()

    for directory in (primary_dir, archive_dir):
        if not os.path.isdir(directory):
            continue
        for json_name in sorted(os.listdir(directory)):
            if not json_name.endswith('.json'):
                continue
            # The list's year is embedded in the filename
            year_hit = re.search(r'(\d{4})', json_name)
            year = year_hit.group(1) if year_hit else 'unknown'

            # Primary dir wins for duplicate years
            if directory == archive_dir and year in years_loaded:
                continue

            try:
                with open(os.path.join(directory, json_name), 'r') as fh:
                    payload = json.load(fh)
            except Exception:
                continue

            list_label = payload.get('list_name', json_name)

            for item in payload.get('items', []):
                artist = str(item.get('artist', '')).strip()
                raw_title = str(item.get('title', '')).strip()

                if not artist:
                    continue

                # Strip AOTY bracket IDs from the title for display
                display_title = re.sub(
                    r'\s*\[(Album|Artist)\d+\]', '', raw_title, flags=re.IGNORECASE).strip()

                entries.append({
                    'artist': artist,
                    'album': display_title or '(unknown)',
                    'raw_title': raw_title,
                    'year': year,
                    'rank': item.get('order', 999),
                    'list': list_label,
                    'source_file': json_name,
                })
                # A year counts as covered once at least one entry landed
                years_loaded.add(year)

    return entries


# ─────────────────────────────────────────────
# File scanner
# ─────────────────────────────────────────────

AUDIO_EXTENSIONS = {'.mp3', '.flac', '.m4a', '.aac', '.ogg', '.oga', '.mp4', '.alac', '.wav', '.aif', '.aiff'}

def scan_music_folders(folders, verbose=False):
    """Walk the given folders and collect metadata for every audio file.

    Returns a list of dicts (path, artist, album, title, size_mb).
    Progress and warnings go to stderr; files yielding no artist/album
    are counted as skipped, unreadable ones as errors.
    """
    collected = []
    no_tag_count = 0
    error_count = 0

    for raw_folder in folders:
        folder = os.path.expanduser(raw_folder)
        if not os.path.isdir(folder):
            print(f"  WARNING: Folder not found: {folder}", file=sys.stderr)
            continue

        print(f"  Scanning: {folder}", file=sys.stderr)
        folder_hits = 0
        for root, dirs, filenames in os.walk(folder):
            # Prune hidden directories from the walk in place
            dirs[:] = [d for d in dirs if not d.startswith('.')]
            for fname in filenames:
                if Path(fname).suffix.lower() not in AUDIO_EXTENSIONS:
                    continue

                fpath = os.path.join(root, fname)
                try:
                    meta = read_metadata(fpath)
                    if meta.get('artist') or meta.get('album'):
                        collected.append({
                            'path': fpath,
                            'artist': meta.get('artist', ''),
                            'album': meta.get('album', ''),
                            'title': meta.get('title', ''),
                            'size_mb': round(os.path.getsize(fpath) / (1024 * 1024), 1),
                        })
                        folder_hits += 1
                    else:
                        no_tag_count += 1
                        if verbose:
                            print(f"    No tags: {fpath}", file=sys.stderr)
                except Exception as e:
                    error_count += 1
                    if verbose:
                        print(f"    Error: {fpath}: {e}", file=sys.stderr)

        print(f"    Found {folder_hits} tagged audio files", file=sys.stderr)

    print(f"  Total: {len(collected)} files, {no_tag_count} skipped (no tags), {error_count} errors", file=sys.stderr)
    return collected


# ─────────────────────────────────────────────
# Matching engine
# ─────────────────────────────────────────────

def build_file_album_index(files):
    """Bucket scanned files by their normalized (artist, album) pair.

    Matching then runs once per unique album key rather than per file.
    """
    buckets = defaultdict(list)
    for entry in files:
        buckets[(normalize(entry['artist']), normalize(entry['album']))].append(entry)
    return buckets


def match_library(aoty_albums, files, threshold=80, verbose=False):
    """Match AOTY albums against local files.

    For each AOTY entry, finds the best-scoring (artist, album) key
    among the scanned files.  Entries scoring >= threshold land in
    'matched' when at least 3 files belong to the album (a plausible
    complete album) or 'partial' otherwise; the rest are 'missing'.

    Returns (matched, partial, missing) lists of album dicts.
    """
    file_index = build_file_album_index(files)
    file_keys = list(file_index.keys())

    matched = []
    partial = []
    missing = []

    for aoty in aoty_albums:
        best_score = 0
        best_key = None

        # File keys are already normalized; match_score normalizes both
        # sides again, which is harmless because normalize() is idempotent.
        for fk_artist, fk_album in file_keys:
            score = match_score(aoty['artist'], aoty['album'], fk_artist, fk_album)
            if score > best_score:
                best_score = score
                best_key = (fk_artist, fk_album)

        if best_score >= threshold and best_key:
            matched_files = file_index[best_key]
            entry = {
                **aoty,
                'match_score': best_score,
                'matched_files': len(matched_files),
                'total_size_mb': round(sum(f['size_mb'] for f in matched_files), 1),
                'sample_path': matched_files[0]['path'],
            }
            # Fewer than 3 files is unlikely to be a complete album
            if len(matched_files) >= 3:
                matched.append(entry)
            else:
                partial.append(entry)

            if verbose:
                print(f"  MATCH ({best_score}%): {aoty['artist']} — {aoty['album']} [{aoty['year']} #{aoty['rank']}] → {len(matched_files)} files", file=sys.stderr)
        else:
            missing.append(aoty)
            if verbose:
                print(f"  MISS  ({best_score}%): {aoty['artist']} — {aoty['album']} [{aoty['year']} #{aoty['rank']}]", file=sys.stderr)

    return matched, partial, missing


# ─────────────────────────────────────────────
# Report generator
# ─────────────────────────────────────────────

def decade_of(year_str):
    """Map a year string like '1987' to its decade label ('1980s').

    Non-numeric input (e.g. 'unknown') maps to 'unknown'.
    """
    try:
        decade_start = (int(year_str) // 10) * 10
    except ValueError:
        return "unknown"
    return f"{decade_start}s"


def generate_report(aoty_albums, matched, partial, missing):
    """Generate a human-readable report and structured JSON audit.

    Args:
        aoty_albums: all (deduplicated) AOTY entries.
        matched, partial, missing: the lists from match_library().

    Returns:
        (report_text, audit_dict) — the text report and the JSON payload.
    """
    total = len(aoty_albums)
    n_matched = len(matched)
    n_partial = len(partial)
    n_missing = len(missing)
    n_available = n_matched + n_partial

    def pct(n):
        # Integer percentage of all AOTY albums; 0 when the list is
        # empty so the report never divides by zero.
        return n * 100 // total if total else 0

    # Decade breakdown
    decade_total = defaultdict(int)
    decade_matched = defaultdict(int)
    for a in aoty_albums:
        decade_total[decade_of(a['year'])] += 1
    for m in matched + partial:
        decade_matched[decade_of(m['year'])] += 1

    # Estimate total playback time (~3.5 min avg per track, ~10 tracks per album)
    est_tracks = n_available * 10
    est_hours = round(est_tracks * 3.5 / 60, 0)

    # Missing #1 picks
    missing_no1 = [m for m in missing if m['rank'] == 1]

    # Missing Top 5 picks
    missing_top5 = [m for m in missing if m['rank'] <= 5]

    # Artist frequency across all lists
    artist_counts = defaultdict(int)
    for a in aoty_albums:
        artist_counts[a['artist']] += 1
    top_artists = sorted(artist_counts.items(), key=lambda x: -x[1])[:20]

    # Which top artists are fully missing?
    matched_artists = set(m['artist'] for m in matched + partial)
    missing_top_artists = [(a, c) for a, c in top_artists if a not in matched_artists]

    # Total library size
    total_size_gb = round(sum(m.get('total_size_mb', 0) for m in matched + partial) / 1024, 1)

    # Build report text
    lines = []
    lines.append("=" * 60)
    lines.append("  MOODHONEY RADIO — LIBRARY AUDIT")
    lines.append(f"  {datetime.now().strftime('%Y-%m-%d %H:%M')}")
    lines.append("=" * 60)
    lines.append("")
    lines.append(f"  Total AOTY albums:     {total:,}")
    lines.append(f"  Matched locally:       {n_matched:,} ({pct(n_matched)}%)")
    lines.append(f"  Partial match:         {n_partial:,} ({pct(n_partial)}%)")
    lines.append(f"  Missing:               {n_missing:,} ({pct(n_missing)}%)")
    lines.append("")
    lines.append(f"  Estimated library:     ~{est_tracks:,} tracks / ~{int(est_hours)} hours")
    lines.append(f"  Library size on disk:  ~{total_size_gb} GB")
    lines.append("")
    lines.append("─" * 60)
    lines.append("  DECADE BREAKDOWN")
    lines.append("─" * 60)
    for decade in sorted(decade_total.keys()):
        dt = decade_total[decade]
        dm = decade_matched.get(decade, 0)
        pct_d = dm * 100 // dt if dt > 0 else 0
        bar = "█" * (pct_d // 5) + "░" * (20 - pct_d // 5)
        lines.append(f"  {decade:>7s}:  {dm:>3}/{dt:<3}  {bar}  {pct_d}%")
    lines.append("")

    if missing_no1:
        lines.append("─" * 60)
        lines.append("  MISSING #1 PICKS (highest priority)")
        lines.append("─" * 60)
        for m in sorted(missing_no1, key=lambda x: x['year']):
            lines.append(f"  {m['year']} #1: {m['artist']} — {m['album']}")
        lines.append("")

    if missing_top_artists:
        lines.append("─" * 60)
        lines.append("  MISSING TOP ARTISTS (appear most across your lists)")
        lines.append("─" * 60)
        for artist, count in missing_top_artists[:10]:
            lines.append(f"  {artist} ({count} appearances)")
        lines.append("")

    lines.append("─" * 60)
    lines.append("  YOUR LAUNCH LIBRARY — WHAT'S READY")
    lines.append("─" * 60)
    lines.append("")
    if n_available >= 100:
        lines.append(f"  You have {n_available} albums matched — that's a solid launch.")
        lines.append(f"  ~{int(est_hours)} hours of non-repeating music.")
        lines.append("  That's enough for several days of radio before anything repeats.")
    elif n_available >= 50:
        lines.append(f"  You have {n_available} albums matched — a decent starting point.")
        lines.append(f"  ~{int(est_hours)} hours of music. The station will feel a bit tight")
        lines.append("  at first but grows quickly as you add albums.")
    else:
        lines.append(f"  You have {n_available} albums matched — enough to test with.")
        lines.append("  You'll want to add more before sharing with the WhatsApp group.")
    lines.append("")

    lines.append("─" * 60)
    lines.append("  TOP MATCHED ALBUMS (by rank)")
    lines.append("─" * 60)
    best_matches = sorted(matched + partial, key=lambda x: (x['rank'], x['year']))[:30]
    for m in best_matches:
        lines.append(f"  {m['year']} #{m['rank']:>2}: {m['artist']} — {m['album']} ({m['matched_files']} files, {m['match_score']}% match)")
    lines.append("")

    lines.append("─" * 60)
    lines.append("  NEXT ALBUMS TO BUY (Top 5 per year, currently missing)")
    lines.append("─" * 60)
    buy_list = sorted(missing_top5, key=lambda x: (x['rank'], x['year']))[:30]
    for m in buy_list:
        lines.append(f"  {m['year']} #{m['rank']}: {m['artist']} — {m['album']}")
    lines.append("")

    lines.append("=" * 60)

    report_text = "\n".join(lines)

    # Build JSON output
    audit = {
        'generated': datetime.now().isoformat(),
        'summary': {
            'total_aoty_albums': total,
            'matched': n_matched,
            'partial': n_partial,
            'missing': n_missing,
            'estimated_tracks': est_tracks,
            'estimated_hours': int(est_hours),
            'library_size_gb': total_size_gb,
        },
        'decade_breakdown': {
            decade: {
                'total': decade_total[decade],
                'matched': decade_matched.get(decade, 0),
            }
            for decade in sorted(decade_total.keys())
        },
        'matched_albums': sorted(matched, key=lambda x: (x['year'], x['rank'])),
        'partial_albums': sorted(partial, key=lambda x: (x['year'], x['rank'])),
        'missing_albums': sorted(missing, key=lambda x: (x['year'], x['rank'])),
        'missing_number_ones': sorted(missing_no1, key=lambda x: x['year']),
        'buy_list_top5': sorted(missing_top5, key=lambda x: (x['rank'], x['year'])),
        'artist_frequency': dict(top_artists),
    }

    return report_text, audit


# ─────────────────────────────────────────────
# Main
# ─────────────────────────────────────────────

def main():
    """Command-line entry point: parse args, scan folders, match, report."""
    args = sys.argv[1:]

    # Parse flags (tiny hand-rolled parser keeps the script dependency-free)
    threshold = 80
    output_path = None
    verbose = False
    folders = []

    i = 0
    while i < len(args):
        if args[i] == '--threshold' and i + 1 < len(args):
            threshold = int(args[i+1])
            i += 2
        elif args[i] == '--output' and i + 1 < len(args):
            output_path = args[i+1]
            i += 2
        elif args[i] == '--verbose':
            verbose = True
            i += 1
        elif args[i] == '--help':
            print(__doc__)
            sys.exit(0)
        else:
            folders.append(args[i])
            i += 1

    if not folders:
        print("Usage: python3 moodhoney_library_scanner.py FOLDER1 [FOLDER2 ...]", file=sys.stderr)
        print("       python3 moodhoney_library_scanner.py ~/Music /Volumes/External/Albums", file=sys.stderr)
        print("       python3 moodhoney_library_scanner.py --help", file=sys.stderr)
        sys.exit(1)

    # Find clawd root (script is in clawd/scripts/)
    script_dir = os.path.dirname(os.path.abspath(__file__))
    clawd_root = os.path.dirname(script_dir)

    if not output_path:
        output_path = os.path.join(clawd_root, 'library-audit.json')

    print("\n🎵 Moodhoney Library Scanner", file=sys.stderr)
    print("─" * 40, file=sys.stderr)

    # Load AOTY data
    print("\n📋 Loading AOTY lists...", file=sys.stderr)
    aoty_albums = load_aoty_data(clawd_root)
    print(f"  Loaded {len(aoty_albums)} albums across {len(set(a['year'] for a in aoty_albums))} years", file=sys.stderr)

    # Deduplicate by (artist, album) — keep the best (lowest) rank
    seen = {}
    for a in aoty_albums:
        key = (normalize(a['artist']), normalize(a['album']))
        if key not in seen or a['rank'] < seen[key]['rank']:
            seen[key] = a
    unique_albums = list(seen.values())
    print(f"  Unique albums (after dedup): {len(unique_albums)}", file=sys.stderr)

    # Bail out early: matching and reporting need at least one album
    if not unique_albums:
        print("\n  No AOTY albums loaded. Check the imports/aoty/syrrosis folder.", file=sys.stderr)
        sys.exit(1)

    # Scan music folders
    print(f"\n🔍 Scanning music folders...", file=sys.stderr)
    files = scan_music_folders(folders, verbose=verbose)

    if not files:
        print("\n  No audio files found. Check your folder paths.", file=sys.stderr)
        sys.exit(1)

    # Match
    print(f"\n🎯 Matching against AOTY lists (threshold: {threshold}%)...", file=sys.stderr)
    matched, partial, missing = match_library(unique_albums, files, threshold=threshold, verbose=verbose)

    # Report
    report_text, audit = generate_report(unique_albums, matched, partial, missing)

    # Print report to stdout
    print("\n")
    print(report_text)

    # Save JSON
    with open(output_path, 'w') as f:
        json.dump(audit, f, indent=2, default=str)
    print(f"\n📁 Full audit saved to: {output_path}", file=sys.stderr)

    # Also save a plain-text report next to the JSON.  Using splitext
    # (rather than str.replace('.json', ...)) guarantees we never write
    # the text report over the JSON file when --output lacks a '.json'
    # suffix, and never mangles a '.json' appearing mid-path.
    report_path = os.path.splitext(output_path)[0] + '-report.txt'
    with open(report_path, 'w') as f:
        f.write(report_text)
    print(f"📄 Report saved to: {report_path}", file=sys.stderr)


if __name__ == '__main__':
    main()
