Spaces:
Running
Running
| """Maps taxonomy dimensions to current inference-pipeline coverage. | |
| Answers: which of the 12 taxonomy dimensions the current MAEST, MuQ, PaSST, | |
| taxonomy-adapter, lyrics, rhythm, and GT-review layers produce tags for, and | |
| how completely. Static map because the answer depends on the pipeline | |
| architecture, not the per-track data. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass | |
| from typing import Literal | |
# Closed set of coverage states a taxonomy dimension can be in. The labels used
# elsewhere in this module read: "covered" -> Covered, "partial" -> Partial,
# "none" -> No inference, "expert" -> Expert-only.
CoverageStatus = Literal[
    "covered",
    "partial",
    "none",
    "expert",
]
@dataclass
class DimensionCoverage:
    """Coverage record for a single taxonomy dimension.

    Entries are constructed positionally as
    ``DimensionCoverage(status, sources, note)``, so the class must be a
    dataclass: the generated ``__init__`` is what accepts those arguments.
    Without the decorator the class only carries bare annotations and every
    construction raises ``TypeError``.
    """

    # One of the CoverageStatus literals ("covered", "partial", "none", "expert").
    status: CoverageStatus
    # Identifiers of the layers that produce tags for the dimension
    # (e.g. "maest/genre"); may be empty.
    sources: list[str]
    # Free-text explanation of the coverage state.
    note: str
# Static coverage map keyed by taxonomy dimension number (1-12). Each entry
# records the coverage status, the layers producing tags, and an explanatory
# note. The per-entry header comments below paraphrase the entry's own note.
DIMENSION_COVERAGE: dict[int, DimensionCoverage] = {
    # 1 - genre families
    1: DimensionCoverage(
        status="covered",
        sources=[
            "maest/genre",
            "muq-probe/genre",
            "passt-probe/genre",
            "taxonomy-adapter/genre",
        ],
        note="MAEST keeps a Discogs baseline while MuQ, PaSST, and the taxonomy adapter now provide the Turkish taxonomy-facing genre families used in the track explorer.",
    ),
    # 2 - subtype-oriented genre terms
    2: DimensionCoverage(
        status="covered",
        sources=["muq-probe/genre", "passt-probe/genre", "taxonomy-adapter/genre"],
        note="Expanded MuQ/PaSST probes and the taxonomy adapter provide the subtype-oriented Turkish genre terms surfaced for review.",
    ),
    # 3 - yöre
    3: DimensionCoverage(
        status="partial",
        sources=["groundtruth/web-research"],
        note="Yöre — populated via Tier 4 web research pipeline (Wikipedia TR action=query, Discogs v2 API + release credits, MusicBrainz, YouTube → DeepSeek extraction against taxonomy controlled vocab). Full catalog (68/68 records on disk; one is 005_benim_icin_uzulme which exists as GT-only after the YouTube DMCA gap). Yöre populated on 4/68 tracks (güneydoğu 075, roman 082, urban-ankara 087, iç-anadolu 069) — the rest correctly null because the title carries no deterministic yöre cue and sources don't explicitly assert one. Largest residual gap in the taxonomy; needs Aran/Murat expert pass for rural-folk tracks where the source web doesn't disambiguate. Ground truth lives in sync_pilot/data/groundtruth/median/<track_id>.json (visible per-track on the Tracks page).",
    ),
    # 4 - BPM and meter (aksak hint)
    4: DimensionCoverage(
        status="covered",
        sources=["librosa/beat_track", "librosa/tempogram"],
        note=(
            "Tier 3: librosa.beat.beat_track for BPM + tempogram-based aksak hint for meter. "
            "Catalog distribution: 55/67 tracks 4/4, 11/67 flagged 9/8 aksak candidate, 1/67 uncertain (silent / no onsets). "
            "Aksak classification is exploratory — librosa alone cannot reliably distinguish 9/8 from 7/8 or 5/8 from a single waveform, "
            "so the heuristic confidence ceiling for non-4/4 calls is capped at 0.49. Catalog confidence distribution: "
            "0/67 above 0.7, 5/67 in [0.5, 0.7], 62/67 below 0.5 (flagged for expert spot-check). The dashboard caption reads aksak "
            "meters as '9/8 aksak candidate' rather than as definitive calls. A future Tier 3.5 pass with madmom or a dedicated "
            "aksak model could refine."
        ),
    ),
    # 5 - makam (expert-only, no automated sources)
    5: DimensionCoverage(
        status="expert",
        sources=[],
        note="Makam — intentionally expert-only per taxonomy. Only Aran or a session musician should populate.",
    ),
    # 6 - instrumentation
    6: DimensionCoverage(
        status="covered",
        sources=["muq-probe/instrument", "passt-probe/instrument", "taxonomy-adapter/instrument"],
        note="Instrumentation is now grounded primarily by promoted MuQ/PaSST probe heads plus the taxonomy adapter. GT review can override these into source-verified instrumentation.",
    ),
    # 7 - vocal configuration and technique
    7: DimensionCoverage(
        status="covered",
        sources=["muq-probe/vocal", "taxonomy-adapter/vocal", "review-sidecar/vocal"],
        note="Vocal configuration and technique candidates come from MuQ probes and the taxonomy adapter, with reviewer selections persisted in the GT-review sidecar.",
    ),
    # 8 - lyric themes
    8: DimensionCoverage(
        status="covered",
        sources=["lyrics-llm"],
        note="Themes extracted from Whisper lyrics via DeepSeek against the taxonomy's 22 controlled theme terms (gurbet, aşk-acısı, kabadayı, isyan, dini, etc.). Tier 2: 19 distinct themes fire across the 67-track catalog (top: aşk-acısı 37, pişmanlık 26, ayrılık 22, aşk 22, kader 21). 59/67 tracks carry at least one theme tag; 8 tracks have no themes (short/instrumental lyrics or empty after subtitle-strip).",
    ),
    # 9 - mood
    9: DimensionCoverage(
        status="partial",
        sources=["muq-probe/mood", "review-sidecar/mood"],
        note="Mood candidates are currently MuQ-probe driven and can be accepted or corrected in GT review. Coverage is useful but still needs human review because mood vocabulary is subjective.",
    ),
    # 10 - era
    10: DimensionCoverage(
        status="partial",
        sources=["groundtruth/web-research"],
        note="Era — populated via Tier 4 web research. Full catalog: 54/68 tracks have era populated (14 high + 40 medium confidence), 14 uncertain (anonymous/multi-recording titles). Era is deterministically derived from year via a post-extract correction (year < 1970 → pre-1970; 1970–1979 → 1970-1980; …; ≥2020 → 2020-plus) so internal consistency is machine-guaranteed. Catalog era distribution: 1980-1990 leads (15), then 1990-2000 (8), 2000-2010 (7), 1970-1980 (6), 2010-2020 (5), 2020-plus (1).",
    ),
    # 11 - tempo band and energy
    11: DimensionCoverage(
        status="covered",
        sources=["librosa/beat_track", "librosa/rms"],
        note=(
            "Tier 3: librosa BPM + RMS energy. Tempo band per taxonomy controlled vocab "
            "(yavaş <80 BPM, orta 80–120 BPM, hızlı >120 BPM). Catalog distribution: orta 45/67, hızlı 21/67, yavaş 1/67. "
            "Mean BPM 112.9 ± 21.9; one outlier above 160 (161.5 BPM on 090_ilk_ogretmen), one BPM=0 sentinel on the "
            "silent/onset-free 088_kara_sevda track. Energy bands: high 44/67, medium 21/67, low 2/67. "
            "Unblocks the description LLM to ground 'mid-tempo'/'yavaş'/'hızlı' claims on a real signal instead of "
            "hallucinating them — Tier 3 sets up the v0.7 description rerun."
        ),
    ),
    # 12 - production arrangement / recording context
    12: DimensionCoverage(
        status="partial",
        sources=["groundtruth/web-research"],
        note="Production arrangement/context — Tier 4 uses arrangement_aesthetic (orchestral-arabesk, electric-anadolu-rock, modern-pop-production, gazino-fantezi, bare-bones-halk) and recording_context (studio, live-recording, lo-fi-cassette, broadcast).",
    ),
}
# One row per coverage status: (status key, icon, display label). Both lookup
# tables below are derived from these rows so their key sets cannot drift apart.
_STATUS_PRESENTATION: tuple[tuple[CoverageStatus, str, str], ...] = (
    ("covered", "✅", "Covered"),
    ("partial", "🟡", "Partial"),
    ("none", "⛔", "No inference"),
    ("expert", "🔒", "Expert-only"),
)

# Icon string for each coverage status.
STATUS_ICON: dict[CoverageStatus, str] = {
    status: icon for status, icon, _label in _STATUS_PRESENTATION
}

# Display label for each coverage status.
STATUS_LABEL: dict[CoverageStatus, str] = {
    status: label for status, _icon, label in _STATUS_PRESENTATION
}
def summary_counts() -> dict[str, int]:
    """Tally the entries of DIMENSION_COVERAGE by status (page-header rollup).

    Returns:
        A dict with the keys "covered", "partial", "none" and "expert", in
        that order, each mapped to the number of dimensions carrying that
        status. Every key is present even when its count is zero.

    Raises:
        KeyError: if an entry carries a status outside those four keys.
    """
    # Seed all four statuses with zero so absent ones still show up in the result.
    tally: dict[str, int] = dict.fromkeys(("covered", "partial", "none", "expert"), 0)
    for entry in DIMENSION_COVERAGE.values():
        tally[entry.status] += 1
    return tally