"""Download the sampled ambience beds from Wikimedia Commons.

A no-GPU alternative to make_ambience.py: instead of generating the seven
sampled beds, this pulls real field recordings from Wikimedia Commons
(public-domain / CC-licensed), trims each to a steady ~30 s loop, and writes
mono 16-bit wavs into assets/ambience/ — the format ambience.py expects.

It auto-selects: for each slug it searches Commons, drops obvious junk
(alarms, music, traffic…) by keyword, then downloads candidates in turn and
measures them, keeping the first that is long enough, not near-silent and not a pure tone.
Provenance + licence for every pick is written to assets/ambience/CREDITS.md
so attribution can be honored when the Space ships.

Usage:
    uv pip install soundfile          # bundles libsndfile (ogg/mp3/flac/wav)
    python scripts/fetch_ambience.py                 # fill in what's missing
    python scripts/fetch_ambience.py ocean_waves --force
"""

import argparse
import io
import json
import re
import sys
import time
import unicodedata
import urllib.parse
import urllib.request
import wave
from pathlib import Path

import numpy as np

# Filesystem layout: ROOT is two directory levels above this file; the wavs
# and the credits files are written under ROOT/assets/ambience.
ROOT = Path(__file__).resolve().parent.parent
OUT_DIR = ROOT / "assets" / "ambience"
# Commons API endpoint, and the User-Agent header sent with every request
# (both API calls and file downloads).
API = "https://commons.wikimedia.org/w/api.php"
UA = "LoFinity/0.1 (lofi hackathon ambience fetcher; https://huggingface.co/spaces)"

# Selection limits. Durations are in seconds, MAX_BYTES in bytes, MAX_RATE in Hz.
TARGET_S = 30.0          # loop length we keep == default song length, so a bed
                         # this long tiles to a 30 s song with zero seams
MIN_SRC_DUR = 8.0        # too short to be useful ambience
MAX_SRC_DUR = 400.0      # skip anything longer (podcasts, mixes)
MAX_BYTES = 30_000_000   # don't pull giant wavs
MAX_RATE = 32000         # cap stored rate (== musicgen rate); keeps files small

# How to find each bed: a list of probes whose results are unioned. Commons
# search ANDs every word in a probe, so each probe stays 1-2 words; more
# probes = more candidates to fall back through. ("category", name) lists a
# curated category; ("search", terms) is a File-namespace full-text search.
# find_titles() runs the probes in the order written here: a category probe
# asks for up to 30 members, a search probe prefixes "filetype:audio" and asks
# for up to 15 hits. The keys also double as the CLI's accepted slug names and
# as the default fetch order in main().
SOURCES = {
    "soft_rain": [("category", "Sounds of rain"), ("search", "rain ambience")],
    "ocean_waves": [("search", "ocean waves"), ("search", "sea waves"),
                    ("search", "surf beach")],
    "fireplace_crackle": [("search", "campfire"), ("search", "fireplace"),
                          ("search", "fire crackling")],
    "birdsong": [("search", "birdsong"), ("search", "dawn chorus"),
                 ("search", "birds chirping")],
    "night_crickets": [("search", "crickets"), ("search", "cricket chirping"),
                       ("search", "cicada")],
    "wind_in_trees": [("search", "wind trees"), ("search", "wind forest"),
                      ("search", "wind leaves")],
    "cafe_murmur": [("search", "restaurant ambience"), ("search", "cafe ambience"),
                    ("search", "crowd murmur")],
}

# Hand-vetted Commons files tried before falling back to search — auto-selection
# can't judge "continuous dawn chorus" vs "one repetitive cuckoo", so the good
# picks found during development are pinned here. A pinned title is placed
# first in fetch_one()'s candidate list and still has to pass the measured
# gates there (duration, loudness, spectral flatness), so a renamed/deleted
# file just falls through to search. Note that it is NOT run through the
# BLOCKLIST / RELEVANCE title filters: those are applied only to search
# results inside find_titles().
PREFERRED = {
    "soft_rain": "File:Lluvia en techo de lamina.wav",
    "ocean_waves": "File:Sea waves.wav",
    "fireplace_crackle": "File:WWS Fireoftheforge.ogg",
    "birdsong": "File:Birds singing in Fribourg 01.ogg",
    "night_crickets": "File:Black-Prince-Cicada- Psaltoda-plaga.wav",
    "wind_in_trees": "File:Wind in forest (Gravity Sound).wav",
    "cafe_murmur": "File:Shopping mall less crowded.ogg",
}

# Title contains any of these (lowercased) -> not ambience, skip it. This is
# what keeps "fire" from returning fire *alarms*, "sea" from podcasts, and
# "waves" from sine-wave test tones.
# Matching is a plain substring test against the accent-stripped, lowercased
# title (see find_titles), so short entries also hit longer words.
# NOTE(review): "war" therefore rejects every title containing "warbler"
# (a birdsong RELEVANCE keyword), and "tone" rejects "stone" — confirm this
# is acceptable or tighten the entries.
BLOCKLIST = (
    "alarm", "podcast", "episode", "interview", "speech", "talk", "lecture",
    "music", "song -", "band", "orchestra", "anthem", "hymn", "vocal", "choir",
    "dance", "ritual", "march", "siren", "horn", "traffic", "tram", "engine",
    "motor", "gun", "explosion", "war", "radio", "national", "voice", "demo",
    "sine", "tone", "hz", "sweep", "beep", "dtmf", "calibration", "signal",
    "woodwind", "clarinet", "flute", "accordion", "instrument", "guitar",
)

# A search result's title must contain one of these (accent-stripped) — a sound
# actually related to the slug. Multilingual because Commons is international.
# Entries are matched as substrings of the normalized title (see find_titles),
# which is why stems such as "crackl", "cicad", "rustl" and "ambien" work; the
# same rule means short entries ("mar", "bar", "sea", "vent") also match
# inside unrelated words.
# NOTE(review): "warbler" can never take effect while BLOCKLIST contains
# "war", because the blocklist is checked first.
RELEVANCE = {
    "soft_rain": ("rain", "lluvia", "regen", "pluie", "pioggia", "chuva",
                  "downpour", "drizzle", "storm"),
    "ocean_waves": ("ocean", "wave", "sea", "surf", "beach", "mar", "ola",
                    "vague", "welle", "tide", "shore", "playa", "costa"),
    "fireplace_crackle": ("fire", "campfire", "fireplace", "crackl", "crepit",
                          "feu", "fuego", "hoguera", "fogata", "ember", "hearth"),
    "birdsong": ("bird", "song", "chorus", "dawn", "chirp", "cuckoo", "wren",
                 "sparrow", "robin", "blackbird", "finch", "warbler", "thrush",
                 "nightingale", "lark", "vogel", "oiseau", "pajaro", "canto"),
    "night_crickets": ("cricket", "cicada", "cicad", "cigarra", "grasshopper",
                       "grillo", "grille", "katydid", "locust", "insect", "chirp"),
    "wind_in_trees": ("wind", "breeze", "gust", "rustl", "viento", "vent",
                      "howl", "gale", "brisa", "blowing"),
    "cafe_murmur": ("cafe", "restaurant", "crowd", "murmur", "coffee", "bar",
                    "pub", "chatter", "ambien", "mall", "station", "people",
                    "plaza", "market", "tunnel", "hall", "lobby", "gente"),
}


def _norm(s):
    """Lowercase + strip accents so 'pájaro'/'Pajaro' both match 'pajaro'."""
    s = unicodedata.normalize("NFKD", str(s))
    return "".join(c for c in s if not unicodedata.combining(c)).lower()


def commons_api(params, tries=5):
    """Call the Wikimedia Commons API and return the decoded JSON reply.

    ``format=json`` and ``formatversion=2`` are merged over *params* (they win
    on a key clash) and the request is sent as a GET with the script's
    User-Agent and a 30 s timeout.

    An HTTP 429 (rate limited) reply is retried, up to *tries* attempts in
    total. Before each retry the function sleeps for the larger of its own
    backoff (2 s, 4 s, 6 s, ...) and the server's ``Retry-After`` value when
    that header holds a whole number of seconds, capped at 60 s. Any other
    HTTP error, or a 429 on the final attempt, propagates to the caller.

    Returns ``{}`` only when *tries* is not positive, in which case no request
    is made.
    """
    # Named explicitly: the module only imports urllib.request, and relying on
    # that to bind urllib.error is an implementation detail.
    import urllib.error

    query = {**params, "format": "json", "formatversion": "2"}
    url = API + "?" + urllib.parse.urlencode(query)
    for attempt in range(tries):
        request = urllib.request.Request(url, headers={"User-Agent": UA})
        try:
            with urllib.request.urlopen(request, timeout=30) as reply:
                return json.load(reply)
        except urllib.error.HTTPError as err:
            if err.code != 429 or attempt == tries - 1:
                raise
            backoff = 2 * (attempt + 1)
            try:
                hinted = int(err.headers.get("Retry-After", ""))
            except (AttributeError, TypeError, ValueError):
                hinted = 0  # header absent, or an HTTP-date we don't parse
            # Wait at least our own backoff, honour a longer server hint, but
            # never stall the run for more than a minute per retry.
            time.sleep(min(max(backoff, hinted), 60))
    return {}


def find_titles(slug):
    """Return candidate Commons file titles for *slug*, in probe order.

    Every probe listed in ``SOURCES[slug]`` is sent to the API (with a 1 s
    pause after each one) and the hits are concatenated. The combined list is
    then de-duplicated keeping first occurrences, and a title survives only if
    its normalized form contains no ``BLOCKLIST`` entry and at least one
    ``RELEVANCE[slug]`` keyword.
    """
    gathered = []
    for kind, value in SOURCES[slug]:
        if kind == "category":
            request = {"action": "query", "list": "categorymembers",
                       "cmtitle": f"Category:{value}", "cmtype": "file",
                       "cmlimit": "30"}
            result_key = "categorymembers"
        else:
            request = {"action": "query", "list": "search", "srnamespace": "6",
                       "srsearch": f"filetype:audio {value}", "srlimit": "15"}
            result_key = "search"
        reply = commons_api(request)
        rows = reply.get("query", {}).get(result_key, [])
        gathered.extend([row["title"] for row in rows])
        time.sleep(1)  # stay polite between API calls

    wanted = RELEVANCE[slug]
    accepted = []
    already = set()
    for title in gathered:
        if title in already:
            continue
        folded = _norm(title)
        if any(bad in folded for bad in BLOCKLIST):
            continue  # obvious junk (alarms, music, test tones, ...)
        if not any(word in folded for word in wanted):
            continue  # nothing in the title ties it to this slug
        already.add(title)
        accepted.append(title)
    return accepted


def file_info(titles):
    """Look up download and attribution metadata for *titles*.

    Titles are queried in batches of 20 (one ``prop=imageinfo`` request per
    batch, followed by a 1 s pause). Returns a dict mapping title to::

        {"url", "dur", "mediatype", "license", "artist", "page"}

    where ``dur`` is the reply's ``duration`` as a float (0.0 when absent),
    ``license`` falls back to "?" and ``artist`` (HTML stripped) to "Unknown".
    Pages without imageinfo (e.g. missing files) get an entry with an empty
    ``url``.

    Entries are keyed by the title the API reports. When the API says it
    normalised a requested title (the ``normalized`` list of the reply), the
    entry is also stored under the title as requested, so a caller looking up
    its own spelling still finds it.
    """
    def meta_value(extmeta, key):
        # extmetadata entries look like {"value": ..., ...}; "" when missing
        return extmeta.get(key, {}).get("value", "")

    out = {}
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        info = commons_api({"action": "query", "titles": "|".join(batch),
                            "prop": "imageinfo",
                            "iiprop": "url|size|mediatype|extmetadata"})
        query = info.get("query", {})
        for page in query.get("pages", []):
            ii = (page.get("imageinfo") or [{}])[0]
            ext = ii.get("extmetadata", {})
            out[page.get("title", "?")] = {
                "url": ii.get("url", ""),
                "dur": float(ii.get("duration") or 0.0),
                "mediatype": ii.get("mediatype", ""),
                "license": meta_value(ext, "LicenseShortName") or "?",
                "artist": _strip_html(meta_value(ext, "Artist")) or "Unknown",
                "page": ii.get("descriptionurl", ""),
            }
        # Alias each normalised title back to the spelling that was asked for.
        for alias in query.get("normalized", []):
            asked, canonical = alias.get("from"), alias.get("to")
            if asked and canonical in out and asked not in out:
                out[asked] = out[canonical]
        time.sleep(1)
    return out


def _strip_html(s):
    return re.sub(r"<[^>]+>", "", s).strip()


def spectral_flatness(mono, rate):
    """Spectral flatness of the first ``rate * 4`` samples of *mono*.

    Flatness is the geometric mean of the power spectrum divided by its
    arithmetic mean: close to 0 for a pure tone, larger for broadband texture.
    It is the gate that catches test tones whose titles slip past the keyword
    filter (a 'Sine Wave' file is named much like a sea 'wave').

    Before the FFT the excerpt has its mean removed and is first-differenced
    (a simple high-pass), then Hann-windowed. The high-pass matters: crowd and
    surf recordings carry heavy low-frequency rumble that otherwise dominates
    the spectrum and looks falsely tonal (the original calibration notes real
    cafe audio at 2e-5 raw vs 1e-12 for a sine; after the high-pass they
    separate to 2e-3 vs 1e-12).

    Excerpts shorter than 256 samples are not analysed and score 1.0.
    """
    excerpt = mono[: rate * 4].astype(np.float64)
    if len(excerpt) < 256:
        return 1.0
    highpassed = np.diff(excerpt - excerpt.mean())
    windowed = highpassed * np.hanning(len(highpassed))
    # the 1e-12 floor keeps log() finite on empty bins
    power = np.abs(np.fft.rfft(windowed)) ** 2 + 1e-12
    geometric_mean = np.exp(np.mean(np.log(power)))
    arithmetic_mean = np.mean(power)
    return float(geometric_mean / arithmetic_mean)


def download(url):
    """Fetch *url* and return its body as bytes, refusing oversized files.

    The request carries the script's User-Agent and a 60 s timeout. A body
    larger than ``MAX_BYTES`` raises ``ValueError``: either up front, when the
    ``Content-Length`` header already says so, or after reading, when the
    server sent no (or an understated) length. At most ``MAX_BYTES + 1`` bytes
    are ever read, so an oversized body is detected without pulling all of it.
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=60) as r:
        declared = int(r.headers.get("Content-Length") or 0)
        if declared > MAX_BYTES:
            raise ValueError(f"too big ({declared / 1e6:.0f} MB)")
        blob = r.read(MAX_BYTES + 1)
    # The extra byte requested above is what tells a body of exactly MAX_BYTES
    # apart from one that was cut off; a truncated file must not be decoded.
    if len(blob) > MAX_BYTES:
        raise ValueError(f"too big (over {MAX_BYTES / 1e6:.0f} MB)")
    return blob


def decode_mono(blob):
    """Decode an in-memory audio file to ``(mono float64 samples, sample rate)``.

    soundfile (libsndfile) is tried first and the channels are averaged into
    one. If libsndfile reports it cannot read the data, decoding is retried
    through the PyAV-based fallback.
    """
    import soundfile as sf

    buffer = io.BytesIO(blob)
    try:
        samples, rate = sf.read(buffer, dtype="float64", always_2d=True)
    except sf.LibsndfileError:
        # Opus/other codecs libsndfile can't open
        return _decode_av(blob)
    return samples.mean(axis=1), rate


def _decode_av(blob):
    """Decode *blob* with PyAV and return ``(mono float64 samples, rate)``.

    Used as the fallback when libsndfile cannot open a file (the original
    author notes Ogg/Opus, common among Commons crowd/cafe recordings). The
    first audio stream is decoded and passed through a resampler that keeps
    the stream's own sample rate but converts to float, single-channel frames.

    Raises ``ValueError`` if decoding yields no frames.
    """
    import av

    pieces = []
    with av.open(io.BytesIO(blob)) as container:
        audio = container.streams.audio[0]
        rate = audio.codec_context.sample_rate
        to_mono = av.AudioResampler(format="flt", layout="mono", rate=rate)
        for frame in container.decode(audio):
            # one decoded frame may come back as several resampled frames
            pieces.extend([out.to_ndarray().reshape(-1) for out in to_mono.resample(frame)])
    if not pieces:
        raise ValueError("no audio frames decoded")
    return np.concatenate(pieces).astype(np.float64), rate


def steady_window(mono, rate):
    """Cut the most loop-friendly ``TARGET_S``-second excerpt out of *mono*.

    Clips no longer than the target are returned unchanged (the mixer tiles
    them). Otherwise the signal is reduced to 100 ms RMS frames and candidate
    windows, stepped by an eighth of the window length, are scored; the lowest
    score wins. Because the mixer crossfades the loop's tail back into its
    head, the score adds up three penalties:

      - interior unsteadiness: coefficient of variation of the frame RMS
      - head/tail mismatch: the first and last ~0.5 s should carry the same
        energy so the crossfade blends like with like (weight 2)
      - boundary lull: neither end should sit below the clip's median level,
        or the loop point briefly drops out (weight 2)

    The last two matter for sparse textures (birdsong, fireplace), where a
    window chosen on variance alone can still begin or end in a gap and dip
    ~10 dB on every loop. Windows whose mean level is under half the median
    are discarded outright. If no window is scored, the centred excerpt is
    returned.
    """
    want = int(TARGET_S * rate)
    if len(mono) <= want:
        return mono

    hop = max(int(rate * 0.1), 1)          # 100 ms frames: fine enough to see the seam
    levels = np.array([
        np.sqrt(np.mean(mono[pos:pos + hop] ** 2))
        for pos in range(0, len(mono) - hop, hop)
    ])
    typical = float(np.median(levels)) or 1.0
    span = max(want // hop, 1)             # window length, in frames
    edge = max(int(rate * 0.5) // hop, 1)  # frames spanning one crossfade (~0.5 s)
    stride = max(span // 8, 1)

    chosen = None
    lowest = 1e9
    for first in range(0, len(levels) - span, stride):
        window = levels[first:first + span]
        level = float(window.mean())
        if level < 0.5 * typical:          # window mostly in a lull
            continue
        head = float(window[:edge].mean())
        tail = float(window[-edge:].mean())
        unsteadiness = float(window.std()) / (level or 1.0)
        mismatch = abs(head - tail) / typical
        lull = max(0.0, 1.0 - min(head, tail) / typical)  # 0 once boundary >= median
        score = unsteadiness + 2.0 * mismatch + 2.0 * lull
        if score < lowest:
            lowest = score
            chosen = first * hop

    if chosen is None:
        chosen = (len(mono) - want) // 2   # nothing scored: take the middle
    return mono[chosen:chosen + want]


def resample(mono, src, dst):
    """Downsample *mono* from rate *src* to rate *dst* by linear interpolation.

    Returns ``(samples, rate)``. Audio already at or below *dst* is returned
    untouched together with its original rate — this never upsamples.
    """
    if src <= dst:
        return mono, src
    out_len = int(len(mono) * dst / src)
    # positions of the output samples, measured in input-sample units
    query = np.arange(out_len) * (src / dst)
    grid = np.arange(len(mono))
    return np.interp(query, grid, mono), dst


def write_wav(mono, rate, path):
    """Write *mono* to *path* as a mono, 16-bit little-endian PCM wav.

    The signal is peak-normalised so its loudest sample lands at 0.9 of full
    scale (an all-zero signal is written as-is), then converted to int16 by
    truncation.
    """
    loudest = float(np.abs(mono).max() or 1.0)
    gain = 0.9 / loudest
    samples = (mono * gain * 32767).astype("<i2")
    with wave.open(str(path), "wb") as wav_file:
        # (channels, bytes per sample, rate, frame count, compression type/name);
        # the frame count is filled in from the data actually written
        wav_file.setparams((1, 2, rate, 0, "NONE", "not compressed"))
        wav_file.writeframes(samples.tobytes())


def fetch_one(slug):
    """Find, vet and store one ambience bed.

    Candidates are the pinned ``PREFERRED`` title (if any) followed by the
    search results, de-duplicated in that order; only the first 8 that have a
    download URL are considered. Each is checked in turn and the first one to
    pass every gate is trimmed, downsampled and written to
    ``OUT_DIR/<slug>.wav``:

      1. listed duration (when the API reports one) within
         ``MIN_SRC_DUR``..``MAX_SRC_DUR`` — checked before downloading, so
         files known to be unusable cost no bandwidth
      2. download and decode succeed
      3. decoded duration within the same bounds and RMS of at least 5e-3
      4. spectral flatness of at least 1e-3 (i.e. not a pure tone)

    Returns a credit dict (``slug``, ``title``, ``license``, ``artist``,
    ``page``) on success, or ``None`` if nothing usable was found.
    """
    found = find_titles(slug)
    pref = PREFERRED.get(slug)
    # the pinned pick is tried first; search results (relevance order) back it up
    lookup = list(dict.fromkeys(([pref] if pref else []) + found))
    if not lookup:
        print(f"  no candidates found for {slug}")
        return None
    info = file_info(lookup)
    candidates = [t for t in lookup if info.get(t, {}).get("url")][:8]
    for title in candidates:
        meta = info[title]
        label = title[5:]  # title without its "File:" prefix, for logs and credits
        listed = meta["dur"]
        if listed and not MIN_SRC_DUR <= listed <= MAX_SRC_DUR:
            continue
        try:
            blob = download(meta["url"])
            mono, rate = decode_mono(blob)
        except Exception as e:  # noqa: BLE001 — try the next candidate
            print(f"    skip {label[:40]!r}: {e}")
            continue
        dur = len(mono) / rate
        rms = float(np.sqrt(np.mean(mono ** 2)))
        if dur < MIN_SRC_DUR or dur > MAX_SRC_DUR or rms < 5e-3:
            print(f"    skip {label[:40]!r}: dur={dur:.0f}s rms={rms:.3f}")
            continue
        # the FFT is only worth running on clips that passed the cheap gates
        flat = spectral_flatness(mono, rate)
        if flat < 1e-3:  # essentially a pure tone, not ambience (sines ~1e-12)
            print(f"    skip {label[:40]!r}: too tonal (flatness {flat:.0e})")
            continue
        seg = steady_window(mono, rate)
        seg, out_rate = resample(seg, rate, MAX_RATE)
        write_wav(seg, out_rate, OUT_DIR / f"{slug}.wav")
        kept = len(seg) / out_rate
        # derived from TARGET_S so the message cannot drift from the constant
        seams = "no seam" if kept >= TARGET_S else f"1 seam @{TARGET_S:.0f}s"
        print(f"  {slug} <- {label[:42]!r}  "
              f"({dur:.0f}s src -> {kept:.0f}s, {seams}, {meta['license']})")
        return {"slug": slug, "title": label, "license": meta["license"],
                "artist": meta["artist"], "page": meta["page"]}
    print(f"  no usable file for {slug} (all candidates failed checks)")
    return None


def save_credits(new_credits):
    """Merge this run's picks into credits.json and re-render CREDITS.md.

    ``credits.json`` (in ``OUT_DIR``) is the source of truth, keyed by slug;
    each dict in *new_credits* replaces the stored entry for its ``slug`` and
    every other entry is kept, so fetching one slug doesn't drop the others'
    attribution. ``CREDITS.md`` is then rewritten from the merged data.

    Both files are read and written as UTF-8 regardless of the platform's
    locale, because titles and author names from Commons are frequently
    non-ASCII. A credits.json that is unreadable or does not hold a JSON
    object is reported and rebuilt from *new_credits* alone.
    """
    store = OUT_DIR / "credits.json"
    merged = {}
    if store.exists():
        try:
            loaded = json.loads(store.read_text(encoding="utf-8"))
        except ValueError:  # bad JSON or bad UTF-8
            loaded = None
        if isinstance(loaded, dict):
            merged = loaded
        else:
            print(f"  warning: {store.name} is unreadable; rebuilding it from this run only")
    for c in new_credits:
        merged[c["slug"]] = c
    store.write_text(json.dumps(merged, indent=2, sort_keys=True), encoding="utf-8")

    lines = ["# Ambience sample credits", "",
             "Auto-fetched from Wikimedia Commons by `scripts/fetch_ambience.py`.",
             "vinyl_crackle and tape_hiss are synthesized in `ambience.py` and not listed.", ""]
    for slug in sorted(merged):
        c = merged[slug]
        lines += [
            f"## {slug}",
            f"- **{c['title']}**",
            f"- Author: {c['artist']}",
            f"- Licence: {c['license']}",
            f"- Source: {c['page']}",
            "",
        ]
    (OUT_DIR / "CREDITS.md").write_text("\n".join(lines), encoding="utf-8")


def main():
    """Command-line entry point; returns the process exit status.

    With no positional arguments, fetches every slug in ``SOURCES`` whose wav
    is missing from ``OUT_DIR`` (or all of them with ``--force``). Slugs named
    on the command line are always fetched, whether or not their wav exists;
    repeats are fetched once.

    Credits are saved as soon as each bed has been written, and an error while
    fetching one slug is reported without aborting the rest. Attribution for a
    wav already on disk is therefore never lost to a later failure or an
    interrupted run.

    Returns 0 if at least one bed was fetched or nothing needed fetching,
    1 if every attempted slug failed.
    """
    parser = argparse.ArgumentParser(description=__doc__.split("\n")[0])
    # NOTE(review): the extra [] in choices presumably lets the empty default
    # of nargs="*" pass argparse's choices check — confirm before removing.
    parser.add_argument("slugs", nargs="*", choices=[*SOURCES, []], metavar="slug",
                        help=f"beds to fetch (default: missing ones). One of: {', '.join(SOURCES)}")
    parser.add_argument("--force", action="store_true", help="re-fetch even if the wav exists")
    args = parser.parse_args()

    if args.slugs:
        todo = list(dict.fromkeys(args.slugs))  # drop repeats, keep order
    else:
        todo = [s for s in SOURCES if args.force or not (OUT_DIR / f"{s}.wav").exists()]
    if not todo:
        print("all sampled beds already present — use --force to refetch")
        return 0
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    got = 0
    for slug in todo:
        print(f"\n[{slug}]")
        try:
            c = fetch_one(slug)
        except Exception as e:  # noqa: BLE001 — one bad slug must not sink the run
            print(f"  failed: {type(e).__name__}: {e}")
            c = None
        if c:
            # save right away (merges into credits.json, won't drop other slugs)
            save_credits([c])
            got += 1
        time.sleep(1)

    print(f"\nfetched {got}/{len(todo)} beds -> {OUT_DIR.relative_to(ROOT)}")
    return 0 if got else 1


if __name__ == "__main__":
    # hand main()'s return value to the shell as the exit status
    exit_status = main()
    sys.exit(exit_status)