File size: 17,403 Bytes
722a5d8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
"""Download the sampled ambience beds from Wikimedia Commons.

A no-GPU alternative to make_ambience.py: instead of generating the seven
sampled beds, this pulls real field recordings from Wikimedia Commons
(public-domain / CC-licensed), trims each to a steady ~30 s loop, and writes
mono 16-bit wavs into assets/ambience/ — the format ambience.py expects.

It auto-selects: for each slug it searches Commons, drops obvious junk
(alarms, music, traffic…) by keyword, then downloads candidates in turn and
measures them, keeping the first that is long enough and not near-silent.
Provenance + licence for every pick is written to assets/ambience/CREDITS.md
so attribution can be honored when the Space ships.

Usage:
    uv pip install soundfile          # bundles libsndfile (ogg/mp3/flac/wav)
    uv pip install av                 # optional: PyAV fallback for files libsndfile can't open (e.g. Opus)
    python scripts/fetch_ambience.py                 # fill in what's missing
    python scripts/fetch_ambience.py ocean_waves --force
"""

import argparse
import io
import json
import re
import sys
import time
import unicodedata
import urllib.parse
import urllib.request
import wave
from pathlib import Path

import numpy as np

# Repository root: the directory above the one that holds this script.
ROOT = Path(__file__).resolve().parent.parent
# Where the fetched wavs, credits.json and CREDITS.md are written.
OUT_DIR = ROOT / "assets" / "ambience"
# Wikimedia Commons MediaWiki API endpoint used for every query.
API = "https://commons.wikimedia.org/w/api.php"
# User-Agent header sent with every API call and file download.
UA = "LoFinity/0.1 (lofi hackathon ambience fetcher; https://huggingface.co/spaces)"

# Durations are in seconds, MAX_BYTES in bytes, MAX_RATE in Hz.
TARGET_S = 30.0          # loop length we keep == default song length, so a bed
                         # this long tiles to a 30 s song with zero seams
MIN_SRC_DUR = 8.0        # too short to be useful ambience
MAX_SRC_DUR = 400.0      # skip anything longer (podcasts, mixes)
MAX_BYTES = 30_000_000   # don't pull giant wavs
MAX_RATE = 32000         # cap stored rate (== musicgen rate); keeps files small

# How to find each bed: a list of probes whose results are unioned. Commons
# search ANDs every word in a probe, so each probe stays 1-2 words; more
# probes = more candidates to fall back through. ("category", name) lists a
# curated category; ("search", terms) is a File-namespace full-text search.
# The keys of this dict are the valid slugs (main() uses them as CLI choices).
SOURCES = {
    "soft_rain": [("category", "Sounds of rain"), ("search", "rain ambience")],
    "ocean_waves": [("search", "ocean waves"), ("search", "sea waves"),
                    ("search", "surf beach")],
    "fireplace_crackle": [("search", "campfire"), ("search", "fireplace"),
                          ("search", "fire crackling")],
    "birdsong": [("search", "birdsong"), ("search", "dawn chorus"),
                 ("search", "birds chirping")],
    "night_crickets": [("search", "crickets"), ("search", "cricket chirping"),
                       ("search", "cicada")],
    "wind_in_trees": [("search", "wind trees"), ("search", "wind forest"),
                      ("search", "wind leaves")],
    "cafe_murmur": [("search", "restaurant ambience"), ("search", "cafe ambience"),
                    ("search", "crowd murmur")],
}

# Hand-vetted Commons files tried before falling back to search — auto-selection
# can't judge "continuous dawn chorus" vs "one repetitive cuckoo", so the good
# picks found during development are pinned here. Still run through every gate
# below, so a renamed/deleted file just falls through to search.
# NOTE(review): fetch_one() prepends the pinned title directly, so it is NOT
# subject to the BLOCKLIST/RELEVANCE title filters applied in find_titles();
# only the duration, loudness and flatness checks in fetch_one() apply to it.
PREFERRED = {
    "soft_rain": "File:Lluvia en techo de lamina.wav",
    "ocean_waves": "File:Sea waves.wav",
    "fireplace_crackle": "File:WWS Fireoftheforge.ogg",
    "birdsong": "File:Birds singing in Fribourg 01.ogg",
    "night_crickets": "File:Black-Prince-Cicada- Psaltoda-plaga.wav",
    "wind_in_trees": "File:Wind in forest (Gravity Sound).wav",
    "cafe_murmur": "File:Shopping mall less crowded.ogg",
}

# Title contains any of these (lowercased) -> not ambience, skip it. This is
# what keeps "fire" from returning fire *alarms*, "sea" from podcasts, and
# "waves" from sine-wave test tones.
# NOTE(review): find_titles() tests these as plain substrings of the
# normalized title, so short entries also reject longer words that contain
# them — e.g. "war" rejects every title containing "warbler", which RELEVANCE
# lists as a birdsong keyword.
BLOCKLIST = (
    "alarm", "podcast", "episode", "interview", "speech", "talk", "lecture",
    "music", "song -", "band", "orchestra", "anthem", "hymn", "vocal", "choir",
    "dance", "ritual", "march", "siren", "horn", "traffic", "tram", "engine",
    "motor", "gun", "explosion", "war", "radio", "national", "voice", "demo",
    "sine", "tone", "hz", "sweep", "beep", "dtmf", "calibration", "signal",
    "woodwind", "clarinet", "flute", "accordion", "instrument", "guitar",
)

# Chosen file's title must contain one of these (accent-stripped) — a sound
# actually related to the slug. Multilingual because Commons is international.
# Matching is by substring too, which is why stems such as "crackl", "cicad",
# "rustl" and "ambien" are listed: one entry covers several word endings.
RELEVANCE = {
    "soft_rain": ("rain", "lluvia", "regen", "pluie", "pioggia", "chuva",
                  "downpour", "drizzle", "storm"),
    "ocean_waves": ("ocean", "wave", "sea", "surf", "beach", "mar", "ola",
                    "vague", "welle", "tide", "shore", "playa", "costa"),
    "fireplace_crackle": ("fire", "campfire", "fireplace", "crackl", "crepit",
                          "feu", "fuego", "hoguera", "fogata", "ember", "hearth"),
    "birdsong": ("bird", "song", "chorus", "dawn", "chirp", "cuckoo", "wren",
                 "sparrow", "robin", "blackbird", "finch", "warbler", "thrush",
                 "nightingale", "lark", "vogel", "oiseau", "pajaro", "canto"),
    "night_crickets": ("cricket", "cicada", "cicad", "cigarra", "grasshopper",
                       "grillo", "grille", "katydid", "locust", "insect", "chirp"),
    "wind_in_trees": ("wind", "breeze", "gust", "rustl", "viento", "vent",
                      "howl", "gale", "brisa", "blowing"),
    "cafe_murmur": ("cafe", "restaurant", "crowd", "murmur", "coffee", "bar",
                    "pub", "chatter", "ambien", "mall", "station", "people",
                    "plaza", "market", "tunnel", "hall", "lobby", "gente"),
}


def _norm(s):
    """Lowercase + strip accents so 'pájaro'/'Pajaro' both match 'pajaro'."""
    s = unicodedata.normalize("NFKD", str(s))
    return "".join(c for c in s if not unicodedata.combining(c)).lower()


def commons_api(params, tries=5):
    """Call the Commons MediaWiki API and return the decoded JSON reply.

    `params` is merged with ``format=json`` and ``formatversion=2`` and sent
    as a GET query string. Up to `tries` attempts are made; between attempts
    the function sleeps 2 s, 4 s, 6 s, … (linear backoff).

    Retried failures are rate limiting (HTTP 429), transient server errors
    (HTTP 500/502/503/504) and connection-level errors. Any other HTTP error
    is raised immediately, and the last attempt's error is always raised.

    Returns the parsed JSON object, or ``{}`` if `tries` is 0 or negative.
    """
    # Imported explicitly: the module only imports urllib.parse/urllib.request,
    # and the handlers below should not depend on a transitive import.
    from urllib.error import HTTPError, URLError

    retryable = (429, 500, 502, 503, 504)
    query = {**params, "format": "json", "formatversion": "2"}
    url = API + "?" + urllib.parse.urlencode(query)
    for attempt in range(tries):
        is_last = attempt == tries - 1
        try:
            req = urllib.request.Request(url, headers={"User-Agent": UA})
            with urllib.request.urlopen(req, timeout=30) as r:
                return json.load(r)
        except HTTPError as e:  # must precede URLError (it is a subclass)
            if is_last or e.code not in retryable:
                raise
        except (URLError, TimeoutError, ConnectionError):
            if is_last:
                raise
        time.sleep(2 * (attempt + 1))
    return {}


def find_titles(slug):
    """Return candidate Commons file titles for *slug*, best-first.

    Every probe in SOURCES[slug] is run (category listing or File-namespace
    search, with a 1 s pause after each) and the hits are concatenated. The
    result keeps the first occurrence of each title, in order, and only those
    whose normalized title contains no BLOCKLIST entry and at least one
    RELEVANCE[slug] keyword (both tested as substrings).
    """
    gathered = []
    for kind, value in SOURCES[slug]:
        if kind == "category":
            list_key = "categorymembers"
            query = {"action": "query", "list": "categorymembers",
                     "cmtitle": f"Category:{value}", "cmtype": "file",
                     "cmlimit": "30"}
        else:
            list_key = "search"
            query = {"action": "query", "list": "search", "srnamespace": "6",
                     "srsearch": f"filetype:audio {value}", "srlimit": "15"}
        reply = commons_api(query)
        rows = reply.get("query", {}).get(list_key, [])
        gathered.extend(row["title"] for row in rows)
        time.sleep(1)  # be polite to the API between probes

    kept = []
    # dict.fromkeys() drops repeats while preserving first-seen order
    for title in dict.fromkeys(gathered):
        folded = _norm(title)
        blocked = any(bad in folded for bad in BLOCKLIST)
        if not blocked and any(kw in folded for kw in RELEVANCE[slug]):
            kept.append(title)
    return kept


def file_info(titles):
    """Look up download URL and attribution metadata for a list of titles.

    Titles are queried in batches of 20 (with a 1 s pause after each batch).
    Returns a dict mapping title -> dict with keys ``url``, ``dur`` (seconds,
    0.0 when unknown), ``mediatype``, ``license``, ``artist`` and ``page``.
    Missing fields come back as empty strings, except ``license`` ("?") and
    ``artist`` ("Unknown").

    The API reports pages under their canonical title. When it normalized a
    requested title (e.g. a hand-typed pin with different spacing or case),
    the same record is also stored under the spelling that was requested, so
    callers can look results up by the titles they passed in.
    """
    out = {}
    for i in range(0, len(titles), 20):
        batch = titles[i:i + 20]
        info = commons_api({"action": "query", "titles": "|".join(batch),
                            "prop": "imageinfo",
                            "iiprop": "url|size|mediatype|extmetadata"})
        query = info.get("query", {})

        # canonical title -> the spelling(s) we asked with
        requested_as = {}
        for entry in query.get("normalized", []):
            requested_as.setdefault(entry.get("to"), []).append(entry.get("from"))

        for page in query.get("pages", []):
            # missing/deleted pages carry no imageinfo; fall back to an empty record
            ii = (page.get("imageinfo") or [{}])[0]
            ext = ii.get("extmetadata") or {}

            def field(k, ext=ext):
                return str((ext.get(k) or {}).get("value", ""))

            record = {
                "url": ii.get("url", ""),
                "dur": float(ii.get("duration") or 0.0),
                "mediatype": ii.get("mediatype", ""),
                "license": field("LicenseShortName") or "?",
                "artist": _strip_html(field("Artist")) or "Unknown",
                "page": ii.get("descriptionurl", ""),
            }
            canonical = page.get("title", "?")
            out[canonical] = record
            for alias in requested_as.get(canonical, []):
                if alias is not None:
                    out[alias] = record
        time.sleep(1)
    return out


def _strip_html(s):
    return re.sub(r"<[^>]+>", "", s).strip()


def spectral_flatness(mono, rate):
    """Spectral flatness (geometric / arithmetic mean of the power spectrum)
    of the first ``rate * 4`` samples of *mono*.

    Close to 0 for a pure tone and higher for broadband texture, so it catches
    test tones whose titles slip past the keyword filter (a 'Sine Wave' file
    is named like a sea 'wave').

    Before the FFT the segment has its mean removed and is first-differenced
    (a crude high-pass): crowd and surf recordings carry heavy low-frequency
    rumble that would otherwise dominate the spectrum and read as falsely
    'tonal' (calibration showed real cafe recordings at 2e-5 raw vs 1e-12 for
    a true sine — too close; after the high-pass they separate to 2e-3 vs
    1e-12).

    Segments shorter than 256 samples return 1.0 (treated as not tonal).
    """
    head = mono[: rate * 4].astype(np.float64)
    if len(head) < 256:
        return 1.0
    centered = head - head.mean()
    highpassed = np.diff(centered)
    taper = np.hanning(len(highpassed))
    spectrum = np.fft.rfft(highpassed * taper)
    power = np.abs(spectrum) ** 2 + 1e-12  # floor keeps log() finite
    geometric = np.exp(np.mean(np.log(power)))
    arithmetic = np.mean(power)
    return float(geometric / arithmetic)


def download(url):
    """Download *url* and return its body as bytes, refusing oversized files.

    Raises ValueError when the file is larger than MAX_BYTES — either as
    announced by the Content-Length header (checked before reading) or as
    observed while reading, which covers responses with a missing or
    understated Content-Length. Network errors from urllib propagate.
    """
    req = urllib.request.Request(url, headers={"User-Agent": UA})
    with urllib.request.urlopen(req, timeout=60) as r:
        length = int(r.headers.get("Content-Length") or 0)
        if length > MAX_BYTES:
            raise ValueError(f"too big ({length / 1e6:.0f} MB)")
        # Read one byte past the cap so an over-limit body is detectable
        # instead of being silently truncated.
        blob = r.read(MAX_BYTES + 1)
    if len(blob) > MAX_BYTES:
        raise ValueError(f"too big (more than {MAX_BYTES / 1e6:.0f} MB)")
    return blob


def decode_mono(blob):
    """Decode an in-memory audio file to ``(mono float64 samples, sample rate)``.

    soundfile (libsndfile) is tried first and its channels are averaged to
    mono; if libsndfile rejects the data, decoding falls back to _decode_av.
    """
    import soundfile as sf

    buffer = io.BytesIO(blob)
    try:
        frames, rate = sf.read(buffer, dtype="float64", always_2d=True)
    except sf.LibsndfileError:
        # Opus/other codecs libsndfile can't open
        return _decode_av(blob)
    return frames.mean(axis=1), rate


def _decode_av(blob):
    """Fallback decoder built on PyAV (which bundles ffmpeg) — most Commons
    crowd/cafe recordings are Ogg/Opus, which libsndfile doesn't support.

    Decodes the first audio stream, converts every frame to mono float at the
    stream's own sample rate, and returns ``(float64 samples, rate)``.
    Raises ValueError if no frames could be decoded.
    """
    import av

    pieces = []
    with av.open(io.BytesIO(blob)) as container:
        audio = container.streams.audio[0]
        rate = audio.codec_context.sample_rate
        to_mono = av.AudioResampler(format="flt", layout="mono", rate=rate)
        for frame in container.decode(audio):
            pieces.extend(
                part.to_ndarray().reshape(-1) for part in to_mono.resample(frame)
            )
    if not pieces:
        raise ValueError("no audio frames decoded")
    return np.concatenate(pieces).astype(np.float64), rate


def steady_window(mono, rate):
    """Choose the TARGET_S-long slice of *mono* that loops most cleanly.

    Clips no longer than TARGET_S are returned unchanged (the mixer tiles
    them). Otherwise the signal is reduced to 100 ms RMS frames and candidate
    windows (stepped by 1/8 of the window length) are scored; lowest wins.
    Because the mixer crossfades the loop's tail back into its head, the
    score adds up three penalties:
      - RMS variation inside the window (std / mean), so it doesn't swell or drop
      - head vs tail energy mismatch (x2), so the crossfade blends like-for-like
      - either boundary sitting below the clip's median level (x2), so the
        loop point doesn't briefly drop out
    The last two matter for sparse textures (birdsong, fireplace): a window
    that merely minimizes variance can still start/end in a gap, dipping
    ~10 dB every loop.

    Windows whose mean level is under half the clip median are skipped. If no
    window qualifies, the centered TARGET_S slice is returned.
    """
    want = int(TARGET_S * rate)
    if len(mono) <= want:
        return mono

    step = max(int(rate * 0.1), 1)  # 100 ms frames: fine enough to see the seam
    levels = np.array([
        np.sqrt(np.mean(mono[pos:pos + step] ** 2))
        for pos in range(0, len(mono) - step, step)
    ])
    typical = float(np.median(levels)) or 1.0
    span = max(want // step, 1)               # window length, in frames
    fade = max(int(rate * 0.5) // step, 1)    # frames spanning one crossfade (~0.5 s)
    stride = max(span // 8, 1)

    chosen = None
    lowest = 1e9
    for first in range(0, len(levels) - span, stride):
        window = levels[first:first + span]
        avg = float(window.mean())
        if avg < 0.5 * typical:
            # window mostly in a lull
            continue
        lead = float(window[:fade].mean())
        trail = float(window[-fade:].mean())
        spread = float(window.std()) / (avg or 1.0)
        gap = abs(lead - trail) / typical
        dip = max(0.0, 1.0 - min(lead, trail) / typical)  # 0 once boundary >= median
        total = spread + 2.0 * gap + 2.0 * dip
        if total < lowest:
            lowest = total
            chosen = first * step

    if chosen is None:
        chosen = (len(mono) - want) // 2
    return mono[chosen:chosen + want]


def resample(mono, src, dst):
    """Downsample *mono* from rate `src` to rate `dst` by linear interpolation.

    Returns ``(samples, rate)``. Audio already at or below `dst` is returned
    untouched with its original rate — this never upsamples. No low-pass
    filtering is applied before the interpolation.
    """
    if dst >= src:
        return mono, src
    count = int(len(mono) * dst / src)
    positions = np.arange(count) * (src / dst)  # output times, in source samples
    grid = np.arange(len(mono))
    return np.interp(positions, grid, mono), dst


def write_wav(mono, rate, path):
    """Write *mono* to *path* as a mono 16-bit PCM wav at `rate` Hz.

    Samples are scaled so the loudest one sits at 0.9 of full scale; an
    all-zero signal is written as-is (the peak falls back to 1.0).
    """
    loudest = float(np.abs(mono).max() or 1.0)
    gain = 0.9 / loudest
    samples = (mono * gain * 32767).astype("<i2")  # little-endian int16
    payload = samples.tobytes()
    with wave.open(str(path), "wb") as out:
        out.setframerate(rate)
        out.setsampwidth(2)
        out.setnchannels(1)
        out.writeframes(payload)


def fetch_one(slug):
    """Fetch, vet and store one ambience bed.

    Candidates are the pinned PREFERRED title (if any) followed by the
    find_titles() results, de-duplicated in that order. At most the first 8
    candidates that have a download URL are tried; the first one to pass
    every gate is trimmed with steady_window(), rate-capped to MAX_RATE and
    written to ``OUT_DIR/<slug>.wav``.

    Gates, in order:
      1. reported duration (when known) not above MAX_SRC_DUR — checked
         before downloading
      2. download and decode succeed (any exception skips the candidate)
      3. measured duration within [MIN_SRC_DUR, MAX_SRC_DUR] and RMS >= 5e-3
      4. spectral flatness >= 1e-3 (rejects pure tones)

    Returns a credit dict (``slug``, ``title``, ``license``, ``artist``,
    ``page``) on success, or None if nothing usable was found.
    """
    found = find_titles(slug)
    pref = PREFERRED.get(slug)
    # the pinned pick is tried first; search results (relevance order) back it up
    lookup = list(dict.fromkeys(([pref] if pref else []) + found))
    if not lookup:
        print(f"  no candidates found for {slug}")
        return None

    info = file_info(lookup)
    for title in [t for t in lookup if info.get(t, {}).get("url")][:8]:
        meta = info[title]
        if meta["dur"] and meta["dur"] > MAX_SRC_DUR:
            continue
        try:
            blob = download(meta["url"])
            mono, rate = decode_mono(blob)
        except Exception as e:  # noqa: BLE001 — try the next candidate
            print(f"    skip {title[5:][:40]!r}: {e}")
            continue

        # Cheap gates first; the FFT-based flatness check only runs on
        # candidates that are already long and loud enough.
        dur = len(mono) / rate
        rms = float(np.sqrt(np.mean(mono ** 2)))
        if dur < MIN_SRC_DUR or dur > MAX_SRC_DUR or rms < 5e-3:
            print(f"    skip {title[5:][:40]!r}: dur={dur:.0f}s rms={rms:.3f}")
            continue
        flat = spectral_flatness(mono, rate)
        if flat < 1e-3:  # essentially a pure tone, not ambience (sines ~1e-12)
            print(f"    skip {title[5:][:40]!r}: too tonal (flatness {flat:.0e})")
            continue

        seg = steady_window(mono, rate)
        seg, out_rate = resample(seg, rate, MAX_RATE)
        write_wav(seg, out_rate, OUT_DIR / f"{slug}.wav")

        # Report against TARGET_S so the message tracks the configured loop length.
        kept_s = len(seg) / out_rate
        seams = "no seam" if kept_s >= TARGET_S else f"1 seam @{TARGET_S:.0f}s"
        print(f"  {slug} <- {title[5:][:42]!r}  "
              f"({dur:.0f}s src -> {kept_s:.0f}s, {seams}, {meta['license']})")
        # title[5:] drops the leading "File:" namespace prefix
        return {"slug": slug, "title": title[5:], "license": meta["license"],
                "artist": meta["artist"], "page": meta["page"]}

    print(f"  no usable file for {slug} (all candidates failed checks)")
    return None


def save_credits(new_credits):
    """Merge this run's picks into credits.json and re-render CREDITS.md.

    credits.json (in OUT_DIR) is the source of truth, keyed by slug; each
    entry in `new_credits` replaces the stored entry for its slug, so
    fetching one slug doesn't drop the others' attribution. A missing,
    unreadable or non-object credits.json is treated as empty and rebuilt
    from this run.

    Both files are read and written as UTF-8 regardless of the platform's
    locale encoding, because titles and author names are often non-ASCII.
    """
    store = OUT_DIR / "credits.json"
    merged = {}
    if store.exists():
        try:
            loaded = json.loads(store.read_text(encoding="utf-8"))
        except ValueError:  # bad JSON or bad UTF-8: start over from this run
            loaded = None
        if isinstance(loaded, dict):
            merged = loaded
    for c in new_credits:
        merged[c["slug"]] = c
    store.write_text(json.dumps(merged, indent=2, sort_keys=True), encoding="utf-8")

    lines = ["# Ambience sample credits", "",
             "Auto-fetched from Wikimedia Commons by `scripts/fetch_ambience.py`.",
             "vinyl_crackle and tape_hiss are synthesized in `ambience.py` and not listed.", ""]
    for slug in sorted(merged):
        c = merged[slug]
        lines += [
            f"## {slug}",
            f"- **{c['title']}**",
            f"- Author: {c['artist']}",
            f"- Licence: {c['license']}",
            f"- Source: {c['page']}",
            "",
        ]
    (OUT_DIR / "CREDITS.md").write_text("\n".join(lines), encoding="utf-8")


def main():
    """Command-line entry point.

    Positional slugs restrict the run to those beds (default: every slug in
    SOURCES); repeated slugs are fetched once. A bed whose wav already exists
    in OUT_DIR is skipped unless ``--force`` is given — this applies to
    explicitly named slugs as well, so ``--force`` means the same thing in
    both modes.

    Returns the process exit code: 0 if there was nothing to do or at least
    one bed was fetched, 1 if every attempted bed failed.
    """
    # __doc__ is None when the module has no docstring; don't crash on it
    summary = (__doc__ or "").split("\n")[0]
    parser = argparse.ArgumentParser(description=summary)
    # the [] in choices lets argparse accept the empty default of nargs="*"
    parser.add_argument("slugs", nargs="*", choices=[*SOURCES, []], metavar="slug",
                        help=f"beds to fetch (default: missing ones). One of: {', '.join(SOURCES)}")
    parser.add_argument("--force", action="store_true", help="re-fetch even if the wav exists")
    args = parser.parse_args()

    requested = list(dict.fromkeys(args.slugs)) or list(SOURCES)
    todo = [s for s in requested
            if args.force or not (OUT_DIR / f"{s}.wav").exists()]
    if not todo:
        print("all sampled beds already present — use --force to refetch")
        return 0
    OUT_DIR.mkdir(parents=True, exist_ok=True)

    credits = []
    for slug in todo:
        print(f"\n[{slug}]")
        c = fetch_one(slug)
        if c:
            credits.append(c)
        time.sleep(1)

    if credits:
        save_credits(credits)  # merges into credits.json, won't drop other slugs
    got = len(credits)
    print(f"\nfetched {got}/{len(todo)} beds -> {OUT_DIR.relative_to(ROOT)}")
    return 0 if got else 1


if __name__ == "__main__":
    sys.exit(main())