File size: 2,321 Bytes
ab62c9e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# fetch_gaia_audio.py

import os
import re
import requests

# Base URL of the API; main() requests the "/questions" endpoint under it.
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
# Destination file for the downloaded audio. download_audio() writes here, and
# derives its ".mp3" fallback path from this value by swapping the extension.
OUT_PATH = "/mnt/data/test.wav"

def main():
    """Locate the first .mp3/.wav URL in the questions payload and download it.

    The question list is fetched from ``{DEFAULT_API_URL}/questions``. Each
    question is first checked for a media URL in its ``attachments``,
    ``attachment`` or ``audio`` field (a single string or a list of strings).
    If none is found, the ``question`` text of every entry is scanned for an
    embedded ``http(s)://....mp3`` / ``.wav`` URL. The first hit is handed to
    ``download_audio``; if there is no hit a warning is printed.

    Malformed payload pieces (non-dict entries, non-string attachment items,
    a null ``question``) are skipped instead of raising.

    Raises:
        requests.HTTPError: if the questions endpoint (or the download)
            answers with an error status.
    """
    # 1) Fetch GAIA questions
    resp = requests.get(f"{DEFAULT_API_URL}/questions", timeout=15)
    resp.raise_for_status()
    payload = resp.json()

    # Only JSON objects have fields to inspect; tolerate any other payload
    # shape by treating it as "no questions".
    if isinstance(payload, list):
        questions = [q for q in payload if isinstance(q, dict)]
    else:
        questions = []

    # 2) Try attachments field first
    for q in questions:
        for field in ("attachments", "attachment", "audio"):
            urls = q.get(field)
            if not urls:
                continue
            if isinstance(urls, str):
                urls = [urls]
            elif not isinstance(urls, (list, tuple)):
                # A scalar or object here cannot be a list of URLs.
                continue
            for url in urls:
                # Non-string items (e.g. attachment objects) would make the
                # regex check raise, so filter them out first.
                if isinstance(url, str) and is_media_url(url):
                    return download_audio(url)

    # 3) Fallback: regex scan in question text
    pattern = re.compile(r"(https?://\S+\.(?:mp3|wav))", re.IGNORECASE)
    for q in questions:
        text = q.get("question")
        if not isinstance(text, str):
            # Key missing, null, or not text: nothing to scan.
            continue
        match = pattern.search(text)
        if match:
            return download_audio(match.group(1))

    print("⚠️  No .mp3/.wav URL found in GAIA payload; skipping download.")
    return

def is_media_url(url: str) -> bool:
    """Return True if *url* is an http(s) URL that ends in ``.mp3`` or ``.wav``.

    The comparison is case-insensitive. The extension must be the very end of
    the string, so URLs carrying a query string or fragment after the
    extension are not recognised.

    Args:
        url: Candidate URL. Any non-string value yields ``False`` rather than
            raising, so callers can pass raw JSON values straight through.

    Returns:
        ``True`` for a matching URL, ``False`` otherwise.
    """
    if not isinstance(url, str):
        # re.match raises TypeError on non-str input; a predicate should not.
        return False
    return bool(re.match(r"^https?://.*\.(?:mp3|wav)$", url, re.IGNORECASE))

def download_audio(url: str):
    """Download *url* and store it as audio next to ``OUT_PATH``.

    Behaviour depends on the file extension found in the URL path:

    * ``.mp3`` with pydub importable: the bytes are written to a scratch
      file, converted to WAV at ``OUT_PATH``, and the scratch file is removed.
    * ``.mp3`` without pydub: the raw bytes are saved to ``OUT_PATH`` with its
      ``.wav`` suffix replaced by ``.mp3``.
    * anything else: the raw bytes are written to ``OUT_PATH`` unchanged.

    Args:
        url: Address of the audio file to fetch.

    Returns:
        None. Progress is reported via ``print``.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        OSError: if an output file cannot be written.
    """
    # Local import keeps this block self-contained without touching the
    # module's import section.
    from urllib.parse import urlparse

    print(f"Downloading audio from {url}")
    r = requests.get(url, timeout=30)
    r.raise_for_status()

    # Read the extension from the URL *path* only; with the full URL a query
    # string or fragment ("a.mp3?token=x") would hide the real extension and
    # MP3 bytes would be written out under a .wav name.
    ext = os.path.splitext(urlparse(url).path)[1].lower()
    content = r.content

    if ext == ".mp3":
        # try to convert to wav if pydub installed
        try:
            from pydub import AudioSegment
        except ImportError:
            # fallback: write raw mp3 bytes
            mp3_out = OUT_PATH.replace(".wav", ".mp3")
            with open(mp3_out, "wb") as f:
                f.write(content)
            print(f"⚠ pydub not installed; saved MP3 to {mp3_out}")
            return

        mp3_path = "/mnt/data/tmp.mp3"
        try:
            with open(mp3_path, "wb") as f:
                f.write(content)
            audio = AudioSegment.from_mp3(mp3_path)
            audio.export(OUT_PATH, format="wav")
        finally:
            # The scratch file is only an input to the conversion; do not
            # leave it behind, whether or not the conversion succeeded.
            if os.path.exists(mp3_path):
                os.remove(mp3_path)
        print(f"✔ Saved WAV to {OUT_PATH}")
        return

    # if it's .wav or any other, write directly
    with open(OUT_PATH, "wb") as f:
        f.write(content)
    print(f"✔ Saved WAV to {OUT_PATH}")

# Script entry point: run the fetch only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()