File size: 2,598 Bytes
d3a7a1c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
import os
from pathlib import Path

# URL that the Modal transcription backend POSTs audio to. Read once at import
# time from the MODAL_TRANSCRIBE_URL environment variable; the literal below is
# used when the variable is not set.
MODAL_TRANSCRIBE_URL = os.getenv(
    "MODAL_TRANSCRIBE_URL",
    default="https://mitvho09--tinyworld-inference-transcribe-endpoint.modal.run",
)

# Canned sentences; transcribe() returns one of these at random instead of a
# real transcription when mock mode is on or no audio path is given.
MOCK_SENTENCES = [
    "A street parade just showed up out of nowhere.",
    "Someone left a mysterious package on the sidewalk.",
    "The neighbors are arguing about something loud again.",
    "There's a strange light coming from the old building.",
    "I just saw something really weird down the block.",
]


def _is_mock() -> bool:
    """Return True only when the TINYWORLD_MOCK environment variable is "1".

    The variable is read on every call; when it is unset it is treated as "0".
    """
    flag = os.getenv("TINYWORLD_MOCK", "0")
    return flag == "1"


def _backend() -> str:
    """Return the lower-cased value of the TINYWORLD_INFER environment variable.

    The variable is read on every call; when it is unset the result is "modal".
    """
    selected = os.getenv("TINYWORLD_INFER", "modal")
    return selected.lower()


def transcribe(audio_path: str | None) -> str:
    """Turn the audio file at ``audio_path`` into text, trying backends in order.

    When ``audio_path`` is falsy or mock mode is on, a random entry of
    ``MOCK_SENTENCES`` is returned and no backend is contacted.

    Otherwise the backends are attempted in this order, and the first
    non-empty result wins:

    1. the local ``inference`` module, only when ``_backend()`` is ``"local"``;
    2. the Modal endpoint, only when ``MODAL_TRANSCRIBE_URL`` is non-empty;
    3. Cohere, whose result is returned as-is (it may be empty).

    A backend that raises is reported with ``print`` and skipped. If the
    Cohere attempt raises, the empty string is returned.
    """
    if not audio_path or _is_mock():
        from random import choice

        return choice(MOCK_SENTENCES)

    if _backend() == "local":
        try:
            import inference  # ZeroGPU Whisper, imported lazily

            if local_text := inference.transcribe_audio(audio_path):
                return local_text
        except Exception as exc:
            print(f"[transcribe] local failed: {exc}")

    if MODAL_TRANSCRIBE_URL:
        try:
            if modal_text := _modal_transcribe(audio_path):
                return modal_text
        except Exception as exc:
            print(f"[transcribe] Modal failed: {exc}")

    # Last resort: whatever Cohere gives back, or "" if it blows up.
    fallback_text = ""
    try:
        fallback_text = _cohere_transcribe(audio_path)
    except Exception as exc:
        print(f"[transcribe] failed: {exc}")
    return fallback_text


def _modal_transcribe(audio_path: str) -> str:
    """POST the raw bytes of ``audio_path`` to ``MODAL_TRANSCRIBE_URL``.

    The file's extension (or ".wav" when it has none) is sent in the
    ``x-audio-suffix`` header. The JSON reply's ``"text"`` field is returned
    stripped of surrounding whitespace, or "" when that field is missing or
    empty.

    Raises whatever reading the file, the HTTP request, a non-success status
    (via ``raise_for_status``), or JSON decoding raises.
    """
    import httpx

    audio_file = Path(audio_path)
    suffix = audio_file.suffix if audio_file.suffix else ".wav"
    payload = audio_file.read_bytes()

    with httpx.Client(timeout=300.0, follow_redirects=True) as http:
        reply = http.post(
            MODAL_TRANSCRIBE_URL,
            content=payload,
            headers={"x-audio-suffix": suffix},
        )
        reply.raise_for_status()

    transcript = reply.json().get("text")
    return transcript.strip() if transcript else ""


def _cohere_transcribe(audio_path: str) -> str:
    """Transcribe ``audio_path`` through the Cohere audio transcription API.

    Reads the API key from the COHERE_API_KEY environment variable. When the
    key is missing or empty, a notice is printed and "" is returned without
    importing ``cohere`` or opening the file.

    Returns the response text stripped of surrounding whitespace, or "" when
    the response carries no text. Exceptions from opening the file or from
    the Cohere client propagate to the caller.
    """
    key = os.getenv("COHERE_API_KEY", "")
    if key:
        import cohere

        client = cohere.ClientV2(api_key=key)
        with open(audio_path, "rb") as audio_file:
            result = client.audio.transcriptions.create(
                model="cohere-transcribe-03-2026",
                language="en",
                file=audio_file,
            )

        if result.text:
            return result.text.strip()
        return ""

    print("[transcribe] no COHERE_API_KEY set")
    return ""


if __name__ == "__main__":
    # Smoke check: a None path makes transcribe() return a mock sentence
    # without contacting any backend.
    sample = transcribe(None)
    print("Mock transcribe:", sample)