RonanMcGovern committed on
Commit
4e9e7ec
·
verified ·
1 Parent(s): 627c371

initial Chorus demo

Browse files
Files changed (7) hide show
  1. .gitattributes +2 -0
  2. Dockerfile +23 -0
  3. README.md +28 -5
  4. app.py +490 -0
  5. requirements.txt +9 -0
  6. static/sample.wav +3 -0
  7. static/sample_ami.wav +3 -0
.gitattributes CHANGED
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ static/sample.wav filter=lfs diff=lfs merge=lfs -text
37
+ static/sample_ami.wav filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.11-slim
2
+
3
+ RUN apt-get update && apt-get install -y --no-install-recommends \
4
+ ffmpeg libsndfile1 \
5
+ && rm -rf /var/lib/apt/lists/*
6
+
7
+ # HF Spaces sets HOME=/home/user for non-root user. Use that for caches.
8
+ RUN useradd -m -u 1000 user
9
+ USER user
10
+ ENV HOME=/home/user \
11
+ PATH=/home/user/.local/bin:$PATH \
12
+ HF_HOME=/home/user/.cache/huggingface \
13
+ PORT=7860
14
+
15
+ WORKDIR /home/user/app
16
+ COPY --chown=user:user requirements.txt .
17
+ RUN pip install --user --no-cache-dir -r requirements.txt
18
+
19
+ COPY --chown=user:user app.py .
20
+ COPY --chown=user:user static/ ./static/
21
+
22
+ EXPOSE 7860
23
+ CMD ["python", "app.py"]
README.md CHANGED
@@ -1,10 +1,33 @@
1
  ---
2
- title: Chorus
3
- emoji: 👀
4
- colorFrom: green
5
- colorTo: indigo
6
  sdk: docker
 
7
  pinned: false
8
  ---
9
 
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: Trelis Chorus
3
+ emoji: 🎙️
4
+ colorFrom: blue
5
+ colorTo: yellow
6
  sdk: docker
7
+ app_port: 7860
8
  pinned: false
9
  ---
10
 
11
+ # Trelis Chorus Multi-Speaker Whisper
12
+
13
+ Upload audio of two people talking (possibly overlapping) and get separate transcripts for each speaker with timestamps.
14
+
15
+ Running on CPU — expect ~30-60s per 30s of audio.
16
+
17
+ ## How it works
18
+
19
+ Chorus is a LoRA fine-tune of [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) that adds two speaker-conditioned tokens (`<|speaker1|>`, `<|speaker2|>`). At inference time the decoder prefix includes the speaker token, which biases cross-attention toward that speaker's audio regions. Two forward passes (one per speaker) produce a transcript per speaker.
20
+
21
+ Trained on a mix of:
22
+ - **VoxPopuli** (parliamentary speech, synthetically mixed pairs)
23
+ - **AMI Meeting Corpus** (real conversational meeting speech)
24
+
25
+ See the [Trelis Studio repo](https://github.com/TrelisResearch/studio) (private) for the full training pipeline.
26
+
27
+ ## Model
28
+
29
+ - **Chorus Turbo**: `Trelis/chorus-merged-test` — merged standalone Whisper model (base + LoRA merged + expanded tokenizer)
30
+
31
+ ## Environment
32
+
33
+ The Space requires `HF_TOKEN` (Space secret) to pull the private model weights.
app.py ADDED
@@ -0,0 +1,490 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Trelis Chorus — HF Space demo (CPU inference).
3
+
4
+ Loads the merged Chorus model (base Whisper Turbo + LoRA merged +
5
+ expanded tokenizer) once and serves a FastAPI + vanilla-JS UI that
6
+ accepts uploaded or recorded audio and returns S1/S2 transcripts.
7
+
8
+ CPU inference takes ~30-60s per 30s clip on the free HF Space tier.
9
+ GPU tier would make this near-instant.
10
+ """
11
+ import os, io, re, time
12
+ from pathlib import Path
13
+
14
+ import numpy as np
15
+ import soundfile as sf
16
+ import torch
17
+ from fastapi import FastAPI, UploadFile, File, HTTPException
18
+ from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
19
+ import uvicorn
20
+
21
+ # Merged model containing base Whisper Turbo + LoRA merged in + expanded tokenizer
22
+ MODEL_REPO = os.environ.get("CHORUS_MODEL_REPO", "Trelis/chorus-merged-test")
23
+ SPEAKER1_TOKEN = "<|speaker1|>"
24
+ SPEAKER2_TOKEN = "<|speaker2|>"
25
+ SR = 16_000
26
+
27
+ DEVICE = "cpu"
28
+ DTYPE = torch.float32 # fp32 on CPU for stability
29
+ print(f"[chorus-space] Device: {DEVICE} ({DTYPE}), model: {MODEL_REPO}")
30
+
31
+ _model = None
32
+ _processor = None
33
+ _tok_ids: dict = {}
34
+ _TS_START_ID: int = -1
35
+ _TS_END_ID: int = -1
36
+ _TS_STEP = 0.02
37
+
38
+
39
def load_model():
    """Load the Chorus processor and model once and cache them in module globals.

    Subsequent calls return immediately once ``_model`` is set. The Hub token
    is read from ``HF_TOKEN`` (or ``HUGGINGFACE_TOKEN``). Besides the model and
    processor, this resolves the ids of the special tokens used to build the
    decoder prefix and to recognise timestamp tokens.

    Raises:
        RuntimeError: if the tokenizer does not define one of the required
            special tokens. Without this check a missing token would silently
            resolve to the tokenizer's unknown-token id and every request
            would be decoded with the wrong prefix.
    """
    global _model, _processor, _tok_ids, _TS_START_ID, _TS_END_ID
    if _model is not None:
        return
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    print(f"[chorus-space] Loading {MODEL_REPO}...")
    t = time.time()
    proc = WhisperProcessor.from_pretrained(MODEL_REPO, token=hf_token)
    m = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=hf_token, dtype=DTYPE)
    m = m.to(DEVICE).eval()
    m.generation_config.predict_timestamps = True
    # 1500 steps * 0.02 s = 30 s; presumably lets the first predicted
    # timestamp fall anywhere in the window -- TODO confirm against the
    # transformers generation docs.
    m.generation_config.max_initial_timestamp_index = 1500

    tokenizer = proc.tokenizer
    unk_id = tokenizer.unk_token_id

    def _require_id(token: str) -> int:
        # convert_tokens_to_ids does not raise for unknown tokens; it falls
        # back to the unknown-token id (or None), so check for that explicitly.
        token_id = tokenizer.convert_tokens_to_ids(token)
        if token_id is None or token_id == unk_id:
            raise RuntimeError(
                f"Tokenizer for {MODEL_REPO} has no {token!r} token; "
                "is this a Chorus checkpoint with the expanded tokenizer?"
            )
        return token_id

    # Resolve everything first so a failure leaves the module state untouched.
    resolved = {
        "spk1": _require_id(SPEAKER1_TOKEN),
        "spk2": _require_id(SPEAKER2_TOKEN),
        "en": _require_id("<|en|>"),
        "transcribe": _require_id("<|transcribe|>"),
    }
    ts_start = _require_id("<|0.00|>")
    ts_end = _require_id("<|30.00|>")

    _tok_ids.update(resolved)
    _TS_START_ID = ts_start
    _TS_END_ID = ts_end
    _processor = proc
    _model = m
    print(f"[chorus-space] Model ready in {time.time()-t:.1f}s (ts range: {_TS_START_ID}..{_TS_END_ID})")
63
+
64
+
65
def _infer(arr: np.ndarray, spk_id: int) -> list[dict]:
    """Run one speaker-conditioned decoding pass over ``arr``.

    Args:
        arr: audio samples handed to the feature extractor at ``SR`` Hz.
        spk_id: token id of the speaker token forced at decoder position 3.

    Returns:
        The segments parsed from the first generated sequence
        (see ``_parse_segments``).
    """
    extracted = _processor.feature_extractor(
        [arr], sampling_rate=SR, return_tensors="pt"
    )
    input_features = extracted.input_features.to(DEVICE).to(DTYPE)

    # Forced decoder prefix: positions 1..3 are language, task, speaker.
    prefix_tokens = (_tok_ids["en"], _tok_ids["transcribe"], spk_id)
    forced = [[position, token] for position, token in enumerate(prefix_tokens, start=1)]

    with torch.no_grad():
        generated = _model.generate(
            input_features,
            forced_decoder_ids=forced,
            return_timestamps=True,
            max_new_tokens=444,
        )
    first_sequence = generated[0]
    return _parse_segments(first_sequence.tolist())
76
+
77
+
78
def _parse_segments(ids: list[int]) -> list[dict]:
    """Split a generated token sequence into timestamped text segments.

    Tokens whose id lies in ``[_TS_START_ID, _TS_END_ID]`` are timestamp
    tokens; their time is ``(id - _TS_START_ID) * _TS_STEP`` seconds. The
    first timestamp opens a segment, the next one closes it, and the tokens in
    between are decoded as its text. Tokens seen while no segment is open
    (such as the decoder prefix) are ignored.

    If the sequence ends while a segment is still open (for example when
    generation stops at ``max_new_tokens`` before the closing timestamp), the
    pending text is emitted with ``"end": None`` instead of being dropped.

    Args:
        ids: token ids of one generated sequence.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts in generation order.
        ``start``/``end`` are seconds rounded to two decimals; ``end`` is
        ``None`` only for an unterminated final segment. Segments whose
        decoded text is empty are omitted.
    """
    segments: list[dict] = []

    def _emit(start: float, end, text_ids: list[int]) -> None:
        text = _processor.tokenizer.decode(text_ids, skip_special_tokens=True).strip()
        if text:
            segments.append({
                "start": round(start, 2),
                "end": None if end is None else round(end, 2),
                "text": text,
            })

    cur_start = None
    cur_text_ids: list[int] = []
    for t in ids:
        if _TS_START_ID <= t <= _TS_END_ID:
            ts = (t - _TS_START_ID) * _TS_STEP
            if cur_start is None:
                cur_start = ts
            else:
                _emit(cur_start, ts, cur_text_ids)
                cur_start = None
            cur_text_ids = []
        elif cur_start is not None:
            cur_text_ids.append(t)

    # Flush text that followed a start timestamp but never got a closing one.
    if cur_start is not None and cur_text_ids:
        _emit(cur_start, None, cur_text_ids)
    return segments
96
+
97
+
98
def _decode_with_ffmpeg(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Transcode arbitrary audio bytes to mono WAV at ``SR`` Hz via ffmpeg and read it."""
    import subprocess
    import tempfile

    with tempfile.NamedTemporaryFile(suffix=".bin") as fin:
        fin.write(audio_bytes)
        fin.flush()
        try:
            result = subprocess.run(
                [
                    "ffmpeg", "-nostdin", "-hide_banner", "-loglevel", "error",
                    "-i", fin.name, "-f", "wav", "-ac", "1", "-ar", str(SR), "-",
                ],
                capture_output=True, check=True,
            )
        except subprocess.CalledProcessError as err:
            # With -loglevel error, stderr holds only the failure reason; show
            # its last line rather than the bare "non-zero exit status".
            lines = err.stderr.decode("utf-8", errors="replace").strip().splitlines()
            reason = lines[-1] if lines else f"ffmpeg exit status {err.returncode}"
            raise ValueError(f"Could not decode audio: {reason}") from err
    return sf.read(io.BytesIO(result.stdout))


def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Decode uploaded audio bytes into ``(samples, sample_rate)``.

    soundfile is tried first on the raw bytes. If it cannot read them, the
    bytes are written to a temporary file and transcoded by ffmpeg to mono WAV
    at ``SR`` Hz, which is then read with soundfile.

    Args:
        audio_bytes: raw contents of the uploaded or recorded file.

    Returns:
        The ``(data, samplerate)`` pair returned by ``soundfile.read``.

    Raises:
        ValueError: if ffmpeg cannot decode the input either; the message
            carries the last line of ffmpeg's error output.
    """
    try:
        return sf.read(io.BytesIO(audio_bytes))
    except Exception:
        # Deliberate best-effort: whatever soundfile rejects (the browser
        # recorder in this app produces audio/webm) is handed to ffmpeg.
        return _decode_with_ffmpeg(audio_bytes)
111
+
112
+
113
def transcribe_bytes(audio_bytes: bytes) -> dict:
    """Transcribe both speakers from raw audio bytes.

    The audio is decoded, clipped to the first 30 seconds, down-mixed to mono,
    resampled to ``SR`` Hz when needed, and then decoded twice: once with the
    speaker-1 prefix token and once with the speaker-2 prefix token.

    Args:
        audio_bytes: raw contents of an audio file.

    Returns:
        A dict with ``duration_s`` (length of the audio actually transcribed),
        ``elapsed_s`` (wall-clock processing time) and ``speaker1`` /
        ``speaker2``, each holding a ``segments`` list.

    Raises:
        ValueError: if the decoded audio contains no samples.
    """
    t0 = time.time()
    arr, orig_sr = _decode_audio(audio_bytes)
    arr = np.asarray(arr, dtype=np.float32)
    if arr.size == 0:
        raise ValueError("Audio contains no samples.")

    # Clip at the source rate *before* down-mixing and resampling so that at
    # most 30 s of audio is processed, however long the upload is.
    arr = arr[: 30 * orig_sr]
    if arr.ndim > 1:
        arr = arr.mean(axis=1)  # (frames, channels) -> mono
    if orig_sr != SR:
        import librosa
        arr = librosa.resample(arr, orig_sr=orig_sr, target_sr=SR)
    # Guard against an off-by-a-few-samples result from the resampler.
    arr = arr[: 30 * SR]

    s1 = _infer(arr, _tok_ids["spk1"])
    s2 = _infer(arr, _tok_ids["spk2"])
    return {
        "duration_s": float(len(arr) / SR),
        "elapsed_s": time.time() - t0,
        "speaker1": {"segments": s1},
        "speaker2": {"segments": s2},
    }
133
+
134
+
135
+ INDEX_HTML = r"""<!DOCTYPE html>
136
+ <html lang="en">
137
+ <head>
138
+ <meta charset="utf-8">
139
+ <meta name="viewport" content="width=device-width, initial-scale=1">
140
+ <title>Trelis Chorus</title>
141
+ <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
142
+ <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800&display=swap" rel="stylesheet">
143
+ <style>
144
+ :root {
145
+ --trelis-blue: #0d579b; --trelis-blue-50: #e8f2fc;
146
+ --trelis-green: #329239; --trelis-green-50: #e8f5e9;
147
+ --trelis-orange: #f7931a; --trelis-orange-50: #fff4e5;
148
+ --text-primary: #1a1a2e; --text-secondary: #4a5568; --text-muted: #718096;
149
+ --bg-primary: #ffffff; --bg-secondary: #fafbfc; --bg-accent: #f0f4f8;
150
+ --shadow-sm: 0 2px 4px rgba(0,0,0,.06); --shadow-md: 0 4px 12px rgba(0,0,0,.08);
151
+ --radius-sm: 8px; --radius-md: 16px; --radius-full: 9999px;
152
+ }
153
+ body { font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif; color:var(--text-primary); background:var(--bg-primary); min-height:100vh; }
154
+ .navbar { background:var(--bg-primary); border-bottom:1px solid rgba(0,0,0,.06); padding:1rem 1.5rem; position:relative; }
155
+ .navbar::after { content:''; position:absolute; bottom:0; left:0; right:0; height:3px; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); }
156
+ .navbar-brand { font-weight:800; font-size:1.4rem; color:var(--text-primary)!important; display:flex; align-items:center; gap:.75rem; }
157
+ .brand-dot { width:14px; height:14px; border-radius:50%; background:linear-gradient(135deg,var(--trelis-blue),var(--trelis-green),var(--trelis-orange)); box-shadow:0 0 0 3px rgba(13,87,155,.08); }
158
+ .model-chip { font-family:'SF Mono',Monaco,monospace; font-size:.72rem; color:var(--text-muted); padding:.25rem .6rem; background:var(--bg-accent); border-radius:var(--radius-full); }
159
+ .hero { background:linear-gradient(180deg,var(--bg-secondary) 0%,var(--bg-primary) 100%); padding:3rem 0 2rem; }
160
+ .hero h1 { font-weight:800; font-size:2.75rem; margin-bottom:.75rem; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; }
161
+ .hero p { color:var(--text-secondary); font-size:1.1rem; max-width:640px; margin-bottom:0; }
162
+ .card { background:var(--bg-primary); border:1px solid rgba(0,0,0,.06); border-radius:var(--radius-md); box-shadow:var(--shadow-sm); transition:.3s cubic-bezier(.4,0,.2,1); }
163
+ .card:hover { box-shadow:var(--shadow-md); }
164
+ .card-body { padding:1.5rem; }
165
+ .btn-primary { background:var(--trelis-blue); border:none; border-radius:var(--radius-full); padding:.65rem 1.75rem; font-weight:700; color:#fff; box-shadow:var(--shadow-sm); transition:.2s; }
166
+ .btn-primary:hover:not(:disabled) { background:#0c4a85; box-shadow:var(--shadow-md); transform:translateY(-1px); }
167
+ .btn-primary:disabled { opacity:.6; }
168
+ .btn-outline-secondary { border-radius:var(--radius-full); font-weight:600; padding:.6rem 1.5rem; border-color:#dee2e6; color:var(--text-secondary); }
169
+ .btn-outline-secondary:hover { background:var(--bg-accent); border-color:var(--trelis-blue); color:var(--trelis-blue); }
170
+ .upload-zone { border:2px dashed #dee2e6; border-radius:var(--radius-md); padding:2rem; text-align:center; transition:.2s; cursor:pointer; background:var(--bg-secondary); }
171
+ .upload-zone:hover { border-color:var(--trelis-blue); background:var(--trelis-blue-50); }
172
+ .upload-zone.has-file { border-color:var(--trelis-green); background:var(--trelis-green-50); }
173
+ .upload-zone input[type=file] { display:none; }
174
+ .upload-icon { font-size:2rem; color:var(--text-muted); margin-bottom:.5rem; }
175
+ .upload-zone.has-file .upload-icon { color:var(--trelis-green); }
176
+ audio { width:100%; margin-top:1rem; border-radius:var(--radius-full); }
177
+ audio::-webkit-media-controls-panel { background:var(--bg-accent); }
178
+ .speaker-card { padding:1.25rem 1.5rem; border-radius:var(--radius-md); background:var(--bg-primary); box-shadow:var(--shadow-sm); border:1px solid rgba(0,0,0,.06); height:100%; position:relative; overflow:hidden; }
179
+ .speaker-card::before { content:''; position:absolute; top:0; left:0; bottom:0; width:4px; }
180
+ .speaker-card.s1::before { background:linear-gradient(180deg,var(--trelis-blue),#1e70b8); }
181
+ .speaker-card.s2::before { background:linear-gradient(180deg,var(--trelis-orange),#ff9f2e); }
182
+ .speaker-label { display:inline-flex; align-items:center; gap:.5rem; font-size:.75rem; font-weight:700; text-transform:uppercase; letter-spacing:.05em; padding:.3rem .7rem; border-radius:var(--radius-full); margin-bottom:.75rem; }
183
+ .s1 .speaker-label { background:var(--trelis-blue-50); color:var(--trelis-blue); }
184
+ .s2 .speaker-label { background:var(--trelis-orange-50); color:var(--trelis-orange); }
185
+ .segment { padding:.5rem .75rem; margin:.25rem 0; border-radius:var(--radius-sm); cursor:pointer; transition:.15s; display:flex; align-items:baseline; gap:.75rem; line-height:1.5; }
186
+ .segment:hover { background:var(--bg-accent); }
187
+ .s1 .segment:hover { background:var(--trelis-blue-50); }
188
+ .s2 .segment:hover { background:var(--trelis-orange-50); }
189
+ .timestamp { font-family:'SF Mono',Monaco,monospace; font-size:.75rem; color:var(--text-muted); flex-shrink:0; min-width:3rem; padding:.1rem .4rem; background:var(--bg-accent); border-radius:4px; }
190
+ .segment-text { color:var(--text-primary); }
191
+ .mic-select { width:auto; max-width:240px; border-radius:var(--radius-full); padding:.4rem 1rem; font-size:.85rem; border-color:#dee2e6; color:var(--text-secondary); }
192
+ .mic-select:focus { border-color:var(--trelis-blue); box-shadow:0 0 0 .2rem rgba(13,87,155,.15); }
193
+ #recordBtn { display:inline-flex; align-items:center; gap:.5rem; }
194
+ .record-dot { width:10px; height:10px; border-radius:50%; background:#c0c0c0; transition:.2s; flex-shrink:0; }
195
+ #recordBtn.recording .record-dot { background:#dc3545; animation: pulse 1.2s ease-in-out infinite; }
196
+ #recordBtn.recording { color:#dc3545; border-color:#dc3545; }
197
+ @keyframes pulse { 0%,100% { box-shadow:0 0 0 0 rgba(220,53,69,.5); } 50% { box-shadow:0 0 0 6px rgba(220,53,69,0); } }
198
+ #status { font-size:.9rem; color:var(--text-secondary); }
199
+ .spinner-border-sm { width:.9rem; height:.9rem; border-width:.15em; color:var(--trelis-blue); }
200
+ .empty { color:var(--text-muted); font-style:italic; }
201
+ .cpu-note { background:var(--trelis-orange-50); border:1px solid var(--trelis-orange); color:var(--trelis-brown,#92400e); border-radius:var(--radius-sm); padding:.75rem 1rem; font-size:.9rem; margin-bottom:1rem; }
202
+ </style>
203
+ </head>
204
+ <body>
205
+
206
+ <nav class="navbar">
207
+ <div class="container d-flex justify-content-between align-items-center">
208
+ <a class="navbar-brand" href="#"><span class="brand-dot"></span>Trelis Chorus</a>
209
+ <span class="model-chip">model: <span id="modelRepo">...</span> · <span id="device">...</span></span>
210
+ </div>
211
+ </nav>
212
+
213
+ <section class="hero">
214
+ <div class="container">
215
+ <h1>Separate two voices<br>from a single stream.</h1>
216
+ <p>Multi-speaker Whisper fine-tune by Trelis. Upload audio of two people talking &mdash; possibly overlapping &mdash; and Trelis Chorus returns a transcript for each speaker with timestamps.</p>
217
+ </div>
218
+ </section>
219
+
220
+ <div class="container pb-5">
221
+ <div class="cpu-note">
222
+ <strong>Running on CPU</strong> &mdash; transcription takes ~30-60s per 30s of audio. First request also downloads the model (~3GB, one-off).
223
+ </div>
224
+
225
+ <div class="card mb-4">
226
+ <div class="card-body">
227
+ <label for="audioFile" class="upload-zone" id="uploadZone">
228
+ <div class="upload-icon">&uarr;</div>
229
+ <div id="uploadLabel"><strong>Click to upload</strong> or drop an audio file here</div>
230
+ <div class="text-muted small mt-1">WAV, MP3, M4A, FLAC &mdash; up to 30s</div>
231
+ <input type="file" id="audioFile" accept="audio/*">
232
+ </label>
233
+
234
+ <div class="d-flex flex-wrap gap-2 mt-3 align-items-center">
235
+ <button id="transcribeBtn" class="btn btn-primary" disabled>Transcribe</button>
236
+ <button id="recordBtn" class="btn btn-outline-secondary">
237
+ <span class="record-dot"></span>
238
+ <span id="recordLabel">Record (two speakers)</span>
239
+ </button>
240
+ <select id="micSelect" class="form-select form-select-sm mic-select" title="Recording device">
241
+ <option value="">Default microphone</option>
242
+ </select>
243
+ <button class="btn btn-outline-secondary sample-btn" data-sample="ami" data-label="AMI meeting &mdash; 78% overlap, multi-turn">Meeting sample</button>
244
+ <button class="btn btn-outline-secondary sample-btn" data-sample="librispeech" data-label="LibriSpeechMix &mdash; 50% overlap, 2 speakers">Read speech sample</button>
245
+ <span id="status" class="ms-2"></span>
246
+ </div>
247
+
248
+ <audio id="audioPlayer" controls style="display:none;"></audio>
249
+ </div>
250
+ </div>
251
+
252
+ <div id="results" style="display:none;">
253
+ <div class="row g-3">
254
+ <div class="col-md-6">
255
+ <div class="speaker-card s1">
256
+ <span class="speaker-label">Speaker 1</span>
257
+ <div id="s1Segments"></div>
258
+ </div>
259
+ </div>
260
+ <div class="col-md-6">
261
+ <div class="speaker-card s2">
262
+ <span class="speaker-label">Speaker 2</span>
263
+ <div id="s2Segments"></div>
264
+ </div>
265
+ </div>
266
+ </div>
267
+ </div>
268
+ </div>
269
+
270
+ <script>
271
+ const fileInput = document.getElementById('audioFile');
272
+ const uploadZone = document.getElementById('uploadZone');
273
+ const uploadLabel = document.getElementById('uploadLabel');
274
+ const audioPlayer = document.getElementById('audioPlayer');
275
+ const transcribeBtn = document.getElementById('transcribeBtn');
276
+ const statusEl = document.getElementById('status');
277
+ const results = document.getElementById('results');
278
+ let audioBlob = null;
279
+
280
+ fetch('/info').then(r => r.json()).then(d => {
281
+ document.getElementById('modelRepo').textContent = d.model_repo;
282
+ document.getElementById('device').textContent = d.device;
283
+ });
284
+
285
// Make `blob` the current audio: preview it in the player, enable the
// Transcribe button, and reset any previous results/status.
function setAudio(blob, label) {
  // Release the object URL of the previous preview so replaced blobs can be freed.
  if (audioPlayer.src.startsWith('blob:')) URL.revokeObjectURL(audioPlayer.src);
  audioBlob = blob;
  audioPlayer.src = URL.createObjectURL(blob);
  audioPlayer.style.display = 'block';
  transcribeBtn.disabled = false;
  uploadZone.classList.add('has-file');
  // `label` can be a user-supplied file name, so insert it as text, not markup.
  const strong = document.createElement('strong');
  strong.textContent = label;
  uploadLabel.replaceChildren(strong, ' ready');
  results.style.display = 'none';
  statusEl.textContent = '';
}
295
+
296
+ fileInput.addEventListener('change', e => {
297
+ const f = e.target.files[0];
298
+ if (!f) return;
299
+ setAudio(f, f.name);
300
+ });
301
+
302
+ // ---- Browser recording ----
303
+ let mediaRec = null, recChunks = [], recTimer = null, recStart = 0;
304
+ const recordBtn = document.getElementById('recordBtn');
305
+ const recordLabel = document.getElementById('recordLabel');
306
+ const micSelect = document.getElementById('micSelect');
307
+ const MAX_REC_SEC = 30;
308
+
309
// Rebuild the microphone dropdown from the currently available audio inputs,
// keeping the user's current selection when there is one.
async function populateMics() {
  try {
    const allDevices = await navigator.mediaDevices.enumerateDevices();
    const inputs = allDevices.filter(dev => dev.kind === 'audioinput');
    const previous = micSelect.value;
    micSelect.innerHTML = '<option value="">Default microphone</option>';
    inputs.forEach((dev, idx) => {
      const option = document.createElement('option');
      option.value = dev.deviceId;
      // Fall back to a numbered placeholder when the device has no label.
      option.textContent = dev.label || `Microphone ${idx + 1}`;
      micSelect.appendChild(option);
    });
    if (previous) micSelect.value = previous;
  } catch (err) { /* ignore */ }
}
324
+
325
// Set once microphone access has been granted.
let micsUnlocked = false;

// Request microphone access a single time (releasing the stream immediately)
// and then refresh the device dropdown.
async function unlockMics() {
  if (!micsUnlocked) {
    try {
      const probe = await navigator.mediaDevices.getUserMedia({ audio: true });
      for (const track of probe.getTracks()) track.stop();
      micsUnlocked = true;
      await populateMics();
    } catch (err) { /* user denied — leave fallback list */ }
  }
}
335
+ micSelect.addEventListener('mousedown', unlockMics);
336
+ micSelect.addEventListener('focus', unlockMics);
337
+
338
+ populateMics();
339
+ if (navigator.mediaDevices && navigator.mediaDevices.addEventListener) {
340
+ navigator.mediaDevices.addEventListener('devicechange', populateMics);
341
+ }
342
+
343
// Toggle recording: the first click starts a take (auto-stopped after
// MAX_REC_SEC seconds), a click while recording stops it.
recordBtn.addEventListener('click', async () => {
  if (mediaRec && mediaRec.state === 'recording') { stopRecording(); return; }
  let stream = null;
  try {
    const audioConstraints = { channelCount: 1, sampleRate: 16000 };
    if (micSelect.value) audioConstraints.deviceId = { exact: micSelect.value };
    stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
    micsUnlocked = true;
    populateMics();  // refresh the device list now that access was granted
    const activeStream = stream;
    const mime = MediaRecorder.isTypeSupported('audio/webm;codecs=opus') ? 'audio/webm;codecs=opus' : 'audio/webm';
    mediaRec = new MediaRecorder(activeStream, { mimeType: mime });
    recChunks = [];
    recStart = Date.now();
    mediaRec.ondataavailable = e => { if (e.data.size > 0) recChunks.push(e.data); };
    mediaRec.onstop = () => {
      activeStream.getTracks().forEach(t => t.stop());
      const blob = new Blob(recChunks, { type: mime });
      setAudio(blob, `Recording (${((Date.now()-recStart)/1000).toFixed(1)}s)`);
      recordBtn.classList.remove('recording');
      recordLabel.textContent = 'Record (two speakers)';
      if (recTimer) { clearInterval(recTimer); recTimer = null; }
    };
    mediaRec.start();
    recordBtn.classList.add('recording');
    recTimer = setInterval(() => {
      const sec = (Date.now() - recStart) / 1000;
      recordLabel.textContent = `Stop recording (${sec.toFixed(0)}s)`;
      if (sec >= MAX_REC_SEC) stopRecording();
    }, 200);
  } catch (err) {
    // If setup failed after the microphone was opened, release it; otherwise
    // the capture stream would stay live with nothing recording from it.
    if (stream) stream.getTracks().forEach(t => t.stop());
    // Show the message as text so it can never be interpreted as markup.
    const msg = document.createElement('span');
    msg.className = 'text-danger';
    msg.textContent = `Mic error: ${err.message}`;
    statusEl.replaceChildren(msg);
  }
});
375
+
376
// Stop the active recording; does nothing when no recording is in progress.
function stopRecording() {
  if (!mediaRec || mediaRec.state !== 'recording') return;
  mediaRec.stop();
}
377
+
378
// Each sample button fetches a bundled clip from /sample/<name> and loads it
// as the current audio.
document.querySelectorAll('.sample-btn').forEach(btn => {
  btn.addEventListener('click', async () => {
    const which = btn.dataset.sample;
    const label = btn.dataset.label;
    btn.disabled = true;
    statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Loading sample...';
    try {
      const r = await fetch(`/sample/${which}`);
      // Without this check an error response body would be loaded as "audio".
      if (!r.ok) throw new Error(`HTTP ${r.status}`);
      setAudio(await r.blob(), label);
    } catch (err) {
      // Replace the spinner with the failure reason instead of leaving it spinning.
      const msg = document.createElement('span');
      msg.className = 'text-danger';
      msg.textContent = `Could not load sample: ${err.message}`;
      statusEl.replaceChildren(msg);
    } finally {
      btn.disabled = false;
    }
  });
});
393
+
394
+ ['dragover','dragenter'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = 'var(--trelis-blue)'; }));
395
+ ['dragleave','drop'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = ''; }));
396
+ uploadZone.addEventListener('drop', e => {
397
+ const f = e.dataTransfer.files[0];
398
+ if (f) { fileInput.files = e.dataTransfer.files; setAudio(f, f.name); }
399
+ });
400
+
401
// Upload the current audio to /transcribe and render both speakers' segments.
transcribeBtn.addEventListener('click', async () => {
  if (!audioBlob) return;
  transcribeBtn.disabled = true;
  statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Transcribing... (CPU, slow)';
  results.style.display = 'none';
  const fd = new FormData();
  fd.append('file', audioBlob, 'audio.wav');
  // Status messages may contain server-provided text, so set them as text
  // content rather than building markup from them.
  const showStatus = (cls, text) => {
    const msg = document.createElement('span');
    msg.className = cls;
    msg.textContent = text;
    statusEl.replaceChildren(msg);
  };
  try {
    const r = await fetch('/transcribe', { method:'POST', body:fd });
    if (!r.ok) throw new Error(`HTTP ${r.status}: ${await r.text()}`);
    const data = await r.json();
    render('s1Segments', data.speaker1.segments);
    render('s2Segments', data.speaker2.segments);
    results.style.display = 'block';
    showStatus('text-success', `Done in ${data.elapsed_s.toFixed(1)}s`);
  } catch (err) {
    showStatus('text-danger', `Error: ${err.message}`);
  } finally {
    transcribeBtn.disabled = false;
  }
});
422
+
423
// Fill the element with id `elId` with one clickable row per segment;
// clicking a row seeks the audio player to that segment's start time.
function render(elId, segs) {
  const container = document.getElementById(elId);
  container.innerHTML = '';
  if (segs.length === 0) {
    container.innerHTML = '<div class="empty">No speech detected.</div>';
    return;
  }
  segs.forEach(seg => {
    const row = document.createElement('div');
    row.className = 'segment';
    row.innerHTML = `<span class="timestamp">${seg.start.toFixed(2)}</span><span class="segment-text">${esc(seg.text)}</span>`;
    row.addEventListener('click', () => {
      audioPlayer.currentTime = seg.start;
      audioPlayer.play();
    });
    container.appendChild(row);
  });
}
435
+
436
// Escape &, < and > so a value can be placed inside element content.
function esc(s) {
  const entities = { '&': '&amp;', '<': '&lt;', '>': '&gt;' };
  return String(s).replace(/[&<>]/g, ch => entities[ch]);
}
437
+ </script>
438
+ </body>
439
+ </html>
440
+ """
441
+
442
+ app = FastAPI()
443
+
444
+
445
@app.on_event("startup")
def startup():
    """Load the model when the server starts (no-op if it is already loaded)."""
    load_model()
448
+
449
+
450
@app.get("/", response_class=HTMLResponse)
def index():
    """Serve the single-page UI."""
    page = INDEX_HTML
    return page
453
+
454
+
455
@app.get("/info")
def info():
    """Report the model repository and device shown in the page header."""
    payload = {"model_repo": MODEL_REPO, "device": DEVICE}
    return payload
458
+
459
+
460
# Sample name used in the URL -> file name under the static/ directory.
_SAMPLES = {"librispeech": "sample.wav", "ami": "sample_ami.wav"}


@app.get("/sample/{name}")
def sample(name: str):
    """Return one of the bundled sample clips as ``audio/wav``.

    Raises:
        HTTPException: 404 when ``name`` is not a known sample, or when the
            mapped file is missing from the static directory.
    """
    fname = _SAMPLES.get(name)
    if not fname:
        raise HTTPException(404, f"Unknown sample: {name}")

    static_dir = Path(__file__).parent / "static"
    clip_path = static_dir / fname
    if not clip_path.exists():
        raise HTTPException(404, f"Sample file not found: {fname}")
    return FileResponse(str(clip_path), media_type="audio/wav")
475
+
476
+
477
# Serialises inference across requests. Created lazily inside the running
# event loop rather than at import time.
_transcribe_lock = None


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return per-speaker segments as JSON.

    Inference is CPU-bound and can take tens of seconds, so it runs in a
    worker thread; calling it directly from this coroutine would freeze the
    event loop (and with it ``/`` and ``/info``) for the whole request. A lock
    keeps inference strictly one-at-a-time, as it was when the call blocked
    the loop.

    Raises:
        HTTPException: 400 when the upload exceeds 50 MB; 500 when decoding or
            inference fails.
    """
    import asyncio

    global _transcribe_lock
    max_bytes = 50 * 1024 * 1024

    # Reject oversized uploads before buffering them when the size is known.
    declared_size = getattr(file, "size", None)
    if declared_size is not None and declared_size > max_bytes:
        raise HTTPException(400, "File too large (50MB max).")
    audio_bytes = await file.read()
    if len(audio_bytes) > max_bytes:
        raise HTTPException(400, "File too large (50MB max).")

    # No await between the check and the assignment, so this is race-free on
    # the single-threaded event loop.
    if _transcribe_lock is None:
        _transcribe_lock = asyncio.Lock()
    try:
        async with _transcribe_lock:
            result = await asyncio.to_thread(transcribe_bytes, audio_bytes)
        return JSONResponse(result)
    except Exception as e:
        raise HTTPException(500, f"Inference failed: {e}") from e
486
+
487
+
488
if __name__ == "__main__":
    # HF Spaces default port is 7860; the PORT environment variable overrides it.
    port = int(os.environ.get("PORT", 7860))
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=port,
    )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ torch>=2.0.0
2
+ transformers>=4.48.0,<5.0.0
3
+ soundfile>=0.12.0
4
+ librosa>=0.10.0
5
+ numpy>=1.24.0
6
+ fastapi[standard]
7
+ python-multipart
8
+ uvicorn
9
+ huggingface_hub>=0.32.0,<2.0.0
static/sample.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b97d3b33c3e99e3a7eb69665b25752d21ddb602591dd59fa2d5ee9ff4ff1173d
3
+ size 606960
static/sample_ami.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a80537a734ae0c1f3518cee7782141673e743612e99126fb8341f9d88ade979b
3
+ size 1583440