| """ |
| Trelis Chorus — HF Space demo (CPU inference). |
| |
| Loads the merged Chorus model (base Whisper Turbo + LoRA merged + |
| expanded tokenizer) once and serves a FastAPI + vanilla-JS UI that |
| accepts uploaded or recorded audio and returns S1/S2 transcripts. |
| |
| CPU inference takes ~30-60s per 30s clip on the free HF Space tier. |
| GPU tier would make this near-instant. |
| """ |
import asyncio
import io
import os
import re
import threading
import time
from pathlib import Path

import numpy as np
import soundfile as sf
import torch
import uvicorn
from fastapi import FastAPI, File, HTTPException, UploadFile
from fastapi.responses import FileResponse, HTMLResponse, JSONResponse
|
|
| |
# Hub repository of the merged model; overridable through the environment.
MODEL_REPO = os.environ.get("CHORUS_MODEL_REPO", "Trelis/Chorus-v1")
# Prompt tokens that select which speaker's transcript is decoded.
SPEAKER1_TOKEN = "<|speaker1|>"
SPEAKER2_TOKEN = "<|speaker2|>"
# Sample rate (Hz) all audio is converted to before feature extraction.
SR = 16_000

# Half precision on GPU, full precision on CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
_GPU_NAME = torch.cuda.get_device_name(0) if DEVICE == "cuda" else None
print(f"[chorus-space] Device: {DEVICE} ({DTYPE}){' — ' + _GPU_NAME if _GPU_NAME else ''}, model: {MODEL_REPO}")

# State filled in by load_model(); the -1 ids mean "not loaded yet".
_model = None
_processor = None
_tok_ids: dict = {}          # short name -> prompt token id
_TS_START_ID: int = -1       # id of the "<|0.00|>" timestamp token
_TS_END_ID: int = -1         # id of the "<|30.00|>" timestamp token
_TS_STEP = 0.02              # seconds represented by one timestamp-token step
|
|
|
|
def load_model():
    """Load the Chorus processor and model once and cache them in module globals.

    Returns immediately when a model is already cached. Besides the model and
    processor, this resolves and stores the ids of the prompt tokens
    (``_tok_ids``) and of the first/last timestamp tokens.
    """
    global _model, _processor, _TS_START_ID, _TS_END_ID
    if _model is not None:
        return

    # Imported here rather than at module top.
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    # Either variable may carry the Hub token; None is passed when neither is set.
    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    print(f"[chorus-space] Loading {MODEL_REPO}...")
    started = time.time()

    processor = WhisperProcessor.from_pretrained(MODEL_REPO, token=hf_token)
    model = WhisperForConditionalGeneration.from_pretrained(
        MODEL_REPO, token=hf_token, dtype=DTYPE
    )
    model = model.to(DEVICE).eval()
    model.generation_config.predict_timestamps = True
    # NOTE(review): 1500 steps * _TS_STEP (0.02 s) = 30 s, i.e. presumably lets
    # the first timestamp fall anywhere in the window — confirm.
    model.generation_config.max_initial_timestamp_index = 1500

    token_to_id = processor.tokenizer.convert_tokens_to_ids
    prompt_tokens = (
        ("spk1", SPEAKER1_TOKEN),
        ("spk2", SPEAKER2_TOKEN),
        ("en", "<|en|>"),
        ("transcribe", "<|transcribe|>"),
    )
    for key, token in prompt_tokens:
        _tok_ids[key] = token_to_id(token)
    _TS_START_ID = token_to_id("<|0.00|>")
    _TS_END_ID = token_to_id("<|30.00|>")

    # Publish the model last: `_model is not None` is the "already loaded" check.
    _processor = processor
    _model = model
    print(f"[chorus-space] Model ready in {time.time()-started:.1f}s (ts range: {_TS_START_ID}..{_TS_END_ID})")
|
|
|
|
def _infer(arr: np.ndarray, spk_id: int) -> list[dict]:
    """Transcribe one speaker's speech from a clip.

    Args:
        arr: Waveform samples, handed to the feature extractor with
            ``sampling_rate=SR``.
        spk_id: Id of the speaker token forced at decoder position 3, after
            the ``<|en|>`` and ``<|transcribe|>`` tokens.

    Returns:
        The segments parsed from the generated ids (see ``_parse_segments``).
    """
    feats = _processor.feature_extractor(
        [arr], sampling_rate=SR, return_tensors="pt"
    ).input_features.to(DEVICE).to(DTYPE)
    # Each entry is [decoder position, token id].
    forced = [[1, _tok_ids["en"]], [2, _tok_ids["transcribe"]], [3, spk_id]]
    with torch.no_grad():
        # NOTE(review): 444 is presumably chosen so prompt + output stay within
        # the decoder's length limit — confirm against the model config.
        out = _model.generate(
            feats, forced_decoder_ids=forced,
            return_timestamps=True, max_new_tokens=444,
        )
    # Pass the real clip length so an unterminated final segment can be closed
    # at the end of the audio.
    return _parse_segments(out[0].tolist(), clip_end=len(arr) / SR)


def _append_segment(segments: list[dict], start: float, end: float, text_ids: list[int]) -> None:
    """Decode ``text_ids`` and append a segment dict unless the text is empty."""
    text = _processor.tokenizer.decode(text_ids, skip_special_tokens=True).strip()
    if text:
        segments.append({"start": round(start, 2), "end": round(end, 2), "text": text})


def _parse_segments(ids: list[int], clip_end: "float | None" = None) -> list[dict]:
    """Split generated token ids into timestamped text segments.

    Ids in ``[_TS_START_ID, _TS_END_ID]`` are timestamp tokens; each one is
    worth ``_TS_STEP`` seconds per step above ``_TS_START_ID``. A timestamp
    opens a segment when none is open and closes it otherwise. Tokens seen
    before the first opening timestamp are ignored.

    Args:
        ids: Token ids returned by ``generate`` for one sequence.
        clip_end: Clip duration in seconds, used as the end time of a final
            segment that never received a closing timestamp. When omitted,
            the time of ``_TS_END_ID`` is used instead.

    Returns:
        A list of ``{"start", "end", "text"}`` dicts; segments whose decoded
        text is empty are skipped.
    """
    segments: list[dict] = []
    cur_start = None
    cur_text_ids: list[int] = []
    for t in ids:
        if _TS_START_ID <= t <= _TS_END_ID:
            ts = (t - _TS_START_ID) * _TS_STEP
            if cur_start is None:
                cur_start = ts
            else:
                _append_segment(segments, cur_start, ts, cur_text_ids)
                cur_start = None
                cur_text_ids = []
        elif cur_start is not None:
            cur_text_ids.append(t)

    # Generation can stop (end-of-text or the token budget) after a segment was
    # opened but before its closing timestamp; keep that text instead of
    # silently dropping it.
    if cur_start is not None and cur_text_ids:
        window_end = (_TS_END_ID - _TS_START_ID) * _TS_STEP
        end = window_end if clip_end is None else min(clip_end, window_end)
        _append_segment(segments, cur_start, max(cur_start, end), cur_text_ids)
    return segments
|
|
|
|
def _decode_audio(audio_bytes: bytes, *, ffmpeg_timeout_s: float = 60.0) -> tuple[np.ndarray, int]:
    """Decode uploaded audio bytes into samples and a sample rate.

    soundfile is tried first. If it cannot read the data, the bytes are
    written to a temporary file and converted by ``ffmpeg`` to mono WAV at
    ``SR`` Hz, which is then read with soundfile.

    Args:
        audio_bytes: Raw contents of the uploaded file.
        ffmpeg_timeout_s: Upper bound, in seconds, on the ffmpeg conversion.

    Returns:
        ``(samples, sample_rate)`` as returned by ``soundfile.read``.

    Raises:
        ValueError: ffmpeg rejected the input or exceeded the timeout.
        FileNotFoundError: the ``ffmpeg`` executable is not installed.
    """
    try:
        return sf.read(io.BytesIO(audio_bytes))
    except Exception:
        # Anything soundfile cannot read gets a second chance through ffmpeg.
        import subprocess
        import tempfile

        with tempfile.NamedTemporaryFile(suffix=".bin") as fin:
            fin.write(audio_bytes)
            fin.flush()
            cmd = [
                "ffmpeg",
                "-nostdin",               # never wait on (or consume) our stdin
                "-hide_banner", "-loglevel", "error",  # keep stderr to real errors
                "-i", fin.name,
                "-f", "wav", "-ac", "1", "-ar", str(SR),
                "-",
            ]
            try:
                # The upload is untrusted, so bound how long ffmpeg may run.
                result = subprocess.run(
                    cmd, capture_output=True, check=True, timeout=ffmpeg_timeout_s,
                )
            except subprocess.CalledProcessError as err:
                detail = (err.stderr or b"").decode("utf-8", "replace").strip()[-500:]
                raise ValueError(
                    f"Could not decode audio: {detail or 'ffmpeg failed'}"
                ) from err
            except subprocess.TimeoutExpired as err:
                raise ValueError(
                    f"Could not decode audio: ffmpeg exceeded {ffmpeg_timeout_s:.0f}s"
                ) from err
        return sf.read(io.BytesIO(result.stdout))
|
|
|
|
def transcribe_bytes(audio_bytes: bytes) -> dict:
    """Decode an uploaded clip and transcribe it once per speaker.

    The audio is down-mixed to mono, limited to its first 30 seconds,
    resampled to ``SR`` and passed to ``_infer`` with each speaker token.

    Args:
        audio_bytes: Raw contents of the uploaded file.

    Returns:
        A dict with ``duration_s`` (length of the audio actually used),
        ``elapsed_s`` (wall-clock time spent here) and ``speaker1`` /
        ``speaker2``, each ``{"segments": [...]}``.
    """
    t0 = time.time()
    max_seconds = 30
    arr, orig_sr = _decode_audio(audio_bytes)
    # Trim at the source rate first so the down-mix and resample below only
    # touch audio that is kept, not the rest of a long upload.
    arr = np.asarray(arr[: max_seconds * orig_sr], dtype=np.float32)
    if arr.ndim > 1:
        arr = arr.mean(axis=1)  # average the channels down to mono
    if orig_sr != SR and arr.size:
        import librosa
        arr = librosa.resample(arr, orig_sr=orig_sr, target_sr=SR)
    # Guard against the resampler returning a sample or two over the limit.
    arr = arr[: max_seconds * SR]

    if arr.size:
        s1 = _infer(arr, _tok_ids["spk1"])
        s2 = _infer(arr, _tok_ids["spk2"])
    else:
        # Nothing to transcribe: skip two full inference passes over padding.
        s1, s2 = [], []
    return {
        "duration_s": float(len(arr) / SR),
        "elapsed_s": time.time() - t0,
        "speaker1": {"segments": s1},
        "speaker2": {"segments": s2},
    }
|
|
|
|
| INDEX_HTML = r"""<!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="utf-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1"> |
| <title>Trelis Chorus</title> |
| <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet"> |
| <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800&display=swap" rel="stylesheet"> |
| <style> |
| :root { |
| --trelis-blue: #0d579b; --trelis-blue-50: #e8f2fc; |
| --trelis-green: #329239; --trelis-green-50: #e8f5e9; |
| --trelis-orange: #f7931a; --trelis-orange-50: #fff4e5; |
| --text-primary: #1a1a2e; --text-secondary: #4a5568; --text-muted: #718096; |
| --bg-primary: #ffffff; --bg-secondary: #fafbfc; --bg-accent: #f0f4f8; |
| --shadow-sm: 0 2px 4px rgba(0,0,0,.06); --shadow-md: 0 4px 12px rgba(0,0,0,.08); |
| --radius-sm: 8px; --radius-md: 16px; --radius-full: 9999px; |
| } |
| body { font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif; color:var(--text-primary); background:var(--bg-primary); min-height:100vh; } |
| .navbar { background:var(--bg-primary); border-bottom:1px solid rgba(0,0,0,.06); padding:1rem 1.5rem; position:relative; } |
| .navbar::after { content:''; position:absolute; bottom:0; left:0; right:0; height:3px; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); } |
| .navbar-brand { font-weight:800; font-size:1.4rem; color:var(--text-primary)!important; display:flex; align-items:center; gap:.75rem; } |
| .brand-dot { width:14px; height:14px; border-radius:50%; background:linear-gradient(135deg,var(--trelis-blue),var(--trelis-green),var(--trelis-orange)); box-shadow:0 0 0 3px rgba(13,87,155,.08); } |
| .model-chip { font-family:'SF Mono',Monaco,monospace; font-size:.72rem; color:var(--text-muted); padding:.25rem .6rem; background:var(--bg-accent); border-radius:var(--radius-full); } |
| .hero { background:linear-gradient(180deg,var(--bg-secondary) 0%,var(--bg-primary) 100%); padding:3rem 0 2rem; } |
| .hero h1 { font-weight:800; font-size:2.75rem; margin-bottom:.75rem; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; } |
| .hero p { color:var(--text-secondary); font-size:1.1rem; max-width:640px; margin-bottom:0; } |
| .card { background:var(--bg-primary); border:1px solid rgba(0,0,0,.06); border-radius:var(--radius-md); box-shadow:var(--shadow-sm); transition:.3s cubic-bezier(.4,0,.2,1); } |
| .card:hover { box-shadow:var(--shadow-md); } |
| .card-body { padding:1.5rem; } |
| .btn-primary { background:var(--trelis-blue); border:none; border-radius:var(--radius-full); padding:.65rem 1.75rem; font-weight:700; color:#fff; box-shadow:var(--shadow-sm); transition:.2s; } |
| .btn-primary:hover:not(:disabled) { background:#0c4a85; box-shadow:var(--shadow-md); transform:translateY(-1px); } |
| .btn-primary:disabled { opacity:.6; } |
| .btn-outline-secondary { border-radius:var(--radius-full); font-weight:600; padding:.6rem 1.5rem; border-color:#dee2e6; color:var(--text-secondary); } |
| .btn-outline-secondary:hover { background:var(--bg-accent); border-color:var(--trelis-blue); color:var(--trelis-blue); } |
| .upload-zone { border:2px dashed #dee2e6; border-radius:var(--radius-md); padding:2rem; text-align:center; transition:.2s; cursor:pointer; background:var(--bg-secondary); } |
| .upload-zone:hover { border-color:var(--trelis-blue); background:var(--trelis-blue-50); } |
| .upload-zone.has-file { border-color:var(--trelis-green); background:var(--trelis-green-50); } |
| .upload-zone input[type=file] { display:none; } |
| .upload-icon { font-size:2rem; color:var(--text-muted); margin-bottom:.5rem; } |
| .upload-zone.has-file .upload-icon { color:var(--trelis-green); } |
| audio { width:100%; margin-top:1rem; border-radius:var(--radius-full); } |
| audio::-webkit-media-controls-panel { background:var(--bg-accent); } |
| .speaker-card { padding:1.25rem 1.5rem; border-radius:var(--radius-md); background:var(--bg-primary); box-shadow:var(--shadow-sm); border:1px solid rgba(0,0,0,.06); height:100%; position:relative; overflow:hidden; } |
| .speaker-card::before { content:''; position:absolute; top:0; left:0; bottom:0; width:4px; } |
| .speaker-card.s1::before { background:linear-gradient(180deg,var(--trelis-blue),#1e70b8); } |
| .speaker-card.s2::before { background:linear-gradient(180deg,var(--trelis-orange),#ff9f2e); } |
| .speaker-label { display:inline-flex; align-items:center; gap:.5rem; font-size:.75rem; font-weight:700; text-transform:uppercase; letter-spacing:.05em; padding:.3rem .7rem; border-radius:var(--radius-full); margin-bottom:.75rem; } |
| .s1 .speaker-label { background:var(--trelis-blue-50); color:var(--trelis-blue); } |
| .s2 .speaker-label { background:var(--trelis-orange-50); color:var(--trelis-orange); } |
| .segment { padding:.5rem .75rem; margin:.25rem 0; border-radius:var(--radius-sm); cursor:pointer; transition:.15s; display:flex; align-items:baseline; gap:.75rem; line-height:1.5; } |
| .segment:hover { background:var(--bg-accent); } |
| .s1 .segment:hover { background:var(--trelis-blue-50); } |
| .s2 .segment:hover { background:var(--trelis-orange-50); } |
| .timestamp { font-family:'SF Mono',Monaco,monospace; font-size:.75rem; color:var(--text-muted); flex-shrink:0; min-width:3rem; padding:.1rem .4rem; background:var(--bg-accent); border-radius:4px; } |
| .segment-text { color:var(--text-primary); } |
| .mic-select { width:auto; max-width:240px; border-radius:var(--radius-full); padding:.4rem 2.25rem .4rem 1rem; font-size:.85rem; border-color:#dee2e6; color:var(--text-secondary); } |
| .mic-row label { font-size:.8rem; } |
| .mic-select:focus { border-color:var(--trelis-blue); box-shadow:0 0 0 .2rem rgba(13,87,155,.15); } |
| #recordBtn { display:inline-flex; align-items:center; gap:.5rem; } |
| .record-dot { width:10px; height:10px; border-radius:50%; background:#c0c0c0; transition:.2s; flex-shrink:0; } |
| #recordBtn.recording .record-dot { background:#dc3545; animation: pulse 1.2s ease-in-out infinite; } |
| #recordBtn.recording { color:#dc3545; border-color:#dc3545; } |
| @keyframes pulse { 0%,100% { box-shadow:0 0 0 0 rgba(220,53,69,.5); } 50% { box-shadow:0 0 0 6px rgba(220,53,69,0); } } |
| #status { font-size:.9rem; color:var(--text-secondary); } |
| .spinner-border-sm { width:.9rem; height:.9rem; border-width:.15em; color:var(--trelis-blue); } |
| .empty { color:var(--text-muted); font-style:italic; } |
| .cpu-note { background:var(--trelis-orange-50); border:1px solid var(--trelis-orange); color:var(--trelis-brown,#92400e); border-radius:var(--radius-sm); padding:.75rem 1rem; font-size:.9rem; margin-bottom:1rem; } |
| </style> |
| </head> |
| <body> |
| |
| <nav class="navbar"> |
| <div class="container d-flex justify-content-between align-items-center"> |
| <a class="navbar-brand" href="#"><span class="brand-dot"></span>Trelis Chorus</a> |
| <span class="model-chip">model: <span id="modelRepo">...</span> · <span id="device">...</span></span> |
| </div> |
| </nav> |
| |
| <section class="hero"> |
| <div class="container"> |
| <h1>Separate two voices<br>from a single stream.</h1> |
| <p>Multi-speaker Whisper fine-tune by Trelis. Upload audio of two people talking — possibly overlapping — and Trelis Chorus returns a transcript for each speaker with timestamps.</p> |
| </div> |
| </section> |
| |
| <div class="container pb-5"> |
| <div id="deviceNote" class="cpu-note" style="display:none;"></div> |
| |
| <div class="card mb-4"> |
| <div class="card-body"> |
| <label for="audioFile" class="upload-zone" id="uploadZone"> |
| <div class="upload-icon">↑</div> |
| <div id="uploadLabel"><strong>Click to upload</strong> or drop an audio file here</div> |
| <div class="text-muted small mt-1">WAV, MP3, M4A, FLAC — up to 30s</div> |
| <input type="file" id="audioFile" accept="audio/*"> |
| </label> |
| |
| <div class="d-flex flex-wrap gap-2 mt-3 align-items-center"> |
| <button id="transcribeBtn" class="btn btn-primary" disabled>Transcribe</button> |
| <button id="recordBtn" class="btn btn-outline-secondary"> |
| <span class="record-dot"></span> |
| <span id="recordLabel">Record (two speakers)</span> |
| </button> |
| <button class="btn btn-outline-secondary sample-btn" data-sample="podcast" data-label="Podcast clip — 30s">Try sample</button> |
| <span id="status" class="ms-2"></span> |
| </div> |
| |
| <div class="d-flex align-items-center gap-2 mt-2 mic-row"> |
| <label for="micSelect" class="small text-muted mb-0">Recording mic:</label> |
| <select id="micSelect" class="form-select form-select-sm mic-select" title="Recording device"> |
| <option value="">Default microphone</option> |
| </select> |
| </div> |
| |
| <audio id="audioPlayer" controls style="display:none;"></audio> |
| </div> |
| </div> |
| |
| <div id="results" style="display:none;"> |
| <div class="row g-3"> |
| <div class="col-md-6"> |
| <div class="speaker-card s1"> |
| <span class="speaker-label">Speaker 1</span> |
| <div id="s1Segments"></div> |
| </div> |
| </div> |
| <div class="col-md-6"> |
| <div class="speaker-card s2"> |
| <span class="speaker-label">Speaker 2</span> |
| <div id="s2Segments"></div> |
| </div> |
| </div> |
| </div> |
| </div> |
| </div> |
| |
| <script> |
| const fileInput = document.getElementById('audioFile'); |
| const uploadZone = document.getElementById('uploadZone'); |
| const uploadLabel = document.getElementById('uploadLabel'); |
| const audioPlayer = document.getElementById('audioPlayer'); |
| const transcribeBtn = document.getElementById('transcribeBtn'); |
| const statusEl = document.getElementById('status'); |
| const results = document.getElementById('results'); |
| let audioBlob = null; |
| |
| fetch('/info').then(r => r.json()).then(d => { |
| document.getElementById('modelRepo').textContent = d.model_repo; |
| document.getElementById('device').textContent = d.gpu_name || d.device; |
| const note = document.getElementById('deviceNote'); |
| if (d.device === 'cuda') { |
| note.innerHTML = `<strong>Running on ${d.gpu_name || 'GPU'}</strong> — transcription takes ~2-5s per clip. First request downloads the model (~3GB, one-off).`; |
| } else { |
| note.innerHTML = `<strong>Running on CPU</strong> — transcription takes ~30-60s per 30s of audio. First request downloads the model (~3GB, one-off).`; |
| } |
| note.style.display = 'block'; |
| }); |
| |
// Object URL currently assigned to the player, kept so it can be released.
let audioObjectUrl = null;

/**
 * Make `blob` the pending clip: load it into the player, enable the
 * Transcribe button and clear any previous results and status.
 * `label` is rendered as plain text (it may be a user-supplied file name).
 */
function setAudio(blob, label) {
  audioBlob = blob;
  // Release the previous clip's URL; otherwise every upload or recording
  // stays referenced for the lifetime of the page.
  if (audioObjectUrl) URL.revokeObjectURL(audioObjectUrl);
  audioObjectUrl = URL.createObjectURL(blob);
  audioPlayer.src = audioObjectUrl;
  audioPlayer.style.display = 'block';
  transcribeBtn.disabled = false;
  uploadZone.classList.add('has-file');
  // Build the label from DOM nodes instead of innerHTML so markup inside a
  // file name is shown literally rather than parsed.
  const strong = document.createElement('strong');
  strong.textContent = label;
  uploadLabel.textContent = '';
  uploadLabel.appendChild(strong);
  uploadLabel.appendChild(document.createTextNode(' ready'));
  results.style.display = 'none';
  statusEl.textContent = '';
}
| |
| fileInput.addEventListener('change', e => { |
| const f = e.target.files[0]; |
| if (!f) return; |
| setAudio(f, f.name); |
| }); |
| |
| // ---- Browser recording ---- |
| let mediaRec = null, recChunks = [], recTimer = null, recStart = 0; |
| const recordBtn = document.getElementById('recordBtn'); |
| const recordLabel = document.getElementById('recordLabel'); |
| const micSelect = document.getElementById('micSelect'); |
| const MAX_REC_SEC = 30; |
| |
| async function populateMics() { |
| try { |
| const devices = await navigator.mediaDevices.enumerateDevices(); |
| const mics = devices.filter(d => d.kind === 'audioinput'); |
| const currentValue = micSelect.value; |
| micSelect.innerHTML = '<option value="">Default microphone</option>'; |
| for (const d of mics) { |
| const opt = document.createElement('option'); |
| opt.value = d.deviceId; |
| opt.textContent = d.label || `Microphone ${mics.indexOf(d) + 1}`; |
| micSelect.appendChild(opt); |
| } |
| if (currentValue) micSelect.value = currentValue; |
| } catch (err) { /* ignore */ } |
| } |
| |
| let micsUnlocked = false; |
| async function unlockMics() { |
| if (micsUnlocked) return; |
| try { |
| const s = await navigator.mediaDevices.getUserMedia({ audio: true }); |
| s.getTracks().forEach(t => t.stop()); |
| micsUnlocked = true; |
| await populateMics(); |
| } catch (err) { /* user denied — leave fallback list */ } |
| } |
| micSelect.addEventListener('mousedown', unlockMics); |
| micSelect.addEventListener('focus', unlockMics); |
| |
| populateMics(); |
| if (navigator.mediaDevices && navigator.mediaDevices.addEventListener) { |
| navigator.mediaDevices.addEventListener('devicechange', populateMics); |
| } |
| |
// Start a recording, or stop the one in progress when clicked again.
recordBtn.addEventListener('click', async () => {
  if (mediaRec && mediaRec.state === 'recording') { stopRecording(); return; }
  let stream = null;
  try {
    const audioConstraints = { channelCount: 1, sampleRate: 16000 };
    if (micSelect.value) audioConstraints.deviceId = { exact: micSelect.value };
    stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
    micsUnlocked = true;
    populateMics();
    // Use the first container this browser can record; with none of them
    // supported, let MediaRecorder pick its own default instead of failing.
    const preferred = ['audio/webm;codecs=opus', 'audio/webm', 'audio/mp4'];
    const mime = preferred.find(t => MediaRecorder.isTypeSupported(t));
    const rec = new MediaRecorder(stream, mime ? { mimeType: mime } : undefined);
    const tracks = stream.getTracks();
    mediaRec = rec;
    recChunks = [];
    recStart = Date.now();
    rec.ondataavailable = e => { if (e.data.size > 0) recChunks.push(e.data); };
    rec.onstop = () => {
      tracks.forEach(t => t.stop());
      const blob = new Blob(recChunks, { type: rec.mimeType || mime || '' });
      setAudio(blob, `Recording (${((Date.now()-recStart)/1000).toFixed(1)}s)`);
      recordBtn.classList.remove('recording');
      recordLabel.textContent = 'Record (two speakers)';
      if (recTimer) { clearInterval(recTimer); recTimer = null; }
    };
    rec.start();
    recordBtn.classList.add('recording');
    // Refresh the elapsed time and enforce the maximum recording length.
    recTimer = setInterval(() => {
      const sec = (Date.now() - recStart) / 1000;
      recordLabel.textContent = `Stop recording (${sec.toFixed(0)}s)`;
      if (sec >= MAX_REC_SEC) stopRecording();
    }, 200);
  } catch (err) {
    // If the microphone was opened before the failure, release it so the
    // browser's recording indicator does not stay on.
    if (stream) stream.getTracks().forEach(t => t.stop());
    const msg = document.createElement('span');
    msg.className = 'text-danger';
    msg.textContent = `Mic error: ${err.message}`;
    statusEl.textContent = '';
    statusEl.appendChild(msg);
  }
});
| |
| function stopRecording() { if (mediaRec && mediaRec.state === 'recording') mediaRec.stop(); } |
| |
// "Try sample" buttons: fetch the named sample clip and load it as the pending audio.
document.querySelectorAll('.sample-btn').forEach(btn => {
  btn.addEventListener('click', async () => {
    const which = btn.dataset.sample;
    const label = btn.dataset.label;
    btn.disabled = true;
    statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Loading sample...';
    try {
      const r = await fetch(`/sample/${encodeURIComponent(which)}`);
      // Without this check a 404 body would be loaded into the player as "audio".
      if (!r.ok) throw new Error(`HTTP ${r.status}`);
      setAudio(await r.blob(), label);
    } catch (err) {
      // Replace the spinner with the failure; textContent keeps the message literal.
      const msg = document.createElement('span');
      msg.className = 'text-danger';
      msg.textContent = `Could not load sample: ${err.message}`;
      statusEl.textContent = '';
      statusEl.appendChild(msg);
    } finally {
      btn.disabled = false;
    }
  });
});
| |
| ['dragover','dragenter'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = 'var(--trelis-blue)'; })); |
| ['dragleave','drop'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = ''; })); |
| uploadZone.addEventListener('drop', e => { |
| const f = e.dataTransfer.files[0]; |
| if (f) { fileInput.files = e.dataTransfer.files; setAudio(f, f.name); } |
| }); |
| |
// Upload the pending clip to /transcribe and show both speakers' segments.
transcribeBtn.addEventListener('click', async () => {
  if (!audioBlob) return;
  transcribeBtn.disabled = true;
  statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Transcribing...';
  results.style.display = 'none';
  const fd = new FormData();
  fd.append('file', audioBlob, 'audio.wav');
  try {
    const r = await fetch('/transcribe', { method:'POST', body:fd });
    if (!r.ok) {
      // Prefer the JSON "detail" string when the body has one; otherwise
      // fall back to the raw response text.
      const body = await r.text();
      let detail = body;
      try {
        const parsed = JSON.parse(body);
        if (parsed && typeof parsed.detail === 'string') detail = parsed.detail;
      } catch (_) { /* body was not JSON */ }
      throw new Error(`HTTP ${r.status}: ${detail}`);
    }
    const data = await r.json();
    render('s1Segments', data.speaker1.segments);
    render('s2Segments', data.speaker2.segments);
    results.style.display = 'block';
    statusEl.innerHTML = `<span class="text-success">Done in ${data.elapsed_s.toFixed(1)}s</span>`;
  } catch (err) {
    // The message can contain server-provided text, so show it as plain
    // text instead of parsing it as HTML.
    const msg = document.createElement('span');
    msg.className = 'text-danger';
    msg.textContent = `Error: ${err.message}`;
    statusEl.textContent = '';
    statusEl.appendChild(msg);
  } finally {
    transcribeBtn.disabled = false;
  }
});
| |
/**
 * Fill the element with id `elId` with one row per segment (start time and
 * text). Clicking a row seeks the player to that segment's start and plays.
 */
function render(elId, segs) {
  const container = document.getElementById(elId);
  container.innerHTML = '';
  if (segs.length === 0) {
    container.innerHTML = '<div class="empty">No speech detected.</div>';
    return;
  }
  segs.forEach(seg => {
    const row = document.createElement('div');
    row.className = 'segment';
    row.innerHTML = `<span class="timestamp">${seg.start.toFixed(2)}</span><span class="segment-text">${esc(seg.text)}</span>`;
    row.addEventListener('click', () => {
      audioPlayer.currentTime = seg.start;
      audioPlayer.play();
    });
    container.appendChild(row);
  });
}
| |
// Escape the characters that are significant in HTML so `s` can be placed in
// innerHTML as text. "&" is replaced first so the entities produced by the
// later replacements are not escaped a second time.
function esc(s) { return String(s).replace(/&/g,'&amp;').replace(/</g,'&lt;').replace(/>/g,'&gt;').replace(/"/g,'&quot;').replace(/'/g,'&#39;'); }
| </script> |
| </body> |
| </html> |
| """ |
|
|
app = FastAPI()


# NOTE(review): recent FastAPI releases mark `on_event` as deprecated in favour
# of lifespan handlers — confirm against the pinned version before migrating.
@app.on_event("startup")
def startup():
    """Load the model during application startup."""
    load_model()
|
|
|
|
@app.get("/", response_class=HTMLResponse)
def index():
    """Serve the single-page UI held in ``INDEX_HTML``."""
    return INDEX_HTML
|
|
|
|
@app.get("/info")
def info():
    """Report the model repo and compute device; the page shows these in its header."""
    return {
        "model_repo": MODEL_REPO,
        "device": DEVICE,
        "gpu_name": _GPU_NAME,
    }
|
|
|
|
# Sample name (as used in the URL) -> file name under ./static.
_SAMPLES = {"podcast": "sample_podcast.wav"}


@app.get("/sample/{name}")
def sample(name: str):
    """Return a bundled sample clip as ``audio/wav``.

    Only names listed in ``_SAMPLES`` are served; the path is built from the
    mapped file name, never from the request value.

    Raises:
        HTTPException: 404 when the name is unknown or the file is missing.
    """
    filename = _SAMPLES.get(name)
    if not filename:
        raise HTTPException(404, f"Unknown sample: {name}")
    sample_path = Path(__file__).parent / "static" / filename
    if sample_path.exists():
        return FileResponse(str(sample_path), media_type="audio/wav")
    raise HTTPException(404, f"Sample file not found: {filename}")
|
|
|
|
# Largest upload accepted by /transcribe, in bytes.
_MAX_UPLOAD_BYTES = 50 * 1024 * 1024
# Inference now runs on worker threads; this keeps it one request at a time,
# as it was when it ran directly on the event loop.
_INFERENCE_LOCK = threading.Lock()


def _transcribe_locked(audio_bytes: bytes) -> dict:
    """Run ``transcribe_bytes`` while holding the inference lock."""
    with _INFERENCE_LOCK:
        return transcribe_bytes(audio_bytes)


@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return both speakers' segments.

    Raises:
        HTTPException: 400 when the upload exceeds the size cap; 500 when
            decoding or inference fails.
    """
    # Read at most one byte past the cap: enough to detect an oversized
    # upload without pulling all of it into memory first.
    audio_bytes = await file.read(_MAX_UPLOAD_BYTES + 1)
    if len(audio_bytes) > _MAX_UPLOAD_BYTES:
        raise HTTPException(400, "File too large (50MB max).")
    try:
        # Inference is slow, blocking work; running it in a worker thread
        # keeps the event loop free to answer other requests meanwhile.
        result = await asyncio.to_thread(_transcribe_locked, audio_bytes)
        return JSONResponse(result)
    except Exception as e:
        raise HTTPException(500, f"Inference failed: {e}") from e
|
|
|
|
if __name__ == "__main__":
    # Listen on all interfaces; the port comes from $PORT, defaulting to 7860.
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 7860)))
|
|