initial Chorus demo
Browse files- .gitattributes +2 -0
- Dockerfile +23 -0
- README.md +28 -5
- app.py +490 -0
- requirements.txt +9 -0
- static/sample.wav +3 -0
- static/sample_ami.wav +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
static/sample.wav filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
static/sample_ami.wav filter=lfs diff=lfs merge=lfs -text
|
Dockerfile
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Image for the Chorus demo Space: slim CPython 3.11 plus the audio tooling app.py needs.
FROM python:3.11-slim

# ffmpeg is invoked by app.py as a decode fallback; libsndfile1 is installed for the soundfile package.
RUN apt-get update && apt-get install -y --no-install-recommends \
        ffmpeg libsndfile1 \
    && rm -rf /var/lib/apt/lists/*

# HF Spaces sets HOME=/home/user for non-root user. Use that for caches.
RUN useradd -m -u 1000 user
USER user
# PYTHONUNBUFFERED=1: app.py reports progress with print(); without it stdout is
# block-buffered when not attached to a TTY and the messages show up late in the logs.
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH \
    HF_HOME=/home/user/.cache/huggingface \
    PORT=7860 \
    PYTHONUNBUFFERED=1

WORKDIR /home/user/app
# Install dependencies before copying the app so code-only changes reuse the cached pip layer.
COPY --chown=user:user requirements.txt .
RUN pip install --user --no-cache-dir -r requirements.txt

COPY --chown=user:user app.py .
COPY --chown=user:user static/ ./static/

EXPOSE 7860
CMD ["python", "app.py"]
|
README.md
CHANGED
|
@@ -1,10 +1,33 @@
|
|
| 1 |
---
|
| 2 |
-
title: Chorus
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
sdk: docker
|
|
|
|
| 7 |
pinned: false
|
| 8 |
---
|
| 9 |
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Trelis Chorus
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: yellow
|
| 6 |
sdk: docker
|
| 7 |
+
app_port: 7860
|
| 8 |
pinned: false
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Trelis Chorus — Multi-Speaker Whisper
|
| 12 |
+
|
| 13 |
+
Upload audio of two people talking (possibly overlapping) and get separate transcripts for each speaker with timestamps.
|
| 14 |
+
|
| 15 |
+
Running on CPU — expect ~30-60s per 30s of audio.
|
| 16 |
+
|
| 17 |
+
## How it works
|
| 18 |
+
|
| 19 |
+
Chorus is a LoRA fine-tune of [whisper-large-v3-turbo](https://huggingface.co/openai/whisper-large-v3-turbo) that adds two speaker-conditioned tokens (`<|speaker1|>`, `<|speaker2|>`). At inference time the decoder prefix includes the speaker token, which biases cross-attention toward that speaker's audio regions. Two forward passes (one per speaker) produce a transcript per speaker.
|
| 20 |
+
|
| 21 |
+
Trained on a mix of:
|
| 22 |
+
- **VoxPopuli** (parliamentary speech, synthetically mixed pairs)
|
| 23 |
+
- **AMI Meeting Corpus** (real conversational meeting speech)
|
| 24 |
+
|
| 25 |
+
See the [Trelis Studio repo](https://github.com/TrelisResearch/studio) (private) for the full training pipeline.
|
| 26 |
+
|
| 27 |
+
## Model
|
| 28 |
+
|
| 29 |
+
- **Chorus Turbo**: `Trelis/chorus-merged-test` — merged standalone Whisper model (base + LoRA merged + expanded tokenizer)
|
| 30 |
+
|
| 31 |
+
## Environment
|
| 32 |
+
|
| 33 |
+
The Space requires `HF_TOKEN` (Space secret) to pull the private model weights.
|
app.py
ADDED
|
@@ -0,0 +1,490 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Trelis Chorus — HF Space demo (CPU inference).
|
| 3 |
+
|
| 4 |
+
Loads the merged Chorus model (base Whisper Turbo + LoRA merged +
|
| 5 |
+
expanded tokenizer) once and serves a FastAPI + vanilla-JS UI that
|
| 6 |
+
accepts uploaded or recorded audio and returns S1/S2 transcripts.
|
| 7 |
+
|
| 8 |
+
CPU inference takes ~30-60s per 30s clip on the free HF Space tier.
|
| 9 |
+
GPU tier would make this near-instant.
|
| 10 |
+
"""
|
| 11 |
+
import os, io, re, time
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
import numpy as np
|
| 15 |
+
import soundfile as sf
|
| 16 |
+
import torch
|
| 17 |
+
from fastapi import FastAPI, UploadFile, File, HTTPException
|
| 18 |
+
from fastapi.responses import HTMLResponse, JSONResponse, FileResponse
|
| 19 |
+
import uvicorn
|
| 20 |
+
|
| 21 |
+
# Hub repo of the merged model (base Whisper Turbo + LoRA merged in + expanded
# tokenizer). Can be overridden through the CHORUS_MODEL_REPO environment variable.
MODEL_REPO: str = os.environ.get("CHORUS_MODEL_REPO", "Trelis/chorus-merged-test")
SPEAKER1_TOKEN: str = "<|speaker1|>"
SPEAKER2_TOKEN: str = "<|speaker2|>"
SR: int = 16_000  # sample rate (Hz) audio is resampled to before feature extraction

DEVICE: str = "cpu"
DTYPE: torch.dtype = torch.float32  # fp32 on CPU for stability
print(f"[chorus-space] Device: {DEVICE} ({DTYPE}), model: {MODEL_REPO}")

# Lazily-initialised state; all of these are filled in by load_model().
_model = None
_processor = None
_tok_ids: dict = {}
_TS_START_ID: int = -1  # id of the "<|0.00|>" timestamp token
_TS_END_ID: int = -1  # id of the "<|30.00|>" timestamp token
_TS_STEP = 0.02  # seconds represented by one step between consecutive timestamp ids
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
def load_model():
    """Load the Chorus processor and model once and cache them in module globals.

    Subsequent calls are no-ops once ``_model`` is set. The special-token ids
    used by ``_infer`` / ``_parse_segments`` are resolved and validated *before*
    the model weights are loaded, and module state is only updated after
    everything succeeded, so a failed load never leaves half-initialised globals.

    Reads ``HF_TOKEN`` (or ``HUGGINGFACE_TOKEN``) from the environment and passes
    it to ``from_pretrained``.

    Raises:
        RuntimeError: if the tokenizer of ``MODEL_REPO`` does not define one of
            the required special tokens (speaker, language, task or timestamp
            tokens). Without this check a missing token would silently resolve
            to the unknown-token id and every transcript would come back wrong.
    """
    global _model, _processor, _tok_ids, _TS_START_ID, _TS_END_ID
    if _model is not None:
        return
    from transformers import WhisperForConditionalGeneration, WhisperProcessor

    hf_token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGINGFACE_TOKEN")
    print(f"[chorus-space] Loading {MODEL_REPO}...")
    t = time.time()
    proc = WhisperProcessor.from_pretrained(MODEL_REPO, token=hf_token)

    tokenizer = proc.tokenizer
    unk_id = getattr(tokenizer, "unk_token_id", None)

    def _require_id(token: str) -> int:
        # Unknown tokens come back as None or as the unk id depending on the tokenizer.
        token_id = tokenizer.convert_tokens_to_ids(token)
        if token_id is None or (unk_id is not None and token_id == unk_id):
            raise RuntimeError(
                f"Tokenizer of {MODEL_REPO} has no {token!r} token; "
                "is CHORUS_MODEL_REPO pointing at a Chorus model?"
            )
        return token_id

    tok_ids = {
        "spk1": _require_id(SPEAKER1_TOKEN),
        "spk2": _require_id(SPEAKER2_TOKEN),
        "en": _require_id("<|en|>"),
        "transcribe": _require_id("<|transcribe|>"),
    }
    ts_start_id = _require_id("<|0.00|>")
    ts_end_id = _require_id("<|30.00|>")

    # NOTE(review): `dtype=` looks like the newer spelling of this keyword; older
    # transformers 4.x releases may only accept `torch_dtype=` — confirm against the
    # version range pinned in requirements.txt.
    m = WhisperForConditionalGeneration.from_pretrained(MODEL_REPO, token=hf_token, dtype=DTYPE)
    m = m.to(DEVICE).eval()
    m.generation_config.predict_timestamps = True
    # 1500 steps * _TS_STEP (0.02 s) = 30 s; presumably this lets the first
    # timestamp fall anywhere in the clip — confirm.
    m.generation_config.max_initial_timestamp_index = 1500

    # Publish state only now that every step succeeded.
    _tok_ids.update(tok_ids)
    _TS_START_ID = ts_start_id
    _TS_END_ID = ts_end_id
    _processor = proc
    _model = m
    print(f"[chorus-space] Model ready in {time.time()-t:.1f}s (ts range: {_TS_START_ID}..{_TS_END_ID})")
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
def _infer(arr: np.ndarray, spk_id: int) -> list[dict]:
    """Run one generation pass over ``arr`` for a single speaker token.

    Args:
        arr: audio samples handed to the feature extractor at sample rate ``SR``.
        spk_id: token id forced at decoder position 3, after the ``<|en|>`` and
            ``<|transcribe|>`` ids at positions 1 and 2.

    Returns:
        The segments parsed from the first generated sequence by ``_parse_segments``.
    """
    extracted = _processor.feature_extractor(
        [arr],
        sampling_rate=SR,
        return_tensors="pt",
    )
    input_features = extracted.input_features.to(DEVICE).to(DTYPE)

    # (position, token id) pairs forced into the decoder prefix.
    decoder_prefix = [
        [1, _tok_ids["en"]],
        [2, _tok_ids["transcribe"]],
        [3, spk_id],
    ]

    with torch.no_grad():
        generated = _model.generate(
            input_features,
            forced_decoder_ids=decoder_prefix,
            return_timestamps=True,
            max_new_tokens=444,
        )

    token_ids = generated[0].tolist()
    return _parse_segments(token_ids)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def _parse_segments(ids: list[int]) -> list[dict]:
    """Split a generated token sequence into timestamped text segments.

    Token ids in ``[_TS_START_ID, _TS_END_ID]`` are timestamp tokens; each id
    step corresponds to ``_TS_STEP`` seconds. A timestamp seen while no segment
    is open starts one, and the next timestamp closes it. Tokens that appear
    before the first timestamp (the forced prefix) are ignored, and segments
    whose decoded text is empty are skipped.

    Args:
        ids: generated token ids.

    Returns:
        A list of ``{"start": float, "end": float | None, "text": str}`` dicts in
        generation order. ``end`` is ``None`` only for a trailing segment that was
        opened but never closed (for example when generation stopped at the token
        limit); previously such text was silently discarded.
    """
    segments: list[dict] = []
    cur_start = None
    cur_text_ids: list[int] = []

    def _emit(start: float, end, text_ids: list[int]) -> None:
        # Decode the collected text tokens and keep the segment only if non-empty.
        text = _processor.tokenizer.decode(text_ids, skip_special_tokens=True).strip()
        if text:
            segments.append({"start": round(start, 2), "end": end, "text": text})

    for t in ids:
        if _TS_START_ID <= t <= _TS_END_ID:
            ts = (t - _TS_START_ID) * _TS_STEP
            if cur_start is None:
                cur_start = ts
            else:
                _emit(cur_start, round(ts, 2), cur_text_ids)
                cur_start = None
                cur_text_ids = []
        elif cur_start is not None:
            cur_text_ids.append(t)

    # Generation ended inside an open segment: keep the text rather than lose it.
    if cur_start is not None and cur_text_ids:
        _emit(cur_start, None, cur_text_ids)
    return segments
|
| 96 |
+
|
| 97 |
+
|
| 98 |
+
def _decode_audio(audio_bytes: bytes) -> tuple[np.ndarray, int]:
    """Decode an uploaded audio payload into samples and a sample rate.

    First tries ``soundfile`` directly on the bytes. If that fails for any
    reason, the bytes are written to a temporary file and converted by the
    ``ffmpeg`` executable to mono WAV at ``SR`` Hz, which is then read with
    ``soundfile``.

    Args:
        audio_bytes: raw contents of the uploaded file.

    Returns:
        The ``(data, samplerate)`` pair returned by ``soundfile.read``.

    Raises:
        RuntimeError: if ffmpeg exits with a non-zero status; the message carries
            the tail of ffmpeg's stderr so the caller can report why decoding failed.
        FileNotFoundError: if the ``ffmpeg`` executable is not installed.
    """
    try:
        return sf.read(io.BytesIO(audio_bytes))
    except Exception:
        # Deliberate best-effort: anything soundfile cannot parse is handed to ffmpeg.
        import subprocess, tempfile

        with tempfile.NamedTemporaryFile(suffix=".bin") as fin:
            fin.write(audio_bytes)
            fin.flush()
            try:
                result = subprocess.run(
                    [
                        "ffmpeg",
                        "-nostdin",  # never let ffmpeg read from the server's stdin
                        "-hide_banner", "-loglevel", "error",  # keep stderr to the actual error
                        "-i", fin.name,
                        "-f", "wav", "-ac", "1", "-ar", str(SR), "-",
                    ],
                    capture_output=True, check=True,
                )
            except subprocess.CalledProcessError as ff_err:
                stderr_text = (ff_err.stderr or b"").decode("utf-8", errors="replace").strip()
                detail = stderr_text[-300:] or f"exit status {ff_err.returncode}"
                raise RuntimeError(f"could not decode audio (ffmpeg: {detail})") from ff_err
        return sf.read(io.BytesIO(result.stdout))
|
| 111 |
+
|
| 112 |
+
|
| 113 |
+
def transcribe_bytes(audio_bytes: bytes) -> dict:
    """Decode an audio payload and transcribe it once per speaker token.

    The audio is limited to its first 30 seconds, down-mixed to mono, resampled
    to ``SR`` when needed, and then passed to ``_infer`` twice (speaker 1 and
    speaker 2). Trimming happens at the original sample rate *before* the
    down-mix and resample so that long uploads do not pay for converting audio
    that is thrown away afterwards.

    Args:
        audio_bytes: raw contents of the uploaded file.

    Returns:
        A dict with ``duration_s`` (length of the audio actually transcribed),
        ``elapsed_s`` (wall-clock time spent in this call) and ``speaker1`` /
        ``speaker2``, each ``{"segments": [...]}``.

    Raises:
        ValueError: if the decoded audio contains no samples.
    """
    t0 = time.time()
    max_seconds = 30

    raw, orig_sr = _decode_audio(audio_bytes)
    if len(raw) == 0:
        raise ValueError("Audio contains no samples.")

    # Cut to the first 30 s at the source rate first (axis 0 is the frame axis,
    # matching the channel-averaging over axis 1 below).
    arr = np.asarray(raw[: max_seconds * orig_sr], dtype=np.float32)
    if arr.ndim > 1:
        arr = arr.mean(axis=1)
    if orig_sr != SR:
        import librosa
        arr = librosa.resample(arr, orig_sr=orig_sr, target_sr=SR)

    # Final guard so the model never sees more than 30 s, whatever the resampler returned.
    max_samples = max_seconds * SR
    if len(arr) > max_samples:
        arr = arr[:max_samples]

    s1 = _infer(arr, _tok_ids["spk1"])
    s2 = _infer(arr, _tok_ids["spk2"])
    return {
        "duration_s": float(len(arr) / SR),
        "elapsed_s": time.time() - t0,
        "speaker1": {"segments": s1},
        "speaker2": {"segments": s2},
    }
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
INDEX_HTML = r"""<!DOCTYPE html>
|
| 136 |
+
<html lang="en">
|
| 137 |
+
<head>
|
| 138 |
+
<meta charset="utf-8">
|
| 139 |
+
<meta name="viewport" content="width=device-width, initial-scale=1">
|
| 140 |
+
<title>Trelis Chorus</title>
|
| 141 |
+
<link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet">
|
| 142 |
+
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;600;700;800&display=swap" rel="stylesheet">
|
| 143 |
+
<style>
|
| 144 |
+
:root {
|
| 145 |
+
--trelis-blue: #0d579b; --trelis-blue-50: #e8f2fc;
|
| 146 |
+
--trelis-green: #329239; --trelis-green-50: #e8f5e9;
|
| 147 |
+
--trelis-orange: #f7931a; --trelis-orange-50: #fff4e5;
|
| 148 |
+
--text-primary: #1a1a2e; --text-secondary: #4a5568; --text-muted: #718096;
|
| 149 |
+
--bg-primary: #ffffff; --bg-secondary: #fafbfc; --bg-accent: #f0f4f8;
|
| 150 |
+
--shadow-sm: 0 2px 4px rgba(0,0,0,.06); --shadow-md: 0 4px 12px rgba(0,0,0,.08);
|
| 151 |
+
--radius-sm: 8px; --radius-md: 16px; --radius-full: 9999px;
|
| 152 |
+
}
|
| 153 |
+
body { font-family:'Inter',-apple-system,BlinkMacSystemFont,sans-serif; color:var(--text-primary); background:var(--bg-primary); min-height:100vh; }
|
| 154 |
+
.navbar { background:var(--bg-primary); border-bottom:1px solid rgba(0,0,0,.06); padding:1rem 1.5rem; position:relative; }
|
| 155 |
+
.navbar::after { content:''; position:absolute; bottom:0; left:0; right:0; height:3px; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); }
|
| 156 |
+
.navbar-brand { font-weight:800; font-size:1.4rem; color:var(--text-primary)!important; display:flex; align-items:center; gap:.75rem; }
|
| 157 |
+
.brand-dot { width:14px; height:14px; border-radius:50%; background:linear-gradient(135deg,var(--trelis-blue),var(--trelis-green),var(--trelis-orange)); box-shadow:0 0 0 3px rgba(13,87,155,.08); }
|
| 158 |
+
.model-chip { font-family:'SF Mono',Monaco,monospace; font-size:.72rem; color:var(--text-muted); padding:.25rem .6rem; background:var(--bg-accent); border-radius:var(--radius-full); }
|
| 159 |
+
.hero { background:linear-gradient(180deg,var(--bg-secondary) 0%,var(--bg-primary) 100%); padding:3rem 0 2rem; }
|
| 160 |
+
.hero h1 { font-weight:800; font-size:2.75rem; margin-bottom:.75rem; background:linear-gradient(90deg,var(--trelis-blue) 0%,var(--trelis-green) 50%,var(--trelis-orange) 100%); -webkit-background-clip:text; -webkit-text-fill-color:transparent; background-clip:text; }
|
| 161 |
+
.hero p { color:var(--text-secondary); font-size:1.1rem; max-width:640px; margin-bottom:0; }
|
| 162 |
+
.card { background:var(--bg-primary); border:1px solid rgba(0,0,0,.06); border-radius:var(--radius-md); box-shadow:var(--shadow-sm); transition:.3s cubic-bezier(.4,0,.2,1); }
|
| 163 |
+
.card:hover { box-shadow:var(--shadow-md); }
|
| 164 |
+
.card-body { padding:1.5rem; }
|
| 165 |
+
.btn-primary { background:var(--trelis-blue); border:none; border-radius:var(--radius-full); padding:.65rem 1.75rem; font-weight:700; color:#fff; box-shadow:var(--shadow-sm); transition:.2s; }
|
| 166 |
+
.btn-primary:hover:not(:disabled) { background:#0c4a85; box-shadow:var(--shadow-md); transform:translateY(-1px); }
|
| 167 |
+
.btn-primary:disabled { opacity:.6; }
|
| 168 |
+
.btn-outline-secondary { border-radius:var(--radius-full); font-weight:600; padding:.6rem 1.5rem; border-color:#dee2e6; color:var(--text-secondary); }
|
| 169 |
+
.btn-outline-secondary:hover { background:var(--bg-accent); border-color:var(--trelis-blue); color:var(--trelis-blue); }
|
| 170 |
+
.upload-zone { border:2px dashed #dee2e6; border-radius:var(--radius-md); padding:2rem; text-align:center; transition:.2s; cursor:pointer; background:var(--bg-secondary); }
|
| 171 |
+
.upload-zone:hover { border-color:var(--trelis-blue); background:var(--trelis-blue-50); }
|
| 172 |
+
.upload-zone.has-file { border-color:var(--trelis-green); background:var(--trelis-green-50); }
|
| 173 |
+
.upload-zone input[type=file] { display:none; }
|
| 174 |
+
.upload-icon { font-size:2rem; color:var(--text-muted); margin-bottom:.5rem; }
|
| 175 |
+
.upload-zone.has-file .upload-icon { color:var(--trelis-green); }
|
| 176 |
+
audio { width:100%; margin-top:1rem; border-radius:var(--radius-full); }
|
| 177 |
+
audio::-webkit-media-controls-panel { background:var(--bg-accent); }
|
| 178 |
+
.speaker-card { padding:1.25rem 1.5rem; border-radius:var(--radius-md); background:var(--bg-primary); box-shadow:var(--shadow-sm); border:1px solid rgba(0,0,0,.06); height:100%; position:relative; overflow:hidden; }
|
| 179 |
+
.speaker-card::before { content:''; position:absolute; top:0; left:0; bottom:0; width:4px; }
|
| 180 |
+
.speaker-card.s1::before { background:linear-gradient(180deg,var(--trelis-blue),#1e70b8); }
|
| 181 |
+
.speaker-card.s2::before { background:linear-gradient(180deg,var(--trelis-orange),#ff9f2e); }
|
| 182 |
+
.speaker-label { display:inline-flex; align-items:center; gap:.5rem; font-size:.75rem; font-weight:700; text-transform:uppercase; letter-spacing:.05em; padding:.3rem .7rem; border-radius:var(--radius-full); margin-bottom:.75rem; }
|
| 183 |
+
.s1 .speaker-label { background:var(--trelis-blue-50); color:var(--trelis-blue); }
|
| 184 |
+
.s2 .speaker-label { background:var(--trelis-orange-50); color:var(--trelis-orange); }
|
| 185 |
+
.segment { padding:.5rem .75rem; margin:.25rem 0; border-radius:var(--radius-sm); cursor:pointer; transition:.15s; display:flex; align-items:baseline; gap:.75rem; line-height:1.5; }
|
| 186 |
+
.segment:hover { background:var(--bg-accent); }
|
| 187 |
+
.s1 .segment:hover { background:var(--trelis-blue-50); }
|
| 188 |
+
.s2 .segment:hover { background:var(--trelis-orange-50); }
|
| 189 |
+
.timestamp { font-family:'SF Mono',Monaco,monospace; font-size:.75rem; color:var(--text-muted); flex-shrink:0; min-width:3rem; padding:.1rem .4rem; background:var(--bg-accent); border-radius:4px; }
|
| 190 |
+
.segment-text { color:var(--text-primary); }
|
| 191 |
+
.mic-select { width:auto; max-width:240px; border-radius:var(--radius-full); padding:.4rem 1rem; font-size:.85rem; border-color:#dee2e6; color:var(--text-secondary); }
|
| 192 |
+
.mic-select:focus { border-color:var(--trelis-blue); box-shadow:0 0 0 .2rem rgba(13,87,155,.15); }
|
| 193 |
+
#recordBtn { display:inline-flex; align-items:center; gap:.5rem; }
|
| 194 |
+
.record-dot { width:10px; height:10px; border-radius:50%; background:#c0c0c0; transition:.2s; flex-shrink:0; }
|
| 195 |
+
#recordBtn.recording .record-dot { background:#dc3545; animation: pulse 1.2s ease-in-out infinite; }
|
| 196 |
+
#recordBtn.recording { color:#dc3545; border-color:#dc3545; }
|
| 197 |
+
@keyframes pulse { 0%,100% { box-shadow:0 0 0 0 rgba(220,53,69,.5); } 50% { box-shadow:0 0 0 6px rgba(220,53,69,0); } }
|
| 198 |
+
#status { font-size:.9rem; color:var(--text-secondary); }
|
| 199 |
+
.spinner-border-sm { width:.9rem; height:.9rem; border-width:.15em; color:var(--trelis-blue); }
|
| 200 |
+
.empty { color:var(--text-muted); font-style:italic; }
|
| 201 |
+
.cpu-note { background:var(--trelis-orange-50); border:1px solid var(--trelis-orange); color:var(--trelis-brown,#92400e); border-radius:var(--radius-sm); padding:.75rem 1rem; font-size:.9rem; margin-bottom:1rem; }
|
| 202 |
+
</style>
|
| 203 |
+
</head>
|
| 204 |
+
<body>
|
| 205 |
+
|
| 206 |
+
<nav class="navbar">
|
| 207 |
+
<div class="container d-flex justify-content-between align-items-center">
|
| 208 |
+
<a class="navbar-brand" href="#"><span class="brand-dot"></span>Trelis Chorus</a>
|
| 209 |
+
<span class="model-chip">model: <span id="modelRepo">...</span> · <span id="device">...</span></span>
|
| 210 |
+
</div>
|
| 211 |
+
</nav>
|
| 212 |
+
|
| 213 |
+
<section class="hero">
|
| 214 |
+
<div class="container">
|
| 215 |
+
<h1>Separate two voices<br>from a single stream.</h1>
|
| 216 |
+
<p>Multi-speaker Whisper fine-tune by Trelis. Upload audio of two people talking — possibly overlapping — and Trelis Chorus returns a transcript for each speaker with timestamps.</p>
|
| 217 |
+
</div>
|
| 218 |
+
</section>
|
| 219 |
+
|
| 220 |
+
<div class="container pb-5">
|
| 221 |
+
<div class="cpu-note">
|
| 222 |
+
<strong>Running on CPU</strong> — transcription takes ~30-60s per 30s of audio. First request also downloads the model (~3GB, one-off).
|
| 223 |
+
</div>
|
| 224 |
+
|
| 225 |
+
<div class="card mb-4">
|
| 226 |
+
<div class="card-body">
|
| 227 |
+
<label for="audioFile" class="upload-zone" id="uploadZone">
|
| 228 |
+
<div class="upload-icon">↑</div>
|
| 229 |
+
<div id="uploadLabel"><strong>Click to upload</strong> or drop an audio file here</div>
|
| 230 |
+
<div class="text-muted small mt-1">WAV, MP3, M4A, FLAC — up to 30s</div>
|
| 231 |
+
<input type="file" id="audioFile" accept="audio/*">
|
| 232 |
+
</label>
|
| 233 |
+
|
| 234 |
+
<div class="d-flex flex-wrap gap-2 mt-3 align-items-center">
|
| 235 |
+
<button id="transcribeBtn" class="btn btn-primary" disabled>Transcribe</button>
|
| 236 |
+
<button id="recordBtn" class="btn btn-outline-secondary">
|
| 237 |
+
<span class="record-dot"></span>
|
| 238 |
+
<span id="recordLabel">Record (two speakers)</span>
|
| 239 |
+
</button>
|
| 240 |
+
<select id="micSelect" class="form-select form-select-sm mic-select" title="Recording device">
|
| 241 |
+
<option value="">Default microphone</option>
|
| 242 |
+
</select>
|
| 243 |
+
<button class="btn btn-outline-secondary sample-btn" data-sample="ami" data-label="AMI meeting — 78% overlap, multi-turn">Meeting sample</button>
|
| 244 |
+
<button class="btn btn-outline-secondary sample-btn" data-sample="librispeech" data-label="LibriSpeechMix — 50% overlap, 2 speakers">Read speech sample</button>
|
| 245 |
+
<span id="status" class="ms-2"></span>
|
| 246 |
+
</div>
|
| 247 |
+
|
| 248 |
+
<audio id="audioPlayer" controls style="display:none;"></audio>
|
| 249 |
+
</div>
|
| 250 |
+
</div>
|
| 251 |
+
|
| 252 |
+
<div id="results" style="display:none;">
|
| 253 |
+
<div class="row g-3">
|
| 254 |
+
<div class="col-md-6">
|
| 255 |
+
<div class="speaker-card s1">
|
| 256 |
+
<span class="speaker-label">Speaker 1</span>
|
| 257 |
+
<div id="s1Segments"></div>
|
| 258 |
+
</div>
|
| 259 |
+
</div>
|
| 260 |
+
<div class="col-md-6">
|
| 261 |
+
<div class="speaker-card s2">
|
| 262 |
+
<span class="speaker-label">Speaker 2</span>
|
| 263 |
+
<div id="s2Segments"></div>
|
| 264 |
+
</div>
|
| 265 |
+
</div>
|
| 266 |
+
</div>
|
| 267 |
+
</div>
|
| 268 |
+
</div>
|
| 269 |
+
|
| 270 |
+
<script>
|
| 271 |
+
const fileInput = document.getElementById('audioFile');
|
| 272 |
+
const uploadZone = document.getElementById('uploadZone');
|
| 273 |
+
const uploadLabel = document.getElementById('uploadLabel');
|
| 274 |
+
const audioPlayer = document.getElementById('audioPlayer');
|
| 275 |
+
const transcribeBtn = document.getElementById('transcribeBtn');
|
| 276 |
+
const statusEl = document.getElementById('status');
|
| 277 |
+
const results = document.getElementById('results');
|
| 278 |
+
let audioBlob = null;
|
| 279 |
+
|
| 280 |
+
fetch('/info').then(r => r.json()).then(d => {
|
| 281 |
+
document.getElementById('modelRepo').textContent = d.model_repo;
|
| 282 |
+
document.getElementById('device').textContent = d.device;
|
| 283 |
+
});
|
| 284 |
+
|
| 285 |
+
// Make `blob` the current audio: load it into the player, enable the Transcribe
// button, show `label` in the upload zone and hide any previous results.
function setAudio(blob, label) {
  audioBlob = blob;
  // Release the object URL of the previously loaded audio so blobs do not pile up.
  if (audioPlayer.src && audioPlayer.src.startsWith('blob:')) URL.revokeObjectURL(audioPlayer.src);
  audioPlayer.src = URL.createObjectURL(blob);
  audioPlayer.style.display = 'block';
  transcribeBtn.disabled = false;
  uploadZone.classList.add('has-file');
  // `label` can be a user-supplied file name: insert it as text, never as HTML.
  const strong = document.createElement('strong');
  strong.textContent = label;
  uploadLabel.replaceChildren(strong, ' ready');
  results.style.display = 'none';
  statusEl.textContent = '';
}
|
| 295 |
+
|
| 296 |
+
fileInput.addEventListener('change', e => {
|
| 297 |
+
const f = e.target.files[0];
|
| 298 |
+
if (!f) return;
|
| 299 |
+
setAudio(f, f.name);
|
| 300 |
+
});
|
| 301 |
+
|
| 302 |
+
// ---- Browser recording ----
|
| 303 |
+
let mediaRec = null, recChunks = [], recTimer = null, recStart = 0;
|
| 304 |
+
const recordBtn = document.getElementById('recordBtn');
|
| 305 |
+
const recordLabel = document.getElementById('recordLabel');
|
| 306 |
+
const micSelect = document.getElementById('micSelect');
|
| 307 |
+
const MAX_REC_SEC = 30;
|
| 308 |
+
|
| 309 |
+
async function populateMics() {
|
| 310 |
+
try {
|
| 311 |
+
const devices = await navigator.mediaDevices.enumerateDevices();
|
| 312 |
+
const mics = devices.filter(d => d.kind === 'audioinput');
|
| 313 |
+
const currentValue = micSelect.value;
|
| 314 |
+
micSelect.innerHTML = '<option value="">Default microphone</option>';
|
| 315 |
+
for (const d of mics) {
|
| 316 |
+
const opt = document.createElement('option');
|
| 317 |
+
opt.value = d.deviceId;
|
| 318 |
+
opt.textContent = d.label || `Microphone ${mics.indexOf(d) + 1}`;
|
| 319 |
+
micSelect.appendChild(opt);
|
| 320 |
+
}
|
| 321 |
+
if (currentValue) micSelect.value = currentValue;
|
| 322 |
+
} catch (err) { /* ignore */ }
|
| 323 |
+
}
|
| 324 |
+
|
| 325 |
+
let micsUnlocked = false;
|
| 326 |
+
async function unlockMics() {
|
| 327 |
+
if (micsUnlocked) return;
|
| 328 |
+
try {
|
| 329 |
+
const s = await navigator.mediaDevices.getUserMedia({ audio: true });
|
| 330 |
+
s.getTracks().forEach(t => t.stop());
|
| 331 |
+
micsUnlocked = true;
|
| 332 |
+
await populateMics();
|
| 333 |
+
} catch (err) { /* user denied — leave fallback list */ }
|
| 334 |
+
}
|
| 335 |
+
micSelect.addEventListener('mousedown', unlockMics);
|
| 336 |
+
micSelect.addEventListener('focus', unlockMics);
|
| 337 |
+
|
| 338 |
+
populateMics();
|
| 339 |
+
if (navigator.mediaDevices && navigator.mediaDevices.addEventListener) {
|
| 340 |
+
navigator.mediaDevices.addEventListener('devicechange', populateMics);
|
| 341 |
+
}
|
| 342 |
+
|
| 343 |
+
recordBtn.addEventListener('click', async () => {
|
| 344 |
+
if (mediaRec && mediaRec.state === 'recording') { stopRecording(); return; }
|
| 345 |
+
try {
|
| 346 |
+
const audioConstraints = { channelCount: 1, sampleRate: 16000 };
|
| 347 |
+
if (micSelect.value) audioConstraints.deviceId = { exact: micSelect.value };
|
| 348 |
+
const stream = await navigator.mediaDevices.getUserMedia({ audio: audioConstraints });
|
| 349 |
+
micsUnlocked = true;
|
| 350 |
+
populateMics();
|
| 351 |
+
const mime = MediaRecorder.isTypeSupported('audio/webm;codecs=opus') ? 'audio/webm;codecs=opus' : 'audio/webm';
|
| 352 |
+
mediaRec = new MediaRecorder(stream, { mimeType: mime });
|
| 353 |
+
recChunks = [];
|
| 354 |
+
recStart = Date.now();
|
| 355 |
+
mediaRec.ondataavailable = e => { if (e.data.size > 0) recChunks.push(e.data); };
|
| 356 |
+
mediaRec.onstop = () => {
|
| 357 |
+
stream.getTracks().forEach(t => t.stop());
|
| 358 |
+
const blob = new Blob(recChunks, { type: mime });
|
| 359 |
+
setAudio(blob, `Recording (${((Date.now()-recStart)/1000).toFixed(1)}s)`);
|
| 360 |
+
recordBtn.classList.remove('recording');
|
| 361 |
+
recordLabel.textContent = 'Record (two speakers)';
|
| 362 |
+
if (recTimer) { clearInterval(recTimer); recTimer = null; }
|
| 363 |
+
};
|
| 364 |
+
mediaRec.start();
|
| 365 |
+
recordBtn.classList.add('recording');
|
| 366 |
+
recTimer = setInterval(() => {
|
| 367 |
+
const sec = (Date.now() - recStart) / 1000;
|
| 368 |
+
recordLabel.textContent = `Stop recording (${sec.toFixed(0)}s)`;
|
| 369 |
+
if (sec >= MAX_REC_SEC) stopRecording();
|
| 370 |
+
}, 200);
|
| 371 |
+
} catch (err) {
|
| 372 |
+
statusEl.innerHTML = `<span class="text-danger">Mic error: ${err.message}</span>`;
|
| 373 |
+
}
|
| 374 |
+
});
|
| 375 |
+
|
| 376 |
+
function stopRecording() { if (mediaRec && mediaRec.state === 'recording') mediaRec.stop(); }
|
| 377 |
+
|
| 378 |
+
// Sample buttons: fetch the bundled clip named by data-sample and load it as the current audio.
document.querySelectorAll('.sample-btn').forEach(btn => {
  btn.addEventListener('click', async () => {
    const which = btn.dataset.sample;
    const label = btn.dataset.label;
    btn.disabled = true;
    statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Loading sample...';
    try {
      const r = await fetch(`/sample/${encodeURIComponent(which)}`);
      // A 404/500 body is JSON, not audio: do not hand it to the player.
      if (!r.ok) throw new Error(`HTTP ${r.status}`);
      const blob = await r.blob();
      setAudio(blob, label);
    } catch (err) {
      // Replace the spinner with the failure reason instead of leaving it spinning.
      const msg = document.createElement('span');
      msg.className = 'text-danger';
      msg.textContent = `Could not load sample: ${err.message}`;
      statusEl.replaceChildren(msg);
    } finally {
      btn.disabled = false;
    }
  });
});
|
| 393 |
+
|
| 394 |
+
['dragover','dragenter'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = 'var(--trelis-blue)'; }));
|
| 395 |
+
['dragleave','drop'].forEach(ev => uploadZone.addEventListener(ev, e => { e.preventDefault(); uploadZone.style.borderColor = ''; }));
|
| 396 |
+
uploadZone.addEventListener('drop', e => {
|
| 397 |
+
const f = e.dataTransfer.files[0];
|
| 398 |
+
if (f) { fileInput.files = e.dataTransfer.files; setAudio(f, f.name); }
|
| 399 |
+
});
|
| 400 |
+
|
| 401 |
+
// Transcribe button: POST the current audio to /transcribe and render both speakers' segments.
transcribeBtn.addEventListener('click', async () => {
  if (!audioBlob) return;
  // Status messages are inserted as text so server-provided error details are never parsed as HTML.
  const showStatus = (cls, text) => {
    const span = document.createElement('span');
    span.className = cls;
    span.textContent = text;
    statusEl.replaceChildren(span);
  };
  transcribeBtn.disabled = true;
  statusEl.innerHTML = '<span class="spinner-border spinner-border-sm"></span> Transcribing... (CPU, slow)';
  results.style.display = 'none';
  const fd = new FormData();
  fd.append('file', audioBlob, 'audio.wav');
  try {
    const r = await fetch('/transcribe', { method:'POST', body:fd });
    if (!r.ok) throw new Error(`HTTP ${r.status}: ${await r.text()}`);
    const data = await r.json();
    render('s1Segments', data.speaker1.segments);
    render('s2Segments', data.speaker2.segments);
    results.style.display = 'block';
    showStatus('text-success', `Done in ${data.elapsed_s.toFixed(1)}s`);
  } catch (err) {
    showStatus('text-danger', `Error: ${err.message}`);
  } finally {
    transcribeBtn.disabled = false;
  }
});
|
| 422 |
+
|
| 423 |
+
// Fill the element with id `elId` with one clickable row per segment; clicking a
// row seeks the audio player to that segment's start time and plays.
function render(elId, segs) {
  const el = document.getElementById(elId);
  el.innerHTML = '';
  if (!segs.length) { el.innerHTML = '<div class="empty">No speech detected.</div>'; return; }
  for (const s of segs) {
    const d = document.createElement('div');
    d.className = 'segment';
    // Build the row from text nodes so transcript text can never be interpreted as markup.
    const stamp = document.createElement('span');
    stamp.className = 'timestamp';
    stamp.textContent = s.start.toFixed(2);
    const text = document.createElement('span');
    text.className = 'segment-text';
    text.textContent = String(s.text);
    d.append(stamp, text);
    d.addEventListener('click', () => { audioPlayer.currentTime = s.start; audioPlayer.play(); });
    el.appendChild(d);
  }
}
|
| 435 |
+
|
| 436 |
+
// Escape a value for safe insertion into HTML. `&` must be replaced first so the
// entities produced by the later replacements are not escaped a second time.
function esc(s) {
  return String(s)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;')
    .replace(/"/g, '&quot;')
    .replace(/'/g, '&#39;');
}
|
| 437 |
+
</script>
|
| 438 |
+
</body>
|
| 439 |
+
</html>
|
| 440 |
+
"""
|
| 441 |
+
|
| 442 |
+
app = FastAPI()


@app.on_event("startup")
def startup() -> None:
    """Load the model when the server starts, before any request is served."""
    load_model()
|
| 448 |
+
|
| 449 |
+
|
| 450 |
+
@app.get("/", response_class=HTMLResponse)
def index() -> str:
    """Serve the single-page UI held in ``INDEX_HTML``."""
    page = INDEX_HTML
    return page
|
| 453 |
+
|
| 454 |
+
|
| 455 |
+
@app.get("/info")
def info() -> dict:
    """Report the model repo and device; the page shows both in its header chip."""
    return dict(model_repo=MODEL_REPO, device=DEVICE)
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
# Public sample name -> file name inside the static/ directory next to this file.
_SAMPLES = {"librispeech": "sample.wav", "ami": "sample_ami.wav"}


@app.get("/sample/{name}")
def sample(name: str):
    """Return one of the bundled sample clips as ``audio/wav``.

    Only names listed in ``_SAMPLES`` are served. Responds with 404 when the
    name is unknown or the mapped file is missing on disk.
    """
    fname = _SAMPLES.get(name)
    if not fname:
        raise HTTPException(404, f"Unknown sample: {name}")

    static_dir = Path(__file__).parent / "static"
    wav_path = static_dir / fname
    if wav_path.exists():
        return FileResponse(str(wav_path), media_type="audio/wav")
    raise HTTPException(404, f"Sample file not found: {fname}")
|
| 475 |
+
|
| 476 |
+
|
| 477 |
+
@app.post("/transcribe")
async def transcribe(file: UploadFile = File(...)):
    """Transcribe an uploaded audio file and return per-speaker segments as JSON.

    Args:
        file: the multipart upload (form field ``file``).

    Returns:
        A ``JSONResponse`` wrapping the dict produced by ``transcribe_bytes``.

    Raises:
        HTTPException: 400 when the upload exceeds 50MB; 500 when decoding or
            inference fails (the original error text is included in the detail).
    """
    import asyncio

    max_bytes = 50 * 1024 * 1024
    # Read at most one byte past the limit: enough to detect an oversized upload
    # without pulling an arbitrarily large file into memory.
    audio_bytes = await file.read(max_bytes + 1)
    if len(audio_bytes) > max_bytes:
        raise HTTPException(400, "File too large (50MB max).")
    try:
        # Inference is synchronous and CPU-bound; run it in a worker thread so the
        # event loop keeps serving other routes meanwhile. Note that simultaneous
        # requests can now run inference concurrently instead of queueing.
        result = await asyncio.to_thread(transcribe_bytes, audio_bytes)
        return JSONResponse(result)
    except Exception as e:
        raise HTTPException(500, f"Inference failed: {e}") from e
|
| 486 |
+
|
| 487 |
+
|
| 488 |
+
if __name__ == "__main__":
    # Listen on $PORT when set, otherwise 7860 (the HF Spaces default port),
    # on all interfaces.
    uvicorn.run(
        app,
        host="0.0.0.0",
        port=int(os.environ.get("PORT", 7860)),
    )
|
requirements.txt
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch>=2.0.0
|
| 2 |
+
transformers>=4.48.0,<5.0.0
|
| 3 |
+
soundfile>=0.12.0
|
| 4 |
+
librosa>=0.10.0
|
| 5 |
+
numpy>=1.24.0
|
| 6 |
+
fastapi[standard]
|
| 7 |
+
python-multipart
|
| 8 |
+
uvicorn
|
| 9 |
+
huggingface_hub>=0.32.0,<2.0.0
|
static/sample.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b97d3b33c3e99e3a7eb69665b25752d21ddb602591dd59fa2d5ee9ff4ff1173d
|
| 3 |
+
size 606960
|
static/sample_ami.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a80537a734ae0c1f3518cee7782141673e743612e99126fb8341f9d88ade979b
|
| 3 |
+
size 1583440
|