don0726 committed on
Commit
6eb3f02
·
verified ·
1 Parent(s): 6f1a080

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +67 -89
app.py CHANGED
@@ -1,109 +1,87 @@
 
 
 
 
 
 
1
  import numpy as np
2
- import librosa
3
- import gradio as gr
4
- from sklearn.preprocessing import StandardScaler
5
 
6
- CHUNK_DURATION = 30
7
- SIMILARITY_THRESHOLD = 0.75
8
 
9
- def extract_features(y, sr):
10
- features = []
11
 
12
- mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
13
- features.extend(np.mean(mfcc, axis=1))
14
 
15
- features.append(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
16
- features.append(np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)))
17
- features.append(np.mean(librosa.feature.zero_crossing_rate(y)))
18
- features.append(np.mean(librosa.feature.rms(y=y)))
19
 
20
- return np.array(features)
 
21
 
22
- def split_audio(y, sr, frame_sec=1.0):
23
- frame_len = int(sr * frame_sec)
24
- segments, times = [], []
25
 
26
- for i in range(0, len(y), frame_len):
27
- seg = y[i:i+frame_len]
28
- if len(seg) < frame_len:
29
- continue
30
 
31
- if np.mean(np.abs(seg)) > 0.01:
32
- segments.append(seg)
33
- times.append((i/sr, (i+frame_len)/sr))
34
 
35
- return segments, times
 
 
36
 
37
- def process_audio(file_path):
38
- y, sr = librosa.load(file_path, sr=None)
39
- total_duration = len(y) / sr
40
 
41
- all_segments = []
42
  speaker_embeddings = []
43
  speaker_labels = []
44
- speaker_count = 0
45
 
46
- current_time = 0
 
 
 
 
 
47
 
48
- while current_time < total_duration:
49
- start = int(current_time * sr)
50
- end = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
51
  chunk = y[start:end]
52
 
53
- segments, times = split_audio(chunk, sr)
54
-
55
- for seg, (s, e) in zip(segments, times):
56
- feat = extract_features(seg, sr)
57
-
58
- if speaker_embeddings:
59
- scaler = StandardScaler()
60
- X = np.vstack(speaker_embeddings + [feat])
61
- X = scaler.fit_transform(X)
62
- feat_norm = X[-1]
63
- existing = X[:-1]
64
- else:
65
- feat_norm = feat
66
- existing = []
67
-
68
- assigned = False
69
- for i, emb in enumerate(existing):
70
- sim = np.dot(feat_norm, emb) / (
71
- np.linalg.norm(feat_norm) * np.linalg.norm(emb)
72
  )
73
- if sim > SIMILARITY_THRESHOLD:
74
- speaker_id = speaker_labels[i]
75
- assigned = True
76
- break
77
-
78
- if not assigned:
79
- speaker_count += 1
80
- speaker_id = f"SPEAKER_{speaker_count}"
81
- speaker_embeddings.append(feat)
82
- speaker_labels.append(speaker_id)
83
-
84
- all_segments.append({
85
- "speaker": speaker_id,
86
- "start": round(current_time + s, 2),
87
- "end": round(current_time + e, 2)
88
- })
89
-
90
- current_time += CHUNK_DURATION
91
-
92
- return {"segments": all_segments}
93
-
94
- # 🎯 Gradio UI
95
- def run(audio):
96
- if audio is None:
97
- return {"error": "Upload audio"}
98
-
99
- return process_audio(audio)
100
-
101
- demo = gr.Interface(
102
- fn=run,
103
- inputs=gr.Audio(type="filepath"),
104
- outputs=gr.JSON(),
105
- title="Speaker Diarization (CPU)",
106
- description="Upload audio → get speaker labels with timestamps"
107
- )
108
 
109
- demo.launch()
 
 
1
+ from fastapi import FastAPI, UploadFile, File, Form
2
+ import tempfile
3
+ import shutil
4
+ import uvicorn
5
+ import whisperx
6
+ import torch
7
  import numpy as np
8
+ import soundfile as sf
9
+ from speechbrain.pretrained import EncoderClassifier
 
10
 
11
+ app = FastAPI()
 
12
 
13
+ device = "cpu"
 
14
 
15
+ # Load models (light)
16
+ asr_model = whisperx.load_model("small", device)
17
 
18
+ speaker_model = EncoderClassifier.from_hparams(
19
+ source="speechbrain/spkrec-ecapa-voxceleb",
20
+ run_opts={"device": device}
21
+ )
22
 
23
+ @app.post("/transcribe")
24
+ async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
25
 
26
+ temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
27
+ with temp as buffer:
28
+ shutil.copyfileobj(audio.file, buffer)
29
 
30
+ audio_path = temp.name
 
 
 
31
 
32
+ # Load audio
33
+ audio_data = whisperx.load_audio(audio_path)
 
34
 
35
+ # Transcribe
36
+ result = asr_model.transcribe(audio_data, language=lang)
37
+ segments = result["segments"]
38
 
39
+ y, sr = sf.read(audio_path)
 
 
40
 
 
41
  speaker_embeddings = []
42
  speaker_labels = []
 
43
 
44
+ final_segments = []
45
+
46
+ for i, seg in enumerate(segments):
47
+
48
+ start = int(seg["start"] * sr)
49
+ end = int(seg["end"] * sr)
50
 
 
 
 
51
  chunk = y[start:end]
52
 
53
+ if len(chunk) < sr * 0.5: # skip very short
54
+ continue
55
+
56
+ chunk_tensor = torch.tensor(chunk).unsqueeze(0)
57
+
58
+ emb = speaker_model.encode_batch(chunk_tensor)
59
+ emb = emb.squeeze().detach().cpu().numpy()
60
+
61
+ # Assign speakers
62
+ if len(speaker_embeddings) < 2:
63
+ speaker_id = f"SPEAKER_{len(speaker_embeddings)+1}"
64
+ speaker_embeddings.append(emb)
65
+ speaker_labels.append(speaker_id)
66
+ else:
67
+ sims = []
68
+ for e in speaker_embeddings:
69
+ sim = np.dot(emb, e) / (
70
+ np.linalg.norm(emb) * np.linalg.norm(e)
 
71
  )
72
+ sims.append(sim)
73
+
74
+ speaker_id = speaker_labels[np.argmax(sims)]
75
+
76
+ final_segments.append({
77
+ "speaker": speaker_id,
78
+ "start": round(seg["start"], 2),
79
+ "end": round(seg["end"], 2),
80
+ "text": seg["text"]
81
+ })
82
+
83
+ return {"segments": final_segments}
84
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ if __name__ == "__main__":
87
+ uvicorn.run(app, host="0.0.0.0", port=7860)