don0726 committed on
Commit
c5a6dda
·
verified ·
1 Parent(s): bbd3510

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import shutil
3
+ import numpy as np
4
+ import librosa
5
+ from fastapi import FastAPI, UploadFile, File, Form
6
+ import uvicorn
7
+ import gradio as gr
8
+ from sklearn.preprocessing import StandardScaler
9
+
10
# ASGI application object; the routes below register themselves on it.
app = FastAPI()

# Length, in seconds, of each slice of the recording handled per pass of the
# diarization loop.
CHUNK_DURATION = 30
# Cosine similarity a segment must exceed to be given an existing speaker label.
SIMILARITY_THRESHOLD = 0.75
14
+
15
@app.get("/")
def home():
    """Root route: return a static message saying the server is running."""
    status = {"message": "MFCC Speaker Diarization Server Running"}
    return status
18
+
19
+ # 🔹 Feature extraction
20
def extract_features(y, sr):
    """Summarise an audio segment as a flat vector of time-averaged descriptors.

    The vector holds, in order: the per-coefficient means of 13 MFCCs, then
    the mean spectral centroid, mean spectral bandwidth, mean zero-crossing
    rate and mean RMS energy.

    Args:
        y: Audio samples of the segment (passed straight to librosa).
        sr: Sampling rate of ``y`` in Hz.

    Returns:
        A NumPy array with the MFCC means followed by the four scalar means.
    """
    # Average each MFCC coefficient over time (axis 1 is the frame axis).
    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)

    # Each remaining descriptor is collapsed to a single global mean.
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        np.mean(librosa.feature.zero_crossing_rate(y)),
        np.mean(librosa.feature.rms(y=y)),
    ]

    return np.array([*mfcc_means, *scalar_means])
39
+
40
+ # 🔹 Split audio into small segments
41
def split_audio(y, sr, frame_sec=1.0, energy_threshold=0.01):
    """Cut a signal into fixed-length frames and keep only the non-quiet ones.

    Frames are taken back to back with no overlap. A trailing frame shorter
    than the full frame length is discarded, and a frame is kept only when its
    mean absolute amplitude is strictly greater than ``energy_threshold``.

    Args:
        y: 1-D array of audio samples.
        sr: Sampling rate of ``y`` in Hz.
        frame_sec: Frame length in seconds.
        energy_threshold: Mean absolute amplitude a frame must exceed to be
            kept. The default matches the previously hard-coded value.

    Returns:
        A ``(segments, times)`` tuple of equal-length lists. ``segments``
        holds the kept frames; ``times`` holds the matching
        ``(start_seconds, end_seconds)`` pairs relative to the start of ``y``.

    Raises:
        ValueError: If ``sr * frame_sec`` truncates to less than one sample.
    """
    frame_len = int(sr * frame_sec)
    if frame_len <= 0:
        # A zero step would otherwise fail inside range() with an unrelated
        # message; report the real cause instead.
        raise ValueError(
            f"frame_sec * sr must cover at least one sample, got {frame_len}"
        )

    segments = []
    times = []

    # Stopping at len(y) - frame_len + 1 visits full frames only, so the
    # trailing partial frame never has to be filtered out.
    for start in range(0, len(y) - frame_len + 1, frame_len):
        segment = y[start:start + frame_len]
        if np.mean(np.abs(segment)) > energy_threshold:
            segments.append(segment)
            times.append((start / sr, (start + frame_len) / sr))

    return segments, times
57
+
58
+ # 🔹 Core diarization logic (shared by API + UI)
59
def _cosine_similarity(a, b):
    """Return the cosine similarity of ``a`` and ``b``, or None if a norm is zero."""
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return None
    return np.dot(a, b) / denom


def process_audio_file(file_path):
    """Label the non-quiet segments of an audio file with speaker ids.

    The file is loaded at its native sampling rate and walked in windows of
    ``CHUNK_DURATION`` seconds. Each window is cut into segments by
    ``split_audio``; every segment is turned into a feature vector by
    ``extract_features`` and compared with the stored speaker embeddings. The
    first stored embedding whose cosine similarity exceeds
    ``SIMILARITY_THRESHOLD`` donates its label; otherwise a new
    ``SPEAKER_<n>`` label is created and the raw feature vector is stored.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        ``{"segments": [...]}`` where each entry is a dict with the keys
        ``"speaker"`` (label string), ``"start"`` and ``"end"`` (seconds from
        the start of the file, rounded to two decimals).
    """
    y, sr = librosa.load(file_path, sr=None)
    total_duration = len(y) / sr

    all_segments = []
    speaker_embeddings = []  # raw (unscaled) feature vector per known speaker
    speaker_labels = []      # label for the embedding at the same index

    current_time = 0
    while current_time < total_duration:
        start_sample = int(current_time * sr)
        end_sample = int(min((current_time + CHUNK_DURATION) * sr, len(y)))

        segments, times = split_audio(y[start_sample:end_sample], sr)

        for seg, (start, end) in zip(segments, times):
            feat = extract_features(seg, sr)

            if speaker_embeddings:
                # NOTE(review): the scaler is refit on only the stored
                # embeddings plus this segment. With a single stored embedding
                # that standardises two rows per column, which appears to make
                # them sign-opposites (or all zeros when equal), so a match is
                # unlikely -- confirm whether a fixed scaler or raw features
                # were intended.
                scaled = StandardScaler().fit_transform(
                    np.vstack(speaker_embeddings + [feat])
                )
                feat_norm, existing = scaled[-1], scaled[:-1]
            else:
                feat_norm, existing = feat, []

            speaker_id = None
            for label, emb in zip(speaker_labels, existing):
                similarity = _cosine_similarity(feat_norm, emb)
                # A zero-norm vector has no direction; skip it explicitly
                # instead of dividing by zero and comparing against NaN.
                if similarity is not None and similarity > SIMILARITY_THRESHOLD:
                    speaker_id = label
                    break

            if speaker_id is None:
                speaker_id = f"SPEAKER_{len(speaker_labels) + 1}"
                speaker_embeddings.append(feat)
                speaker_labels.append(speaker_id)

            # Segment times are relative to the chunk; shift to file time.
            all_segments.append({
                "speaker": speaker_id,
                "start": round(current_time + start, 2),
                "end": round(current_time + end, 2),
            })

        current_time += CHUNK_DURATION

    return {"segments": all_segments}
119
+
120
+ # 🔹 FastAPI endpoint
121
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
    """Diarize an uploaded audio file and return its speaker segments.

    The upload is spooled to a temporary ``.wav``-suffixed file because
    ``process_audio_file`` takes a path. The file is removed once processing
    finishes, whether it succeeds or raises.

    Args:
        audio: Uploaded audio file (multipart form field ``audio``).
        lang: Form field, default ``"en"``. Accepted but not used here.

    Returns:
        The dict produced by ``process_audio_file``.
    """
    import os  # local import: only needed for temp-file cleanup

    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        # Close the handle before processing so the file can be reopened by path.
        with temp as buffer:
            shutil.copyfileobj(audio.file, buffer)

        # NOTE(review): this call is synchronous inside an async route, so it
        # presumably blocks the event loop while it runs -- confirm whether it
        # should be offloaded to a worker thread.
        return process_audio_file(temp.name)
    finally:
        # delete=False means nothing else cleans this file up.
        os.remove(temp.name)
129
+
130
+ # 🔹 Gradio UI function
131
def gradio_process(audio_file):
    """Run diarization for the Gradio UI.

    Args:
        audio_file: Path of the uploaded audio, or None when nothing was
            uploaded.

    Returns:
        The result of ``process_audio_file`` for the given path, or an
        ``{"error": ...}`` dict when ``audio_file`` is None.
    """
    if audio_file is not None:
        return process_audio_file(audio_file)

    return {"error": "No file uploaded"}
136
+
137
+ # 🔹 Build Gradio Interface
138
# Input widget hands the callback a file path; output widget renders the
# returned dict as JSON.
_audio_input = gr.Audio(type="filepath", label="Upload Audio")
_segments_output = gr.JSON(label="Speaker Segments")

gradio_ui = gr.Interface(
    fn=gradio_process,
    inputs=_audio_input,
    outputs=_segments_output,
    title="Speaker Diarization (CPU)",
    description="Upload audio and get speaker labels with timestamps",
)
145
+
146
+ # 🔹 Mount Gradio into FastAPI
147
# A Gradio Interface is not a WSGI application, so it cannot be wrapped in
# WSGIMiddleware. Gradio's own helper mounts it on the FastAPI app at the
# given path and returns the app, which keeps the routes registered above.
app = gr.mount_gradio_app(app, gradio_ui, path="/ui")
149
+
150
+ # 🔹 Run
151
if __name__ == "__main__":
    # Started directly as a script: serve the app on every interface, port 7860.
    uvicorn.run(app=app, host="0.0.0.0", port=7860)