don0726 committed on
Commit
42e288c
·
verified ·
1 Parent(s): 9b24351

Delete app.py

Browse files
Files changed (1) hide show
  1. app.py +0 -152
app.py DELETED
@@ -1,152 +0,0 @@
1
- import tempfile
2
- import shutil
3
- import numpy as np
4
- import librosa
5
- from fastapi import FastAPI, UploadFile, File, Form
6
- import uvicorn
7
- import gradio as gr
8
- from sklearn.preprocessing import StandardScaler
9
-
10
# Application object: the HTTP routes in this module are registered on it,
# and it is what uvicorn serves when the module is run as a script.
app = FastAPI()
11
-
12
# Length, in seconds, of the window of audio that process_audio_file slices
# off and analyses on each pass of its outer loop.
CHUNK_DURATION = 30

# Cosine-similarity cutoff: a segment is attributed to an already-known
# speaker only when its similarity to that speaker exceeds this value.
SIMILARITY_THRESHOLD = 0.75
14
-
15
@app.get("/")
def home():
    """Return a fixed status message for GET requests to the root path."""
    status = {"message": "MFCC Speaker Diarization Server Running"}
    return status
18
-
19
- # 🔹 Feature extraction
20
def extract_features(y, sr):
    """Summarise an audio segment as a flat vector of time-averaged features.

    The vector holds, in this order: the per-coefficient mean of 13 MFCCs,
    then the mean spectral centroid, mean spectral bandwidth, mean
    zero-crossing rate and mean RMS energy.

    Args:
        y: Audio samples, handed unchanged to the librosa feature functions.
        sr: Sampling rate of ``y``.

    Returns:
        A 1-D ``numpy`` array (17 values when ``y`` is a mono signal).
    """
    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
    scalar_means = [
        np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)),
        np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)),
        np.mean(librosa.feature.zero_crossing_rate(y)),
        np.mean(librosa.feature.rms(y=y)),
    ]
    return np.array([*mfcc_means, *scalar_means])
39
-
40
- # 🔹 Split audio into small segments
41
def split_audio(y, sr, frame_sec=1.0, energy_threshold=0.01):
    """Cut a signal into fixed-length frames and keep only the non-quiet ones.

    Frames are taken back to back from the start of ``y``. A trailing frame
    shorter than the frame length is dropped, so every returned segment has
    the same number of samples.

    Args:
        y: 1-D sequence of audio samples (supports ``len`` and slicing).
        sr: Sampling rate of ``y`` in samples per second.
        frame_sec: Frame length in seconds.
        energy_threshold: A frame is kept only when the mean of its absolute
            sample values is strictly greater than this.

    Returns:
        A ``(segments, times)`` pair of equal-length lists: the kept frames
        and, for each, its ``(start, end)`` position in seconds measured from
        the beginning of ``y``.

    Raises:
        ValueError: If ``sr * frame_sec`` truncates to less than one sample.
    """
    frame_len = int(sr * frame_sec)
    if frame_len <= 0:
        # Without this check a zero step reaches range() and fails with an
        # unrelated-looking message; a negative one silently yields nothing.
        raise ValueError(
            f"frame length must be at least one sample, got {frame_len} "
            f"(sr={sr}, frame_sec={frame_sec})"
        )

    segments = []
    times = []
    # The stop value excludes any start index whose frame would run past the
    # end of the signal, i.e. the final partial frame.
    for start in range(0, len(y) - frame_len + 1, frame_len):
        segment = y[start:start + frame_len]
        if np.mean(np.abs(segment)) > energy_threshold:
            segments.append(segment)
            times.append((start / sr, (start + frame_len) / sr))

    return segments, times
57
-
58
- # 🔹 Core diarization logic (shared by API + UI)
59
def process_audio_file(file_path):
    """Label the non-quiet frames of an audio file with speaker identifiers.

    The file is loaded at its native sampling rate and walked in windows of
    ``CHUNK_DURATION`` seconds. Each window is cut into frames by
    ``split_audio``; each kept frame is turned into a feature vector by
    ``extract_features`` and either attributed to the most similar known
    speaker or registered as a new one.

    Args:
        file_path: Path of the audio file, passed to ``librosa.load``.

    Returns:
        ``{"segments": [...]}`` where every entry is a dict with keys
        ``"speaker"`` (``"SPEAKER_<n>"``, numbered from 1 in order of first
        appearance), ``"start"`` and ``"end"`` (seconds from the beginning of
        the file, rounded to two decimals).
    """
    y, sr = librosa.load(file_path, sr=None)
    total_duration = len(y) / sr

    all_segments = []
    speaker_embeddings = []  # raw feature vector of each speaker's first frame
    speaker_labels = []      # label of the embedding stored at the same index

    current_time = 0
    while current_time < total_duration:
        start_sample = int(current_time * sr)
        end_sample = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
        segments, times = split_audio(y[start_sample:end_sample], sr)

        for seg, (start, end) in zip(segments, times):
            feat = extract_features(seg, sr)

            match = _best_speaker_match(feat, speaker_embeddings)
            if match is None:
                speaker_id = f"SPEAKER_{len(speaker_labels) + 1}"
                speaker_embeddings.append(feat)
                speaker_labels.append(speaker_id)
            else:
                speaker_id = speaker_labels[match]

            # Frame times are relative to the window; shift them so they are
            # relative to the whole file.
            all_segments.append({
                "speaker": speaker_id,
                "start": round(current_time + start, 2),
                "end": round(current_time + end, 2),
            })

        current_time += CHUNK_DURATION

    return {"segments": all_segments}


def _best_speaker_match(feat, speaker_embeddings):
    """Return the index of the stored embedding most similar to ``feat``.

    Returns ``None`` when nothing is stored yet or when no cosine similarity
    exceeds ``SIMILARITY_THRESHOLD``.
    """
    if not speaker_embeddings:
        return None

    # NOTE(review): the scaler is refit on the stored embeddings plus the
    # query on every call. With exactly one stored embedding the two
    # standardised rows are exact negatives of each other, so the second
    # non-quiet frame can never match the first speaker. Left unchanged here
    # to avoid redesigning the labelling; worth revisiting.
    scaled = StandardScaler().fit_transform(
        np.vstack(speaker_embeddings + [feat])
    )
    query, known = scaled[-1], scaled[:-1]

    # Cosine similarity against every stored speaker at once. A zero-norm
    # vector has no defined cosine; it is scored -inf so it can never match,
    # instead of producing NaN and a RuntimeWarning.
    denom = np.linalg.norm(known, axis=1) * np.linalg.norm(query)
    similarities = np.full(len(known), -np.inf)
    np.divide(known @ query, denom, out=similarities, where=denom > 0)

    # Pick the closest speaker rather than the first one over the threshold.
    best = int(np.argmax(similarities))
    if similarities[best] > SIMILARITY_THRESHOLD:
        return best
    return None
119
-
120
- # 🔹 FastAPI endpoint
121
@app.post("/transcribe")
async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
    """Run speaker labelling on an uploaded audio file.

    The upload is copied to a temporary ``.wav``-suffixed file, handed to
    ``process_audio_file``, and the temporary file is removed afterwards
    whether or not processing succeeded.

    Args:
        audio: The uploaded audio file.
        lang: Form field accepted for compatibility with callers; it is not
            used by the processing.

    Returns:
        The dict produced by ``process_audio_file``.
    """
    import os

    from fastapi.concurrency import run_in_threadpool

    # delete=False so the file can be closed and then reopened by path.
    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    try:
        with temp as buffer:
            shutil.copyfileobj(audio.file, buffer)
        # The analysis is blocking, CPU-bound work; run it off the event loop
        # so other requests are not stalled while it executes.
        return await run_in_threadpool(process_audio_file, temp.name)
    finally:
        # Without this every request would leave its upload on disk.
        os.remove(temp.name)
129
-
130
- # 🔹 Gradio UI function
131
def gradio_process(audio_file):
    """Gradio callback: run speaker labelling on the uploaded file.

    Args:
        audio_file: Value supplied by the Gradio audio input, or ``None``
            when nothing was uploaded.

    Returns:
        The result of ``process_audio_file`` for the given file, or an
        ``{"error": ...}`` dict when ``audio_file`` is ``None``.
    """
    if audio_file is not None:
        return process_audio_file(audio_file)
    return {"error": "No file uploaded"}
136
-
137
- # 🔹 Build Gradio Interface
138
# Browser front end: an audio input passed to gradio_process as a file path,
# with the result rendered as JSON.
gradio_ui = gr.Interface(
    title="Speaker Diarization (CPU)",
    description="Upload audio and get speaker labels with timestamps",
    fn=gradio_process,
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.JSON(label="Speaker Segments"),
)
145
-
146
- # 🔹 Mount Gradio into FastAPI
147
# Retained so the module keeps exporting the name; it is no longer used below
# and can be dropped once nothing imports it from here.
from fastapi.middleware.wsgi import WSGIMiddleware  # noqa: F401

# A gr.Interface is not a WSGI callable, so wrapping it in WSGIMiddleware
# cannot serve it. gr.mount_gradio_app attaches the UI to the FastAPI app at
# the given path and returns the app.
app = gr.mount_gradio_app(app, gradio_ui, path="/ui")

# Start the server when the module is executed directly.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)