Spaces:

don0726
/

Roop

Sleeping

App Files Community

don0726 committed on Mar 18

Commit

7757a4a

verified ·

1 Parent(s): c5a6dda

Update app.py

Browse files

Files changed (1) hide show

app.py +31 -74

app.py CHANGED Viewed

@@ -1,63 +1,40 @@
-import tempfile
-import shutil
 import numpy as np
 import librosa
-from fastapi import FastAPI, UploadFile, File, Form
-import uvicorn
 import gradio as gr
 from sklearn.preprocessing import StandardScaler
-app = FastAPI()
-CHUNK_DURATION = 30  # seconds
 SIMILARITY_THRESHOLD = 0.75
-@app.get("/")
-def home():
-    return {"message": "MFCC Speaker Diarization Server Running"}
-# 🔹 Feature extraction
 def extract_features(y, sr):
     features = []
     mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
     features.extend(np.mean(mfcc, axis=1))
-    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-    features.append(np.mean(spectral_centroid))
-    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
-    features.append(np.mean(spectral_bandwidth))
-    zcr = librosa.feature.zero_crossing_rate(y)
-    features.append(np.mean(zcr))
-    rms = librosa.feature.rms(y=y)
-    features.append(np.mean(rms))
     return np.array(features)
-# 🔹 Split audio into small segments
 def split_audio(y, sr, frame_sec=1.0):
     frame_len = int(sr * frame_sec)
-    segments = []
-    times = []
     for i in range(0, len(y), frame_len):
-        segment = y[i:i+frame_len]
-        if len(segment) < frame_len:
             continue
-        energy = np.mean(np.abs(segment))
-        if energy > 0.01:
-            segments.append(segment)
             times.append((i/sr, (i+frame_len)/sr))
     return segments, times
-# 🔹 Core diarization logic (shared by API + UI)
-def process_audio_file(file_path):
     y, sr = librosa.load(file_path, sr=None)
     total_duration = len(y) / sr
@@ -69,19 +46,16 @@ def process_audio_file(file_path):
     current_time = 0
     while current_time < total_duration:
-        start_sample = int(current_time * sr)
-        end_sample = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
-        chunk = y[start_sample:end_sample]
         segments, times = split_audio(chunk, sr)
-        for seg, (start, end) in zip(segments, times):
             feat = extract_features(seg, sr)
-            if len(speaker_embeddings) > 0:
                 scaler = StandardScaler()
                 X = np.vstack(speaker_embeddings + [feat])
                 X = scaler.fit_transform(X)
@@ -93,10 +67,10 @@ def process_audio_file(file_path):
             assigned = False
             for i, emb in enumerate(existing):
-                similarity = np.dot(feat_norm, emb) / (
                     np.linalg.norm(feat_norm) * np.linalg.norm(emb)
                 )
-                if similarity > SIMILARITY_THRESHOLD:
                     speaker_id = speaker_labels[i]
                     assigned = True
                     break
@@ -109,44 +83,27 @@ def process_audio_file(file_path):
             all_segments.append({
                 "speaker": speaker_id,
-                "start": round(current_time + start, 2),
-                "end": round(current_time + end, 2)
             })
         current_time += CHUNK_DURATION
     return {"segments": all_segments}
-# 🔹 FastAPI endpoint
-@app.post("/transcribe")
-async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
-    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
-    with temp as buffer:
-        shutil.copyfileobj(audio.file, buffer)
-    result = process_audio_file(temp.name)
-    return result
-# 🔹 Gradio UI function
-def gradio_process(audio_file):
-    if audio_file is None:
-        return {"error": "No file uploaded"}
-    return process_audio_file(audio_file)
-# 🔹 Build Gradio Interface
-gradio_ui = gr.Interface(
-    fn=gradio_process,
-    inputs=gr.Audio(type="filepath", label="Upload Audio"),
-    outputs=gr.JSON(label="Speaker Segments"),
     title="Speaker Diarization (CPU)",
-    description="Upload audio and get speaker labels with timestamps"
 )
-# 🔹 Mount Gradio into FastAPI
-from fastapi.middleware.wsgi import WSGIMiddleware
-app.mount("/ui", WSGIMiddleware(gradio_ui))
-# 🔹 Run
-if __name__ == "__main__":
-    uvicorn.run(app, host="0.0.0.0", port=7860)

 import numpy as np
 import librosa
 import gradio as gr
 from sklearn.preprocessing import StandardScaler
+CHUNK_DURATION = 30
 SIMILARITY_THRESHOLD = 0.75
 def extract_features(y, sr):
     features = []
     mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
     features.extend(np.mean(mfcc, axis=1))
+    features.append(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
+    features.append(np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)))
+    features.append(np.mean(librosa.feature.zero_crossing_rate(y)))
+    features.append(np.mean(librosa.feature.rms(y=y)))
     return np.array(features)
 def split_audio(y, sr, frame_sec=1.0):
     frame_len = int(sr * frame_sec)
+    segments, times = [], []
     for i in range(0, len(y), frame_len):
+        seg = y[i:i+frame_len]
+        if len(seg) < frame_len:
             continue
+        if np.mean(np.abs(seg)) > 0.01:
+            segments.append(seg)
             times.append((i/sr, (i+frame_len)/sr))
     return segments, times
+def process_audio(file_path):
     y, sr = librosa.load(file_path, sr=None)
     total_duration = len(y) / sr
     current_time = 0
     while current_time < total_duration:
+        start = int(current_time * sr)
+        end = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
+        chunk = y[start:end]
         segments, times = split_audio(chunk, sr)
+        for seg, (s, e) in zip(segments, times):
             feat = extract_features(seg, sr)
+            if speaker_embeddings:
                 scaler = StandardScaler()
                 X = np.vstack(speaker_embeddings + [feat])
                 X = scaler.fit_transform(X)
             assigned = False
             for i, emb in enumerate(existing):
+                sim = np.dot(feat_norm, emb) / (
                     np.linalg.norm(feat_norm) * np.linalg.norm(emb)
                 )
+                if sim > SIMILARITY_THRESHOLD:
                     speaker_id = speaker_labels[i]
                     assigned = True
                     break
             all_segments.append({
                 "speaker": speaker_id,
+                "start": round(current_time + s, 2),
+                "end": round(current_time + e, 2)
             })
         current_time += CHUNK_DURATION
     return {"segments": all_segments}
+# 🎯 Gradio UI
+def run(audio):
+    if audio is None:
+        return {"error": "Upload audio"}
+    return process_audio(audio)
+demo = gr.Interface(
+    fn=run,
+    inputs=gr.Audio(type="filepath"),
+    outputs=gr.JSON(),
     title="Speaker Diarization (CPU)",
+    description="Upload audio → get speaker labels with timestamps"
 )
+demo.launch()