Spaces:

don0726
/

Roop

Sleeping

App Files Files Community

don0726 committed on Mar 18

Commit

c5a6dda

verified ·

1 Parent(s): bbd3510

Create app.py

Browse files

Files changed (1) hide show

app.py +152 -0

app.py ADDED Viewed

	@@ -0,0 +1,152 @@

+import tempfile
+import shutil
+import numpy as np
+import librosa
+from fastapi import FastAPI, UploadFile, File, Form
+import uvicorn
+import gradio as gr
+from sklearn.preprocessing import StandardScaler
+# ASGI application object; the route decorators below register handlers on it.
+app = FastAPI()
+# Length of each processing window that process_audio_file slices the signal into.
+CHUNK_DURATION = 30  # seconds
+# Cosine similarity a segment must exceed to be assigned to an already-known speaker.
+SIMILARITY_THRESHOLD = 0.75
+@app.get("/")
+def home():
+    """Return a fixed status payload for the root route."""
+    status = {"message": "MFCC Speaker Diarization Server Running"}
+    return status
+# 🔹 Feature extraction
+def extract_features(y, sr):
+    """Summarise an audio segment as one flat feature vector.
+
+    The vector holds the per-coefficient mean (over axis 1) of 13 MFCCs,
+    followed by the overall mean of the spectral centroid, spectral
+    bandwidth, zero-crossing rate and RMS energy, in that order.
+
+    Args:
+        y: audio samples of the segment.
+        sr: sample rate passed through to the librosa feature functions.
+
+    Returns:
+        A NumPy array of the values listed above (17 entries for a 1-D signal).
+    """
+    mfcc_means = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13), axis=1)
+    # Each of these descriptors is collapsed to a single scalar mean.
+    scalar_descriptors = (
+        librosa.feature.spectral_centroid(y=y, sr=sr),
+        librosa.feature.spectral_bandwidth(y=y, sr=sr),
+        librosa.feature.zero_crossing_rate(y),
+        librosa.feature.rms(y=y),
+    )
+    return np.array(
+        [*mfcc_means, *(np.mean(descriptor) for descriptor in scalar_descriptors)]
+    )
+# 🔹 Split audio into small segments
+def split_audio(y, sr, frame_sec=1.0, energy_threshold=0.01):
+    """Cut a signal into fixed-length frames and keep only the loud ones.
+
+    Args:
+        y: sample sequence supporting len() and slicing (e.g. a NumPy array).
+        sr: sample rate, in samples per second.
+        frame_sec: frame length in seconds.
+        energy_threshold: a frame is kept only when its mean absolute
+            amplitude is strictly greater than this value.
+
+    Returns:
+        A tuple (segments, times): the kept frames, and for each one its
+        (start, end) position in seconds relative to the start of y.
+
+    Raises:
+        ValueError: if sr * frame_sec truncates to zero samples.
+    """
+    frame_len = int(sr * frame_sec)
+    if frame_len == 0:
+        # range() would raise anyway on a zero step; fail with a clearer message.
+        raise ValueError("sr * frame_sec must cover at least one sample")
+    segments = []
+    times = []
+    # Stop at the last full frame: a shorter trailing remainder is dropped.
+    for begin in range(0, len(y) - frame_len + 1, frame_len):
+        segment = y[begin:begin + frame_len]
+        if np.mean(np.abs(segment)) > energy_threshold:
+            segments.append(segment)
+            times.append((begin / sr, (begin + frame_len) / sr))
+    return segments, times
+# 🔹 Core diarization logic (shared by API + UI)
+def process_audio_file(file_path):
+    """Assign a speaker label to every voiced segment of an audio file.
+
+    The file is loaded at its native sample rate and walked in windows of
+    CHUNK_DURATION seconds. Each window is cut into segments by split_audio,
+    each segment is turned into a feature vector by extract_features, and
+    the vector is compared with the vectors of the speakers found so far.
+    The first stored speaker whose cosine similarity exceeds
+    SIMILARITY_THRESHOLD wins; otherwise a new "SPEAKER_<n>" is created.
+
+    Args:
+        file_path: path of the audio file to load with librosa.
+
+    Returns:
+        A dict {"segments": [...]}, where every entry has the keys
+        "speaker", "start" and "end" (seconds from the start of the file,
+        rounded to two decimals).
+    """
+    y, sr = librosa.load(file_path, sr=None)
+    total_duration = len(y) / sr
+    all_segments = []
+    speaker_embeddings = []  # raw (unscaled) feature vector of each speaker's first segment
+    speaker_labels = []  # label for the embedding at the same index
+    speaker_count = 0
+    current_time = 0
+    while current_time < total_duration:
+        start_sample = int(current_time * sr)
+        end_sample = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
+        chunk = y[start_sample:end_sample]
+        segments, times = split_audio(chunk, sr)
+        for seg, (start, end) in zip(segments, times):
+            feat = extract_features(seg, sr)
+            if speaker_embeddings:
+                # Standardise the stored embeddings together with the new
+                # vector so that no single feature dominates the cosine.
+                # NOTE(review): with exactly one stored embedding the two
+                # standardised rows are exact negatives of each other
+                # (cosine -1), so the second voiced segment can never match
+                # the first speaker - confirm whether this is intended.
+                X = StandardScaler().fit_transform(
+                    np.vstack(speaker_embeddings + [feat])
+                )
+                feat_norm = X[-1]
+                existing = X[:-1]
+            else:
+                feat_norm = feat
+                existing = []
+            feat_length = np.linalg.norm(feat_norm)
+            speaker_id = None
+            for label, emb in zip(speaker_labels, existing):
+                denominator = feat_length * np.linalg.norm(emb)
+                if denominator == 0:
+                    # A zero-length vector has no direction; dividing would
+                    # only produce NaN (never above the threshold) plus a
+                    # RuntimeWarning, so treat it as "no match" directly.
+                    continue
+                if np.dot(feat_norm, emb) / denominator > SIMILARITY_THRESHOLD:
+                    speaker_id = label
+                    break
+            if speaker_id is None:
+                speaker_count += 1
+                speaker_id = f"SPEAKER_{speaker_count}"
+                speaker_embeddings.append(feat)
+                speaker_labels.append(speaker_id)
+            # split_audio times are relative to the chunk; shift to file time.
+            all_segments.append({
+                "speaker": speaker_id,
+                "start": round(current_time + start, 2),
+                "end": round(current_time + end, 2)
+            })
+        current_time += CHUNK_DURATION
+    return {"segments": all_segments}
+# 🔹 FastAPI endpoint
+@app.post("/transcribe")
+async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
+    """Run speaker diarization on an uploaded audio file.
+
+    The upload is copied to a temporary ".wav" file so that it can be handed
+    to process_audio_file by path; the file is removed again afterwards.
+
+    Args:
+        audio: the uploaded audio file.
+        lang: form field accepted by the endpoint; not used by this handler.
+
+    Returns:
+        The dict produced by process_audio_file.
+    """
+    import os  # local import: only needed for the temp-file cleanup below
+
+    temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
+    try:
+        with temp as buffer:
+            shutil.copyfileobj(audio.file, buffer)
+        return process_audio_file(temp.name)
+    finally:
+        # delete=False leaves removal to us; without this every request
+        # would leave its upload behind on disk.
+        os.remove(temp.name)
+# 🔹 Gradio UI function
+def gradio_process(audio_file):
+    """Gradio callback: diarize the given file path, or report a missing upload.
+
+    Args:
+        audio_file: path of the uploaded audio, or None when nothing was uploaded.
+
+    Returns:
+        The result of process_audio_file, or an {"error": ...} dict for None.
+    """
+    if audio_file is not None:
+        return process_audio_file(audio_file)
+    return {"error": "No file uploaded"}
+# 🔹 Build Gradio Interface
+# Web form around gradio_process: one audio input, handed to the callback as
+# a file path (type="filepath"), and one JSON output panel for the result.
+gradio_ui = gr.Interface(
+    fn=gradio_process,
+    inputs=gr.Audio(type="filepath", label="Upload Audio"),
+    outputs=gr.JSON(label="Speaker Segments"),
+    title="Speaker Diarization (CPU)",
+    description="Upload audio and get speaker labels with timestamps"
+)
+# 🔹 Mount Gradio into FastAPI
+# Kept so the name stays importable from this module; the mount below no
+# longer uses it.
+from fastapi.middleware.wsgi import WSGIMiddleware  # noqa: F401
+# A gr.Interface is not a WSGI callable, so wrapping it in WSGIMiddleware
+# does not serve the UI. Gradio's own helper attaches the interface to the
+# FastAPI app under /ui and returns that app.
+app = gr.mount_gradio_app(app, gradio_ui, path="/ui")
+# 🔹 Run
+# When executed as a script, serve the app with uvicorn on all network
+# interfaces, port 7860.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)