don0726 committed on
Commit
7757a4a
·
verified ·
1 Parent(s): c5a6dda

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +31 -74
app.py CHANGED
@@ -1,63 +1,40 @@
1
- import tempfile
2
- import shutil
3
  import numpy as np
4
  import librosa
5
- from fastapi import FastAPI, UploadFile, File, Form
6
- import uvicorn
7
  import gradio as gr
8
  from sklearn.preprocessing import StandardScaler
9
 
10
- app = FastAPI()
11
-
12
- CHUNK_DURATION = 30 # seconds
13
  SIMILARITY_THRESHOLD = 0.75
14
 
15
- @app.get("/")
16
- def home():
17
- return {"message": "MFCC Speaker Diarization Server Running"}
18
-
19
- # 🔹 Feature extraction
20
  def extract_features(y, sr):
21
  features = []
22
 
23
  mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
24
  features.extend(np.mean(mfcc, axis=1))
25
 
26
- spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
27
- features.append(np.mean(spectral_centroid))
28
-
29
- spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)
30
- features.append(np.mean(spectral_bandwidth))
31
-
32
- zcr = librosa.feature.zero_crossing_rate(y)
33
- features.append(np.mean(zcr))
34
-
35
- rms = librosa.feature.rms(y=y)
36
- features.append(np.mean(rms))
37
 
38
  return np.array(features)
39
 
40
- # 🔹 Split audio into small segments
41
  def split_audio(y, sr, frame_sec=1.0):
42
  frame_len = int(sr * frame_sec)
43
- segments = []
44
- times = []
45
 
46
  for i in range(0, len(y), frame_len):
47
- segment = y[i:i+frame_len]
48
- if len(segment) < frame_len:
49
  continue
50
 
51
- energy = np.mean(np.abs(segment))
52
- if energy > 0.01:
53
- segments.append(segment)
54
  times.append((i/sr, (i+frame_len)/sr))
55
 
56
  return segments, times
57
 
58
- # 🔹 Core diarization logic (shared by API + UI)
59
- def process_audio_file(file_path):
60
-
61
  y, sr = librosa.load(file_path, sr=None)
62
  total_duration = len(y) / sr
63
 
@@ -69,19 +46,16 @@ def process_audio_file(file_path):
69
  current_time = 0
70
 
71
  while current_time < total_duration:
72
-
73
- start_sample = int(current_time * sr)
74
- end_sample = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
75
-
76
- chunk = y[start_sample:end_sample]
77
 
78
  segments, times = split_audio(chunk, sr)
79
 
80
- for seg, (start, end) in zip(segments, times):
81
-
82
  feat = extract_features(seg, sr)
83
 
84
- if len(speaker_embeddings) > 0:
85
  scaler = StandardScaler()
86
  X = np.vstack(speaker_embeddings + [feat])
87
  X = scaler.fit_transform(X)
@@ -93,10 +67,10 @@ def process_audio_file(file_path):
93
 
94
  assigned = False
95
  for i, emb in enumerate(existing):
96
- similarity = np.dot(feat_norm, emb) / (
97
  np.linalg.norm(feat_norm) * np.linalg.norm(emb)
98
  )
99
- if similarity > SIMILARITY_THRESHOLD:
100
  speaker_id = speaker_labels[i]
101
  assigned = True
102
  break
@@ -109,44 +83,27 @@ def process_audio_file(file_path):
109
 
110
  all_segments.append({
111
  "speaker": speaker_id,
112
- "start": round(current_time + start, 2),
113
- "end": round(current_time + end, 2)
114
  })
115
 
116
  current_time += CHUNK_DURATION
117
 
118
  return {"segments": all_segments}
119
 
120
- # 🔹 FastAPI endpoint
121
- @app.post("/transcribe")
122
- async def transcribe(audio: UploadFile = File(...), lang: str = Form("en")):
123
- temp = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
124
- with temp as buffer:
125
- shutil.copyfileobj(audio.file, buffer)
126
 
127
- result = process_audio_file(temp.name)
128
- return result
129
 
130
- # 🔹 Gradio UI function
131
- def gradio_process(audio_file):
132
- if audio_file is None:
133
- return {"error": "No file uploaded"}
134
-
135
- return process_audio_file(audio_file)
136
-
137
- # 🔹 Build Gradio Interface
138
- gradio_ui = gr.Interface(
139
- fn=gradio_process,
140
- inputs=gr.Audio(type="filepath", label="Upload Audio"),
141
- outputs=gr.JSON(label="Speaker Segments"),
142
  title="Speaker Diarization (CPU)",
143
- description="Upload audio and get speaker labels with timestamps"
144
  )
145
 
146
- # 🔹 Mount Gradio into FastAPI
147
- from fastapi.middleware.wsgi import WSGIMiddleware
148
- app.mount("/ui", WSGIMiddleware(gradio_ui))
149
-
150
- # 🔹 Run
151
- if __name__ == "__main__":
152
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
 
1
  import numpy as np
2
  import librosa
 
 
3
  import gradio as gr
4
  from sklearn.preprocessing import StandardScaler
5
 
6
# Length, in seconds, of each window of audio that process_audio slices off
# and diarizes per loop iteration.
CHUNK_DURATION = 30

# Cosine-similarity cutoff: a segment whose similarity to an already-seen
# speaker embedding exceeds this value is given that speaker's label.
SIMILARITY_THRESHOLD = 0.75
8
 
 
 
 
 
 
def extract_features(y, sr):
    """Summarise one audio segment as a fixed-length feature vector.

    Args:
        y: Audio samples for the segment, passed straight to librosa.
            NOTE(review): presumably a mono 1-D float array -- confirm.
        sr: Sampling rate of ``y``.

    Returns:
        A 1-D ``numpy`` array holding, in order: the time-average of each of
        the 13 MFCC coefficients, then the mean spectral centroid, mean
        spectral bandwidth, mean zero-crossing rate and mean RMS energy.
    """
    features = []

    # One averaged value per MFCC coefficient (mean over the frame axis).
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    features.extend(np.mean(mfcc, axis=1))

    # Four scalar descriptors, each collapsed to a single mean.
    features.append(np.mean(librosa.feature.spectral_centroid(y=y, sr=sr)))
    features.append(np.mean(librosa.feature.spectral_bandwidth(y=y, sr=sr)))
    features.append(np.mean(librosa.feature.zero_crossing_rate(y)))
    features.append(np.mean(librosa.feature.rms(y=y)))

    return np.array(features)
21
 
 
def split_audio(y, sr, frame_sec=1.0):
    """Cut ``y`` into fixed-length frames and keep only the loud ones.

    Args:
        y: Array of audio samples.
        sr: Sampling rate of ``y`` (samples per second).
        frame_sec: Frame length in seconds.

    Returns:
        A ``(segments, times)`` pair of equal-length lists. ``segments[k]`` is
        a frame of exactly ``int(sr * frame_sec)`` samples and ``times[k]`` is
        its ``(start, end)`` offset in seconds measured from the start of
        ``y``.

    Raises:
        ValueError: If ``sr * frame_sec`` truncates to a non-positive number
            of samples.
    """
    frame_len = int(sr * frame_sec)
    if frame_len <= 0:
        raise ValueError(
            f"frame length must be at least one sample, got {frame_len}"
        )

    segments, times = [], []

    # Stepping only over full frames drops a trailing partial frame.
    for start in range(0, len(y) - frame_len + 1, frame_len):
        seg = y[start:start + frame_len]

        # Frames whose mean absolute amplitude is at or below 0.01 are skipped.
        # Both lists are appended together so they stay index-aligned for
        # callers that zip them.
        if np.mean(np.abs(seg)) > 0.01:
            segments.append(seg)
            times.append((start / sr, (start + frame_len) / sr))

    return segments, times
36
 
37
+ def process_audio(file_path):
 
 
38
  y, sr = librosa.load(file_path, sr=None)
39
  total_duration = len(y) / sr
40
 
 
46
  current_time = 0
47
 
48
  while current_time < total_duration:
49
+ start = int(current_time * sr)
50
+ end = int(min((current_time + CHUNK_DURATION) * sr, len(y)))
51
+ chunk = y[start:end]
 
 
52
 
53
  segments, times = split_audio(chunk, sr)
54
 
55
+ for seg, (s, e) in zip(segments, times):
 
56
  feat = extract_features(seg, sr)
57
 
58
+ if speaker_embeddings:
59
  scaler = StandardScaler()
60
  X = np.vstack(speaker_embeddings + [feat])
61
  X = scaler.fit_transform(X)
 
67
 
68
  assigned = False
69
  for i, emb in enumerate(existing):
70
+ sim = np.dot(feat_norm, emb) / (
71
  np.linalg.norm(feat_norm) * np.linalg.norm(emb)
72
  )
73
+ if sim > SIMILARITY_THRESHOLD:
74
  speaker_id = speaker_labels[i]
75
  assigned = True
76
  break
 
83
 
84
  all_segments.append({
85
  "speaker": speaker_id,
86
+ "start": round(current_time + s, 2),
87
+ "end": round(current_time + e, 2)
88
  })
89
 
90
  current_time += CHUNK_DURATION
91
 
92
  return {"segments": all_segments}
93
 
94
+ # 🎯 Gradio UI
95
def run(audio):
    """Gradio callback: diarize the uploaded audio.

    Args:
        audio: Value supplied by the Gradio audio input, or ``None`` when
            nothing was uploaded. It is forwarded unchanged to
            ``process_audio`` as the file path to load.

    Returns:
        ``{"error": "Upload audio"}`` when ``audio`` is ``None``; otherwise
        whatever ``process_audio`` returns for that input.
    """
    if audio is None:
        return {"error": "Upload audio"}

    return process_audio(audio)
 
100
 
101
# Gradio front end: the audio input is configured to hand `run` a file path,
# and the dict that `run` returns is rendered as JSON.
demo = gr.Interface(
    fn=run,
    inputs=gr.Audio(type="filepath"),
    outputs=gr.JSON(),
    title="Speaker Diarization (CPU)",
    description="Upload audio → get speaker labels with timestamps",
)

# Start the Gradio app when this module is executed.
demo.launch()