throgletworld committed on
Commit
0b7e787
Β·
verified Β·
1 Parent(s): 2b1a086

Upload 3 files

Browse files
Files changed (1) hide show
  1. app.py +68 -19
app.py CHANGED
@@ -102,12 +102,14 @@ def analyze_chunk(chunk_tensor, threshold=0.5):
102
  detected = [STUTTER_LABELS[i] for i, p in enumerate(probs) if p > threshold]
103
  return detected, dict(zip(STUTTER_LABELS, probs.tolist()))
104
 
105
- def analyze_audio(audio_input, threshold):
106
  print(f"\n=== ANALYZE CLICKED ===")
107
  print(f"Input: {audio_input}, Type: {type(audio_input)}, Threshold: {threshold}")
108
 
 
 
109
  if audio_input is None:
110
- return "Please upload an audio file first!", "", "", ""
111
 
112
  audio_path = audio_input
113
  if isinstance(audio_input, tuple):
@@ -123,18 +125,25 @@ def analyze_audio(audio_input, threshold):
123
  print(f"File: {audio_path}, Size: {os.path.getsize(audio_path)}")
124
 
125
  try:
 
126
  if not models_loaded and not load_models():
127
- return "Failed to load models", "", "", ""
128
 
 
129
  waveform, sr = load_audio(audio_path)
130
  duration = len(waveform) / sr
131
  print(f"Duration: {duration:.1f}s")
132
 
 
133
  chunk_samples = int(3.0 * sr)
134
  stutter_counts = {l: 0 for l in STUTTER_LABELS}
135
  timeline = []
136
 
137
- for start in range(0, len(waveform), chunk_samples):
 
 
 
 
138
  end = min(start + chunk_samples, len(waveform))
139
  chunk = waveform[start:end]
140
  if len(chunk) < chunk_samples:
@@ -145,20 +154,27 @@ def analyze_audio(audio_input, threshold):
145
  stutter_counts[l] += 1
146
  timeline.append({"time": f"{start/sr:.1f}-{end/sr:.1f}s", "detected": detected or ["Clear"]})
147
 
 
148
  print("Running Whisper...")
149
  transcription = whisper_model.transcribe(audio_path).get('text', '')
150
 
 
151
  total = sum(stutter_counts.values())
152
- summary = f"## Analysis Complete\n\n**Duration:** {duration:.1f}s\n**Stutters:** {total}\n\n"
153
  for l, c in stutter_counts.items():
154
- summary += f"- {l}: {c}\n"
 
155
 
156
  timeline_md = "| Time | Detected |\n|---|---|\n"
157
  for t in timeline[:15]:
158
  timeline_md += f"| {t['time']} | {', '.join(t['detected'])} |\n"
 
 
159
 
160
- defs = "\n".join([f"**{k}:** {v}" for k, v in STUTTER_DEFINITIONS.items()])
 
161
 
 
162
  print("Done!")
163
  return summary, transcription, timeline_md, defs
164
 
@@ -169,26 +185,59 @@ def analyze_audio(audio_input, threshold):
169
 
170
  print("Building UI...")
171
 
172
- with gr.Blocks(title="Stutter Analysis") as demo:
173
- gr.Markdown("# Speech Fluency Analysis\nUpload audio to analyze stuttering.")
 
 
 
 
 
 
 
 
 
 
 
 
 
174
 
175
  with gr.Row():
176
- with gr.Column():
177
- audio = gr.Audio(label="Upload Audio", type="filepath")
178
- threshold = gr.Slider(0.3, 0.7, 0.5, label="Threshold")
179
- btn = gr.Button("Analyze", variant="primary")
180
- with gr.Column():
181
- summary = gr.Markdown(value="Upload audio and click Analyze")
 
 
 
 
 
 
 
 
 
182
 
183
  with gr.Tabs():
184
- with gr.TabItem("Transcription"):
185
  trans = gr.Markdown()
186
- with gr.TabItem("Timeline"):
187
  timeline = gr.Markdown()
188
- with gr.TabItem("Definitions"):
189
  defs = gr.Markdown()
190
 
191
- btn.click(analyze_audio, [audio, threshold], [summary, trans, timeline, defs])
 
 
 
 
 
 
 
 
 
 
 
192
 
193
  print("Loading models...")
194
  load_models()
 
102
  detected = [STUTTER_LABELS[i] for i, p in enumerate(probs) if p > threshold]
103
  return detected, dict(zip(STUTTER_LABELS, probs.tolist()))
104
 
105
+ def analyze_audio(audio_input, threshold, progress=gr.Progress()):
106
  print(f"\n=== ANALYZE CLICKED ===")
107
  print(f"Input: {audio_input}, Type: {type(audio_input)}, Threshold: {threshold}")
108
 
109
+ progress(0, desc="πŸ”„ Starting analysis...")
110
+
111
  if audio_input is None:
112
+ return "⚠️ Please upload an audio file first!", "", "", ""
113
 
114
  audio_path = audio_input
115
  if isinstance(audio_input, tuple):
 
125
  print(f"File: {audio_path}, Size: {os.path.getsize(audio_path)}")
126
 
127
  try:
128
+ progress(0.1, desc="πŸ”„ Loading models...")
129
  if not models_loaded and not load_models():
130
+ return "❌ Failed to load models", "", "", ""
131
 
132
+ progress(0.2, desc="🎡 Loading audio file...")
133
  waveform, sr = load_audio(audio_path)
134
  duration = len(waveform) / sr
135
  print(f"Duration: {duration:.1f}s")
136
 
137
+ progress(0.3, desc="βœ‚οΈ Splitting audio into chunks...")
138
  chunk_samples = int(3.0 * sr)
139
  stutter_counts = {l: 0 for l in STUTTER_LABELS}
140
  timeline = []
141
 
142
+ total_chunks = (len(waveform) + chunk_samples - 1) // chunk_samples
143
+
144
+ for i, start in enumerate(range(0, len(waveform), chunk_samples)):
145
+ progress(0.3 + (0.4 * i / total_chunks), desc=f"πŸ” Analyzing chunk {i+1}/{total_chunks}...")
146
+
147
  end = min(start + chunk_samples, len(waveform))
148
  chunk = waveform[start:end]
149
  if len(chunk) < chunk_samples:
 
154
  stutter_counts[l] += 1
155
  timeline.append({"time": f"{start/sr:.1f}-{end/sr:.1f}s", "detected": detected or ["Clear"]})
156
 
157
+ progress(0.75, desc="πŸ—£οΈ Transcribing with Whisper...")
158
  print("Running Whisper...")
159
  transcription = whisper_model.transcribe(audio_path).get('text', '')
160
 
161
+ progress(0.9, desc="πŸ“Š Generating report...")
162
  total = sum(stutter_counts.values())
163
+ summary = f"## βœ… Analysis Complete!\n\n**Duration:** {duration:.1f}s\n**Total Stutters Detected:** {total}\n\n### Stutter Counts:\n"
164
  for l, c in stutter_counts.items():
165
+ emoji = "πŸ”΄" if c > 0 else "βšͺ"
166
+ summary += f"- {emoji} **{l}**: {c}\n"
167
 
168
  timeline_md = "| Time | Detected |\n|---|---|\n"
169
  for t in timeline[:15]:
170
  timeline_md += f"| {t['time']} | {', '.join(t['detected'])} |\n"
171
+ if len(timeline) > 15:
172
+ timeline_md += f"\n*...and {len(timeline) - 15} more chunks*"
173
 
174
+ defs = "## πŸ“– Stutter Type Definitions\n\n"
175
+ defs += "\n".join([f"**{k}:** {v}" for k, v in STUTTER_DEFINITIONS.items()])
176
 
177
+ progress(1.0, desc="βœ… Done!")
178
  print("Done!")
179
  return summary, transcription, timeline_md, defs
180
 
 
185
 
186
  print("Building UI...")
187
 
188
+ with gr.Blocks(title="Stutter Analysis", css="""
189
+ .loading-text {
190
+ font-size: 1.2em;
191
+ color: #666;
192
+ padding: 20px;
193
+ text-align: center;
194
+ }
195
+ """) as demo:
196
+ gr.Markdown("""
197
+ # πŸŽ™οΈ Speech Fluency Analysis System
198
+
199
+ Upload an audio file to analyze stuttering patterns using AI (WavLM + Whisper).
200
+
201
+ **Supported formats:** WAV, MP3, M4A, FLAC, OGG
202
+ """)
203
 
204
  with gr.Row():
205
+ with gr.Column(scale=1):
206
+ audio = gr.Audio(label="🎀 Upload Audio", type="filepath")
207
+ threshold = gr.Slider(
208
+ minimum=0.3,
209
+ maximum=0.7,
210
+ value=0.5,
211
+ step=0.05,
212
+ label="Detection Threshold",
213
+ info="Lower = more sensitive, Higher = more strict"
214
+ )
215
+ btn = gr.Button("πŸ” Analyze Speech", variant="primary", size="lg")
216
+ gr.Markdown("*Analysis takes 30-60 seconds depending on audio length*")
217
+
218
+ with gr.Column(scale=2):
219
+ summary = gr.Markdown(value="### πŸ‘† Upload audio and click Analyze to start")
220
 
221
  with gr.Tabs():
222
+ with gr.TabItem("πŸ“ Transcription"):
223
  trans = gr.Markdown()
224
+ with gr.TabItem("πŸ“ˆ Timeline"):
225
  timeline = gr.Markdown()
226
+ with gr.TabItem("πŸ“– Definitions"):
227
  defs = gr.Markdown()
228
 
229
+ gr.Markdown("""
230
+ ---
231
+ **Note:** The spinner will appear while processing. Please wait for analysis to complete.
232
+ """)
233
+
234
+ # The show_progress parameter shows a spinner during processing
235
+ btn.click(
236
+ fn=analyze_audio,
237
+ inputs=[audio, threshold],
238
+ outputs=[summary, trans, timeline, defs],
239
+ show_progress="full" # Shows loading spinner
240
+ )
241
 
242
  print("Loading models...")
243
  load_models()