Upload 3 files
app.py CHANGED
@@ -102,12 +102,14 @@ def analyze_chunk(chunk_tensor, threshold=0.5):
     detected = [STUTTER_LABELS[i] for i, p in enumerate(probs) if p > threshold]
     return detected, dict(zip(STUTTER_LABELS, probs.tolist()))
 
-def analyze_audio(audio_input, threshold):
+def analyze_audio(audio_input, threshold, progress=gr.Progress()):
     print(f"\n=== ANALYZE CLICKED ===")
     print(f"Input: {audio_input}, Type: {type(audio_input)}, Threshold: {threshold}")
 
+    progress(0, desc="🚀 Starting analysis...")
+
     if audio_input is None:
-        return "Please upload an audio file first!", "", "", ""
+        return "⚠️ Please upload an audio file first!", "", "", ""
 
     audio_path = audio_input
     if isinstance(audio_input, tuple):
@@ -123,18 +125,25 @@ def analyze_audio(audio_input, threshold):
     print(f"File: {audio_path}, Size: {os.path.getsize(audio_path)}")
 
     try:
+        progress(0.1, desc="🔄 Loading models...")
         if not models_loaded and not load_models():
-            return "Failed to load models", "", "", ""
+            return "❌ Failed to load models", "", "", ""
 
+        progress(0.2, desc="🎵 Loading audio file...")
         waveform, sr = load_audio(audio_path)
         duration = len(waveform) / sr
         print(f"Duration: {duration:.1f}s")
 
+        progress(0.3, desc="✂️ Splitting audio into chunks...")
         chunk_samples = int(3.0 * sr)
        stutter_counts = {l: 0 for l in STUTTER_LABELS}
        timeline = []
 
-        for start in range(0, len(waveform), chunk_samples):
+        total_chunks = (len(waveform) + chunk_samples - 1) // chunk_samples
+
+        for i, start in enumerate(range(0, len(waveform), chunk_samples)):
+            progress(0.3 + (0.4 * i / total_chunks), desc=f"🔍 Analyzing chunk {i+1}/{total_chunks}...")
+
             end = min(start + chunk_samples, len(waveform))
             chunk = waveform[start:end]
             if len(chunk) < chunk_samples:
@@ -145,20 +154,27 @@ def analyze_audio(audio_input, threshold):
                 stutter_counts[l] += 1
             timeline.append({"time": f"{start/sr:.1f}-{end/sr:.1f}s", "detected": detected or ["Clear"]})
 
+        progress(0.75, desc="🗣️ Transcribing with Whisper...")
         print("Running Whisper...")
         transcription = whisper_model.transcribe(audio_path).get('text', '')
 
+        progress(0.9, desc="📊 Generating report...")
         total = sum(stutter_counts.values())
-        summary = f"## Analysis Complete!\n\n**Duration:** {duration:.1f}s\n**Total Stutters Detected:** {total}\n\n### Stutter Counts:\n"
+        summary = f"## ✅ Analysis Complete!\n\n**Duration:** {duration:.1f}s\n**Total Stutters Detected:** {total}\n\n### Stutter Counts:\n"
         for l, c in stutter_counts.items():
-            summary += f"- **{l}**: {c}\n"
+            emoji = "🔴" if c > 0 else "⚪"
+            summary += f"- {emoji} **{l}**: {c}\n"
 
         timeline_md = "| Time | Detected |\n|---|---|\n"
         for t in timeline[:15]:
             timeline_md += f"| {t['time']} | {', '.join(t['detected'])} |\n"
+        if len(timeline) > 15:
+            timeline_md += f"\n*...and {len(timeline) - 15} more chunks*"
 
-        defs = "\n".join([f"**{k}:** {v}" for k, v in STUTTER_DEFINITIONS.items()])
+        defs = "## 📖 Stutter Type Definitions\n\n"
+        defs += "\n".join([f"**{k}:** {v}" for k, v in STUTTER_DEFINITIONS.items()])
 
+        progress(1.0, desc="✅ Done!")
         print("Done!")
         return summary, transcription, timeline_md, defs
 
@@ -169,26 +185,59 @@ def analyze_audio(audio_input, threshold):
 
 print("Building UI...")
 
-with gr.Blocks(title="Stutter Analysis") as demo:
-    ...
+with gr.Blocks(title="Stutter Analysis", css="""
+.loading-text {
+    font-size: 1.2em;
+    color: #666;
+    padding: 20px;
+    text-align: center;
+}
+""") as demo:
+    gr.Markdown("""
+    # 🎙️ Speech Fluency Analysis System
+
+    Upload an audio file to analyze stuttering patterns using AI (WavLM + Whisper).
+
+    **Supported formats:** WAV, MP3, M4A, FLAC, OGG
+    """)
 
     with gr.Row():
-        with gr.Column():
-            audio = gr.Audio(label="Upload Audio", type="filepath")
-            threshold = gr.Slider(
-                ...
-                ...
-            )
+        with gr.Column(scale=1):
+            audio = gr.Audio(label="🎤 Upload Audio", type="filepath")
+            threshold = gr.Slider(
+                minimum=0.3,
+                maximum=0.7,
+                value=0.5,
+                step=0.05,
+                label="Detection Threshold",
+                info="Lower = more sensitive, Higher = more strict"
+            )
+            btn = gr.Button("🚀 Analyze Speech", variant="primary", size="lg")
+            gr.Markdown("*Analysis takes 30-60 seconds depending on audio length*")
+
+        with gr.Column(scale=2):
+            summary = gr.Markdown(value="### 📊 Upload audio and click Analyze to start")
 
     with gr.Tabs():
-        with gr.TabItem("Transcription"):
+        with gr.TabItem("📝 Transcription"):
             trans = gr.Markdown()
-        with gr.TabItem("Timeline"):
+        with gr.TabItem("📊 Timeline"):
             timeline = gr.Markdown()
-        with gr.TabItem("Definitions"):
+        with gr.TabItem("📖 Definitions"):
             defs = gr.Markdown()
 
-    btn.click(fn=analyze_audio, inputs=[audio, threshold], outputs=[summary, trans, timeline, defs])
+    gr.Markdown("""
+    ---
+    **Note:** The spinner will appear while processing. Please wait for analysis to complete.
+    """)
+
+    # The show_progress parameter shows a spinner during processing
+    btn.click(
+        fn=analyze_audio,
+        inputs=[audio, threshold],
+        outputs=[summary, trans, timeline, defs],
+        show_progress="full"  # Shows loading spinner
+    )
 
 print("Loading models...")
 load_models()