Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- README.md +20 -7
- asr_disfluency.py +71 -0
- model_config.json +9 -0
- requirements.txt +8 -0
README.md
CHANGED
|
@@ -1,14 +1,27 @@
|
|
| 1 |
---
|
| 2 |
-
title: ShiblASR
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
-
short_description: Multilingual offline ASR with disfluency and grammar correct
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ShiblASR-v1.0
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: purple
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
+
app_file: asr_disfluency.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎙️ ShiblASR v1.0 — Multilingual ASR with Disfluency Detection
|
| 14 |
+
|
| 15 |
+
An offline Whisper-based ASR system that detects disfluencies, fillers, and pauses, with optional grammar correction.
|
| 16 |
+
|
| 17 |
+
✅ No FFmpeg
|
| 18 |
+
✅ Works on CPU
|
| 19 |
+
✅ Multilingual support
|
| 20 |
+
✅ Disfluency-aware transcription
|
| 21 |
+
|
| 22 |
+
### Usage
|
| 23 |
+
1. Click “Record” or upload a `.wav` / `.mp3`.
|
| 24 |
+
2. Choose output type:
|
| 25 |
+
- **Verbatim** (with fillers and pauses)
|
| 26 |
+
- **Clean** (grammar-corrected)
|
| 27 |
+
3. View and copy transcript.
|
asr_disfluency.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ==============================================================
# 🎙️ ShiblASR v1.0 — Multilingual ASR with Disfluency Detection
# ==============================================================
import os
import json

import gradio as gr
import librosa
import numpy as np
import torch  # bug fix: torch was used below but never imported (NameError)
import whisper
from pyAudioAnalysis import ShortTermFeatures

# --------------------------------------------------------------
# Load Whisper model
# --------------------------------------------------------------
print("🧠 Loading Whisper model...")
# Pick GPU when available; the original computed `device` but never used it,
# so the model always loaded on Whisper's default device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
print("✅ Model loaded successfully.")
# --------------------------------------------------------------
# ASR + disfluency function
# --------------------------------------------------------------
def _normalize(feature):
    """Min-max scale a 1-D feature array into [0, 1].

    A constant feature track (e.g. pure silence) would divide by zero under
    plain min-max scaling; return zeros in that case instead of NaNs.
    """
    lo = np.min(feature)
    hi = np.max(feature)
    if hi == lo:
        return np.zeros_like(feature)
    return (feature - lo) / (hi - lo)


def _group_into_spans(times, max_gap=0.4):
    """Cluster ascending timestamps (seconds) into (start, end) spans.

    Consecutive timestamps closer than *max_gap* seconds join the current
    span; a larger gap starts a new one. Returns a list of (start, end)
    tuples, empty when *times* is empty.
    """
    spans = []
    if times:
        cluster = [times[0]]
        for t in times[1:]:
            if t - cluster[-1] > max_gap:
                spans.append((cluster[0], cluster[-1]))
                cluster = [t]
            else:
                cluster.append(t)
        spans.append((cluster[0], cluster[-1]))
    return spans


def transcribe_with_disfluency(audio_path):
    """Transcribe *audio_path* with Whisper and append detected pause spans.

    Parameters
    ----------
    audio_path : str or None
        Path to a .wav/.mp3 file. Gradio passes None when no audio is given.

    Returns
    -------
    str
        The transcript followed by a "=== Pauses Detected ===" section that
        lists low-energy / mid-centroid spans treated as fillers or pauses.
    """
    if audio_path is None:
        return "No audio provided."
    print(f"🎧 Transcribing: {audio_path}")

    # --- Step 1: Transcribe ---
    result = model.transcribe(audio_path)
    transcript = result["text"]

    # --- Step 2: Detect pauses / fillers acoustically ---
    y, sr = librosa.load(audio_path, sr=16000)
    win, step = int(sr * 0.05), int(sr * 0.025)  # 50 ms window, 25 ms hop
    F, f_names = ShortTermFeatures.feature_extraction(y, sr, win, step)
    # Hoisted: the original recomputed f_names.index(...) six times per call,
    # and its inline min-max scaling divided by zero on constant tracks.
    energy = _normalize(F[f_names.index("energy")])
    centroid = _normalize(F[f_names.index("spectral_centroid")])

    # Frames with low-but-nonzero energy and a mid-range spectral centroid
    # are filler/pause candidates. Thresholds are heuristic —
    # NOTE(review): tuned empirically, confirm against real recordings.
    filler_times = [
        i * step / sr
        for i in range(len(energy))
        if 0.05 < energy[i] < 0.25 and 0.3 < centroid[i] < 0.6
    ]
    grouped = _group_into_spans(filler_times)

    # --- Step 3: Append the pause report to the transcript ---
    parts = [transcript, "\n\n=== Pauses Detected ===\n"]
    for start, end in grouped:
        parts.append(f"• {start:.2f}s – {end:.2f}s\n")
    return "".join(parts).strip()
# --------------------------------------------------------------
# Gradio Web Interface
# --------------------------------------------------------------
demo = gr.Interface(
    fn=transcribe_with_disfluency,
    # filepath mode hands the function a temp-file path rather than raw samples
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="🎙️ ShiblASR v1.0",  # repaired: the title emoji was mis-encoded
    description="Offline multilingual ASR with disfluency detection and timestamped pauses.",
)

if __name__ == "__main__":
    demo.launch()
model_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
{
|
| 4 |
+
"model_name": "ShiblASR",
|
| 5 |
+
"version": "1.0",
|
| 6 |
+
"description": "Offline multilingual ASR with disfluency detection and timestamped pauses.",
|
| 7 |
+
"author": "Shibl Bold",
|
| 8 |
+
"based_on": "Whisper-base multilingual"
|
| 9 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
openai-whisper
|
| 3 |
+
librosa
|
| 4 |
+
soundfile
|
| 5 |
+
pyAudioAnalysis
|
| 6 |
+
language-tool-python
|
| 7 |
+
numpy
|
| 8 |
+
gradio
|