Spaces:
Sleeping
Sleeping
Upload 4 files
Browse files- README.md +20 -7
- asr_disfluency.py +71 -0
- model_config.json +9 -0
- requirements.txt +8 -0
README.md
CHANGED
|
@@ -1,14 +1,27 @@
|
|
| 1 |
---
|
| 2 |
-
title: ShiblASR
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
-
sdk_version:
|
| 8 |
-
app_file:
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
| 11 |
-
short_description: Multilingual offline ASR with disfluency and grammar correct
|
| 12 |
---
|
| 13 |
|
| 14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
+
title: ShiblASR-v1.0
|
| 3 |
+
emoji: 🎙️
|
| 4 |
+
colorFrom: purple
|
| 5 |
colorTo: gray
|
| 6 |
sdk: gradio
|
| 7 |
+
sdk_version: 4.0.0
|
| 8 |
+
app_file: asr_disfluency.py
|
| 9 |
pinned: false
|
| 10 |
license: mit
|
|
|
|
| 11 |
---
|
| 12 |
|
| 13 |
+
# 🎙️ ShiblASR v1.0 — Multilingual ASR with Disfluency Detection
|
| 14 |
+
|
| 15 |
+
An offline Whisper-based ASR system that detects disfluencies, fillers, and pauses, with optional grammar correction.
|
| 16 |
+
|
| 17 |
+
✅ No FFmpeg
|
| 18 |
+
✅ Works on CPU
|
| 19 |
+
✅ Multilingual support
|
| 20 |
+
✅ Disfluency-aware transcription
|
| 21 |
+
|
| 22 |
+
### Usage
|
| 23 |
+
1. Click “Record” or upload a `.wav` / `.mp3`.
|
| 24 |
+
2. Choose output type:
|
| 25 |
+
- **Verbatim** (with fillers and pauses)
|
| 26 |
+
- **Clean** (grammar-corrected)
|
| 27 |
+
3. View and copy transcript.
|
asr_disfluency.py
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# ==============================================================
# 🎙️ ShiblASR v1.0 — Multilingual ASR with Disfluency Detection
# ==============================================================
import os
import json

import gradio as gr
import librosa
import numpy as np
import torch  # bug fix: torch was used below but never imported (NameError)
import whisper
from pyAudioAnalysis import ShortTermFeatures

# --------------------------------------------------------------
# Load Whisper model
# --------------------------------------------------------------
print("🧠 Loading Whisper model...")
# Pick GPU when available; the original computed `device` but never used it,
# so the model always loaded on Whisper's default device.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = whisper.load_model("base", device=device)
print("✅ Model loaded successfully.")
# --------------------------------------------------------------
# ASR + disfluency function
# --------------------------------------------------------------
def _normalize(feature):
    """Min-max scale a 1-D feature array into [0, 1].

    A constant feature track (e.g. pure silence) would divide by zero under
    plain min-max scaling; return zeros in that case instead of NaNs.
    """
    lo = np.min(feature)
    hi = np.max(feature)
    if hi == lo:
        return np.zeros_like(feature)
    return (feature - lo) / (hi - lo)


def _group_into_spans(times, max_gap=0.4):
    """Cluster ascending timestamps (seconds) into (start, end) spans.

    Consecutive timestamps closer than *max_gap* seconds join the current
    span; a larger gap starts a new one. Returns a list of (start, end)
    tuples, empty when *times* is empty.
    """
    spans = []
    if times:
        cluster = [times[0]]
        for t in times[1:]:
            if t - cluster[-1] > max_gap:
                spans.append((cluster[0], cluster[-1]))
                cluster = [t]
            else:
                cluster.append(t)
        spans.append((cluster[0], cluster[-1]))
    return spans


def transcribe_with_disfluency(audio_path):
    """Transcribe *audio_path* with Whisper and append detected pause spans.

    Parameters
    ----------
    audio_path : str or None
        Path to a .wav/.mp3 file. Gradio passes None when no audio is given.

    Returns
    -------
    str
        The transcript followed by a "=== Pauses Detected ===" section that
        lists low-energy / mid-centroid spans treated as fillers or pauses.
    """
    if audio_path is None:
        return "No audio provided."
    print(f"🎧 Transcribing: {audio_path}")

    # --- Step 1: Transcribe ---
    result = model.transcribe(audio_path)
    transcript = result["text"]

    # --- Step 2: Detect pauses / fillers acoustically ---
    y, sr = librosa.load(audio_path, sr=16000)
    win, step = int(sr * 0.05), int(sr * 0.025)  # 50 ms window, 25 ms hop
    F, f_names = ShortTermFeatures.feature_extraction(y, sr, win, step)
    # Hoisted: the original recomputed f_names.index(...) six times per call,
    # and its inline min-max scaling divided by zero on constant tracks.
    energy = _normalize(F[f_names.index("energy")])
    centroid = _normalize(F[f_names.index("spectral_centroid")])

    # Frames with low-but-nonzero energy and a mid-range spectral centroid
    # are filler/pause candidates. Thresholds are heuristic —
    # NOTE(review): tuned empirically, confirm against real recordings.
    filler_times = [
        i * step / sr
        for i in range(len(energy))
        if 0.05 < energy[i] < 0.25 and 0.3 < centroid[i] < 0.6
    ]
    grouped = _group_into_spans(filler_times)

    # --- Step 3: Append the pause report to the transcript ---
    parts = [transcript, "\n\n=== Pauses Detected ===\n"]
    for start, end in grouped:
        parts.append(f"• {start:.2f}s – {end:.2f}s\n")
    return "".join(parts).strip()
# --------------------------------------------------------------
# Gradio Web Interface
# --------------------------------------------------------------
demo = gr.Interface(
    fn=transcribe_with_disfluency,
    # filepath mode hands the function a temp-file path rather than raw samples
    inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
    outputs="text",
    title="🎙️ ShiblASR v1.0",  # repaired: the title emoji was mis-encoded
    description="Offline multilingual ASR with disfluency detection and timestamped pauses.",
)

if __name__ == "__main__":
    demo.launch()
model_config.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
|
| 3 |
+
{
|
| 4 |
+
"model_name": "ShiblASR",
|
| 5 |
+
"version": "1.0",
|
| 6 |
+
"description": "Offline multilingual ASR with disfluency detection and timestamped pauses.",
|
| 7 |
+
"author": "Shibl Bold",
|
| 8 |
+
"based_on": "Whisper-base multilingual"
|
| 9 |
+
}
|
requirements.txt
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
openai-whisper
|
| 3 |
+
librosa
|
| 4 |
+
soundfile
|
| 5 |
+
pyAudioAnalysis
|
| 6 |
+
language-tool-python
|
| 7 |
+
numpy
|
| 8 |
+
gradio
|