shibly100 committed
Commit f2866f8 · verified · 1 Parent(s): 8ead3c6

Upload 4 files

Files changed (4)
  1. README.md +20 -7
  2. asr_disfluency.py +71 -0
  3. model_config.json +9 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,14 +1,27 @@
  ---
- title: ShiblASR V1.0
+ title: ShiblASR-v1.0
- emoji: 🏃
+ emoji: 🎙️
- colorFrom: pink
+ colorFrom: purple
  colorTo: gray
  sdk: gradio
- sdk_version: 5.49.1
+ sdk_version: 4.0.0
- app_file: app.py
+ app_file: asr_disfluency.py
  pinned: false
  license: mit
- short_description: Multilingual offline ASR with disfluency and grammar correct
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🎙️ ShiblASR v1.0 – Multilingual ASR with Disfluency Detection
+
+ An offline Whisper-based ASR system that detects disfluencies, fillers, and pauses, with optional grammar correction.
+
+ ✅ No FFmpeg
+ ✅ Works on CPU
+ ✅ Multilingual support
+ ✅ Disfluency-aware transcription
+
+ ### Usage
+ 1. Click "Record" or upload a `.wav` / `.mp3`.
+ 2. Choose output type:
+    - **Verbatim** (with fillers and pauses)
+    - **Clean** (grammar-corrected)
+ 3. View and copy transcript.
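Note: the Usage section above promises a **Clean** (grammar-corrected) mode that `asr_disfluency.py` below does not implement yet. A minimal sketch of that step using `language_tool_python` (already listed in requirements.txt); the helper name and the `en-US` language code are assumptions, not part of this commit:

```python
# Sketch only: a possible "Clean" mode, not part of this upload.
# language_tool_python downloads a local LanguageTool server and needs Java.
import language_tool_python

# "en-US" is an assumed default; since the Space is multilingual, the code
# would ideally come from Whisper's detected language instead.
tool = language_tool_python.LanguageTool("en-US")

def clean_transcript(text: str) -> str:
    """Return the transcript with LanguageTool's suggested corrections applied."""
    return tool.correct(text)

print(clean_transcript("so um i has went to the store"))
```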
asr_disfluency.py ADDED
@@ -0,0 +1,84 @@
+ # ==============================================================
+ # 🎙️ ShiblASR v1.0 – Multilingual ASR with Disfluency Detection
+ # ==============================================================
+ import numpy as np
+ import librosa
+ import torch
+ import whisper
+ import gradio as gr
+ from pyAudioAnalysis import ShortTermFeatures
+
+ # --------------------------------------------------------------
+ # Load Whisper model
+ # --------------------------------------------------------------
+ print("🧠 Loading Whisper model...")
+ device = "cuda" if torch.cuda.is_available() else "cpu"
+ model = whisper.load_model("base", device=device)
+ print("✅ Model loaded successfully.")
+
+
+ def _normalize(x):
+     """Min-max normalize a feature track to [0, 1] (epsilon avoids 0/0 on silence)."""
+     return (x - np.min(x)) / (np.max(x) - np.min(x) + 1e-9)
+
+
+ # --------------------------------------------------------------
+ # ASR + disfluency function
+ # --------------------------------------------------------------
+ def transcribe_with_disfluency(audio_path):
+     if audio_path is None:
+         return "No audio provided."
+     print(f"🎧 Transcribing: {audio_path}")
+
+     # --- Step 1: Load audio and transcribe ---
+     # librosa decodes the file itself, so Whisper never shells out to FFmpeg.
+     y, sr = librosa.load(audio_path, sr=16000)
+     result = model.transcribe(y)
+     transcript = result["text"]
+
+     # --- Step 2: Detect pauses / fillers ---
+     # Short-term features over 50 ms windows with a 25 ms hop.
+     win, step = int(sr * 0.05), int(sr * 0.025)
+     F, f_names = ShortTermFeatures.feature_extraction(y, sr, win, step)
+     energy = _normalize(F[f_names.index("energy")])
+     centroid = _normalize(F[f_names.index("spectral_centroid")])
+
+     # Heuristic: frames with low-but-nonzero energy and a mid-range
+     # spectral centroid are flagged as likely fillers/hesitations.
+     filler_times, grouped = [], []
+     for i in range(len(energy)):
+         if 0.05 < energy[i] < 0.25 and 0.3 < centroid[i] < 0.6:
+             filler_times.append(i * step / sr)
+
+     # Merge flagged frames into spans, splitting on gaps longer than 0.4 s.
+     if filler_times:
+         cluster = [filler_times[0]]
+         for t in filler_times[1:]:
+             if t - cluster[-1] > 0.4:
+                 grouped.append((cluster[0], cluster[-1]))
+                 cluster = [t]
+             else:
+                 cluster.append(t)
+         grouped.append((cluster[0], cluster[-1]))
+
+     # --- Step 3: Append detected pause spans below the transcript ---
+     transcript_with_timestamps = transcript + "\n\n=== Pauses Detected ===\n"
+     for s, e in grouped:
+         transcript_with_timestamps += f"• {s:.2f}s – {e:.2f}s\n"
+
+     return transcript_with_timestamps.strip()
+
+
+ # --------------------------------------------------------------
+ # Gradio Web Interface
+ # --------------------------------------------------------------
+ demo = gr.Interface(
+     fn=transcribe_with_disfluency,
+     inputs=gr.Audio(sources=["microphone", "upload"], type="filepath"),
+     outputs="text",
+     title="🎙️ ShiblASR v1.0",
+     description="Offline multilingual ASR with disfluency detection and timestamped pauses.",
+ )
+
+ if __name__ == "__main__":
+     demo.launch()
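Once the Space is running, it can also be queried from Python. A hedged sketch using `gradio_client`; the Space id is inferred from the README title and may not match the actual repo id:

```python
# Sketch: call the running Space programmatically via gradio_client.
from gradio_client import Client, handle_file

client = Client("shibly100/ShiblASR-v1.0")  # assumed Space id, not confirmed
result = client.predict(
    handle_file("sample.wav"),  # any local .wav/.mp3 to transcribe
    api_name="/predict",        # default endpoint name for a gr.Interface
)
print(result)
```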
model_config.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "model_name": "ShiblASR",
+   "version": "1.0",
+   "description": "Offline multilingual ASR with disfluency detection and timestamped pauses.",
+   "author": "Shibl Bold",
+   "based_on": "Whisper-base multilingual"
+ }
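Nothing in asr_disfluency.py reads this file yet; a small illustrative sketch of how the app could load it to populate the interface metadata:

```python
# Sketch: reuse model_config.json for the Gradio title and description.
import json

with open("model_config.json") as f:
    cfg = json.load(f)

title = f"🎙️ {cfg['model_name']} v{cfg['version']}"
description = cfg["description"]
print(title)
print(description)
```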
requirements.txt ADDED
@@ -0,0 +1,8 @@
+ torch
+ openai-whisper  # PyPI name for OpenAI Whisper; the bare "whisper" package is unrelated
+ librosa
+ soundfile
+ pyAudioAnalysis
+ language-tool-python
+ numpy
+ gradio