st192011 committed on
Commit
d5b3a6f
·
verified ·
1 Parent(s): 483c30e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +25 -15
app.py CHANGED
@@ -11,33 +11,39 @@ from datasets import load_dataset, Audio
11
  from gradio_client import Client
12
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
13
 
14
- # 1. Initialize Whisper Tiny (Forced to English)
 
15
  whisper_asr = pipeline(
16
  "automatic-speech-recognition",
17
  model="openai/whisper-tiny",
18
- generate_kwargs={"language": "en", "task": "transcribe"}
 
 
 
 
 
 
19
  )
20
 
21
- # 2. Secret Configuration
22
  HF_TOKEN = os.getenv("HF_TOKEN")
23
  PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
24
 
25
  def normalize_text(text):
26
  if not text: return ""
 
27
  return re.sub(r'[^\w\s]', '', text).lower().strip()
28
 
29
  def get_sample_logic(speaker_id):
30
- """Bypasses internal decoders for stability and handles schema differences."""
31
  try:
32
  if "UA" in speaker_id:
33
- # UA-Speech loading (As per your working Colab code)
34
  dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
35
  dataset = dataset.cast_column("audio", Audio(decode=False))
36
- # For UA Female shard, we pick a random sample directly
37
- sample = next(iter(dataset.shuffle(buffer_size=50)))
38
- gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
39
  else:
40
- # Torgo loading (Using path-parsing for Speaker IDs)
41
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
42
  dataset = dataset.cast_column("audio", Audio(decode=False))
43
 
@@ -48,10 +54,11 @@ def get_sample_logic(speaker_id):
48
  return sid == speaker_id
49
 
50
  speaker_ds = dataset.filter(filter_spk)
51
- sample = next(iter(speaker_ds.shuffle(buffer_size=20)))
52
- gt_text = sample.get('transcription') or sample.get('text')
53
 
54
- # Manual Decode via librosa (Bypasses torchcodec requirement)
 
 
 
55
  audio_bytes = sample['audio']['bytes']
56
  audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
57
 
@@ -64,6 +71,7 @@ def get_sample_logic(speaker_id):
64
 
65
  def run_whisper_step(audio_path):
66
  if not audio_path: return "No audio loaded", ""
 
67
  result = whisper_asr(audio_path)
68
  raw_w = result["text"]
69
  norm_w = normalize_text(raw_w)
@@ -73,15 +81,17 @@ def run_model_step(audio_path, norm_whisper):
73
  if not audio_path or not norm_whisper: return "Load data and run Whisper first."
74
  try:
75
  client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
76
- # Call private Gemma model (Backend uses repetition_penalty=3.0)
77
  prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
78
  return prediction
79
  except Exception as e:
80
- return f"Backend Error: {e}. Check if Private Space is Awake."
81
 
82
  # UI Construction
83
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
84
  gr.Markdown("# ⚗️ Torgo DSR Lab")
 
 
85
  current_audio_path = gr.State("")
86
 
87
  with gr.Tab("🔬 Laboratory"):
@@ -109,7 +119,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
109
  with gr.Row():
110
  with gr.Column():
111
  gr.Markdown("### 📏 Metric: Exact Match Accuracy")
112
- gr.Markdown("Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **ground truth**.")
113
  with gr.Column():
114
  gr.Markdown("### 🧪 Model Definitions")
115
  gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")
 
11
  from gradio_client import Client
12
  from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
13
 
14
+ # 1. Initialize Whisper Tiny (Forced to English with strict output control)
15
+ # max_new_tokens=64, repetition_penalty=3.0 and no_repeat_ngram_size=3 are set to prevent the "L-O-O-O" infinite repetition loops
16
  whisper_asr = pipeline(
17
  "automatic-speech-recognition",
18
  model="openai/whisper-tiny",
19
+ generate_kwargs={
20
+ "language": "en",
21
+ "task": "transcribe",
22
+ "repetition_penalty": 3.0,
23
+ "max_new_tokens": 64,
24
+ "no_repeat_ngram_size": 3
25
+ }
26
  )
27
 
28
+ # 2. Secret Configuration from Space Settings
29
  HF_TOKEN = os.getenv("HF_TOKEN")
30
  PRIVATE_BACKEND_URL = os.getenv("PRIVATE_BACKEND_URL")
31
 
32
  def normalize_text(text):
33
  if not text: return ""
34
+ # Remove special chars and lowercase
35
  return re.sub(r'[^\w\s]', '', text).lower().strip()
36
 
37
  def get_sample_logic(speaker_id):
38
+ """Bypasses internal decoders for stability and handles dataset differences."""
39
  try:
40
  if "UA" in speaker_id:
41
+ # UA-Speech loading (Speaker F02)
42
  dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
43
  dataset = dataset.cast_column("audio", Audio(decode=False))
44
+ speaker_ds = dataset.filter(lambda x: x["speaker_id"] == "F02")
 
 
45
  else:
46
+ # Torgo loading (Using path-parsing for IDs)
47
  dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
48
  dataset = dataset.cast_column("audio", Audio(decode=False))
49
 
 
54
  return sid == speaker_id
55
 
56
  speaker_ds = dataset.filter(filter_spk)
 
 
57
 
58
+ # Get sample and decode manually
59
+ sample = next(iter(speaker_ds.shuffle(buffer_size=50)))
60
+ gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
61
+
62
  audio_bytes = sample['audio']['bytes']
63
  audio_data, sample_rate = librosa.load(io.BytesIO(audio_bytes), sr=16000)
64
 
 
71
 
72
  def run_whisper_step(audio_path):
73
  if not audio_path: return "No audio loaded", ""
74
+ # Baseline with loop-prevention
75
  result = whisper_asr(audio_path)
76
  raw_w = result["text"]
77
  norm_w = normalize_text(raw_w)
 
81
  if not audio_path or not norm_whisper: return "Load data and run Whisper first."
82
  try:
83
  client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
84
+ # Private app expects audio and normalized whisper
85
  prediction = client.predict(audio_path, norm_whisper, api_name="/predict_dsr")
86
  return prediction
87
  except Exception as e:
88
+ return f"Backend Offline. Details: {e}"
89
 
90
  # UI Construction
91
  with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
92
  gr.Markdown("# ⚗️ Torgo DSR Lab")
93
+ gr.Markdown("Reconstruction Layer for Torgo and UA-Speech")
94
+
95
  current_audio_path = gr.State("")
96
 
97
  with gr.Tab("🔬 Laboratory"):
 
119
  with gr.Row():
120
  with gr.Column():
121
  gr.Markdown("### 📏 Metric: Exact Match Accuracy")
122
+ gr.Markdown("Accuracy is calculated by comparing the **normalized prediction** against the **normalized ground truth**.")
123
  with gr.Column():
124
  gr.Markdown("### 🧪 Model Definitions")
125
  gr.Markdown("* **5K Pure Model:** Trained on real Torgo speech. Optimized for articulatory accuracy.")