Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -23,12 +23,15 @@ whisper_asr = pipeline(
|
|
| 23 |
}
|
| 24 |
)
|
| 25 |
|
|
|
|
| 26 |
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 27 |
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
|
| 28 |
|
| 29 |
def normalize_text(text):
|
| 30 |
if not text: return ""
|
| 31 |
-
|
|
|
|
|
|
|
| 32 |
|
| 33 |
def format_audio(audio_path):
|
| 34 |
"""Ensures audio is 16kHz mono to match ASR training conditions."""
|
|
@@ -75,9 +78,9 @@ def get_sample_logic(speaker_id):
|
|
| 75 |
# --- Logic: Model Processing ---
|
| 76 |
def process_audio_step_1(audio_path):
|
| 77 |
"""Runs Whisper Baseline and returns normalized text."""
|
| 78 |
-
if not audio_path: return "No audio", ""
|
| 79 |
|
| 80 |
-
# Pre-process audio format
|
| 81 |
formatted_path = format_audio(audio_path)
|
| 82 |
|
| 83 |
# Run Whisper
|
|
@@ -87,13 +90,21 @@ def process_audio_step_1(audio_path):
|
|
| 87 |
return raw_w, norm_w
|
| 88 |
|
| 89 |
def process_audio_step_2(audio_path, norm_whisper):
|
| 90 |
-
"""Sends audio + normalized whisper to the Private Model."""
|
| 91 |
-
if not audio_path or not norm_whisper:
|
|
|
|
| 92 |
|
| 93 |
try:
|
| 94 |
-
|
| 95 |
client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)
|
| 96 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
return prediction
|
| 98 |
except Exception as e:
|
| 99 |
return f"Backend Connection Required. Details: {e}"
|
|
@@ -101,7 +112,7 @@ def process_audio_step_2(audio_path, norm_whisper):
|
|
| 101 |
# --- UI Construction ---
|
| 102 |
with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
|
| 103 |
gr.Markdown("# ⚗️ Torgo DSR Lab")
|
| 104 |
-
gr.Markdown("Neural Reconstruction Layer for Torgo
|
| 105 |
|
| 106 |
# Hidden state to store the path of the currently active audio
|
| 107 |
active_audio_path = gr.State("")
|
|
@@ -132,7 +143,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
|
|
| 132 |
gr.Markdown("#### Step 1: ASR Baseline")
|
| 133 |
whisper_btn = gr.Button("Run Whisper Tiny")
|
| 134 |
w_raw = gr.Textbox(label="Whisper Raw Transcript")
|
| 135 |
-
w_norm = gr.Textbox(label="Whisper Normalized")
|
| 136 |
|
| 137 |
gr.Markdown("---")
|
| 138 |
|
|
@@ -179,9 +190,9 @@ with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
|
|
| 179 |
outputs=[active_audio_path, gt_box, meta_display]
|
| 180 |
)
|
| 181 |
|
| 182 |
-
# Personal Channel: Use Audio -> Update State -> Clear
|
| 183 |
user_load_btn.click(
|
| 184 |
-
lambda x: (x, "User
|
| 185 |
inputs=user_audio,
|
| 186 |
outputs=[active_audio_path, gt_box, meta_display]
|
| 187 |
)
|
|
|
|
| 23 |
}
|
| 24 |
)
|
| 25 |
|
| 26 |
+
# --- Configuration from environment variables ---
# Token read from the Space's env; passed to gradio_client.Client below to
# authenticate against the private backend Space. May be None if unset.
HF_TOKEN = os.getenv("HF_TOKEN")
# Repo id of the private Space that hosts the DSR prediction endpoint
# (consumed by Client(...) in process_audio_step_2).
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
|
| 29 |
|
| 30 |
def normalize_text(text):
    """Normalize a transcript: strip punctuation, lowercase, collapse whitespace.

    Empty or falsy input (None, "") yields "".
    """
    if not text:
        return ""
    # Drop everything except word characters and whitespace, then lowercase.
    cleaned = re.sub(r'[^\w\s]', '', text).lower().strip()
    # Collapse any run of whitespace to a single space.
    return " ".join(cleaned.split())
|
| 35 |
|
| 36 |
def format_audio(audio_path):
|
| 37 |
"""Ensures audio is 16kHz mono to match ASR training conditions."""
|
|
|
|
| 78 |
# --- Logic: Model Processing ---
|
| 79 |
def process_audio_step_1(audio_path):
|
| 80 |
"""Runs Whisper Baseline and returns normalized text."""
|
| 81 |
+
if not audio_path: return "No audio loaded", ""
|
| 82 |
|
| 83 |
+
# Pre-process audio format to 16k
|
| 84 |
formatted_path = format_audio(audio_path)
|
| 85 |
|
| 86 |
# Run Whisper
|
|
|
|
| 90 |
return raw_w, norm_w
|
| 91 |
|
| 92 |
def process_audio_step_2(audio_path, norm_whisper):
    """Sends audio + normalized whisper to the Private Model API."""
    # Both the audio file and the Step-1 transcript are required inputs.
    if not audio_path or not norm_whisper:
        return "Please load data and run Whisper (Step 1) first."

    try:
        # Open an authenticated client against the private backend Space.
        client = Client(PRIVATE_BACKEND_URL, hf_token=HF_TOKEN)

        # Invoke the 'predict_dsr' endpoint exposed by the private Space,
        # sending the audio file plus the normalized Whisper transcript.
        return client.predict(
            audio_path,
            norm_whisper,
            api_name="/predict_dsr",
        )
    except Exception as e:
        # Surface any connection/auth failure to the UI as a message.
        return f"Backend Connection Required. Details: {e}"
|
|
|
|
| 112 |
# --- UI Construction ---
|
| 113 |
with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
|
| 114 |
gr.Markdown("# ⚗️ Torgo DSR Lab")
|
| 115 |
+
gr.Markdown("Neural Reconstruction Layer for Torgo and UA-Speech Zero-Shot.")
|
| 116 |
|
| 117 |
# Hidden state to store the path of the currently active audio
|
| 118 |
active_audio_path = gr.State("")
|
|
|
|
| 143 |
gr.Markdown("#### Step 1: ASR Baseline")
|
| 144 |
whisper_btn = gr.Button("Run Whisper Tiny")
|
| 145 |
w_raw = gr.Textbox(label="Whisper Raw Transcript")
|
| 146 |
+
w_norm = gr.Textbox(label="Whisper Normalized (Input for Model)")
|
| 147 |
|
| 148 |
gr.Markdown("---")
|
| 149 |
|
|
|
|
| 190 |
outputs=[active_audio_path, gt_box, meta_display]
|
| 191 |
)
|
| 192 |
|
| 193 |
+
# Personal Channel: Use Audio -> Update State -> Clear Reference
|
| 194 |
user_load_btn.click(
|
| 195 |
+
lambda x: (x, "User Recorded (No Ground Truth)", {"Dataset": "Custom", "Severity": "N/A"}),
|
| 196 |
inputs=user_audio,
|
| 197 |
outputs=[active_audio_path, gt_box, meta_display]
|
| 198 |
)
|