KuyaToto committed on
Commit
e7078f9
·
verified ·
1 Parent(s): 9594d9d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +51 -48
app.py CHANGED
@@ -1,5 +1,5 @@
1
  import gradio as gr
2
- from transformers import WhisperProcessor, WhisperForConditionalGeneration, Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import librosa
4
  import torch
5
  import epitran
@@ -10,27 +10,24 @@ from jiwer import wer
10
  import json
11
  import string
12
  import eng_to_ipa as ipa
13
- import numpy as np # For normalization
14
 
15
- # Models: Use Whisper for English (better silence/noise handling), Wav2Vec2 for Arabic
16
  MODELS = {
17
  "Arabic": {
18
  "processor": Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic"),
19
  "model": Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic"),
20
- "epitran": epitran.Epitran("ara-Arab"),
21
- "is_whisper": False
22
  },
23
  "English": {
24
- "processor": WhisperProcessor.from_pretrained("openai/whisper-tiny.en"),
25
- "model": WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en"),
26
- "epitran": epitran.Epitran("eng-Latn"),
27
- "is_whisper": True
28
  }
29
  }
30
 
31
  for lang in MODELS.values():
32
- if not lang["is_whisper"]:
33
- lang["model"].config.ctc_loss_reduction = "mean"
34
 
35
  def clean_phonemes(ipa_text):
36
  return re.sub(r'[\u064B-\u0652\u02D0]', '', ipa_text)
@@ -60,7 +57,6 @@ def analyze_phonemes(language, reference_text, audio_file):
60
  processor = lang_models["processor"]
61
  model = lang_models["model"]
62
  epi = lang_models["epitran"]
63
- is_whisper = lang_models["is_whisper"]
64
 
65
  transliterate_fn = safe_transliterate_arabic if language == "Arabic" else transliterate_english
66
 
@@ -74,9 +70,9 @@ def analyze_phonemes(language, reference_text, audio_file):
74
  if max_amp > 0:
75
  audio = audio / max_amp # Normalize to [-1, 1]
76
 
77
- # Trim silence (increase top_db to 30 for stricter noise removal)
78
  trimmed_audio, _ = librosa.effects.trim(audio, top_db=30)
79
- if len(trimmed_audio) < (sr * 0.1): # Too short = silence
80
  return json.dumps({
81
  "language": language,
82
  "reference_text": reference_text,
@@ -85,44 +81,51 @@ def analyze_phonemes(language, reference_text, audio_file):
85
  "metrics": {"message": "Audio appears silent or too noisy. Try speaking louder or in a quieter environment."}
86
  }, indent=2, ensure_ascii=False)
87
 
88
- # Cap to 1.5s
89
- max_duration = 1.5
90
  if len(trimmed_audio) > int(sr * max_duration):
91
  trimmed_audio = trimmed_audio[:int(sr * max_duration)]
92
 
93
- if is_whisper:
94
- # Whisper processing
95
- input_features = processor(trimmed_audio, sampling_rate=sr, return_tensors="pt").input_features
96
- with torch.no_grad():
97
- predicted_ids = model.generate(input_features)
98
- transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
99
- else:
100
- # Wav2Vec2 processing (for Arabic)
101
- input_values = processor(trimmed_audio, sampling_rate=sr, return_tensors="pt").input_values
102
- with torch.no_grad():
103
- logits = model(input_values).logits
104
- pred_ids = torch.argmax(logits, dim=-1)
105
- transcription = processor.batch_decode(pred_ids)[0].strip()
106
-
107
- # Confidence check (for Wav2Vec2; Whisper has internal VAD)
108
- if not is_whisper:
109
- probs = torch.softmax(logits, dim=-1)
110
- max_probs = probs.max(dim=-1).values.mean().item()
111
- if max_probs < 0.4: # Lower threshold for stricter filtering
112
- return json.dumps({
113
- "language": language,
114
- "reference_text": reference_text,
115
- "transcription": transcription,
116
- "word_alignment": [],
117
- "metrics": {"message": "Low confidence transcription (possible noise). Try again with clearer speech."}
118
- }, indent=2, ensure_ascii=False)
119
-
120
- obs_phonemes = [list(transliterate_fn(word)) for word in transcription.split()]
 
 
 
 
 
 
 
121
 
122
  results = {
123
  "language": language,
124
  "reference_text": reference_text,
125
- "transcription": transcription,
126
  "word_alignment": [],
127
  "metrics": {}
128
  }
@@ -169,7 +172,7 @@ def analyze_phonemes(language, reference_text, audio_file):
169
  phoneme_er = round((total_phoneme_errors / max(1, total_phoneme_length)) * 100, 2)
170
  word_acc = round((correct_words / max(1, total_word_length)) * 100, 2)
171
  word_er = round(((total_word_length - correct_words) / max(1, total_word_length)) * 100, 2)
172
- text_wer = round(wer(reference_text, transcription) * 100, 2)
173
 
174
  results["metrics"] = {
175
  "word_accuracy": word_acc,
@@ -184,7 +187,7 @@ def analyze_phonemes(language, reference_text, audio_file):
184
  def get_default_text(language):
185
  return {
186
  "Arabic": "ููŽุจูุฃูŽูŠู‘ู ุขู„ูŽุงุกู ุฑูŽุจู‘ููƒูู…ูŽุง ุชููƒูŽุฐู‘ูุจูŽุงู†ู",
187
- "English": "The quick brown fox jumps over the lazy dog"
188
  }.get(language, "")
189
 
190
  with gr.Blocks() as demo:
@@ -192,7 +195,7 @@ with gr.Blocks() as demo:
192
  gr.Markdown("Compare audio pronunciation with reference text at phoneme level. Tip: Speak clearly; silence or noise may cause errors.")
193
 
194
  with gr.Row():
195
- language = gr.Dropdown(["Arabic", "English"], label="Language", value="English") # Default to English
196
 
197
  reference_text = gr.Textbox(label="Reference Text", value=get_default_text("English"))
198
  audio_input = gr.Audio(label="Upload Audio File", type="filepath")
 
1
  import gradio as gr
2
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
3
  import librosa
4
  import torch
5
  import epitran
 
10
  import json
11
  import string
12
  import eng_to_ipa as ipa
13
+ import numpy as np
14
 
15
+ # Models: Wav2Vec2 for both Arabic and English
16
  MODELS = {
17
  "Arabic": {
18
  "processor": Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic"),
19
  "model": Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-arabic"),
20
+ "epitran": epitran.Epitran("ara-Arab")
 
21
  },
22
  "English": {
23
+ "processor": Wav2Vec2Processor.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english"),
24
+ "model": Wav2Vec2ForCTC.from_pretrained("jonatasgrosman/wav2vec2-large-xlsr-53-english"),
25
+ "epitran": epitran.Epitran("eng-Latn")
 
26
  }
27
  }
28
 
29
  for lang in MODELS.values():
30
+ lang["model"].config.ctc_loss_reduction = "mean"
 
31
 
32
  def clean_phonemes(ipa_text):
33
  return re.sub(r'[\u064B-\u0652\u02D0]', '', ipa_text)
 
57
  processor = lang_models["processor"]
58
  model = lang_models["model"]
59
  epi = lang_models["epitran"]
 
60
 
61
  transliterate_fn = safe_transliterate_arabic if language == "Arabic" else transliterate_english
62
 
 
70
  if max_amp > 0:
71
  audio = audio / max_amp # Normalize to [-1, 1]
72
 
73
+ # Stricter silence trimming
74
  trimmed_audio, _ = librosa.effects.trim(audio, top_db=30)
75
+ if len(trimmed_audio) < (sr * 0.15):
76
  return json.dumps({
77
  "language": language,
78
  "reference_text": reference_text,
 
81
  "metrics": {"message": "Audio appears silent or too noisy. Try speaking louder or in a quieter environment."}
82
  }, indent=2, ensure_ascii=False)
83
 
84
+ # Cap to 0.75s for single letters
85
+ max_duration = 0.75
86
  if len(trimmed_audio) > int(sr * max_duration):
87
  trimmed_audio = trimmed_audio[:int(sr * max_duration)]
88
 
89
+ # Noise gate
90
+ noise_gate_threshold = 0.02
91
+ trimmed_audio[np.abs(trimmed_audio) < noise_gate_threshold] = 0
92
+
93
+ input_values = processor(trimmed_audio, sampling_rate=sr, return_tensors="pt").input_values
94
+
95
+ with torch.no_grad():
96
+ logits = model(input_values).logits
97
+ pred_ids = torch.argmax(logits, dim=-1)
98
+ transcription = processor.batch_decode(pred_ids)[0].strip()
99
+
100
+ # Stricter confidence check
101
+ probs = torch.softmax(logits, dim=-1)
102
+ max_probs = probs.max(dim=-1).values.mean().item()
103
+ if max_probs < 0.6:
104
+ return json.dumps({
105
+ "language": language,
106
+ "reference_text": reference_text,
107
+ "transcription": "No speech detected",
108
+ "word_alignment": [],
109
+ "metrics": {"message": "Low confidence transcription (possible noise). Try again with clearer speech."}
110
+ }, indent=2, ensure_ascii=False)
111
+
112
+ # Filter vowel-heavy or overly long transcriptions
113
+ transcription_clean = transcription.lower().replace("the", "").strip()
114
+ if len(transcription_clean) > 3 or re.match(r'^[aeiou]+$', transcription_clean):
115
+ return json.dumps({
116
+ "language": language,
117
+ "reference_text": reference_text,
118
+ "transcription": "No speech detected",
119
+ "word_alignment": [],
120
+ "metrics": {"message": "Detected noise or unclear speech. Try again with clear pronunciation."}
121
+ }, indent=2, ensure_ascii=False)
122
+
123
+ obs_phonemes = [list(transliterate_fn(word)) for word in transcription_clean.split()]
124
 
125
  results = {
126
  "language": language,
127
  "reference_text": reference_text,
128
+ "transcription": transcription_clean or "No speech detected",
129
  "word_alignment": [],
130
  "metrics": {}
131
  }
 
172
  phoneme_er = round((total_phoneme_errors / max(1, total_phoneme_length)) * 100, 2)
173
  word_acc = round((correct_words / max(1, total_word_length)) * 100, 2)
174
  word_er = round(((total_word_length - correct_words) / max(1, total_word_length)) * 100, 2)
175
+ text_wer = round(wer(reference_text, transcription_clean or "") * 100, 2)
176
 
177
  results["metrics"] = {
178
  "word_accuracy": word_acc,
 
187
  def get_default_text(language):
188
  return {
189
  "Arabic": "ููŽุจูุฃูŽูŠู‘ู ุขู„ูŽุงุกู ุฑูŽุจู‘ููƒูู…ูŽุง ุชููƒูŽุฐู‘ูุจูŽุงู†ู",
190
+ "English": "A"
191
  }.get(language, "")
192
 
193
  with gr.Blocks() as demo:
 
195
  gr.Markdown("Compare audio pronunciation with reference text at phoneme level. Tip: Speak clearly; silence or noise may cause errors.")
196
 
197
  with gr.Row():
198
+ language = gr.Dropdown(["Arabic", "English"], label="Language", value="English")
199
 
200
  reference_text = gr.Textbox(label="Reference Text", value=get_default_text("English"))
201
  audio_input = gr.Audio(label="Upload Audio File", type="filepath")