Spaces:

danarcat
/

PronunciationChecker

Sleeping

App Files Files Community

karlhajal committed on Feb 5, 2025

Commit

4578f82

verified ·

1 Parent(s): 413e8eb

Add demo code

Browse files

Files changed (1) hide show

app.py +148 -1

app.py CHANGED Viewed

@@ -8,10 +8,12 @@ import tempfile
 import matplotlib.pyplot as plt
 import os
 from src.pronunciation_checker import PronunciationChecker
-from src.audio_preprocessing import assess_pronunciation_quality, denoise_audio
 from datetime import datetime
 from pathlib import Path
 import re
 @spaces.GPU
@@ -172,8 +174,153 @@ def display_test_results(threshold, wavlm_layer):
         components.append(gr.Textbox(case["text"], label="Details", interactive=False, lines=8))
     return components
 with gr.Blocks() as demo:
     with gr.Tabs():
         with gr.Tab("Pronunciation Checker"):
             gr.Interface(
                 fn=check_pronunciation,

 import matplotlib.pyplot as plt
 import os
 from src.pronunciation_checker import PronunciationChecker
+from src.audio_preprocessing import assess_pronunciation_quality, denoise_audio, get_red_green_segments
 from datetime import datetime
 from pathlib import Path
 import re
+import json
+import csv
 @spaces.GPU
         components.append(gr.Textbox(case["text"], label="Details", interactive=False, lines=8))
     return components
def calculate_red_percentage_demo(red_segments, labels_data, scaling_factor=0.02):
    """Return, for each label, the fraction of its duration covered by red frames.

    Args:
        red_segments: Iterable of frame indices flagged red. Frame ``i`` is
            treated as the time interval
            ``[i * scaling_factor, (i + 1) * scaling_factor]``.
        labels_data: Iterable of ``(start, end, latin, arabic)`` tuples. Only
            ``start`` and ``end`` are used here; they must be expressed in the
            same time unit as ``scaling_factor``.
        scaling_factor: Duration of a single frame.

    Returns:
        A list with one value per label, in input order, each capped at 1.0.
        Labels whose duration is zero or negative yield 0.0 instead of
        raising ``ZeroDivisionError``.
    """
    # Materialise the frame intervals once: this avoids recomputing them for
    # every label and keeps the result correct when ``red_segments`` is a
    # one-shot iterator.
    frame_spans = [
        (index * scaling_factor, (index + 1) * scaling_factor)
        for index in red_segments
    ]

    red_percentages = []
    for start, end, _latin, _arabic in labels_data:
        duration = end - start
        if duration <= 0:
            # Degenerate label: there is no span that could be covered.
            red_percentages.append(0.0)
            continue

        red_time = 0.0
        for frame_start, frame_end in frame_spans:
            # Length of the overlap between the label and this frame.
            red_time += max(0, min(end, frame_end) - max(start, frame_start))

        # Cap at 1.0 so rounding in the summed overlaps cannot exceed 100%.
        red_percentages.append(min(red_time / duration, 1.0))
    return red_percentages
@spaces.GPU
def check_pronunciation_demo(reference_audio, input_audio, labels_data, threshold=0.4, wavlm_layer=24):
    """Flag which labelled segments of the reference overlap a red region.

    The input audio is denoised, both recordings are preprocessed and turned
    into features at the requested layer, the two feature sequences are
    aligned with DTW, and the red segments reported for the reference side
    (``wav_type="ref"``) are intersected with ``labels_data``.

    Args:
        reference_audio: Reference recording, passed to
            ``pronunciation_checker.preprocess_wav``.
        input_audio: Recording to check; passed through ``denoise_audio``
            before preprocessing.
        labels_data: ``(start, end, latin, arabic)`` tuples forwarded to
            ``calculate_red_percentage_demo``.
        threshold: Forwarded to ``get_red_green_segments``.
        wavlm_layer: Layer passed to ``extract_features``; coerced to ``int``.

    Returns:
        A list of booleans, one per label, ``True`` when any part of that
        label overlaps a red segment.

    Raises:
        ValueError: If either preprocessed waveform is ``None``.
    """
    layer = int(wavlm_layer)

    # Only the input recording is denoised; the reference is used as given.
    cleaned_input = denoise_audio(input_audio)

    reference_wave, _ = pronunciation_checker.preprocess_wav(reference_audio)
    input_wave, _ = pronunciation_checker.preprocess_wav(cleaned_input)
    if reference_wave is None or input_wave is None:
        raise ValueError("One or both of the waveforms are empty.")

    reference_features, _, _ = pronunciation_checker.extract_features(reference_wave, layer)
    input_features, _, _ = pronunciation_checker.extract_features(input_wave, layer)

    dist_matrix, path = PronunciationChecker.compute_dtw(reference_features, input_features)
    red_segments, _, _ = get_red_green_segments(dist_matrix, path, wav_type="ref", threshold=threshold)

    # Any non-zero red overlap is enough to flag the label.
    return [
        share > 0.0
        for share in calculate_red_percentage_demo(red_segments, labels_data)
    ]
def parse_tsv(file_path):
    """Read a tab-separated label file into ``(start, end, latin, arabic)`` tuples.

    Each row with exactly four columns is kept; rows with any other column
    count are ignored. The first two columns are parsed as floats and divided
    by 1000 (presumably milliseconds to seconds -- confirm against the label
    files). All times are then shifted so that the earliest start is 0.

    The shift uses the minimum start over the whole file, so every row is
    moved by the same amount even when rows are not sorted by start time.

    Args:
        file_path: Path to the TSV file.

    Returns:
        A list of ``(start, end, latin, arabic)`` tuples in file order, or an
        empty list when the file does not exist or has no four-column rows.

    Raises:
        ValueError: If a four-column row has a non-numeric start or end.
    """
    if not os.path.exists(file_path):
        return []

    rows = []
    # newline="" lets the csv module handle line endings itself.
    with open(file_path, "r", encoding="utf-8", newline="") as f:
        for row in csv.reader(f, delimiter="\t"):
            if len(row) != 4:
                continue
            start, end, latin, arabic = row
            rows.append((float(start) / 1000.0, float(end) / 1000.0, latin, arabic))

    if not rows:
        return []

    # One global offset, computed after all rows are known.
    offset = min(start for start, _end, _latin, _arabic in rows)
    return [
        (start - offset, end - offset, latin, arabic)
        for start, end, latin, arabic in rows
    ]
def collect_demo_data():
    """Scan ``DATA_DIR`` for the dialects, themes and sentence IDs it contains.

    The expected layout is ``DATA_DIR/<dialect>/<theme>/<sentence_id>_word.wav``.
    Non-directory entries at the dialect and theme levels are ignored, as are
    files that do not end in ``_word.wav``.

    The sentence ID is the file name with the ``_word.wav`` suffix removed, so
    IDs that themselves contain underscores round-trip correctly when the
    file name is rebuilt as ``f"{sentence_id}_word.wav"``.

    Themes and sentence IDs are pooled across all dialects, so a given
    (dialect, theme, sentence ID) combination is not guaranteed to exist.

    Returns:
        A tuple ``(dialects, themes, sentence_ids)`` of sorted lists.

    Raises:
        OSError: If ``DATA_DIR`` cannot be listed (for example, it is missing).
    """
    suffix = "_word.wav"
    dialects = []
    themes = set()
    sentence_ids = set()

    for dialect in os.listdir(DATA_DIR):
        dialect_path = os.path.join(DATA_DIR, dialect)
        if not os.path.isdir(dialect_path):
            continue
        dialects.append(dialect)

        for theme in os.listdir(dialect_path):
            theme_path = os.path.join(dialect_path, theme)
            if not os.path.isdir(theme_path):
                continue
            themes.add(theme)

            for file_name in os.listdir(theme_path):
                if file_name.endswith(suffix):
                    # Strip only the known suffix; splitting on "_" would
                    # truncate IDs that contain underscores.
                    sentence_ids.add(file_name[:-len(suffix)])

    return sorted(dialects), sorted(themes), sorted(sentence_ids)
@spaces.GPU
def run_demo(dialect, theme, sentence_id, input_audio):
    """Check ``input_audio`` against a stored reference and return a JSON report.

    The reference recording and its label file are looked up under
    ``DATA_DIR/<dialect>/<theme>/`` as ``<sentence_id>_word.wav`` and
    ``<sentence_id>_labels.tsv``.

    Args:
        dialect: Dialect directory name.
        theme: Theme directory name.
        sentence_id: Identifier used to build the two file names.
        input_audio: Recording to check, forwarded to
            ``check_pronunciation_demo``.

    Returns:
        A JSON string with the sentence ID and two payload lists (Latin and
        Arabic). Each entry carries the label's index, its letter, and a
        ``result`` flag that is ``True`` when the label was NOT flagged red.
    """
    sentence_dir = os.path.join(DATA_DIR, dialect, theme)
    reference_audio = os.path.join(sentence_dir, f"{sentence_id}_word.wav")
    labels_data = parse_tsv(os.path.join(sentence_dir, f"{sentence_id}_labels.tsv"))

    results = check_pronunciation_demo(reference_audio, input_audio, labels_data)

    latin_output = []
    arabic_output = []
    for position, (flagged, label) in enumerate(zip(results, labels_data)):
        _start, _end, latin, arabic = label
        passed = not flagged  # "result" reports success, the inverse of red
        latin_output.append({"index": position, "letter": latin, "result": passed})
        arabic_output.append({"index": position, "letter": arabic, "result": passed})

    print(labels_data)
    print(results)

    payload = {
        "highlighted_text_id": sentence_id,
        "highlighted_text_latin_payload": latin_output,
        "highlighted_text_arabic_payload": arabic_output,
    }
    return json.dumps(payload, indent=2, ensure_ascii=False)
# Root directory of the demo data, given as a relative path (so it is resolved
# against the process's current working directory).
+DATA_DIR = "data"
# Scan DATA_DIR once at import time; the three sorted lists feed the dropdown
# choices of the "Demo" tab below.
+dialects, themes, sentence_ids = collect_demo_data()
 with gr.Blocks() as demo:
     with gr.Tabs():
+        with gr.Tab("Demo"):
+            dialect_dropdown = gr.Dropdown(choices=dialects, label="Select Dialect")
+            theme_dropdown = gr.Dropdown(choices=themes, label="Select Theme")
+            word_dropdown = gr.Dropdown(choices=sentence_ids, label="Select Sentence ID")
+            input_audio = gr.Audio(type="filepath", label="Reference Audio", format="wav", show_download_button=True)
+            # JSON output
+            output_json = gr.JSON(label="Output JSON")
+            # Button to trigger JSON output
+            submit_btn = gr.Button("Get JSON Output")
+            # JSON output function triggered by button
+            submit_btn.click(run_demo, inputs=[dialect_dropdown, theme_dropdown, word_dropdown, input_audio], outputs=[output_json])
         with gr.Tab("Pronunciation Checker"):
             gr.Interface(
                 fn=check_pronunciation,