import gradio as gr
import os
import io
import re
import random
import librosa
import soundfile as sf
import pandas as pd
from gradio_client import Client, handle_file
from transformers import pipeline
from datasets import load_dataset, Audio
from stats_data import get_indomain_breakdown, get_experimental_summary, SPEAKER_META
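
# Torgo DSR Lab demo app: Step 1 runs a Whisper Tiny baseline over dysarthric
# speech; Step 2 forwards the audio plus the normalized baseline transcript to
# a private reconstruction backend, which returns the corrected prediction.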
# 1. Initialize Baseline ASR (Strict English, Repetition Penalty 3.0)
print("Initializing Whisper Tiny Baseline...")
whisper_asr = pipeline(
"automatic-speech-recognition",
model="openai/whisper-tiny",
generate_kwargs={
"language": "en",
"task": "transcribe",
"repetition_penalty": 3.0
}
)
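
# Note: language/task pin decoding to English transcription; the strong
# repetition_penalty (3.0) is there to damp the token-looping failure mode
# small Whisper checkpoints often show on atypical (dysarthric) speech.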

# Configuration: the auth token comes from the environment; the backend Space ID is fixed
HF_TOKEN = os.getenv("HF_TOKEN")
PRIVATE_BACKEND_URL = "st192011/Torgo-DSR-Private"
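# NOTE: the backend Space is private, so HF_TOKEN must grant read access to it
# (on Spaces, set it as a repository secret); otherwise Step 2 fails to connect.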

def normalize_text(text):
    if not text:
        return ""
    # Remove punctuation and lowercase
    text = re.sub(r'[^\w\s]', '', text).lower().strip()
    return " ".join(text.split())

def format_audio(audio_path):
    """Ensures audio is 16 kHz mono to match ASR training conditions."""
    y, sr = librosa.load(audio_path, sr=16000)
    out_path = "formatted_input.wav"
    sf.write(out_path, y, sr)
    return out_path
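# librosa.load with sr=16000 resamples on the fly and downmixes to mono by
# default, matching the 16 kHz mono input Whisper models were trained on.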
# --- Logic: Data Loading ---
def get_sample_logic(speaker_id):
    try:
        if "UA" in speaker_id:
            # UA-Speech access (direct pull for F02)
            dataset = load_dataset("resproj007/uaspeech_female", split="train", streaming=True)
            dataset = dataset.cast_column("audio", Audio(decode=False))
            # UA-Speech is small; skip a random offset for variety
            sample = next(iter(dataset.skip(random.randint(0, 30))))
            gt_text = sample.get('text') or sample.get('transcription') or sample.get('sentence')
        else:
            # Torgo access (manual speaker filtering, as per the Colab fix)
            dataset = load_dataset("abnerh/TORGO-database", split="train", streaming=True)
            dataset = dataset.cast_column("audio", Audio(decode=False))

            def filter_spk(x):
                sid = str(x.get('speaker_id', '')).upper()
                if not sid or sid == "NONE":
                    # Fall back to the speaker prefix of the file name, e.g. "F01_..."
                    sid = os.path.basename(x['audio']['path']).split('_')[0].upper()
                return sid == speaker_id

            speaker_ds = dataset.filter(filter_spk)
            sample = next(iter(speaker_ds.shuffle(buffer_size=10)))
            gt_text = sample.get('transcription') or sample.get('text')

        # Decode the raw bytes manually to bypass torchcodec errors
        audio_bytes = sample['audio']['bytes']
        audio_data, sr = librosa.load(io.BytesIO(audio_bytes), sr=16000)
        temp_path = "dataset_sample.wav"
        sf.write(temp_path, audio_data, sr)
        return temp_path, (gt_text or "").lower().strip(), SPEAKER_META[speaker_id]
    except Exception as e:
        return None, f"Dataset Error: {e}", {}
# --- Logic: Model Processing ---
def process_audio_step_1(audio_path):
    """Runs the Whisper baseline and returns the raw and normalized transcripts."""
    if not audio_path:
        return "No audio loaded", ""
    # Resample/downmix the input to 16 kHz mono
    formatted_path = format_audio(audio_path)
    # Run Whisper
    result = whisper_asr(formatted_path)
    raw_w = result["text"]
    norm_w = normalize_text(raw_w)
    return raw_w, norm_w
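
# Example: process_audio_step_1("sample.wav") returns a (raw, normalized)
# pair, e.g. ("Hello, world!", "hello world"); "sample.wav" is a placeholder.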

def process_audio_step_2(audio_path, norm_whisper):
    """Sends the audio plus the normalized Whisper transcript to the private model API."""
    if not audio_path or not norm_whisper:
        return "Please load data and run Whisper (Step 1) first."
    try:
        # Connect to the private API
        client = Client(PRIVATE_BACKEND_URL, token=HF_TOKEN)
        # FIX: wrap audio_path with handle_file().
        # This sends the metadata required by Pydantic ('gradio.FileData').
        prediction = client.predict(
            audio_path=handle_file(audio_path),
            whisper_norm=norm_whisper,
            api_name="/predict_dsr"
        )
        return prediction
    except Exception as e:
        return f"Backend Connection Required. Details: {e}"
# --- UI Construction ---
with gr.Blocks(theme=gr.themes.Soft(), title="Torgo DSR Lab") as demo:
    gr.Markdown("# ⚗️ Torgo DSR Lab")
    gr.Markdown("Neural Reconstruction Layer for Torgo and UA-Speech Zero-Shot.")

    # Hidden state storing the path of the currently active audio
    active_audio_path = gr.State("")

    with gr.Tab("🔬 Laboratory"):
        with gr.Row():
            # LEFT COLUMN: Data Input
            with gr.Column(scale=1):
                with gr.Group():
                    gr.Markdown("### Channel A: Research Datasets")
                    speaker_input = gr.Dropdown(sorted(list(SPEAKER_META.keys())), label="Select Speaker Profile", value="F01")
                    load_btn = gr.Button("Load Sample from Dataset")
                    gt_box = gr.Textbox(label="Ground Truth (Reference)", interactive=False)
                    meta_display = gr.JSON(label="Speaker Metadata")
                gr.Markdown("---")
                with gr.Group():
                    gr.Markdown("### Channel B: Personal Input")
                    user_audio = gr.Audio(sources=["microphone", "upload"], type="filepath", label="Record or Upload Audio")
                    user_load_btn = gr.Button("Use This Audio")

            # RIGHT COLUMN: Transcripts
            with gr.Column(scale=2):
                gr.Markdown("### Analysis & Reconstruction")
                with gr.Group():
                    gr.Markdown("#### Step 1: ASR Baseline")
                    whisper_btn = gr.Button("Run Whisper Tiny")
                    w_raw = gr.Textbox(label="Whisper Raw Transcript")
                    w_norm = gr.Textbox(label="Whisper Normalized")
                gr.Markdown("---")
                with gr.Group():
                    gr.Markdown("#### Step 2: Neural Reconstruction")
                    model_btn = gr.Button("Run Our Correction Model", variant="primary")
                    final_out = gr.Textbox(label="DSR Lab Prediction (5K Model)")
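
    # The Statistics tab below is static: it renders precomputed result tables
    # and notes from stats_data; no model runs here.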
with gr.Tab("π Research Statistics"):
gr.Markdown("# π¬ Performance Evaluation")
with gr.Row():
with gr.Column():
gr.Markdown("""
### π Metric: Exact Match Accuracy
Accuracy is the percentage of samples where the **normalized prediction** (lowercase, no punctuation) exactly matches the **normalized ground truth**.
""")
with gr.Column():
gr.Markdown("""
### π§ͺ Model Definitions
* **5K Pure Model:** Trained on real-world Torgo articulatory distortions. Optimized for phonetic fidelity.
* **10K Triple-Mix Model:** Includes synthetic data and anchors; utilized for generalization (LOSO) testing.
""")
gr.Markdown("---")
gr.Markdown("## 1. Torgo In-Domain Analysis (By Speaker)")
gr.DataFrame(get_indomain_breakdown())
gr.Markdown("## 2. Experimental Milestone Summary")
gr.DataFrame(get_experimental_summary())
gr.Markdown("""
### π Key Discovery: The Acoustic Floor
Our research found that the **5K Pure Model** achieved higher accuracy in both in-domain and zero-shot tasks. This suggests an **'Acoustic Floor'** exists where real-world phonetic distortions are more valuable for model grounding than synthetic linguistic diversity.
""")

    # --- Event Handlers ---
    # Dataset channel: load sample -> update state -> update reference text/meta
    load_btn.click(
        get_sample_logic,
        inputs=speaker_input,
        outputs=[active_audio_path, gt_box, meta_display]
    )
    # Personal channel: use audio -> update state -> clear the reference
    user_load_btn.click(
        lambda x: (x, "User Recorded (No Ground Truth)", {"Dataset": "Custom", "Severity": "N/A"}),
        inputs=user_audio,
        outputs=[active_audio_path, gt_box, meta_display]
    )
    # Step 1: Whisper (uses state)
    whisper_btn.click(
        process_audio_step_1,
        inputs=active_audio_path,
        outputs=[w_raw, w_norm]
    )
    # Step 2: Model (uses state + Whisper result)
    model_btn.click(
        process_audio_step_2,
        inputs=[active_audio_path, w_norm],
        outputs=final_out
    )

demo.launch()