Automatic Speech Recognition · Safetensors · Chinese · whisper

gpric024 committed · Commit df83b8b · 1 parent: bc46fb1

Changed to use API calls instead of locally running models

Files changed (10):
  1. .dockerignore +1 -1
  2. .gitignore +1 -0
  3. Dockerfile +2 -15
  4. Dockerfile.cpu +0 -41
  5. README.md +44 -93
  6. app.py +132 -169
  7. docker-compose.cpu.yml +0 -16
  8. docker-compose.yml +5 -16
  9. requirements.txt +2 -5
  10. style.css +149 -0
.dockerignore CHANGED
```diff
@@ -15,4 +15,4 @@ build
 .mypy_cache
 *.log
 .DS_Store
-Thumbs.db
+Thumbs.db
```
.gitignore ADDED
```diff
@@ -0,0 +1 @@
+*.env
```
Dockerfile CHANGED
```diff
@@ -8,34 +8,21 @@ ENV PYTHONDONTWRITEBYTECODE=1
 ENV PYTHONUNBUFFERED=1
 ENV GRADIO_SERVER_NAME=0.0.0.0
 ENV GRADIO_SERVER_PORT=7860
-ENV HF_HOME=/app/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
 
-# Install system dependencies for audio processing
+# Install system dependencies (ffmpeg is required for Gradio audio processing)
 RUN apt-get update && apt-get install -y --no-install-recommends \
     ffmpeg \
-    libsndfile1 \
-    git \
     && rm -rf /var/lib/apt/lists/*
 
-# Install PyTorch with CUDA support first
-RUN pip install --no-cache-dir \
-    torch \
-    torchaudio \
-    --index-url https://download.pytorch.org/whl/cu126
-
 # Copy requirements first for better caching
 COPY requirements.txt .
 
-# Install remaining Python dependencies
+# Install Python dependencies
 RUN pip install --no-cache-dir -r requirements.txt
 
 # Copy application code
 COPY app.py .
 
-# Create cache directory for HuggingFace models
-RUN mkdir -p /app/.cache/huggingface
-
 # Expose the Gradio port
 EXPOSE 7860
 
```
Dockerfile.cpu DELETED
```diff
@@ -1,41 +0,0 @@
-FROM python:3.11-slim
-
-# Set working directory
-WORKDIR /app
-
-# Set environment variables
-ENV PYTHONDONTWRITEBYTECODE=1
-ENV PYTHONUNBUFFERED=1
-ENV GRADIO_SERVER_NAME=0.0.0.0
-ENV GRADIO_SERVER_PORT=7860
-ENV HF_HOME=/app/.cache/huggingface
-ENV TRANSFORMERS_CACHE=/app/.cache/huggingface
-
-# Install system dependencies for audio processing
-RUN apt-get update && apt-get install -y --no-install-recommends \
-    ffmpeg \
-    libsndfile1 \
-    git \
-    && rm -rf /var/lib/apt/lists/*
-
-# Install PyTorch CPU-only version (smaller download, works on Mac/Linux/Windows)
-RUN pip install --no-cache-dir \
-    torch \
-    torchaudio
-# Copy requirements first for better caching
-COPY requirements.txt .
-
-# Install remaining Python dependencies
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Copy application code
-COPY app.py .
-
-# Create cache directory for HuggingFace models
-RUN mkdir -p /app/.cache/huggingface
-
-# Expose the Gradio port
-EXPOSE 7860
-
-# Run the application
-CMD ["python", "app.py"]
```
README.md CHANGED
````diff
@@ -1,75 +1,68 @@
-# 🏆 Speech-to-Text Model Arena
+# 🗣️ StutteredSpeechASR Research Demo
 
-A Gradio-based web application for comparing multiple speech-to-text models side-by-side. Upload audio or record from your microphone and see how different ASR models transcribe your speech.
+A Gradio-based research demonstration showcasing **StutteredSpeechASR**, a Whisper model fine-tuned specifically for stuttered speech recognition (Mandarin). Compare its performance against baseline Whisper models to see the improvement on stuttered speech patterns.
 
 ![Python](https://img.shields.io/badge/Python-3.9+-blue.svg)
 ![Gradio](https://img.shields.io/badge/Gradio-4.0+-orange.svg)
-![PyTorch](https://img.shields.io/badge/PyTorch-2.0+-red.svg)
+![Research](https://img.shields.io/badge/Research-Demo-green.svg)
 
 ## 🎯 Features
 
-- **Multi-model comparison**: Compare 3 different STT models simultaneously
-- **Audio input flexibility**: Record via microphone or upload audio files
-- **Real-time inference timing**: See how long each model takes to process
-- **GPU acceleration**: Automatically uses CUDA when available
-- **Model caching**: Models are loaded once and cached for faster subsequent runs
+- **StutteredSpeechASR Research**: Showcases fine-tuned Whisper model specifically designed for stuttered speech
+- **Comparative Analysis**: Side-by-side comparison with baseline Whisper models
+- **Audio Input Flexibility**: Record via microphone or upload audio files
+- **Specialized for Stuttered Speech**: Better handling of repetitions, prolongations, and blocks
+- **Clean Interface**: Organized model cards with clear transcription results
+- **Lightweight Deployment**: All inference via Hugging Face APIs - no GPU required
 
 ## 🤖 Models Included
 
-| Model | HuggingFace ID | Description |
-|-------|----------------|-------------|
-| StutteredSpeechASR | `AImpower/StutteredSpeechASR` | Whisper fine-tuned for stuttered speech (Mandarin) |
-| Whisper Base | `openai/whisper-base` | OpenAI's base Whisper model |
-| Wav2Vec2 | `facebook/wav2vec2-base-960h` | Meta's Wav2Vec2 (English) |
+| Model | Type | Description |
+|-------|------|-------------|
+| 🗣️ **StutteredSpeechASR** | Fine-tuned Research Model | Whisper fine-tuned specifically for stuttered speech (Mandarin) |
+| 🎙️ **Whisper Large V3** | Baseline Model | OpenAI's Whisper Large V3 model via HF Inference API |
+| 🔊 **Whisper Large V3 Turbo** | Baseline Model | OpenAI's Whisper Large V3 Turbo (faster) via HF Inference API |
 
 ## 📋 Requirements
 
 - Python 3.9+
-- NVIDIA GPU with CUDA support (recommended)
+- Hugging Face API key
 - Docker (optional, for containerized deployment)
 
-## 🚀 Quick Start
-
-### Option 1: Run with Docker (GPU - Linux/Windows with NVIDIA)
-
-For machines with NVIDIA GPUs:
-
-1. **Build and run with Docker Compose**
-   ```bash
-   docker compose up --build
-   ```
-
-2. **Open your browser** and navigate to `http://localhost:7860`
-
-### Option 2: Run with Docker (CPU - Mac/Linux/Windows)
-
-For Mac users or machines without NVIDIA GPUs:
-
-1. **Build and run with Docker Compose**
-   ```bash
-   docker compose -f docker-compose.cpu.yml up --build
-   ```
-
-2. **Or build and run manually**
+## 🔑 Environment Setup
+
+Create a `.env` file in the project root with your Hugging Face credentials:
+
+```env
+HF_ENDPOINT=https://your-endpoint-url.aws.endpoints.huggingface.cloud
+HF_API_KEY=hf_your_api_key_here
+```
+
+| Variable | Description |
+|----------|-------------|
+| `HF_ENDPOINT` | Your dedicated Hugging Face Inference Endpoint URL for StutteredSpeechASR |
+| `HF_API_KEY` | Your Hugging Face API token (get one at [huggingface.co/settings/tokens](https://huggingface.co/settings/tokens)) |
+
+## 🚀 Quick Start
+
+### Option 1: Run with Docker (Recommended)
+
+1. **Create your `.env` file** with HuggingFace credentials (see above)
+
+2. **Build and run with Docker Compose**
    ```bash
-   # Build the CPU image
-   docker build -f Dockerfile.cpu -t stt-arena-cpu .
-
-   # Run the container
-   docker run -p 7860:7860 stt-arena-cpu
+   docker compose up --build
   ```
 
 3. **Open your browser** and navigate to `http://localhost:7860`
 
-> ⚠️ **Note**: CPU inference is significantly slower than GPU. Expect 10-30+ seconds per model depending on audio length.
-
-
-### Option 3: Run Locally
+### Option 2: Run Locally
 
 1. **Clone the repository**
   ```bash
   git clone <your-repo-url>
-   cd stt_battle_arena
+   cd asr_demo
   ```
 
 2. **Create a virtual environment** (recommended)
@@ -88,71 +81,29 @@ For Mac users or machines without NVIDIA GPUs:
   pip install -r requirements.txt
   ```
 
-4. **Run the application**
+4. **Create your `.env` file** with HuggingFace credentials (see Environment Setup above)
+
+5. **Run the application**
   ```bash
   python app.py
   ```
 
-5. **Open your browser** and navigate to `http://localhost:7860`
+6. **Open your browser** and navigate to `http://localhost:7860`
 
-## 🐳 Docker Configuration
-
-### GPU Support (NVIDIA - Linux/Windows only)
-
-The Docker setup requires the [NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html) for GPU acceleration.
-
-**Install NVIDIA Container Toolkit:**
-```bash
-# Ubuntu/Debian
-distribution=$(. /etc/os-release;echo $ID$VERSION_ID)
-curl -s -L https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add -
-curl -s -L https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.list | sudo tee /etc/apt/sources.list.d/nvidia-docker.list
-sudo apt-get update
-sudo apt-get install -y nvidia-container-toolkit
-sudo systemctl restart docker
-```
-
-### Persistent Model Cache
-
-The Docker Compose configuration includes a volume (`hf-cache`) to persist downloaded HuggingFace models. This means models won't need to be re-downloaded when the container restarts.
-
-## 📁 Project Structure
-
-```
-stt_battle_arena/
-├── app.py                  # Main Gradio application
-├── requirements.txt        # Python dependencies
-├── Dockerfile              # Docker build (GPU/CUDA)
-├── Dockerfile.cpu          # Docker build (CPU-only, Mac compatible)
-├── docker-compose.yml      # Docker Compose (GPU)
-├── docker-compose.cpu.yml  # Docker Compose (CPU-only, Mac compatible)
-├── .dockerignore           # Docker build exclusions
-└── README.md               # This file
-```
-
-## ⚙️ Configuration
-
-### Changing Models
-
-To add or modify models, edit the `MODELS` list in `app.py`:
-
-```python
-MODELS = [
-    {
-        "name": "🎙️ Your Model Name",
-        "id": "unique_id",
-        "hf_id": "huggingface/model-id",
-        "description": "Model description",
-    },
-    # Add more models...
-]
-```
+## 🧪 Research Notes
+
+- **Target Language**: The StutteredSpeechASR model is specifically trained for Mandarin Chinese
+- **Use Cases**: Research demonstration, stuttered speech analysis, comparative ASR evaluation
+- **Best Results**: Use clear audio recordings for optimal model performance
+- **Baseline Comparison**: The Whisper models may struggle with stuttered speech patterns that StutteredSpeechASR handles well
 
 ## 📚 References
 
 - [Gradio Documentation](https://www.gradio.app/docs)
-- [HuggingFace Transformers](https://huggingface.co/docs/transformers)
+- [Hugging Face Inference API](https://huggingface.co/docs/api-inference)
+- [Hugging Face Inference Endpoints](https://huggingface.co/docs/inference-endpoints)
 - [AImpower StutteredSpeechASR](https://huggingface.co/AImpower/StutteredSpeechASR)
 - [OpenAI Whisper](https://github.com/openai/whisper)
-- [Wav2Vec 2.0](https://huggingface.co/facebook/wav2vec2-base-960h)
````
 
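The Environment Setup section above boils down to a load-then-read pattern at startup. A minimal sketch, assuming python-dotenv as the diff's requirements do (the fallback-to-shell behavior shown here is illustrative, not part of the commit):

```python
import os

# python-dotenv is optional at runtime here: if it is missing, the variables
# can still come from the shell or from Docker Compose's env_file.
try:
    from dotenv import load_dotenv
    load_dotenv()  # reads KEY=VALUE pairs from ./.env into os.environ
except ImportError:
    pass

HF_ENDPOINT = os.getenv("HF_ENDPOINT")  # dedicated Inference Endpoint URL
HF_API_KEY = os.getenv("HF_API_KEY")    # personal access token (hf_...)

if not HF_API_KEY:
    print("warning: HF_API_KEY is not set; API-backed models will fail")
```

Note that `load_dotenv()` does not override variables already present in the environment, so values injected by Docker's `env_file` win over a stray local `.env`.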
app.py CHANGED
```diff
@@ -4,20 +4,13 @@ A Gradio demo for comparing multiple STT models side-by-side.
 """
 
 import gradio as gr
-import time
-import torch
-import librosa
 import logging
-from transformers import (
-    AutoModelForSpeechSeq2Seq,
-    AutoProcessor,
-    WhisperForConditionalGeneration,
-    WhisperProcessor,
-    Wav2Vec2ForCTC,
-    Wav2Vec2Processor,
-)
+import os
+import requests
+from dotenv import load_dotenv
+
+load_dotenv()
 
-# Configure logging
 logging.basicConfig(
     level=logging.INFO,
     format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
@@ -25,14 +18,16 @@
 )
 logger = logging.getLogger("stt_arena")
 
-# Determine device
-DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-TORCH_DTYPE = torch.float16 if torch.cuda.is_available() else torch.float32
+HF_ENDPOINT = os.getenv("HF_ENDPOINT")
+HF_API_KEY = os.getenv("HF_API_KEY")
+WHISPER_API_URL = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3"
+WHISPER_TURBO_API_URL = "https://router.huggingface.co/hf-inference/models/openai/whisper-large-v3-turbo"
 
-logger.info(f"Using device: {DEVICE}")
-logger.info(f"Torch dtype: {TORCH_DTYPE}")
+if HF_ENDPOINT:
+    logger.info(f"Using Hugging Face Endpoint: {HF_ENDPOINT}")
+else:
+    logger.warning("HF_ENDPOINT not set, StutteredSpeechASR will use local model")
 
-# Model configurations
 MODELS = [
     {
         "name": "🗣️ StutteredSpeechASR",
@@ -41,62 +36,84 @@ MODELS = [
         "description": "Whisper fine-tuned for stuttered speech (Mandarin)",
     },
     {
-        "name": "🎙️ Whisper Base",
+        "name": "🎙️ Whisper Large V3",
         "id": "whisper",
-        "hf_id": "openai/whisper-base",
-        "description": "OpenAI Whisper base model",
+        "hf_id": "openai/whisper-large-v3",
+        "description": "OpenAI Whisper Large V3 model (via HF Inference API)",
     },
     {
-        "name": "🔊 Wav2Vec2",
-        "id": "wav2vec",
-        "hf_id": "facebook/wav2vec2-base-960h",
-        "description": "Meta's Wav2Vec2 (English)",
+        "name": "🔊 Whisper Large V3 Turbo",
+        "id": "whisper_turbo",
+        "hf_id": "openai/whisper-large-v3-turbo",
+        "description": "OpenAI Whisper Large V3 Turbo (via HF Inference API)",
     },
 ]
 
-# Global model cache
-_model_cache = {}
 
-
-def load_model(model_config: dict):
+def run_api_inference(audio_path: str, api_url: str, model_name: str) -> str:
     """
-    Load and cache a model based on its configuration.
+    Run inference using any Hugging Face API endpoint.
+
+    Args:
+        audio_path: Path to the audio file
+        api_url: The API endpoint URL
+        model_name: Name of the model for error messages
+
+    Returns:
+        Transcribed text
     """
-    model_id = model_config["id"]
-    hf_id = model_config["hf_id"]
-
-    if model_id in _model_cache:
-        logger.debug(f"Model {model_id} found in cache")
-        return _model_cache[model_id]
-
-    logger.info(f"Loading model: {hf_id}...")
-
-    if model_id == "stuttered":
-        # StutteredSpeechASR - Whisper-based model
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(hf_id, torch_dtype=TORCH_DTYPE)
-        processor = AutoProcessor.from_pretrained(hf_id)
-        model.to(DEVICE)
-        _model_cache[model_id] = (model, processor, "whisper")
-
-    elif model_id == "whisper":
-        # Standard Whisper model
-        model = WhisperForConditionalGeneration.from_pretrained(hf_id, torch_dtype=TORCH_DTYPE)
-        processor = WhisperProcessor.from_pretrained(hf_id)
-        model.to(DEVICE)
-        _model_cache[model_id] = (model, processor, "whisper")
-
-    elif model_id == "wav2vec":
-        # Wav2Vec2 model
-        model = Wav2Vec2ForCTC.from_pretrained(hf_id, torch_dtype=TORCH_DTYPE)
-        processor = Wav2Vec2Processor.from_pretrained(hf_id)
-        model.to(DEVICE)
-        _model_cache[model_id] = (model, processor, "wav2vec")
-
-    logger.info(f"Model {hf_id} loaded successfully!")
-    return _model_cache[model_id]
+    if not HF_API_KEY:
+        raise ValueError("HF_API_KEY must be set in environment variables")
+
+    logger.info(f"Running inference via {model_name}")
+
+    with open(audio_path, "rb") as f:
+        audio_bytes = f.read()
+
+    headers = {
+        "Authorization": f"Bearer {HF_API_KEY}",
+        "Content-Type": "audio/wav",
+    }
+
+    response = requests.post(
+        api_url,
+        headers=headers,
+        data=audio_bytes,
+        timeout=120,
+    )
+
+    if response.status_code != 200:
+        logger.error(f"{model_name} error: {response.status_code} - {response.text}")
+
+        try:
+            error_data = response.json()
+            error_msg = error_data.get("error", "")
+
+            if "paused" in error_msg.lower():
+                return f"⏸️ The {model_name} endpoint is currently paused. Please contact the maintainer to restart it."
+            elif "loading" in error_msg.lower():
+                return f" {model_name} is loading. Please wait and try again."
+            elif response.status_code == 503:
+                return f"🔄 {model_name} service is temporarily unavailable. Please try again."
+            else:
+                return f"❌ {model_name} Error: {error_msg}"
+        except:
+            return f"❌ {model_name} Error: HTTP {response.status_code}"
+
+    result = response.json()
+    logger.debug(f"{model_name} response: {result}")
+
+    if isinstance(result, dict):
+        transcription = result.get("text", "") or result.get("transcription", "")
+    elif isinstance(result, list) and len(result) > 0:
+        transcription = result[0].get("text", "") if isinstance(result[0], dict) else str(result[0])
+    else:
+        transcription = str(result)
+
+    return transcription.strip()
 
 
-def run_inference(audio_path: str, model_config: dict) -> tuple[str, float]:
+def run_inference(audio_path: str, model_config: dict) -> str:
     """
     Run inference on a single model.
 
@@ -105,135 +122,87 @@ def run_inference(audio_path: str, model_config: dict) -> tuple[str, float]:
         model_config: Model configuration dictionary
 
     Returns:
-        Tuple of (transcribed_text, inference_time_in_seconds)
+        Transcribed text
     """
     if audio_path is None:
         logger.warning("No audio provided")
-        return "⚠️ No audio provided. Please record or upload audio first.", 0.0
+        return "⚠️ No audio provided. Please record or upload audio first."
 
     try:
         logger.info(f"Running inference with model: {model_config['name']}")
         logger.debug(f"Audio path: {audio_path}")
-
-        # Load audio file
-        waveform, sampling_rate = librosa.load(audio_path, sr=16000)
-        logger.debug(f"Audio loaded: {len(waveform)} samples at {sampling_rate}Hz")
-
-        # Load model
-        model, processor, model_type = load_model(model_config)
-
-        # Start timing
-        start_time = time.time()
-
-        if model_type == "whisper":
-            # Whisper-style inference
-            input_features = processor(
-                waveform,
-                sampling_rate=16000,
-                return_tensors="pt"
-            ).input_features
-            input_features = input_features.to(DEVICE, dtype=TORCH_DTYPE)
-
-            with torch.no_grad():
-                predicted_ids = model.generate(input_features)
-
-            transcription = processor.batch_decode(
-                predicted_ids,
-                skip_special_tokens=True
-            )[0]
-
-        elif model_type == "wav2vec":
-            # Wav2Vec2-style inference
-            inputs = processor(
-                waveform,
-                sampling_rate=16000,
-                return_tensors="pt",
-                padding=True
-            )
-            input_values = inputs.input_values.to(DEVICE, dtype=TORCH_DTYPE)
-
-            with torch.no_grad():
-                logits = model(input_values).logits
-
-            predicted_ids = torch.argmax(logits, dim=-1)
-            transcription = processor.batch_decode(predicted_ids)[0]
-
-        else:
-            transcription = "Unknown model type"
-
-        # Calculate inference time
-        inference_time = time.time() - start_time
-
-        logger.info(f"Inference complete for {model_config['name']}: {inference_time:.3f}s")
-        logger.debug(f"Transcription: {transcription[:100]}..." if len(transcription) > 100 else f"Transcription: {transcription}")
-
-        return transcription.strip(), round(inference_time, 3)
+
+        if model_config["id"] == "stuttered" and HF_ENDPOINT and HF_API_KEY:
+            return run_api_inference(audio_path, HF_ENDPOINT, "StutteredSpeechASR")
+
+        if model_config["id"] == "whisper" and HF_API_KEY:
+            return run_api_inference(audio_path, WHISPER_API_URL, "Whisper Large V3")
+
+        if model_config["id"] == "whisper_turbo" and HF_API_KEY:
+            return run_api_inference(audio_path, WHISPER_TURBO_API_URL, "Whisper Large V3 Turbo")
+
+        raise ValueError("HF_API_KEY must be set to use this model")
 
     except Exception as e:
         logger.error(f"Error during inference with {model_config['name']}: {str(e)}", exc_info=True)
-        return f"❌ Error: {str(e)}", 0.0
+        return f"❌ Error: {str(e)}"
 
 
 def run_all_models(audio):
     """
     Run inference on all models sequentially.
 
-    Note: Running sequentially to avoid GPU memory issues and ensure
-    models are loaded one at a time if needed.
-
     Args:
         audio: Audio input from Gradio component
 
     Returns:
-        List of results for each model (text1, time1, text2, time2, text3, time3)
+        List of transcription results for each model
     """
     logger.info(f"Starting inference on {len(MODELS)} models")
     results = []
 
     for model_config in MODELS:
-        text, inference_time = run_inference(audio, model_config)
-        results.extend([text, inference_time])
+        text = run_inference(audio, model_config)
+        results.append(text)
 
     logger.info("All models completed")
     return results
 
 
+def load_css():
+    """Load CSS from external file"""
+    css_path = os.path.join(os.path.dirname(__file__), "style.css")
+    try:
+        with open(css_path, "r", encoding="utf-8") as f:
+            return f.read()
+    except FileNotFoundError:
+        logger.warning(f"CSS file not found at {css_path}")
+        return ""
+
+
 # Build the Gradio interface
 with gr.Blocks(
     theme=gr.themes.Soft(),
-    title="Speech-to-Text Model Arena",
-    css="""
-    .model-card {
-        border: 1px solid #e0e0e0;
-        border-radius: 12px;
-        padding: 16px;
-        background: linear-gradient(135deg, #f5f7fa 0%, #c3cfe2 100%);
-    }
-    .run-button {
-        background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
-        font-size: 1.2em !important;
-        font-weight: bold !important;
-    }
-    .title-text {
-        text-align: center;
-        background: linear-gradient(90deg, #667eea, #764ba2);
-        -webkit-background-clip: text;
-        -webkit-text-fill-color: transparent;
-        background-clip: text;
-    }
-    """
+    title="StutteredSpeechASR Research Demo",
+    css=load_css()
 ) as demo:
 
     # Title and Description
     gr.Markdown(
         """
-        # 🏆 Speech-to-Text Model Arena
+        <div style="text-align: center; max-width: 800px; margin: 0 auto;">
+
+        # 🗣️ StutteredSpeechASR Research Demo
 
-        **Compare multiple speech recognition models side-by-side!**
+        ### Fine-tuned Whisper model for stuttered speech recognition
 
-        Upload an audio file or record using your microphone, then click "Run Models"
-        to see how different STT models transcribe your speech. Compare their outputs
-        and inference times to find the best model for your use case.
+        This demo showcases our **StutteredSpeechASR** model, a Whisper model fine-tuned specifically
+        for stuttered speech (Mandarin). Compare its performance against baseline Whisper models
+        to see the improvement on stuttered speech patterns.
+
+        Upload an audio file or record using your microphone to test the models.
+
+        </div>
        """,
        elem_classes=["title-text"]
    )
@@ -247,23 +216,23 @@ with gr.Blocks(
         sources=["microphone", "upload"],
         type="filepath",
         label="Record or Upload Audio",
-        show_label=True,
+        streaming=False,
+        editable=True,
     )
 
     # Run Button
     run_button = gr.Button(
-        "🚀 Run Models",
+        "🚀 Compare Models",
         variant="primary",
         size="lg",
         elem_classes=["run-button"]
     )
 
     gr.Markdown("---")
-    gr.Markdown("### 📊 Model Results")
+    gr.Markdown("### 📊 Model Comparison Results")
 
     # Model Output Cards
     with gr.Row(equal_height=True):
-        # Create output components for each model
         output_components = []
 
         for model in MODELS:
@@ -277,16 +246,8 @@ with gr.Blocks(
                 interactive=False,
             )
 
-            time_output = gr.Number(
-                label="⏱️ Inference Time (seconds)",
-                value=0.0,
-                interactive=False,
-                precision=3,
-            )
-
-            output_components.extend([text_output, time_output])
+            output_components.append(text_output)
 
-    # Connect the button to the inference function
     run_button.click(
         fn=run_all_models,
         inputs=[audio_input],
@@ -300,9 +261,11 @@ with gr.Blocks(
         """
         <center>
 
-        **💡 Tip:**
-        - For best results, use clear audio with minimal background noise
-        *Built with ❤️ using Gradio*
+        **💡 Research Note:**
+        - The StutteredSpeechASR model is designed to better handle stuttered speech patterns
+        - For best results, use clear audio recordings
+
+        *Research Demo | AImpower StutteredSpeechASR*
 
         </center>
         """,
@@ -312,7 +275,7 @@ with gr.Blocks(
 
 # Launch the app
 if __name__ == "__main__":
-    logger.info("Starting Speech-to-Text Model Arena")
+    logger.info("Starting StutteredSpeechASR Research Demo")
    logger.info(f"Models configured: {[m['name'] for m in MODELS]}")
    demo.launch(
        share=False,
```
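One subtlety in the new `run_api_inference` is that the Inference API's JSON can arrive either as a dict with a `text` key or as a list of chunk dicts. The normalization at the end of that function can be factored into a small pure function and exercised directly (the function name here is mine, not part of the commit):

```python
def extract_transcription(result) -> str:
    """Normalize a JSON-decoded ASR response into plain text, mirroring
    the dict/list handling at the end of run_api_inference."""
    if isinstance(result, dict):
        # dedicated endpoints usually return {"text": "..."}
        text = result.get("text", "") or result.get("transcription", "")
    elif isinstance(result, list) and len(result) > 0:
        # some pipelines return a list of chunk dicts
        text = result[0].get("text", "") if isinstance(result[0], dict) else str(result[0])
    else:
        text = str(result)
    return text.strip()
```

Keeping this branching in one place makes it easy to extend if a future endpoint returns yet another shape.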
docker-compose.cpu.yml DELETED
```diff
@@ -1,16 +0,0 @@
-services:
-  stt-arena:
-    build:
-      context: .
-      dockerfile: Dockerfile.cpu
-    image: stt-arena-cpu
-    container_name: stt-arena
-    ports:
-      - "7860:7860"
-    volumes:
-      # Persist HuggingFace model cache
-      - hf-cache:/app/.cache/huggingface
-    restart: unless-stopped
-
-volumes:
-  hf-cache:
```
docker-compose.yml CHANGED
```diff
@@ -1,23 +1,12 @@
 services:
-  stt-arena:
+  stuttered-speech-asr-demo:
     build:
       context: .
       dockerfile: Dockerfile
-    image: stt-arena
-    container_name: stt-arena
+    image: stuttered-speech-asr-demo
+    container_name: stuttered-speech-asr-demo
     ports:
       - "7860:7860"
-    volumes:
-      # Persist HuggingFace model cache
-      - hf-cache:/app/.cache/huggingface
-    deploy:
-      resources:
-        reservations:
-          devices:
-            - driver: nvidia
-              count: 1
-              capabilities: [gpu]
+    env_file:
+      - .env
     restart: unless-stopped
-
-volumes:
-  hf-cache:
```
requirements.txt CHANGED
```diff
@@ -1,6 +1,3 @@
 gradio>=4.0.0
-torch>=2.0.0
-transformers>=4.36.0
-librosa>=0.10.0
-soundfile>=0.12.0
-accelerate>=0.25.0
+python-dotenv>=1.0.0
+requests>=2.31.0
```
 
 
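With all inference moved behind HTTP, the trimmed requirements leave `requests` as the only transport dependency. For reference, the request shape the app sends (bearer auth, raw WAV bytes as the body) can be sketched with just the standard library; `build_asr_request` and the URL below are illustrative, not part of the commit:

```python
from urllib.request import Request

def build_asr_request(api_url: str, api_key: str, audio_bytes: bytes) -> Request:
    """Build (but do not send) a POST equivalent to the one the app issues."""
    return Request(
        api_url,
        data=audio_bytes,  # raw audio file bytes as the request body
        headers={
            "Authorization": f"Bearer {api_key}",
            "Content-Type": "audio/wav",
        },
        method="POST",
    )

# example with dummy values; nothing is transmitted
req = build_asr_request("https://example.test/asr", "hf_dummy", b"RIFF....WAVE")
```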
style.css ADDED
```diff
@@ -0,0 +1,149 @@
+/* Force light mode implementation */
+:root, .dark, body, gradio-app {
+    --background-fill-primary: #ffffff !important;
+    --background-fill-secondary: #f3f4f6 !important;
+    --background-fill-tertiary: #e5e7eb !important;
+    --block-background-fill: #ffffff !important;
+    --block-border-color: #e5e7eb !important;
+    --block-label-text-color: #374151 !important;
+    --body-background-fill: #ffffff !important;
+    --body-text-color: #1f2937 !important;
+    --input-background-fill: #ffffff !important;
+    color-scheme: light !important;
+}
+
+/* Override dark mode specific styles */
+.dark .gradio-container {
+    background-color: #ffffff !important;
+    color: #1f2937 !important;
+}
+
+/* Ensure all text is dark and readable */
+p, h1, h2, h3, span, label, textarea, .prose {
+    color: #1f2937 !important;
+}
+
+/* Transcription textboxes */
+textarea {
+    background-color: #ffffff !important;
+    color: #1f2937 !important;
+    font-size: 16px !important;
+    line-height: 1.6 !important;
+}
+
+/* Audio component styling */
+.audio-container {
+    background-color: #ffffff !important;
+}
+
+/* Footer readability */
+.footer {
+    color: #1f2937 !important;
+}
+.footer p {
+    color: #1f2937 !important;
+}
+
+/* Model Card styling */
+.model-card {
+    border: 1px solid #e0e0e0;
+    border-radius: 12px;
+    padding: 16px;
+    background: #ffffff !important;
+}
+
+/* Force Textbox background to white explicitly */
+.block.textarea, .block.textbox {
+    background: #ffffff !important;
+}
+
+/* Ensure model card text is dark */
+.model-card h2, .model-card p, .model-card span {
+    color: #1f2937 !important;
+}
+
+.run-button {
+    background: linear-gradient(90deg, #667eea 0%, #764ba2 100%) !important;
+    font-size: 1.2em !important;
+    font-weight: bold !important;
+    color: white !important;
+}
+
+/* Fix the specific "Transcription" label element */
+span[data-testid="block-info"], .svelte-jdcl7l {
+    background: #ffffff !important;
+    background-color: #ffffff !important;
+    color: #1f2937 !important;
+    padding: 4px 8px !important;
+    border-radius: 4px !important;
+    border: 1px solid #e5e7eb !important;
+}
+
+/* Fix label headers for Audio and Transcription inputs - most aggressive approach */
+* [class*="label"], * [class*="Label"], .label, .Label,
+.block-label, span.label-wrap, .label-wrap span, label,
```
85
+ .textbox label, .textbox .label-wrap, .textbox .block-label,
86
+ .gr-textbox label, .gr-textbox .label-wrap, .gr-textbox .block-label,
87
+ [data-testid="textbox"] label, [data-testid="textbox"] .label-wrap,
88
+ .gradio-textbox label, .gradio-textbox .label-wrap {
89
+ background: #ffffff !important;
90
+ background-color: #ffffff !important;
91
+ color: #1f2937 !important;
92
+ border: none !important;
93
+ font-weight: bold !important;
94
+ font-size: 1.1em !important;
95
+ margin-bottom: 8px !important;
96
+ padding: 4px 8px !important;
97
+ border-radius: 4px !important;
98
+ }
99
+
100
+ /* Ensure specific component headers are readable */
101
+ .svelte-1b6s6s {
102
+ /* This targets Gradio specific label classes if needed */
103
+ color: #374151 !important;
104
+ }
105
+
106
+ /* Title section centering - universal approach */
107
+ [data-testid="markdown"] {
108
+ display: flex !important;
109
+ justify-content: center !important;
110
+ width: 100% !important;
111
+ }
112
+
113
+ [data-testid="markdown"] > * {
114
+ width: 100% !important;
115
+ max-width: 800px !important;
116
+ text-align: center !important;
117
+ }
118
+
119
+ /* Target any element with title-text class and all its children */
120
+ .title-text,
121
+ .title-text > *,
122
+ .title-text span,
123
+ .title-text div {
124
+ text-align: center !important;
125
+ margin-left: auto !important;
126
+ margin-right: auto !important;
127
+ }
128
+
129
+ /* Force center alignment on all heading and paragraph elements in title */
130
+ .title-text h1,
131
+ .title-text h2,
132
+ .title-text h3,
133
+ .title-text p {
134
+ text-align: center !important;
135
+ margin-left: auto !important;
136
+ margin-right: auto !important;
137
+ display: block !important;
138
+ width: 100% !important;
139
+ }
140
+
141
+ .title-text h1 {
142
+ color: #4f46e5 !important;
143
+ margin-bottom: 0.5em !important;
144
+ }
145
+
146
+ .title-text h3 {
147
+ margin-bottom: 1.5em !important;
148
+ color: #6b7280 !important;
149
+ }
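Gradio picks up custom styles through the `css` argument of `gr.Blocks`. The `read_css` helper below is a sketch of how the new stylesheet could be wired in; whether app.py loads the file exactly this way is an assumption:

```python
from pathlib import Path


def read_css(path: str = "style.css") -> str:
    """Return the stylesheet contents, or an empty string if the file is absent."""
    p = Path(path)
    return p.read_text(encoding="utf-8") if p.is_file() else ""


# In app.py this would typically be used as:
#   import gradio as gr
#   with gr.Blocks(css=read_css()) as demo:
#       ...
```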