Spaces:

JacobLinCool
/

TWASR

Running on Zero

App Files Files Community

JacobLinCool committed on Mar 13, 2025

Commit

487ed33

1 Parent(s): ff0fd39

feat: add phi

Browse files

Files changed (4) hide show

README.md +1 -1
app.py +129 -96
model.py +188 -18
requirements.txt +3 -2

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 🐠
 colorFrom: red
 colorTo: pink
 sdk: gradio
-sdk_version: 5.4.0
 app_file: app.py
 pinned: false
 license: mit

 colorFrom: red
 colorTo: pink
 sdk: gradio
+sdk_version: 5.20.1
 app_file: app.py
 pinned: false
 license: mit

app.py CHANGED Viewed

@@ -1,109 +1,142 @@
 import gradio as gr
-from huggingface_hub.utils import get_token
-import requests
 import base64
-from model import model_id, transcribe_audio_local
-token = get_token()
 def read_file_as_base64(file_path: str) -> str:
-    with open(file_path, "rb") as f:
-        return base64.b64encode(f.read()).decode()
-def transcribe_audio(audio: str) -> str:
-    print(f"{audio=}")
-    if audio is None:
-        raise gr.Error(
-            "Please wait a moment for the audio to be uploaded, then click the button again."
         )
-    # resample to 16k mono to reduce file size
-    import subprocess
-    import os
-    audio_resampled = audio.replace(".mp3", "_resampled.mp3")
-    subprocess.run(
-        [
-            "ffmpeg",
-            "-i",
-            audio,
-            "-ac",
-            "1",
-            "-ar",
-            "16000",
-            audio_resampled,
-            "-y",
-        ],
-        check=True,
-    )
-    b64 = read_file_as_base64(audio_resampled)
-    url = f"https://api-inference.huggingface.co/models/{model_id}"
-    headers = {
-        "Authorization": f"Bearer {token}",
-        "Content-Type": "application/json",
-        "x-wait-for-model": "true",
-    }
-    data = {
-        "inputs": b64,
-        "parameters": {
-            "generate_kwargs": {
-                "return_timestamps": True,
-            }
-        },
-    }
-    response = requests.post(url, headers=headers, json=data)
-    print(f"{response.text=}")
-    out = response.json()
-    print(f"{out=}")
-    return out["text"]
-with gr.Blocks() as demo:
-    gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition.")
-    gr.Markdown("Upload an audio file or record your voice to transcribe it to text.")
-    gr.Markdown(
-        "First load may take a while to initialize the model, following requests will be faster."
-    )
-    with gr.Row():
-        audio_input = gr.Audio(
-            label="Audio", type="filepath", show_download_button=True
         )
-        text_output = gr.Textbox(label="Transcription")
-    transcribe_local_button = gr.Button(
-        "Transcribe with Transformers", variant="primary"
-    )
-    transcribe_button = gr.Button("Transcribe with Inference API", variant="secondary")
-    transcribe_local_button.click(
-        fn=transcribe_audio_local, inputs=[audio_input], outputs=[text_output]
-    )
-    transcribe_button.click(
-        fn=transcribe_audio, inputs=[audio_input], outputs=[text_output]
-    )
-    gr.Examples(
-        [
-            ["./examples/audio1.mp3"],
-            ["./examples/audio2.mp3"],
-        ],
-        inputs=[audio_input],
-        outputs=[text_output],
-        fn=transcribe_audio_local,
-        cache_examples=True,
-        cache_mode="lazy",
-        run_on_click=True,
-    )
-    gr.Markdown(
-        f"Current model: {model_id}. For more information, visit the [model hub](https://huggingface.co/{model_id})."
-    )
 if __name__ == "__main__":
-    demo.launch()

+import spaces
 import gradio as gr
+import logging
+from pathlib import Path
 import base64
+from model import (
+    MODEL_ID as WHISPER_MODEL_ID,
+    PHI_MODEL_ID,
+    transcribe_audio_local,
+    transcribe_audio_phi,
+    preload_models,
+)
+# Set up logging: INFO level, timestamped records tagged with logger name and level
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+# Constants
+# Directory holding the bundled example recordings (relative to the working directory)
+EXAMPLES_DIR = Path("./examples")
+# Model ID -> display label. The keys populate the model dropdown; the labels
+# are only used in the "Model Details" section of the UI.
+MODEL_CHOICES = {WHISPER_MODEL_ID: "Whisper Model", PHI_MODEL_ID: "Phi-4 Model"}
+# Example rows as (audio path, model ID), in the same order as the
+# [audio_input, model_choice] inputs they are fed into.
+EXAMPLE_FILES = [
+    (str(EXAMPLES_DIR / "audio1.mp3"), WHISPER_MODEL_ID),
+    (str(EXAMPLES_DIR / "audio2.mp3"), WHISPER_MODEL_ID),
+]
 def read_file_as_base64(file_path: str) -> str:
+    """
+    Return the raw bytes of a file as a base64 text string.
+    Args:
+        file_path: Path to the file to read
+    Returns:
+        Base64 encoded string of file contents
+    Raises:
+        Exception: Any error raised while opening, reading or encoding the
+            file is logged and then re-raised unchanged.
+    """
+    try:
+        with open(file_path, "rb") as handle:
+            raw_bytes = handle.read()
+        encoded = base64.b64encode(raw_bytes)
+        return encoded.decode()
+    except Exception as err:
+        logger.error(f"Failed to read file {file_path}: {str(err)}")
+        raise
+def combined_transcription(audio: str, model_choice: str) -> str:
+    """
+    Transcribe audio using the selected model.
+    Failures are raised as ``gr.Error`` rather than returned as text, so an
+    error message can never be mistaken for (or stored as) a transcription.
+    Args:
+        audio: Path to audio file
+        model_choice: Full model ID to use for transcription
+    Returns:
+        Transcription text
+    Raises:
+        gr.Error: If no audio was provided, the model ID is unknown, or the
+            selected backend fails.
+    """
+    if not audio:
+        raise gr.Error("Please provide an audio file to transcribe.")
+    # Dispatch table keeps the model-ID -> backend mapping in one place.
+    backends = {
+        PHI_MODEL_ID: transcribe_audio_phi,
+        WHISPER_MODEL_ID: transcribe_audio_local,
+    }
+    transcribe = backends.get(model_choice)
+    if transcribe is None:
+        logger.error(f"Unknown model choice: {model_choice}")
+        raise gr.Error(f"Error: Unknown model {model_choice}")
+    try:
+        return transcribe(audio)
+    except Exception as e:
+        # logger.exception records the traceback alongside the message.
+        logger.exception(f"Transcription failed: {str(e)}")
+        raise gr.Error(f"Error during transcription: {str(e)}") from e
+def create_demo() -> gr.Blocks:
+    """
+    Create and configure the Gradio demo interface.
+    The page has an audio input next to a model selector and output box, a
+    Transcribe and a Clear button, a set of cached examples, and a collapsed
+    section linking to each model.
+    Returns:
+        The assembled ``gr.Blocks`` app (not yet launched)
+    """
+    with gr.Blocks(theme=gr.themes.Soft()) as demo:
+        gr.Markdown("# TWASR: Chinese (Taiwan) Automatic Speech Recognition")
+        gr.Markdown(
+            "Upload an audio file or record your voice to transcribe it to text."
+        )
+        gr.Markdown(
+            "⚠️ First load may take a while to initialize the model, following requests will be faster."
+        )
+        # One row: audio widget first, then a column with the model dropdown
+        # stacked above the transcription textbox.
+        with gr.Row():
+            # type="filepath" hands the handler a path string, which is what
+            # combined_transcription expects for its `audio` argument.
+            audio_input = gr.Audio(
+                label="Audio Input", type="filepath", show_download_button=True
+            )
+            with gr.Column():
+                # Choices are the raw model IDs; Whisper is preselected.
+                model_choice = gr.Dropdown(
+                    label="Select Model",
+                    choices=list(MODEL_CHOICES.keys()),
+                    value=WHISPER_MODEL_ID,
+                    info="Select the model for transcription",
+                )
+                text_output = gr.Textbox(label="Transcription Output", lines=5)
+        with gr.Row():
+            transcribe_button = gr.Button("🎯 Transcribe", variant="primary")
+            clear_button = gr.Button("🧹 Clear")
+        # Run the selected model on the current audio and show the result.
+        transcribe_button.click(
+            fn=combined_transcription,
+            inputs=[audio_input, model_choice],
+            outputs=[text_output],
+            show_progress=True,
+        )
+        # Reset the audio widget (None) and empty the textbox (""); the model
+        # selection is left as it is.
+        clear_button.click(
+            fn=lambda: (None, ""),
+            inputs=[],
+            outputs=[audio_input, text_output],
         )
+        # Each example row supplies both inputs (audio path, model ID) and is
+        # run through the same handler as the Transcribe button; outputs are
+        # cached with cache_mode="lazy".
+        gr.Examples(
+            EXAMPLE_FILES,
+            inputs=[audio_input, model_choice],
+            outputs=[text_output],
+            fn=combined_transcription,
+            cache_examples=True,
+            cache_mode="lazy",
+            run_on_click=True,
         )
+        gr.Markdown("### Model Information")
+        # Collapsed by default: one link per configured model, built from its ID.
+        with gr.Accordion("Model Details", open=False):
+            for model_id, model_name in MODEL_CHOICES.items():
+                gr.Markdown(
+                    f"**{model_name}:** [{model_id}](https://huggingface.co/{model_id})"
+                )
+    return demo
 if __name__ == "__main__":
+    # Preload models before starting the app to reduce cold start time.
+    # This runs only when the file is executed as a script, not on import.
+    logger.info("Preloading models to reduce cold start time")
+    preload_models()
+    # Build the UI after preloading, then serve it without a public share link.
+    demo = create_demo()
+    demo.launch(share=False)

model.py CHANGED Viewed

@@ -1,35 +1,205 @@
-from transformers import pipeline
-from accelerate import Accelerator
 import spaces
 import librosa
-model_id = "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW"
-pipe = None
-def load_model():
     global pipe
-    device = Accelerator().device
-    pipe = pipeline("automatic-speech-recognition", model=model_id, device=device)
 def get_gpu_duration(audio: str) -> int:
-    y, sr = librosa.load(audio)
-    duration = librosa.get_duration(y=y, sr=sr) / 60.0
-    gpu_duration = max(1.0, (duration + 59.0) // 60.0) * 60.0
-    print(f"{duration=}, {gpu_duration=}")
-    return int(gpu_duration)
 @spaces.GPU(duration=get_gpu_duration)
 def transcribe_audio_local(audio: str) -> str:
-    print(f"{audio=}")
-    if pipe is None:
-        load_model()
-    out = pipe(audio, return_timestamps=True)
-    print(f"{out=}")
-    return out["text"]

 import spaces
+from typing import Optional
+import logging
+import time
+import threading
+import torch
 import librosa
+from transformers import pipeline, AutoProcessor, AutoModelForCausalLM, Pipeline
+from accelerate import Accelerator
+# Set up logging: INFO level, timestamped records tagged with logger name and level
+logging.basicConfig(
+    level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+)
+logger = logging.getLogger(__name__)
+# Model constants (Hugging Face Hub repository IDs)
+MODEL_ID = "JacobLinCool/whisper-large-v3-turbo-common_voice_19_0-zh-TW"
+PHI_MODEL_ID = "JacobLinCool/Phi-4-multimodal-instruct-commonvoice-zh-tw"
+# Evaluated once at import time; when true, the Phi-4 loader selects bfloat16
+# weights and the "flash_attention_2" attention implementation.
+USE_FA = torch.cuda.is_available()  # Use Flash Attention if CUDA is available
+# Model instances (initialized lazily)
+# All three stay None until the corresponding load_* function has run.
+pipe: Optional[Pipeline] = None
+phi_model = None
+phi_processor = None
+# Lock for thread-safe model loading
+model_loading_lock = threading.Lock()
+def load_model() -> None:
+    """
+    Load the Whisper model for transcription.
+    The pipeline is placed on the device reported by ``Accelerator().device``
+    and stored in the module-level ``pipe``. Calling this again once the
+    pipeline exists is a no-op. First-time loading is serialized with
+    ``model_loading_lock`` so concurrent callers build the pipeline only once.
+    Raises:
+        Exception: Any error from building the pipeline is logged and re-raised.
+    """
     global pipe
+    if pipe is not None:
+        return  # Fast path: already loaded, no locking needed
+    with model_loading_lock:
+        # Re-check under the lock: another thread may have finished loading
+        # while this one was waiting to acquire it.
+        if pipe is not None:
+            return
+        try:
+            start_time = time.time()
+            logger.info(f"Loading Whisper model {MODEL_ID}...")
+            device = Accelerator().device
+            pipe = pipeline(
+                "automatic-speech-recognition", model=MODEL_ID, device=device
+            )
+            logger.info(
+                f"Model loaded successfully in {time.time() - start_time:.2f} seconds"
+            )
+        except Exception as e:
+            logger.error(f"Failed to load Whisper model: {str(e)}")
+            raise
 def get_gpu_duration(audio: str) -> int:
+    """
+    Calculate required GPU allocation time based on audio duration.
+    Args:
+        audio: Path to audio file
+    Returns:
+        GPU allocation time in seconds: a multiple of 60, never less than 60.
+        Falls back to 60 if the audio cannot be read.
+    """
+    try:
+        y, sr = librosa.load(audio)
+        # librosa reports the length in seconds; dividing by 60 gives minutes.
+        duration = librosa.get_duration(y=y, sr=sr) / 60.0
+        # NOTE(review): `duration` is in minutes here, yet it is rounded up to
+        # the next multiple of 60, so every clip shorter than 61 minutes gets
+        # 60. The arithmetic is kept as-is; confirm whether this is a
+        # deliberate estimate or a seconds/minutes mix-up.
+        gpu_duration = max(1.0, (duration + 59.0) // 60.0) * 60.0
+        # The allocated value is returned as seconds, so it is logged as seconds.
+        logger.info(
+            f"Audio duration: {duration:.2f} min, Allocated GPU time: {gpu_duration:.0f} s"
+        )
+        return int(gpu_duration)
+    except Exception as e:
+        logger.error(f"Failed to calculate GPU duration: {str(e)}")
+        return 60  # Default to 1 minute if calculation fails
 @spaces.GPU(duration=get_gpu_duration)
 def transcribe_audio_local(audio: str) -> str:
+    """
+    Run the Whisper pipeline on one audio file and return its text.
+    Args:
+        audio: Path to audio file
+    Returns:
+        Transcribed text, or "No transcription generated" when the pipeline
+        output has no "text" entry
+    Raises:
+        Exception: Any loading or inference error is logged and re-raised.
+    """
+    try:
+        logger.info(f"Transcribing audio with Whisper: {audio}")
+        # load_model() returns immediately when the pipeline already exists,
+        # so it can be called on every request.
+        load_model()
+        result = pipe(audio, return_timestamps=True)
+        text = result.get("text", "No transcription generated")
+    except Exception as err:
+        logger.error(f"Whisper transcription error: {str(err)}")
+        raise
+    return text
+def load_phi_model() -> None:
+    """
+    Populate the module-level Phi-4 processor and model.
+    Does nothing when both are already set. The model is moved to "cuda" when
+    CUDA is available, otherwise "cpu"; ``USE_FA`` selects bfloat16 with
+    "flash_attention_2", otherwise float32 with "sdpa".
+    Raises:
+        Exception: Any loading error is logged and re-raised.
+    """
+    global phi_model, phi_processor
+    already_loaded = phi_model is not None and phi_processor is not None
+    if already_loaded:
+        return
+    try:
+        started_at = time.time()
+        logger.info(f"Loading Phi-4 model {PHI_MODEL_ID}...")
+        phi_processor = AutoProcessor.from_pretrained(
+            PHI_MODEL_ID, trust_remote_code=True
+        )
+        if torch.cuda.is_available():
+            target_device = "cuda"
+        else:
+            target_device = "cpu"
+        if USE_FA:
+            weights_dtype, attention_backend = torch.bfloat16, "flash_attention_2"
+        else:
+            weights_dtype, attention_backend = torch.float32, "sdpa"
+        loaded = AutoModelForCausalLM.from_pretrained(
+            PHI_MODEL_ID,
+            torch_dtype=weights_dtype,
+            _attn_implementation=attention_backend,
+            trust_remote_code=True,
+        )
+        phi_model = loaded.to(target_device)
+        logger.info(
+            f"Phi-4 model loaded successfully in {time.time() - started_at:.2f} seconds"
+        )
+    except Exception as err:
+        logger.error(f"Failed to load Phi-4 model: {str(err)}")
+        raise
+@spaces.GPU(duration=get_gpu_duration)
+def transcribe_audio_phi(audio: str) -> str:
+    """
+    Transcribe audio using the Phi-4 model.
+    Decorated with ``spaces.GPU`` using the same duration callback as the
+    Whisper path, so both backends request GPU time the same way.
+    Args:
+        audio: Path to audio file
+    Returns:
+        Transcribed text (only the newly generated tokens, prompt excluded)
+    Raises:
+        Exception: Any loading, preprocessing or generation error is logged
+            and re-raised.
+    """
+    try:
+        logger.info(f"Transcribing audio with Phi-4: {audio}")
+        load_phi_model()
+        # Load and resample audio to 16kHz
+        y, sr = librosa.load(audio, sr=16000)
+        # Prepare the user message and generate the prompt
+        user_message = {
+            "role": "user",
+            "content": "<|audio_1|> Transcribe the audio clip into text.",
+        }
+        prompt = phi_processor.tokenizer.apply_chat_template(
+            [user_message], tokenize=False, add_generation_prompt=True
+        )
+        # Build inputs for the model
+        inputs = phi_processor(text=prompt, audios=[(y, sr)], return_tensors="pt")
+        # Move tensor-like entries to the model's device; leave the rest as-is.
+        inputs = {
+            k: v.to(phi_model.device) if hasattr(v, "to") else v
+            for k, v in inputs.items()
+        }
+        # Generate transcription without gradients
+        with torch.no_grad():
+            generated_ids = phi_model.generate(
+                **inputs,
+                eos_token_id=phi_processor.tokenizer.eos_token_id,
+                max_new_tokens=256,  # Upper bound on the transcription length
+                do_sample=False,
+            )
+        # Decode only the tokens generated after the prompt
+        transcription = phi_processor.decode(
+            generated_ids[0, inputs["input_ids"].shape[1] :],
+            skip_special_tokens=True,
+            clean_up_tokenization_spaces=False,
+        )
+        logger.info("Phi-4 transcription completed successfully")
+        return transcription
+    except Exception as e:
+        logger.error(f"Phi-4 transcription error: {str(e)}")
+        raise
+def preload_models() -> None:
+    """
+    Preload models into memory to reduce cold start time.
+    This function can be called at application startup. Preloading is best
+    effort: a failure is logged and never raised, and each model is attempted
+    independently so one failing loader does not skip the other.
+    """
+    logger.info("Preloading models to reduce cold start time")
+    all_loaded = True
+    # Whisper first as it's the default model, then Phi.
+    for loader in (load_model, load_phi_model):
+        try:
+            loader()
+        except Exception as e:
+            all_loaded = False
+            logger.error(f"Error during model preloading: {str(e)}")
+    if all_loaded:
+        logger.info("All models preloaded successfully")

requirements.txt CHANGED Viewed

@@ -1,6 +1,7 @@
-gradio==5.4.0
-huggingface_hub==0.26.2
 transformers
 accelerate
 spaces
 librosa

+gradio==5.20.1
+huggingface_hub
 transformers
 accelerate
 spaces
 librosa
+flash-attn