Spaces:

balaharan
/

claude

Sleeping

App Files Files Community

balaharan committed on Sep 2, 2025

Commit

e95056a

verified ·

1 Parent(s): a00d269

requirements.txt

Browse files

transformers>=4.45.0
torch>=2.0.0
torchaudio>=2.0.0
gradio>=4.0.0
soundfile>=0.12.0
accelerate>=0.21.0

Files changed (1) hide show

app.py +81 -101

app.py CHANGED Viewed

@@ -1,75 +1,92 @@
 import gradio as gr
 import torch
 import torchaudio
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
-import numpy as np
-# Global variables to store model and processor
 model = None
 processor = None
 device = None
 def load_model():
-    """Load the Granite Speech model and processor"""
     global model, processor, device
     try:
         device = "cuda" if torch.cuda.is_available() else "cpu"
         model_name = "ibm-granite/granite-speech-3.3-2b"
-        # Load processor and model
         processor = AutoProcessor.from_pretrained(model_name)
-        model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name).to(device)
-        return f"✅ Model loaded successfully on {device}"
     except Exception as e:
         return f"❌ Error loading model: {str(e)}"
-def transcribe_audio(audio_file, task_type="transcribe"):
-    """
-    Transcribe audio using Granite Speech model
-    Args:
-        audio_file: Audio file path from Gradio
-        task_type: "transcribe" or "translate"
-    """
     global model, processor, device
     if model is None or processor is None:
-        return "❌ Model not loaded. Please load the model first."
     try:
         # Load and preprocess audio
-        if audio_file is None:
-            return "❌ Please upload an audio file"
-        # Load audio file
         wav, sr = torchaudio.load(audio_file)
-        # Ensure mono and 16kHz
         if wav.shape[0] > 1:
-            wav = wav.mean(dim=0, keepdim=True)  # Convert to mono
         if sr != 16000:
             resampler = torchaudio.transforms.Resample(sr, 16000)
             wav = resampler(wav)
-        # Normalize audio
-        wav = torchaudio.functional.normalize_audio(wav)
-        # Create chat template
-        if task_type == "transcribe":
-            user_content = "<|audio|>can you transcribe the speech into a written format?"
-        else:  # translate
-            user_content = "<|audio|>can you translate this speech to English?"
         chat = [
             {
                 "role": "system",
-                "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant",
             },
             {
                 "role": "user",
-                "content": user_content,
             }
         ]
@@ -83,116 +100,79 @@ def transcribe_audio(audio_file, task_type="transcribe"):
         model_inputs = processor(
             text,
             wav,
-            device=device,
             return_tensors="pt",
         ).to(device)
-        # Generate transcription
         with torch.no_grad():
-            model_outputs = model.generate(
                 **model_inputs,
-                max_new_tokens=200,
-                num_beams=4,
                 do_sample=False,
-                min_length=1,
-                top_p=1.0,
-                repetition_penalty=1.0,
-                length_penalty=1.0,
                 temperature=1.0,
-                bos_token_id=tokenizer.bos_token_id,
-                eos_token_id=tokenizer.eos_token_id,
                 pad_token_id=tokenizer.pad_token_id,
             )
         # Decode output
         num_input_tokens = model_inputs["input_ids"].shape[-1]
-        new_tokens = model_outputs[0, num_input_tokens:].unsqueeze(0)
-        output_text = tokenizer.batch_decode(
-            new_tokens, add_special_tokens=False, skip_special_tokens=True
         )[0]
-        return f"✅ {task_type.capitalize()} Result:\n\n{output_text}"
     except Exception as e:
-        return f"❌ Error during {task_type}: {str(e)}"
-def create_interface():
-    """Create the Gradio interface"""
-    with gr.Blocks(title="Granite Speech 3.3-2B Demo", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🎤 IBM Granite Speech 3.3-2B Demo
-        This demo uses IBM's Granite Speech 3.3-2B model for automatic speech recognition (ASR) and speech translation.
-        **Supported Languages**: English, French, German, Spanish, Portuguese
-        **Features**:
-        - 📝 Speech-to-text transcription
-        - 🌍 Speech translation to English
-        - 🔄 Two-pass design for improved accuracy
         """)
         with gr.Row():
             with gr.Column():
-                # Model loading section
-                gr.Markdown("### 1. Load Model")
-                load_btn = gr.Button("🔄 Load Granite Speech Model", variant="primary")
-                load_status = gr.Textbox(label="Status", interactive=False)
-                # Audio input section
-                gr.Markdown("### 2. Upload Audio")
-                audio_input = gr.Audio(
                     label="Upload Audio File",
                     type="filepath",
                     format="wav"
                 )
-                # Task selection
-                task_choice = gr.Radio(
-                    choices=["transcribe", "translate"],
-                    value="transcribe",
-                    label="Task",
-                    info="Choose whether to transcribe or translate to English"
-                )
-                # Process button
-                process_btn = gr.Button("🎯 Process Audio", variant="secondary")
             with gr.Column():
-                # Output section
-                gr.Markdown("### 3. Results")
-                output_text = gr.Textbox(
-                    label="Output",
-                    lines=10,
-                    interactive=False,
-                    placeholder="Transcription or translation will appear here..."
                 )
-        # Example audio section
         gr.Markdown("""
-        ### 📋 Usage Tips:
-        - **Audio format**: Upload WAV, MP3, or other common audio formats
-        - **Quality**: Clear speech works best (16kHz recommended)
-        - **Length**: Keep audio clips reasonable in length for free tier
-        - **Languages**: Works with English, French, German, Spanish, Portuguese
         """)
         # Event handlers
-        load_btn.click(
-            fn=load_model,
-            outputs=load_status
-        )
-        process_btn.click(
-            fn=transcribe_audio,
-            inputs=[audio_input, task_choice],
-            outputs=output_text
-        )
     return demo
-# Create and launch the interface
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch()

 import gradio as gr
 import torch
 import torchaudio
+import warnings
+import os
+# Suppress warnings for cleaner output
 # No category or module filter is passed, so this ignores every Python
 # warning raised anywhere in the process, not just library noise.
+warnings.filterwarnings("ignore")
+# Global variables
 # Shared state between the UI callbacks: these start as None, load_model()
 # assigns them, and transcribe_audio() reads them.
 model = None
 processor = None
 device = None
 def load_model():
     """Load the Granite Speech processor and model into the module globals.

     CUDA is used when available (weights loaded as float16); otherwise the
     model is loaded on CPU as float32. ``transformers`` is imported inside
     the function so that a missing or broken installation is reported as a
     status message instead of failing when the module is imported.

     The globals ``model``, ``processor`` and ``device`` are assigned together
     and only after every step succeeded, so a failed attempt leaves them
     untouched. If a model is already loaded, the call returns immediately
     instead of loading the checkpoint a second time.

     Returns:
         A status string for the UI: a "✅" message on success, or a "❌"
         message for an import error, a CUDA out-of-memory error, or any
         other exception. No exception is propagated to the caller.
     """
     global model, processor, device
     # Repeated clicks on the load button must not reload the checkpoint.
     if model is not None and processor is not None:
         return f"✅ Model loaded successfully on {device}!"
     try:
         target_device = "cuda" if torch.cuda.is_available() else "cpu"
         print(f"Using device: {target_device}")
         # Imported here so that an ImportError is caught by the handler below.
         from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
         model_name = "ibm-granite/granite-speech-3.3-2b"
         print("Loading processor...")
         loaded_processor = AutoProcessor.from_pretrained(model_name)
         print("Loading model...")
         loaded_model = AutoModelForSpeechSeq2Seq.from_pretrained(
             model_name,
             # Half precision only on GPU; CPU stays in float32.
             torch_dtype=torch.float16 if target_device == "cuda" else torch.float32,
             low_cpu_mem_usage=True,
         ).to(target_device)
         # Inference only: switch to eval mode.
         loaded_model.eval()
     except ImportError as e:
         return f"❌ Import error: {str(e)}. Please check requirements.txt"
     except torch.cuda.OutOfMemoryError:
         return "❌ GPU out of memory. Try restarting the Space or use CPU."
     except Exception as e:
         return f"❌ Error loading model: {str(e)}"
     # Publish the shared state in one step, only after a complete load.
     model, processor, device = loaded_model, loaded_processor, target_device
     return f"✅ Model loaded successfully on {device}!"
+def transcribe_audio(audio_file):
+    """Simple transcription function"""
     # Takes the path in `audio_file`, runs it through the globally loaded
     # processor/model, and returns a display string: the decoded text
     # prefixed with "🎤 Transcription:", or a "❌" message when the model is
     # not loaded, no file was given, or any step raises.
     global model, processor, device
     # Both globals are None until load_model() has assigned them.
     if model is None or processor is None:
+        return "❌ Please load the model first by clicking 'Load Model' button."
+    if audio_file is None:
+        return "❌ Please upload an audio file."
     try:
         # Load and preprocess audio
         wav, sr = torchaudio.load(audio_file)
+        # Convert to mono if stereo
         # Averages over axis 0; assumes wav is (channels, samples) — the
         # length check on shape[1] below relies on the same layout.
         if wav.shape[0] > 1:
+            wav = wav.mean(dim=0, keepdim=True)
+        # Resample to 16kHz if needed
         if sr != 16000:
             resampler = torchaudio.transforms.Resample(sr, 16000)
             wav = resampler(wav)
+        # Limit audio length for free tier (30 seconds max)
+        max_length = 16000 * 30  # 30 seconds at 16kHz
         # Anything beyond the first 30 seconds is dropped, not transcribed.
+        if wav.shape[1] > max_length:
+            wav = wav[:, :max_length]
+            print("Audio truncated to 30 seconds for processing")
+        # Create simple chat template
         chat = [
             {
                 "role": "system",
+                "content": "You are Granite, developed by IBM. You are a helpful AI assistant.",
             },
             {
                 "role": "user",
+                "content": "<|audio|>Please transcribe this audio.",
             }
         ]
         # NOTE(review): `text` (passed below) and `tokenizer` (used in
         # generate() and batch_decode()) are not assigned anywhere in the
         # lines shown for this function. The diff view appears to hide
         # unchanged lines between the chat list and this call — confirm
         # against the full app.py before editing this function.
         model_inputs = processor(
             text,
             wav,
             return_tensors="pt",
+            sampling_rate=16000
         ).to(device)
+        # Generate with conservative settings
         # no_grad: inference only, no autograd bookkeeping.
         with torch.no_grad():
+            outputs = model.generate(
                 **model_inputs,
+                max_new_tokens=100,
+                num_beams=2,  # Reduced for speed
                 do_sample=False,
                 temperature=1.0,
                 pad_token_id=tokenizer.pad_token_id,
             )
         # Decode output
         # Slices off the first num_input_tokens positions of the first
         # sequence; presumably generate() echoes the prompt before the new
         # tokens — TODO confirm for this model.
         num_input_tokens = model_inputs["input_ids"].shape[-1]
+        new_tokens = outputs[0, num_input_tokens:].unsqueeze(0)
+        transcription = tokenizer.batch_decode(
+            new_tokens, skip_special_tokens=True
         )[0]
+        return f"🎤 Transcription:\n\n{transcription}"
     # Broad catch: any failure is returned as text instead of being raised.
     except Exception as e:
+        return f"❌ Error during transcription: {str(e)}"
+# Create Gradio interface
+def create_demo():
+    with gr.Blocks(title="Granite Speech Demo", theme=gr.themes.Soft()) as demo:
         gr.Markdown("""
         # 🎤 IBM Granite Speech 3.3-2B Demo
+        Upload an audio file to transcribe speech to text.
+        **Supported**: English, French, German, Spanish, Portuguese
         """)
         with gr.Row():
             with gr.Column():
+                # Model loading
+                load_btn = gr.Button("🔄 Load Model", variant="primary", size="lg")
+                status = gr.Textbox(label="Status", interactive=False)
+                # Audio input
+                audio = gr.Audio(
                     label="Upload Audio File",
                     type="filepath",
                     format="wav"
                 )
+                transcribe_btn = gr.Button("🎯 Transcribe", variant="secondary")
             with gr.Column():
+                output = gr.Textbox(
+                    label="Transcription Result",
+                    lines=8,
+                    interactive=False
                 )
         gr.Markdown("""
+        ### 💡 Tips:
+        - Keep audio files under 30 seconds for free tier
+        - Clear speech works best
+        - WAV format recommended
         """)
         # Event handlers
+        load_btn.click(load_model, outputs=status)
+        transcribe_btn.click(transcribe_audio, inputs=audio, outputs=output)
     return demo
 # Build and launch the UI only when this file is executed as a script.
 if __name__ == "__main__":
+    demo = create_demo()
     demo.launch()