Spaces:

oddadmix
/

egyptian-code-swtiching

Running on Zero

App Files Files Community

oddadmix committed on Nov 7, 2025

Commit

1cf51f9

verified ·

1 Parent(s): 20554b7

Update app.py

Browse files

Files changed (1) hide show

app.py +167 -37

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import gradio as gr
 from transformers import AutoProcessor, Gemma3nForConditionalGeneration
 import torch
 import os
 # Global variables for model and processor
 model = None
@@ -13,8 +14,7 @@ def load_model():
     global model, processor
     print("Loading model...")
-    model_id = "oddadmix/gemma-4b-egyptian-code-switching-b4-g2-merged"
     model = Gemma3nForConditionalGeneration.from_pretrained(
         model_id,
@@ -81,6 +81,85 @@ def transcribe_audio(audio_path, max_tokens=128):
     except Exception as e:
         return f"Error during transcription: {str(e)}"
 # Load model at startup
 load_model()
@@ -90,33 +169,97 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
         """
         # 🎙️ Egyptian Code Switching Audio Transcription
-        Upload an audio file or record your voice to get an automatic transcription.
         Specialized for Egyptian Arabic with English code-switching.
         """
     )
-    with gr.Row():
-        with gr.Column():
-            audio_input = gr.Audio(
-                sources=["upload", "microphone"],
-                type="filepath",
-                label="Audio Input"
             )
-            max_tokens_slider = gr.Slider(
-                minimum=32,
-                maximum=512,
-                value=128,
-                step=32,
-                label="Max Output Tokens"
             )
-            transcribe_btn = gr.Button("Transcribe", variant="primary")
-        with gr.Column():
-            output_text = gr.Textbox(
-                label="Transcription",
-                placeholder="Your transcription will appear here...",
-                lines=10,
-                rtl=True
             )
     gr.Markdown(
@@ -124,23 +267,10 @@ with gr.Blocks(title="Egyptian Code Switching Audio Transcription") as demo:
         ### Tips:
         - For best results, use clear audio with minimal background noise
         - The model specializes in Egyptian Arabic with English code-switching
-        - Recording length should be reasonable (under 30 seconds recommended)
         """
     )
-    # Set up the transcription action
-    transcribe_btn.click(
-        fn=transcribe_audio,
-        inputs=[audio_input, max_tokens_slider],
-        outputs=output_text
-    )
-    # Also allow transcription on audio upload/record
-    audio_input.change(
-        fn=transcribe_audio,
-        inputs=[audio_input, max_tokens_slider],
-        outputs=output_text
-    )
 # Launch the app
 if __name__ == "__main__":

 from transformers import AutoProcessor, Gemma3nForConditionalGeneration
 import torch
 import os
+import numpy as np
 # Global variables for model and processor
 model = None
     global model, processor
     print("Loading model...")
+    model_id = "oddadmix/egyptian-code-switching-b4-g2-merged"
     model = Gemma3nForConditionalGeneration.from_pretrained(
         model_id,
     except Exception as e:
         return f"Error during transcription: {str(e)}"
+@spaces.GPU
+def live_transcribe(audio_stream, max_tokens=128):
+    """Transcribe audio stream in real-time"""
+    if model is None or processor is None:
+        yield "Error: Model not loaded"
+        return
+    if audio_stream is None:
+        yield "Waiting for audio input..."
+        return
+    try:
+        # Extract sample rate and audio data
+        sample_rate, audio_data = audio_stream
+        # Check if we have enough audio data (at least 1 second)
+        if len(audio_data) < sample_rate:
+            yield "Recording... (speak now)"
+            return
+        # Save temporary audio file
+        import tempfile
+        import soundfile as sf
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            tmp_path = tmp_file.name
+            sf.write(tmp_path, audio_data, sample_rate)
+        try:
+            messages = [
+                {
+                    "role": "system",
+                    "content": [
+                        {
+                            "type": "text",
+                            "text": "You are an assistant that transcribes speech accurately.",
+                        }
+                    ],
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "audio", "url": tmp_path},
+                        {"type": "text", "text": "Please transcribe this audio."}
+                    ]
+                }
+            ]
+            inputs = processor.apply_chat_template(
+                messages,
+                add_generation_prompt=True,
+                tokenize=True,
+                return_dict=True,
+                return_tensors="pt",
+            ).to(model.device)
+            input_len = inputs["input_ids"].shape[-1]
+            # Generate transcription
+            with torch.inference_mode():
+                generation = model.generate(
+                    **inputs,
+                    max_new_tokens=max_tokens,
+                    do_sample=False
+                )
+                generation = generation[0][input_len:]
+            response = processor.decode(generation, skip_special_tokens=True)
+            yield response
+        finally:
+            # Clean up temporary file
+            if os.path.exists(tmp_path):
+                os.unlink(tmp_path)
+    except Exception as e:
+        yield f"Error during transcription: {str(e)}"
 # Load model at startup
 load_model()
         """
         # 🎙️ Egyptian Code Switching Audio Transcription
+        Choose between live transcription or file upload for automatic transcription.
         Specialized for Egyptian Arabic with English code-switching.
         """
     )
+    with gr.Tabs():
+        # Live Transcription Tab
+        with gr.Tab("Live Transcription"):
+            gr.Markdown(
+                """
+                ### 🔴 Live Transcription Mode
+                Click the microphone button below and start speaking. The transcription will update in real-time.
+                """
             )
+            with gr.Row():
+                with gr.Column():
+                    live_audio = gr.Audio(
+                        sources=["microphone"],
+                        type="numpy",
+                        label="Live Audio Input",
+                        streaming=True
+                    )
+                    live_max_tokens = gr.Slider(
+                        minimum=32,
+                        maximum=512,
+                        value=128,
+                        step=32,
+                        label="Max Output Tokens"
+                    )
+                with gr.Column():
+                    live_output = gr.Textbox(
+                        label="Live Transcription",
+                        placeholder="Start speaking and transcription will appear here...",
+                        lines=10,
+                        rtl=True
+                    )
+            # Set up live transcription
+            live_audio.stream(
+                fn=live_transcribe,
+                inputs=[live_audio, live_max_tokens],
+                outputs=live_output
+            )
+        # File Upload Tab
+        with gr.Tab("File Upload"):
+            gr.Markdown(
+                """
+                ### 📁 File Upload Mode
+                Upload an audio file or record your voice to get a transcription.
+                """
             )
+            with gr.Row():
+                with gr.Column():
+                    audio_input = gr.Audio(
+                        sources=["upload", "microphone"],
+                        type="filepath",
+                        label="Audio Input"
+                    )
+                    max_tokens_slider = gr.Slider(
+                        minimum=32,
+                        maximum=512,
+                        value=128,
+                        step=32,
+                        label="Max Output Tokens"
+                    )
+                    transcribe_btn = gr.Button("Transcribe", variant="primary")
+                with gr.Column():
+                    output_text = gr.Textbox(
+                        label="Transcription",
+                        placeholder="Your transcription will appear here...",
+                        lines=10,
+                        rtl=True
+                    )
+            # Set up the transcription action
+            transcribe_btn.click(
+                fn=transcribe_audio,
+                inputs=[audio_input, max_tokens_slider],
+                outputs=output_text
+            )
+            # Also allow transcription on audio upload/record
+            audio_input.change(
+                fn=transcribe_audio,
+                inputs=[audio_input, max_tokens_slider],
+                outputs=output_text
             )
     gr.Markdown(
         ### Tips:
         - For best results, use clear audio with minimal background noise
         - The model specializes in Egyptian Arabic with English code-switching
+        - Live mode: Speak in short segments for better results
+        - File mode: Recording length should be reasonable (under 30 seconds recommended)
         """
     )
 # Launch the app
 if __name__ == "__main__":