Update app.py

app.py CHANGED
@@ -2,17 +2,23 @@ import gradio as gr
 import torch
 from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
 import numpy as np
+import os
 
-
-
+os.environ["OMP_NUM_THREADS"] = "2"
+os.environ["MKL_NUM_THREADS"] = "2"
+torch.set_num_threads(2)
 
-
+device = "cpu"
+torch_dtype = torch.float32
+
+model_id = "openai/whisper-tiny"
 
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
     model_id,
-
+    dtype=torch_dtype,
     low_cpu_mem_usage=True,
-    use_safetensors=True
+    use_safetensors=True,
+    attn_implementation="sdpa"
 )
 model.to(device)
 
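The thread caps are the heart of the CPU tuning: OMP_NUM_THREADS and MKL_NUM_THREADS bound the OpenMP and MKL worker pools, and torch.set_num_threads(2) matches PyTorch's intra-op pool to the Space's 2 cores. One caveat: the environment variables are generally read when the native runtimes initialize, so setting them before importing torch is the safer ordering. A minimal sanity check, as a sketch rather than part of this commit:

import os
os.environ["OMP_NUM_THREADS"] = "2"   # set before importing torch so OpenMP picks it up
os.environ["MKL_NUM_THREADS"] = "2"

import torch
torch.set_num_threads(2)              # caps intra-op parallelism

print(torch.get_num_threads())          # expect 2
print(torch.get_num_interop_threads())  # inter-op pool is configured separately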
@@ -23,10 +29,9 @@ pipe = pipeline(
     model=model,
     tokenizer=processor.tokenizer,
     feature_extractor=processor.feature_extractor,
-
+    dtype=torch_dtype,
     device=device,
-
-    batch_size=8
+    ignore_warning=True
 )
 
 def transcribe_audio(audio_file, task="transcribe", language="auto", return_timestamps=False):
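Dropping batch_size=8 and passing the float32 dtype keeps the pipeline on the single-request CPU path; ignore_warning=True appears to suppress the ASR pipeline's long-form/chunking notice. A quick smoke test outside Gradio, where "sample.wav" is a placeholder file name rather than anything from this repo:

# Sketch: exercise the pipeline directly; "sample.wav" is a placeholder path.
result = pipe("sample.wav", generate_kwargs={"task": "transcribe", "num_beams": 1})
print(result["text"])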
@@ -34,28 +39,36 @@ def transcribe_audio(audio_file, task="transcribe", language="auto", return_timestamps=False):
         return "No audio file provided."
 
     try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with torch.inference_mode():
+            generate_kwargs = {
+                "task": task,
+                "language": None if language == "auto" else language,
+                "num_beams": 1,
+                "do_sample": False,
+                "temperature": 0.0,
+                "max_new_tokens": 448,
+                "compression_ratio_threshold": 1.35,
+                "logprob_threshold": -1.0,
+                "no_speech_threshold": 0.6,
+            }
+
+            if task == "translate":
+                generate_kwargs["task"] = "translate"
+
+            result = pipe(
+                audio_file,
+                return_timestamps=return_timestamps,
+                generate_kwargs=generate_kwargs
+            )
+
+            if return_timestamps and "chunks" in result:
+                formatted_result = []
+                for chunk in result["chunks"]:
+                    timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
+                    formatted_result.append(f"{timestamp} {chunk['text']}")
+                return "\n".join(formatted_result)
+            else:
+                return result["text"]
 
     except Exception as e:
         return f"Error processing audio: {str(e)}"
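A few notes on the new generate_kwargs: num_beams=1 with do_sample=False is plain greedy decoding (temperature=0.0 is then effectively moot), and the three thresholds mirror Whisper's standard fallback heuristics. The `if task == "translate"` branch is redundant, since "task": task already carries that value. Also, the pipeline can return None as the end timestamp of the final chunk, which the bare f"{...:.2f}" would raise on, and both handlers duplicate the formatting loop verbatim. A shared helper would address both points; this is a hypothetical sketch, not code from the commit:

# Hypothetical shared helper: formats pipeline chunks, tolerating a None
# end timestamp on the trailing chunk.
def format_chunks(result):
    lines = []
    for chunk in result["chunks"]:
        start, end = chunk["timestamp"]
        end_str = f"{end:.2f}s" if end is not None else "?"
        lines.append(f"[{start:.2f}s - {end_str}] {chunk['text']}")
    return "\n".join(lines)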
@@ -69,28 +82,36 @@ def transcribe_microphone(audio_data, task="transcribe", language="auto", return_timestamps=False):
         audio_array = audio_array.astype(np.float32)
         audio_array = audio_array / np.max(np.abs(audio_array))
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        with torch.inference_mode():
+            generate_kwargs = {
+                "task": task,
+                "language": None if language == "auto" else language,
+                "num_beams": 1,
+                "do_sample": False,
+                "temperature": 0.0,
+                "max_new_tokens": 448,
+                "compression_ratio_threshold": 1.35,
+                "logprob_threshold": -1.0,
+                "no_speech_threshold": 0.6,
+            }
+
+            if task == "translate":
+                generate_kwargs["task"] = "translate"
+
+            result = pipe(
+                {"array": audio_array, "sampling_rate": sample_rate},
+                return_timestamps=return_timestamps,
+                generate_kwargs=generate_kwargs
+            )
+
+            if return_timestamps and "chunks" in result:
+                formatted_result = []
+                for chunk in result["chunks"]:
+                    timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
+                    formatted_result.append(f"{timestamp} {chunk['text']}")
+                return "\n".join(formatted_result)
+            else:
+                return result["text"]
 
     except Exception as e:
         return f"Error processing audio: {str(e)}"
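One hazard in the microphone path kept by this hunk: it normalizes by peak amplitude, but np.max(np.abs(audio_array)) is 0.0 for a silent buffer, so the division emits NaNs and a confusing downstream error. A small guard, sketched here rather than taken from the commit:

# Sketch: skip normalization when the recording is silent.
peak = np.max(np.abs(audio_array))
if peak > 0:
    audio_array = audio_array / peak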
@@ -135,9 +156,9 @@ languages = [
     ("Latin", "la"),
 ]
 
-with gr.Blocks(title="Whisper Large V3 Turbo - Speech to Text") as demo:
-    gr.Markdown("# 🎤 Whisper Large V3 Turbo - Speech to Text")
-    gr.Markdown("Upload an audio file or record directly to get
+with gr.Blocks(title="Whisper Tiny - Speech to Text") as demo:
+    gr.Markdown("# 🎤 Whisper Tiny - Speech to Text")
+    gr.Markdown("Upload an audio file or record directly to get fast transcription using OpenAI's Whisper Tiny model (39M parameters).")
 
     with gr.Tab("Upload Audio File"):
         with gr.Row():
@@ -220,12 +241,12 @@ with gr.Blocks(title="Whisper Large V3 Turbo - Speech to Text") as demo:
     )
 
    gr.Markdown("### Features:")
-    gr.Markdown("- **
+    gr.Markdown("- **Lightweight**: Powered by Whisper Tiny model (39M parameters)")
     gr.Markdown("- **CPU Optimized**: Optimized for 2-core CPU with 16GB RAM")
     gr.Markdown("- **Multi-language**: Supports 99+ languages")
     gr.Markdown("- **Translation**: Can translate speech to English")
     gr.Markdown("- **Timestamps**: Optional word-level or sentence-level timestamps")
-    gr.Markdown("- **
+    gr.Markdown("- **Fast Processing**: Smallest Whisper model for maximum speed")
 
 if __name__ == "__main__":
     demo.launch(
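The demo.launch( call is cut off in this view, so its arguments are not shown. For orientation only, a typical launch for a CPU Space might look like the following; these values are assumptions, not the commit's:

# Assumed configuration; the commit's actual launch arguments are truncated above.
demo.queue(max_size=8).launch(server_name="0.0.0.0", server_port=7860)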