dr3mro committed on
Commit
9367072
·
unverified ·
1 Parent(s): 4f24fba

Add application file

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import gradio as gr
3
+ from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
+
5
# Select the compute device and a matching dtype once at import time:
# fp16 on CUDA for speed/memory, fp32 on CPU for compatibility.
cuda_ready = torch.cuda.is_available()
device = "cuda:0" if cuda_ready else "cpu"
torch_dtype = torch.float16 if cuda_ready else torch.float32
8
+
9
# Load the Whisper checkpoint and its companion processor from the Hub.
model_id = "openai/whisper-large-v3"

model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    low_cpu_mem_usage=True,
    use_safetensors=True,
)
# nn.Module.to() moves in place and returns the same object, so this is
# equivalent to chaining `.to(device)` on the constructor call.
model.to(device)

processor = AutoProcessor.from_pretrained(model_id)
17
+
18
# Assemble the ASR pipeline from the already-loaded model and processor
# so the checkpoint is not downloaded a second time.
pipe = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
27
+
28
def transcribe(audio_file):
    """Transcribe the audio at *audio_file* (a filesystem path) to text.

    Returns an error string when no audio path was supplied, otherwise the
    transcription produced by the Whisper pipeline.
    """
    # Guard clause: Gradio hands us None/"" when nothing was recorded or uploaded.
    if not audio_file:
        return "Error: No audio provided."
    # The pipeline returns a dict; the transcription lives under "text".
    return pipe(audio_file)["text"]
36
+
37
# Wire the transcription function into a minimal Gradio UI.
audio_input = gr.Audio(
    sources=["microphone", "upload"],
    type="filepath",
    label="Record or Upload WAV Audio",
)

demo = gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=gr.Textbox(),
    title="Whisper ASR (Speech-to-Text)",
    description="Transcribe spoken words into text using OpenAI Whisper Large V3. Supports WAV format.",
    live=True,
)

# Start the Gradio server.
demo.launch()