NoLev committed on
Commit
7d6b2b0
·
verified ·
1 Parent(s): ab63b06

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +144 -0
app.py ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ from transformers import pipeline
3
+ import torch
4
+
5
# Pipelines that have already been loaded, keyed by model id, so a model is
# not reloaded on every request.
pipelines = dict()

# Selectable Whisper checkpoints, ordered from smallest/fastest to
# largest/most accurate:
#   tiny      ~39M params    fastest but least accurate
#   base      ~74M params    good balance
#   small     ~244M params   better accuracy
#   medium    ~769M params   high accuracy
#   large     ~1550M params  very high accuracy
#   large-v3  ~1550M params  latest with improvements
MODEL_OPTIONS = [
    f"openai/whisper-{size}"
    for size in ("tiny", "base", "small", "medium", "large", "large-v3")
]
17
+
18
+ # Function to get or load a pipeline for a given model
19
def get_pipeline(model_id):
    """Return the speech-recognition pipeline for ``model_id``, loading it on first use.

    Loaded pipelines are kept in the module-level ``pipelines`` dict, so asking
    for the same model again returns the stored object instead of reloading it.
    """
    if model_id in pipelines:
        return pipelines[model_id]

    print(f"Loading model: {model_id}...")  # Log for debugging in Spaces
    # Run on the GPU when CUDA is available, otherwise on the CPU.
    target_device = "cuda" if torch.cuda.is_available() else "cpu"
    loaded = pipeline(
        "automatic-speech-recognition",
        model=model_id,
        device=target_device,
    )
    pipelines[model_id] = loaded
    return loaded
28
+
29
+ # Transcription function with chunking for long audio
30
def transcribe_speech(audio_file, model_id, language="english", return_timestamps=False):
    """Transcribe an audio file with the selected Whisper model.

    Args:
        audio_file: Path of the audio file to transcribe. ``None`` or an empty
            value means nothing was uploaded.
        model_id: Model identifier handed to ``get_pipeline``.
        language: Language name forwarded to generation.
        return_timestamps: When true, return one ``[start - end] text`` line
            per chunk instead of the plain transcript.

    Returns:
        The transcript as a string, or a prompt asking for an upload when no
        audio file was supplied.
    """
    # Guard clause: covers both "nothing uploaded" (None) and an empty path,
    # and avoids loading a model when there is nothing to transcribe.
    if not audio_file:
        return "Please upload an audio file."

    pipe = get_pipeline(model_id)

    # The two smallest checkpoints are run with a larger batch.
    is_small_model = "tiny" in model_id or "base" in model_id

    output = pipe(
        audio_file,
        max_new_tokens=128,  # Cap on generated tokens per chunk
        # Timestamps are requested once, through the pipeline argument below,
        # rather than being duplicated inside generate_kwargs.
        generate_kwargs={"task": "transcribe", "language": language},
        chunk_length_s=30,  # Process long audio in 30 s windows
        stride_length_s=5,  # Overlap between neighbouring windows
        batch_size=16 if is_small_model else 8,
        return_timestamps=return_timestamps,
    )

    if not return_timestamps or "chunks" not in output:
        # Plain transcript; also the fallback when no chunks were returned.
        return output["text"].strip()

    lines = []
    for chunk in output["chunks"]:
        start, end = chunk["timestamp"]
        # A missing boundary is rendered with a placeholder instead of failing.
        start_label = "0.00s" if start is None else f"{start:.2f}s"
        end_label = "?.?s" if end is None else f"{end:.2f}s"
        # Strip the chunk text so the line has exactly one space after the tag.
        lines.append(f"[{start_label} - {end_label}] {chunk['text'].strip()}")
    return "\n".join(lines)
65
+
66
# Theme for the app: Gradio's Soft theme with blue/purple accents on a slate
# neutral palette and an Inter-first font stack.
theme = gr.themes.Soft(
    font=[
        gr.themes.GoogleFont("Inter"),
        "ui-sans-serif",
        "system-ui",
    ],
    neutral_hue="slate",
    secondary_hue="purple",
    primary_hue="blue",
)
73
+
74
with gr.Blocks(theme=theme, title="MP3 to Text Transcriber") as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎤 MP3 to Text Transcription Tool
        Upload an MP3 (or any audio file) and transcribe it to text using OpenAI's Whisper models.
        Supports long files up to hours—handles 45+ minutes effortlessly!
        Choose a model for speed vs. accuracy trade-off.
        """,
        elem_classes=["centered"],
    )

    with gr.Row(variant="panel", elem_classes=["max-w-4xl mx-auto"]):
        # Left column: upload and transcription options.
        with gr.Column(scale=1):
            audio_input = gr.Audio(
                sources="upload",
                type="filepath",  # The handler receives a path on disk
                label="📁 Upload Audio File (MP3/WAV/etc.)",
                elem_classes=["w-full"],
            )

            model_dropdown = gr.Dropdown(
                choices=MODEL_OPTIONS,
                value=MODEL_OPTIONS[1],  # Default to base
                label="🤖 Select Whisper Model",
                info="Tiny: Fastest | Large-v3: Most accurate (slower on CPU)",
                elem_classes=["w-full"],
            )

            language_dropdown = gr.Dropdown(
                # Common languages
                choices=[
                    "english",
                    "french",
                    "german",
                    "spanish",
                    "italian",
                    "portuguese",
                    "dutch",
                    "russian",
                    "swedish",
                    "chinese",
                    "japanese",
                    "korean",
                    "arabic",
                    "hindi",
                ],
                value="english",
                label="🌍 Language (for better accuracy)",
                elem_classes=["w-full"],
            )

            timestamps_checkbox = gr.Checkbox(
                label="⏰ Include Timestamps?",
                value=False,
                info="Adds [start - end] tags to the transcript.",
            )

            transcribe_btn = gr.Button(
                "🚀 Transcribe Audio",
                variant="primary",
                size="lg",
                elem_classes=["w-full"],
            )

        # Right column: status line and transcript.
        with gr.Column(scale=1):
            status_output = gr.Markdown("Ready to transcribe! 💬", elem_classes=["text-center"])

            transcript_output = gr.Textbox(
                label="📝 Transcript",
                lines=15,
                max_lines=20,
                placeholder="Your transcription will appear here...",
                elem_classes=["w-full", "bg-gray-50 dark:bg-gray-800"],
                show_copy_button=True,
            )

    # Event handlers
    def update_status(msg):
        """Return a Markdown component showing ``msg`` in bold."""
        return gr.Markdown(f"**{msg}**")

    transcribe_btn.click(
        fn=transcribe_speech,
        inputs=[audio_input, model_dropdown, language_dropdown, timestamps_checkbox],
        outputs=transcript_output,
        show_progress=True,  # Progress indicator for long transcriptions
    ).success(
        # .success() instead of .then(): the completion message must only be
        # shown when the transcription step finished without raising.
        fn=lambda: update_status("Transcription complete! 🎉"),
        outputs=status_output,
    )


if __name__ == "__main__":
    demo.launch()