nixaut-codelabs committed on
Commit
c1e3042
·
verified ·
1 Parent(s): d059e20

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +78 -57
app.py CHANGED
@@ -2,17 +2,23 @@ import gradio as gr
2
  import torch
3
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
  import numpy as np
 
5
 
6
- device = "cuda:0" if torch.cuda.is_available() else "cpu"
7
- torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 
8
 
9
- model_id = "openai/whisper-large-v3-turbo"
 
 
 
10
 
11
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
12
  model_id,
13
- torch_dtype=torch_dtype,
14
  low_cpu_mem_usage=True,
15
- use_safetensors=True
 
16
  )
17
  model.to(device)
18
 
@@ -23,10 +29,9 @@ pipe = pipeline(
23
  model=model,
24
  tokenizer=processor.tokenizer,
25
  feature_extractor=processor.feature_extractor,
26
- torch_dtype=torch_dtype,
27
  device=device,
28
- chunk_length_s=30,
29
- batch_size=8
30
  )
31
 
32
  def transcribe_audio(audio_file, task="transcribe", language="auto", return_timestamps=False):
@@ -34,28 +39,36 @@ def transcribe_audio(audio_file, task="transcribe", language="auto", return_time
34
  return "No audio file provided."
35
 
36
  try:
37
- generate_kwargs = {
38
- "task": task,
39
- "language": None if language == "auto" else language,
40
- }
41
-
42
- if task == "translate":
43
- generate_kwargs["task"] = "translate"
44
-
45
- result = pipe(
46
- audio_file,
47
- return_timestamps=return_timestamps,
48
- generate_kwargs=generate_kwargs
49
- )
50
-
51
- if return_timestamps and "chunks" in result:
52
- formatted_result = []
53
- for chunk in result["chunks"]:
54
- timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
55
- formatted_result.append(f"{timestamp} {chunk['text']}")
56
- return "\n".join(formatted_result)
57
- else:
58
- return result["text"]
 
 
 
 
 
 
 
 
59
 
60
  except Exception as e:
61
  return f"Error processing audio: {str(e)}"
@@ -69,28 +82,36 @@ def transcribe_microphone(audio_data, task="transcribe", language="auto", return
69
  audio_array = audio_array.astype(np.float32)
70
  audio_array = audio_array / np.max(np.abs(audio_array))
71
 
72
- generate_kwargs = {
73
- "task": task,
74
- "language": None if language == "auto" else language,
75
- }
76
-
77
- if task == "translate":
78
- generate_kwargs["task"] = "translate"
79
-
80
- result = pipe(
81
- {"array": audio_array, "sampling_rate": sample_rate},
82
- return_timestamps=return_timestamps,
83
- generate_kwargs=generate_kwargs
84
- )
85
-
86
- if return_timestamps and "chunks" in result:
87
- formatted_result = []
88
- for chunk in result["chunks"]:
89
- timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
90
- formatted_result.append(f"{timestamp} {chunk['text']}")
91
- return "\n".join(formatted_result)
92
- else:
93
- return result["text"]
 
 
 
 
 
 
 
 
94
 
95
  except Exception as e:
96
  return f"Error processing audio: {str(e)}"
@@ -135,9 +156,9 @@ languages = [
135
  ("Latin", "la"),
136
  ]
137
 
138
- with gr.Blocks(title="Whisper Large V3 Turbo - Speech to Text") as demo:
139
- gr.Markdown("# 🎤 Whisper Large V3 Turbo - Speech to Text")
140
- gr.Markdown("Upload an audio file or record directly to get high-quality transcription using OpenAI's Whisper Large V3 Turbo model.")
141
 
142
  with gr.Tab("Upload Audio File"):
143
  with gr.Row():
@@ -220,12 +241,12 @@ with gr.Blocks(title="Whisper Large V3 Turbo - Speech to Text") as demo:
220
  )
221
 
222
  gr.Markdown("### Features:")
223
- gr.Markdown("- **High Accuracy**: Powered by Whisper Large V3 Turbo model")
224
  gr.Markdown("- **CPU Optimized**: Optimized for 2-core CPU with 16GB RAM")
225
  gr.Markdown("- **Multi-language**: Supports 99+ languages")
226
  gr.Markdown("- **Translation**: Can translate speech to English")
227
  gr.Markdown("- **Timestamps**: Optional word-level or sentence-level timestamps")
228
- gr.Markdown("- **Memory Efficient**: Uses chunked processing for better performance")
229
 
230
  if __name__ == "__main__":
231
  demo.launch(
 
2
  import torch
3
  from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
4
  import numpy as np
5
+ import os
6
 
7
+ os.environ["OMP_NUM_THREADS"] = "2"
8
+ os.environ["MKL_NUM_THREADS"] = "2"
9
+ torch.set_num_threads(2)
10
 
11
+ device = "cpu"
12
+ torch_dtype = torch.float32
13
+
14
+ model_id = "openai/whisper-tiny"
15
 
16
  model = AutoModelForSpeechSeq2Seq.from_pretrained(
17
  model_id,
18
+ dtype=torch_dtype,
19
  low_cpu_mem_usage=True,
20
+ use_safetensors=True,
21
+ attn_implementation="sdpa"
22
  )
23
  model.to(device)
24
 
 
29
  model=model,
30
  tokenizer=processor.tokenizer,
31
  feature_extractor=processor.feature_extractor,
32
+ dtype=torch_dtype,
33
  device=device,
34
+ ignore_warning=True
 
35
  )
36
 
37
  def transcribe_audio(audio_file, task="transcribe", language="auto", return_timestamps=False):
 
39
  return "No audio file provided."
40
 
41
  try:
42
+ with torch.inference_mode():
43
+ generate_kwargs = {
44
+ "task": task,
45
+ "language": None if language == "auto" else language,
46
+ "num_beams": 1,
47
+ "do_sample": False,
48
+ "temperature": 0.0,
49
+ "max_new_tokens": 448,
50
+ "compression_ratio_threshold": 1.35,
51
+ "logprob_threshold": -1.0,
52
+ "no_speech_threshold": 0.6,
53
+ }
54
+
55
+ if task == "translate":
56
+ generate_kwargs["task"] = "translate"
57
+
58
+ result = pipe(
59
+ audio_file,
60
+ return_timestamps=return_timestamps,
61
+ generate_kwargs=generate_kwargs
62
+ )
63
+
64
+ if return_timestamps and "chunks" in result:
65
+ formatted_result = []
66
+ for chunk in result["chunks"]:
67
+ timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
68
+ formatted_result.append(f"{timestamp} {chunk['text']}")
69
+ return "\n".join(formatted_result)
70
+ else:
71
+ return result["text"]
72
 
73
  except Exception as e:
74
  return f"Error processing audio: {str(e)}"
 
82
  audio_array = audio_array.astype(np.float32)
83
  audio_array = audio_array / np.max(np.abs(audio_array))
84
 
85
+ with torch.inference_mode():
86
+ generate_kwargs = {
87
+ "task": task,
88
+ "language": None if language == "auto" else language,
89
+ "num_beams": 1,
90
+ "do_sample": False,
91
+ "temperature": 0.0,
92
+ "max_new_tokens": 448,
93
+ "compression_ratio_threshold": 1.35,
94
+ "logprob_threshold": -1.0,
95
+ "no_speech_threshold": 0.6,
96
+ }
97
+
98
+ if task == "translate":
99
+ generate_kwargs["task"] = "translate"
100
+
101
+ result = pipe(
102
+ {"array": audio_array, "sampling_rate": sample_rate},
103
+ return_timestamps=return_timestamps,
104
+ generate_kwargs=generate_kwargs
105
+ )
106
+
107
+ if return_timestamps and "chunks" in result:
108
+ formatted_result = []
109
+ for chunk in result["chunks"]:
110
+ timestamp = f"[{chunk['timestamp'][0]:.2f}s - {chunk['timestamp'][1]:.2f}s]"
111
+ formatted_result.append(f"{timestamp} {chunk['text']}")
112
+ return "\n".join(formatted_result)
113
+ else:
114
+ return result["text"]
115
 
116
  except Exception as e:
117
  return f"Error processing audio: {str(e)}"
 
156
  ("Latin", "la"),
157
  ]
158
 
159
+ with gr.Blocks(title="Whisper Tiny - Speech to Text") as demo:
160
+ gr.Markdown("# 🎤 Whisper Tiny - Speech to Text")
161
+ gr.Markdown("Upload an audio file or record directly to get fast transcription using OpenAI's Whisper Tiny model (39M parameters).")
162
 
163
  with gr.Tab("Upload Audio File"):
164
  with gr.Row():
 
241
  )
242
 
243
  gr.Markdown("### Features:")
244
+ gr.Markdown("- **Lightweight**: Powered by Whisper Tiny model (39M parameters)")
245
  gr.Markdown("- **CPU Optimized**: Optimized for 2-core CPU with 16GB RAM")
246
  gr.Markdown("- **Multi-language**: Supports 99+ languages")
247
  gr.Markdown("- **Translation**: Can translate speech to English")
248
  gr.Markdown("- **Timestamps**: Optional word-level or sentence-level timestamps")
249
+ gr.Markdown("- **Fast Processing**: Smallest Whisper model for maximum speed")
250
 
251
  if __name__ == "__main__":
252
  demo.launch(