Abid Ali Awan committed on
Commit
087adaa
·
1 Parent(s): 182bd23

Refactor app.py to optimize CPU performance, update model loading to use fp32 and quantization, and enhance the transcription function with improved audio processing and error handling.

Browse files
Files changed (1) hide show
  1. app.py +63 -42
app.py CHANGED
@@ -1,67 +1,88 @@
 
1
import gradio as gr
import spaces
import torch
import numpy as np
from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline, logging

# Keep transformers quiet except for real errors.
logging.set_verbosity_error()

# Pick GPU + half precision when CUDA is present, otherwise CPU + fp32.
_has_cuda = torch.cuda.is_available()
device = "cuda:0" if _has_cuda else "cpu"
torch_dtype = torch.float16 if _has_cuda else torch.float32
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load the fine-tuned Whisper checkpoint and move it to the chosen device.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch_dtype,
    use_safetensors=True
).to(device)
# Clear any forced decoder ids baked into the checkpoint's generation config.
model.generation_config.forced_decoder_ids = None

processor = AutoProcessor.from_pretrained(model_id)

# ASR pipeline wired to the model's own tokenizer and feature extractor.
transcriber = pipeline(
    "automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    torch_dtype=torch_dtype,
    device=device,
)
33
 
34
@spaces.GPU
def transcribe(audio):
    """Transcribe a Gradio audio tuple ``(sample_rate, samples)`` to text.

    Returns the transcription string, or a human-readable message when the
    input is missing, silent, or the pipeline raises.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    try:
        rate, samples = audio

        # Collapse multi-channel recordings down to a single mono channel.
        if samples.ndim > 1:
            samples = samples.mean(axis=1)

        # Work in float32 and peak-normalize; an all-zero signal is reported
        # as silence instead of being fed to the model.
        samples = samples.astype(np.float32)
        peak = np.max(np.abs(samples))
        if peak == 0:
            return "Audio appears to be silent. Please try again."
        samples /= peak

        output = transcriber({"sampling_rate": rate, "raw": samples})
        return output["text"]

    except Exception as e:
        return f"Error during transcription: {str(e)}"
60
-
61
# UI copy for the Gradio interface.
# Fix: the original description string closed a </center> tag that was never
# opened; the paragraph is already centered via its inline style.
description = "<p style='text-align: center'>Record or upload audio in Urdu and get the transcribed text using Whisper Large V3 Turbo Urdu model.</p>"
examples = [["samples/audio1.mp3"], ["samples/audio2.mp3"], ["samples/audio3.mp3"]]
article = "<p style='text-align: center; color: #34C759;'><a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>🌿 Explore the project on GitHub 📚</a></p>"
# Create Gradio interface
 
 
 
 
 
 
 
 
 
 
 
 
65
  demo = gr.Interface(
66
  fn=transcribe,
67
  inputs=gr.Audio(
 
1
import os
import gradio as gr
import torch
import numpy as np
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    pipeline,
    logging,
)

# —— CPU performance tweaks ——
# NOTE(review): OMP/MKL thread counts are read when the native libraries
# initialize; torch is already imported above, so these env vars may be
# no-ops here — torch.set_num_threads() below is the reliable knob. Confirm
# whether the env vars can be set earlier in the process if they matter.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

logging.set_verbosity_error()

# —— Model & device setup ——
device = "cpu"  # this Space runs inference on CPU only
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"

# Load the weights in fp32, then dynamically quantize every Linear layer to
# int8 for faster CPU matmuls. Use torch.ao.quantization — the current home
# of the API; torch.quantization is its deprecated alias.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    torch_dtype=torch.float32,
    use_safetensors=True,
)
model.eval()
model = torch.ao.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

processor = AutoProcessor.from_pretrained(model_id)

# CPU pipeline with 30 s chunks and 5 s overlap so long clips are handled
# without exhausting memory.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # -1 selects CPU for transformers pipelines
    chunk_length_s=30,
    stride_length_s=(5, 5),
)
44
 
45
+
46
def transcribe(audio):
    """Transcribe a Gradio audio tuple ``(sample_rate, samples)`` to text.

    Returns the transcribed text, or a human-readable message when the
    input is missing, silent, or the pipeline fails.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio
    # Collapse stereo to mono, then peak-normalize in float32.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    peak = np.max(np.abs(y))
    if peak > 0:
        y /= peak
    else:
        return "Audio appears to be silent. Please try again."

    # Fix: the refactor dropped the previous try/except, so any pipeline
    # failure propagated as a raw traceback into the UI. Restore targeted
    # handling around the inference call only; inference runs under
    # no_grad since we never backprop here.
    try:
        with torch.no_grad():
            result = transcriber({"sampling_rate": sr, "raw": y})
    except Exception as e:
        return f"Error during transcription: {str(e)}"
    return result.get("text", "")
65
+
66
+
67
+ # β€”β€” Gradio UI β€”β€”
68
# —— Gradio UI —— copy shown around the interface.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
</p>
"""
# One single-file example per row, matching gr.Interface's examples format.
examples = [[f"samples/audio{i}.mp3"] for i in (1, 2, 3)]
article = """
<p style='text-align: center; color: #34C759;'>
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
🌿 Explore the project on GitHub 📚
</a>
</p>
"""
85
+
86
  demo = gr.Interface(
87
  fn=transcribe,
88
  inputs=gr.Audio(