LiamKhoaLe committed on
Commit
e47935f
·
1 Parent(s): c0167f3

Upd abort time and smart chunk-batcher

Browse files
Files changed (1) hide show
  1. app.py +71 -10
app.py CHANGED
@@ -9,6 +9,7 @@ from transformers.pipelines.audio_utils import ffmpeg_read
9
  import tempfile
10
  import os
11
  import time
 
12
  import google.generativeai as genai
13
  from dotenv import load_dotenv
14
 
@@ -26,16 +27,80 @@ pipe = pipeline(
26
  model=MODEL_NAME,
27
  device=device,
28
  ignore_warning=True,
29
- model_kwargs={"torch_dtype": torch.float16} if torch.cuda.is_available() else {}
 
30
  )
31
 
32
 
33
- @spaces.GPU
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def transcribe(inputs, task, summarize=False):
35
  if inputs is None:
36
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
37
  try:
38
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
 
 
 
 
 
39
  except Exception as e:
40
  raise gr.Error(f"Transcription failed: {e}")
41
  if summarize:
@@ -100,7 +165,7 @@ def yt_transcribe(yt_url, task, summarize=False, max_filesize=75.0):
100
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
101
 
102
  try:
103
- text = pipe(inputs, batch_size=BATCH_SIZE, generate_kwargs={"task": task}, return_timestamps=True)["text"]
104
  except Exception as e:
105
  raise gr.Error(f"Transcription failed: {e}")
106
  summary = ""
@@ -139,9 +204,7 @@ file_transcribe = gr.Interface(
139
  outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
140
  title="Whisper Large V3: Audio file",
141
  description=(
142
- "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the"
143
- f" checkpoint [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe audio files"
144
- " of arbitrary length."
145
  ),
146
  flagging_mode="never",
147
  )
@@ -156,9 +219,7 @@ yt_transcribe = gr.Interface(
156
  outputs=["html", gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
157
  title="Whisper Large V3: Transcribe YouTube",
158
  description=(
159
- "Transcribe long-form YouTube videos with the click of a button! Demo uses the checkpoint"
160
- f" [{MODEL_NAME}](https://huggingface.co/{MODEL_NAME}) and 🤗 Transformers to transcribe video files of"
161
- " arbitrary length."
162
  ),
163
  flagging_mode="never",
164
  )
 
9
  import tempfile
10
  import os
11
  import time
12
+ import numpy as np
13
  import google.generativeai as genai
14
  from dotenv import load_dotenv
15
 
 
27
  model=MODEL_NAME,
28
  device=device,
29
  ignore_warning=True,
30
+ model_kwargs={"torch_dtype": torch.float16} if torch.cuda.is_available() else {},
31
+ chunk_length_s=20, # small chunks to fit ZeroGPU
32
  )
33
 
34
 
35
def _concat_text(chunks):
    """Join transcript fragments into a single space-separated string.

    Every fragment is stripped of surrounding whitespace. Fragments that are
    falsy (``None``, ``""``) or that contain only whitespace are dropped, so
    the result never has doubled or leading/trailing spaces.
    """
    cleaned = []
    for fragment in chunks:
        if not fragment:
            continue
        trimmed = fragment.strip()
        if trimmed:
            cleaned.append(trimmed)
    return " ".join(cleaned)
37
+
38
+
39
def _robust_transcribe_array(
    audio_array: np.ndarray,
    sr: int,
    task: str,
    chunk_s: float = 20,
    overlap_s: float = 2,
) -> str:
    """Transcribe long/large audio by chunking sequentially to minimize GPU memory.

    The audio is cut into windows of ``chunk_s`` seconds that start every
    ``chunk_s - overlap_s`` seconds, and each window is sent to ``pipe`` on its
    own with ``batch_size=1``.

    Args:
        audio_array: Audio samples. Arrays with more than one dimension are
            averaged along axis 1 (assumes a ``(samples, channels)`` layout --
            TODO confirm against callers).
        sr: Sampling rate of ``audio_array`` in Hz.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.
        chunk_s: Window length in seconds (default 20).
        overlap_s: Seconds shared by two consecutive windows (default 2).

    Returns:
        The pipeline text for audio that fits in one window, otherwise the
        per-window texts joined by ``_concat_text``. Empty audio yields ``""``.

    Raises:
        ValueError: If ``overlap_s`` is negative or the resulting step is not
            at least one sample (the window would never advance).

    NOTE: consecutive windows both contain the overlapping ``overlap_s``
    seconds and their texts are joined as-is, so words spoken in the overlap
    may appear twice in the result.
    """
    win = int(chunk_s * sr)
    step = int((chunk_s - overlap_s) * sr)
    if overlap_s < 0 or step <= 0:
        raise ValueError(
            "chunk_s must exceed overlap_s (>= 0) by at least one sample"
        )

    if audio_array.ndim > 1:
        # Down-mix to a single channel.
        audio_array = np.mean(audio_array, axis=1)

    total = len(audio_array)
    if total == 0:
        # Nothing to transcribe; do not hand a zero-length array to the model.
        return ""

    if total <= win:
        # Short audio: a single pipeline call covers everything.
        inputs = {"array": audio_array, "sampling_rate": sr}
        out = pipe(inputs, batch_size=1, generate_kwargs={"task": task})
        return out["text"]

    texts = []
    for start in range(0, total, step):
        end = min(start + win, total)
        inputs = {"array": audio_array[start:end], "sampling_rate": sr}
        out = pipe(inputs, batch_size=1, generate_kwargs={"task": task})
        texts.append(out["text"])
        if end == total:
            # This window reached the end of the audio; any later start would
            # only re-transcribe samples that are already covered.
            break
    return _concat_text(texts)
66
+
67
+
68
def _robust_transcribe_path(path: str, task: str) -> str:
    """Decode an audio file and transcribe it, retrying with smaller windows.

    The file is decoded at the pipeline's feature-extractor sampling rate and
    passed to ``_robust_transcribe_array``. If that raises, the audio is
    transcribed again in 10-second windows that start every 8 seconds.

    Args:
        path: Filesystem path of the audio file to transcribe.
        task: Value forwarded to the pipeline as ``generate_kwargs["task"]``.

    Returns:
        The transcription text.

    Raises:
        gr.Error: If the small-window retry fails as well.
        Errors raised while opening or decoding the file are not caught here
        and propagate to the caller.
    """
    sr = pipe.feature_extractor.sampling_rate
    # ffmpeg_read expects the encoded file contents as bytes (it feeds them to
    # ffmpeg on stdin), not a path, so load the file before decoding.
    with open(path, "rb") as audio_file:
        payload = audio_file.read()
    audio = ffmpeg_read(payload, sr)

    try:
        return _robust_transcribe_array(audio, sr, task)
    except Exception:
        # Last chance: shrink the window and retry over the whole audio.
        win = int(10 * sr)
        step = int(8 * sr)
        texts = []
        try:
            for pos in range(0, len(audio), step):
                sub = audio[pos:pos + win]
                out = pipe(
                    {"array": sub, "sampling_rate": sr},
                    batch_size=1,
                    generate_kwargs={"task": task},
                )
                texts.append(out["text"])
                if pos + win >= len(audio):
                    # This window already reached the end of the audio.
                    break
        except Exception as retry_error:
            raise gr.Error(
                f"Transcription failed after retries: {retry_error}"
            ) from retry_error
        return _concat_text(texts)
91
+
92
+
93
+ @spaces.GPU(duration=2400) # 40 minutes
94
  def transcribe(inputs, task, summarize=False):
95
  if inputs is None:
96
  raise gr.Error("No audio file submitted! Please upload or record an audio file before submitting your request.")
97
  try:
98
+ if isinstance(inputs, str):
99
+ text = _robust_transcribe_path(inputs, task)
100
+ elif isinstance(inputs, dict) and "array" in inputs:
101
+ text = _robust_transcribe_array(inputs["array"], inputs.get("sampling_rate", pipe.feature_extractor.sampling_rate), task)
102
+ else:
103
+ text = pipe(inputs, batch_size=1, generate_kwargs={"task": task})["text"]
104
  except Exception as e:
105
  raise gr.Error(f"Transcription failed: {e}")
106
  if summarize:
 
165
  inputs = {"array": inputs, "sampling_rate": pipe.feature_extractor.sampling_rate}
166
 
167
  try:
168
+ text = _robust_transcribe_array(inputs["array"], inputs["sampling_rate"], task)
169
  except Exception as e:
170
  raise gr.Error(f"Transcription failed: {e}")
171
  summary = ""
 
204
  outputs=[gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
205
  title="Whisper Large V3: Audio file",
206
  description=(
207
+ "Transcribe long-form microphone or audio inputs."
 
 
208
  ),
209
  flagging_mode="never",
210
  )
 
219
  outputs=["html", gr.Textbox(label="Transcription"), gr.Textbox(label="Summary")],
220
  title="Whisper Large V3: Transcribe YouTube",
221
  description=(
222
+ "Transcribe long-form YouTube videos."
 
 
223
  ),
224
  flagging_mode="never",
225
  )