testingfaces committed on
Commit
da8f4c4
·
verified ·
1 Parent(s): de2fb00

Update transcriber.py

Browse files
Files changed (1) hide show
  1. transcriber.py +95 -61
transcriber.py CHANGED
@@ -2,11 +2,19 @@
2
  Department 2 — Transcriber
3
  Primary : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day
4
  Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached
 
 
 
 
 
5
  """
6
 
7
  import os
8
  import time
9
  import logging
 
 
 
10
 
11
  logger = logging.getLogger(__name__)
12
 
@@ -19,6 +27,8 @@ LANG_TO_WHISPER = {
19
  "kn": "kn",
20
  }
21
 
 
 
22
 
23
  class Transcriber:
24
  def __init__(self):
@@ -30,105 +40,129 @@ class Transcriber:
30
  print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
31
  self._init_groq()
32
  else:
33
- print("[Transcriber] ⚠️ No GROQ_API_KEY — local Whisper large-v3 loads on first use")
34
 
35
- # ── Public ──────────────────────────────────────────────────────
36
  def transcribe(self, audio_path: str, language: str = "auto"):
37
- """
38
- Returns (transcript_text, detected_language_code, method_label)
39
- """
40
  lang_hint = LANG_TO_WHISPER.get(language, None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
41
 
 
 
 
 
 
 
42
  if self._groq_client is not None:
43
  try:
44
- return self._transcribe_groq(audio_path, lang_hint)
45
  except Exception as e:
46
- logger.warning(f"[Transcriber] Groq failed ({e}), falling back to local")
47
  if self._local_model is None:
48
  self._init_local()
 
49
 
50
- return self._transcribe_local(audio_path, lang_hint)
51
-
52
- # ── Groq ─────────────────────────────────────────────────────────
53
  def _init_groq(self):
54
  try:
55
  from groq import Groq
56
  self._groq_client = Groq(api_key=self.groq_key)
57
- print("[Transcriber] Groq client initialised")
58
  except Exception as e:
59
- logger.warning(f"[Transcriber] Groq init failed: {e}")
60
  self._groq_client = None
61
- self._init_local()
62
 
63
- def _transcribe_groq(self, audio_path: str, language=None):
64
  t0 = time.time()
65
  with open(audio_path, "rb") as f:
66
- kwargs = dict(
67
- file=f,
68
- model="whisper-large-v3",
69
- response_format="verbose_json",
70
- temperature=0.0,
71
- )
72
  if language:
73
  kwargs["language"] = language
74
-
75
  resp = self._groq_client.audio.transcriptions.create(**kwargs)
76
-
77
  transcript = resp.text.strip()
78
- detected_lang = getattr(resp, "language", language or "en") or "en"
79
- detected_lang = self._normalise_lang(detected_lang)
80
-
81
- logger.info(f"[Transcriber] Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
82
  return transcript, detected_lang, "Groq Whisper large-v3"
83
 
84
- # ── Local Whisper (UPGRADED: large-v3 int8) ──────────────────────
85
  def _init_local(self):
86
  try:
87
  from faster_whisper import WhisperModel
88
- print("[Transcriber] Loading faster-whisper large-v3 int8 (CPU)…")
89
- # UPGRADED from "small" → "large-v3" with int8 quantization
90
- # Same accuracy as full large-v3, runs on CPU, ~4x faster than standard
91
- self._local_model = WhisperModel(
92
- "large-v3",
93
- device="cpu",
94
- compute_type="int8",
95
- )
96
- print("[Transcriber] ✅ faster-whisper large-v3 int8 ready")
97
  except Exception as e:
98
- logger.error(f"[Transcriber] Local Whisper init failed: {e}")
99
  self._local_model = None
100
 
101
- def _transcribe_local(self, audio_path: str, language=None):
102
  t0 = time.time()
 
 
103
  if self._local_model is None:
104
  raise RuntimeError("No transcription engine available.")
105
-
106
  segments, info = self._local_model.transcribe(
107
- audio_path,
108
- language=language,
109
- beam_size=5,
110
- vad_filter=True, # removes silence automatically
111
- vad_parameters=dict(min_silence_duration_ms=500),
112
- )
113
  transcript = " ".join(seg.text.strip() for seg in segments).strip()
114
  detected_lang = info.language or language or "en"
115
-
116
- logger.info(f"[Transcriber] Local done in {time.time()-t0:.2f}s, lang={detected_lang}")
117
  return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"
118
 
119
- # ── Helpers ──────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
120
  @staticmethod
121
- def _normalise_lang(raw: str) -> str:
122
- mapping = {
123
- "english": "en",
124
- "telugu": "te",
125
- "hindi": "hi",
126
- "tamil": "ta",
127
- "kannada": "kn",
128
- "spanish": "es",
129
- "french": "fr",
130
- "german": "de",
131
- "japanese": "ja",
132
- "chinese": "zh",
133
- }
134
  return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)
 
2
  Department 2 — Transcriber
3
  Primary : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day
4
  Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached
5
+
6
+ ✅ UPGRADED:
7
+ - Chunking support — splits long audio into 60s pieces automatically
8
+ - Groq limit is 25MB per file, chunking handles large files
9
+ - Chunks rejoined seamlessly into full transcript
10
  """
11
 
12
  import os
13
  import time
14
  import logging
15
+ import subprocess
16
+ import tempfile
17
+ import shutil
18
 
19
  logger = logging.getLogger(__name__)
20
 
 
27
  "kn": "kn",
28
  }
29
 
30
  # Length, in seconds, of each piece that long audio is split into.
  # Chunks are re-encoded as 16 kHz mono 16-bit PCM (16000 * 2 bytes * 60 s,
  # roughly 1.9 MB each), which stays far below Groq's 25 MB per-file limit.
  CHUNK_DURATION_SEC = 60 # Groq max is 25MB — 60s chunks stay safe
31
+
32
 
33
  class Transcriber:
34
  def __init__(self):
 
40
  print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
41
  self._init_groq()
42
  else:
43
+ print("[Transcriber] No GROQ_API_KEY — local Whisper loads on first use")
44
 
 
45
  def transcribe(self, audio_path: str, language: str = "auto"):
 
 
 
46
  lang_hint = LANG_TO_WHISPER.get(language, None)
47
+ duration = self._get_duration(audio_path)
48
+ print(f"[Transcriber] Audio duration: {duration:.1f}s")
49
+
50
+ if duration <= CHUNK_DURATION_SEC:
51
+ return self._transcribe_single(audio_path, lang_hint)
52
+
53
+ print(f"[Transcriber] Long audio — splitting into {CHUNK_DURATION_SEC}s chunks")
54
+ return self._transcribe_chunked(audio_path, lang_hint, duration)
55
+
56
+ def _transcribe_chunked(self, audio_path, language, duration):
57
+ tmp_dir = tempfile.mkdtemp()
58
+ chunks = []
59
+ start = 0
60
+ index = 0
61
+
62
+ while start < duration:
63
+ chunk_path = os.path.join(tmp_dir, f"chunk_{index:03d}.wav")
64
+ subprocess.run([
65
+ "ffmpeg", "-y", "-i", audio_path,
66
+ "-ss", str(start), "-t", str(CHUNK_DURATION_SEC),
67
+ "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
68
+ chunk_path
69
+ ], capture_output=True)
70
+ if os.path.exists(chunk_path):
71
+ chunks.append(chunk_path)
72
+ start += CHUNK_DURATION_SEC
73
+ index += 1
74
+
75
+ print(f"[Transcriber] Processing {len(chunks)} chunks...")
76
+ transcripts = []
77
+ detected_lang = language or "en"
78
+ method = "unknown"
79
+
80
+ for i, chunk in enumerate(chunks):
81
+ print(f"[Transcriber] Chunk {i+1}/{len(chunks)}...")
82
+ try:
83
+ text, lang, m = self._transcribe_single(chunk, language)
84
+ transcripts.append(text.strip())
85
+ detected_lang = lang
86
+ method = m
87
+ except Exception as e:
88
+ logger.warning(f"Chunk {i+1} failed: {e}")
89
 
90
+ shutil.rmtree(tmp_dir, ignore_errors=True)
91
+ full = " ".join(t for t in transcripts if t)
92
+ print(f"[Transcriber] Done — {len(full)} chars total")
93
+ return full, detected_lang, f"{method} (chunked {len(chunks)}x)"
94
+
95
+ def _transcribe_single(self, audio_path, language):
96
  if self._groq_client is not None:
97
  try:
98
+ return self._transcribe_groq(audio_path, language)
99
  except Exception as e:
100
+ logger.warning(f"Groq failed ({e}), falling back to local")
101
  if self._local_model is None:
102
  self._init_local()
103
+ return self._transcribe_local(audio_path, language)
104
 
 
 
 
105
  def _init_groq(self):
106
  try:
107
  from groq import Groq
108
  self._groq_client = Groq(api_key=self.groq_key)
109
+ print("[Transcriber] Groq client initialised")
110
  except Exception as e:
111
+ logger.warning(f"Groq init failed: {e}")
112
  self._groq_client = None
 
113
 
114
+ def _transcribe_groq(self, audio_path, language=None):
115
  t0 = time.time()
116
  with open(audio_path, "rb") as f:
117
+ kwargs = dict(file=f, model="whisper-large-v3",
118
+ response_format="verbose_json", temperature=0.0)
 
 
 
 
119
  if language:
120
  kwargs["language"] = language
 
121
  resp = self._groq_client.audio.transcriptions.create(**kwargs)
 
122
  transcript = resp.text.strip()
123
+ detected_lang = self._normalise_lang(getattr(resp, "language", language or "en") or "en")
124
+ logger.info(f"Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
 
 
125
  return transcript, detected_lang, "Groq Whisper large-v3"
126
 
 
127
  def _init_local(self):
128
  try:
129
  from faster_whisper import WhisperModel
130
+ print("[Transcriber] Loading faster-whisper large-v3 int8...")
131
+ self._local_model = WhisperModel("large-v3", device="cpu", compute_type="int8")
132
+ print("[Transcriber] faster-whisper ready")
 
 
 
 
 
 
133
  except Exception as e:
134
+ logger.error(f"Local Whisper init failed: {e}")
135
  self._local_model = None
136
 
137
+ def _transcribe_local(self, audio_path, language=None):
138
  t0 = time.time()
139
+ if self._local_model is None:
140
+ self._init_local()
141
  if self._local_model is None:
142
  raise RuntimeError("No transcription engine available.")
 
143
  segments, info = self._local_model.transcribe(
144
+ audio_path, language=language, beam_size=5,
145
+ vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))
 
 
 
 
146
  transcript = " ".join(seg.text.strip() for seg in segments).strip()
147
  detected_lang = info.language or language or "en"
148
+ logger.info(f"Local done in {time.time()-t0:.2f}s")
 
149
  return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"
150
 
151
+ def _get_duration(self, audio_path):
152
+ try:
153
+ result = subprocess.run([
154
+ "ffprobe", "-v", "error",
155
+ "-show_entries", "format=duration",
156
+ "-of", "default=noprint_wrappers=1:nokey=1",
157
+ audio_path
158
+ ], capture_output=True, text=True)
159
+ return float(result.stdout.strip())
160
+ except Exception:
161
+ return 0.0
162
+
163
  @staticmethod
164
+ def _normalise_lang(raw):
165
+ mapping = {"english":"en","telugu":"te","hindi":"hi",
166
+ "tamil":"ta","kannada":"kn","spanish":"es",
167
+ "french":"fr","german":"de","japanese":"ja","chinese":"zh"}
 
 
 
 
 
 
 
 
 
168
  return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)