# NOTE: "Spaces / Running / Running" hosting-UI text removed — it was a page-scrape artifact, not part of the module.
| """ | |
| Department 2 — Transcriber | |
| Primary : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day | |
| Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached | |
| ✅ UPGRADED: | |
| - Chunking support — splits long audio into 60s pieces automatically | |
| - Groq limit is 25MB per file, chunking handles large files | |
| - Chunks rejoined seamlessly into full transcript | |
| """ | |
| import os | |
| import time | |
| import logging | |
| import subprocess | |
| import tempfile | |
| import shutil | |
# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)

# UI language selection -> Whisper language hint.
# "auto" maps to None so Whisper performs its own language detection.
_SUPPORTED = ("en", "te", "hi", "ta", "kn")
LANG_TO_WHISPER = {"auto": None, **{code: code for code in _SUPPORTED}}

# Groq rejects uploads above 25 MB; 60-second chunks (16 kHz mono PCM) stay safely below that.
CHUNK_DURATION_SEC = 60
class Transcriber:
    """Speech-to-text department.

    Primary engine is the Groq API (Whisper large-v3); fallback is a local
    faster-whisper large-v3 int8 model on CPU. Audio longer than
    CHUNK_DURATION_SEC is split into fixed-length WAV chunks with ffmpeg so
    each upload stays under Groq's per-file size limit, and the per-chunk
    transcripts are rejoined into one string.
    """

    def __init__(self):
        # Engines are initialised lazily: Groq immediately when a key is
        # present, the local model only on first use (it is slow to load).
        self.groq_key = os.environ.get("GROQ_API_KEY", "")
        self._groq_client = None
        self._local_model = None
        if self.groq_key:
            print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
            self._init_groq()
        else:
            print("[Transcriber] No GROQ_API_KEY — local Whisper loads on first use")

    def transcribe(self, audio_path: str, language: str = "auto"):
        """Transcribe *audio_path*.

        Args:
            audio_path: path to an audio file readable by ffmpeg/Whisper.
            language: UI code from LANG_TO_WHISPER; "auto" (or any unknown
                code) yields no hint, letting Whisper detect the language.

        Returns:
            (transcript, detected_language, method_label) tuple.
        """
        lang_hint = LANG_TO_WHISPER.get(language, None)
        duration = self._get_duration(audio_path)
        print(f"[Transcriber] Audio duration: {duration:.1f}s")
        if duration <= CHUNK_DURATION_SEC:
            return self._transcribe_single(audio_path, lang_hint)
        print(f"[Transcriber] Long audio — splitting into {CHUNK_DURATION_SEC}s chunks")
        return self._transcribe_chunked(audio_path, lang_hint, duration)

    def _transcribe_chunked(self, audio_path, language, duration):
        """Split long audio into chunks, transcribe each, rejoin the texts.

        Failed chunks are skipped (best-effort) so one bad segment does not
        lose the whole recording.
        """
        tmp_dir = tempfile.mkdtemp()
        try:  # FIX: try/finally so the temp dir is removed even on errors
            chunks = self._split_audio(audio_path, tmp_dir, duration)
            print(f"[Transcriber] Processing {len(chunks)} chunks...")
            transcripts = []
            detected_lang = language or "en"
            method = "unknown"
            for i, chunk in enumerate(chunks):
                print(f"[Transcriber] Chunk {i+1}/{len(chunks)}...")
                try:
                    text, lang, m = self._transcribe_single(chunk, language)
                    transcripts.append(text.strip())
                    # Keep the last successful chunk's language/method label.
                    detected_lang = lang
                    method = m
                except Exception as e:
                    logger.warning(f"Chunk {i+1} failed: {e}")
            full = " ".join(t for t in transcripts if t)
            print(f"[Transcriber] Done — {len(full)} chars total")
            return full, detected_lang, f"{method} (chunked {len(chunks)}x)"
        finally:
            shutil.rmtree(tmp_dir, ignore_errors=True)

    def _split_audio(self, audio_path, tmp_dir, duration):
        """Cut audio into CHUNK_DURATION_SEC-long 16 kHz mono WAV files.

        Returns the list of chunk paths that were actually produced.
        """
        chunks = []
        start = 0
        index = 0
        while start < duration:
            chunk_path = os.path.join(tmp_dir, f"chunk_{index:03d}.wav")
            proc = subprocess.run([
                "ffmpeg", "-y", "-i", audio_path,
                "-ss", str(start), "-t", str(CHUNK_DURATION_SEC),
                "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
                chunk_path
            ], capture_output=True)
            if proc.returncode != 0:
                # FIX: ffmpeg failures were silently ignored before.
                logger.warning(f"ffmpeg failed on chunk {index} (rc={proc.returncode})")
            # FIX: skip zero-byte outputs — they would be uploaded pointlessly.
            if os.path.exists(chunk_path) and os.path.getsize(chunk_path) > 0:
                chunks.append(chunk_path)
            # FIX: always advance, even if ffmpeg failed, so the loop terminates.
            start += CHUNK_DURATION_SEC
            index += 1
        return chunks

    def _transcribe_single(self, audio_path, language):
        """Transcribe one file: Groq first, local faster-whisper as fallback."""
        if self._groq_client is not None:
            try:
                return self._transcribe_groq(audio_path, language)
            except Exception as e:
                logger.warning(f"Groq failed ({e}), falling back to local")
        if self._local_model is None:
            self._init_local()
        return self._transcribe_local(audio_path, language)

    def _init_groq(self):
        """Create the Groq client; on any failure, disable the Groq path."""
        try:
            from groq import Groq
            self._groq_client = Groq(api_key=self.groq_key)
            print("[Transcriber] Groq client initialised")
        except Exception as e:
            logger.warning(f"Groq init failed: {e}")
            self._groq_client = None

    def _transcribe_groq(self, audio_path, language=None):
        """Send the file to Groq Whisper large-v3; returns (text, lang, label)."""
        t0 = time.time()
        with open(audio_path, "rb") as f:
            kwargs = dict(file=f, model="whisper-large-v3",
                          response_format="verbose_json", temperature=0.0)
            if language:
                kwargs["language"] = language
            resp = self._groq_client.audio.transcriptions.create(**kwargs)
        transcript = resp.text.strip()
        # verbose_json may report the detected language as a full name
        # ("english"); normalise it to a 2-letter code.
        detected_lang = self._normalise_lang(getattr(resp, "language", language or "en") or "en")
        logger.info(f"Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
        return transcript, detected_lang, "Groq Whisper large-v3"

    def _init_local(self):
        """Load faster-whisper large-v3 (int8 on CPU); None on failure."""
        try:
            from faster_whisper import WhisperModel
            print("[Transcriber] Loading faster-whisper large-v3 int8...")
            self._local_model = WhisperModel("large-v3", device="cpu", compute_type="int8")
            print("[Transcriber] faster-whisper ready")
        except Exception as e:
            logger.error(f"Local Whisper init failed: {e}")
            self._local_model = None

    def _transcribe_local(self, audio_path, language=None):
        """Transcribe locally with faster-whisper.

        Raises:
            RuntimeError: if the local model could not be loaded either.
        """
        t0 = time.time()
        if self._local_model is None:
            self._init_local()
        if self._local_model is None:
            raise RuntimeError("No transcription engine available.")
        segments, info = self._local_model.transcribe(
            audio_path, language=language, beam_size=5,
            vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))
        transcript = " ".join(seg.text.strip() for seg in segments).strip()
        detected_lang = info.language or language or "en"
        logger.info(f"Local done in {time.time()-t0:.2f}s")
        return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"

    def _get_duration(self, audio_path):
        """Return audio duration in seconds via ffprobe; 0.0 on any failure."""
        try:
            result = subprocess.run([
                "ffprobe", "-v", "error",
                "-show_entries", "format=duration",
                "-of", "default=noprint_wrappers=1:nokey=1",
                audio_path
            ], capture_output=True, text=True)
            return float(result.stdout.strip())
        except Exception:
            # Missing ffprobe / unreadable file: report 0.0, which routes the
            # audio through the single (non-chunked) path.
            return 0.0

    @staticmethod
    def _normalise_lang(raw):
        """Map a language name or tag ("English", "en-US") to a 2-letter code.

        FIX: this was declared without `self` yet invoked as
        `self._normalise_lang(...)`, which raised a TypeError/AttributeError at
        runtime; it is now a @staticmethod so both `self.` and class access work.
        """
        mapping = {"english": "en", "telugu": "te", "hindi": "hi",
                   "tamil": "ta", "kannada": "kn", "spanish": "es",
                   "french": "fr", "german": "de", "japanese": "ja", "chinese": "zh"}
        return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)