Spaces:

testingfaces
/

clearwave-ai

Paused

App Files Files Community

testingfaces committed 27 days ago

Commit

da8f4c4

verified ·

1 Parent(s): de2fb00

Update transcriber.py

Browse files

Files changed (1) hide show

transcriber.py +95 -61

transcriber.py CHANGED Viewed

@@ -2,11 +2,19 @@
 Department 2 — Transcriber
 Primary  : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day
 Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached
 """
 import os
 import time
 import logging
 logger = logging.getLogger(__name__)
@@ -19,6 +27,8 @@ LANG_TO_WHISPER = {
     "kn":   "kn",
 }
 class Transcriber:
     def __init__(self):
@@ -30,105 +40,129 @@ class Transcriber:
             print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
             self._init_groq()
         else:
-            print("[Transcriber] ⚠️  No GROQ_API_KEY — local Whisper large-v3 loads on first use")
-    # ── Public ──────────────────────────────────────────────────────
     def transcribe(self, audio_path: str, language: str = "auto"):
-        """
-        Returns (transcript_text, detected_language_code, method_label)
-        """
         lang_hint = LANG_TO_WHISPER.get(language, None)
         if self._groq_client is not None:
             try:
-                return self._transcribe_groq(audio_path, lang_hint)
             except Exception as e:
-                logger.warning(f"[Transcriber] Groq failed ({e}), falling back to local…")
                 if self._local_model is None:
                     self._init_local()
-        return self._transcribe_local(audio_path, lang_hint)
-    # ── Groq ─────────────────────────────────────────────────────────
     def _init_groq(self):
         try:
             from groq import Groq
             self._groq_client = Groq(api_key=self.groq_key)
-            print("[Transcriber] ✅ Groq client initialised")
         except Exception as e:
-            logger.warning(f"[Transcriber] Groq init failed: {e}")
             self._groq_client = None
-            self._init_local()
-    def _transcribe_groq(self, audio_path: str, language=None):
         t0 = time.time()
         with open(audio_path, "rb") as f:
-            kwargs = dict(
-                file=f,
-                model="whisper-large-v3",
-                response_format="verbose_json",
-                temperature=0.0,
-            )
             if language:
                 kwargs["language"] = language
             resp = self._groq_client.audio.transcriptions.create(**kwargs)
         transcript    = resp.text.strip()
-        detected_lang = getattr(resp, "language", language or "en") or "en"
-        detected_lang = self._normalise_lang(detected_lang)
-        logger.info(f"[Transcriber] Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
         return transcript, detected_lang, "Groq Whisper large-v3"
-    # ── Local Whisper (UPGRADED: large-v3 int8) ──────────────────────
     def _init_local(self):
         try:
             from faster_whisper import WhisperModel
-            print("[Transcriber] Loading faster-whisper large-v3 int8 (CPU)…")
-            # ✅ UPGRADED from "small" → "large-v3" with int8 quantization
-            # Same accuracy as full large-v3, runs on CPU, ~4x faster than standard
-            self._local_model = WhisperModel(
-                "large-v3",
-                device="cpu",
-                compute_type="int8",
-            )
-            print("[Transcriber] ✅ faster-whisper large-v3 int8 ready")
         except Exception as e:
-            logger.error(f"[Transcriber] Local Whisper init failed: {e}")
             self._local_model = None
-    def _transcribe_local(self, audio_path: str, language=None):
         t0 = time.time()
         if self._local_model is None:
             raise RuntimeError("No transcription engine available.")
         segments, info = self._local_model.transcribe(
-            audio_path,
-            language=language,
-            beam_size=5,
-            vad_filter=True,                              # removes silence automatically
-            vad_parameters=dict(min_silence_duration_ms=500),
-        )
         transcript    = " ".join(seg.text.strip() for seg in segments).strip()
         detected_lang = info.language or language or "en"
-        logger.info(f"[Transcriber] Local done in {time.time()-t0:.2f}s, lang={detected_lang}")
         return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"
-    # ── Helpers ──────────────────────────────────────────────────────
     @staticmethod
-    def _normalise_lang(raw: str) -> str:
-        mapping = {
-            "english":  "en",
-            "telugu":   "te",
-            "hindi":    "hi",
-            "tamil":    "ta",
-            "kannada":  "kn",
-            "spanish":  "es",
-            "french":   "fr",
-            "german":   "de",
-            "japanese": "ja",
-            "chinese":  "zh",
-        }
         return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)

 Department 2 — Transcriber
 Primary  : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day
 Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached
+✅ UPGRADED:
+  - Chunking support — splits long audio into 60s pieces automatically
+  - Groq limit is 25MB per file, chunking handles large files
+  - Chunks rejoined seamlessly into full transcript
 """
 import os
 import time
 import logging
+import subprocess
+import tempfile
+import shutil
 logger = logging.getLogger(__name__)
     "kn":   "kn",
 }
+# Length of each chunk in seconds. Chunks are written as 16 kHz mono 16-bit WAV
+# (about 1.9 MB per 60 s), which stays well below Groq's 25MB per-file limit.
+CHUNK_DURATION_SEC = 60  # Groq max is 25MB — 60s chunks stay safe
 class Transcriber:
     def __init__(self):
             print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
             self._init_groq()
         else:
+            print("[Transcriber] No GROQ_API_KEY — local Whisper loads on first use")
     def transcribe(self, audio_path: str, language: str = "auto"):
+        """
+        Transcribe an audio file, splitting it into chunks when it is longer
+        than CHUNK_DURATION_SEC seconds.
+
+        `language` is looked up in LANG_TO_WHISPER; a value missing from that
+        table produces no language hint.
+
+        Returns (transcript_text, detected_language_code, method_label).
+        """
         lang_hint = LANG_TO_WHISPER.get(language, None)
+        duration  = self._get_duration(audio_path)
+        print(f"[Transcriber] Audio duration: {duration:.1f}s")
+        # _get_duration reports 0.0 when probing fails, so a file whose length
+        # cannot be determined also takes the single-request path.
+        if duration <= CHUNK_DURATION_SEC:
+            return self._transcribe_single(audio_path, lang_hint)
+        print(f"[Transcriber] Long audio — splitting into {CHUNK_DURATION_SEC}s chunks")
+        return self._transcribe_chunked(audio_path, lang_hint, duration)
+    def _transcribe_chunked(self, audio_path, language, duration):
+        """
+        Split `audio_path` into CHUNK_DURATION_SEC-second WAV chunks with
+        ffmpeg, transcribe every chunk and join the texts in order.
+
+        Args:
+            audio_path: path of the source audio file.
+            language:   Whisper language hint, or None for auto-detection.
+            duration:   total length of the audio in seconds.
+
+        Returns:
+            (transcript_text, detected_language_code, method_label), where the
+            language is the one reported by most chunks.
+
+        Raises:
+            RuntimeError: if ffmpeg produced no chunk, or if not a single
+                chunk could be transcribed.
+        """
+        from collections import Counter
+
+        tmp_dir = tempfile.mkdtemp()
+        # try/finally: the temp dir is removed even when ffmpeg is not
+        # installed (subprocess.run raises) or transcription blows up.
+        try:
+            chunks = []
+            start  = 0
+            index  = 0
+            while start < duration:
+                chunk_path = os.path.join(tmp_dir, f"chunk_{index:03d}.wav")
+                # -ss before -i seeks inside the input instead of decoding the
+                # file from the beginning for every single chunk.
+                result = subprocess.run([
+                    "ffmpeg", "-y", "-ss", str(start), "-i", audio_path,
+                    "-t", str(CHUNK_DURATION_SEC),
+                    "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
+                    chunk_path
+                ], capture_output=True)
+                # A non-zero exit can leave a truncated file behind; skip it.
+                if result.returncode == 0 and os.path.exists(chunk_path):
+                    chunks.append(chunk_path)
+                else:
+                    logger.warning(
+                        f"ffmpeg failed for chunk {index+1} (exit code {result.returncode})")
+                start += CHUNK_DURATION_SEC
+                index += 1
+            if not chunks:
+                raise RuntimeError("ffmpeg produced no audio chunks.")
+            print(f"[Transcriber] Processing {len(chunks)} chunks...")
+            transcripts = []
+            langs       = []
+            method      = "unknown"
+            for i, chunk in enumerate(chunks):
+                print(f"[Transcriber] Chunk {i+1}/{len(chunks)}...")
+                try:
+                    text, lang, m = self._transcribe_single(chunk, language)
+                except Exception as e:
+                    # Best effort: one bad chunk must not discard the rest.
+                    logger.warning(f"Chunk {i+1} failed: {e}")
+                    continue
+                transcripts.append(text.strip())
+                langs.append(lang)
+                method = m
+        finally:
+            shutil.rmtree(tmp_dir, ignore_errors=True)
+        if not langs:
+            # Same failure mode as the single-file path: nothing could transcribe.
+            raise RuntimeError("No transcription engine available.")
+        # Majority vote, so a silent or noisy final chunk cannot flip the language.
+        detected_lang = Counter(langs).most_common(1)[0][0]
+        full = " ".join(t for t in transcripts if t)
+        print(f"[Transcriber] Done — {len(full)} chars total")
+        return full, detected_lang, f"{method} (chunked {len(chunks)}x)"
+    def _transcribe_single(self, audio_path, language):
+        """
+        Transcribe one file: Groq first when a client exists, otherwise (or
+        after any Groq error) the local faster-whisper model.
+
+        Returns (transcript_text, detected_language_code, method_label).
+        Raises RuntimeError from _transcribe_local when no local model can be loaded.
+        """
         if self._groq_client is not None:
             try:
+                return self._transcribe_groq(audio_path, language)
             except Exception as e:
+                logger.warning(f"Groq failed ({e}), falling back to local")
+                # NOTE(review): _transcribe_local also calls _init_local() when
+                # the model is missing, so a failing load is attempted twice here.
                 if self._local_model is None:
                     self._init_local()
+        return self._transcribe_local(audio_path, language)
     def _init_groq(self):
+        """Build the Groq client from self.groq_key; on any failure log it and set the client to None."""
         try:
+            # Imported here so a missing `groq` package is caught by the except below.
             from groq import Groq
             self._groq_client = Groq(api_key=self.groq_key)
+            print("[Transcriber] Groq client initialised")
         except Exception as e:
+            logger.warning(f"Groq init failed: {e}")
             self._groq_client = None
+    def _transcribe_groq(self, audio_path, language=None):
+        """
+        Send `audio_path` to Groq's whisper-large-v3 transcription endpoint.
+
+        `language` is only passed along when it is truthy.
+        Returns (transcript_text, detected_language_code, "Groq Whisper large-v3").
+        Errors from opening the file or from the API call propagate to the caller.
+        """
         t0 = time.time()
         with open(audio_path, "rb") as f:
+            # NOTE(review): verbose_json is presumably what makes the response
+            # carry a `language` attribute — confirm against the Groq API docs.
+            kwargs = dict(file=f, model="whisper-large-v3",
+                          response_format="verbose_json", temperature=0.0)
             if language:
                 kwargs["language"] = language
             resp = self._groq_client.audio.transcriptions.create(**kwargs)
         transcript    = resp.text.strip()
+        # Language fallback order: response attribute, then the hint, then "en".
+        detected_lang = self._normalise_lang(getattr(resp, "language", language or "en") or "en")
+        logger.info(f"Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
         return transcript, detected_lang, "Groq Whisper large-v3"
     def _init_local(self):
+        """Load faster-whisper large-v3 (CPU, int8) into self._local_model; set it to None on any failure."""
         try:
+            # Imported here so a missing `faster_whisper` package is caught by the except below.
             from faster_whisper import WhisperModel
+            print("[Transcriber] Loading faster-whisper large-v3 int8...")
+            self._local_model = WhisperModel("large-v3", device="cpu", compute_type="int8")
+            print("[Transcriber] faster-whisper ready")
         except Exception as e:
+            logger.error(f"Local Whisper init failed: {e}")
             self._local_model = None
+    def _transcribe_local(self, audio_path, language=None):
+        """
+        Transcribe `audio_path` with the local faster-whisper model, loading
+        the model first if it has not been loaded yet.
+
+        Returns (transcript_text, detected_language_code,
+        "faster-whisper large-v3 int8 (local)").
+        Raises RuntimeError when the model is still unavailable after the load attempt.
+        """
         t0 = time.time()
+        if self._local_model is None:
+            self._init_local()
         if self._local_model is None:
             raise RuntimeError("No transcription engine available.")
         segments, info = self._local_model.transcribe(
+            audio_path, language=language, beam_size=5,
+            vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))
+        # Segment texts are stripped individually and joined with single spaces.
         transcript    = " ".join(seg.text.strip() for seg in segments).strip()
+        # Language fallback order: model report, then the hint, then "en".
         detected_lang = info.language or language or "en"
+        logger.info(f"Local done in {time.time()-t0:.2f}s")
         return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"
+    def _get_duration(self, audio_path):
+        """
+        Return the duration of `audio_path` in seconds as reported by ffprobe.
+
+        Returns 0.0 when the duration cannot be determined (ffprobe missing,
+        unreadable file, non-numeric output). The failure is logged rather
+        than swallowed, because a 0.0 result makes the caller skip chunking.
+        """
+        try:
+            result = subprocess.run([
+                "ffprobe", "-v", "error",
+                "-show_entries", "format=duration",
+                "-of", "default=noprint_wrappers=1:nokey=1",
+                audio_path
+            ], capture_output=True, text=True)
+            return float(result.stdout.strip())
+        except Exception as e:
+            # Deliberately broad: probing is best effort and must never abort
+            # the transcription itself.
+            logger.warning(f"Could not determine duration of {audio_path}: {e}")
+            return 0.0
     @staticmethod
+    def _normalise_lang(raw):
+        """
+        Convert a language name such as "english" into its two-letter code.
+
+        Names listed in the table are matched case-insensitively, ignoring
+        surrounding whitespace. Anything else falls back to its first two
+        characters lower-cased, which leaves values that already are
+        two-letter codes unchanged. Inputs shorter than two characters are
+        returned as they are.
+        """
+        # The fallback truncation is wrong for many names ("bengali" -> "be",
+        # "swedish" -> "sw"), so the common Whisper language names are listed.
+        mapping = {"english":"en","telugu":"te","hindi":"hi",
+                   "tamil":"ta","kannada":"kn","spanish":"es",
+                   "french":"fr","german":"de","japanese":"ja","chinese":"zh",
+                   "bengali":"bn","marathi":"mr","malayalam":"ml",
+                   "gujarati":"gu","punjabi":"pa","urdu":"ur",
+                   "portuguese":"pt","russian":"ru","italian":"it",
+                   "dutch":"nl","korean":"ko","arabic":"ar",
+                   "turkish":"tr","polish":"pl","swedish":"sv",
+                   "indonesian":"id","vietnamese":"vi"}
+        raw = raw.strip()
         return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)