Spaces:

ruvatron
/

voice-intelligence

Sleeping

App Files Files Community

unknownfriend00007 committed on Apr 16

Commit

2ffbd85

verified ·

1 Parent(s): 48c3b28

Upload 8 files

Browse files

Files changed (3) hide show

config.py +3 -0
inference.py +230 -58
requirements.txt +1 -0

config.py CHANGED Viewed

@@ -62,6 +62,9 @@ class VoiceRuntimeConfig:
     diarization_min_speakers: int = int(os.environ.get("VOICE_DIARIZATION_MIN_SPEAKERS", "0"))
     diarization_max_speakers: int = int(os.environ.get("VOICE_DIARIZATION_MAX_SPEAKERS", "0"))
     @classmethod
     def from_env(cls) -> "VoiceRuntimeConfig":
         return cls()

     diarization_min_speakers: int = int(os.environ.get("VOICE_DIARIZATION_MIN_SPEAKERS", "0"))
     diarization_max_speakers: int = int(os.environ.get("VOICE_DIARIZATION_MAX_SPEAKERS", "0"))
+    groq_api_key: str = os.environ.get("GROQ_API_KEY", "")
+    groq_model_id: str = os.environ.get("GROQ_MODEL_ID", "whisper-large-v3-turbo")
     @classmethod
     def from_env(cls) -> "VoiceRuntimeConfig":
         return cls()

inference.py CHANGED Viewed

@@ -1,58 +1,230 @@
-from __future__ import annotations
-import threading
-from typing import Any
-from faster_whisper import WhisperModel
-try:
-    from .config import VoiceRuntimeConfig
-except ImportError:  # HF flat-root execution fallback
-    from config import VoiceRuntimeConfig
-class WhisperRuntime:
-    _lock = threading.Lock()
-    _model: WhisperModel | None = None
-    _loaded_id: str | None = None
-    @classmethod
-    def get_model(cls, config: VoiceRuntimeConfig) -> WhisperModel:
-        with cls._lock:
-            if cls._model is not None and cls._loaded_id == config.runtime_model_id:
-                return cls._model
-            cls._model = WhisperModel(
-                config.runtime_model_id,
-                device="cpu",
-                compute_type=config.compute_type,
-                cpu_threads=config.cpu_threads,
-                num_workers=1,
-            )
-            cls._loaded_id = config.runtime_model_id
-            return cls._model
-def transcribe(
-    wav_path: str,
-    config: VoiceRuntimeConfig,
-    language_hint: str | None,
-) -> tuple[list[Any], str, str]:
-    model = WhisperRuntime.get_model(config)
-    requested_language = None if not language_hint or language_hint == "auto" else language_hint
-    segments_iter, info = model.transcribe(
-        wav_path,
-        task="transcribe",
-        language=requested_language,
-        beam_size=1,
-        best_of=1,
-        temperature=0.0,
-        condition_on_previous_text=False,
-        word_timestamps=True,
-        vad_filter=False,
-    )
-    segments = list(segments_iter)
-    detected_language = (info.language or requested_language or "unknown").lower()
-    language_source = "request" if requested_language else "auto_detect"
-    return segments, detected_language, language_source

+from __future__ import annotations
+import math
+import os
+import tempfile
+import threading
+from typing import Any
+import soundfile as sf
+from faster_whisper import WhisperModel
+try:
+    from .config import VoiceRuntimeConfig
+except ImportError:  # HF flat-root execution fallback
+    from config import VoiceRuntimeConfig
+# ---------------------------------------------------------------------------
+# Local Whisper (CPU fallback)
+# ---------------------------------------------------------------------------
+class WhisperRuntime:
+    """Process-wide holder for at most one loaded faster-whisper model.
+
+    The cached model is keyed by ``config.runtime_model_id``; asking for a
+    different id replaces the cached instance.
+    """
+    _lock = threading.Lock()
+    _model: WhisperModel | None = None
+    _loaded_id: str | None = None
+    @classmethod
+    def get_model(cls, config: VoiceRuntimeConfig) -> WhisperModel:
+        """Return the model for ``config.runtime_model_id``, loading it on a cache miss."""
+        with cls._lock:
+            wanted_id = config.runtime_model_id
+            is_cached = cls._model is not None and cls._loaded_id == wanted_id
+            if not is_cached:
+                # Construction happens under the lock, so concurrent callers
+                # wait for the load and then reuse the same instance.
+                cls._model = WhisperModel(
+                    wanted_id,
+                    device="cpu",
+                    compute_type=config.compute_type,
+                    cpu_threads=config.cpu_threads,
+                    num_workers=1,
+                )
+                cls._loaded_id = wanted_id
+            return cls._model
+def _transcribe_local(
+    wav_path: str,
+    config: VoiceRuntimeConfig,
+    language_hint: str | None,
+) -> tuple[list[Any], str, str]:
+    """Transcribe ``wav_path`` with the locally cached faster-whisper model.
+
+    Returns ``(segments, detected_language, language_source)`` where
+    ``language_source`` is ``"request"`` when an explicit language was given
+    and ``"auto_detect"`` otherwise.
+    """
+    # An empty hint and the literal "auto" both mean "let the model detect".
+    explicit_language = language_hint if language_hint and language_hint != "auto" else None
+    whisper = WhisperRuntime.get_model(config)
+    decode_options: dict[str, Any] = {
+        "task": "transcribe",
+        "language": explicit_language,
+        "beam_size": 1,
+        "best_of": 1,
+        "temperature": 0.0,
+        "condition_on_previous_text": False,
+        "word_timestamps": True,
+        "vad_filter": False,
+    }
+    lazy_segments, info = whisper.transcribe(wav_path, **decode_options)
+    # Materialise the iterator so the caller receives a plain list.
+    segments = list(lazy_segments)
+    reported_language = info.language or explicit_language or "unknown"
+    if explicit_language:
+        language_source = "request"
+    else:
+        language_source = "auto_detect"
+    return segments, reported_language.lower(), language_source
+# ---------------------------------------------------------------------------
+# Groq API path
+# ---------------------------------------------------------------------------
+# Stay safely under Groq's 25 MB per-file limit
+_GROQ_MAX_BYTES = 23 * 1024 * 1024
+class _GroqWord:
+    """Word-level timing record shaped like a faster_whisper ``Word``.
+
+    Exposes ``word``, ``start``, ``end`` and ``probability`` so service.py can
+    consume Groq output without changes.
+    """
+    __slots__ = ("word", "start", "end", "probability")
+    def __init__(self, word: str, start: float, end: float) -> None:
+        self.word, self.start, self.end = word, start, end
+        # Groq reports no per-word confidence, so the field is always None.
+        self.probability = None
+class _GroqSegment:
+    """Segment-level record shaped like a faster_whisper ``Segment``.
+
+    Exposes ``start``, ``end``, ``text`` and ``words`` so service.py can
+    consume Groq output without changes.
+    """
+    __slots__ = ("start", "end", "text", "words")
+    def __init__(self, start: float, end: float, text: str, words: list[_GroqWord]) -> None:
+        self.start, self.end = start, end
+        self.text, self.words = text, words
+def _chunk_wav(wav_path: str, sample_rate: int) -> list[tuple[str, float]]:
+    """Split a WAV file into PCM_16 chunks that each fit within ``_GROQ_MAX_BYTES``.
+
+    Args:
+        wav_path: Path of the audio file to split.
+        sample_rate: Expected sample rate; used only as a fallback when the
+            file does not report one. The rate stored in the file wins so a
+            mismatch cannot mislabel the chunks or skew the offsets.
+
+    Returns:
+        A list of ``(chunk_wav_path, start_time_offset_sec)`` tuples in
+        playback order. All chunks live in one fresh temp directory that the
+        caller must clean up. Empty audio yields ``[]`` and creates nothing.
+
+    Raises:
+        ValueError: If the sample rate is not positive, or if a single second
+            of audio would already exceed ``_GROQ_MAX_BYTES``.
+    """
+    import shutil  # local import: only needed to clean up after a failed write
+    audio, file_rate = sf.read(wav_path, dtype="float32")
+    rate = int(file_rate or sample_rate)
+    if rate <= 0:
+        raise ValueError(f"sample rate must be positive, got {rate}")
+    total_frames = len(audio)
+    if total_frames == 0:
+        # Nothing to send; avoid creating a temp dir nobody could clean up.
+        return []
+    # soundfile returns a 1-D array for mono and (frames, channels) otherwise.
+    channels = 1 if audio.ndim == 1 else int(audio.shape[1])
+    bytes_per_sec = rate * 2 * channels  # PCM_16 = 2 bytes per sample per channel
+    # Cut on whole-second boundaries, as many seconds as the byte budget allows.
+    max_frames = (_GROQ_MAX_BYTES // bytes_per_sec) * rate
+    if max_frames <= 0:
+        raise ValueError("one second of audio exceeds the Groq upload budget")
+    tmp_dir = tempfile.mkdtemp(prefix="groq-chunks-")
+    chunks: list[tuple[str, float]] = []
+    try:
+        for idx, first_frame in enumerate(range(0, total_frames, max_frames)):
+            last_frame = min(first_frame + max_frames, total_frames)
+            chunk_path = os.path.join(tmp_dir, f"chunk_{idx:04d}.wav")
+            sf.write(chunk_path, audio[first_frame:last_frame], rate, subtype="PCM_16")
+            chunks.append((chunk_path, first_frame / rate))
+    except BaseException:
+        # The caller never learns the paths on failure, so remove them here.
+        shutil.rmtree(tmp_dir, ignore_errors=True)
+        raise
+    return chunks
+def _call_groq(
+    wav_path: str,
+    api_key: str,
+    groq_model: str,
+    language_hint: str | None,
+) -> dict:
+    """Send one audio file to the Groq transcription endpoint and return the payload as a dict."""
+    # Deferred import keeps installs without the groq package importable.
+    from groq import Groq
+    request_args: dict[str, Any] = {
+        "model": groq_model,
+        "response_format": "verbose_json",
+        "timestamp_granularities": ["word", "segment"],
+    }
+    # Only pin the language when the caller asked for a specific one.
+    wants_explicit_language = bool(language_hint) and language_hint != "auto"
+    if wants_explicit_language:
+        request_args["language"] = language_hint
+    groq_client = Groq(api_key=api_key)
+    with open(wav_path, "rb") as audio_file:
+        response = groq_client.audio.transcriptions.create(file=audio_file, **request_args)
+    # Prefer the model's own dump method when the response object offers one.
+    if hasattr(response, "model_dump"):
+        return response.model_dump()
+    return dict(response)
+def _groq_segments_from_result(result: dict, time_offset: float) -> list[_GroqSegment]:
+    """Convert one Groq ``verbose_json`` payload into segments shifted by ``time_offset``."""
+    raw_segments: list[dict] = result.get("segments") or []
+    raw_words: list[dict] = result.get("words") or []
+    # Segment bounds do not depend on the word, so compute them once. They are
+    # kept chunk-relative; the offset is added exactly once when emitting.
+    spans: list[tuple[float, float]] = []
+    for seg in raw_segments:
+        seg_start = float(seg.get("start", 0.0))
+        spans.append((seg_start, float(seg.get("end", seg_start))))
+    # Bucket by position rather than by the payload's "id" field, so missing
+    # or duplicated ids cannot attach words to the wrong segment.
+    words_per_segment: list[list[_GroqWord]] = [[] for _ in raw_segments]
+    if spans:
+        for w in raw_words:
+            w_text = str(w.get("word", "")).strip()
+            if not w_text:
+                continue
+            w_start = float(w.get("start", 0.0))
+            w_end = float(w.get("end", w_start))
+            # Largest overlap wins; with no overlap at all the nearest
+            # segment (least negative overlap) wins. Ties go to the earliest.
+            best_index = 0
+            best_overlap = float("-inf")
+            for index, (seg_start, seg_end) in enumerate(spans):
+                overlap = min(w_end, seg_end) - max(w_start, seg_start)
+                if overlap > best_overlap:
+                    best_overlap = overlap
+                    best_index = index
+            words_per_segment[best_index].append(
+                _GroqWord(word=w_text, start=w_start + time_offset, end=w_end + time_offset)
+            )
+    return [
+        _GroqSegment(
+            start=seg_start + time_offset,
+            end=seg_end + time_offset,
+            text=str(seg.get("text", "")).strip(),
+            words=words,
+        )
+        for (seg_start, seg_end), seg, words in zip(spans, raw_segments, words_per_segment)
+    ]
+def _transcribe_groq(
+    wav_path: str,
+    config: VoiceRuntimeConfig,
+    language_hint: str | None,
+) -> tuple[list[Any], str, str]:
+    """Transcribe ``wav_path`` through the Groq API, chunking to respect the upload limit.
+
+    Returns ``(segments, detected_language, language_source)`` where segment
+    and word timestamps are relative to the start of the original file and
+    ``language_source`` is ``"request"`` for an explicit language, otherwise
+    ``"auto_detect"``. Errors from chunking or from the Groq call propagate
+    after the temporary chunk files have been removed.
+    """
+    requested_language = None if not language_hint or language_hint == "auto" else language_hint
+    # Lower-cased to match what the local faster-whisper path returns.
+    # NOTE(review): confirm whether Groq reports an ISO code or a language name.
+    detected_language: str = (requested_language or "unknown").lower()
+    chunks = _chunk_wav(wav_path, config.sample_rate)
+    all_segments: list[_GroqSegment] = []
+    try:
+        for chunk_path, time_offset in chunks:
+            result = _call_groq(chunk_path, config.groq_api_key, config.groq_model_id, language_hint)
+            # Capture language from the first chunk that reports it.
+            if detected_language == "unknown":
+                detected_language = str(result.get("language") or "unknown").lower()
+            all_segments.extend(_groq_segments_from_result(result, time_offset))
+    finally:
+        # Remove every chunk, including ones never reached after a failure,
+        # then the directories that held them.
+        chunk_dirs: set[str] = set()
+        for chunk_path, _ in chunks:
+            chunk_dirs.add(os.path.dirname(chunk_path))
+            try:
+                os.remove(chunk_path)
+            except OSError:
+                pass  # best effort: the file may already be gone
+        for chunk_dir in chunk_dirs:
+            try:
+                os.rmdir(chunk_dir)  # only succeeds when the directory is empty
+            except OSError:
+                pass  # best effort: leave a non-empty or missing directory alone
+    language_source = "request" if requested_language else "auto_detect"
+    return all_segments, detected_language, language_source
+# ---------------------------------------------------------------------------
+# Public entry point — called by service.py
+# ---------------------------------------------------------------------------
+def transcribe(
+    wav_path: str,
+    config: VoiceRuntimeConfig,
+    language_hint: str | None,
+) -> tuple[list[Any], str, str]:
+    """
+    Public entry point: pick the transcription backend from configuration.
+    A non-empty ``config.groq_api_key`` selects the Groq API; otherwise the
+    local faster-whisper model is used. Either way the result is
+    ``(segments, detected_language, language_source)`` with objects that
+    _build_alignment_payload in service.py can consume.
+    """
+    backend = _transcribe_groq if config.groq_api_key else _transcribe_local
+    return backend(wav_path, config, language_hint)

requirements.txt CHANGED Viewed

@@ -5,3 +5,4 @@ faster-whisper>=1.1.1
 numpy>=1.26.0
 soundfile>=0.12.1
 pyannote.audio>=3.3.2

 numpy>=1.26.0
 soundfile>=0.12.1
 pyannote.audio>=3.3.2
+groq>=0.9.0