# clearwave-ai/transcriber.py
"""
Department 2 — Transcriber
Primary : Groq API (Whisper large-v3 on H100) — free tier 14,400 s/day
Fallback : faster-whisper large-v3 int8 (local CPU) if Groq fails or limit reached
✅ UPGRADED:
- Chunking support — splits long audio into 60s pieces automatically
- Groq's limit is 25 MB per file; chunking keeps every upload under it
- Chunks rejoined seamlessly into full transcript
"""
import os
import time
import logging
import subprocess
import tempfile
import shutil
logger = logging.getLogger(__name__)
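# NOTE: chunking and duration probing shell out to the ffmpeg and ffprobe
# binaries, so both must be available on PATH.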
LANG_TO_WHISPER = {
    "auto": None,
    "en": "en",
    "te": "te",
    "hi": "hi",
    "ta": "ta",
    "kn": "kn",
}
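# "auto" maps to None so Whisper runs its own language detection; the other
# entries pass straight through as ISO-639-1 hints.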
CHUNK_DURATION_SEC = 60 # Groq max is 25MB — 60s chunks stay safe
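# Back-of-envelope check (assuming the 16 kHz mono 16-bit PCM chunks produced
# in _transcribe_chunked below): 60 s × 16000 samples/s × 2 bytes ≈ 1.9 MB per
# chunk, far below the 25 MB cap.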
class Transcriber:
    def __init__(self):
        self.groq_key = os.environ.get("GROQ_API_KEY", "")
        self._groq_client = None
        self._local_model = None
        if self.groq_key:
            print("[Transcriber] Groq API key found — primary = Groq Whisper large-v3")
            self._init_groq()
        else:
            print("[Transcriber] No GROQ_API_KEY — local Whisper loads on first use")
    def transcribe(self, audio_path: str, language: str = "auto"):
        lang_hint = LANG_TO_WHISPER.get(language, None)
        duration = self._get_duration(audio_path)
        print(f"[Transcriber] Audio duration: {duration:.1f}s")
        if duration <= CHUNK_DURATION_SEC:
            return self._transcribe_single(audio_path, lang_hint)
        print(f"[Transcriber] Long audio — splitting into {CHUNK_DURATION_SEC}s chunks")
        return self._transcribe_chunked(audio_path, lang_hint, duration)
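    # Long-file path: ffmpeg slices the audio into CHUNK_DURATION_SEC pieces,
    # each piece is transcribed independently (Groq first, local fallback per
    # chunk), and the texts are rejoined in order with single spaces.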
    def _transcribe_chunked(self, audio_path, language, duration):
        tmp_dir = tempfile.mkdtemp()
        chunks = []
        start = 0
        index = 0
        while start < duration:
            chunk_path = os.path.join(tmp_dir, f"chunk_{index:03d}.wav")
            subprocess.run([
                "ffmpeg", "-y", "-i", audio_path,
                "-ss", str(start), "-t", str(CHUNK_DURATION_SEC),
                "-acodec", "pcm_s16le", "-ar", "16000", "-ac", "1",
                chunk_path
            ], capture_output=True)
            if os.path.exists(chunk_path):
                chunks.append(chunk_path)
            start += CHUNK_DURATION_SEC
            index += 1
        print(f"[Transcriber] Processing {len(chunks)} chunks...")
        transcripts = []
        detected_lang = language or "en"
        method = "unknown"
        for i, chunk in enumerate(chunks):
            print(f"[Transcriber] Chunk {i+1}/{len(chunks)}...")
            try:
                text, lang, m = self._transcribe_single(chunk, language)
                transcripts.append(text.strip())
                detected_lang = lang
                method = m
            except Exception as e:
                logger.warning(f"Chunk {i+1} failed: {e}")
        shutil.rmtree(tmp_dir, ignore_errors=True)
        full = " ".join(t for t in transcripts if t)
        print(f"[Transcriber] Done — {len(full)} chars total")
        return full, detected_lang, f"{method} (chunked {len(chunks)}x)"
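    # Engine selection: try Groq whenever a client was initialised; any error
    # (rate limit, network, oversized upload) logs a warning and drops through
    # to the local faster-whisper model, loading it on first use.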
    def _transcribe_single(self, audio_path, language):
        if self._groq_client is not None:
            try:
                return self._transcribe_groq(audio_path, language)
            except Exception as e:
                logger.warning(f"Groq failed ({e}), falling back to local")
        if self._local_model is None:
            self._init_local()
        return self._transcribe_local(audio_path, language)
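    # Both initialisers import their dependency lazily, so neither `groq` nor
    # `faster_whisper` needs to be installed unless that engine is used.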
    def _init_groq(self):
        try:
            from groq import Groq
            self._groq_client = Groq(api_key=self.groq_key)
            print("[Transcriber] Groq client initialised")
        except Exception as e:
            logger.warning(f"Groq init failed: {e}")
            self._groq_client = None
    def _transcribe_groq(self, audio_path, language=None):
        t0 = time.time()
        with open(audio_path, "rb") as f:
            kwargs = dict(file=f, model="whisper-large-v3",
                          response_format="verbose_json", temperature=0.0)
            if language:
                kwargs["language"] = language
            resp = self._groq_client.audio.transcriptions.create(**kwargs)
        transcript = resp.text.strip()
        detected_lang = self._normalise_lang(getattr(resp, "language", language or "en") or "en")
        logger.info(f"Groq done in {time.time()-t0:.2f}s, lang={detected_lang}")
        return transcript, detected_lang, "Groq Whisper large-v3"
    def _init_local(self):
        try:
            from faster_whisper import WhisperModel
            print("[Transcriber] Loading faster-whisper large-v3 int8...")
            self._local_model = WhisperModel("large-v3", device="cpu", compute_type="int8")
            print("[Transcriber] faster-whisper ready")
        except Exception as e:
            logger.error(f"Local Whisper init failed: {e}")
            self._local_model = None
    def _transcribe_local(self, audio_path, language=None):
        t0 = time.time()
        if self._local_model is None:
            self._init_local()
        if self._local_model is None:
            raise RuntimeError("No transcription engine available.")
        segments, info = self._local_model.transcribe(
            audio_path, language=language, beam_size=5,
            vad_filter=True, vad_parameters=dict(min_silence_duration_ms=500))
        transcript = " ".join(seg.text.strip() for seg in segments).strip()
        detected_lang = info.language or language or "en"
        logger.info(f"Local done in {time.time()-t0:.2f}s")
        return transcript, detected_lang, "faster-whisper large-v3 int8 (local)"
    def _get_duration(self, audio_path):
        try:
            result = subprocess.run([
                "ffprobe", "-v", "error",
                "-show_entries", "format=duration",
                "-of", "default=noprint_wrappers=1:nokey=1",
                audio_path
            ], capture_output=True, text=True)
            return float(result.stdout.strip())
        except Exception:
            return 0.0
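    # Some backends report a spelled-out language name ("english"), others a
    # code ("en"); map known names to ISO-639-1 and otherwise fall back to
    # the first two letters.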
    @staticmethod
    def _normalise_lang(raw):
        mapping = {"english": "en", "telugu": "te", "hindi": "hi",
                   "tamil": "ta", "kannada": "kn", "spanish": "es",
                   "french": "fr", "german": "de", "japanese": "ja", "chinese": "zh"}
        return mapping.get(raw.lower(), raw[:2].lower() if len(raw) >= 2 else raw)
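
# Minimal usage sketch (not part of the pipeline): transcribe one file from
# the command line. "sample.wav" below is only a placeholder default; pass a
# real path as the first argument.
if __name__ == "__main__":
    import sys
    logging.basicConfig(level=logging.INFO)
    path = sys.argv[1] if len(sys.argv) > 1 else "sample.wav"
    text, lang, method = Transcriber().transcribe(path, language="auto")
    print(f"[{method}] lang={lang}\n{text}")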