# audio_processor.py - Enhanced with AI Translation Support
import os
from dotenv import load_dotenv
import tempfile
from typing import List, Dict, Optional, Tuple
import json
import traceback

# --- DEFINITIVE NUMBA FIX ---
# This MUST be done BEFORE importing librosa
os.environ["NUMBA_CACHE_DIR"] = "/tmp"
# Now, import librosa safely
import librosa
# --- END OF FIX ---

import google.generativeai as genai
from translator import AITranslator
import requests
from google.api_core import exceptions as google_exceptions


class AudioProcessor:
    """Transcribe audio, synthesize word timestamps and translate the result.

    Transcription tries Gemini (through the translator's model) first and
    falls back to the Groq Whisper HTTP endpoint. Word timestamps are NOT
    aligned to the audio: words are spread evenly across the clip duration.
    """

    # Silence assumed at each end of a clip when spreading words over it.
    _EDGE_PADDING_SECONDS = 0.5
    # Per-word length assumed when the clip duration cannot be determined.
    _FALLBACK_SECONDS_PER_WORD = 0.4

    def __init__(self):
        self.translator = None
        # Kept for callers that read it; nothing in this class assigns a
        # value other than None.
        self.init_error = None
        self._initialize_translator()

    def _initialize_translator(self):
        """Initialize AI translator for multi-language support.

        Failures are printed as warnings instead of raised; on an exception
        ``self.translator`` is left as ``None``.
        """
        try:
            self.translator = AITranslator()
            if self.translator.init_error:
                print(f"--- WARNING: Translator has initialization error: {self.translator.init_error} ---")
        except Exception as e:
            print(f"--- WARNING: Translator initialization failed: {str(e)} ---")
            self.translator = None

    @staticmethod
    def _timing_window(duration: float) -> Tuple[float, float]:
        """Return ``(start_offset, usable_span)`` for a clip of ``duration`` seconds.

        Clips longer than twice the edge padding keep the padding at both
        ends. Shorter clips use their full length from 0.0; otherwise the
        usable span would collapse to zero and every word would receive the
        same zero-length timestamp, possibly past the end of the audio.
        """
        padding = AudioProcessor._EDGE_PADDING_SECONDS
        if duration > 2 * padding:
            return padding, duration - 2 * padding
        return 0.0, max(0.0, duration)

    @staticmethod
    def _spread_words_evenly(words: List[str], start_offset: float, span: float) -> List[Dict]:
        """Give each word an equal slice of ``span`` seconds starting at ``start_offset``.

        Returns a list of ``{'word', 'start', 'end'}`` dicts with times
        rounded to 3 decimals; an empty ``words`` list yields ``[]``.
        """
        if not words:
            return []
        step = span / len(words)
        return [
            {
                'word': word.strip(),
                'start': round(start_offset + (index * step), 3),
                'end': round(start_offset + ((index + 1) * step), 3),
            }
            for index, word in enumerate(words)
        ]

    def transcribe_audio(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str], Optional[str]]:
        """
        Transcribes audio. Returns (text, error_message, model_used).
        Uses Gemini first (if available), then falls back to Groq Whisper.

        On success ``error_message`` is None and ``model_used`` is
        "Gemini" or "Groq Whisper". On failure ``text`` and ``model_used``
        are None and ``error_message`` carries the Groq error if there is
        one, otherwise the Gemini error.
        """
        if not os.path.exists(audio_file_path):
            return None, f"--- ERROR: Audio file for transcription not found at: {audio_file_path} ---", None

        # Try Gemini first if available
        gemini_err = None
        try:
            if self.translator and self.translator.model:
                audio_file = genai.upload_file(path=audio_file_path)
                prompt = (
                    "You are an ASR system. Transcribe the audio accurately. "
                    "Auto-detect the spoken language and return ONLY the verbatim transcript in that same language. "
                    "Do not translate. Do not add labels or timestamps."
                )
                response = self.translator.model.generate_content([prompt, audio_file])
                if response and hasattr(response, 'text') and response.text:
                    return response.text.strip(), None, "Gemini"
                else:
                    gemini_err = "--- WARNING: Gemini returned an empty response for transcription. ---"
        except google_exceptions.ResourceExhausted:
            gemini_err = (
                "--- QUOTA ERROR: You have exceeded the daily free usage limit for the AI service. "
                "Please wait for your quota to reset (usually within 24 hours) "
                "or upgrade your Google AI plan. ---"
            )
        except Exception:
            gemini_err = f"--- FATAL ERROR during Gemini transcription: {traceback.format_exc()} ---"

        # Fallback: Groq Whisper
        text, groq_err = self._transcribe_with_groq(audio_file_path)
        if text:
            return text, None, "Groq Whisper"

        # If all failed
        combined_err = groq_err or gemini_err or "--- ERROR: No transcription provider available. ---"
        return None, combined_err, None

    def _transcribe_with_groq(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Transcribe using Groq Whisper-compatible endpoint. Returns (text, error).

        Reads GROQ_API_KEY (required) and GROQ_WHISPER_MODEL (optional,
        default "whisper-large-v3") from the environment after calling
        ``load_dotenv()``. Exactly one element of the returned pair is None.
        """
        try:
            load_dotenv()
            groq_key = os.getenv("GROQ_API_KEY")
            if not groq_key:
                return None, "--- ERROR: GROQ_API_KEY not set. ---"

            model = os.getenv("GROQ_WHISPER_MODEL", "whisper-large-v3")
            url = "https://api.groq.com/openai/v1/audio/transcriptions"
            headers = {"Authorization": f"Bearer {groq_key}"}

            # Guess mime type by extension
            filename = os.path.basename(audio_file_path)
            lowered = filename.lower()
            mime = "audio/wav"
            if lowered.endswith(".mp3"):
                mime = "audio/mpeg"
            elif lowered.endswith(".m4a"):
                mime = "audio/mp4"

            data = {
                "model": model,
                "response_format": "json",
            }
            with open(audio_file_path, "rb") as f:
                files = {"file": (filename, f, mime)}
                resp = requests.post(url, headers=headers, files=files, data=data, timeout=60)

            if not resp.ok:
                # Prefer the structured error body; fall back to raw text.
                try:
                    err = resp.json()
                except Exception:
                    err = {"error": resp.text}
                return None, f"--- ERROR: Groq transcription error {resp.status_code}: {err} ---"

            out = resp.json()
            text = out.get("text")
            if not text:
                return None, "--- ERROR: Groq transcription returned no text. ---"
            return text.strip(), None
        except Exception:
            return None, f"--- FATAL ERROR during Groq transcription: {traceback.format_exc()} ---"

    def get_audio_duration(self, audio_file_path: str) -> Tuple[Optional[float], Optional[str]]:
        """
        Gets audio duration. Returns (duration, error_message).

        A duration of None or below 0.1 seconds is reported as an error.
        """
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for duration not found at: {audio_file_path} ---"

            duration = librosa.get_duration(path=audio_file_path)
            if duration is None or duration < 0.1:
                return None, f"--- ERROR: librosa returned an invalid duration: {duration}s ---"
            return duration, None
        except Exception:
            error_msg = f"--- FATAL ERROR getting audio duration with librosa: {traceback.format_exc()} ---"
            return None, error_msg

    def get_word_timestamps(self, audio_file_path: str) -> Tuple[List[Dict], List[str], Optional[str]]:
        """
        Generates timestamps. Returns (timestamps, log_messages, model_used).

        Timestamps are synthesized by spreading the transcript's
        whitespace-separated words evenly over the clip; they are not
        aligned to the speech. On any failure the timestamp list is empty
        and the reason is appended to the logs.
        """
        logs = ["--- INFO: Starting get_word_timestamps... ---"]

        transcription, error, model_used = self.transcribe_audio(audio_file_path)
        if error:
            logs.append(error)
            return [], logs, model_used
        logs.append(f"--- DEBUG: Transcription successful. Text: '{transcription[:50]}...'")

        audio_duration, error = self.get_audio_duration(audio_file_path)
        if error:
            logs.append(error)
            return [], logs, model_used
        logs.append(f"--- DEBUG: Audio duration successful. Duration: {audio_duration:.2f}s")

        words = transcription.split()
        if not words:
            logs.append("--- WARNING: Transcription resulted in zero words. ---")
            return [], logs, model_used

        logs.append(f"--- INFO: Distributing {len(words)} words across the duration. ---")
        start_offset, usable_duration = self._timing_window(audio_duration)
        word_timestamps = self._spread_words_evenly(words, start_offset, usable_duration)

        logs.append(f"--- SUCCESS: Generated {len(word_timestamps)} word timestamps. ---")
        return word_timestamps, logs, model_used

    def get_word_timestamps_with_translation(self, audio_file_path: str, target_language: str = 'ar') -> Tuple[Dict, List[str]]:
        """
        Enhanced function that provides both transcription and translation

        Args:
            audio_file_path: Path to audio file
            target_language: Target language for translation ('ar' for Arabic)

        Returns:
            Tuple of (result_dict, log_messages)
            result_dict is empty when no transcript could be produced,
            otherwise it contains: {
                'original_text': str,
                'translated_text': str,
                'word_timestamps': List[Dict],
                'translated_timestamps': List[Dict],
                'language_detected': str,
                'target_language': str,
                'translation_success': bool,
                'transcription_model': Optional[str]
            }
            When translation is unavailable or fails, 'translated_text'
            falls back to the original text.
        """
        logs = ["--- INFO: Starting enhanced transcription with translation... ---"]

        # Get original transcription and timestamps
        word_timestamps, transcription_logs, model_used = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)

        if not word_timestamps:
            # Fallback: try plain transcription (Gemini → Groq) then synthesize timestamps
            logs.append("--- INFO: Falling back to plain transcription because timestamps are empty. ---")
            plain_text, err, model_used_fallback = self.transcribe_audio(audio_file_path)
            if model_used_fallback:
                model_used = model_used_fallback
            if not plain_text:
                logs.append(err or "--- ERROR: Plain transcription fallback failed ---")
                return {}, logs
            logs.append(f"--- SUCCESS: Plain transcription fallback succeeded. Model: {model_used}")

            # Synthesize naive word-level timestamps across duration
            try:
                duration, derr = self.get_audio_duration(audio_file_path)
                if derr:
                    logs.append(derr)
                    duration = 0.0

                words = plain_text.split()
                if not words:
                    logs.append("--- WARNING: Fallback transcription produced zero words. ---")
                    return {}, logs

                if duration and duration > 0.1:
                    start_offset, usable_duration = self._timing_window(duration)
                else:
                    # If duration not available, assume ~0.4s per word
                    usable_duration = self._FALLBACK_SECONDS_PER_WORD * max(1, len(words))
                    start_offset = 0.0

                word_timestamps = self._spread_words_evenly(words, start_offset, usable_duration)
                logs.append(f"--- INFO: Synthesized {len(word_timestamps)} timestamps from fallback transcript. ---")
            except Exception:
                logs.append(f"--- FATAL ERROR synthesizing timestamps: {traceback.format_exc()} ---")
                return {}, logs

        # Extract original text
        original_text = " ".join([d['word'] for d in word_timestamps])
        logs.append(f"--- INFO: Original transcription: '{original_text[:50]}...' ---")

        # Initialize result dictionary
        result = {
            'original_text': original_text,
            'translated_text': '',
            'word_timestamps': word_timestamps,
            'translated_timestamps': [],
            'language_detected': 'unknown',
            'target_language': target_language,
            'translation_success': False,
            'transcription_model': model_used
        }

        # Check if translator is available
        if not self.translator:
            logs.append("--- WARNING: Translator not available, returning original text only ---")
            result['translated_text'] = original_text
            return result, logs

        try:
            # Translate the text
            translated_text, translation_error = self.translator.translate_text(
                original_text, target_language=target_language
            )

            if translated_text:
                result['translated_text'] = translated_text
                result['translation_success'] = True
                logs.append(f"--- SUCCESS: Translation completed: '{translated_text[:50]}...' ---")

                # Create translated timestamps by mapping words
                translated_timestamps = self._create_translated_timestamps(
                    word_timestamps, original_text, translated_text
                )
                result['translated_timestamps'] = translated_timestamps
                logs.append(f"--- INFO: Created {len(translated_timestamps)} translated timestamps ---")
            else:
                logs.append(f"--- ERROR: Translation failed: {translation_error} ---")
                result['translated_text'] = original_text  # Fallback to original
                result['translated_timestamps'] = word_timestamps  # Use original timestamps
        except Exception:
            error_msg = f"--- FATAL ERROR during translation process: {traceback.format_exc()} ---"
            logs.append(error_msg)
            result['translated_text'] = original_text  # Fallback
            result['translated_timestamps'] = word_timestamps

        return result, logs

    def _create_translated_timestamps(self, original_timestamps: List[Dict], original_text: str, translated_text: str) -> List[Dict]:
        """
        Create timestamps for translated text by proportional mapping

        The translated words are spread evenly between the start of the
        first original word and the end of the last one.

        Args:
            original_timestamps: Original word timestamps
            original_text: Original transcribed text (not used by the mapping)
            translated_text: Translated text

        Returns:
            List of translated word timestamps; empty if either input is
            empty or an error occurs (the error is printed).
        """
        try:
            translated_words = translated_text.split()
            if not translated_words:
                return []

            # Get total duration from original timestamps
            if not original_timestamps:
                return []
            start_time = original_timestamps[0]['start']
            end_time = original_timestamps[-1]['end']
            total_duration = end_time - start_time

            # Create proportional timestamps for translated words
            word_count = len(translated_words)
            return [
                {
                    'word': word.strip(),
                    'start': round(start_time + (i * total_duration / word_count), 3),
                    'end': round(start_time + ((i + 1) * total_duration / word_count), 3),
                }
                for i, word in enumerate(translated_words)
            ]
        except Exception as e:
            print(f"--- ERROR creating translated timestamps: {str(e)} ---")
            return []

    def batch_translate_transcription(self, audio_file_path: str, target_languages: List[str]) -> Tuple[Dict, List[str]]:
        """
        Transcribe audio and translate to multiple languages

        Args:
            audio_file_path: Path to audio file
            target_languages: List of target language codes

        Returns:
            Tuple of (results_dict, log_messages). results_dict is empty
            when no word timestamps could be produced; otherwise it has an
            'original' entry and a 'translations' dict keyed by language
            code, each with 'text', 'timestamps', 'success' and, on
            failure, 'error'.
        """
        logs = ["--- INFO: Starting batch translation process... ---"]

        # Get original transcription. get_word_timestamps returns three
        # values; unpacking only two raised ValueError on every call.
        word_timestamps, transcription_logs, model_used = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        if not word_timestamps:
            return {}, logs

        original_text = " ".join([d['word'] for d in word_timestamps])

        # Initialize results
        results = {
            'original': {
                'text': original_text,
                'timestamps': word_timestamps,
                'language': 'detected',
                'transcription_model': model_used
            },
            'translations': {}
        }

        # Translate to each target language
        if self.translator:
            for lang_code in target_languages:
                try:
                    translated_text, error = self.translator.translate_text(original_text, lang_code)
                    if translated_text:
                        translated_timestamps = self._create_translated_timestamps(
                            word_timestamps, original_text, translated_text
                        )
                        results['translations'][lang_code] = {
                            'text': translated_text,
                            'timestamps': translated_timestamps,
                            'success': True
                        }
                        logs.append(f"--- SUCCESS: Translation to {lang_code} completed ---")
                    else:
                        results['translations'][lang_code] = {
                            'text': original_text,
                            'timestamps': word_timestamps,
                            'success': False,
                            'error': error
                        }
                        logs.append(f"--- ERROR: Translation to {lang_code} failed: {error} ---")
                except Exception as e:
                    # Record the failure so the language is not silently
                    # missing from the results.
                    results['translations'][lang_code] = {
                        'text': original_text,
                        'timestamps': word_timestamps,
                        'success': False,
                        'error': str(e)
                    }
                    logs.append(f"--- FATAL ERROR translating to {lang_code}: {str(e)} ---")
        else:
            logs.append("--- WARNING: Translator not available for batch translation ---")

        return results, logs