Spaces:

aseelflihan
/

syncmaster6

Sleeping

File size: 17,191 Bytes
# audio_processor.py - Enhanced with AI Translation Support

import os
from dotenv import load_dotenv
import tempfile
from typing import List, Dict, Optional, Tuple
import json
import traceback

# --- DEFINITIVE NUMBA FIX ---
# This MUST be done BEFORE importing librosa
os.environ["NUMBA_CACHE_DIR"] = "/tmp"

# Now, import librosa safely
import librosa
# --- END OF FIX ---

import google.generativeai as genai
from translator import AITranslator
import requests
from google.api_core import exceptions as google_exceptions

class AudioProcessor:
    def __init__(self):
        self.translator = None
        self.init_error = None
        self._initialize_translator()

    def _initialize_translator(self):
        """Initialize AI translator for multi-language support"""
        try:
            self.translator = AITranslator()
            if self.translator.init_error:
                print(f"--- WARNING: Translator has initialization error: {self.translator.init_error} ---")
        except Exception as e:
            print(f"--- WARNING: Translator initialization failed: {str(e)} ---")
            self.translator = None
    
    def transcribe_audio(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Transcribes audio. Returns (text, error_message).
        Uses Gemini first (if available), then falls back to Groq Whisper.
        """
        if not os.path.exists(audio_file_path):
            return None, f"--- ERROR: Audio file for transcription not found at: {audio_file_path} ---"

        # Try Gemini first if available
        gemini_err = None
        try:
            if self.translator and self.translator.model:
                audio_file = genai.upload_file(path=audio_file_path)
                prompt = (
                    "You are an ASR system. Transcribe the audio accurately. "
                    "Auto-detect the spoken language and return ONLY the verbatim transcript in that same language. "
                    "Do not translate. Do not add labels or timestamps."
                )
                response = self.translator.model.generate_content([prompt, audio_file])
                if response and hasattr(response, 'text') and response.text:
                    return response.text.strip(), None
                else:
                    gemini_err = "--- WARNING: Gemini returned an empty response for transcription. ---"
        except google_exceptions.ResourceExhausted:
            gemini_err = "--- QUOTA ERROR: You have exceeded the daily free usage limit for the AI service. Please wait for your quota to reset (usually within 24 hours) or upgrade your Google AI plan. ---"
        except Exception:
            gemini_err = f"--- FATAL ERROR during Gemini transcription: {traceback.format_exc()} ---"

        # Fallback: Groq Whisper
        text, groq_err = self._transcribe_with_groq(audio_file_path)
        if text:
            return text, None

        # If all failed
        combined_err = groq_err or gemini_err or "--- ERROR: No transcription provider available. ---"
        return None, combined_err

    def _transcribe_with_groq(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """Transcribe using Groq Whisper-compatible endpoint. Returns (text, error)."""
        try:
            load_dotenv()
            groq_key = os.getenv("GROQ_API_KEY")
            if not groq_key:
                return None, "--- ERROR: GROQ_API_KEY not set. ---"
            model = os.getenv("GROQ_WHISPER_MODEL", "whisper-large-v3")
            url = "https://api.groq.com/openai/v1/audio/transcriptions"
            headers = {"Authorization": f"Bearer {groq_key}"}
            # Guess mime type by extension
            filename = os.path.basename(audio_file_path)
            mime = "audio/wav"
            if filename.lower().endswith(".mp3"):
                mime = "audio/mpeg"
            elif filename.lower().endswith(".m4a"):
                mime = "audio/mp4"
            data = {
                "model": model,
                "response_format": "json",
            }
            with open(audio_file_path, "rb") as f:
                files = {"file": (filename, f, mime)}
                resp = requests.post(url, headers=headers, files=files, data=data, timeout=60)
            if not resp.ok:
                try:
                    err = resp.json()
                except Exception:
                    err = {"error": resp.text}
                return None, f"--- ERROR: Groq transcription error {resp.status_code}: {err} ---"
            out = resp.json()
            text = out.get("text")
            if not text:
                return None, "--- ERROR: Groq transcription returned no text. ---"
            return text.strip(), None
        except Exception:
            return None, f"--- FATAL ERROR during Groq transcription: {traceback.format_exc()} ---"

    def get_audio_duration(self, audio_file_path: str) -> Tuple[Optional[float], Optional[str]]:
        """
        Gets audio duration. Returns (duration, error_message).
        """
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for duration not found at: {audio_file_path} ---"
            
            duration = librosa.get_duration(path=audio_file_path)
            if duration is None or duration < 0.1:
                return None, f"--- ERROR: librosa returned an invalid duration: {duration}s ---"
            return duration, None
        except Exception as e:
            error_msg = f"--- FATAL ERROR getting audio duration with librosa: {traceback.format_exc()} ---"
            return None, error_msg
    
    def get_word_timestamps(self, audio_file_path: str) -> Tuple[List[Dict], List[str]]:
        """
        Generates timestamps. Returns (timestamps, log_messages).
        """
        logs = ["--- INFO: Starting get_word_timestamps... ---"]
        
        transcription, error = self.transcribe_audio(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Transcription successful. Text: '{transcription[:50]}...'")

        audio_duration, error = self.get_audio_duration(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Audio duration successful. Duration: {audio_duration:.2f}s")
        
        words = transcription.split()
        if not words:
            logs.append("--- WARNING: Transcription resulted in zero words. ---")
            return [], logs
        
        logs.append(f"--- INFO: Distributing {len(words)} words across the duration. ---")
        word_timestamps = []
        total_words = len(words)
        usable_duration = max(0, audio_duration - 1.0)
        
        for i, word in enumerate(words):
            start_time = 0.5 + (i * (usable_duration / total_words))
            end_time = 0.5 + ((i + 1) * (usable_duration / total_words))
            word_timestamps.append({'word': word.strip(), 'start': round(start_time, 3), 'end': round(end_time, 3)})
        
        logs.append(f"--- SUCCESS: Generated {len(word_timestamps)} word timestamps. ---")
        return word_timestamps, logs
    
    def get_word_timestamps_with_translation(self, audio_file_path: str, target_language: str = 'ar') -> Tuple[Dict, List[str]]:
        """
        Enhanced function that provides both transcription and translation
        
        Args:
            audio_file_path: Path to audio file
            target_language: Target language for translation ('ar' for Arabic)
        
        Returns:
            Tuple of (result_dict, log_messages)
            result_dict contains: {
                'original_text': str,
                'translated_text': str,
                'word_timestamps': List[Dict],
                'translated_timestamps': List[Dict],
                'language_detected': str,
                'target_language': str
            }
        """
        logs = ["--- INFO: Starting enhanced transcription with translation... ---"]
        
        # Get original transcription and timestamps
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        
        if not word_timestamps:
            # Fallback: try plain transcription (Gemini → Groq) then synthesize timestamps
            logs.append("--- INFO: Falling back to plain transcription because timestamps are empty. ---")
            plain_text, err = self.transcribe_audio(audio_file_path)
            if not plain_text:
                logs.append(err or "--- ERROR: Plain transcription fallback failed ---")
                return {}, logs
            logs.append("--- SUCCESS: Plain transcription fallback succeeded. ---")
            # Synthesize naive word-level timestamps across duration
            try:
                duration, derr = self.get_audio_duration(audio_file_path)
                if derr:
                    logs.append(derr)
                    duration = 0.0
                words = plain_text.split()
                if not words:
                    logs.append("--- WARNING: Fallback transcription produced zero words. ---")
                    return {}, logs
                if duration and duration > 0.1:
                    usable_duration = max(0, duration - 1.0)
                    start_offset = 0.5
                else:
                    # If duration not available, assume ~0.4s per word
                    usable_duration = 0.4 * max(1, len(words))
                    start_offset = 0.0
                word_timestamps = []
                total_words = len(words)
                for i, w in enumerate(words):
                    start_time = start_offset + (i * (usable_duration / total_words))
                    end_time = start_offset + ((i + 1) * (usable_duration / total_words))
                    word_timestamps.append({'word': w.strip(), 'start': round(start_time, 3), 'end': round(end_time, 3)})
                logs.append(f"--- INFO: Synthesized {len(word_timestamps)} timestamps from fallback transcript. ---")
            except Exception:
                logs.append(f"--- FATAL ERROR synthesizing timestamps: {traceback.format_exc()} ---")
                return {}, logs
        
        # Extract original text
        original_text = " ".join([d['word'] for d in word_timestamps])
        logs.append(f"--- INFO: Original transcription: '{original_text[:50]}...' ---")
        
        # Initialize result dictionary
        result = {
            'original_text': original_text,
            'translated_text': '',
            'word_timestamps': word_timestamps,
            'translated_timestamps': [],
            'language_detected': 'unknown',
            'target_language': target_language,
            'translation_success': False
        }
        
        # Check if translator is available
        if not self.translator:
            logs.append("--- WARNING: Translator not available, returning original text only ---")
            result['translated_text'] = original_text
            return result, logs
        
        try:
            # Translate the text
            translated_text, translation_error = self.translator.translate_text(
                original_text, 
                target_language=target_language
            )
            
            if translated_text:
                result['translated_text'] = translated_text
                result['translation_success'] = True
                logs.append(f"--- SUCCESS: Translation completed: '{translated_text[:50]}...' ---")
                
                # Create translated timestamps by mapping words
                translated_timestamps = self._create_translated_timestamps(
                    word_timestamps, 
                    original_text, 
                    translated_text
                )
                result['translated_timestamps'] = translated_timestamps
                logs.append(f"--- INFO: Created {len(translated_timestamps)} translated timestamps ---")
                
            else:
                logs.append(f"--- ERROR: Translation failed: {translation_error} ---")
                result['translated_text'] = original_text  # Fallback to original
                result['translated_timestamps'] = word_timestamps  # Use original timestamps
                
        except Exception as e:
            error_msg = f"--- FATAL ERROR during translation process: {traceback.format_exc()} ---"
            logs.append(error_msg)
            result['translated_text'] = original_text  # Fallback
            result['translated_timestamps'] = word_timestamps
        
        return result, logs
    
    def _create_translated_timestamps(self, original_timestamps: List[Dict], original_text: str, translated_text: str) -> List[Dict]:
        """
        Create timestamps for translated text by proportional mapping
        
        Args:
            original_timestamps: Original word timestamps
            original_text: Original transcribed text
            translated_text: Translated text
        
        Returns:
            List of translated word timestamps
        """
        try:
            translated_words = translated_text.split()
            if not translated_words:
                return []
            
            # Get total duration from original timestamps
            if not original_timestamps:
                return []
            
            start_time = original_timestamps[0]['start']
            end_time = original_timestamps[-1]['end']
            total_duration = end_time - start_time
            
            # Create proportional timestamps for translated words
            translated_timestamps = []
            word_count = len(translated_words)
            
            for i, word in enumerate(translated_words):
                # Calculate proportional timing
                word_start = start_time + (i * total_duration / word_count)
                word_end = start_time + ((i + 1) * total_duration / word_count)
                
                translated_timestamps.append({
                    'word': word.strip(),
                    'start': round(word_start, 3),
                    'end': round(word_end, 3)
                })
            
            return translated_timestamps
            
        except Exception as e:
            print(f"--- ERROR creating translated timestamps: {str(e)} ---")
            return []
    
    def batch_translate_transcription(self, audio_file_path: str, target_languages: List[str]) -> Tuple[Dict, List[str]]:
        """
        Transcribe audio and translate to multiple languages
        
        Args:
            audio_file_path: Path to audio file
            target_languages: List of target language codes
        
        Returns:
            Tuple of (results_dict, log_messages)
        """
        logs = ["--- INFO: Starting batch translation process... ---"]
        
        # Get original transcription
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        
        if not word_timestamps:
            return {}, logs
        
        original_text = " ".join([d['word'] for d in word_timestamps])
        
        # Initialize results
        results = {
            'original': {
                'text': original_text,
                'timestamps': word_timestamps,
                'language': 'detected'
            },
            'translations': {}
        }
        
        # Translate to each target language
        if self.translator:
            for lang_code in target_languages:
                try:
                    translated_text, error = self.translator.translate_text(original_text, lang_code)
                    if translated_text:
                        translated_timestamps = self._create_translated_timestamps(
                            word_timestamps, original_text, translated_text
                        )
                        results['translations'][lang_code] = {
                            'text': translated_text,
                            'timestamps': translated_timestamps,
                            'success': True
                        }
                        logs.append(f"--- SUCCESS: Translation to {lang_code} completed ---")
                    else:
                        results['translations'][lang_code] = {
                            'text': original_text,
                            'timestamps': word_timestamps,
                            'success': False,
                            'error': error
                        }
                        logs.append(f"--- ERROR: Translation to {lang_code} failed: {error} ---")
                except Exception as e:
                    logs.append(f"--- FATAL ERROR translating to {lang_code}: {str(e)} ---")
        else:
            logs.append("--- WARNING: Translator not available for batch translation ---")
        
        return results, logs