Spaces:

aseelflihan
/

syncmaster5

Sleeping

File size: 13,329 Bytes

88fbdc0

# audio_processor.py - Enhanced with AI Translation Support

import os
from dotenv import load_dotenv
import tempfile
from typing import List, Dict, Optional, Tuple
import json
import traceback

# --- DEFINITIVE NUMBA FIX ---
# This MUST be done BEFORE importing librosa
os.environ["NUMBA_CACHE_DIR"] = "/tmp"

# Now, import librosa safely
import librosa
# --- END OF FIX ---

import google.generativeai as genai
from translator import AITranslator
from google.api_core import exceptions as google_exceptions

class AudioProcessor:
    def __init__(self):
        self.translator = None
        self.init_error = None
        self._initialize_translator()

    def _initialize_translator(self):
        """Initialize AI translator for multi-language support"""
        try:
            self.translator = AITranslator()
            if self.translator.init_error:
                print(f"--- WARNING: Translator has initialization error: {self.translator.init_error} ---")
        except Exception as e:
            print(f"--- WARNING: Translator initialization failed: {str(e)} ---")
            self.translator = None
    
    def transcribe_audio(self, audio_file_path: str) -> Tuple[Optional[str], Optional[str]]:
        """
        Transcribes audio. Returns (text, error_message).
        """
        if not self.translator or not self.translator.model:
            return None, "--- ERROR: Translator model is not available for transcription. ---"
        
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for transcription not found at: {audio_file_path} ---"
            
            audio_file = genai.upload_file(path=audio_file_path)
            
            # Ask the model to auto-detect the spoken language and return the original-language transcript (no translation)
            prompt = (
                "You are an ASR system. Transcribe the audio accurately. "
                "Auto-detect the spoken language and return ONLY the verbatim transcript in that same language. "
                "Do not translate. Do not add labels or timestamps."
            )
            response = self.translator.model.generate_content([prompt, audio_file])

            if response and hasattr(response, 'text') and response.text:
                return response.text.strip(), None
            else:
                return None, "--- WARNING: Gemini returned an empty response for transcription. ---"
                
        except google_exceptions.ResourceExhausted:
            error_msg = "--- QUOTA ERROR: You have exceeded the daily free usage limit for the AI service. Please wait for your quota to reset (usually within 24 hours) or upgrade your Google AI plan. ---"
            return None, error_msg
        except Exception as e:
            error_msg = f"--- FATAL ERROR during transcription: {traceback.format_exc()} ---"
            return None, error_msg

    def get_audio_duration(self, audio_file_path: str) -> Tuple[Optional[float], Optional[str]]:
        """
        Gets audio duration. Returns (duration, error_message).
        """
        try:
            if not os.path.exists(audio_file_path):
                return None, f"--- ERROR: Audio file for duration not found at: {audio_file_path} ---"
            
            duration = librosa.get_duration(path=audio_file_path)
            if duration is None or duration < 0.1:
                return None, f"--- ERROR: librosa returned an invalid duration: {duration}s ---"
            return duration, None
        except Exception as e:
            error_msg = f"--- FATAL ERROR getting audio duration with librosa: {traceback.format_exc()} ---"
            return None, error_msg
    
    def get_word_timestamps(self, audio_file_path: str) -> Tuple[List[Dict], List[str]]:
        """
        Generates timestamps. Returns (timestamps, log_messages).
        """
        logs = ["--- INFO: Starting get_word_timestamps... ---"]
        
        transcription, error = self.transcribe_audio(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Transcription successful. Text: '{transcription[:50]}...'")

        audio_duration, error = self.get_audio_duration(audio_file_path)
        if error:
            logs.append(error)
            return [], logs
        logs.append(f"--- DEBUG: Audio duration successful. Duration: {audio_duration:.2f}s")
        
        words = transcription.split()
        if not words:
            logs.append("--- WARNING: Transcription resulted in zero words. ---")
            return [], logs
        
        logs.append(f"--- INFO: Distributing {len(words)} words across the duration. ---")
        word_timestamps = []
        total_words = len(words)
        usable_duration = max(0, audio_duration - 1.0)
        
        for i, word in enumerate(words):
            start_time = 0.5 + (i * (usable_duration / total_words))
            end_time = 0.5 + ((i + 1) * (usable_duration / total_words))
            word_timestamps.append({'word': word.strip(), 'start': round(start_time, 3), 'end': round(end_time, 3)})
        
        logs.append(f"--- SUCCESS: Generated {len(word_timestamps)} word timestamps. ---")
        return word_timestamps, logs
    
    def get_word_timestamps_with_translation(self, audio_file_path: str, target_language: str = 'ar') -> Tuple[Dict, List[str]]:
        """
        Enhanced function that provides both transcription and translation
        
        Args:
            audio_file_path: Path to audio file
            target_language: Target language for translation ('ar' for Arabic)
        
        Returns:
            Tuple of (result_dict, log_messages)
            result_dict contains: {
                'original_text': str,
                'translated_text': str,
                'word_timestamps': List[Dict],
                'translated_timestamps': List[Dict],
                'language_detected': str,
                'target_language': str
            }
        """
        logs = ["--- INFO: Starting enhanced transcription with translation... ---"]
        
        # Get original transcription and timestamps
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        
        if not word_timestamps:
            logs.append("--- ERROR: No transcription available for translation ---")
            return {}, logs
        
        # Extract original text
        original_text = " ".join([d['word'] for d in word_timestamps])
        logs.append(f"--- INFO: Original transcription: '{original_text[:50]}...' ---")
        
        # Initialize result dictionary
        result = {
            'original_text': original_text,
            'translated_text': '',
            'word_timestamps': word_timestamps,
            'translated_timestamps': [],
            'language_detected': 'unknown',
            'target_language': target_language,
            'translation_success': False
        }
        
        # Check if translator is available
        if not self.translator:
            logs.append("--- WARNING: Translator not available, returning original text only ---")
            result['translated_text'] = original_text
            return result, logs
        
        try:
            # Translate the text
            translated_text, translation_error = self.translator.translate_text(
                original_text, 
                target_language=target_language
            )
            
            if translated_text:
                result['translated_text'] = translated_text
                result['translation_success'] = True
                logs.append(f"--- SUCCESS: Translation completed: '{translated_text[:50]}...' ---")
                
                # Create translated timestamps by mapping words
                translated_timestamps = self._create_translated_timestamps(
                    word_timestamps, 
                    original_text, 
                    translated_text
                )
                result['translated_timestamps'] = translated_timestamps
                logs.append(f"--- INFO: Created {len(translated_timestamps)} translated timestamps ---")
                
            else:
                logs.append(f"--- ERROR: Translation failed: {translation_error} ---")
                result['translated_text'] = original_text  # Fallback to original
                result['translated_timestamps'] = word_timestamps  # Use original timestamps
                
        except Exception as e:
            error_msg = f"--- FATAL ERROR during translation process: {traceback.format_exc()} ---"
            logs.append(error_msg)
            result['translated_text'] = original_text  # Fallback
            result['translated_timestamps'] = word_timestamps
        
        return result, logs
    
    def _create_translated_timestamps(self, original_timestamps: List[Dict], original_text: str, translated_text: str) -> List[Dict]:
        """
        Create timestamps for translated text by proportional mapping
        
        Args:
            original_timestamps: Original word timestamps
            original_text: Original transcribed text
            translated_text: Translated text
        
        Returns:
            List of translated word timestamps
        """
        try:
            translated_words = translated_text.split()
            if not translated_words:
                return []
            
            # Get total duration from original timestamps
            if not original_timestamps:
                return []
            
            start_time = original_timestamps[0]['start']
            end_time = original_timestamps[-1]['end']
            total_duration = end_time - start_time
            
            # Create proportional timestamps for translated words
            translated_timestamps = []
            word_count = len(translated_words)
            
            for i, word in enumerate(translated_words):
                # Calculate proportional timing
                word_start = start_time + (i * total_duration / word_count)
                word_end = start_time + ((i + 1) * total_duration / word_count)
                
                translated_timestamps.append({
                    'word': word.strip(),
                    'start': round(word_start, 3),
                    'end': round(word_end, 3)
                })
            
            return translated_timestamps
            
        except Exception as e:
            print(f"--- ERROR creating translated timestamps: {str(e)} ---")
            return []
    
    def batch_translate_transcription(self, audio_file_path: str, target_languages: List[str]) -> Tuple[Dict, List[str]]:
        """
        Transcribe audio and translate to multiple languages
        
        Args:
            audio_file_path: Path to audio file
            target_languages: List of target language codes
        
        Returns:
            Tuple of (results_dict, log_messages)
        """
        logs = ["--- INFO: Starting batch translation process... ---"]
        
        # Get original transcription
        word_timestamps, transcription_logs = self.get_word_timestamps(audio_file_path)
        logs.extend(transcription_logs)
        
        if not word_timestamps:
            return {}, logs
        
        original_text = " ".join([d['word'] for d in word_timestamps])
        
        # Initialize results
        results = {
            'original': {
                'text': original_text,
                'timestamps': word_timestamps,
                'language': 'detected'
            },
            'translations': {}
        }
        
        # Translate to each target language
        if self.translator:
            for lang_code in target_languages:
                try:
                    translated_text, error = self.translator.translate_text(original_text, lang_code)
                    if translated_text:
                        translated_timestamps = self._create_translated_timestamps(
                            word_timestamps, original_text, translated_text
                        )
                        results['translations'][lang_code] = {
                            'text': translated_text,
                            'timestamps': translated_timestamps,
                            'success': True
                        }
                        logs.append(f"--- SUCCESS: Translation to {lang_code} completed ---")
                    else:
                        results['translations'][lang_code] = {
                            'text': original_text,
                            'timestamps': word_timestamps,
                            'success': False,
                            'error': error
                        }
                        logs.append(f"--- ERROR: Translation to {lang_code} failed: {error} ---")
                except Exception as e:
                    logs.append(f"--- FATAL ERROR translating to {lang_code}: {str(e)} ---")
        else:
            logs.append("--- WARNING: Translator not available for batch translation ---")
        
        return results, logs