import os

# Set cache-related environment variables before importing transformers so they take effect
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid warnings
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"  # Use tmp for HF Spaces
os.environ["HF_HOME"] = "/tmp/huggingface"  # Cache location

import gradio as gr
import time
import torch
import gc
import psutil
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, VitsModel, VitsTokenizer
import soundfile as sf
import librosa
import tempfile
import google.generativeai as genai
from dotenv import load_dotenv

# Try to load .env file as fallback (for local development)
# HF Spaces will use secrets directly, so this won't override them
load_dotenv()

def get_memory_usage():
    """Get current memory usage in MB"""
    process = psutil.Process(os.getpid())
    return process.memory_info().rss / 1024 / 1024

def log_memory(context=""):
    """Log current memory usage"""
    memory_mb = get_memory_usage()
    print(f"Memory usage {context}: {memory_mb:.1f} MB")

class LatinConversationBot:
    def __init__(self):
        log_memory("at initialization start")
        
        # Force CPU-only to reduce memory usage on Hugging Face Spaces
        self.device = "cpu"
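        # Per-message caches: message ID -> synthesized audio path / response text (used by the replay dropdown)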
        self.message_audio = {}
        self.message_texts = {}
        
        # Initialize Gemini using HF Spaces secret or .env fallback
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            # More helpful error message for both HF Spaces and local dev
            raise ValueError(
                "GEMINI_API_KEY not found!\n"
                "For Hugging Face Spaces:\n"
                "  1. Go to your Space settings\n"
                "  2. Click on 'Repository secrets'\n" 
                "  3. Add 'GEMINI_API_KEY' with your API key\n"
                "For Local Development:\n"
                "  1. Create a .env file in the project root\n"
                "  2. Add: GEMINI_API_KEY=your_api_key_here"
            )
        genai.configure(api_key=api_key)
        self.gemini_model = genai.GenerativeModel('gemini-flash-latest')
        
        # Model containers
        self.asr_processor = None
        self.asr_model = None
        self.tts_model = None
        self.tts_tokenizer = None
        self.models_loaded = {"asr": False, "tts": False}
        
        print(f"Bot initialized on device: {self.device}")
        
        # Pre-load models at startup for faster response
        try:
            print("πŸš€ Starting model pre-loading...")
            self._preload_models()
            print("βœ… All models loaded successfully!")
        except Exception as e:
            print(f"⚠️ Model pre-loading failed: {e}")
            print("Models will be loaded on-demand")
        
        log_memory("after initialization")
    
    def _preload_models(self):
        """Pre-load models at startup but manage memory efficiently"""
        try:
            # Load ASR first with optimizations
            print("πŸ“₯ Loading ASR models...")
            self.asr_processor = AutoProcessor.from_pretrained(
                "ken-z/latin_whisper-small",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["asr"] = True
            log_memory("after ASR loading")
            
            # Load TTS with optimizations
            print("🎡 Loading TTS models...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["tts"] = True
            log_memory("after TTS loading")
            
        except Exception as e:
            print(f"Error in model loading: {e}")
            # Fallback to lazy loading
            self.models_loaded = {"asr": False, "tts": False}
            raise e
    
    def _ensure_asr_loaded(self):
        """Ensure ASR models are loaded"""
        if not self.models_loaded["asr"]:
            print("Loading ASR models on-demand...")
            self.asr_processor = AutoProcessor.from_pretrained("ken-z/latin_whisper-small")
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small", 
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["asr"] = True
    
    def _ensure_tts_loaded(self):
        """Ensure TTS models are loaded"""
        if not self.models_loaded["tts"]:
            print("Loading TTS models on-demand...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained("Ken-Z/latin_SpeechT5")
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["tts"] = True
    
    def _cleanup_models(self):
        """Free up memory by clearing unused models"""
        log_memory("before cleanup")
        if self.asr_model is not None:
            del self.asr_model
            self.asr_model = None
            self.models_loaded["asr"] = False
        if self.asr_processor is not None:
            del self.asr_processor
            self.asr_processor = None
        if self.tts_model is not None:
            del self.tts_model
            self.tts_model = None
            self.models_loaded["tts"] = False
        if self.tts_tokenizer is not None:
            del self.tts_tokenizer
            self.tts_tokenizer = None
        gc.collect()
        log_memory("after cleanup")
        print("Models cleaned up from memory")
    
    def transcribe_audio(self, audio_path):
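        """Transcribe a Latin audio file with the Latin-fine-tuned Whisper model and return the text."""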
        try:
            # Ensure ASR models are loaded
            self._ensure_asr_loaded()
            
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(self.device)
            with torch.no_grad():
                predicted_ids = self.asr_model.generate(input_features)
                result = self.asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()
            
            # Clean up tensors but keep models loaded
            del input_features, predicted_ids
            gc.collect()
            
            return result
        except Exception as e:
            print(f"ASR Error: {str(e)}")
            return f"Error: {str(e)}"
    
    def _call_gemini(self, prompt):
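        """Send a prompt to Gemini and return the stripped response text, or an error string on failure."""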
        try:
            return self.gemini_model.generate_content(prompt).text.strip()
        except Exception as e:
            print(f"Gemini API error: {e}")
            return "Error: Gemini API not available"
    
    def generate_response(self, text):
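        """Ask Gemini for a short conversational reply in Classical Latin."""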
        prompt = f"""You are a Latin conversation bot. Respond ONLY in Latin, keep responses to 1-2 sentences, use proper Classical Latin grammar with proper diacritics, and be conversational.

Examples: "Salve" → "Salve! Quid agis hodie?", "Hello" → "Salve! Latine loquere, quaeso!"

User: {text}
Response:"""
        return self._call_gemini(prompt)
    
    def improve_latin_grammar(self, text):
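        """Ask Gemini to correct the Latin text and return a dict with 'corrected' text and an 'explanation'."""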
        prompt = f"""Fix Latin grammar, diacritics, and word order. Format:
CORRECTED: [corrected text]
EXPLANATION: [brief explanation of fixes only]

Text: {text}"""
        
        response = self._call_gemini(prompt)
        
        # Parse response
        corrected = explanation = ""
        for line in response.split('\n'):
            if line.startswith("CORRECTED:"):
                corrected = line[10:].strip()
            elif line.startswith("EXPLANATION:"):
                explanation = line[12:].strip()
        
        return {
            "corrected": corrected or text,
            "explanation": explanation or "No explanation provided."
        }
    
    def translate_latin(self, text, target_language):
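        """Translate the Latin text into the target language via Gemini, returning only the translation."""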
        prompt = f"""Translate this Latin text to {target_language}. Return ONLY the translation, no explanations.

Latin text: {text}
{target_language} translation:"""
        return self._call_gemini(prompt)
    
    def synthesize_speech(self, text):
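        """Synthesize speech for the Latin text with the VITS model and return the path to a temporary WAV file."""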
        try:
            # Ensure TTS models are loaded
            self._ensure_tts_loaded()
            
            inputs = self.tts_tokenizer(text, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                speech = self.tts_model(**inputs).waveform.squeeze().cpu().numpy()
            
            # Clean up tensors but keep models loaded
            del inputs
            gc.collect()
            
            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                # Write at the model's configured sampling rate instead of hardcoding 16 kHz
                sampling_rate = getattr(self.tts_model.config, "sampling_rate", 16000)
                sf.write(tmp_file.name, speech, samplerate=sampling_rate)
                return tmp_file.name
        except Exception as e:
            print(f"TTS error: {e}")
            return None
    

def add_message(history, message):
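    """Append the user's turn to the chat history; uploaded or recorded audio is transcribed first."""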
    for file_info in message["files"]:
        file_path = file_info.path if hasattr(file_info, 'path') else file_info
        if file_path.endswith(('.wav', '.mp3', '.m4a', '.ogg', '.flac')):
            transcription = bot_instance.transcribe_audio(file_path)
            history.append({"role": "user", "content": f"🎀 {transcription}"})
    
    if message["text"] and message["text"].strip():
        history.append({"role": "user", "content": message["text"]})
    
    return history, gr.MultimodalTextbox(value=None, interactive=False)

def get_dropdown_choices(history):
    """Generate all dropdown choices at once"""
    replay_choices = [(f"πŸ”Š {text[:30]}{'...' if len(text) > 30 else ''}", msg_id) 
                     for msg_id, text in bot_instance.message_texts.items()]
    improve_choices = [(f"Message {i+1}: {msg['content'].replace('🎀 ', '')[:50]}{'...' if len(msg['content'].replace('🎀 ', '')) > 50 else ''}", i)
                      for i, msg in enumerate(history) if msg["role"] == "user"]
    translate_choices = [(f"Bot {i+1}: {msg['content'][:50]}{'...' if len(msg['content']) > 50 else ''}", i)
                        for i, msg in enumerate(history) if msg["role"] == "assistant"]
    return replay_choices, improve_choices, translate_choices

def bot(history):
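    """Generate the Latin reply, synthesize its audio, and rebuild the dropdown choices."""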
    if not history:
        return history, None, gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[])
    
    last_message = history[-1]["content"]
    user_text = last_message.replace("🎤 ", "") if last_message.startswith("🎤 ") else last_message
    
    response_text = bot_instance.generate_response(user_text)
    message_id = f"msg_{len(history)}_{int(time.time())}"
    
    history.append({"role": "assistant", "content": response_text})
    
    audio_file = bot_instance.synthesize_speech(response_text)
    if audio_file:
        bot_instance.message_audio[message_id] = audio_file
        bot_instance.message_texts[message_id] = response_text
    
    replay_choices, improve_choices, translate_choices = get_dropdown_choices(history)
    return history, audio_file, gr.Dropdown(choices=replay_choices), gr.Dropdown(choices=improve_choices), gr.Dropdown(choices=translate_choices)

def improve_message_grammar(history, message_index):
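    """Replace the selected user message with Gemini's corrected Latin and return the explanation."""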
    if not history or message_index < 0 or message_index >= len(history) or history[message_index]["role"] != "user":
        return history, ""
    
    original_text = history[message_index]["content"]
    prefix = "🎀 " if original_text.startswith("🎀 ") else ""
    text_to_improve = original_text.replace("🎀 ", "")
    
    improvement_result = bot_instance.improve_latin_grammar(text_to_improve)
    corrected_text = improvement_result["corrected"]
    explanation = improvement_result["explanation"]
    
    if corrected_text and corrected_text != text_to_improve:
        history[message_index]["content"] = f"{prefix}{corrected_text} ✨"
    
    return history, explanation

def clear_all_data():
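    """Reset the chat state, drop cached audio, and unload the models to free memory."""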
    bot_instance.message_audio.clear()
    bot_instance.message_texts.clear()
    # Also clean up models to free memory
    bot_instance._cleanup_models()
    print("All data and models cleared from memory")
    return [], None, gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[])

# Initialize the bot instance (pre-loads the ASR and TTS models at startup)
print("🚀 Initializing Latin Conversation Bot...")
bot_instance = LatinConversationBot()

with gr.Blocks(title="πŸ›οΈ Latin Conversation Bot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # πŸ›οΈ Latin Conversation Bot
    Speak or type in Latin for AI-powered conversations with speech synthesis and grammar improvement!
    """)

    
    chatbot = gr.Chatbot(type="messages", height=400, show_label=False)
    
    chat_input = gr.MultimodalTextbox(
        interactive=True, file_types=["audio"], placeholder="🎤 Record or type in Latin...",
        show_label=False, sources=["microphone", "upload"]
    )
    
    with gr.Row():
        audio_output = gr.Audio(label="🔊 Bot Response", autoplay=True, scale=2)
        replay_dropdown = gr.Dropdown(label="🔄 Replay Message", choices=[], scale=1)
    
    with gr.Row():
        improve_dropdown = gr.Dropdown(label="✨ Select Message to Improve", choices=[], scale=2)
        improve_btn = gr.Button("✨ Improve Grammar", size="sm", variant="secondary", scale=1)
    
    grammar_explanation = gr.Textbox(label="📚 Grammar Explanation", interactive=False, visible=False)
    
    with gr.Row():
        translate_dropdown = gr.Dropdown(label="🌍 Select Bot Message to Translate", choices=[], scale=2)
        language_dropdown = gr.Dropdown(
            label="Target Language", 
            choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Chinese", "Japanese"], 
            value="English", 
            scale=1
        )
        translate_btn = gr.Button("🌍 Translate", size="sm", variant="secondary", scale=1)
    
    translation_output = gr.Textbox(label="📝 Translation", interactive=False, visible=False)
    
    clear_btn = gr.Button("🗑️ Clear", size="sm")

    # Event handlers
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, [chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])
    
    replay_dropdown.change(
        lambda msg_id: bot_instance.message_audio.get(msg_id) if msg_id else None,
        inputs=[replay_dropdown], outputs=[audio_output]
    )
    
    clear_btn.click(clear_all_data, outputs=[chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])
    
    def improve_selected_message(history, selected_index):
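        """Apply grammar correction to the selected user message and show the explanation box when useful."""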
        if selected_index is None:
            _, improve_choices, _ = get_dropdown_choices(history)
            return history, gr.Dropdown(choices=improve_choices), gr.Textbox(visible=False)
        
        improved_history, explanation = improve_message_grammar(history, selected_index)
        _, improve_choices, _ = get_dropdown_choices(improved_history)
        
        show_explanation = bool(explanation) and explanation not in ("No explanation provided.", "No corrections needed.")
        return improved_history, gr.Dropdown(choices=improve_choices), gr.Textbox(value=explanation if show_explanation else "", visible=show_explanation)
    
    def translate_selected_message(history, selected_index, target_language):
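        """Translate the selected bot message and reveal the translation box."""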
        if selected_index is None or not history or selected_index >= len(history) or history[selected_index]["role"] != "assistant":
            return gr.Textbox(visible=False)
        
        latin_text = history[selected_index]["content"]
        translation = bot_instance.translate_latin(latin_text, target_language)
        return gr.Textbox(value=f"Original: {latin_text}\n\n{target_language}: {translation}", visible=True)
    
    improve_btn.click(improve_selected_message, [chatbot, improve_dropdown], [chatbot, improve_dropdown, grammar_explanation])
    translate_btn.click(translate_selected_message, [chatbot, translate_dropdown, language_dropdown], [translation_output])

if __name__ == "__main__":
    # Launch with optimized settings for HF Spaces
    demo.launch(
        server_port=7860,  # Standard HF Spaces port
        share=False,
        show_error=True,
        quiet=False  # Show startup logs
    )