import gradio as gr import time import torch import os import gc import psutil from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, VitsModel, VitsTokenizer import soundfile as sf import librosa import tempfile import google.generativeai as genai from dotenv import load_dotenv # Try to load .env file as fallback (for local development) # HF Spaces will use secrets directly, so this won't override them load_dotenv() # Set environment variables for optimization os.environ["TOKENIZERS_PARALLELISM"] = "false" # Avoid warnings os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache" # Use tmp for HF Spaces os.environ["HF_HOME"] = "/tmp/huggingface" # Cache location def get_memory_usage(): """Get current memory usage in MB""" process = psutil.Process(os.getpid()) return process.memory_info().rss / 1024 / 1024 def log_memory(context=""): """Log current memory usage""" memory_mb = get_memory_usage() print(f"Memory usage {context}: {memory_mb:.1f} MB") class LatinConversationBot: def __init__(self): log_memory("at initialization start") # Force CPU-only to reduce memory usage on Hugging Face Spaces self.device = "cpu" self.message_audio = {} self.message_texts = {} # Initialize Gemini using HF Spaces secret or .env fallback api_key = os.getenv("GEMINI_API_KEY") if not api_key: # More helpful error message for both HF Spaces and local dev raise ValueError( "GEMINI_API_KEY not found!\n" "For Hugging Face Spaces:\n" " 1. Go to your Space settings\n" " 2. Click on 'Repository secrets'\n" " 3. Add 'GEMINI_API_KEY' with your API key\n" "For Local Development:\n" " 1. Create a .env file in the project root\n" " 2. Add: GEMINI_API_KEY=your_api_key_here" ) genai.configure(api_key=api_key) self.gemini_model = genai.GenerativeModel('gemini-flash-latest') # Model containers self.asr_processor = None self.asr_model = None self.tts_model = None self.tts_tokenizer = None self.models_loaded = {"asr": False, "tts": False} print(f"Bot initialized on device: {self.device}") # Pre-load models at startup for faster response try: print("🚀 Starting model pre-loading...") self._preload_models() print("✅ All models loaded successfully!") except Exception as e: print(f"⚠️ Model pre-loading failed: {e}") print("Models will be loaded on-demand") log_memory("after initialization") def _preload_models(self): """Pre-load models at startup but manage memory efficiently""" try: # Load ASR first with optimizations print("📥 Loading ASR models...") self.asr_processor = AutoProcessor.from_pretrained( "ken-z/latin_whisper-small", cache_dir="/tmp/transformers_cache", local_files_only=False ) self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained( "ken-z/latin_whisper-small", torch_dtype=torch.float32, cache_dir="/tmp/transformers_cache", low_cpu_mem_usage=True, # Optimize memory usage local_files_only=False ).to(self.device) self.models_loaded["asr"] = True log_memory("after ASR loading") # Load TTS with optimizations print("🎵 Loading TTS models...") self.tts_tokenizer = VitsTokenizer.from_pretrained( "Ken-Z/latin_SpeechT5", cache_dir="/tmp/transformers_cache", local_files_only=False ) self.tts_model = VitsModel.from_pretrained( "Ken-Z/latin_SpeechT5", torch_dtype=torch.float32, cache_dir="/tmp/transformers_cache", low_cpu_mem_usage=True, # Optimize memory usage local_files_only=False ).to(self.device) self.models_loaded["tts"] = True log_memory("after TTS loading") except Exception as e: print(f"Error in model loading: {e}") # Fallback to lazy loading self.models_loaded = {"asr": False, "tts": False} raise e def _ensure_asr_loaded(self): """Ensure ASR models are loaded""" if not self.models_loaded["asr"]: print("Loading ASR models on-demand...") self.asr_processor = AutoProcessor.from_pretrained("ken-z/latin_whisper-small") self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained( "ken-z/latin_whisper-small", torch_dtype=torch.float32 ).to(self.device) self.models_loaded["asr"] = True def _ensure_tts_loaded(self): """Ensure TTS models are loaded""" if not self.models_loaded["tts"]: print("Loading TTS models on-demand...") self.tts_tokenizer = VitsTokenizer.from_pretrained("Ken-Z/latin_SpeechT5") self.tts_model = VitsModel.from_pretrained( "Ken-Z/latin_SpeechT5", torch_dtype=torch.float32 ).to(self.device) self.models_loaded["tts"] = True def _cleanup_models(self): """Free up memory by clearing unused models""" log_memory("before cleanup") if self.asr_model is not None: del self.asr_model self.asr_model = None self.models_loaded["asr"] = False if self.asr_processor is not None: del self.asr_processor self.asr_processor = None if self.tts_model is not None: del self.tts_model self.tts_model = None self.models_loaded["tts"] = False if self.tts_tokenizer is not None: del self.tts_tokenizer self.tts_tokenizer = None gc.collect() log_memory("after cleanup") print("Models cleaned up from memory") def transcribe_audio(self, audio_path): try: # Ensure ASR models are loaded self._ensure_asr_loaded() audio, _ = librosa.load(audio_path, sr=16000) input_features = self.asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(self.device) with torch.no_grad(): predicted_ids = self.asr_model.generate(input_features) result = self.asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip() # Clean up tensors but keep models loaded del input_features, predicted_ids gc.collect() return result except Exception as e: print(f"ASR Error: {str(e)}") return f"Error: {str(e)}" def _call_gemini(self, prompt): try: return self.gemini_model.generate_content(prompt).text.strip() except Exception as e: print(f"Gemini API error: {e}") return "Error: Gemini API not available" def generate_response(self, text): prompt = f"""You are a Latin conversation bot. Respond ONLY in Latin, keep responses to 1-2 sentences, use proper Classical Latin grammar with proper diacritics, and be conversational. Examples: "Salve" → "Salve! Quid agis hodie?", "Hello" → "Salve! Latine loquere, quaeso!" User: {text} Response:""" return self._call_gemini(prompt) def improve_latin_grammar(self, text): prompt = f"""Fix Latin grammar, diacritics, and word order. Format: CORRECTED: [corrected text] EXPLANATION: [brief explanation of fixes only] Text: {text}""" response = self._call_gemini(prompt) # Parse response corrected = explanation = "" for line in response.split('\n'): if line.startswith("CORRECTED:"): corrected = line[10:].strip() elif line.startswith("EXPLANATION:"): explanation = line[12:].strip() return { "corrected": corrected or text, "explanation": explanation or "No explanation provided." } def translate_latin(self, text, target_language): prompt = f"""Translate this Latin text to {target_language}. Return ONLY the translation, no explanations. Latin text: {text} {target_language} translation:""" return self._call_gemini(prompt) def synthesize_speech(self, text): try: # Ensure TTS models are loaded self._ensure_tts_loaded() inputs = self.tts_tokenizer(text, return_tensors="pt") inputs = {k: v.to(self.device) for k, v in inputs.items()} with torch.no_grad(): speech = self.tts_model(**inputs).waveform.squeeze().cpu().numpy() # Clean up tensors but keep models loaded del inputs gc.collect() with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file: sf.write(tmp_file.name, speech, samplerate=16000) return tmp_file.name except Exception as e: print(f"TTS error: {e}") return None bot_instance = LatinConversationBot() def add_message(history, message): for file_info in message["files"]: file_path = file_info.path if hasattr(file_info, 'path') else file_info if file_path.endswith(('.wav', '.mp3', '.m4a', '.ogg', '.flac')): transcription = bot_instance.transcribe_audio(file_path) history.append({"role": "user", "content": f"🎤 {transcription}"}) if message["text"] and message["text"].strip(): history.append({"role": "user", "content": message["text"]}) return history, gr.MultimodalTextbox(value=None, interactive=False) def get_dropdown_choices(history): """Generate all dropdown choices at once""" replay_choices = [(f"🔊 {text[:30]}{'...' if len(text) > 30 else ''}", msg_id) for msg_id, text in bot_instance.message_texts.items()] improve_choices = [(f"Message {i+1}: {msg['content'].replace('🎤 ', '')[:50]}{'...' if len(msg['content'].replace('🎤 ', '')) > 50 else ''}", i) for i, msg in enumerate(history) if msg["role"] == "user"] translate_choices = [(f"Bot {i+1}: {msg['content'][:50]}{'...' if len(msg['content']) > 50 else ''}", i) for i, msg in enumerate(history) if msg["role"] == "assistant"] return replay_choices, improve_choices, translate_choices def bot(history): if not history: return history, None, gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]) last_message = history[-1]["content"] user_text = last_message.replace("🎤 ", "") if last_message.startswith("🎤 ") else last_message response_text = bot_instance.generate_response(user_text) message_id = f"msg_{len(history)}_{int(time.time())}" history.append({"role": "assistant", "content": response_text}) audio_file = bot_instance.synthesize_speech(response_text) if audio_file: bot_instance.message_audio[message_id] = audio_file bot_instance.message_texts[message_id] = response_text replay_choices, improve_choices, translate_choices = get_dropdown_choices(history) return history, audio_file, gr.Dropdown(choices=replay_choices), gr.Dropdown(choices=improve_choices), gr.Dropdown(choices=translate_choices) def improve_message_grammar(history, message_index): if not history or message_index < 0 or message_index >= len(history) or history[message_index]["role"] != "user": return history, "" original_text = history[message_index]["content"] prefix = "🎤 " if original_text.startswith("🎤 ") else "" text_to_improve = original_text.replace("🎤 ", "") improvement_result = bot_instance.improve_latin_grammar(text_to_improve) corrected_text = improvement_result["corrected"] explanation = improvement_result["explanation"] if corrected_text and corrected_text != text_to_improve: history[message_index]["content"] = f"{prefix}{corrected_text} ✨" return history, explanation def clear_all_data(): bot_instance.message_audio.clear() bot_instance.message_texts.clear() # Also clean up models to free memory bot_instance._cleanup_models() print("All data and models cleared from memory") return [], None, gr.Dropdown(choices=[]), gr.Dropdown(choices=[]), gr.Dropdown(choices=[]) # Initialize the bot instance early print("🚀 Initializing Latin Conversation Bot...") bot_instance = LatinConversationBot() with gr.Blocks(title="🏛️ Latin Conversation Bot", theme=gr.themes.Soft()) as demo: gr.Markdown(""" # 🏛️ Latin Conversation Bot Speak or type in Latin for AI-powered conversations with speech synthesis and grammar improvement! """) chatbot = gr.Chatbot(type="messages", height=400, show_label=False) chat_input = gr.MultimodalTextbox( interactive=True, file_types=["audio"], placeholder="🎤 Record or type in Latin...", show_label=False, sources=["microphone", "upload"] ) with gr.Row(): audio_output = gr.Audio(label="🔊 Bot Response", autoplay=True, scale=2) replay_dropdown = gr.Dropdown(label="🔄 Replay Message", choices=[], scale=1) with gr.Row(): improve_dropdown = gr.Dropdown(label="✨ Select Message to Improve", choices=[], scale=2) improve_btn = gr.Button("✨ Improve Grammar", size="sm", variant="secondary", scale=1) grammar_explanation = gr.Textbox(label="📚 Grammar Explanation", interactive=False, visible=False) with gr.Row(): translate_dropdown = gr.Dropdown(label="🌍 Select Bot Message to Translate", choices=[], scale=2) language_dropdown = gr.Dropdown( label="Target Language", choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Chinese", "Japanese"], value="English", scale=1 ) translate_btn = gr.Button("🌍 Translate", size="sm", variant="secondary", scale=1) translation_output = gr.Textbox(label="📝 Translation", interactive=False, visible=False) clear_btn = gr.Button("🗑️ Clear", size="sm") # Event handlers chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input]) bot_msg = chat_msg.then(bot, chatbot, [chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown]) bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input]) replay_dropdown.change( lambda msg_id: bot_instance.message_audio.get(msg_id) if msg_id else None, inputs=[replay_dropdown], outputs=[audio_output] ) clear_btn.click(clear_all_data, outputs=[chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown]) def improve_selected_message(history, selected_index): if selected_index is None: _, improve_choices, _ = get_dropdown_choices(history) return history, gr.Dropdown(choices=improve_choices), gr.Textbox(visible=False) improved_history, explanation = improve_message_grammar(history, selected_index) _, improve_choices, _ = get_dropdown_choices(improved_history) show_explanation = explanation and explanation != "No corrections needed." return improved_history, gr.Dropdown(choices=improve_choices), gr.Textbox(value=explanation if show_explanation else "", visible=show_explanation) def translate_selected_message(history, selected_index, target_language): if selected_index is None or not history or selected_index >= len(history) or history[selected_index]["role"] != "assistant": return gr.Textbox(visible=False) latin_text = history[selected_index]["content"] translation = bot_instance.translate_latin(latin_text, target_language) return gr.Textbox(value=f"Original: {latin_text}\n\n{target_language}: {translation}", visible=True) improve_btn.click(improve_selected_message, [chatbot, improve_dropdown], [chatbot, improve_dropdown, grammar_explanation]) translate_btn.click(translate_selected_message, [chatbot, translate_dropdown, language_dropdown], [translation_output]) if __name__ == "__main__": # Launch with optimized settings for HF Spaces demo.launch( server_port=7860, # Standard HF Spaces port share=False, show_error=True, quiet=False # Show startup logs )