Ken commited on
Commit
163b430
Β·
1 Parent(s): 31f1596

feat: add app

Browse files
Files changed (5) hide show
  1. .dockerignore +32 -0
  2. .gitignore +3 -0
  3. README.md +10 -7
  4. app.py +403 -0
  5. requirements.txt +9 -0
.dockerignore ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore unnecessary files to reduce build time
2
+
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .Python
8
+ env/
9
+ pip-log.txt
10
+ pip-delete-this-directory.txt
11
+ .tox
12
+ .coverage
13
+ .coverage.*
14
+ .cache
15
+ nosetests.xml
16
+ coverage.xml
17
+ *.cover
18
+ *.log
19
+ .git
20
+ .mypy_cache
21
+ .pytest_cache
22
+ .hypothesis
23
+
24
+ # Local development files
25
+
26
+ .env
27
+ .venv/
28
+ venv/
29
+ ENV/
30
+ env/
31
+ .DS_Store
32
+ *.local
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pyc
2
+ __pycache__/
3
+ .env
README.md CHANGED
@@ -1,14 +1,17 @@
1
  ---
2
- title: Latin Conversation Bot
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.47.2
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-4.0
11
- short_description: A Latin audio conversation bot
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
+ title: Latin Audio Chat Bot
3
+ emoji: πŸ›οΈ
4
+ colorFrom: purple
5
+ colorTo: pink
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
  license: cc-by-4.0
10
+ short_description: Latin audio chat bot
11
  ---
12
 
13
+ # πŸ›οΈ Latin Audio Chat Bot
14
+
15
+ An app that lets users chat in Latin using text or audio input. It uses **Gemini Flash** for natural language processing, **ken-z/latin_whisper-small** for speech-to-text conversion, and **ken-z/latin_speecht5** for text-to-speech synthesis.
16
+
17
+ ---
app.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ import torch
4
+ import os
5
+ import gc
6
+ import psutil
7
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, VitsModel, VitsTokenizer
8
+ import soundfile as sf
9
+ import librosa
10
+ import tempfile
11
+ import google.generativeai as genai
12
+ from dotenv import load_dotenv
13
+
14
# Try to load .env file as fallback (for local development).
# HF Spaces will use secrets directly, so this won't override them.
load_dotenv()

# Set environment variables for optimization.
# NOTE(review): these must be set before the models are loaded for the cache
# locations to take effect; TRANSFORMERS_CACHE is deprecated in favour of
# HF_HOME in newer transformers releases — confirm against the pinned version.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid tokenizer fork warnings
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"  # Use tmp for HF Spaces
os.environ["HF_HOME"] = "/tmp/huggingface"  # Cache location
22
+
23
def get_memory_usage():
    """Return this process's resident set size (RSS) in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
27
+
28
def log_memory(context=""):
    """Print the current RSS memory usage, tagged with *context*."""
    print(f"Memory usage {context}: {get_memory_usage():.1f} MB")
32
+
33
class LatinConversationBot:
    """Latin conversation assistant for the Gradio app.

    Combines three external components:
      * Gemini (``google.generativeai``) for generating Latin replies,
        grammar correction, and translation;
      * a fine-tuned Whisper checkpoint for speech-to-text;
      * a VITS-style checkpoint for text-to-speech.

    Models are pre-loaded at construction when possible; on failure they
    fall back to on-demand loading, and ``_cleanup_models`` can drop them
    again to bound memory on small hosts.
    """

    def __init__(self):
        log_memory("at initialization start")

        # Force CPU-only to reduce memory usage on Hugging Face Spaces.
        self.device = "cpu"
        # message_id -> synthesized wav path / reply text; consumed by the
        # UI's "replay" dropdown.
        self.message_audio = {}
        self.message_texts = {}

        # Initialize Gemini using HF Spaces secret or .env fallback.
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            # More helpful error message for both HF Spaces and local dev.
            raise ValueError(
                "GEMINI_API_KEY not found!\n"
                "For Hugging Face Spaces:\n"
                " 1. Go to your Space settings\n"
                " 2. Click on 'Repository secrets'\n"
                " 3. Add 'GEMINI_API_KEY' with your API key\n"
                "For Local Development:\n"
                " 1. Create a .env file in the project root\n"
                " 2. Add: GEMINI_API_KEY=your_api_key_here"
            )
        genai.configure(api_key=api_key)
        self.gemini_model = genai.GenerativeModel('gemini-flash-latest')

        # Model containers; populated by _preload_models or the
        # _ensure_*_loaded lazy loaders.
        self.asr_processor = None
        self.asr_model = None
        self.tts_model = None
        self.tts_tokenizer = None
        self.models_loaded = {"asr": False, "tts": False}

        print(f"Bot initialized on device: {self.device}")

        # Pre-load models at startup for faster first response; failures are
        # non-fatal and degrade to on-demand loading.
        try:
            print("🚀 Starting model pre-loading...")
            self._preload_models()
            print("✅ All models loaded successfully!")
        except Exception as e:
            print(f"⚠️ Model pre-loading failed: {e}")
            print("Models will be loaded on-demand")

        log_memory("after initialization")

    def _preload_models(self):
        """Pre-load ASR and TTS models at startup; raise on failure.

        On any error the loaded-flags are reset so the _ensure_*_loaded
        helpers retry lazily later.
        """
        try:
            # Load ASR first with optimizations.
            print("📥 Loading ASR models...")
            self.asr_processor = AutoProcessor.from_pretrained(
                "ken-z/latin_whisper-small",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["asr"] = True
            log_memory("after ASR loading")

            # Load TTS with optimizations.
            # NOTE(review): VitsModel/VitsTokenizer are loaded from a repo
            # named "latin_SpeechT5" — confirm the checkpoint really is a
            # VITS architecture, not SpeechT5.
            print("🎵 Loading TTS models...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["tts"] = True
            log_memory("after TTS loading")

        except Exception as e:
            print(f"Error in model loading: {e}")
            # Fallback to lazy loading.
            self.models_loaded = {"asr": False, "tts": False}
            raise e

    def _ensure_asr_loaded(self):
        """Load the ASR processor/model on-demand if not already loaded.

        NOTE(review): this path omits the cache_dir used by _preload_models,
        so on-demand downloads land in the default cache — verify intended.
        """
        if not self.models_loaded["asr"]:
            print("Loading ASR models on-demand...")
            self.asr_processor = AutoProcessor.from_pretrained("ken-z/latin_whisper-small")
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small",
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["asr"] = True

    def _ensure_tts_loaded(self):
        """Load the TTS tokenizer/model on-demand if not already loaded."""
        if not self.models_loaded["tts"]:
            print("Loading TTS models on-demand...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained("Ken-Z/latin_SpeechT5")
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["tts"] = True

    def _cleanup_models(self):
        """Drop all loaded models and force a GC pass to free memory.

        Models are reloaded lazily by the _ensure_*_loaded helpers on next use.
        """
        log_memory("before cleanup")
        if self.asr_model is not None:
            del self.asr_model
            self.asr_model = None
            self.models_loaded["asr"] = False
        if self.asr_processor is not None:
            del self.asr_processor
            self.asr_processor = None
        if self.tts_model is not None:
            del self.tts_model
            self.tts_model = None
            self.models_loaded["tts"] = False
        if self.tts_tokenizer is not None:
            del self.tts_tokenizer
            self.tts_tokenizer = None
        gc.collect()
        log_memory("after cleanup")
        print("Models cleaned up from memory")

    def transcribe_audio(self, audio_path):
        """Transcribe the audio file at *audio_path* to text.

        Returns the decoded transcription, or an "Error: ..." string on
        failure (callers display the returned string verbatim in the chat).
        """
        try:
            # Ensure ASR models are loaded.
            self._ensure_asr_loaded()

            # Resample to 16 kHz, the rate the Whisper processor is fed below.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(self.device)
            with torch.no_grad():
                predicted_ids = self.asr_model.generate(input_features)
            result = self.asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

            # Clean up tensors but keep models loaded.
            del input_features, predicted_ids
            gc.collect()

            return result
        except Exception as e:
            print(f"ASR Error: {str(e)}")
            return f"Error: {str(e)}"

    def _call_gemini(self, prompt):
        """Send *prompt* to Gemini and return its stripped text.

        Returns a fixed error string (never raises) when the API call fails.
        """
        try:
            return self.gemini_model.generate_content(prompt).text.strip()
        except Exception as e:
            print(f"Gemini API error: {e}")
            return "Error: Gemini API not available"

    def generate_response(self, text):
        """Generate a short Latin conversational reply to the user's *text*."""
        prompt = f"""You are a Latin conversation bot. Respond ONLY in Latin, keep responses to 1-2 sentences, use proper Classical Latin grammar with proper diacritics, and be conversational.

Examples: "Salve" → "Salve! Quid agis hodie?", "Hello" → "Salve! Latine loquere, quaeso!"

User: {text}
Response:"""
        return self._call_gemini(prompt)

    def improve_latin_grammar(self, text):
        """Ask Gemini to correct *text* and return a dict with keys
        ``corrected`` and ``explanation``.

        Falls back to the original text / a placeholder explanation when the
        model's response does not match the expected CORRECTED:/EXPLANATION:
        line format.
        """
        prompt = f"""Fix Latin grammar, diacritics, and word order. Format:
CORRECTED: [corrected text]
EXPLANATION: [brief explanation of fixes only]

Text: {text}"""

        response = self._call_gemini(prompt)

        # Parse response: pick out the two labelled lines.
        corrected = explanation = ""
        for line in response.split('\n'):
            if line.startswith("CORRECTED:"):
                corrected = line[10:].strip()
            elif line.startswith("EXPLANATION:"):
                explanation = line[12:].strip()

        return {
            "corrected": corrected or text,
            "explanation": explanation or "No explanation provided."
        }

    def translate_latin(self, text, target_language):
        """Translate Latin *text* into *target_language* via Gemini."""
        prompt = f"""Translate this Latin text to {target_language}. Return ONLY the translation, no explanations.

Latin text: {text}
{target_language} translation:"""
        return self._call_gemini(prompt)

    def synthesize_speech(self, text):
        """Synthesize *text* to a 16 kHz wav file and return its path.

        Returns None on any failure. The temp file is created with
        delete=False because Gradio reads it after this method returns.
        """
        try:
            # Ensure TTS models are loaded.
            self._ensure_tts_loaded()

            inputs = self.tts_tokenizer(text, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                speech = self.tts_model(**inputs).waveform.squeeze().cpu().numpy()

            # Clean up tensors but keep models loaded.
            del inputs
            gc.collect()

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                sf.write(tmp_file.name, speech, samplerate=16000)
                return tmp_file.name
        except Exception as e:
            print(f"TTS error: {e}")
            return None
250
+
251
# NOTE(review): the bot used to be instantiated here AND again just before
# the Gradio UI is built (the "🚀 Initializing Latin Conversation Bot..."
# section below), which constructed the class twice and therefore loaded
# every ASR/TTS model twice — doubling startup time and peak memory.
# The single instantiation before the UI definition is kept.
252
+
253
def add_message(history, message):
    """Append the user's turn (audio transcriptions and/or text) to *history*.

    Audio attachments are transcribed through the bot's ASR pipeline and
    prefixed with a microphone marker. The multimodal input box is returned
    cleared and disabled until the bot reply handler re-enables it.
    """
    audio_exts = ('.wav', '.mp3', '.m4a', '.ogg', '.flac')
    for file_info in message["files"]:
        # Gradio may hand us a file object exposing .path, or a plain string.
        file_path = getattr(file_info, 'path', file_info)
        if file_path.endswith(audio_exts):
            spoken = bot_instance.transcribe_audio(file_path)
            history.append({"role": "user", "content": f"🎤 {spoken}"})

    typed = message["text"]
    if typed and typed.strip():
        history.append({"role": "user", "content": typed})

    return history, gr.MultimodalTextbox(value=None, interactive=False)
264
+
265
def get_dropdown_choices(history):
    """Rebuild the (label, value) option lists for all three dropdowns.

    Returns (replay_choices, improve_choices, translate_choices): replay maps
    cached bot audio by message id; improve/translate map user/assistant
    entries by their index in *history*.
    """
    def _clip(text, limit):
        # Hard truncation with ellipsis, matching the original label format.
        suffix = '...' if len(text) > limit else ''
        return f"{text[:limit]}{suffix}"

    replay_choices = [
        (f"🔊 {_clip(text, 30)}", msg_id)
        for msg_id, text in bot_instance.message_texts.items()
    ]

    improve_choices = []
    translate_choices = []
    for i, msg in enumerate(history):
        if msg["role"] == "user":
            plain = msg['content'].replace('🎤 ', '')
            improve_choices.append((f"Message {i+1}: {_clip(plain, 50)}", i))
        elif msg["role"] == "assistant":
            translate_choices.append((f"Bot {i+1}: {_clip(msg['content'], 50)}", i))

    return replay_choices, improve_choices, translate_choices
274
+
275
def bot(history):
    """Produce the assistant reply for the newest user message.

    Appends the Gemini reply to *history*, synthesizes its audio, caches the
    wav for replay, and returns refreshed choices for all three dropdowns.
    """
    if not history:
        return (history, None, gr.Dropdown(choices=[]),
                gr.Dropdown(choices=[]), gr.Dropdown(choices=[]))

    raw = history[-1]["content"]
    # Strip the microphone marker that add_message prepends to transcriptions.
    user_text = raw.replace("🎤 ", "") if raw.startswith("🎤 ") else raw

    response_text = bot_instance.generate_response(user_text)
    message_id = f"msg_{len(history)}_{int(time.time())}"

    history.append({"role": "assistant", "content": response_text})

    audio_file = bot_instance.synthesize_speech(response_text)
    if audio_file:
        # Cache so the replay dropdown can re-serve this reply's audio.
        bot_instance.message_audio[message_id] = audio_file
        bot_instance.message_texts[message_id] = response_text

    replay, improve, translate = get_dropdown_choices(history)
    return (history, audio_file, gr.Dropdown(choices=replay),
            gr.Dropdown(choices=improve), gr.Dropdown(choices=translate))
294
+
295
def improve_message_grammar(history, message_index):
    """Grammar-correct the user message at *message_index* in place.

    Returns (history, explanation). explanation is "" when the index is out
    of range or does not point at a user message; the chat bubble is only
    rewritten when the corrected text actually differs.
    """
    is_user_message = (history and 0 <= message_index < len(history)
                       and history[message_index]["role"] == "user")
    if not is_user_message:
        return history, ""

    original_text = history[message_index]["content"]
    prefix = "🎤 " if original_text.startswith("🎤 ") else ""
    plain_text = original_text.replace("🎤 ", "")

    result = bot_instance.improve_latin_grammar(plain_text)
    corrected = result["corrected"]
    explanation = result["explanation"]

    # Only rewrite the bubble when Gemini changed something.
    if corrected and corrected != plain_text:
        history[message_index]["content"] = f"{prefix}{corrected} ✨"

    return history, explanation
311
+
312
def clear_all_data():
    """Reset the conversation: drop cached audio/text and unload models.

    Models are reloaded on-demand the next time they are needed.
    """
    bot_instance.message_audio.clear()
    bot_instance.message_texts.clear()
    # Also clean up models to free memory.
    bot_instance._cleanup_models()
    print("All data and models cleared from memory")
    return ([], None, gr.Dropdown(choices=[]),
            gr.Dropdown(choices=[]), gr.Dropdown(choices=[]))
319
+
320
# Initialize the bot instance early (configures Gemini and pre-loads the
# ASR/TTS models) so the first user interaction is fast.
print("🚀 Initializing Latin Conversation Bot...")
bot_instance = LatinConversationBot()
323
+
324
# Build the Gradio UI: chat area, reply audio + replay, grammar improvement,
# and translation controls. `demo` is launched from the __main__ guard below.
with gr.Blocks(title="🏛️ Latin Conversation Bot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏛️ Latin Conversation Bot
    Speak or type in Latin for AI-powered conversations with speech synthesis and grammar improvement!
    """)


    chatbot = gr.Chatbot(type="messages", height=400, show_label=False)

    chat_input = gr.MultimodalTextbox(
        interactive=True, file_types=["audio"], placeholder="🎤 Record or type in Latin...",
        show_label=False, sources=["microphone", "upload"]
    )

    with gr.Row():
        audio_output = gr.Audio(label="🔊 Bot Response", autoplay=True, scale=2)
        replay_dropdown = gr.Dropdown(label="🔄 Replay Message", choices=[], scale=1)

    with gr.Row():
        improve_dropdown = gr.Dropdown(label="✨ Select Message to Improve", choices=[], scale=2)
        improve_btn = gr.Button("✨ Improve Grammar", size="sm", variant="secondary", scale=1)

    grammar_explanation = gr.Textbox(label="📚 Grammar Explanation", interactive=False, visible=False)

    with gr.Row():
        translate_dropdown = gr.Dropdown(label="🌍 Select Bot Message to Translate", choices=[], scale=2)
        language_dropdown = gr.Dropdown(
            label="Target Language",
            choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Chinese", "Japanese"],
            value="English",
            scale=1
        )
        translate_btn = gr.Button("🌍 Translate", size="sm", variant="secondary", scale=1)

    translation_output = gr.Textbox(label="📝 Translation", interactive=False, visible=False)

    clear_btn = gr.Button("🗑️ Clear", size="sm")

    # Event handlers.
    # Submit chain: append user message (input box disabled) -> generate the
    # bot reply and refresh all dropdowns -> re-enable the input box.
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, [chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

    # Replaying a past bot message looks up its cached wav path by id.
    replay_dropdown.change(
        lambda msg_id: bot_instance.message_audio.get(msg_id) if msg_id else None,
        inputs=[replay_dropdown], outputs=[audio_output]
    )

    clear_btn.click(clear_all_data, outputs=[chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])

    def improve_selected_message(history, selected_index):
        # Run grammar improvement on the selected user message; refresh the
        # improve dropdown and show the explanation box only when there were
        # actual corrections.
        if selected_index is None:
            _, improve_choices, _ = get_dropdown_choices(history)
            return history, gr.Dropdown(choices=improve_choices), gr.Textbox(visible=False)

        improved_history, explanation = improve_message_grammar(history, selected_index)
        _, improve_choices, _ = get_dropdown_choices(improved_history)

        show_explanation = explanation and explanation != "No corrections needed."
        return improved_history, gr.Dropdown(choices=improve_choices), gr.Textbox(value=explanation if show_explanation else "", visible=show_explanation)

    def translate_selected_message(history, selected_index, target_language):
        # Translate the selected assistant message; hide the output box when
        # the selection is missing or not an assistant message.
        if selected_index is None or not history or selected_index >= len(history) or history[selected_index]["role"] != "assistant":
            return gr.Textbox(visible=False)

        latin_text = history[selected_index]["content"]
        translation = bot_instance.translate_latin(latin_text, target_language)
        return gr.Textbox(value=f"Original: {latin_text}\n\n{target_language}: {translation}", visible=True)

    improve_btn.click(improve_selected_message, [chatbot, improve_dropdown], [chatbot, improve_dropdown, grammar_explanation])
    translate_btn.click(translate_selected_message, [chatbot, translate_dropdown, language_dropdown], [translation_output])
395
+
396
if __name__ == "__main__":
    # Launch with settings suited to containerized deployment (HF Spaces /
    # Docker — a .dockerignore ships with this app).
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces; the Gradio default
                                # (127.0.0.1) is unreachable from outside
                                # a Docker container
        server_port=7860,  # Standard HF Spaces port
        share=False,
        show_error=True,
        quiet=False  # Show startup logs
    )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.33.0
3
+ torch>=1.9.0
4
+ torchaudio>=0.9.0
5
+ librosa
6
+ soundfile
7
+ google-generativeai
8
+ python-dotenv
9
+ psutil