Ken commited on
Commit
163b430
Β·
1 Parent(s): 31f1596

feat: add app

Browse files
Files changed (5) hide show
  1. .dockerignore +32 -0
  2. .gitignore +3 -0
  3. README.md +10 -7
  4. app.py +403 -0
  5. requirements.txt +9 -0
.dockerignore ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore unnecessary files to reduce build time
2
+
3
+ __pycache__/
4
+ *.pyc
5
+ *.pyo
6
+ *.pyd
7
+ .Python
8
+ env/
9
+ pip-log.txt
10
+ pip-delete-this-directory.txt
11
+ .tox
12
+ .coverage
13
+ .coverage.*
14
+ .cache
15
+ nosetests.xml
16
+ coverage.xml
17
+ *.cover
18
+ *.log
19
+ .git
20
+ .mypy_cache
21
+ .pytest_cache
22
+ .hypothesis
23
+
24
+ # Local development files
25
+
26
+ .env
27
+ .venv/
28
+ venv/
29
+ ENV/
30
+ env/
31
+ .DS_Store
32
+ *.local
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ *.pyc
2
+ __pycache__/
3
+ .env
README.md CHANGED
@@ -1,14 +1,17 @@
1
  ---
2
- title: Latin Conversation Bot
3
- emoji: 🐨
4
- colorFrom: yellow
5
- colorTo: indigo
6
  sdk: gradio
7
- sdk_version: 5.47.2
8
  app_file: app.py
9
  pinned: false
10
  license: cc-by-4.0
11
- short_description: A Latin audio conversation bot
12
  ---
13
 
14
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
1
  ---
2
+ title: Latin Audio Chat Bot
3
+ emoji: πŸ›οΈ
4
+ colorFrom: purple
5
+ colorTo: pink
6
  sdk: gradio
 
7
  app_file: app.py
8
  pinned: false
9
  license: cc-by-4.0
10
+ short_description: Latin audio chat bot
11
  ---
12
 
13
+ # πŸ›οΈ Latin Audio Chat Bot
14
+
15
+ An app that lets users chat in Latin using text or audio input. It uses **Gemini Flash** for natural language processing, **ken-z/latin_whisper-small** for speech-to-text conversion, and **ken-z/latin_speecht5** for text-to-speech synthesis.
16
+
17
+ ---
app.py ADDED
@@ -0,0 +1,403 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import time
3
+ import torch
4
+ import os
5
+ import gc
6
+ import psutil
7
+ from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq, VitsModel, VitsTokenizer
8
+ import soundfile as sf
9
+ import librosa
10
+ import tempfile
11
+ import google.generativeai as genai
12
+ from dotenv import load_dotenv
13
+
14
# Try to load .env file as fallback (for local development).
# HF Spaces will use secrets directly, so this won't override them.
load_dotenv()

# Set environment variables for optimization.
# NOTE(review): these must be set before the models are loaded for the cache
# locations to take effect; TRANSFORMERS_CACHE is deprecated in favour of
# HF_HOME in newer transformers releases — confirm against the pinned version.
os.environ["TOKENIZERS_PARALLELISM"] = "false"  # Avoid tokenizer fork warnings
os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"  # Use tmp for HF Spaces
os.environ["HF_HOME"] = "/tmp/huggingface"  # Cache location
22
+
23
def get_memory_usage():
    """Return this process's resident set size (RSS) in megabytes."""
    rss_bytes = psutil.Process(os.getpid()).memory_info().rss
    return rss_bytes / (1024 * 1024)
27
+
28
def log_memory(context=""):
    """Print the current RSS memory usage, tagged with *context*."""
    print(f"Memory usage {context}: {get_memory_usage():.1f} MB")
32
+
33
class LatinConversationBot:
    """Latin conversation assistant for the Gradio app.

    Combines three external components:
      * Gemini (``google.generativeai``) for generating Latin replies,
        grammar correction, and translation;
      * a fine-tuned Whisper checkpoint for speech-to-text;
      * a VITS-style checkpoint for text-to-speech.

    Models are pre-loaded at construction when possible; on failure they
    fall back to on-demand loading, and ``_cleanup_models`` can drop them
    again to bound memory on small hosts.
    """

    def __init__(self):
        log_memory("at initialization start")

        # Force CPU-only to reduce memory usage on Hugging Face Spaces.
        self.device = "cpu"
        # message_id -> synthesized wav path / reply text; consumed by the
        # UI's "replay" dropdown.
        self.message_audio = {}
        self.message_texts = {}

        # Initialize Gemini using HF Spaces secret or .env fallback.
        api_key = os.getenv("GEMINI_API_KEY")
        if not api_key:
            # More helpful error message for both HF Spaces and local dev.
            raise ValueError(
                "GEMINI_API_KEY not found!\n"
                "For Hugging Face Spaces:\n"
                " 1. Go to your Space settings\n"
                " 2. Click on 'Repository secrets'\n"
                " 3. Add 'GEMINI_API_KEY' with your API key\n"
                "For Local Development:\n"
                " 1. Create a .env file in the project root\n"
                " 2. Add: GEMINI_API_KEY=your_api_key_here"
            )
        genai.configure(api_key=api_key)
        self.gemini_model = genai.GenerativeModel('gemini-flash-latest')

        # Model containers; populated by _preload_models or the
        # _ensure_*_loaded lazy loaders.
        self.asr_processor = None
        self.asr_model = None
        self.tts_model = None
        self.tts_tokenizer = None
        self.models_loaded = {"asr": False, "tts": False}

        print(f"Bot initialized on device: {self.device}")

        # Pre-load models at startup for faster first response; failures are
        # non-fatal and degrade to on-demand loading.
        try:
            print("🚀 Starting model pre-loading...")
            self._preload_models()
            print("✅ All models loaded successfully!")
        except Exception as e:
            print(f"⚠️ Model pre-loading failed: {e}")
            print("Models will be loaded on-demand")

        log_memory("after initialization")

    def _preload_models(self):
        """Pre-load ASR and TTS models at startup; raise on failure.

        On any error the loaded-flags are reset so the _ensure_*_loaded
        helpers retry lazily later.
        """
        try:
            # Load ASR first with optimizations.
            print("📥 Loading ASR models...")
            self.asr_processor = AutoProcessor.from_pretrained(
                "ken-z/latin_whisper-small",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["asr"] = True
            log_memory("after ASR loading")

            # Load TTS with optimizations.
            # NOTE(review): VitsModel/VitsTokenizer are loaded from a repo
            # named "latin_SpeechT5" — confirm the checkpoint really is a
            # VITS architecture, not SpeechT5.
            print("🎵 Loading TTS models...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                cache_dir="/tmp/transformers_cache",
                local_files_only=False
            )
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32,
                cache_dir="/tmp/transformers_cache",
                low_cpu_mem_usage=True,  # Optimize memory usage
                local_files_only=False
            ).to(self.device)
            self.models_loaded["tts"] = True
            log_memory("after TTS loading")

        except Exception as e:
            print(f"Error in model loading: {e}")
            # Fallback to lazy loading.
            self.models_loaded = {"asr": False, "tts": False}
            raise e

    def _ensure_asr_loaded(self):
        """Load the ASR processor/model on-demand if not already loaded.

        NOTE(review): this path omits the cache_dir used by _preload_models,
        so on-demand downloads land in the default cache — verify intended.
        """
        if not self.models_loaded["asr"]:
            print("Loading ASR models on-demand...")
            self.asr_processor = AutoProcessor.from_pretrained("ken-z/latin_whisper-small")
            self.asr_model = AutoModelForSpeechSeq2Seq.from_pretrained(
                "ken-z/latin_whisper-small",
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["asr"] = True

    def _ensure_tts_loaded(self):
        """Load the TTS tokenizer/model on-demand if not already loaded."""
        if not self.models_loaded["tts"]:
            print("Loading TTS models on-demand...")
            self.tts_tokenizer = VitsTokenizer.from_pretrained("Ken-Z/latin_SpeechT5")
            self.tts_model = VitsModel.from_pretrained(
                "Ken-Z/latin_SpeechT5",
                torch_dtype=torch.float32
            ).to(self.device)
            self.models_loaded["tts"] = True

    def _cleanup_models(self):
        """Drop all loaded models and force a GC pass to free memory.

        Models are reloaded lazily by the _ensure_*_loaded helpers on next use.
        """
        log_memory("before cleanup")
        if self.asr_model is not None:
            del self.asr_model
            self.asr_model = None
            self.models_loaded["asr"] = False
        if self.asr_processor is not None:
            del self.asr_processor
            self.asr_processor = None
        if self.tts_model is not None:
            del self.tts_model
            self.tts_model = None
            self.models_loaded["tts"] = False
        if self.tts_tokenizer is not None:
            del self.tts_tokenizer
            self.tts_tokenizer = None
        gc.collect()
        log_memory("after cleanup")
        print("Models cleaned up from memory")

    def transcribe_audio(self, audio_path):
        """Transcribe the audio file at *audio_path* to text.

        Returns the decoded transcription, or an "Error: ..." string on
        failure (callers display the returned string verbatim in the chat).
        """
        try:
            # Ensure ASR models are loaded.
            self._ensure_asr_loaded()

            # Resample to 16 kHz, the rate the Whisper processor is fed below.
            audio, _ = librosa.load(audio_path, sr=16000)
            input_features = self.asr_processor(audio, sampling_rate=16000, return_tensors="pt").input_features.to(self.device)
            with torch.no_grad():
                predicted_ids = self.asr_model.generate(input_features)
            result = self.asr_processor.batch_decode(predicted_ids, skip_special_tokens=True)[0].strip()

            # Clean up tensors but keep models loaded.
            del input_features, predicted_ids
            gc.collect()

            return result
        except Exception as e:
            print(f"ASR Error: {str(e)}")
            return f"Error: {str(e)}"

    def _call_gemini(self, prompt):
        """Send *prompt* to Gemini and return its stripped text.

        Returns a fixed error string (never raises) when the API call fails.
        """
        try:
            return self.gemini_model.generate_content(prompt).text.strip()
        except Exception as e:
            print(f"Gemini API error: {e}")
            return "Error: Gemini API not available"

    def generate_response(self, text):
        """Generate a short Latin conversational reply to the user's *text*."""
        prompt = f"""You are a Latin conversation bot. Respond ONLY in Latin, keep responses to 1-2 sentences, use proper Classical Latin grammar with proper diacritics, and be conversational.

Examples: "Salve" → "Salve! Quid agis hodie?", "Hello" → "Salve! Latine loquere, quaeso!"

User: {text}
Response:"""
        return self._call_gemini(prompt)

    def improve_latin_grammar(self, text):
        """Ask Gemini to correct *text* and return a dict with keys
        ``corrected`` and ``explanation``.

        Falls back to the original text / a placeholder explanation when the
        model's response does not match the expected CORRECTED:/EXPLANATION:
        line format.
        """
        prompt = f"""Fix Latin grammar, diacritics, and word order. Format:
CORRECTED: [corrected text]
EXPLANATION: [brief explanation of fixes only]

Text: {text}"""

        response = self._call_gemini(prompt)

        # Parse response: pick out the two labelled lines.
        corrected = explanation = ""
        for line in response.split('\n'):
            if line.startswith("CORRECTED:"):
                corrected = line[10:].strip()
            elif line.startswith("EXPLANATION:"):
                explanation = line[12:].strip()

        return {
            "corrected": corrected or text,
            "explanation": explanation or "No explanation provided."
        }

    def translate_latin(self, text, target_language):
        """Translate Latin *text* into *target_language* via Gemini."""
        prompt = f"""Translate this Latin text to {target_language}. Return ONLY the translation, no explanations.

Latin text: {text}
{target_language} translation:"""
        return self._call_gemini(prompt)

    def synthesize_speech(self, text):
        """Synthesize *text* to a 16 kHz wav file and return its path.

        Returns None on any failure. The temp file is created with
        delete=False because Gradio reads it after this method returns.
        """
        try:
            # Ensure TTS models are loaded.
            self._ensure_tts_loaded()

            inputs = self.tts_tokenizer(text, return_tensors="pt")
            inputs = {k: v.to(self.device) for k, v in inputs.items()}
            with torch.no_grad():
                speech = self.tts_model(**inputs).waveform.squeeze().cpu().numpy()

            # Clean up tensors but keep models loaded.
            del inputs
            gc.collect()

            with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp_file:
                sf.write(tmp_file.name, speech, samplerate=16000)
                return tmp_file.name
        except Exception as e:
            print(f"TTS error: {e}")
            return None
250
+
251
# NOTE(review): the bot used to be instantiated here AND again just before
# the Gradio UI is built (the "🚀 Initializing Latin Conversation Bot..."
# section below), which constructed the class twice and therefore loaded
# every ASR/TTS model twice — doubling startup time and peak memory.
# The single instantiation before the UI definition is kept.
252
+
253
def add_message(history, message):
    """Append the user's turn (audio transcriptions and/or text) to *history*.

    Audio attachments are transcribed through the bot's ASR pipeline and
    prefixed with a microphone marker. The multimodal input box is returned
    cleared and disabled until the bot reply handler re-enables it.
    """
    audio_exts = ('.wav', '.mp3', '.m4a', '.ogg', '.flac')
    for file_info in message["files"]:
        # Gradio may hand us a file object exposing .path, or a plain string.
        file_path = getattr(file_info, 'path', file_info)
        if file_path.endswith(audio_exts):
            spoken = bot_instance.transcribe_audio(file_path)
            history.append({"role": "user", "content": f"🎤 {spoken}"})

    typed = message["text"]
    if typed and typed.strip():
        history.append({"role": "user", "content": typed})

    return history, gr.MultimodalTextbox(value=None, interactive=False)
264
+
265
def get_dropdown_choices(history):
    """Rebuild the (label, value) option lists for all three dropdowns.

    Returns (replay_choices, improve_choices, translate_choices): replay maps
    cached bot audio by message id; improve/translate map user/assistant
    entries by their index in *history*.
    """
    def _clip(text, limit):
        # Hard truncation with ellipsis, matching the original label format.
        suffix = '...' if len(text) > limit else ''
        return f"{text[:limit]}{suffix}"

    replay_choices = [
        (f"🔊 {_clip(text, 30)}", msg_id)
        for msg_id, text in bot_instance.message_texts.items()
    ]

    improve_choices = []
    translate_choices = []
    for i, msg in enumerate(history):
        if msg["role"] == "user":
            plain = msg['content'].replace('🎤 ', '')
            improve_choices.append((f"Message {i+1}: {_clip(plain, 50)}", i))
        elif msg["role"] == "assistant":
            translate_choices.append((f"Bot {i+1}: {_clip(msg['content'], 50)}", i))

    return replay_choices, improve_choices, translate_choices
274
+
275
def bot(history):
    """Produce the assistant reply for the newest user message.

    Appends the Gemini reply to *history*, synthesizes its audio, caches the
    wav for replay, and returns refreshed choices for all three dropdowns.
    """
    if not history:
        return (history, None, gr.Dropdown(choices=[]),
                gr.Dropdown(choices=[]), gr.Dropdown(choices=[]))

    raw = history[-1]["content"]
    # Strip the microphone marker that add_message prepends to transcriptions.
    user_text = raw.replace("🎤 ", "") if raw.startswith("🎤 ") else raw

    response_text = bot_instance.generate_response(user_text)
    message_id = f"msg_{len(history)}_{int(time.time())}"

    history.append({"role": "assistant", "content": response_text})

    audio_file = bot_instance.synthesize_speech(response_text)
    if audio_file:
        # Cache so the replay dropdown can re-serve this reply's audio.
        bot_instance.message_audio[message_id] = audio_file
        bot_instance.message_texts[message_id] = response_text

    replay, improve, translate = get_dropdown_choices(history)
    return (history, audio_file, gr.Dropdown(choices=replay),
            gr.Dropdown(choices=improve), gr.Dropdown(choices=translate))
294
+
295
def improve_message_grammar(history, message_index):
    """Grammar-correct the user message at *message_index* in place.

    Returns (history, explanation). explanation is "" when the index is out
    of range or does not point at a user message; the chat bubble is only
    rewritten when the corrected text actually differs.
    """
    is_user_message = (history and 0 <= message_index < len(history)
                       and history[message_index]["role"] == "user")
    if not is_user_message:
        return history, ""

    original_text = history[message_index]["content"]
    prefix = "🎤 " if original_text.startswith("🎤 ") else ""
    plain_text = original_text.replace("🎤 ", "")

    result = bot_instance.improve_latin_grammar(plain_text)
    corrected = result["corrected"]
    explanation = result["explanation"]

    # Only rewrite the bubble when Gemini changed something.
    if corrected and corrected != plain_text:
        history[message_index]["content"] = f"{prefix}{corrected} ✨"

    return history, explanation
311
+
312
def clear_all_data():
    """Reset the conversation: drop cached audio/text and unload models.

    Models are reloaded on-demand the next time they are needed.
    """
    bot_instance.message_audio.clear()
    bot_instance.message_texts.clear()
    # Also clean up models to free memory.
    bot_instance._cleanup_models()
    print("All data and models cleared from memory")
    return ([], None, gr.Dropdown(choices=[]),
            gr.Dropdown(choices=[]), gr.Dropdown(choices=[]))
319
+
320
# Initialize the bot instance early (configures Gemini and pre-loads the
# ASR/TTS models) so the first user interaction is fast.
print("🚀 Initializing Latin Conversation Bot...")
bot_instance = LatinConversationBot()
323
+
324
# Build the Gradio UI: chat area, reply audio + replay, grammar improvement,
# and translation controls. `demo` is launched from the __main__ guard below.
with gr.Blocks(title="🏛️ Latin Conversation Bot", theme=gr.themes.Soft()) as demo:
    gr.Markdown("""
    # 🏛️ Latin Conversation Bot
    Speak or type in Latin for AI-powered conversations with speech synthesis and grammar improvement!
    """)


    chatbot = gr.Chatbot(type="messages", height=400, show_label=False)

    chat_input = gr.MultimodalTextbox(
        interactive=True, file_types=["audio"], placeholder="🎤 Record or type in Latin...",
        show_label=False, sources=["microphone", "upload"]
    )

    with gr.Row():
        audio_output = gr.Audio(label="🔊 Bot Response", autoplay=True, scale=2)
        replay_dropdown = gr.Dropdown(label="🔄 Replay Message", choices=[], scale=1)

    with gr.Row():
        improve_dropdown = gr.Dropdown(label="✨ Select Message to Improve", choices=[], scale=2)
        improve_btn = gr.Button("✨ Improve Grammar", size="sm", variant="secondary", scale=1)

    grammar_explanation = gr.Textbox(label="📚 Grammar Explanation", interactive=False, visible=False)

    with gr.Row():
        translate_dropdown = gr.Dropdown(label="🌍 Select Bot Message to Translate", choices=[], scale=2)
        language_dropdown = gr.Dropdown(
            label="Target Language",
            choices=["English", "Spanish", "French", "German", "Italian", "Portuguese", "Chinese", "Japanese"],
            value="English",
            scale=1
        )
        translate_btn = gr.Button("🌍 Translate", size="sm", variant="secondary", scale=1)

    translation_output = gr.Textbox(label="📝 Translation", interactive=False, visible=False)

    clear_btn = gr.Button("🗑️ Clear", size="sm")

    # Event handlers.
    # Submit chain: append user message (input box disabled) -> generate the
    # bot reply and refresh all dropdowns -> re-enable the input box.
    chat_msg = chat_input.submit(add_message, [chatbot, chat_input], [chatbot, chat_input])
    bot_msg = chat_msg.then(bot, chatbot, [chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])
    bot_msg.then(lambda: gr.MultimodalTextbox(interactive=True), None, [chat_input])

    # Replaying a past bot message looks up its cached wav path by id.
    replay_dropdown.change(
        lambda msg_id: bot_instance.message_audio.get(msg_id) if msg_id else None,
        inputs=[replay_dropdown], outputs=[audio_output]
    )

    clear_btn.click(clear_all_data, outputs=[chatbot, audio_output, replay_dropdown, improve_dropdown, translate_dropdown])

    def improve_selected_message(history, selected_index):
        # Run grammar improvement on the selected user message; refresh the
        # improve dropdown and show the explanation box only when there were
        # actual corrections.
        if selected_index is None:
            _, improve_choices, _ = get_dropdown_choices(history)
            return history, gr.Dropdown(choices=improve_choices), gr.Textbox(visible=False)

        improved_history, explanation = improve_message_grammar(history, selected_index)
        _, improve_choices, _ = get_dropdown_choices(improved_history)

        show_explanation = explanation and explanation != "No corrections needed."
        return improved_history, gr.Dropdown(choices=improve_choices), gr.Textbox(value=explanation if show_explanation else "", visible=show_explanation)

    def translate_selected_message(history, selected_index, target_language):
        # Translate the selected assistant message; hide the output box when
        # the selection is missing or not an assistant message.
        if selected_index is None or not history or selected_index >= len(history) or history[selected_index]["role"] != "assistant":
            return gr.Textbox(visible=False)

        latin_text = history[selected_index]["content"]
        translation = bot_instance.translate_latin(latin_text, target_language)
        return gr.Textbox(value=f"Original: {latin_text}\n\n{target_language}: {translation}", visible=True)

    improve_btn.click(improve_selected_message, [chatbot, improve_dropdown], [chatbot, improve_dropdown, grammar_explanation])
    translate_btn.click(translate_selected_message, [chatbot, translate_dropdown, language_dropdown], [translation_output])
395
+
396
if __name__ == "__main__":
    # Launch with settings suited to containerized deployment (HF Spaces /
    # Docker — a .dockerignore ships with this app).
    demo.launch(
        server_name="0.0.0.0",  # bind all interfaces; the Gradio default
                                # (127.0.0.1) is unreachable from outside
                                # a Docker container
        server_port=7860,  # Standard HF Spaces port
        share=False,
        show_error=True,
        quiet=False  # Show startup logs
    )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ transformers>=4.33.0
3
+ torch>=1.9.0
4
+ torchaudio>=0.9.0
5
+ librosa
6
+ soundfile
7
+ google-generativeai
8
+ python-dotenv
9
+ psutil