import base64
import json
import logging
import tempfile
import uuid
from pathlib import Path

import gradio as gr
import requests

# ── CONFIG ────────────────────────────────────────────────────────────────────
BUILD_PERSONA_URL = "https://sheikhmdrakib-career--build-persona.modal.run"
CHAT_URL = "https://sheikhmdrakib-career--chat.modal.run"
TRANSCRIBE_URL = "https://sheikhmdrakib-career--transcribe.modal.run"
VISION_URL = "https://sheikhmdrakib-career--describe-photo.modal.run"
OCR_URL = "https://sheikhmdrakib-career--ocr-document.modal.run"
TTS_URL = "https://sheikhmdrakib-career--text-to-speech.modal.run"
LIST_PERSONAS_URL = "https://sheikhmdrakib-career--list-personas.modal.run"

# Per-request timeouts, in seconds.
REQUEST_TIMEOUT = 180
BUILD_TIMEOUT = 1200
LIST_TIMEOUT = 90

# Voice description used when the chat endpoint does not supply one.
DEFAULT_VOICE = "warm elderly voice"
# ─────────────────────────────────────────────────────────────────────────────

logger = logging.getLogger(__name__)


class _ServiceError(Exception):
    """A backend call failed.

    ``status`` is the HTTP status code, or ``None`` when the failure happened
    before/after the HTTP exchange (transport error, undecodable body).
    ``detail`` is the response body or the underlying error text.
    """

    def __init__(self, detail, status=None):
        self.detail = detail
        self.status = status
        super().__init__(detail if status is None else f"HTTP {status}: {detail}")


def _fetch_json(url, payload=None, timeout=REQUEST_TIMEOUT):
    """Call *url* and return the decoded JSON object.

    Sends a POST with *payload* as the JSON body, or a GET when *payload* is
    ``None``.

    Raises:
        _ServiceError: on a transport error, a non-200 status, a body that is
            not valid JSON, or JSON that is not an object.
    """
    try:
        if payload is None:
            response = requests.get(url, timeout=timeout)
        else:
            response = requests.post(url, json=payload, timeout=timeout)
    except requests.RequestException as exc:
        raise _ServiceError(str(exc)) from exc

    if response.status_code != 200:
        raise _ServiceError(response.text, status=response.status_code)

    try:
        data = response.json()
    except ValueError as exc:
        raise _ServiceError("response was not valid JSON") from exc
    if not isinstance(data, dict):
        raise _ServiceError("unexpected response shape")
    return data


def _failure(label, err, show_body=False):
    """Format one ❌ status-log line for a failed step.

    *show_body* appends the HTTP response body after the status code; the
    per-file steps leave it off to keep the log short.
    """
    status = getattr(err, "status", None)
    if status is None:
        return f"❌ {label} failed: {getattr(err, 'detail', err)}"
    if show_body:
        return f"❌ {label} failed (HTTP {status}): {err.detail}"
    return f"❌ {label} failed (HTTP {status})."


def encode_file(path):
    """Return the contents of the file at *path* as a base64 ASCII string."""
    with open(path, "rb") as f:
        return base64.b64encode(f.read()).decode()


def _collect_from_images(files, url, result_key, item_label, action,
                         success_phrase, status_log):
    """Send each image in *files* to *url* and return the non-empty results.

    One line is appended to *status_log* per failed or empty file, plus one
    summary line when at least one file succeeded.
    """
    results = []
    for index, path in enumerate(files, start=1):
        try:
            data = _fetch_json(url, {"image_b64": encode_file(path)})
        except (_ServiceError, OSError) as exc:
            status_log.append(_failure(f"{item_label} {index} {action}", exc))
            continue
        value = data.get(result_key, "")
        if value:
            results.append(value)
        else:
            status_log.append(
                f"⚠️ {item_label} {index} processed, but nothing was returned."
            )
    if results:
        status_log.append(f"✅ {len(results)}/{len(files)} {success_phrase}")
    return results


def _format_summary(name, persona_id, persona):
    """Build the Markdown success message shown after a persona is saved."""
    traits = persona.get("personality_traits") or []
    if not isinstance(traits, list):
        traits = []
    memories = persona.get("key_memories") or []
    # Blank lines between entries so Markdown renders each on its own line.
    return "\n\n".join([
        f"\n🎉 **{name}'s memory has been successfully preserved!**",
        f"**Persona ID:** `{persona_id}`",
        f"**Personality:** {', '.join(str(t) for t in traits[:3])}",
        f"**Language:** {persona.get('language', 'Auto')}",
        f"**Memories captured:** {len(memories)}",
        "Go to the **💬 Talk** tab and enter the Persona ID.",
    ])


def build_persona(name, relationship, text_input, photo_captions, voice_file,
                  photo_files, scanned_files):
    """Gather every provided input, run it through the backend models, and
    ask the persona builder to create a persona.

    Args:
        name: The person's name (required).
        relationship: The user's relationship to that person.
        text_input: Pasted writings; multiple entries are separated by ``---``.
        photo_captions: Manual photo captions, one per line.
        voice_file: Path to a voice note, or ``None``.
        photo_files: Paths of photos to describe, or ``None``/empty.
        scanned_files: Paths of scanned documents to OCR, or ``None``/empty.

    Returns:
        A 3-tuple ``(status_markdown, persona_id_or_None, gr.update(...))``.
        The status is a step-by-step log of what succeeded and failed; the
        update carries the new persona ID on success and is a no-op otherwise.
    """
    name = (name or "").strip()
    relationship = (relationship or "").strip()
    if not name:
        return "❌ Please enter the person's name.", None, gr.update()

    texts = [t.strip() for t in (text_input or "").split("---") if t.strip()]
    captions = [c.strip() for c in (photo_captions or "").split("\n") if c.strip()]
    voice_transcripts = []
    # Step-by-step log so the user sees exactly what succeeded/failed.
    status_log = []

    if (not texts and not captions and voice_file is None
            and not photo_files and not scanned_files):
        return "❌ Please provide at least one input.", None, gr.update()

    # 1. Transcribe voice note (Cohere ASR)
    if voice_file is not None:
        try:
            data = _fetch_json(TRANSCRIBE_URL, {
                "audio_b64": encode_file(voice_file),
                "filename": Path(voice_file).name,
            })
        except (_ServiceError, OSError) as exc:
            status_log.append(_failure("Voice transcription", exc, show_body=True))
        else:
            transcript = data.get("transcript", "")
            if transcript:
                voice_transcripts.append(transcript)
                status_log.append("✅ Voice note transcribed successfully.")
            else:
                status_log.append("⚠️ Voice note processed, but no text was found.")

    # 2. Describe uploaded photos (MiniCPM-V)
    if photo_files:
        captions.extend(_collect_from_images(
            photo_files, VISION_URL, "description", "Photo", "description",
            "photos described successfully.", status_log,
        ))

    # 3. OCR scanned letters (Nemotron Parse)
    if scanned_files:
        texts.extend(_collect_from_images(
            scanned_files, OCR_URL, "text", "Scan", "OCR",
            "scanned documents read successfully.", status_log,
        ))

    # The builder needs at least some material to work from.
    if not texts and not captions and not voice_transcripts:
        status_log.append(
            "\n❌ **ABORTED:** All AI processing failed, and no manual "
            "text/captions were provided. Cannot build persona."
        )
        return "\n\n".join(status_log), None, gr.update()

    # 4. Build persona (Qwen 32B)
    persona_id = uuid.uuid4().hex[:8]
    try:
        result = _fetch_json(BUILD_PERSONA_URL, {
            "persona_id": persona_id,
            "name": name,
            "relationship": relationship,
            "texts": texts,
            "photo_captions": captions,
            "voice_transcripts": voice_transcripts,
        }, timeout=BUILD_TIMEOUT)
    except _ServiceError as exc:
        status_log.append("\n" + _failure("Persona builder", exc, show_body=True))
    else:
        if result.get("success"):
            persona = result.get("persona")
            if not isinstance(persona, dict):
                # The backend reported success; still show the ID rather than
                # reporting a failure because the summary data is missing.
                persona = {}
            status_log.append(_format_summary(name, persona_id, persona))
            return "\n\n".join(status_log), persona_id, gr.update(value=persona_id)
        status_log.append(f"\n❌ Persona builder failed: {result}")

    # Fallback return if the final step failed.
    return "\n\n".join(status_log), None, gr.update()


def _synthesize_speech(text, voice_description):
    """Return the path of a temporary ``.wav`` with *text* spoken, or ``None``.

    Voice output is optional, so every failure is logged and reported as
    ``None`` instead of being raised. The temporary file is not deleted here.
    """
    try:
        response = requests.post(TTS_URL, json={
            "text": text,
            "voice_description": voice_description,
        }, timeout=REQUEST_TIMEOUT)
        if response.status_code != 200 or not response.content:
            logger.warning("Voice synthesis returned HTTP %s", response.status_code)
            return None
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            f.write(response.content)
            return f.name
    except (requests.RequestException, OSError) as exc:
        logger.warning("Voice synthesis failed: %s", exc)
        return None


def chat_with_persona(persona_id, message, history, language, enable_voice):
    """Send one user message to the persona and return the updated chat.

    Args:
        persona_id: ID of the persona to talk to.
        message: The user's message.
        history: Prior messages as ``{"role": ..., "content": ...}`` dicts,
            or ``None``.
        language: Language choice forwarded to the chat endpoint.
        enable_voice: When true, also synthesize the reply as audio (VoxCPM2).

    Returns:
        ``("", new_history, audio_path_or_None)`` — the empty string clears
        the message box.

    NOTE(review): the UI wiring in this file passes a ``gr.State`` as
    *history* but never lists that state among the event outputs, so this
    function appears to receive an empty history on every call — confirm and
    fix in the wiring.
    """
    history = list(history or [])
    persona_id = (persona_id or "").strip()
    message_text = (message or "").strip()

    if not persona_id:
        notice = {"role": "assistant", "content": "⚠️ Please enter a Persona ID first."}
        return "", history + [notice], None
    if not message_text:
        return "", history, None

    prior = [
        {"role": m["role"], "content": m["content"]}
        for m in history
        if isinstance(m, dict) and "role" in m and "content" in m
    ]
    voice_desc = DEFAULT_VOICE
    reply_ok = True
    try:
        result = _fetch_json(CHAT_URL, {
            "persona_id": persona_id,
            "history": prior,
            "message": message_text,
            "language": language,
        })
    except _ServiceError as exc:
        response_text = f"⚠️ Error: {exc}"
        reply_ok = False
    else:
        response_text = result.get("text", result.get("response", "..."))
        voice_desc = result.get("voice_description", DEFAULT_VOICE)

    history = history + [
        {"role": "user", "content": message_text},
        {"role": "assistant", "content": response_text},
    ]

    # Generate voice response (VoxCPM2) — never read an error message aloud.
    audio_path = None
    if enable_voice and reply_ok:
        audio_path = _synthesize_speech(response_text, voice_desc)

    return "", history, audio_path


def load_personas():
    """Return a Markdown list of saved personas, or a short status message.

    The request is attempted twice; the existing user-facing message
    attributes failures to the backend still waking up.
    """
    for _attempt in range(2):
        try:
            data = _fetch_json(LIST_PERSONAS_URL, timeout=LIST_TIMEOUT)
        except _ServiceError:
            continue
        personas = data.get("personas") or []
        if not personas:
            return "No personas saved yet."
        lines = [
            f"**{p.get('name', 'Unknown')}** ({p.get('relationship', '')})"
            f" — ID: `{p.get('id', '?')}`"
            for p in personas
            if isinstance(p, dict)
        ]
        return "\n\n".join(lines)
    return "⚠️ Modal is waking up, please try again in 30 seconds."


# ── UI ──────────────────────────────────────────────────────────────────────── css = """ @import url('https://fonts.googleapis.com/css2?family=Lora:ital,wght@0,400;0,600;1,400&family=Source+Sans+3:wght@300;400;600&display=swap'); * { box-sizing: border-box; } body, .gradio-container { background: #0e0b08 !important; font-family: 'Source Sans 3', sans-serif !important; color: #e8dcc8 !important; } .gradio-container { max-width: 900px !important; margin: 0 auto !important; } h1, h2, h3 { font-family: 'Lora', serif !important; color: #d4a96a !important; } .header-title { text-align: center; font-family: 'Lora', serif; font-size: 2.4em; color: #d4a96a; margin: 24px 0 4px 0; } .header-sub { text-align: center; color: #8a7560; font-size: 1em; margin-bottom: 28px; font-style: italic; } .divider { border: none; border-top: 1px solid #2a2015; margin: 20px 0; } label { color: #8a7560 !important; font-size: 0.85em !important; letter-spacing: 0.08em !important; text-transform: uppercase !important; } textarea, input[type="text"] { background: #1a1510 !important; border: 1px solid #3a2e1e !important; color: #e8dcc8 !important; border-radius: 6px !important; } .model-badge { display: inline-block; background: #1f1710; border: 1px solid #3a2e1e; border-radius: 4px; padding: 2px 8px; font-size: 0.75em; color: #8a7560; margin: 2px; } """ with gr.Blocks(title="Memory Keeper") as demo: gr.HTML("""
Upload letters, photos, voice notes, or scanned documents. Each is processed by a specialized AI model.
") with gr.Row(): name_input = gr.Textbox(label="Their Name", placeholder="e.g. Dadu, Nana, Abba...") relationship_input = gr.Textbox(label="Your Relationship", placeholder="e.g. Grandfather, Mother...") text_input = gr.Textbox( label="📝 Letters / Diary Entries / Writings", placeholder="Paste their writings here. Separate multiple entries with ---", lines=6, ) with gr.Row(): photo_files = gr.File( label="🖼️ Photos (MiniCPM-V 4.6 will describe them)", file_count="multiple", file_types=["image"], ) scanned_files = gr.File( label="📄 Scanned Letters/Docs (Nemotron Parse OCR)", file_count="multiple", file_types=["image"], ) photo_captions = gr.Textbox( label="🖼️ Manual Photo Captions (optional, one per line)", placeholder="Or describe photos manually here...", lines=3, ) voice_input = gr.Audio( label="🎤 Voice Note (Cohere Transcribe ASR)", type="filepath", sources=["upload", "microphone"], ) build_btn = gr.Button("✨ Preserve Their Memory", variant="primary") build_output = gr.Markdown() persona_id_state = gr.State() persona_id_hidden = gr.Textbox(visible=False) build_btn.click( fn=build_persona, inputs=[name_input, relationship_input, text_input, photo_captions, voice_input, photo_files, scanned_files], outputs=[build_output, persona_id_state, persona_id_hidden], show_progress="full", ) # ── TAB 2: TALK ── with gr.Tab("💬 Talk to Them"): gr.HTML("Enter the Persona ID and start a conversation. Enable voice to hear them speak.
") with gr.Row(): persona_id_input = gr.Textbox(label="Persona ID", placeholder="e.g. a3f9c2b1") language_select = gr.Dropdown( label="Language", choices=["auto", "English", "Bengali", "Hindi", "Chinese", "Japanese", "Korean", "Thai"], value="auto", ) enable_voice = gr.Checkbox(label="🔊 Voice Response (VoxCPM2)", value=False) chatbot = gr.Chatbot(label="", height=420, placeholder="*Their words will appear here...*") with gr.Row(): msg_input = gr.Textbox(label="Your message", placeholder="What would you like to say?", lines=2, scale=4) send_btn = gr.Button("Send →", variant="primary", scale=1) voice_output = gr.Audio(label="🔊 Voice Response", visible=True, autoplay=True) clear_btn = gr.Button("Clear conversation", variant="secondary", size="sm") chat_history = gr.State([]) send_btn.click( fn=chat_with_persona, inputs=[persona_id_input, msg_input, chat_history, language_select, enable_voice], outputs=[msg_input, chatbot, voice_output], ) msg_input.submit( fn=chat_with_persona, inputs=[persona_id_input, msg_input, chat_history, language_select, enable_voice], outputs=[msg_input, chatbot, voice_output], ) clear_btn.click(lambda: ([], []), outputs=[chat_history, chatbot]) # ── TAB 3: SAVED ── with gr.Tab("📁 Saved Memories"): refresh_btn = gr.Button("🔄 Load Saved Memories", variant="secondary") personas_output = gr.Markdown() refresh_btn.click(fn=load_personas, outputs=personas_output) gr.HTML("""Built for Build Small Hackathon · 6 AI Models · Hosted on Modal + Hugging Face
""") if __name__ == "__main__": demo.launch(css=css, share=True)