Spaces:
Runtime error
Runtime error
| import modal | |
| import json | |
| import os | |
| import base64 | |
| from pathlib import Path | |
# Modal application object; named "memory-keeper".
app = modal.App("memory-keeper")

# ── Volumes ───────────────────────────────────────────────────────────────────
# Handle to the "memory-keeper-personas" volume (created if it does not exist).
# build_persona() calls volume.commit() after writing a persona file.
volume = modal.Volume.from_name(
    "memory-keeper-personas",
    create_if_missing=True,
)
# Handle to the "memory-keeper-hf-cache" volume. Not referenced by any code
# visible in this file — presumably mounted as the Hugging Face model cache;
# TODO confirm against the function decorators.
hf_cache_vol = modal.Volume.from_name(
    "memory-keeper-hf-cache",
    create_if_missing=True,
)
# ── MiniCPM Hosted API ────────────────────────────────────────────────────────
# NOTE(review): both endpoints are plain http://, so the bearer token travels
# unencrypted — confirm whether an https endpoint is available.
MINICPM_LLM_URL = "http://35.203.155.71:8001/v1"     # MiniCPM4.1-8B
MINICPM_VISION_URL = "http://35.203.155.71:8003/v1"  # MiniCPM-V-4.6

# SECURITY: the API key used to be committed here as a string literal. It is
# now read from the MINICPM_API_KEY environment variable (on Modal, attach a
# Secret defining it to every function that calls the hosted API). The key that
# was previously in this file should be treated as leaked and rotated.
# A value that already carries the "Bearer " prefix is accepted as well.
_minicpm_api_key = os.environ.get("MINICPM_API_KEY", "").strip()
if _minicpm_api_key.startswith("Bearer "):
    _minicpm_api_key = _minicpm_api_key[len("Bearer "):]

# Kept in "Bearer <key>" form because get_llm_client()/get_vision_client()
# strip that prefix before handing the key to the OpenAI client.
MINICPM_AUTH = f"Bearer {_minicpm_api_key}"

MINICPM_LLM_MODEL = "MiniCPM4.1-8B"
MINICPM_VIS_MODEL = "MiniCPM-V-4.6"
# ── Model IDs (Modal-hosted) ──────────────────────────────────────────────────
# Model identifiers passed to the respective `from_pretrained` / `pipeline`
# calls further down in this file.
ASR_MODEL = "CohereLabs/cohere-transcribe-03-2026"  # used by transcribe_audio
OCR_MODEL = "nvidia/NVIDIA-Nemotron-Parse-v1.2"     # used by ocr_document
TTS_MODEL = "openbmb/VoxCPM2"                       # used by text_to_speech

# Default checkpoint for chat_multilingual.
MULTILINGUAL_LM_FIRE = "CohereLabs/tiny-aya-fire"    # South Asian (Bengali, Hindi, Urdu)
# Not referenced by any code visible in this file.
MULTILINGUAL_LM_WATER = "CohereLabs/tiny-aya-water"  # Asia Pacific

# Not referenced by any visible code — presumably a seconds-per-minute
# multiplier for timeouts; TODO confirm.
MINUTES = 60
| # ── Images ──────────────────────────────────────────────────────────────────── | |
def _slim_image(*packages: str) -> "modal.Image":
    """Return a Debian-slim Python 3.11 image with *packages* pip-installed."""
    return modal.Image.debian_slim(python_version="3.11").pip_install(*packages)


# Image with the OpenAI client, requests, FastAPI and huggingface_hub.
base_image = _slim_image(
    "openai", "requests", "fastapi[standard]", "huggingface_hub",
)

# Image with the speech-recognition stack (torch, transformers, audio I/O).
asr_image = _slim_image(
    "torch", "transformers>=4.50.0", "torchaudio",
    "huggingface_hub", "soundfile", "fastapi[standard]",
)

# Image with the OCR stack (torch, torchvision, transformers, Pillow).
ocr_image = _slim_image(
    "torch", "torchvision", "transformers>=4.50.0",
    "Pillow", "huggingface_hub", "fastapi[standard]",
)

# Image with the text-to-speech stack (torch, soundfile, voxcpm).
tts_image = _slim_image(
    "torch", "soundfile", "huggingface_hub", "fastapi[standard]",
    "voxcpm",
)

# Image with the local causal-LM stack (torch, transformers).
multilingual_image = _slim_image(
    "torch", "transformers>=4.50.0",
    "huggingface_hub", "fastapi[standard]",
)
| # ── Helper: MiniCPM client ──────────────────────────────────────────────────── | |
def get_llm_client():
    """Build an OpenAI-compatible client for the hosted MiniCPM text endpoint.

    Returns:
        An ``openai.OpenAI`` instance whose base URL is ``MINICPM_LLM_URL`` and
        whose API key is ``MINICPM_AUTH`` with the ``"Bearer "`` text removed.
    """
    # Imported inside the function, like the other heavy dependencies here.
    from openai import OpenAI

    bare_key = MINICPM_AUTH.replace("Bearer ", "")
    return OpenAI(base_url=MINICPM_LLM_URL, api_key=bare_key)
def get_vision_client():
    """Build an OpenAI-compatible client for the hosted MiniCPM vision endpoint.

    Returns:
        An ``openai.OpenAI`` instance whose base URL is ``MINICPM_VISION_URL``
        and whose API key is ``MINICPM_AUTH`` with the ``"Bearer "`` text
        removed.
    """
    # Imported inside the function, like the other heavy dependencies here.
    from openai import OpenAI

    bare_key = MINICPM_AUTH.replace("Bearer ", "")
    return OpenAI(base_url=MINICPM_VISION_URL, api_key=bare_key)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 1. ASR — Cohere Transcribe 2B | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def transcribe_audio(audio_bytes: bytes, filename: str = "audio.wav") -> str:
    """Transcribe an audio clip with the ``ASR_MODEL`` speech-recognition pipeline.

    Args:
        audio_bytes: Raw contents of the audio file.
        filename: Original file name; only its extension is used, as the suffix
            of the temporary file handed to the pipeline.

    Returns:
        The ``"text"`` field of the pipeline result when it is a dict,
        otherwise ``str(result)``.

    NOTE(review): the endpoint in this file invokes this via ``.remote(...)``,
    so it is presumably wrapped by a Modal function decorator that is not
    visible here. The pipeline is rebuilt on every call and is pinned to
    ``device="cuda"``.
    """
    import tempfile

    import torch
    from transformers import pipeline

    pipe = pipeline(
        "automatic-speech-recognition",
        model=ASR_MODEL,
        trust_remote_code=True,
        device="cuda",
        torch_dtype=torch.float16,
    )

    # delete=False so the file can be closed first and then opened by path.
    with tempfile.NamedTemporaryFile(suffix=Path(filename).suffix, delete=False) as f:
        f.write(audio_bytes)
        tmp_path = f.name

    try:
        result = pipe(tmp_path, return_timestamps=False)
    finally:
        # Always remove the temp file, including when transcription fails.
        os.unlink(tmp_path)

    return result["text"] if isinstance(result, dict) else str(result)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 2. OCR — Nemotron Parse v1.2 | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def ocr_document(image_bytes: bytes) -> str:
    """Extract the text of a document image with the ``OCR_MODEL`` model.

    Args:
        image_bytes: Raw contents of an image file in any format Pillow can
            decode.

    Returns:
        The decoded model output for the single input image, with the prompt
        tokens and special tokens removed.

    Raises:
        PIL.UnidentifiedImageError: If ``image_bytes`` is not a decodable image
            (raised before the model is loaded).

    NOTE(review): processor and model are loaded on every call. The endpoint in
    this file invokes this via ``.remote(...)``, so it is presumably wrapped by
    a Modal function decorator that is not visible here.
    """
    import io

    import torch
    from PIL import Image
    from transformers import AutoModelForImageTextToText, AutoProcessor

    # Decode in memory (no temp file to leak) and fail fast on bad input
    # before paying for the model load.
    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")

    processor = AutoProcessor.from_pretrained(OCR_MODEL, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        OCR_MODEL, trust_remote_code=True,
        torch_dtype=torch.float16, device_map="auto",
    )
    model.eval()

    messages = [{"role": "user", "content": [
        # The image itself must be attached to the message; a bare
        # {"type": "image"} placeholder gives the processor nothing to encode.
        {"type": "image", "image": image},
        {"type": "text", "text": "Extract all text from this document. Preserve structure and formatting."},
    ]}]
    inputs = processor.apply_chat_template(
        messages, tokenize=True, add_generation_prompt=True,
        return_dict=True, return_tensors="pt",
    ).to(model.device)

    generated_ids = model.generate(**inputs, max_new_tokens=2048)
    # Drop the echoed prompt tokens so only the newly generated text is decoded.
    trimmed = [out[len(inp):] for inp, out in zip(inputs.input_ids, generated_ids)]
    return processor.batch_decode(trimmed, skip_special_tokens=True)[0]
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 3. TTS — VoxCPM2 | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def text_to_speech(text: str, voice_description: str = None) -> bytes:
    """Synthesize ``text`` to WAV bytes with the ``TTS_MODEL`` VoxCPM model.

    Args:
        text: The text to speak.
        voice_description: Optional description; when truthy it is prepended to
            the text in parentheses before generation.

    Returns:
        The encoded WAV file contents.

    NOTE(review): the model is loaded on every call.
    """
    import io

    import soundfile as sf
    from voxcpm import VoxCPM

    tts = VoxCPM.from_pretrained(TTS_MODEL, load_denoiser=False)

    prompt_text = f"({voice_description}){text}" if voice_description else text
    waveform = tts.generate(text=prompt_text, cfg_value=2.0, inference_timesteps=10)

    # Encode to WAV in memory using the sample rate reported by the model.
    wav_buffer = io.BytesIO()
    sf.write(wav_buffer, waveform, tts.tts_model.sample_rate, format="WAV")
    return wav_buffer.getvalue()
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 4. Multilingual — Tiny Aya Fire (Bengali/South Asian fallback) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def chat_multilingual(system_prompt: str, history: list, user_message: str, model_id: str = None) -> str:
    """Generate one chat reply with a locally loaded causal language model.

    Args:
        system_prompt: Content of the leading system message.
        history: Earlier turns as dicts with ``"role"`` and ``"content"`` keys;
            only the last eight are sent to the model.
        user_message: The new user turn.
        model_id: Checkpoint to load; ``MULTILINGUAL_LM_FIRE`` when ``None``.

    Returns:
        The sampled continuation decoded without special tokens.

    NOTE(review): tokenizer and model are loaded on every call.
    """
    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    checkpoint = MULTILINGUAL_LM_FIRE if model_id is None else model_id

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    lm = AutoModelForCausalLM.from_pretrained(
        checkpoint, torch_dtype=torch.float16, device_map="auto",
    )
    lm.eval()

    conversation = [
        {"role": "system", "content": system_prompt},
        *({"role": turn["role"], "content": turn["content"]} for turn in history[-8:]),
        {"role": "user", "content": user_message},
    ]

    prompt_ids = tokenizer.apply_chat_template(
        conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt",
    ).to(lm.device)

    generated = lm.generate(
        prompt_ids, max_new_tokens=512, do_sample=True,
        temperature=0.8, top_p=0.95,
    )

    # Everything past the prompt length is the model's reply.
    prompt_length = prompt_ids.shape[-1]
    reply_ids = generated[0][prompt_length:]
    return tokenizer.decode(reply_ids, skip_special_tokens=True)
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 5. Photo Description — MiniCPM-V 4.6 (hosted API) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def describe_photo(image_bytes: bytes, prompt: str = None) -> str:
    """Ask the hosted MiniCPM vision model to describe a photo.

    Args:
        image_bytes: Raw image file contents; sent as a base64 data URL that is
            always labelled ``image/jpeg``.
        prompt: Instruction for the model; a default person-description prompt
            is used when ``None``.

    Returns:
        The first choice's message content, stripped of surrounding whitespace.
    """
    import base64

    if prompt is None:
        prompt = (
            "Describe this person in detail: their appearance, expression, what they are doing, "
            "the setting, and any emotional tone you sense. "
            "This will help reconstruct their memory and personality."
        )

    vision = get_vision_client()
    encoded = base64.b64encode(image_bytes).decode()
    image_part = {"type": "image_url", "image_url": {"url": f"data:image/jpeg;base64,{encoded}"}}
    text_part = {"type": "text", "text": prompt}

    completion = vision.chat.completions.create(
        model=MINICPM_VIS_MODEL,
        messages=[{"role": "user", "content": [image_part, text_part]}],
        max_tokens=512,
        temperature=0.3,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content.strip()
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 6. Persona Builder — MiniCPM4.1-8B (hosted API) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
def _parse_persona_reply(raw: str) -> dict:
    """Pull the JSON object out of a raw LLM reply and parse it."""
    import re

    print(f"RAW RESPONSE (first 500 chars): {raw[:500]}")
    # Strip <think>...</think> reasoning blocks.
    raw = re.sub(r'<think>.*?</think>', '', raw, flags=re.DOTALL).strip()
    # Defensive: if a closing tag is still present (its opening tag was not in
    # the reply), keep only what follows it.
    if "</think>" in raw:
        raw = raw.rpartition("</think>")[2].strip()
    print(f"AFTER THINK STRIP (first 500 chars): {raw[:500]}")

    # Strip markdown code fences: take the first fenced part that looks like JSON.
    if "```" in raw:
        for part in raw.split("```"):
            if part.startswith("json"):
                part = part[4:]
            part = part.strip()
            if part.startswith("{"):
                raw = part
                break

    # Narrow to the outermost {...} span to drop any surrounding chatter.
    json_match = re.search(r'[{].*[}]', raw, re.DOTALL)
    if json_match:
        raw = json_match.group(0)
    print(f"FINAL RAW FOR PARSING (first 300 chars): {raw[:300]}")

    persona = json.loads(raw)
    if not isinstance(persona, dict):
        raise ValueError("Model reply did not contain a JSON object.")
    return persona


def build_persona(
    persona_id: str, name: str, relationship: str,
    texts: list, photo_captions: list, voice_transcripts: list,
) -> dict:
    """Build a persona profile with the hosted LLM and save it as JSON.

    The supplied material is concatenated into one prompt, the model is asked
    for a JSON profile, and the parsed result is written to
    ``/personas/<persona_id>.json`` before ``volume.commit()`` is called.

    Args:
        persona_id: File-name stem for the stored persona. Must be a plain
            name: non-empty, not ``.``/``..``, and free of path separators.
        name: The person's name, interpolated into the prompt.
        relationship: Their relationship to the user, interpolated into the prompt.
        texts: Letters / diary entries (strings).
        photo_captions: Photo descriptions (strings).
        voice_transcripts: Voice transcripts (strings).

    Returns:
        The parsed persona dict.

    Raises:
        ValueError: If ``persona_id`` is not a safe file name, or the model
            reply is not a JSON object (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).

    NOTE(review): the endpoint in this file invokes this via ``.remote(...)``,
    so it is presumably wrapped by a Modal function decorator (mounting the
    volume at /personas) that is not visible here. The debug prints put
    persona content into the logs.
    """
    # persona_id arrives from the web request and becomes part of a file path;
    # reject anything that could escape /personas before doing any work.
    if (
        not isinstance(persona_id, str)
        or not persona_id
        or persona_id in (".", "..")
        or any(sep in persona_id for sep in ("/", "\\", "\x00"))
    ):
        raise ValueError(f"Invalid persona_id: {persona_id!r}")

    client = get_llm_client()

    all_content = []
    if texts:
        all_content.append("=== LETTERS / DIARY ENTRIES ===\n" + "\n---\n".join(texts))
    if photo_captions:
        all_content.append("=== PHOTO DESCRIPTIONS ===\n" + "\n---\n".join(photo_captions))
    if voice_transcripts:
        all_content.append("=== VOICE TRANSCRIPTS ===\n" + "\n---\n".join(voice_transcripts))
    combined = "\n\n".join(all_content)

    prompt = f"""You are helping preserve the memory of {name} ({relationship}).
Below is everything we have from them:
{combined}
Extract a rich persona profile. Return ONLY valid JSON, no markdown, no backticks:
{{
"name": "{name}",
"relationship": "{relationship}",
"personality_traits": ["5-8 traits"],
"speech_style": "how they speak, tone, vocabulary, sentence length",
"common_phrases": ["phrases or expressions they often used"],
"key_memories": ["10-15 specific memories or life events"],
"values": ["what they cared about most"],
"language": "primary language (English/Bengali/Hindi/mixed)",
"emotional_tone": "overall emotional quality",
"topics_they_loved": ["subjects they talked about often"],
"advice_they_gave": ["wisdom or advice they shared"],
"voice_description": "describe their voice: age, gender, tone, accent e.g. elderly Bengali man, warm gentle voice",
"system_prompt": "A 300-word system prompt starting with: You are {name}..."
}}"""

    response = client.chat.completions.create(
        model=MINICPM_LLM_MODEL,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.3,
        max_tokens=2048,
    )
    # content can be None; treat that as an empty reply so the failure surfaces
    # as a JSON parse error instead of an AttributeError.
    raw = (response.choices[0].message.content or "").strip()
    persona = _parse_persona_reply(raw)

    os.makedirs("/personas", exist_ok=True)
    with open(f"/personas/{persona_id}.json", "w", encoding="utf-8") as f:
        json.dump(persona, f, ensure_ascii=False, indent=2)
    volume.commit()
    return persona
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 7. Chat — MiniCPM4.1-8B (hosted API) | |
| # ───────────────────────────────────────────────────────────────────────────── | |
# Every code point of the Unicode "Bengali" block (U+0980–U+09FF). Using the
# whole block instead of a hand-typed letter list also covers vowel signs,
# digits and other marks, and does not depend on how the source file's
# characters were normalized.
BENGALI_CHARS = {chr(cp) for cp in range(0x0980, 0x0A00)}

# Inclusive code-point ranges treated as "Asia Pacific" scripts:
# CJK Unified Ideographs, Hiragana + Katakana, Hangul Syllables, Thai.
# Latin-script languages such as Vietnamese are NOT matched by these ranges.
ASIA_PACIFIC_RANGES = [(0x4E00, 0x9FFF), (0x3040, 0x30FF), (0xAC00, 0xD7AF), (0x0E00, 0x0E7F)]


def is_asia_pacific(text):
    """Return True if any character of *text* falls in ``ASIA_PACIFIC_RANGES``.

    An empty string yields False.
    """
    return any(
        lo <= ord(ch) <= hi
        for ch in text
        for lo, hi in ASIA_PACIFIC_RANGES
    )
def chat_with_persona(
    persona_id: str, history: list, user_message: str, language: str = "auto",
) -> dict:
    """Answer ``user_message`` in the voice of a stored persona.

    Loads ``/personas/<persona_id>.json``, builds a system prompt from it, adds
    a language instruction when one applies, and sends the last ten history
    turns plus the new message to the hosted LLM.

    Args:
        persona_id: File-name stem of the stored persona.
        history: Earlier turns as dicts with ``"role"`` and ``"content"`` keys.
        user_message: The new user turn.
        language: Explicit reply language, or ``"auto"`` to rely on script
            detection of ``user_message``.

    Returns:
        ``{"text": <reply>, "voice_description": <persona voice description>}``.

    Raises:
        FileNotFoundError: If the persona file does not exist, or
            ``persona_id`` is not a plain file name (so it cannot be used to
            read files outside ``/personas``).

    NOTE(review): the endpoint in this file invokes this via ``.remote(...)``,
    so it is presumably wrapped by a Modal function decorator that is not
    visible here.
    """
    import re

    # persona_id arrives from the web request; refuse path-like values with the
    # same error a missing persona produces.
    if (
        not isinstance(persona_id, str)
        or not persona_id
        or persona_id in (".", "..")
        or any(sep in persona_id for sep in ("/", "\\", "\x00"))
    ):
        raise FileNotFoundError(f"Persona '{persona_id}' not found.")

    persona_path = f"/personas/{persona_id}.json"
    if not os.path.exists(persona_path):
        raise FileNotFoundError(f"Persona '{persona_id}' not found.")
    with open(persona_path, "r", encoding="utf-8") as f:
        persona = json.load(f)

    system_prompt = persona.get("system_prompt", f"You are {persona['name']}.")
    voice_desc = persona.get("voice_description", "warm elderly voice")
    full_system = f"""{system_prompt}
IMPORTANT:
- You ARE {persona['name']}. Never break character.
- Use their speech style, phrases, and memories naturally.
- Be warm and personal, not like an AI.
- Detect the user's language and respond in the same language.
- Keep responses 2-4 sentences unless sharing a story."""

    is_bengali = any(c in BENGALI_CHARS for c in user_message)
    is_ap = is_asia_pacific(user_message)

    # All languages through MiniCPM4.1-8B (handles Bengali/Hindi/Chinese natively)
    # Tiny Aya Fire/Water kept for dedicated multilingual endpoint (future use)
    client = get_llm_client()

    # Add language instruction to system prompt
    if is_bengali or language == "Bengali":
        full_system = full_system + "\n\nIMPORTANT: The user is writing in Bengali. You MUST respond in Bengali only."
    elif is_ap or language in ("Chinese", "Japanese", "Korean", "Thai"):
        if language and language != "auto":
            full_system = full_system + "\n\nIMPORTANT: Respond in " + str(language) + " only."
        else:
            # Script detection fired but no language was named; do not tell the
            # model to "Respond in auto only."
            full_system = full_system + "\n\nIMPORTANT: Respond only in the language the user is writing in."

    messages = [{"role": "system", "content": full_system}]
    for msg in history[-10:]:
        messages.append({"role": msg["role"], "content": msg["content"]})
    messages.append({"role": "user", "content": user_message})

    response = client.chat.completions.create(
        model=MINICPM_LLM_MODEL,
        messages=messages,
        temperature=0.8,
        max_tokens=1024,
        stop=None,
    )
    # content can be None; treat that as an empty reply.
    response_text = (response.choices[0].message.content or "").strip()
    response_text = re.sub(r'<think>.*?</think>', '', response_text, flags=re.DOTALL).strip()

    # Remove any truncated incomplete sentence at end. Full-width CJK
    # terminators count as sentence ends too, so complete Chinese/Japanese
    # replies are not mistaken for truncated ones.
    terminators = '.!?।。！？'
    if response_text and response_text[-1] not in terminators:
        last_punct = max(response_text.rfind(ch) for ch in terminators)
        # Only trim when most of the reply survives.
        if last_punct > len(response_text) // 2:
            response_text = response_text[:last_punct + 1]

    return {"text": response_text, "voice_description": voice_desc}
| # ───────────────────────────────────────────────────────────────────────────── | |
| # 8. Web Endpoints | |
| # ───────────────────────────────────────────────────────────────────────────── | |
| from fastapi import FastAPI, Request | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from fastapi.responses import Response | |
def make_app():
    """Create a FastAPI app with fully permissive CORS.

    Every origin, method and header is allowed.

    Returns:
        The configured ``FastAPI`` instance.
    """
    cors_options = {
        "allow_origins": ["*"],
        "allow_methods": ["*"],
        "allow_headers": ["*"],
    }
    api = FastAPI()
    api.add_middleware(CORSMiddleware, **cors_options)
    return api
| # Health check — tests MiniCPM API reachability | |
health_web = make_app()


def health_endpoint():
    """Return the FastAPI app for the health check.

    The inner handler sends a tiny completion request to the hosted LLM and
    reports whether it succeeded.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `health_web`. Presumably the decorator lines are missing from this
    # copy of the file; confirm against the deployed source.
    async def handler():
        try:
            llm = get_llm_client()
            probe = llm.chat.completions.create(
                model=MINICPM_LLM_MODEL,
                messages=[{"role": "user", "content": "Hi"}],
                max_tokens=5,
            )
            reply = probe.choices[0].message.content
            return {"status": "ok", "minicpm": "reachable", "reply": reply}
        except Exception as e:
            # Any failure (network, auth, malformed reply) is reported, not raised.
            return {"status": "error", "minicpm": "unreachable", "error": str(e)}

    return health_web
| # Build persona | |
build_web = make_app()


def build_persona_endpoint():
    """Return the FastAPI app that fronts ``build_persona``.

    The handler expects a JSON body with ``persona_id``, ``name`` and
    ``relationship`` (required) plus optional ``texts``, ``photo_captions`` and
    ``voice_transcripts`` lists. It always answers with a JSON object:
    ``{"success": True, "persona": ...}`` or ``{"success": False, "error": ...}``.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `build_web`. Presumably the decorator lines are missing from this
    # copy of the file; confirm against the deployed source.
    async def handler(request: Request):
        try:
            data = await request.json()
            # Await the async variant of the remote call: a blocking .remote()
            # inside an async handler stalls the event loop for the whole
            # LLM round trip.
            persona = await build_persona.remote.aio(
                persona_id=data["persona_id"],
                name=data["name"],
                relationship=data["relationship"],
                texts=data.get("texts", []),
                photo_captions=data.get("photo_captions", []),
                voice_transcripts=data.get("voice_transcripts", []),
            )
            return {"success": True, "persona": persona}
        except Exception as e:
            # Errors are reported in the body rather than as an HTTP error.
            return {"success": False, "error": str(e)}

    return build_web
| # Chat | |
chat_web = make_app()


def chat_endpoint():
    """Return the FastAPI app that fronts ``chat_with_persona``.

    The handler expects a JSON body with ``persona_id`` and ``message``
    (required) plus optional ``history`` and ``language``, and returns the
    remote function's result unchanged. Errors are not caught here.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `chat_web`. Presumably the decorator lines are missing from this
    # copy of the file; confirm against the deployed source.
    async def handler(request: Request):
        data = await request.json()
        # Await the async variant of the remote call so the event loop is not
        # blocked while the reply is generated.
        return await chat_with_persona.remote.aio(
            persona_id=data["persona_id"],
            history=data.get("history", []),
            user_message=data["message"],
            language=data.get("language", "auto"),
        )

    return chat_web
| # Transcribe | |
transcribe_web = make_app()


def transcribe_endpoint():
    """Return the FastAPI app that fronts ``transcribe_audio``.

    The handler expects a JSON body with base64 audio in ``audio_b64`` and an
    optional ``filename``, and returns ``{"transcript": <text>}``. Errors are
    not caught here.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `transcribe_web`. Presumably the decorator lines are missing from
    # this copy of the file; confirm against the deployed source.
    async def handler(request: Request):
        data = await request.json()
        audio_bytes = base64.b64decode(data["audio_b64"])
        # Await the async variant of the remote call so the event loop is not
        # blocked while the audio is transcribed.
        transcript = await transcribe_audio.remote.aio(
            audio_bytes, data.get("filename", "audio.wav"),
        )
        return {"transcript": transcript}

    return transcribe_web
| # Vision | |
vision_web = make_app()


def vision_endpoint():
    """Return the FastAPI app that fronts ``describe_photo``.

    The handler expects a JSON body with a base64 image in ``image_b64`` and an
    optional ``prompt``, and returns ``{"description": <text>}``. Errors are
    not caught here.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `vision_web`. Presumably the decorator lines are missing from this
    # copy of the file; confirm against the deployed source.
    async def handler(request: Request):
        data = await request.json()
        image_bytes = base64.b64decode(data["image_b64"])
        # Await the async variant of the remote call so the event loop is not
        # blocked while the vision model responds.
        description = await describe_photo.remote.aio(image_bytes, data.get("prompt"))
        return {"description": description}

    return vision_web
| # OCR | |
ocr_web = make_app()


def ocr_endpoint():
    """Return the FastAPI app that fronts ``ocr_document``.

    The handler expects a JSON body with a base64 image in ``image_b64`` and
    returns ``{"text": <extracted text>}``. Errors are not caught here.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `ocr_web`. Presumably the decorator lines are missing from this copy
    # of the file; confirm against the deployed source.
    async def handler(request: Request):
        data = await request.json()
        image_bytes = base64.b64decode(data["image_b64"])
        # Await the async variant of the remote call so the event loop is not
        # blocked while the document is processed.
        text = await ocr_document.remote.aio(image_bytes)
        return {"text": text}

    return ocr_web
| # TTS | |
tts_web = make_app()


def tts_endpoint():
    """Return the FastAPI app that fronts ``text_to_speech``.

    The handler expects a JSON body with ``text`` and an optional
    ``voice_description``, and responds with the WAV bytes as ``audio/wav``.
    Errors are not caught here.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `tts_web`. Presumably the decorator lines are missing from this copy
    # of the file; confirm against the deployed source.
    async def handler(request: Request):
        data = await request.json()
        # Await the async variant of the remote call so the event loop is not
        # blocked while the audio is synthesized.
        wav_bytes = await text_to_speech.remote.aio(
            data["text"], data.get("voice_description"),
        )
        return Response(content=wav_bytes, media_type="audio/wav")

    return tts_web
| # List personas | |
list_web = make_app()


def list_personas_endpoint():
    """Return the FastAPI app that lists stored personas.

    The handler scans ``/personas`` for ``*.json`` files and returns
    ``{"personas": [{"id", "name", "relationship"}, ...]}``; the list is empty
    when the directory does not exist.
    """
    # NOTE(review): no route decorator is visible on `handler` and no Modal
    # decorators on this function — as written the handler is never attached
    # to `list_web`. Presumably the decorator lines are missing from this copy
    # of the file; confirm against the deployed source.
    async def handler():
        personas = []
        if os.path.exists("/personas"):
            for f in Path("/personas").glob("*.json"):
                # Persona files are written as UTF-8 with ensure_ascii=False,
                # so read them as UTF-8 rather than the platform default.
                with open(f, encoding="utf-8") as fp:
                    data = json.load(fp)
                personas.append({
                    "id": f.stem,
                    "name": data.get("name"),
                    "relationship": data.get("relationship"),
                })
        return {"personas": personas}

    return list_web