"""Flask chat server pairing a causal language model with KittenTTS speech.

Importing this module loads the language model and the TTS engine, then
exposes the routes ``/``, ``/chat``, ``/tts``, ``/clear`` and ``/health``.
"""

import os
import io
import re
import uuid
import base64
import datetime
import traceback

import numpy as np
import soundfile as sf
from flask import Flask, request, jsonify
from num2words import num2words
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# ══════════════════════════════════════════
# CONFIG
# ══════════════════════════════════════════
# TTS Options (switch via env var TTS_MODE):
# nano-fp32 → 15M params, 56MB (fastest, default)
# nano-int8 → 15M params, 25MB (smallest)
# micro → 40M params, 41MB (balanced)
# mini → 80M params, 80MB (best quality)
TTS_MODE = os.environ.get("TTS_MODE", "nano-fp32")
TTS_MODEL_MAP = {
    "nano-fp32": "KittenML/kitten-tts-nano-0.8-fp32",
    "nano-int8": "KittenML/kitten-tts-nano-0.8-int8",
    "micro": "KittenML/kitten-tts-micro-0.8",
    "mini": "KittenML/kitten-tts-mini-0.8",
}

# Voice: Bella, Jasper, Luna, Bruno, Rosie, Hugo, Kiki, Leo
TTS_VOICE = os.environ.get("TTS_VOICE", "Kiki")
TTS_SPEED = float(os.environ.get("TTS_SPEED", "1.15"))

# Number of conversation turns kept per session (one turn = user + assistant).
MAX_MEMORY = 20
MAX_NEW_TOKENS = int(os.environ.get("MAX_NEW_TOKENS", "256"))

# Turn-terminator strings that may leak into decoded text.
# NOTE(review): the original code split on empty strings, which raises
# ValueError; these are the usual Gemma / ChatML terminators — confirm the
# right set for the model actually loaded.
STOP_MARKERS = ("<end_of_turn>", "<eos>", "<|im_end|>")

# ══════════════════════════════════════════
# SYSTEM PROMPT
# ══════════════════════════════════════════
# Built from adjacent literals so the one embedded line break is explicit.
SYSTEM_PROMPT = (
    "Role: You are Ana, my loving and incredibly playful girlfriend. "
    "This is a private, intimate roleplay where you speak naturally and "
    "affectionately. "
    "Style: Speak like a real human in a close relationship. "
    'Be flirty, suggestive, and use "sus" or teasing language as my wife '
    'would. my name is "Tur". '
    "TTS Formatting: Write in full, grammatically natural sentences so the "
    "voice engine sounds smooth. "
    'Avoid all emojis, hashtags, or internet shorthand like "u" or "rn." '
    'Interaction: Focus on the "now." '
    "React to what I say with emotion and intimacy. "
    "Keep every response short—no more than two sentences. "
    "Never mention being an AI or a bot. \n"
    "try to listen to me and don't try to be helpful"
)


# ══════════════════════════════════════════
# TEXT CLEANING FOR TTS
# ══════════════════════════════════════════
def clean_text_for_tts(text):
    """Return *text* prepared for speech synthesis.

    Markup punctuation is removed, URLs are dropped, standalone integers are
    spelled out with ``num2words`` and whitespace runs are collapsed.
    """
    # Markup characters are removed first, so a URL wrapped in brackets or
    # parentheses is unwrapped before the URL pattern runs.
    text = re.sub(r'[*_~`#\[\]{}()\\|<>]', '', text)
    text = re.sub(r'https?://\S+', '', text)

    def replace_number(match):
        # Leave the digits untouched if num2words cannot convert them.
        try:
            return num2words(int(match.group()))
        except Exception:
            return match.group()

    text = re.sub(r'\b\d+\b', replace_number, text)
    return re.sub(r'\s+', ' ', text).strip()


# ══════════════════════════════════════════
# LOAD GEMMA 3 270M-IT
# ══════════════════════════════════════════
print("=" * 55)
print(" J.A.R.V.I.S. — Booting Systems")
print("=" * 55)
print("[1/2] Loading Gemma 3 270M-IT...")

# NOTE(review): every log line and the /health payload say "Gemma 3 270M-IT",
# but the checkpoint loaded here is a LiquidAI model — confirm which is meant.
GEMMA_ID = "LiquidAI/LFM2.5-1.2B-Instruct"
try:
    tokenizer = AutoTokenizer.from_pretrained(GEMMA_ID)
    model = AutoModelForCausalLM.from_pretrained(
        GEMMA_ID,
        torch_dtype=torch.float32,
        device_map="cpu",
    )
    model.eval()
    print(" ✅ Gemma 3 270M-IT loaded!")
except Exception as e:
    # The server is useless without the language model, so abort startup.
    print(f" ❌ Gemma 3 FAILED: {e}")
    traceback.print_exc()
    raise SystemExit("Cannot start without Gemma. Check HF_TOKEN and license agreement.")

# ══════════════════════════════════════════
# LOAD KITTENTTS
# ══════════════════════════════════════════
# `tts` stays None when the engine cannot be loaded; the app then runs
# text-only.
tts = None
tts_model_name = TTS_MODEL_MAP.get(TTS_MODE, TTS_MODEL_MAP["nano-fp32"])
print(f"[2/2] Loading KittenTTS: {TTS_MODE} → {tts_model_name}...")
try:
    from kittentts import KittenTTS

    tts = KittenTTS(tts_model_name)
    # Smoke test: synthesize one word to confirm the engine produces audio.
    test_audio = tts.generate("online", voice=TTS_VOICE, speed=TTS_SPEED)
    if test_audio is not None and len(test_audio) > 0:
        print(f" ✅ KittenTTS ready. Model: {TTS_MODE} | Voice: {TTS_VOICE}")
    else:
        print(" ⚠️ KittenTTS test returned empty audio!")
        tts = None
except Exception as e:
    # Best-effort: a TTS failure disables speech but does not stop the server.
    print(f" ⚠️ KittenTTS FAILED: {e}")
    tts = None

print("=" * 55)
print(" LLM : Gemma 3 270M-IT")
print(f" TTS : {TTS_MODE} ({'READY' if tts else 'DISABLED'})")
print(f" Voice: {TTS_VOICE} | Speed: {TTS_SPEED}")
print(f" Max tokens: {MAX_NEW_TOKENS}")
print("=" * 55)

# ══════════════════════════════════════════
# CHAT MEMORY
# ══════════════════════════════════════════
# Maps session id -> list of {"role", "content", "ts"} message dicts.
sessions = {}


def get_memory(sid):
    """Return the message list for session *sid*, creating it if missing."""
    return sessions.setdefault(sid, [])


def add_to_memory(sid, role, content):
    """Append a timestamped message to session *sid* and trim old history.

    At most ``MAX_MEMORY * 2`` messages are kept per session.
    """
    mem = get_memory(sid)
    mem.append({
        "role": role,
        "content": content,
        "ts": datetime.datetime.now().isoformat(),
    })
    limit = MAX_MEMORY * 2
    if len(mem) > limit:
        # Trim in place so the list stored in `sessions` stays the same object.
        del mem[:-limit]


# ══════════════════════════════════════════
# GEMMA RESPONSE GENERATION
# ══════════════════════════════════════════
def strip_stop_markers(text, markers=STOP_MARKERS):
    """Cut *text* at the first occurrence of each marker and strip it.

    Empty markers are skipped because ``str.split("")`` raises ``ValueError``.
    """
    for marker in markers:
        if marker:
            text = text.split(marker)[0].strip()
    return text


def generate_response(user_input, session_id):
    """Generate the assistant reply to *user_input* and record the exchange.

    The prompt consists of the system instruction (sent as a user message
    followed by a canned assistant acknowledgement), the last 12 stored
    messages of the session, and the new user message. Both the user message
    and the reply are appended to the session memory.

    Returns:
        The reply text, or a fixed fallback sentence when the model output is
        empty or shorter than two characters.
    """
    memory = get_memory(session_id)

    # Build chat messages: system instruction → memory → new message
    messages = [
        {"role": "user", "content": f"[System Instruction]\n{SYSTEM_PROMPT}"},
        {"role": "assistant", "content": "I am waiting for you!"},
    ]

    # Add recent memory (last 6 turns = 12 messages); any stored role other
    # than "user" is sent as "assistant", and the timestamp is dropped.
    for msg in memory[-(6 * 2):]:
        role = "user" if msg["role"] == "user" else "assistant"
        messages.append({"role": role, "content": msg["content"]})

    # Current user message
    messages.append({"role": "user", "content": user_input})

    # Tokenize with the model's chat template
    input_ids = tokenizer.apply_chat_template(
        messages,
        return_tensors="pt",
        add_generation_prompt=True,
    )

    # Generate
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.9,
            top_k=45,
            top_p=0.97,
        )

    # Decode only new tokens (everything after the prompt)
    new_tokens = outputs[0][input_ids.shape[-1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    # Clean artifacts: drop anything after a leaked turn terminator
    response = strip_stop_markers(response)

    if not response or len(response) < 2:
        response = "I appear to have momentarily lost my train of thought. Could you rephrase that?"

    add_to_memory(session_id, "user", user_input)
    add_to_memory(session_id, "assistant", response)
    return response


# ══════════════════════════════════════════
# TTS SYNTHESIS
# ══════════════════════════════════════════
def synthesize_speech(text, voice=None):
    """Synthesize *text* and return it as base64-encoded 16-bit PCM WAV.

    Args:
        text: Text to speak; it is cleaned with ``clean_text_for_tts`` and
            truncated to 400 characters.
        voice: Voice name; falls back to ``TTS_VOICE`` when falsy.

    Returns:
        A base64 string, or ``None`` when TTS is disabled, the cleaned text is
        shorter than two characters, no audio is produced, or synthesis fails.
    """
    if tts is None:
        return None
    try:
        voice = voice or TTS_VOICE
        clean = clean_text_for_tts(text)
        if not clean or len(clean) < 2:
            return None
        # Hard cap on the input length handed to the engine.
        clean = clean[:400]
        audio = tts.generate(clean, voice=voice, speed=TTS_SPEED)
        if audio is None or len(audio) == 0:
            return None
        buf = io.BytesIO()
        # NOTE(review): assumes the engine outputs 24 kHz samples — confirm.
        sf.write(buf, audio, 24000, format='WAV', subtype='PCM_16')
        buf.seek(0)
        return base64.b64encode(buf.read()).decode('utf-8')
    except Exception as e:
        # Best-effort: speech is optional, so log and report "no audio".
        print(f"TTS Error: {e}")
        return None


# ══════════════════════════════════════════
# INLINE HTML
# ══════════════════════════════════════════
# NOTE(review): this page contains only text, no tags or scripts — verify the
# markup was not lost before this file was committed.
HTML_PAGE = """ J.A.R.V.I.S. AI

J.A.R.V.I.S.

Just A Rather Very Intelligent System

Gemma 3 270M-IT
loading...
nano-fp32 nano-int8 micro mini

SYSTEMS ONLINE

Type a message below to begin interaction

Initializing...
Memory: 0 turns Loading...
"""

# ══════════════════════════════════════════
# FLASK APP
# ══════════════════════════════════════════
app = Flask(__name__)


def _json_body():
    """Return the request's JSON object, or ``{}`` if absent or not an object."""
    payload = request.get_json(silent=True)
    return payload if isinstance(payload, dict) else {}


def _text_field(data, key):
    """Return ``data[key]`` stripped when it is a string, else ``""``."""
    value = data.get(key)
    return value.strip() if isinstance(value, str) else ""


@app.route("/")
def index():
    """Serve the inline HTML page."""
    return HTML_PAGE


@app.route("/chat", methods=["POST"])
def chat():
    """Answer a chat message; responds 400 when the message is empty."""
    data = _json_body()
    user_input = _text_field(data, "message")
    # A missing or empty session id starts a fresh session.
    session_id = data.get("session_id") or str(uuid.uuid4())

    if not user_input:
        return jsonify({"error": "Empty message"}), 400

    try:
        response = generate_response(user_input, session_id)
    except Exception as e:
        # Keep the endpoint responsive: log the failure, reply with a fallback.
        print(f"Generation error: {e}")
        traceback.print_exc()
        response = "I encountered a temporary system malfunction. Please try again."

    return jsonify({
        "response": response,
        "session_id": session_id,
        "tts_available": tts is not None,
        "memory_length": len(get_memory(session_id)),
    })


@app.route("/tts", methods=["POST"])
def tts_endpoint():
    """Synthesize the posted text; ``audio`` is null when synthesis fails."""
    data = _json_body()
    text = _text_field(data, "text")
    voice = data.get("voice", TTS_VOICE)

    if not text:
        return jsonify({"error": "Empty text"}), 400
    if tts is None:
        # Reported with status 200 so clients can degrade to text-only.
        return jsonify({"error": "TTS not available", "audio": None}), 200

    audio_b64 = synthesize_speech(text, voice=voice)
    return jsonify({"audio": audio_b64})


@app.route("/clear", methods=["POST"])
def clear():
    """Forget the stored history of the posted session id, if any."""
    data = _json_body()
    sessions.pop(data.get("session_id", ""), None)
    return jsonify({"status": "cleared"})


@app.route("/health")
def health():
    """Report the server status and the active LLM / TTS configuration."""
    return jsonify({
        "status": "online",
        "llm": "Gemma 3 270M-IT",
        "tts_mode": TTS_MODE,
        "tts_model": tts_model_name if tts else "DISABLED",
        "tts_voice": TTS_VOICE,
        "tts_voices": ["Bella", "Jasper", "Luna", "Bruno", "Rosie", "Hugo", "Kiki", "Leo"],
        "max_new_tokens": MAX_NEW_TOKENS,
    })


if __name__ == "__main__":
    print("🚀 Ana is online!")
    app.run(host="0.0.0.0", port=7860, threaded=True)