Upload bridge.py with huggingface_hub

6ed1a03 verified 17 days ago

20 kB

	#!/usr/bin/env python3
	"""
	MINA Android Bridge v3 — bridge.py
	IMDA NMLP / Mun Yew (Darren) Loh

	Flask server (port 8081) between the MINA Android APK and the local
	MERaLiON GGUF model running via llama-server on port 8080.

	Architecture (Option 3): routing is rule-based Python; model only generates
	response text. Single llama call per reply (halves inference time).

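	Python dependencies (all pre-installed on Termux — no Rust/C++ compilation needed):
	flask, requests (third-party); json, os, re, sys, time, traceback, pathlib (stdlib)
	External tool: whisper.cpp's whisper-cli plus the ggml-base.bin model, used for
	audio transcription (see WHISPER_CLI / WHISPER_MODEL below).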

	Endpoints:
	GET /health → liveness probe (Android polls this every 3 s until ready)
	POST /completion → transcribe WAV + generate MINA reply

	Usage (Termux):
	python3 bridge.py
	# or via start_mina.sh watchdog
	"""

	import json
	import os
	import re
	import sys
	import time
	import traceback
	from pathlib import Path

	import requests
	from flask import Flask, request, jsonify

	# Line-buffer stdout so every print() reaches the log as soon as it is written.
	# Guarded: sys.stdout can be None or a replacement object without reconfigure(),
	# and an AttributeError here would stop the whole module from importing.
	if hasattr(sys.stdout, "reconfigure"):
	    sys.stdout.reconfigure(line_buffering=True)

	# ── Config ─────────────────────────────────────────────────────────────────────
	# Endpoints and limits; each one can be overridden through the environment.
	LLAMA_URL = os.environ.get("LLAMA_URL", "http://localhost:8080")
	PORT = int(os.environ.get("BRIDGE_PORT", "8081"))
	MAX_TOKENS = int(os.environ.get("MAX_TOKENS", "256"))

	# ── Knowledge base & gap logging ───────────────────────────────────────────────
	# Both files live under the same meralion directory in the Termux home.
	_MERALION_DIR = Path("/data/data/com.termux/files/home/meralion")
	KNOWLEDGE_FILE = _MERALION_DIR / "mina_knowledge.json"
	GAP_LOG = _MERALION_DIR / "gaps" / "gap_log.jsonl"

	# whisper.cpp binary and model, resolved against the current user's home.
	WHISPER_CLI = os.path.expanduser("~/whisper.cpp/build/bin/whisper-cli")
	WHISPER_MODEL = os.path.expanduser("~/whisper.cpp/models/ggml-base.bin")


	def load_knowledge():
	    """Read the MINA knowledge base from ``KNOWLEDGE_FILE``.

	    The file is optional, and callers only ever use ``.get()`` on the result,
	    so every failure mode degrades to an empty dict instead of raising.

	    Returns:
	        dict: The parsed JSON object, or ``{}`` when the file is missing,
	        unreadable, not valid JSON, or its top level is not a JSON object.
	    """
	    # EAFP: read directly instead of exists()+read, which can race with a
	    # concurrent update of the file.
	    try:
	        data = json.loads(KNOWLEDGE_FILE.read_text(encoding="utf-8"))
	    except FileNotFoundError:
	        return {}
	    except (OSError, ValueError) as e:
	        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
	        print(f"KNOWLEDGE LOAD FAILED (using empty knowledge): {e}", flush=True)
	        return {}
	    if not isinstance(data, dict):
	        print("KNOWLEDGE LOAD FAILED (top level is not an object)", flush=True)
	        return {}
	    return data


	def log_gap(gap_type, user_request, context=""):
	    """Record a missing capability locally and, best-effort, push it to ntfy.

	    One JSON line is appended to ``GAP_LOG``; the same fields are then POSTed
	    to ``https://ntfy.sh/<NTFY_TOPIC>``. Both steps are best-effort: failures
	    are printed and swallowed so gap logging can never abort the reply that
	    triggered it.

	    Args:
	        gap_type: Short identifier of the missing capability.
	        user_request: The user's transcript that triggered the gap.
	        context: Optional free-text note stored alongside the request.

	    Environment:
	        NTFY_TOPIC: Topic to publish to (default ``roar-imda-demo``). Set it
	            to an empty string to disable the remote push.
	    """
	    entry = {
	        "timestamp": time.strftime("%Y-%m-%dT%H:%M:%S"),
	        "gap_type": gap_type,
	        "user_request": user_request,
	        "context": context,
	        "status": "pending",
	    }

	    # Local gap log. parents=True so a missing meralion/ directory is created
	    # too instead of raising FileNotFoundError.
	    try:
	        GAP_LOG.parent.mkdir(parents=True, exist_ok=True)
	        with open(GAP_LOG, "a", encoding="utf-8") as f:
	            f.write(json.dumps(entry) + "\n")
	        print(f"GAP LOGGED: {gap_type}", flush=True)
	    except OSError as e:
	        print(f"GAP LOG WRITE FAILED (non-critical): {e}", flush=True)

	    # Remote sync via ntfy.
	    # NOTE(review): user_request is the raw transcript and may contain crisis
	    # disclosures; it is sent to a third-party service under a guessable
	    # default topic name. Confirm this is acceptable, or point NTFY_TOPIC at
	    # a private topic / disable it with an empty value.
	    ntfy_topic = os.getenv("NTFY_TOPIC", "roar-imda-demo").strip()
	    if not ntfy_topic:
	        return
	    try:
	        import urllib.request
	        ntfy_url = f"https://ntfy.sh/{ntfy_topic}"
	        message = json.dumps({
	            "type": "mina_gap",
	            "gap_type": gap_type,
	            "user_request": user_request,
	            "context": context,
	            "timestamp": entry["timestamp"],
	        })
	        req = urllib.request.Request(
	            ntfy_url,
	            data=message.encode(),
	            headers={
	                "Title": f"MINA Gap: {gap_type}",
	                "Tags": "brain",
	                "Priority": "default",
	            },
	            method="POST",
	        )
	        # Context manager closes the HTTP response instead of leaking it.
	        with urllib.request.urlopen(req, timeout=5):
	            pass
	        print(f"GAP SYNCED TO NTFY: {gap_type}", flush=True)
	    except Exception as e:
	        print(f"NTFY SYNC FAILED (non-critical): {e}", flush=True)


	# Knowledge-base snapshot taken once at import time. completion() reads its
	# "capabilities" map from this copy, whereas append_resources() calls
	# load_knowledge() again on each request.
	KNOWLEDGE = load_knowledge()

	# ── Emotion VAD lookup ─────────────────────────────────────────────────────────
	# Approximate audeering-calibrated VAD (valence / arousal / dominance) scores
	# for Singapore English speech, keyed by canonical emotion label.
	# Stand-in for when audeering cannot run on-device; gives realistic scores for
	# display. Values sit roughly in [0, 1] after laptop-mic calibration.
	_VAD_AXES = ("valence", "arousal", "dominance")
	EMOTION_VAD = {
	    label: dict(zip(_VAD_AXES, scores))
	    for label, scores in (
	        ("anxious", (0.25, 0.52, 0.35)),
	        ("fearful", (0.20, 0.65, 0.28)),
	        ("distressed", (0.22, 0.48, 0.30)),
	        ("stressed", (0.28, 0.55, 0.35)),
	        ("sad", (0.22, 0.28, 0.32)),
	        ("upset", (0.24, 0.42, 0.30)),
	        ("angry", (0.18, 0.72, 0.68)),
	        ("excited", (0.76, 0.66, 0.64)),
	        ("happy", (0.80, 0.58, 0.62)),
	        ("calm", (0.65, 0.28, 0.55)),
	        ("exhausted", (0.32, 0.22, 0.30)),
	        ("tired", (0.35, 0.24, 0.32)),
	        ("urgent", (0.44, 0.68, 0.60)),
	        ("neutral", (0.50, 0.38, 0.50)),
	    )
	}

	# Normalise variant labels to the canonical EMOTION_VAD keys.
	# Only non-canonical labels belong here: aliasing a label that already has its
	# own VAD entry (the old "upset" -> "sad") made that entry unreachable, and an
	# identity mapping ("angry" -> "angry") does nothing.
	EMOTION_ALIASES = {
	    "worried": "anxious",
	    "nervous": "anxious",
	    "frustrated": "anxious",
	    "scared": "fearful",
	    "panic": "fearful",
	    "depressed": "distressed",
	    "miserable": "distressed",
	    "unhappy": "sad",
	    "joyful": "excited",
	    "energetic": "excited",
	    "relaxed": "calm",
	    "peaceful": "calm",
	    "fatigued": "exhausted",
	    "drained": "exhausted",
	}

	# ── Rule-based agent routing (Option 3 — no LLM call for routing) ─────────────

	# Routing table, highest priority first: crisis (VITA) beats scam (SENTINEL),
	# which beats calendar (KRONOS).
	_AGENT_KEYWORDS = (
	    ("VITA", ("giving up", "want to die",
	              "hurt myself", "hopeless",
	              "end it all", "cannot take it")),
	    ("SENTINEL", ("scam", "police", "spf",
	                  "bank account", "transfer money")),
	    # "reschedule" is listed explicitly because keywords now only match at the
	    # start of a word, so "schedule" alone no longer covers it.
	    ("KRONOS", ("meeting", "calendar", "schedule", "reschedule",
	                "appointment", "next week", "tomorrow",
	                "book", "check my", "free slot")),
	)

	# One compiled pattern per agent. The leading \b anchors each keyword to the
	# start of a word: inflections still match ("scammed", "booking") but
	# embedded hits do not ("facebook", "notebook").
	_AGENT_PATTERNS = tuple(
	    (agent, re.compile(r"\b(?:" + "|".join(re.escape(k) for k in keywords) + ")"))
	    for agent, keywords in _AGENT_KEYWORDS
	)


	def route_agent(transcript):
	    """Pick the agent that should answer ``transcript`` using keyword rules.

	    Args:
	        transcript: What the user said. Matching is case-insensitive and
	            whitespace-insensitive, so a phrase split across lines still matches.

	    Returns:
	        str: ``"VITA"``, ``"SENTINEL"`` or ``"KRONOS"`` for the first table
	        entry with a matching keyword, otherwise ``"MINA"``. Non-string or
	        empty input also yields ``"MINA"``.
	    """
	    if not isinstance(transcript, str):
	        return "MINA"
	    # Collapse runs of whitespace (including newlines) to single spaces so
	    # multi-word keywords such as "bank account" survive line breaks.
	    text = " ".join(transcript.lower().split())
	    for agent, pattern in _AGENT_PATTERNS:
	        if pattern.search(text):
	            return agent
	    return "MINA"


	# ── Agent-specific focused prompts ────────────────────────────────────────────

	def build_prompt(transcript, agent, emotion):
	    """Build the single-shot llama prompt for the chosen agent.

	    Args:
	        transcript: What the user said; inserted verbatim into the prompt.
	        agent: ``"KRONOS"``, ``"VITA"`` or ``"SENTINEL"``; anything else uses
	            the general companion prompt.
	        emotion: Detected emotion label. Used only by the general prompt, which
	            mentions it when it is a non-empty label other than ``"neutral"``.
	            Previously the general prompt always claimed the user sounded
	            stressed or anxious, whatever was detected.

	    Returns:
	        str: The prompt text to send to llama-server.
	    """
	    persona = "You are MINA Singapore AI companion. "

	    if agent == "KRONOS":
	        return (
	            persona
	            + f"User needs calendar help: {transcript}. "
	            + "Reply in one warm sentence offering "
	            + "to check their calendar."
	        )
	    if agent == "VITA":
	        return (
	            persona
	            + f"User is struggling emotionally: {transcript}. "
	            + "Reply in one gentle caring sentence. "
	            + "Tell them they are not alone."
	        )
	    if agent == "SENTINEL":
	        return (
	            persona
	            + f"User may be facing a scam: {transcript}. "
	            + "Reply in one sentence warning them calmly."
	        )

	    # General companion prompt: describe the mood only when one was detected.
	    mood = emotion.strip().lower() if isinstance(emotion, str) else ""
	    mood_hint = f"User sounds {mood}. " if mood and mood != "neutral" else ""
	    return (
	        persona
	        + f"User said: {transcript}. "
	        + mood_hint
	        + "Reply in one warm empathetic sentence."
	    )


	# ── Append hotline resources after model reply ────────────────────────────────

	# Request detectors for the VITA footer. The leading \b anchors each keyword to
	# the start of a word, so "suffering" no longer counts as "ring" and
	# "basically" no longer counts as "call". "handphone" and "telephone" are
	# listed explicitly because "phone" alone would otherwise stop matching them.
	_CALL_REQUEST = re.compile(r"\b(?:call|phone|handphone|telephone|ring)")
	_MESSAGE_REQUEST = re.compile(r"\b(?:whatsapp|message|text|chat)")


	def _as_dict(value):
	    """Return ``value`` if it is a dict, else an empty dict."""
	    return value if isinstance(value, dict) else {}


	def _log_gap_safely(gap_type, transcript, context):
	    """Call ``log_gap`` but never let a logging failure reach the caller."""
	    try:
	        log_gap(gap_type, transcript, context)
	    except Exception as e:
	        print(f"GAP LOG FAILED (non-critical): {e}", flush=True)


	def append_resources(reply, agent, transcript=""):
	    """Append hotline / reporting information to the model's reply.

	    Args:
	        reply: The reply text generated by the model.
	        agent: Routed agent name. Only ``"VITA"`` and ``"SENTINEL"`` get a
	            footer; any other value returns ``reply`` unchanged.
	        transcript: What the user said. For VITA it is scanned for a request
	            to call or message someone.

	    Returns:
	        str: ``reply`` plus the footer for the agent. VITA contact details
	        come from the knowledge file's ``crisis_resources`` with built-in
	        fallbacks, so the hotlines are appended even when the knowledge file
	        cannot be loaded or gap logging fails.
	    """
	    if agent == "SENTINEL":
	        return (reply +
	                "\n\nReport scams:"
	                "\n• ScamShield: 1799"
	                "\n• SPF: 999")
	    if agent != "VITA":
	        return reply

	    # The knowledge file is only needed for VITA, so it is read here rather
	    # than on every reply. Any failure falls back to the built-in numbers.
	    try:
	        knowledge = _as_dict(load_knowledge())
	    except Exception as e:
	        print(f"KNOWLEDGE LOAD FAILED (using built-in hotlines): {e}", flush=True)
	        knowledge = {}
	    crisis = _as_dict(knowledge.get("crisis_resources"))
	    caps = _as_dict(knowledge.get("capabilities"))
	    sos = _as_dict(crisis.get("SOS_Lifeline"))
	    imh = _as_dict(crisis.get("IMH_Crisis"))

	    call_sos = f"\n• Call SOS: {sos.get('phone', '1767')}"
	    whatsapp_sos = f"\n• WhatsApp SOS: {sos.get('whatsapp', 'https://wa.me/6591511767')}"
	    imh_line = f"\n• IMH: {imh.get('phone', '6389 2222')}"

	    t = transcript.lower() if isinstance(transcript, str) else ""

	    # User asks MINA to make a phone call
	    if _CALL_REQUEST.search(t) and not caps.get("make_phone_call"):
	        _log_gap_safely("make_phone_call", transcript,
	                        "User requested phone call to SOS")
	        return (reply +
	                "\n\nI can't make calls yet, but I'm learning this capability."
	                "\n\nFor now, please reach out directly:"
	                + call_sos + whatsapp_sos + imh_line)

	    # User asks MINA to send a WhatsApp / message
	    if _MESSAGE_REQUEST.search(t) and not caps.get("send_whatsapp"):
	        _log_gap_safely("send_whatsapp", transcript,
	                        "User requested WhatsApp to SOS")
	        return (reply +
	                "\n\nI can't send WhatsApp yet, but I'm learning this capability."
	                "\n\nFor now, please reach out directly:"
	                + whatsapp_sos + call_sos + imh_line)

	    # Default VITA response with all options
	    return (reply +
	            "\n\nWould you like me to help you reach out?"
	            f"\n• Call SOS 24hr: {sos.get('phone', '1767')}"
	            + whatsapp_sos + imh_line)


	def _normalise_emotion(raw):
	    """Map a raw emotion label onto a key of ``EMOTION_VAD``.

	    Args:
	        raw: Label to normalise; surrounding whitespace and case are ignored.

	    Returns:
	        str: The alias target from ``EMOTION_ALIASES`` when one exists,
	        otherwise the cleaned label itself if it is an ``EMOTION_VAD`` key.
	        Anything else, including ``None`` or non-string input, yields
	        ``"neutral"``.
	    """
	    if not isinstance(raw, str):
	        return "neutral"
	    label = raw.strip().lower()
	    label = EMOTION_ALIASES.get(label, label)
	    return label if label in EMOTION_VAD else "neutral"


	def _llama_post(path, body, timeout=120):
	    """POST ``body`` as JSON to llama-server at ``path``; return the decoded JSON.

	    The request blocks for up to ``timeout`` seconds, and
	    ``raise_for_status()`` is called on the response before it is decoded.
	    """
	    base = LLAMA_URL.rstrip("/")
	    response = requests.post(base + path, json=body, timeout=timeout)
	    response.raise_for_status()
	    return response.json()


	def _llama_get(path, timeout=8):
	    """GET ``path`` from llama-server; return the decoded JSON body.

	    The request blocks for up to ``timeout`` seconds, and
	    ``raise_for_status()`` is called on the response before it is decoded.
	    """
	    base = LLAMA_URL.rstrip("/")
	    response = requests.get(base + path, timeout=timeout)
	    response.raise_for_status()
	    return response.json()


	# Markers after which the model has started echoing the prompt or inventing
	# further dialogue turns; everything from the first marker on is discarded.
	_REPLY_STOP_MARKERS = (
	    "User said:", "\nUser:",
	    "\nMINA:", "\nKRONOS:",
	    "\nVITA:", "\nSENTINEL:",
	    "Emotional state:",
	    "Agent routing:",
	    "\nResponse:", "\nOkay,",
	)

	# First sentence: shortest prefix ending in . ! or ? that is followed by
	# whitespace or end of text, so decimals and times ("3.30") are not cut.
	_FIRST_SENTENCE_RE = re.compile(r"^(.*?[.!?])(?=\s|$)", re.DOTALL)


	def clean_reply(text):
	    """Reduce raw model output to a single clean sentence.

	    Steps: cut at the first stop marker, trim whitespace and wrapping ``*`` /
	    ``"`` characters, drop a leading ``MINA:`` speaker tag, then keep only
	    the first sentence.

	    Args:
	        text: Raw ``content`` string returned by llama-server.

	    Returns:
	        str: The cleaned first sentence; the whole cleaned text when no
	        sentence terminator is found; ``""`` for empty or non-string input.
	    """
	    if not isinstance(text, str) or not text:
	        return ""
	    for marker in _REPLY_STOP_MARKERS:
	        text = text.partition(marker)[0]

	    # Trim whitespace first; otherwise a trailing newline shields the quote or
	    # asterisk characters from being stripped.
	    text = text.strip().strip('*"').strip()
	    if text.startswith("MINA:"):
	        text = text[5:].strip().strip('*"').strip()

	    match = _FIRST_SENTENCE_RE.match(text)
	    if match:
	        text = match.group(1).strip()
	    return text


	# ── Whisper transcription ─────────────────────────────────────────────────────

	def _clean_whisper_stdout(stdout):
	    """Strip bracketed annotations and tool-noise lines from whisper-cli output."""
	    text = re.sub(r'\[.*?\]', '', stdout.strip()).strip()
	    kept = [line for line in text.splitlines()
	            if 'debugfs' not in line
	            and 'whisper-cli' not in line
	            and 'MEMPROF' not in line]
	    # Collapse to a single line: the transcript is embedded mid-sentence in the
	    # prompt and keyword-matched, so line breaks only get in the way.
	    return " ".join(" ".join(kept).split())


	def transcribe_with_whisper(audio_b64):
	    """Transcribe a base64-encoded WAV recording with whisper-cli.

	    The audio is written to a temporary ``.wav`` file, ``WHISPER_CLI`` is run
	    on it with ``WHISPER_MODEL`` (English, no timestamps, 4 threads, 30 s
	    limit), and the temporary file is removed afterwards.

	    Args:
	        audio_b64: Base64 text of the WAV bytes.

	    Returns:
	        str: The cleaned transcript on success. This function never raises;
	        failures come back as one of these apology sentences, returned in
	        place of a transcript:

	        * ``"Sorry, I could not hear that clearly."``: empty audio or empty
	          transcript.
	        * ``"Sorry, took too long to hear that."``: whisper-cli timed out.
	        * ``"Sorry, something went wrong with hearing."``: bad base64 or any
	          other error.
	    """
	    import base64
	    import subprocess
	    import tempfile

	    try:
	        wav_bytes = base64.b64decode(audio_b64)
	    except (TypeError, ValueError) as e:
	        # binascii.Error (bad padding) is a ValueError subclass.
	        print(f"WHISPER ERROR: invalid base64 audio: {e}", flush=True)
	        return "Sorry, something went wrong with hearing."
	    if not wav_bytes:
	        return "Sorry, I could not hear that clearly."

	    tmp_path = None
	    try:
	        # Written inside the try so a failed write is still cleaned up below.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            tmp_path = tmp.name
	            tmp.write(wav_bytes)
	        result = subprocess.run(
	            [WHISPER_CLI, "-m", WHISPER_MODEL, "-f", tmp_path,
	             "-l", "en", "--no-timestamps", "-t", "4"],
	            capture_output=True, text=True, timeout=30
	        )
	        if result.returncode != 0:
	            # Surface the tool's own error instead of silently returning an
	            # empty transcript.
	            tail = (result.stderr or "").strip()[-300:]
	            print(f"WHISPER EXIT {result.returncode}: {tail}", flush=True)
	        transcript = _clean_whisper_stdout(result.stdout or "")
	        print(f"WHISPER TRANSCRIPT: {transcript}", flush=True)
	        return transcript if transcript else "Sorry, I could not hear that clearly."
	    except subprocess.TimeoutExpired:
	        return "Sorry, took too long to hear that."
	    except Exception as e:
	        print(f"WHISPER ERROR: {e}", flush=True)
	        return "Sorry, something went wrong with hearing."
	    finally:
	        if tmp_path is not None:
	            try:
	                os.unlink(tmp_path)
	            except OSError:
	                pass


	# ── App ────────────────────────────────────────────────────────────────────────
	# Flask application object; the @app.route decorators below register the
	# /health and /completion handlers on it, and the entry point runs it.
	app = Flask(__name__)


	# ─────────────────────────────────────────────────────────────────────────────
	# GET /health
	# ─────────────────────────────────────────────────────────────────────────────

	@app.route("/health", methods=["GET"])
	def health():
	    """Liveness probe polled by the Android APK at startup.

	    Always reports ``"status": "ok"`` for the bridge itself; ``"llama"`` says
	    whether a GET to llama-server's ``/health`` succeeded within 5 seconds.
	    """
	    try:
	        _llama_get("/health", timeout=5)
	    except Exception:
	        # Any failure just means llama-server is not reachable or ready yet.
	        llama_up = False
	    else:
	        llama_up = True
	    return jsonify({"status": "ok", "llama": llama_up, "bridge": "v2"})


	# ─────────────────────────────────────────────────────────────────────────────
	# POST /completion
	# ─────────────────────────────────────────────────────────────────────────────

	@app.route("/completion", methods=["POST"])
	def completion():
	    """Turn one user utterance (text or audio) into a MINA reply.

	    Accepted request bodies (JSON):

	    * Pre-transcribed text in ``"transcript"``, a string ``"prompt"``, or
	      ``"text"`` (checked in that order; the first non-blank string wins).
	    * Audio from the Android APK::

	        {
	          "prompt": [
	            {
	              "prompt_string": "Transcribe the audio. Reply ONLY ...",
	              "multimodal_data": ["<base64-WAV>"]
	            }
	          ]
	        }

	    Response (always HTTP 200, v2 schema with VAD scores)::

	        {
	          "reply": "MINA reply text",
	          "content": "MINA reply text",
	          "transcript": "What the user said",
	          "emotion": "anxious",
	          "valence": 0.25,
	          "arousal": 0.52,
	          "dominance": 0.35,
	          "agent": "KRONOS",
	          "risk": "none",
	          "elapsed": 4.2
	        }

	    Errors are reported in the same shape with an apology as the reply and an
	    empty transcript, so the Android client does not trigger a reconnect.
	    """
	    t0 = time.time()

	    def _payload(reply, transcript="", emotion="neutral", agent="MINA", risk="none"):
	        """Build the JSON response shared by success and error paths."""
	        vad = EMOTION_VAD.get(emotion, EMOTION_VAD["neutral"])
	        return jsonify({
	            "reply": reply,
	            "content": reply,
	            "transcript": transcript,
	            "emotion": emotion,
	            "valence": vad["valence"],
	            "arousal": vad["arousal"],
	            "dominance": vad["dominance"],
	            "agent": agent,
	            "risk": risk,
	            "elapsed": round(time.time() - t0, 2),
	        })

	    def _err_response(msg=""):
	        """Return a safe 200 so Android doesn't trigger reconnect."""
	        return _payload(msg or "Sorry lah, something went wrong. Try again?")

	    # Sentences transcribe_with_whisper() returns in place of a transcript when
	    # it fails. They must not be routed or sent to the model as user speech.
	    whisper_failures = (
	        "Sorry, I could not hear that clearly.",
	        "Sorry, took too long to hear that.",
	        "Sorry, something went wrong with hearing.",
	    )

	    # Fixed replies used for the safety agents when the model is unavailable,
	    # so the hotline footer from append_resources() is still delivered.
	    safety_fallback_replies = {
	        "VITA": "I'm here with you, and you are not alone.",
	        "SENTINEL": "Please be careful, this sounds like it could be a scam.",
	    }

	    try:
	        body = request.get_json(force=True, silent=True)
	        if not isinstance(body, dict):
	            body = {}

	        # Pre-transcribed input: first non-blank string among the three fields.
	        prompt_field = body.get("prompt")
	        transcript_in = next(
	            (candidate.strip()
	             for candidate in (body.get("transcript"), prompt_field, body.get("text"))
	             if isinstance(candidate, str) and candidate.strip()),
	            "",
	        )
	        print(f"TRANSCRIPT: {transcript_in}", flush=True)

	        if transcript_in:
	            # ── Fast path: Android sent pre-transcribed text ──────────────────
	            transcript = transcript_in
	        else:
	            # ── Audio path: WAV transcription via whisper-cli ─────────────────
	            prompts = prompt_field if isinstance(prompt_field, list) else []
	            if not prompts or not isinstance(prompts[0], dict):
	                return _err_response("No input received.")
	            media = prompts[0].get("multimodal_data")
	            audio_b64 = media[0] if isinstance(media, list) and media else ""
	            if not isinstance(audio_b64, str) or not audio_b64:
	                return _err_response("No audio received.")
	            transcript = transcribe_with_whisper(audio_b64)
	            if transcript in whisper_failures:
	                # Show the apology as the reply rather than replying to it.
	                return _err_response(transcript)

	        # No emotion or risk classifier is wired in; both stay at their defaults.
	        emotion = _normalise_emotion("neutral")
	        risk = "none"

	        agent = route_agent(transcript)
	        print(f"DEBUG agent: {agent}", flush=True)

	        # ── Unknown capability detection ──────────────────────────────────────
	        # A gap is logged only for capabilities explicitly marked falsy in the
	        # knowledge file; unlisted capabilities are ignored.
	        unknown_capability_keywords = (
	            "call", "phone", "ring", "dial",
	            "whatsapp", "message", "text",
	            "email", "send", "order", "book",
	            "navigate", "map", "direction",
	            "play music", "search web",
	        )
	        caps = KNOWLEDGE.get("capabilities", {}) if isinstance(KNOWLEDGE, dict) else {}
	        if not isinstance(caps, dict):
	            caps = {}
	        t_lower = transcript.lower()
	        for keyword in unknown_capability_keywords:
	            # Match at the start of a word only ("text" must not hit "context").
	            if not re.search(r"\b" + re.escape(keyword), t_lower):
	                continue
	            cap_key = keyword.replace(" ", "_")
	            if caps.get(cap_key, True):
	                continue
	            try:
	                log_gap(cap_key, transcript,
	                        f"User requested {keyword} capability")
	            except Exception:
	                # Gap logging is bookkeeping; it must not cost the user a reply.
	                traceback.print_exc()

	        # ── Generate MINA's reply (single llama call) ─────────────────────────
	        reply_body = {
	            "prompt": build_prompt(transcript, agent, emotion),
	            "n_predict": 40,
	            "temperature": 0.7,
	            "stream": False,
	            "cache_prompt": False,
	        }
	        try:
	            result = _llama_post("/completion", reply_body, timeout=60)
	            # clean_reply() already reduces the output to one sentence.
	            reply_text = clean_reply(result.get("content", ""))
	        except Exception:
	            traceback.print_exc()
	            if agent not in safety_fallback_replies:
	                return _err_response()
	            # Crisis / scam: still answer and still attach the hotlines.
	            reply_text = safety_fallback_replies[agent]
	        if not reply_text:
	            reply_text = "Aiya, I didn't quite catch that lah. Can you say again?"

	        reply_text = append_resources(reply_text, agent, transcript)

	        # VAD scores come from the calibrated lookup inside _payload().
	        return _payload(reply_text, transcript, emotion, agent, risk)

	    except Exception:
	        traceback.print_exc()
	        return _err_response()


	# ─────────────────────────────────────────────────────────────────────────────
	# Entry point
	# ─────────────────────────────────────────────────────────────────────────────

	if __name__ == "__main__":
	    # Bind address is configurable like the other settings. The default stays
	    # 0.0.0.0 (all interfaces) for backward compatibility.
	    # NOTE(review): the endpoints have no authentication; if the APK runs on
	    # the same device, BRIDGE_HOST=127.0.0.1 keeps the bridge off the network.
	    host = os.getenv("BRIDGE_HOST", "0.0.0.0")
	    print("=" * 56)
	    print(" MINA Bridge v3.0 — IMDA NMLP ATxSG 2026")
	    print(f" Port : {PORT}")
	    print(f" llama.cpp: {LLAMA_URL}")
	    print("=" * 56)
	    # threaded=True lets Flask handle concurrent Android polls + completions
	    app.run(host=host, port=PORT, debug=False, threaded=True)