Spaces:

rudyByte
/

maya-voice-agent

Paused

rudyByte

fix: inject final critical overrides at absolute bottom of system prompt to enforce behavior rules over database configs

8a5e277 14 days ago

raw

history blame contribute delete

19.3 kB

	import os
	import re
	import asyncio
	import pytz
	from datetime import datetime
	from groq import AsyncGroq
	import google.generativeai as genai

	# Timezone used when telling the model the current date and time
	# (see the time context built in LLMManager.respond_streaming).
	IST = pytz.timezone("Asia/Kolkata")

	# Persona and conversation rules that LLMManager._build_system_prompt places at
	# the very top of every system prompt, ahead of the tenant's own instructions.
	# This text is sent to the model verbatim, so any edit here changes runtime
	# behaviour.
	MAYA_BEHAVIOR_PROMPT = """
	You are Maya, a warm, extremely polite, highly intelligent human-like female voice assistant for CallSaathi.
	You are having a natural phone conversation with a customer for an Indian business.

	CRITICAL VOICE & CONVERSATION RULES:
	1. SOUND HUMAN: Speak like a helpful, sweet, professional Indian female receptionist (Maya). Avoid generic AI greetings or chatbot-like responses.
	2. HANDLING GREETINGS & SIMPLE 'HELLO':
	- If the user simply says 'Hello', 'Hi', 'Namaste', 'Haan', or just repeats your greeting back, DO NOT jump to appointment booking or ask clinical questions (like 'do you have pain?').
	- Instead, reply with a warm, natural open invitation: 'Haan ji, batayein, main aapki kya madad kar sakti hoon?' (Yes please tell me, how can I help you?) or 'Ji batayein, kya sahayata karu aapki?'.
	- If you already asked how to help and they say 'hello' again, vary your response: 'Haan ji, main sun rahi hoon, batayein.' (Yes, I am listening, please tell me).
	3. NATURAL PACE & FLOW:
	- Keep responses extremely short: 1-2 sentences maximum (max 15-20 words). Phone calls require fast, punchy turn-taking.
	- Do NOT rush the user to book. Wait until they express interest in an appointment, checkup, or a dental problem before launching the booking process.
	- Ask only ONE question at a time to prevent overwhelming the caller.
	4. TONE & EMPATHY:
	- Be extremely polite and sweet.
	- If they describe pain or emergency, respond with instant empathy: 'Oh, bahut dukh hua sunkar. Main abhi aapka appointment check karti hoon.' (Oh, so sorry to hear that. Let me check the appointments right away.)
	5. ROBUSTNESS:
	- Never say 'As an AI', 'language model', or 'I don't have access'.
	- Never use markdown, asterisks (*), lists, or bullet points in your text.
	- If the transcript is garbled or makes no sense, say: 'Maaf kijiye, mujhe aapki awaaz saaf nahi aayi. Kya aap phir se bolenge?'
	"""

	class LLMManager:
	def __init__(self):
	    """Create the Gemini and Groq clients and the tool schemas sent to Groq.

	    Reads ``GOOGLE_AI_API_KEY`` and ``GROQ_API_KEY`` from the environment.
	    A missing key, or a Groq client that fails to construct, leaves the
	    corresponding attribute (``self.gemini`` / ``self.groq``) as ``None``
	    rather than raising, so the process can still start.
	    """
	    # 1. Initialize Gemini
	    google_key = os.getenv("GOOGLE_AI_API_KEY")
	    if google_key:
	        genai.configure(api_key=google_key)
	        self.gemini = genai.GenerativeModel('gemini-1.5-flash')
	        print("LLM Manager: Gemini 1.5 Flash initialized (Primary)")
	    else:
	        print("WARNING: GOOGLE_AI_API_KEY is not set. LLM Gemini will fail.")
	        self.gemini = None

	    # 2. Initialize Groq (wrapped so a client problem cannot crash startup).
	    # Without a key we leave the client as None instead of building one from
	    # a placeholder key: a placeholder client can only fail at request time,
	    # which would add a wasted network round-trip to every turn before the
	    # Gemini fallback runs.
	    groq_key = os.getenv("GROQ_API_KEY")
	    self.groq = None
	    if not groq_key:
	        print("WARNING: GROQ_API_KEY is not set. Falling back to Gemini only.")
	    else:
	        try:
	            self.groq = AsyncGroq(api_key=groq_key)
	            print("LLM Manager: Groq initialized (Secondary/Fallback)")
	        except Exception as e:
	            print(f"WARNING: Groq failed to initialize ({e}). Falling back to Gemini only.")

	    # Phase E3: Tool Definitions (function-calling schemas passed to Groq
	    # by respond_streaming via ``tools=self.tools``).
	    date_param = {"type": "string", "description": "YYYY-MM-DD"}
	    time_param = {"type": "string", "description": "HH:MM (24hr)"}
	    self.tools = [
	        {
	            "type": "function",
	            "function": {
	                "name": "check_availability",
	                "description": "Check if a specific date and time is available for an appointment.",
	                "parameters": {
	                    "type": "object",
	                    "properties": {
	                        "date": dict(date_param),
	                        "time": dict(time_param),
	                    },
	                    "required": ["date", "time"],
	                },
	            },
	        },
	        {
	            "type": "function",
	            "function": {
	                "name": "book_appointment",
	                "description": "Create a new appointment on the calendar.",
	                "parameters": {
	                    "type": "object",
	                    "properties": {
	                        "name": {"type": "string", "description": "Caller's name"},
	                        "phone": {"type": "string", "description": "Caller's phone number"},
	                        "date": dict(date_param),
	                        "time": dict(time_param),
	                        "service": {"type": "string", "description": "Type of service (e.g. Checkup, Scaling)"},
	                    },
	                    # "service" is intentionally optional.
	                    "required": ["name", "phone", "date", "time"],
	                },
	            },
	        },
	    ]
	    print("LLM Manager initialized with Groq tools + Gemini fallback.")

	async def respond(self, messages: list, tenant_config: dict, current_language: str, extra_instructions: str = "") -> str:
	    """Non-streaming response — used for fallback only.

	    Builds the system prompt, asks Groq with an 8 second timeout and, if
	    Groq is disabled, fails, times out or returns no text, asks Gemini
	    instead.

	    Args:
	        messages: Chat history as role/content dicts.
	        tenant_config: Tenant configuration passed to ``_build_system_prompt``.
	        current_language: Language name or code of the caller.
	        extra_instructions: Extra text appended to the system prompt.

	    Returns:
	        The reply text.

	    Raises:
	        Exception: Whatever ``_gemini_respond`` raises when the fallback
	            fails as well.
	    """
	    system_prompt = self._build_system_prompt(tenant_config, current_language, extra_instructions)
	    try:
	        if not self.groq:
	            # Same sentinel respond_streaming uses when the client is absent.
	            raise ValueError("Groq disabled")
	        reply = await asyncio.wait_for(
	            self._groq_respond(system_prompt, messages),
	            timeout=8.0,
	        )
	        if not reply:
	            # message.content can be None/empty; never hand that to the caller.
	            raise ValueError("Groq returned an empty reply")
	        return reply
	    except Exception as e:
	        # repr() because a timeout error stringifies to an empty message.
	        print(f"Groq failed ({e!r}), falling back to Gemini")
	        return await self._gemini_respond(system_prompt, messages)

	async def respond_streaming(self, messages: list, tenant_config: dict, current_language: str, extra_instructions: str = ""):
	    """Async generator yielding the reply one complete sentence at a time.

	    Tokens streamed from Groq are buffered and split on sentence-ending
	    punctuation followed by whitespace, so a consumer (e.g. TTS) can start
	    on the first sentence while the rest is still being generated.

	    Yields:
	        ``str``: one sentence of reply text, or
	        ``dict``: ``{"tool_calls": [...]}`` once, after the stream ends, when
	        the model requested tool calls. Each entry exposes ``id``, ``type``,
	        ``function.name`` and ``function.arguments`` as attributes. Text
	        still sitting in the buffer at that point is not yielded.

	    Failure handling:
	        * Failure before any sentence was yielded: one full Gemini reply is
	          yielded; if that fails too, a static localized apology is yielded.
	        * Failure after at least one sentence was yielded: the generator
	          stops. Yielding a fresh complete answer at that point would make
	          the caller hear a second, overlapping reply.
	    """
	    from types import SimpleNamespace

	    system_prompt = self._build_system_prompt(tenant_config, current_language, extra_instructions)
	    # Safer sentence boundary: split on .!? (and the danda) but keep them with the sentence
	    sentence_end_re = re.compile(r'(?<=[.!?।])\s+')
	    token_buffer = ""
	    emitted_any = False  # True once any sentence has been handed to the consumer

	    # Filter out empty/null-content messages from history to prevent Groq API
	    # errors with malformed message sequences. Tool-call messages (content=None
	    # allowed) and tool-result messages are kept.
	    clean_messages = []
	    for m in messages:
	        content = m.get("content")
	        if content is None and "tool_calls" not in m and m.get("role") != "tool":
	            continue
	        if isinstance(content, str) and not content.strip():
	            continue
	        clean_messages.append(m)

	    try:
	        # Phase E3: Include current date/time in system prompt for relative time parsing
	        now_ist = datetime.now(IST)
	        today = now_ist.strftime("%A, %Y-%m-%d")
	        current_time = now_ist.strftime("%H:%M")
	        time_context = f"\nToday is {today}, and the current time is {current_time}. Use this for relative dates like 'tomorrow', 'next Tuesday', or 'later today'."

	        if not self.groq:
	            # Groq client unavailable: jump straight to the Gemini fallback below
	            raise ValueError("Groq disabled")

	        stream = await asyncio.wait_for(
	            self.groq.chat.completions.create(
	                model="llama-3.1-8b-instant",
	                messages=[{"role": "system", "content": system_prompt + time_context}] + clean_messages,
	                max_tokens=120,  # 1-2 short sentences is enough for a phone call
	                temperature=0.3,
	                stop=["\n\n", "User:", "Human:", "System:"],  # Prevent runaway generation
	                tools=self.tools,
	                tool_choice="auto",
	                stream=True,
	            ),
	            timeout=15.0,
	        )

	        # Tool calls arrive split across several chunks; assemble each one
	        # (keyed by its index) into a complete object before yielding.
	        accumulated_tool_calls: dict = {}  # index -> {id, type, function{name, arguments}}

	        async for chunk in stream:
	            if not chunk.choices:
	                continue
	            delta = chunk.choices[0].delta

	            # --- TOOL CALL ACCUMULATION ---
	            if delta.tool_calls:
	                for tc_delta in delta.tool_calls:
	                    entry = accumulated_tool_calls.setdefault(
	                        tc_delta.index,
	                        {"id": "", "type": "function", "function": {"name": "", "arguments": ""}},
	                    )
	                    if tc_delta.id:
	                        entry["id"] = tc_delta.id
	                    if tc_delta.function:
	                        if tc_delta.function.name:
	                            entry["function"]["name"] += tc_delta.function.name
	                        if tc_delta.function.arguments:
	                            entry["function"]["arguments"] += tc_delta.function.arguments
	                continue  # Don't process text tokens when accumulating tool calls

	            # --- TEXT STREAMING ---
	            token = delta.content or ""
	            if not token:
	                continue

	            token_buffer += token

	            # Split on sentence boundaries, keep the last incomplete part
	            parts = sentence_end_re.split(token_buffer)
	            if len(parts) > 1:
	                for sentence in parts[:-1]:
	                    sentence = sentence.strip()
	                    if sentence:
	                        emitted_any = True
	                        yield sentence
	                token_buffer = parts[-1]

	        # After stream ends: yield completed tool calls if any were accumulated
	        if accumulated_tool_calls:
	            # Attribute-style objects (id / type / function.name / function.arguments)
	            tool_calls = [
	                SimpleNamespace(
	                    id=d["id"],
	                    type=d["type"],
	                    function=SimpleNamespace(
	                        name=d["function"]["name"],
	                        arguments=d["function"]["arguments"],
	                    ),
	                )
	                for d in accumulated_tool_calls.values()
	            ]
	            yield {"tool_calls": tool_calls}
	            return

	        # Flush any remaining text after stream ends
	        remainder = token_buffer.strip()
	        if remainder:
	            emitted_any = True
	            yield remainder

	    except Exception as e:
	        if emitted_any:
	            # Part of the reply is already out; a full fallback answer now
	            # would be heard as a duplicate response.
	            print(f"LLM: Streaming error after partial reply: {e!r}. Skipping fallback.")
	            return
	        print(f"LLM: Streaming error: {e!r}. Falling back to Gemini.")
	        try:
	            if not self.gemini:
	                raise ValueError("Gemini not configured")
	            full = await self._gemini_respond(system_prompt, clean_messages)
	            if not full:
	                raise ValueError("Gemini returned an empty reply")
	            yield full
	        except Exception as e2:
	            print(f"LLM: Fallback failed: {e2}. Using static fallback.")
	            # LAST RESORT: Localized static fallback (no API needed)
	            fallbacks = {
	                'hi': "Maaf kijiye, mujhe thodi dikkat ho rahi hai. Kya aap phir se bol sakte hain?",
	                'en': "I'm sorry, I'm having a bit of trouble connecting. Could you please repeat that?",
	                'gu': "Maaf karsho, mane thodi taklif thai rahi che. Shu tame fari bolsho?"
	            }
	            yield fallbacks.get((current_language or "en")[:2].lower(), fallbacks['en'])

	async def _groq_respond(self, system_prompt: str, messages: list) -> str:
	    """Run one non-streaming Groq chat completion and return its text.

	    The system prompt is sent as the first message, followed by ``messages``
	    unchanged. Returns the ``content`` of the first choice exactly as the
	    API provided it.
	    """
	    conversation = [{"role": "system", "content": system_prompt}, *messages]
	    completion = await self.groq.chat.completions.create(
	        model="llama-3.1-8b-instant",
	        messages=conversation,
	        max_tokens=120,
	        temperature=0.3,
	        stop=["\n\n", "User:", "Human:"],
	    )
	    first_choice = completion.choices[0]
	    return first_choice.message.content

	async def _safe_gemini_generate(self, prompt: str, generation_config: dict = None) -> str:
	    """Generate text with Gemini, trying several model names in order.

	    Each candidate model is asked in turn; the stripped text of the first
	    one that answers is returned. A failure of any kind is logged and the
	    next candidate is tried.

	    Args:
	        prompt: Full prompt text.
	        generation_config: Optional generation settings; only forwarded when
	            truthy.

	    Raises:
	        Exception: The error raised by the last candidate when none succeed.
	    """
	    candidates = (
	        "gemini-2.5-flash",
	        "gemini-1.5-flash",
	        "gemini-1.5-flash-latest",
	        "gemini-2.5-flash-latest",
	        "gemini-pro",
	    )
	    # Only pass generation_config through when the caller supplied one.
	    call_kwargs = {"generation_config": generation_config} if generation_config else {}

	    failure = None
	    for candidate in candidates:
	        try:
	            model = genai.GenerativeModel(candidate)
	            reply = await model.generate_content_async(prompt, **call_kwargs)
	            return reply.text.strip()
	        except Exception as exc:
	            print(f"[Gemini-Safe] Model {candidate} failed: {exc}")
	            failure = exc
	    raise failure or RuntimeError("All Gemini models failed to generate content")

	async def _gemini_respond(self, system_prompt: str, messages: list) -> str:
	    """Flatten the chat into one text prompt and ask Gemini for the next reply.

	    The prompt is ``System: ...`` followed by one ``<Speaker>: <text>`` line
	    per message and a trailing ``Assistant:`` cue. Messages without text
	    (for example assistant tool-call entries whose content is ``None``) are
	    left out so the literal word "None" never reaches the model, and tool
	    results get their own ``Tool`` label instead of being presented as
	    something the assistant said.

	    Raises:
	        Exception: Re-raised from ``_safe_gemini_generate`` after logging.
	    """
	    speaker_labels = {"user": "User", "tool": "Tool"}

	    # Construct proper history for Gemini
	    history = [f"System: {system_prompt}"]
	    for m in messages:
	        content = m.get("content")
	        if content is None or (isinstance(content, str) and not content.strip()):
	            continue
	        speaker = speaker_labels.get(m.get("role"), "Assistant")
	        history.append(f"{speaker}: {content}")

	    prompt = "\n".join(history) + "\nAssistant:"

	    try:
	        return await self._safe_gemini_generate(
	            prompt,
	            generation_config={"temperature": 0.3, "max_output_tokens": 250}
	        )
	    except Exception as e:
	        print(f"[Gemini-Safe] Direct generate content failed: {e}")
	        raise

	def _build_system_prompt(self, config: dict, current_language: str, extra_instructions: str = "") -> str:
	    """Assemble the full system prompt for one turn.

	    Order of the pieces: the shared Maya behaviour prompt, the tenant's
	    language-specific prompt, the reply-language rule, ``extra_instructions``,
	    optional name-greeting and caller-memory rules, the language addon, and
	    finally the critical overrides (kept last on purpose so they follow
	    every other instruction).

	    Args:
	        config: Tenant configuration; ``config['agent_configs']`` may hold
	            ``system_prompt_<code>`` entries. Missing or null values fall
	            back to a generic prompt.
	        current_language: Language name or code; only its first two letters
	            (lower-cased) select the prompt, display name and addon, so
	            ``"hindi"``, ``"hi"`` and ``"hi-IN"`` behave the same. A falsy
	            value is treated as English.
	        extra_instructions: Free text appended after the language rule.

	    Returns:
	        The complete system prompt string.
	    """
	    default_prompt = 'You are a helpful AI assistant. Keep responses very short and conversational.'
	    lang_map = {'en': 'English', 'hi': 'Hindi', 'gu': 'Gujarati'}

	    language = current_language or "en"
	    lang_code = language[:2].lower()
	    # `or` guards cover keys that are present but null/empty.
	    agent_configs = (config or {}).get('agent_configs') or {}
	    base_prompt = agent_configs.get(f'system_prompt_{lang_code}') or default_prompt
	    full_lang = lang_map.get(lang_code, language)
	    extra_instructions = extra_instructions or ""

	    # Phase E4: Name Greeting Instruction
	    name_instruction = ""
	    if "CALLER NAME:" in extra_instructions:
	        name_instruction = "\nIMPORTANT: You know the caller's name. GREET THEM BY NAME if it fits naturally in this turn."

	    # CRITICAL Safeguard: Avoid jumping straight into booking from previous call memory
	    memory_safeguard = ""
	    if "CALLER MEMORY" in extra_instructions:
	        memory_safeguard = (
	            "\nCRITICAL RULE FOR CALL START / MEMORY PERSISTENCE:\n"
	            "- This is a BRAND NEW call. Even if the [CALLER MEMORY] shows 'LAST CALL SUMMARY' or an incomplete booking, DO NOT jump straight into booking or continue that topic in your first sentence.\n"
	            "- You MUST start with a generic polite greeting, greet them by name, and ask how you can help them TODAY (e.g. 'How can I assist you today?').\n"
	            "- Only bring up or continue the previous appointment booking if the caller explicitly mentions it first."
	        )

	    # Language addon, selected by the same two-letter code as everything
	    # else in this method. Imported lazily so other languages do not
	    # depend on the addon module.
	    addon = ""
	    if lang_code == "gu":
	        from src.gujarati_processor import build_gujarati_system_prompt_addon
	        addon = "\n\n" + build_gujarati_system_prompt_addon()
	    elif lang_code == "hi":
	        from src.gujarati_processor import build_hindi_system_prompt_addon
	        addon = "\n\n" + build_hindi_system_prompt_addon()

	    critical_overrides = (
	        "\n\nFINAL CRITICAL OVERRIDES (MUST OBEY OVER ANY OTHER RULES):\n"
	        "- Even if the business-specific instructions focus on booking appointments, DO NOT bring up booking slots or ask appointment-booking questions in response to simple greetings (like 'Hello', 'Hi', 'Haan', 'Ji', 'Namaste').\n"
	        "- You must wait until the user explicitly mentions booking an appointment, checking slots, doctor consultation, or experiencing a dental issue before you initiate any appointment booking script."
	    )

	    # Keep extra_instructions from being glued onto the preceding sentence
	    # when the caller did not start it with whitespace.
	    separator = "\n" if extra_instructions and not extra_instructions[0].isspace() else ""

	    return (
	        f"{MAYA_BEHAVIOR_PROMPT}\n"
	        f"Business-specific instructions:\n{base_prompt}\n"
	        f"IMPORTANT RULE: The user is speaking {full_lang}. "
	        f"You MUST reply entirely in {full_lang}. Do not mix languages. "
	        f"Keep responses concise — 1-2 sentences max for a phone call."
	        f"{separator}{extra_instructions}"
	        f"{name_instruction}"
	        f"{memory_safeguard}"
	        f"{addon}"
	        f"{critical_overrides}"
	    )

	async def generate_summary(self, conversation_history: list, language: str) -> str:
	    """Produce a structured Markdown call summary for the clinic.

	    Tries Groq first, then Gemini (via ``_safe_gemini_generate``); if both
	    fail, returns a minimal ``"Call ended. Turns: N"`` string. This method
	    never raises for provider errors.

	    Args:
	        conversation_history: Messages as role/content dicts; entries with
	            no content are rendered as ``(Tool Call)``.
	        language: Language the summary should be written in.

	    Returns:
	        The summary text, or the minimal fallback string.
	    """
	    prompt = (
	        f"As Maya, the expert AI Medical Receptionist, provide a PROFESSIONAL, HIGHLY ORGANIZED summary of this conversation in {language}.\n"
	        "The summary must be structured for a DOCTOR or CLINIC OWNER to read quickly.\n\n"
	        "Format exactly like this (use Markdown):\n"
	        "### 📋 Call Summary\n"
	        "- Patient Name: [Name / Unknown]\n"
	        "- Contact Number: [Phone / N/A]\n"
	        "- Primary Purpose: [Brief reason: e.g. Tooth Pain, Routine Scaling, Inquiry]\n"
	        "- Booking Status: [✅ Booked: YYYY-MM-DD at HH:MM / ❌ Not Booked]\n"
	        "- Clinical Notes: [Important details: e.g. Pain in left molar, sensitive to cold, first-time patient]\n"
	        "- Maya's Action: [e.g. Confirmed appointment, Sent WhatsApp alert, Advised patient to wait for call]\n"
	        "- Patient Mood: [e.g. Calm / Anxious / Urgent]\n\n"
	        "Conversation Transcript:\n"
	    )
	    prompt += "\n".join(
	        f"{m.get('role', 'unknown')}: {m.get('content') or '(Tool Call)'}"
	        for m in conversation_history
	    )

	    try:
	        if not self.groq:
	            raise ValueError("Groq disabled")
	        # Use a 70b model for professional summarization quality.
	        # llama-3.1-70b-versatile was retired by Groq; llama-3.3-70b-versatile
	        # is the replacement named on Groq's deprecations page.
	        response = await self.groq.chat.completions.create(
	            model="llama-3.3-70b-versatile",
	            messages=[{"role": "user", "content": prompt}],
	            max_tokens=400,
	            temperature=0.3
	        )
	        summary = (response.choices[0].message.content or "").strip()
	        if not summary:
	            raise ValueError("Groq returned an empty summary")
	        return summary
	    except Exception as groq_err:
	        # Log instead of swallowing, so a persistent Groq problem is visible.
	        print(f"[Summary] Groq summary failed: {groq_err!r}. Falling back to Gemini.")

	    # Fallback to Gemini with safe helper to handle any API/model mismatch errors
	    try:
	        return await self._safe_gemini_generate(prompt)
	    except Exception as gemini_err:
	        print(f"[Gemini-Safe] Summary fallback also failed: {gemini_err}")

	    return f"Call ended. Turns: {len(conversation_history)}"