Spaces:

build-small-hackathon
/

Vital

Running

App Files Files Community

Vital / scripts /eval_llm.py

eddyejembi

Upload folder using huggingface_hub

4e88df3 verified 16 days ago

Raw

History Blame Contribute Delete

7.66 kB

	"""
	Live LLM evaluation against your Modal Nemotron endpoint.

	Resets the local DB and seeds it with sample data so tool calls return real results
	(set VITAL_EVAL_SKIP_RESET to 1, true or yes to keep the existing DB instead).

	Prerequisites:
	modal deploy infra/vllm_serve.py
	Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env

	Usage:
	uv run python -m scripts.eval_llm
	"""

	import json
	import os
	import sys
	from datetime import date, datetime, timezone
	from unittest.mock import patch

	from db import queries
	from db.database import reset_database
	from llm.client import get_llm_client
	from llm.config import get_llm_config
	from llm.tool_runner import execute_tool
	from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput


	def seed_eval_database() -> None:
	    """Store a sample "Amara" profile plus today's medication and daily logs.

	    Writes happen in this order: the profile, one medication log entry that
	    is already taken, one that is still pending, then three daily-log fields.
	    Everything is dated with the local ``date.today()``.
	    """
	    sample_medications = [
	        MedicationRecord(name="Folic acid", dose="5mg", time="08:00"),
	        MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"),
	    ]
	    sample_profile = ProfileInput(
	        name="Amara",
	        age=24,
	        city="Port Harcourt",
	        profession="Student",
	        goal="Manage a health condition",
	        conditions=["sickle cell disease"],
	        medications=sample_medications,
	        triggers=["dehydration", "cold temperatures", "stress"],
	        wake_time="07:00",
	        sleep_time="23:00",
	        desk_worker=True,
	        exercise_level="light",
	        dietary_notes="Avoid processed sugar",
	        local_foods="eba, egusi soup, beans, oranges",
	    )
	    queries.save_profile(sample_profile)

	    log_date = date.today()

	    # Morning dose: recorded as taken, stamped with the current UTC time.
	    morning_dose = MedicationLogEntry(
	        date=log_date,
	        medication_name="Folic acid",
	        dose="5mg",
	        scheduled_time="08:00",
	        taken=True,
	        taken_at=datetime.now(timezone.utc),
	    )
	    queries.insert_medication_log(morning_dose)

	    # Evening dose: left pending so the write-tool check has something to update.
	    evening_dose = MedicationLogEntry(
	        date=log_date,
	        medication_name="Vitamin C",
	        dose="500mg",
	        scheduled_time="18:00",
	        taken=False,
	        taken_at=None,
	    )
	    queries.insert_medication_log(evening_dose)

	    daily_values = (
	        ("pain_level", "3"),
	        ("water_cups", "6"),
	        ("energy_level", "7"),
	    )
	    for field_name, field_value in daily_values:
	        queries.upsert_daily_log(log_date, field_name, field_value)


	def print_db_snapshot(label: str) -> None:
	    """Print today's medication log rows and daily-log fields under *label*.

	    Both queries run before anything is printed, so the output reflects a
	    single read of each table for the local ``date.today()``.
	    """
	    snapshot_date = date.today()
	    todays_medications = queries.get_medications_for_date(snapshot_date)
	    todays_logs = queries.get_daily_logs_for_date(snapshot_date)

	    print(f" {label}")

	    print(f" - Medications today ({len(todays_medications)}):")
	    for record in todays_medications:
	        if record.taken:
	            state = "taken"
	        else:
	            state = "pending"
	        print(f" {record.scheduled_time} {record.medication_name} ({record.dose}) — {state}")

	    print(f" - Daily logs today ({len(todays_logs)}):")
	    for log_row in todays_logs:
	        print(f" {log_row.field_id}: {log_row.value}")

	    # Trailing blank line separates the snapshot from whatever is printed next.
	    print()


	def print_tool_trace(tool_name: str, arguments: dict[str, object]):
	    """Run a tool through ``execute_tool`` and echo the call and outcome to stdout.

	    Intended as a ``side_effect`` replacement for ``llm.client.execute_tool``
	    during the eval, so every tool call the LLM makes is visible in the log.

	    Args:
	        tool_name: Name of the tool the LLM asked for.
	        arguments: Arguments for the tool; must be JSON-serialisable because
	            they are dumped into the trace line.

	    Returns:
	        Whatever ``execute_tool`` returned, unchanged, so the caller sees the
	        real tool result.
	    """
	    # ensure_ascii=False keeps non-ASCII argument text readable, matching the
	    # result dump below.
	    call_args = json.dumps(arguments, ensure_ascii=False)
	    print(f" [TOOL CALLED] {tool_name}({call_args})", flush=True)

	    result = execute_tool(tool_name, arguments)
	    print(f" [TOOL SUCCESS] {result.success}", flush=True)

	    if result.success:
	        result_text = json.dumps(result.result, indent=2, ensure_ascii=False)
	        print(" [TOOL RESULT]", flush=True)
	        for line in result_text.splitlines():
	            # Plain "|" gutter: the previous "\|" was an unrecognized escape
	            # sequence (SyntaxWarning on Python 3.12+) and printed a stray
	            # backslash before the pipe.
	            print(f" | {line}", flush=True)

	    # Checked independently of success, so an error attached to a successful
	    # result is still shown.
	    if result.error:
	        print(f" [TOOL ERROR] {result.error}", flush=True)

	    return result


	def medication_is_taken(medication_name: str, scheduled_time: str) -> bool:
	    """Look up today's medication log and report the ``taken`` flag of one dose.

	    The first row whose name and scheduled time both match decides the
	    answer; ``False`` is returned when no row matches.
	    """
	    todays_doses = queries.get_medications_for_date(date.today())
	    matching_flags = (
	        dose.taken
	        for dose in todays_doses
	        if dose.medication_name == medication_name and dose.scheduled_time == scheduled_time
	    )
	    return next(matching_flags, False)


	def _print_section(title: str) -> None:
	    """Print *title* framed above and below by a 60-character '=' rule."""
	    rule = "=" * 60
	    print(rule)
	    print(title)
	    print(rule)


	def _check_simple_chat(client) -> None:
	    """Check 1: send a plain prompt with tools disabled and print the reply."""
	    _print_section("1) Simple chat (no tools)")
	    reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False)
	    print(f" Reply: {reply}\n")


	def _check_json_output(client) -> None:
	    """Check 2: request structured onboarding questions and print the parsed JSON."""
	    _print_section("2) JSON output — onboarding Call 1 (realistic)")
	    onboarding_prompt = """
	You are generating adaptive follow-up questions for a new Vitál user.

	Profile:
	- Name: Amara, age 24, Port Harcourt
	- Condition: sickle cell disease (HbSS)
	- Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00
	- Triggers: dehydration, cold, stress
	- Goal: manage health condition, fewer pain crises
	- Desk worker, light exercise, local foods: eba, egusi, beans

	Return ONLY valid JSON with this exact shape (max 3 questions):
	{
	"follow_up_questions": [
	{
	"question_id": "snake_case_id",
	"question": "question text for the user",
	"type": "number",
	"reason": "why this question matters"
	}
	]
	}

	Use type as one of: number, text. If no follow-ups needed, return an empty array.
	"""
	    payload = client.generate_json(
	        onboarding_prompt,
	        system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.",
	    )
	    print(" Full JSON response:")
	    print(json.dumps(payload, indent=2, ensure_ascii=False))
	    print()


	def _check_read_tool(client) -> None:
	    """Check 3: ask a question that needs a read tool and trace the tool calls."""
	    _print_section("3) Tool call — read medications (seeded DB)")
	    print_db_snapshot("DB before LLM (read test):")

	    # Route the client's tool execution through print_tool_trace so each call
	    # is logged while still returning the real tool result.
	    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
	        tool_reply = client.chat(
	            "What medications do I have scheduled today and which ones have I already taken?",
	            use_tools=True,
	        )

	    print(" LLM final answer:")
	    print(f" {tool_reply}\n")


	def _check_write_tool(client) -> bool:
	    """Check 4: ask the LLM to log a dose, then verify the DB; return True on pass."""
	    _print_section("4) Tool call — write to DB (log_medication_taken)")
	    print_db_snapshot("DB before LLM (write test):")
	    print(" Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.")
	    print(" Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,")
	    print(" mark_medication_taken only updates matching rows (updated: true/false).\n")

	    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
	        write_reply = client.chat(
	            "I just took my evening Vitamin C 500mg dose scheduled for 18:00. "
	            "Please log it as taken in my medication log.",
	            use_tools=True,
	        )

	    print(" LLM final answer:")
	    print(f" {write_reply}\n")
	    print_db_snapshot("DB after LLM (write test):")

	    # Judge the check by what is in the DB, not by what the LLM said it did.
	    vitamin_c_taken = medication_is_taken("Vitamin C", "18:00")
	    if vitamin_c_taken:
	        print(" [PASS] DB verify: Vitamin C 18:00 is now marked taken.")
	    else:
	        print(" [FAIL] DB verify: Vitamin C 18:00 is still pending.")
	        print(" Check [TOOL CALLED] above — validation may have rejected bad args.")
	    return bool(vitamin_c_taken)


	def main() -> None:
	    """Run live checks: chat, JSON, read tools, write tools (with DB verify).

	    Unless VITAL_EVAL_SKIP_RESET is set to 1, true or yes, the database is
	    reset and re-seeded before the checks run.

	    Raises:
	        SystemExit: with status 1 when the base URL still points at localhost
	            and VITAL_LLM_BASE_URL is unset, or when the write check's DB
	            verification fails. The failure exit happens after all output has
	            been printed, so automation can detect a failed eval.
	    """
	    config = get_llm_config()
	    if "localhost" in config.base_url and not os.getenv("VITAL_LLM_BASE_URL"):
	        print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.")
	        sys.exit(1)

	    skip_reset = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower() in ("1", "true", "yes")
	    if skip_reset:
	        print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n")
	    else:
	        reset_database()
	        seed_eval_database()

	    client = get_llm_client()

	    _check_simple_chat(client)
	    _check_json_output(client)
	    _check_read_tool(client)
	    write_check_passed = _check_write_tool(client)

	    print("\nAll eval checks completed.")

	    # A failed DB verification used to leave the exit status at 0; report it.
	    if not write_check_passed:
	        sys.exit(1)


	# Entry point: run the eval only when this module is executed directly
	# (e.g. `python -m scripts.eval_llm`), not when it is imported.
	if __name__ == "__main__":
	main()