Vital / scripts /eval_llm.py
eddyejembi's picture
Upload folder using huggingface_hub
4e88df3 verified
Raw
History Blame Contribute Delete
7.66 kB
"""
Live LLM evaluation against your Modal Nemotron endpoint.
Seeds the local DB with sample data so tool calls return real results.
Prerequisites:
modal deploy infra/vllm_serve.py
Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env
Usage:
uv run python -m scripts.eval_llm
"""
import json
import os
import sys
from datetime import date, datetime, timezone
from unittest.mock import patch
from db import queries
from db.database import reset_database
from llm.client import get_llm_client
from llm.config import get_llm_config
from llm.tool_runner import execute_tool
from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput
def seed_eval_database() -> None:
    """Populate the DB with a sample profile plus today's medication and daily logs.

    Saves one "Amara" profile through ``queries.save_profile``, then records
    two medication-log rows dated today (Folic acid marked taken, Vitamin C
    not taken) and three daily-log values (pain_level, water_cups,
    energy_level) so that tool calls made during the eval have data to return.
    """
    prescribed = [
        MedicationRecord(name="Folic acid", dose="5mg", time="08:00"),
        MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"),
    ]
    sample_profile = ProfileInput(
        name="Amara",
        age=24,
        city="Port Harcourt",
        profession="Student",
        goal="Manage a health condition",
        conditions=["sickle cell disease"],
        medications=prescribed,
        triggers=["dehydration", "cold temperatures", "stress"],
        wake_time="07:00",
        sleep_time="23:00",
        desk_worker=True,
        exercise_level="light",
        dietary_notes="Avoid processed sugar",
        local_foods="eba, egusi soup, beans, oranges",
    )
    queries.save_profile(sample_profile)

    log_day = date.today()

    # (medication name, dose, scheduled slot, already taken?)
    dose_rows = (
        ("Folic acid", "5mg", "08:00", True),
        ("Vitamin C", "500mg", "18:00", False),
    )
    for med_name, dose, slot, already_taken in dose_rows:
        # Only a dose that was taken carries a timestamp; pending doses store None.
        taken_stamp = datetime.now(timezone.utc) if already_taken else None
        queries.insert_medication_log(
            MedicationLogEntry(
                date=log_day,
                medication_name=med_name,
                dose=dose,
                scheduled_time=slot,
                taken=already_taken,
                taken_at=taken_stamp,
            )
        )

    daily_values = (
        ("pain_level", "3"),
        ("water_cups", "6"),
        ("energy_level", "7"),
    )
    for field_name, raw_value in daily_values:
        queries.upsert_daily_log(log_day, field_name, raw_value)
def print_db_snapshot(label: str) -> None:
    """Print today's medication rows and daily-log rows under a heading.

    Args:
        label: Heading line printed before the two listings.
    """
    snapshot_day = date.today()
    med_rows = queries.get_medications_for_date(snapshot_day)
    log_rows = queries.get_daily_logs_for_date(snapshot_day)

    print(f" {label}")

    print(f" - Medications today ({len(med_rows)}):")
    for row in med_rows:
        if row.taken:
            state = "taken"
        else:
            state = "pending"
        print(f" {row.scheduled_time} {row.medication_name} ({row.dose}) — {state}")

    print(f" - Daily logs today ({len(log_rows)}):")
    for item in log_rows:
        print(f" {item.field_id}: {item.value}")

    # Blank line separates the snapshot from whatever is printed next.
    print()
def print_tool_trace(tool_name: str, arguments: dict[str, object]):
    """Execute a tool through ``execute_tool`` and print a trace of the call.

    ``main`` installs this function as the ``side_effect`` of a patched
    ``llm.client.execute_tool``, so whatever it returns (or raises) is what
    the patched call site sees.

    Args:
        tool_name: Name of the tool being invoked.
        arguments: Arguments for the tool; forwarded unchanged to
            ``execute_tool``.

    Returns:
        The object returned by ``execute_tool(tool_name, arguments)``,
        unmodified.
    """
    # Display-only dumps: ``ensure_ascii=False`` keeps non-ASCII text readable
    # (matching the result dump below), and ``default=str`` stops a value that
    # is not JSON-native from raising out of the trace and masking the real
    # tool call.
    shown_args = json.dumps(arguments, ensure_ascii=False, default=str)
    print(f" [TOOL CALLED] {tool_name}({shown_args})", flush=True)

    result = execute_tool(tool_name, arguments)

    print(f" [TOOL SUCCESS] {result.success}", flush=True)
    if result.success:
        result_text = json.dumps(
            result.result, indent=2, ensure_ascii=False, default=str
        )
        print(" [TOOL RESULT]", flush=True)
        for line in result_text.splitlines():
            print(f" | {line}", flush=True)
    if result.error:
        print(f" [TOOL ERROR] {result.error}", flush=True)
    return result
def medication_is_taken(medication_name: str, scheduled_time: str) -> bool:
    """Look up today's log row for a dose and report its ``taken`` flag.

    Args:
        medication_name: Medication name to match exactly.
        scheduled_time: Scheduled time string to match exactly.

    Returns:
        The ``taken`` value of the first row for today matching both fields,
        or ``False`` when no row matches.
    """
    todays_rows = queries.get_medications_for_date(date.today())
    matching_flags = (
        row.taken
        for row in todays_rows
        if row.medication_name == medication_name
        and row.scheduled_time == scheduled_time
    )
    # First match wins; fall back to False when the dose is not logged at all.
    return next(matching_flags, False)
# Prompt for check 2. Kept at module level so the literal's text is not
# affected by function indentation.
_ONBOARDING_PROMPT = """
You are generating adaptive follow-up questions for a new Vitál user.
Profile:
- Name: Amara, age 24, Port Harcourt
- Condition: sickle cell disease (HbSS)
- Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00
- Triggers: dehydration, cold, stress
- Goal: manage health condition, fewer pain crises
- Desk worker, light exercise, local foods: eba, egusi, beans
Return ONLY valid JSON with this exact shape (max 3 questions):
{
"follow_up_questions": [
{
"question_id": "snake_case_id",
"question": "question text for the user",
"type": "number",
"reason": "why this question matters"
}
]
}
Use type as one of: number, text. If no follow-ups needed, return an empty array.
"""


def _print_banner(title: str) -> None:
    """Print a section title framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def _chat_with_tool_trace(client, prompt: str):
    """Send ``prompt`` with tools enabled while tracing every tool call.

    ``llm.client.execute_tool`` is patched for the duration of the chat so
    each call goes through ``print_tool_trace``, which prints the call and
    delegates to the real ``execute_tool``.
    """
    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        return client.chat(prompt, use_tools=True)


def _check_plain_chat(client) -> None:
    """Check 1: a plain chat round-trip with tools disabled."""
    _print_banner("1) Simple chat (no tools)")
    reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False)
    print(f" Reply: {reply}\n")


def _check_json_output(client) -> None:
    """Check 2: request JSON for the onboarding prompt and print the payload."""
    _print_banner("2) JSON output — onboarding Call 1 (realistic)")
    payload = client.generate_json(
        _ONBOARDING_PROMPT,
        system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.",
    )
    print(" Full JSON response:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))
    print()


def _check_tool_read(client) -> None:
    """Check 3: ask a question the model should answer via a read tool."""
    _print_banner("3) Tool call — read medications (seeded DB)")
    print_db_snapshot("DB before LLM (read test):")
    tool_reply = _chat_with_tool_trace(
        client,
        "What medications do I have scheduled today and which ones have I already taken?",
    )
    print(" LLM final answer:")
    print(f" {tool_reply}\n")


def _check_tool_write(client) -> bool:
    """Check 4: ask the model to log a dose, then verify the DB row.

    Returns:
        True when the Vitamin C 18:00 dose is marked taken afterwards.
    """
    _print_banner("4) Tool call — write to DB (log_medication_taken)")
    print_db_snapshot("DB before LLM (write test):")
    print(" Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.")
    print(" Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,")
    print(" mark_medication_taken only updates matching rows (updated: true/false).\n")
    write_reply = _chat_with_tool_trace(
        client,
        "I just took my evening Vitamin C 500mg dose scheduled for 18:00. "
        "Please log it as taken in my medication log.",
    )
    print(" LLM final answer:")
    print(f" {write_reply}\n")
    print_db_snapshot("DB after LLM (write test):")

    vitamin_c_taken = medication_is_taken("Vitamin C", "18:00")
    if vitamin_c_taken:
        print(" [PASS] DB verify: Vitamin C 18:00 is now marked taken.")
    else:
        print(" [FAIL] DB verify: Vitamin C 18:00 is still pending.")
        print(" Check [TOOL CALLED] above — validation may have rejected bad args.")
    return bool(vitamin_c_taken)


def main() -> None:
    """Run live checks: chat, JSON, read tools, write tools (with DB verify).

    Exits with status 1 before running anything when the configured base URL
    contains "localhost" and ``VITAL_LLM_BASE_URL`` is not set, and exits with
    status 1 after all checks have run when the write-tool DB verification
    failed, so callers and CI can detect a failed eval.
    """
    config = get_llm_config()
    if "localhost" in config.base_url and not os.getenv("VITAL_LLM_BASE_URL"):
        print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.")
        sys.exit(1)

    skip_reset = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower() in ("1", "true", "yes")
    if skip_reset:
        print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n")
    else:
        reset_database()
        # NOTE(review): seeding is assumed to pair with the reset, so a skipped
        # reset also skips seeding — confirm against the original layout.
        seed_eval_database()

    client = get_llm_client()

    _check_plain_chat(client)
    _check_json_output(client)
    _check_tool_read(client)
    write_verified = _check_tool_write(client)

    print("\nAll eval checks completed.")
    if not write_verified:
        # A printed [FAIL] alone is invisible to automation; use the exit status.
        sys.exit(1)
# Run the eval only when this module is executed as a script (for example via
# ``python -m scripts.eval_llm``), not when it is imported.
if __name__ == "__main__":
    main()