Spaces:
Running
Running
File size: 7,660 Bytes
4e88df3 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 | """
Live LLM evaluation against your Modal Nemotron endpoint.
Seeds the local DB with sample data so tool calls return real results.
Prerequisites:
modal deploy infra/vllm_serve.py
Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env
Usage:
uv run python -m scripts.eval_llm
"""
import json
import os
import sys
from datetime import date, datetime, timezone
from unittest.mock import patch
from db import queries
from db.database import reset_database
from llm.client import get_llm_client
from llm.config import get_llm_config
from llm.tool_runner import execute_tool
from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput
def seed_eval_database() -> None:
    """Populate the database with a sample profile and today's log entries.

    Saves one profile ("Amara") through ``queries.save_profile``, inserts two
    medication-log rows dated today (Folic acid marked taken, Vitamin C not
    taken), and upserts three daily-log fields, so that tool calls made during
    the eval have data to return.
    """
    prescribed = [
        MedicationRecord(name="Folic acid", dose="5mg", time="08:00"),
        MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"),
    ]
    sample_profile = ProfileInput(
        name="Amara",
        age=24,
        city="Port Harcourt",
        profession="Student",
        goal="Manage a health condition",
        conditions=["sickle cell disease"],
        medications=prescribed,
        triggers=["dehydration", "cold temperatures", "stress"],
        wake_time="07:00",
        sleep_time="23:00",
        desk_worker=True,
        exercise_level="light",
        dietary_notes="Avoid processed sugar",
        local_foods="eba, egusi soup, beans, oranges",
    )
    queries.save_profile(sample_profile)

    current_day = date.today()

    # One row per scheduled dose:
    # (medication name, dose, scheduled time, taken flag, taken timestamp).
    dose_rows = (
        ("Folic acid", "5mg", "08:00", True, datetime.now(timezone.utc)),
        ("Vitamin C", "500mg", "18:00", False, None),
    )
    for med_name, med_dose, slot, was_taken, stamp in dose_rows:
        queries.insert_medication_log(
            MedicationLogEntry(
                date=current_day,
                medication_name=med_name,
                dose=med_dose,
                scheduled_time=slot,
                taken=was_taken,
                taken_at=stamp,
            )
        )

    # Daily-log values are stored as strings, keyed by field id.
    daily_fields = (
        ("pain_level", "3"),
        ("water_cups", "6"),
        ("energy_level", "7"),
    )
    for field_id, field_value in daily_fields:
        queries.upsert_daily_log(current_day, field_id, field_value)
def print_db_snapshot(label: str) -> None:
    """Dump today's medication rows and daily-log rows to stdout.

    Both result sets are fetched from ``queries`` before anything is printed.
    ``label`` is printed first as a heading, followed by one line per
    medication (with a taken/pending status) and one line per daily-log entry,
    then a blank line.
    """
    current_day = date.today()
    med_rows = queries.get_medications_for_date(current_day)
    log_rows = queries.get_daily_logs_for_date(current_day)

    print(f" {label}")

    print(f" - Medications today ({len(med_rows)}):")
    for row in med_rows:
        if row.taken:
            status = "taken"
        else:
            status = "pending"
        print(f" {row.scheduled_time} {row.medication_name} ({row.dose}) — {status}")

    print(f" - Daily logs today ({len(log_rows)}):")
    for item in log_rows:
        print(f" {item.field_id}: {item.value}")

    print()
def print_tool_trace(tool_name: str, arguments: dict[str, object]):
    """Execute a tool for real while echoing the call and its outcome.

    Prints the tool name with its JSON-encoded arguments, delegates to
    ``execute_tool``, then prints the success flag, the pretty-printed result
    (only on success) and the error (only when one is set). Every line is
    flushed immediately.

    Returns:
        The object returned by ``execute_tool``, unchanged.
    """
    encoded_args = json.dumps(arguments)
    print(f" [TOOL CALLED] {tool_name}({encoded_args})", flush=True)

    outcome = execute_tool(tool_name, arguments)
    print(f" [TOOL SUCCESS] {outcome.success}", flush=True)

    if outcome.success:
        # Serialise before printing the header so a serialisation failure
        # surfaces before any result output is written.
        rendered = json.dumps(outcome.result, indent=2, ensure_ascii=False)
        print(" [TOOL RESULT]", flush=True)
        for row in rendered.splitlines():
            print(f" | {row}", flush=True)

    if outcome.error:
        print(f" [TOOL ERROR] {outcome.error}", flush=True)

    return outcome
def medication_is_taken(medication_name: str, scheduled_time: str) -> bool:
    """Report the ``taken`` flag of today's matching medication-log row.

    Looks through today's medication rows for the first one whose name and
    scheduled time both equal the arguments and returns its ``taken`` value.
    Returns ``False`` when no row matches.
    """
    todays_rows = queries.get_medications_for_date(date.today())
    matching_flags = (
        row.taken
        for row in todays_rows
        if row.medication_name == medication_name
        and row.scheduled_time == scheduled_time
    )
    # First match wins; the default covers the "no such dose" case.
    return next(matching_flags, False)
def _print_section(title: str) -> None:
    """Print a section title framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def main() -> None:
    """Run the live eval: plain chat, JSON output, a read tool, a write tool.

    Exits with status 1 when the configured base URL contains "localhost" and
    ``VITAL_LLM_BASE_URL`` is unset. Unless ``VITAL_EVAL_SKIP_RESET`` is
    "1"/"true"/"yes" (case-insensitive), the database is reset and re-seeded
    first. The write check is verified by re-reading the medication log.
    """
    config = get_llm_config()
    points_at_localhost = "localhost" in config.base_url
    if points_at_localhost and not os.getenv("VITAL_LLM_BASE_URL"):
        print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.")
        sys.exit(1)

    skip_flag = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower()
    if skip_flag not in ("1", "true", "yes"):
        reset_database()
        seed_eval_database()
    else:
        print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n")

    client = get_llm_client()

    # --- Check 1: plain completion, tools disabled -------------------------
    _print_section("1) Simple chat (no tools)")
    reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False)
    print(f" Reply: {reply}\n")

    # --- Check 2: structured JSON output -----------------------------------
    _print_section("2) JSON output — onboarding Call 1 (realistic)")
    # The prompt text is sent verbatim, so its lines stay flush-left.
    onboarding_prompt = """
You are generating adaptive follow-up questions for a new Vitál user.
Profile:
- Name: Amara, age 24, Port Harcourt
- Condition: sickle cell disease (HbSS)
- Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00
- Triggers: dehydration, cold, stress
- Goal: manage health condition, fewer pain crises
- Desk worker, light exercise, local foods: eba, egusi, beans
Return ONLY valid JSON with this exact shape (max 3 questions):
{
"follow_up_questions": [
{
"question_id": "snake_case_id",
"question": "question text for the user",
"type": "number",
"reason": "why this question matters"
}
]
}
Use type as one of: number, text. If no follow-ups needed, return an empty array.
"""
    payload = client.generate_json(
        onboarding_prompt,
        system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.",
    )
    print(" Full JSON response:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))
    print()

    # --- Check 3: tool call that reads from the DB -------------------------
    _print_section("3) Tool call — read medications (seeded DB)")
    print_db_snapshot("DB before LLM (read test):")
    read_question = (
        "What medications do I have scheduled today and which ones have I already taken?"
    )
    # Route the client's tool execution through the tracing wrapper, which
    # still runs the real tool.
    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        tool_reply = client.chat(read_question, use_tools=True)
    print(" LLM final answer:")
    print(f" {tool_reply}\n")

    # --- Check 4: tool call that writes to the DB --------------------------
    _print_section("4) Tool call — write to DB (log_medication_taken)")
    print_db_snapshot("DB before LLM (write test):")
    expectation_notes = (
        " Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.",
        " Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,",
        " mark_medication_taken only updates matching rows (updated: true/false).\n",
    )
    for note in expectation_notes:
        print(note)
    write_request = (
        "I just took my evening Vitamin C 500mg dose scheduled for 18:00. "
        "Please log it as taken in my medication log."
    )
    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        write_reply = client.chat(write_request, use_tools=True)
    print(" LLM final answer:")
    print(f" {write_reply}\n")

    # Confirm the write by re-reading the DB, not by trusting the reply.
    print_db_snapshot("DB after LLM (write test):")
    vitamin_c_taken = medication_is_taken("Vitamin C", "18:00")
    if not vitamin_c_taken:
        print(" [FAIL] DB verify: Vitamin C 18:00 is still pending.")
        print(" Check [TOOL CALLED] above — validation may have rejected bad args.")
    else:
        print(" [PASS] DB verify: Vitamin C 18:00 is now marked taken.")

    print("\nAll eval checks completed.")
# Run the eval only when executed as a script (e.g. `python -m scripts.eval_llm`),
# so importing this module has no side effects.
if __name__ == "__main__":
    main()
|