Spaces:
Running
Running
| """ | |
| Live LLM evaluation against your Modal Nemotron endpoint. | |
| Seeds the local DB with sample data so tool calls return real results. | |
| Prerequisites: | |
| modal deploy infra/vllm_serve.py | |
| Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env | |
| Usage: | |
| uv run python -m scripts.eval_llm | |
| """ | |
| import json | |
| import os | |
| import sys | |
| from datetime import date, datetime, timezone | |
| from unittest.mock import patch | |
| from db import queries | |
| from db.database import reset_database | |
| from llm.client import get_llm_client | |
| from llm.config import get_llm_config | |
| from llm.tool_runner import execute_tool | |
| from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput | |
| def seed_eval_database() -> None: | |
| """Insert a realistic Amara-like profile and today's logs for tool testing.""" | |
| queries.save_profile( | |
| ProfileInput( | |
| name="Amara", | |
| age=24, | |
| city="Port Harcourt", | |
| profession="Student", | |
| goal="Manage a health condition", | |
| conditions=["sickle cell disease"], | |
| medications=[ | |
| MedicationRecord(name="Folic acid", dose="5mg", time="08:00"), | |
| MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"), | |
| ], | |
| triggers=["dehydration", "cold temperatures", "stress"], | |
| wake_time="07:00", | |
| sleep_time="23:00", | |
| desk_worker=True, | |
| exercise_level="light", | |
| dietary_notes="Avoid processed sugar", | |
| local_foods="eba, egusi soup, beans, oranges", | |
| ) | |
| ) | |
| today = date.today() | |
| queries.insert_medication_log( | |
| MedicationLogEntry( | |
| date=today, | |
| medication_name="Folic acid", | |
| dose="5mg", | |
| scheduled_time="08:00", | |
| taken=True, | |
| taken_at=datetime.now(timezone.utc), | |
| ) | |
| ) | |
| queries.insert_medication_log( | |
| MedicationLogEntry( | |
| date=today, | |
| medication_name="Vitamin C", | |
| dose="500mg", | |
| scheduled_time="18:00", | |
| taken=False, | |
| taken_at=None, | |
| ) | |
| ) | |
| queries.upsert_daily_log(today, "pain_level", "3") | |
| queries.upsert_daily_log(today, "water_cups", "6") | |
| queries.upsert_daily_log(today, "energy_level", "7") | |
| def print_db_snapshot(label: str) -> None: | |
| """Print medications and daily logs currently stored in SQLite.""" | |
| today = date.today() | |
| medications = queries.get_medications_for_date(today) | |
| logs = queries.get_daily_logs_for_date(today) | |
| print(f" {label}") | |
| print(f" - Medications today ({len(medications)}):") | |
| for med in medications: | |
| status = "taken" if med.taken else "pending" | |
| print(f" {med.scheduled_time} {med.medication_name} ({med.dose}) — {status}") | |
| print(f" - Daily logs today ({len(logs)}):") | |
| for entry in logs: | |
| print(f" {entry.field_id}: {entry.value}") | |
| print() | |
| def print_tool_trace(tool_name: str, arguments: dict[str, object]): | |
| """Print when the LLM triggers a tool during eval and return the real result.""" | |
| print(f" [TOOL CALLED] {tool_name}({json.dumps(arguments)})", flush=True) | |
| result = execute_tool(tool_name, arguments) | |
| print(f" [TOOL SUCCESS] {result.success}", flush=True) | |
| if result.success: | |
| result_text = json.dumps(result.result, indent=2, ensure_ascii=False) | |
| print(" [TOOL RESULT]", flush=True) | |
| for line in result_text.splitlines(): | |
| print(f" | {line}", flush=True) | |
| if result.error: | |
| print(f" [TOOL ERROR] {result.error}", flush=True) | |
| return result | |
| def medication_is_taken(medication_name: str, scheduled_time: str) -> bool: | |
| """Return whether a scheduled dose is marked taken in today's medication log.""" | |
| today = date.today() | |
| for med in queries.get_medications_for_date(today): | |
| if med.medication_name == medication_name and med.scheduled_time == scheduled_time: | |
| return med.taken | |
| return False | |
| def main() -> None: | |
| """Run live checks: chat, JSON, read tools, write tools (with DB verify).""" | |
| config = get_llm_config() | |
| if "localhost" in config.base_url and not os.getenv("VITAL_LLM_BASE_URL"): | |
| print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.") | |
| sys.exit(1) | |
| skip_reset = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower() in ("1", "true", "yes") | |
| if skip_reset: | |
| print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n") | |
| else: | |
| reset_database() | |
| seed_eval_database() | |
| client = get_llm_client() | |
| print("=" * 60) | |
| print("1) Simple chat (no tools)") | |
| print("=" * 60) | |
| reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False) | |
| print(f" Reply: {reply}\n") | |
| print("=" * 60) | |
| print("2) JSON output — onboarding Call 1 (realistic)") | |
| print("=" * 60) | |
| onboarding_prompt = """ | |
| You are generating adaptive follow-up questions for a new Vitál user. | |
| Profile: | |
| - Name: Amara, age 24, Port Harcourt | |
| - Condition: sickle cell disease (HbSS) | |
| - Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00 | |
| - Triggers: dehydration, cold, stress | |
| - Goal: manage health condition, fewer pain crises | |
| - Desk worker, light exercise, local foods: eba, egusi, beans | |
| Return ONLY valid JSON with this exact shape (max 3 questions): | |
| { | |
| "follow_up_questions": [ | |
| { | |
| "question_id": "snake_case_id", | |
| "question": "question text for the user", | |
| "type": "number", | |
| "reason": "why this question matters" | |
| } | |
| ] | |
| } | |
| Use type as one of: number, text. If no follow-ups needed, return an empty array. | |
| """ | |
| payload = client.generate_json( | |
| onboarding_prompt, | |
| system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.", | |
| ) | |
| print(" Full JSON response:") | |
| print(json.dumps(payload, indent=2, ensure_ascii=False)) | |
| print() | |
| print("=" * 60) | |
| print("3) Tool call — read medications (seeded DB)") | |
| print("=" * 60) | |
| print_db_snapshot("DB before LLM (read test):") | |
| with patch("llm.client.execute_tool", side_effect=print_tool_trace): | |
| tool_reply = client.chat( | |
| "What medications do I have scheduled today and which ones have I already taken?", | |
| use_tools=True, | |
| ) | |
| print(" LLM final answer:") | |
| print(f" {tool_reply}\n") | |
| print("=" * 60) | |
| print("4) Tool call — write to DB (log_medication_taken)") | |
| print("=" * 60) | |
| print_db_snapshot("DB before LLM (write test):") | |
| print(" Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.") | |
| print(" Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,") | |
| print(" mark_medication_taken only updates matching rows (updated: true/false).\n") | |
| with patch("llm.client.execute_tool", side_effect=print_tool_trace): | |
| write_reply = client.chat( | |
| "I just took my evening Vitamin C 500mg dose scheduled for 18:00. " | |
| "Please log it as taken in my medication log.", | |
| use_tools=True, | |
| ) | |
| print(" LLM final answer:") | |
| print(f" {write_reply}\n") | |
| print_db_snapshot("DB after LLM (write test):") | |
| vitamin_c_taken = medication_is_taken("Vitamin C", "18:00") | |
| if vitamin_c_taken: | |
| print(" [PASS] DB verify: Vitamin C 18:00 is now marked taken.") | |
| else: | |
| print(" [FAIL] DB verify: Vitamin C 18:00 is still pending.") | |
| print(" Check [TOOL CALLED] above — validation may have rejected bad args.") | |
| print("\nAll eval checks completed.") | |
| if __name__ == "__main__": | |
| main() | |