# Spaces: build-small-hackathon / Vital (Running) | File size: 7,660 Bytes | 4e88df3

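"""
Live LLM evaluation against your Modal Nemotron endpoint.

Resets the local DB and seeds it with sample data so tool calls return real
results.

Prerequisites:
    modal deploy infra/vllm_serve.py
    Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env

Optional:
    Set VITAL_EVAL_SKIP_RESET to 1, true, or yes to skip the reset and seeding
    and run against the existing DB (which may be stale).

Usage:
    uv run python -m scripts.eval_llm
"""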

import json
import os
import sys
from datetime import date, datetime, timezone
from unittest.mock import patch

from db import queries
from db.database import reset_database
from llm.client import get_llm_client
from llm.config import get_llm_config
from llm.tool_runner import execute_tool
from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput


def seed_eval_database() -> None:
    """Populate the database with the sample "Amara" fixture used by the eval.

    Saves one profile, then writes today's rows: a Folic acid dose already
    marked taken, a Vitamin C dose still pending, and three daily-log values
    (pain level, water cups, energy level).
    """
    sample_profile = ProfileInput(
        name="Amara",
        age=24,
        city="Port Harcourt",
        profession="Student",
        goal="Manage a health condition",
        conditions=["sickle cell disease"],
        medications=[
            MedicationRecord(name="Folic acid", dose="5mg", time="08:00"),
            MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"),
        ],
        triggers=["dehydration", "cold temperatures", "stress"],
        wake_time="07:00",
        sleep_time="23:00",
        desk_worker=True,
        exercise_level="light",
        dietary_notes="Avoid processed sugar",
        local_foods="eba, egusi soup, beans, oranges",
    )
    queries.save_profile(sample_profile)

    seed_day = date.today()

    # (medication name, dose, scheduled time, already taken?)
    dose_plan = (
        ("Folic acid", "5mg", "08:00", True),
        ("Vitamin C", "500mg", "18:00", False),
    )
    for med_name, med_dose, slot, was_taken in dose_plan:
        # Only a dose that was taken carries a timestamp; pending doses get None.
        stamp = datetime.now(timezone.utc) if was_taken else None
        queries.insert_medication_log(
            MedicationLogEntry(
                date=seed_day,
                medication_name=med_name,
                dose=med_dose,
                scheduled_time=slot,
                taken=was_taken,
                taken_at=stamp,
            )
        )

    daily_values = (
        ("pain_level", "3"),
        ("water_cups", "6"),
        ("energy_level", "7"),
    )
    for log_key, log_value in daily_values:
        queries.upsert_daily_log(seed_day, log_key, log_value)


def print_db_snapshot(label: str) -> None:
    """Dump today's medication rows and daily-log entries to stdout.

    Args:
        label: Heading printed above the snapshot.
    """
    snapshot_day = date.today()
    med_rows = queries.get_medications_for_date(snapshot_day)
    log_rows = queries.get_daily_logs_for_date(snapshot_day)

    print(f"   {label}")

    print(f"   - Medications today ({len(med_rows)}):")
    for row in med_rows:
        if row.taken:
            state = "taken"
        else:
            state = "pending"
        print(f"       {row.scheduled_time}  {row.medication_name} ({row.dose}) — {state}")

    print(f"   - Daily logs today ({len(log_rows)}):")
    for item in log_rows:
        print(f"       {item.field_id}: {item.value}")

    # Trailing blank line separates the snapshot from whatever is printed next.
    print()


def print_tool_trace(tool_name: str, arguments: dict[str, object]):
    """Run a tool via ``execute_tool`` and print a trace of the call and outcome.

    Printing is purely diagnostic, so it must never change the outcome of the
    tool call: values that ``json`` cannot encode natively are rendered with
    ``str()`` instead of raising ``TypeError``.

    Args:
        tool_name: Name of the tool being invoked.
        arguments: Arguments passed through to the tool unchanged.

    Returns:
        The result object returned by ``execute_tool``, unmodified.
    """
    # default=str keeps the trace from crashing on non-JSON-native values;
    # output for JSON-native values is unchanged.
    printable_args = json.dumps(arguments, default=str)
    print(f"   [TOOL CALLED] {tool_name}({printable_args})", flush=True)

    result = execute_tool(tool_name, arguments)

    print(f"   [TOOL SUCCESS] {result.success}", flush=True)
    if result.success:
        result_text = json.dumps(
            result.result, indent=2, ensure_ascii=False, default=str
        )
        print("   [TOOL RESULT]", flush=True)
        for line in result_text.splitlines():
            print(f"   | {line}", flush=True)
    if result.error:
        print(f"   [TOOL ERROR]  {result.error}", flush=True)
    return result


def medication_is_taken(medication_name: str, scheduled_time: str) -> bool:
    """Report whether today's log marks the given dose as taken.

    Looks at the first row in today's medication log whose name and scheduled
    time both match, and returns its ``taken`` flag. Returns ``False`` when no
    row matches.
    """
    todays_rows = queries.get_medications_for_date(date.today())
    matching_flags = (
        row.taken
        for row in todays_rows
        if row.medication_name == medication_name
        and row.scheduled_time == scheduled_time
    )
    # next() stops at the first match; the default covers "no such dose".
    return next(matching_flags, False)


# Prompt for check 2; asks the model for onboarding follow-up questions as JSON.
_ONBOARDING_PROMPT = """
You are generating adaptive follow-up questions for a new Vitál user.

Profile:
- Name: Amara, age 24, Port Harcourt
- Condition: sickle cell disease (HbSS)
- Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00
- Triggers: dehydration, cold, stress
- Goal: manage health condition, fewer pain crises
- Desk worker, light exercise, local foods: eba, egusi, beans

Return ONLY valid JSON with this exact shape (max 3 questions):
{
  "follow_up_questions": [
    {
      "question_id": "snake_case_id",
      "question": "question text for the user",
      "type": "number",
      "reason": "why this question matters"
    }
  ]
}

Use type as one of: number, text. If no follow-ups needed, return an empty array.
"""


def _print_section_header(title: str) -> None:
    """Print ``title`` framed above and below by a 60-character ``=`` rule."""
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def _chat_with_traced_tools(client, prompt: str):
    """Send ``prompt`` with tools enabled, routing tool calls through the tracer.

    ``llm.client.execute_tool`` is patched for the duration of the call so each
    invocation goes through ``print_tool_trace``, which prints the call and
    returns the result of the real ``execute_tool``.
    """
    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        return client.chat(prompt, use_tools=True)


def _check_simple_chat(client) -> None:
    """Check 1: plain chat completion with tools disabled."""
    _print_section_header("1) Simple chat (no tools)")
    reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False)
    print(f"   Reply: {reply}\n")


def _check_json_output(client) -> None:
    """Check 2: request JSON output for the onboarding prompt and print it."""
    _print_section_header("2) JSON output — onboarding Call 1 (realistic)")
    payload = client.generate_json(
        _ONBOARDING_PROMPT,
        system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.",
    )
    print("   Full JSON response:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))
    print()


def _check_read_tool(client) -> None:
    """Check 3: ask a question that should make the LLM read today's medications."""
    _print_section_header("3) Tool call — read medications (seeded DB)")
    print_db_snapshot("DB before LLM (read test):")

    tool_reply = _chat_with_traced_tools(
        client,
        "What medications do I have scheduled today and which ones have I already taken?",
    )

    print("   LLM final answer:")
    print(f"   {tool_reply}\n")


def _check_write_tool(client) -> bool:
    """Check 4: ask the LLM to log a dose, then verify the change in the DB.

    Returns:
        True when the Vitamin C 18:00 dose is marked taken after the chat.
    """
    _print_section_header("4) Tool call — write to DB (log_medication_taken)")
    print_db_snapshot("DB before LLM (write test):")
    print("   Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.")
    print("   Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,")
    print("   mark_medication_taken only updates matching rows (updated: true/false).\n")

    write_reply = _chat_with_traced_tools(
        client,
        "I just took my evening Vitamin C 500mg dose scheduled for 18:00. "
        "Please log it as taken in my medication log.",
    )

    print("   LLM final answer:")
    print(f"   {write_reply}\n")
    print_db_snapshot("DB after LLM (write test):")

    vitamin_c_taken = medication_is_taken("Vitamin C", "18:00")
    if vitamin_c_taken:
        print("   [PASS] DB verify: Vitamin C 18:00 is now marked taken.")
    else:
        print("   [FAIL] DB verify: Vitamin C 18:00 is still pending.")
        print("     Check [TOOL CALLED] above — validation may have rejected bad args.")
    return vitamin_c_taken


def main() -> None:
    """Run live checks: chat, JSON, read tools, write tools (with DB verify).

    Unless ``VITAL_EVAL_SKIP_RESET`` is set to ``1``, ``true`` or ``yes``, the
    database is reset and re-seeded before the checks run.

    Raises:
        SystemExit: With status 1 when the configured base URL contains
            ``localhost`` and ``VITAL_LLM_BASE_URL`` is unset, or when the
            write check's DB verification fails. The latter lets shells and CI
            detect a failed eval instead of always seeing exit status 0.
    """
    config = get_llm_config()
    if "localhost" in config.base_url and not os.getenv("VITAL_LLM_BASE_URL"):
        print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.")
        sys.exit(1)

    skip_reset = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower() in ("1", "true", "yes")
    if skip_reset:
        print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n")
    else:
        reset_database()
        seed_eval_database()

    client = get_llm_client()

    _check_simple_chat(client)
    _check_json_output(client)
    _check_read_tool(client)
    write_verified = _check_write_tool(client)

    print("\nAll eval checks completed.")

    # A [FAIL] line alone is easy to miss; surface it through the exit status too.
    if not write_verified:
        sys.exit(1)


# Run the eval only when executed as a script (e.g. `python -m scripts.eval_llm`),
# not when this module is imported.
if __name__ == "__main__":
    main()