File size: 7,660 Bytes
4e88df3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
"""
Live LLM evaluation against your Modal Nemotron endpoint.

Resets the local DB and seeds it with sample data so tool calls return real
results. Set VITAL_EVAL_SKIP_RESET=1 to keep the existing DB instead.

Prerequisites:
    modal deploy infra/vllm_serve.py
    Set VITAL_LLM_BASE_URL and VITAL_MODEL_ID in .env

Usage:
    uv run python -m scripts.eval_llm
"""

import json
import os
import sys
from datetime import date, datetime, timezone
from unittest.mock import patch

from db import queries
from db.database import reset_database
from llm.client import get_llm_client
from llm.config import get_llm_config
from llm.tool_runner import execute_tool
from vital_types.db import MedicationLogEntry, MedicationRecord, ProfileInput


def seed_eval_database() -> None:
    """Store the sample "Amara" profile plus today's medication and daily-log rows.

    Writes, in order: the profile, one medication-log entry per scheduled
    dose (Folic acid taken, Vitamin C pending), and three daily-log values
    (pain, water, energy) dated today.
    """
    scheduled_medications = [
        MedicationRecord(name="Folic acid", dose="5mg", time="08:00"),
        MedicationRecord(name="Vitamin C", dose="500mg", time="18:00"),
    ]
    profile = ProfileInput(
        name="Amara",
        age=24,
        city="Port Harcourt",
        profession="Student",
        goal="Manage a health condition",
        conditions=["sickle cell disease"],
        medications=scheduled_medications,
        triggers=["dehydration", "cold temperatures", "stress"],
        wake_time="07:00",
        sleep_time="23:00",
        desk_worker=True,
        exercise_level="light",
        dietary_notes="Avoid processed sugar",
        local_foods="eba, egusi soup, beans, oranges",
    )
    queries.save_profile(profile)

    log_day = date.today()

    # (medication, dose, scheduled time, already taken?) — one log row each.
    dose_rows = (
        ("Folic acid", "5mg", "08:00", True),
        ("Vitamin C", "500mg", "18:00", False),
    )
    for med_name, dose, slot, was_taken in dose_rows:
        # Only a dose that was taken carries a timestamp; pending doses keep None.
        stamp = datetime.now(timezone.utc) if was_taken else None
        queries.insert_medication_log(
            MedicationLogEntry(
                date=log_day,
                medication_name=med_name,
                dose=dose,
                scheduled_time=slot,
                taken=was_taken,
                taken_at=stamp,
            )
        )

    # Daily-log values are passed as strings, keyed by field id.
    for field_id, value in (
        ("pain_level", "3"),
        ("water_cups", "6"),
        ("energy_level", "7"),
    ):
        queries.upsert_daily_log(log_day, field_id, value)


def print_db_snapshot(label: str) -> None:
    """Dump today's medication rows and daily-log rows to stdout.

    Args:
        label: Heading printed above the snapshot.
    """
    snapshot_day = date.today()
    # Fetch both result sets up front so the heading and counts print together.
    med_rows = queries.get_medications_for_date(snapshot_day)
    log_rows = queries.get_daily_logs_for_date(snapshot_day)

    print(f"   {label}")

    print(f"   - Medications today ({len(med_rows)}):")
    for row in med_rows:
        if row.taken:
            state = "taken"
        else:
            state = "pending"
        print(f"       {row.scheduled_time}  {row.medication_name} ({row.dose}) — {state}")

    print(f"   - Daily logs today ({len(log_rows)}):")
    for row in log_rows:
        print(f"       {row.field_id}: {row.value}")

    # Trailing blank line separates the snapshot from whatever is printed next.
    print()


def print_tool_trace(tool_name: str, arguments: dict[str, object]):
    """Run a tool through ``execute_tool`` and echo the call and its outcome.

    Args:
        tool_name: Name of the tool the LLM asked for.
        arguments: Arguments for the tool; printed as JSON.

    Returns:
        Whatever ``execute_tool`` returned, unchanged.
    """

    def emit(text: str) -> None:
        # Flush every line so the trace appears immediately in the output.
        print(text, flush=True)

    emit(f"   [TOOL CALLED] {tool_name}({json.dumps(arguments)})")
    outcome = execute_tool(tool_name, arguments)
    emit(f"   [TOOL SUCCESS] {outcome.success}")

    if outcome.success:
        rendered = json.dumps(outcome.result, indent=2, ensure_ascii=False)
        emit("   [TOOL RESULT]")
        # Prefix each JSON line so the payload stays visually grouped.
        for row in rendered.splitlines():
            emit(f"   | {row}")

    if outcome.error:
        emit(f"   [TOOL ERROR]  {outcome.error}")

    return outcome


def medication_is_taken(medication_name: str, scheduled_time: str) -> bool:
    """Look up today's log entry for a dose and report its ``taken`` flag.

    Args:
        medication_name: Exact medication name to match.
        scheduled_time: Exact scheduled time to match.

    Returns:
        The ``taken`` value of the first matching entry for today, or
        ``False`` when no entry matches.
    """
    todays_doses = queries.get_medications_for_date(date.today())
    first_match = next(
        (
            dose
            for dose in todays_doses
            if dose.medication_name == medication_name
            and dose.scheduled_time == scheduled_time
        ),
        None,
    )
    if first_match is None:
        return False
    return first_match.taken


def _print_section(title: str) -> None:
    """Print ``title`` framed by two 60-character ``=`` rules."""
    rule = "=" * 60
    print(rule)
    print(title)
    print(rule)


def main() -> None:
    """Run the live checks in order: chat, JSON, read tool, write tool.

    Environment:
        VITAL_LLM_BASE_URL: Must be set when the configured base URL contains
            "localhost"; otherwise the run aborts before any request is made.
        VITAL_EVAL_SKIP_RESET: "1", "true" or "yes" (case-insensitive) keeps
            the existing database instead of resetting and reseeding it.

    The write check compares the Vitamin C 18:00 dose before and after the
    LLM call. If the dose was already marked taken beforehand (possible when
    the reset is skipped), the check is reported as skipped rather than
    passed, because the LLM's write cannot be observed.

    Raises:
        SystemExit: With status 1 when the endpoint is not configured, or
            when the write check finds the dose still pending after the LLM
            call.
    """
    config = get_llm_config()
    if "localhost" in config.base_url and not os.getenv("VITAL_LLM_BASE_URL"):
        print("Set VITAL_LLM_BASE_URL in .env to your Modal /v1 endpoint first.")
        sys.exit(1)

    skip_reset = os.getenv("VITAL_EVAL_SKIP_RESET", "").lower() in ("1", "true", "yes")
    if skip_reset:
        print("WARNING: VITAL_EVAL_SKIP_RESET is set — using existing DB (may be stale).\n")
    else:
        reset_database()
        seed_eval_database()

    client = get_llm_client()

    _print_section("1) Simple chat (no tools)")
    reply = client.chat("Write a short story about a cat in 20 words.", use_tools=False)
    print(f"   Reply: {reply}\n")

    _print_section("2) JSON output — onboarding Call 1 (realistic)")
    onboarding_prompt = """
You are generating adaptive follow-up questions for a new Vitál user.

Profile:
- Name: Amara, age 24, Port Harcourt
- Condition: sickle cell disease (HbSS)
- Medications: Folic acid 5mg at 08:00, Vitamin C 500mg at 18:00
- Triggers: dehydration, cold, stress
- Goal: manage health condition, fewer pain crises
- Desk worker, light exercise, local foods: eba, egusi, beans

Return ONLY valid JSON with this exact shape (max 3 questions):
{
  "follow_up_questions": [
    {
      "question_id": "snake_case_id",
      "question": "question text for the user",
      "type": "number",
      "reason": "why this question matters"
    }
  ]
}

Use type as one of: number, text. If no follow-ups needed, return an empty array.
"""
    payload = client.generate_json(
        onboarding_prompt,
        system_addition="Return ONLY valid JSON. No markdown fences. No extra keys.",
    )
    print("   Full JSON response:")
    print(json.dumps(payload, indent=2, ensure_ascii=False))
    print()

    _print_section("3) Tool call — read medications (seeded DB)")
    print_db_snapshot("DB before LLM (read test):")

    # Route the client's tool execution through print_tool_trace so every call
    # and result is echoed; the real tool still runs inside the tracer.
    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        tool_reply = client.chat(
            "What medications do I have scheduled today and which ones have I already taken?",
            use_tools=True,
        )

    print("   LLM final answer:")
    print(f"   {tool_reply}\n")

    _print_section("4) Tool call — write to DB (log_medication_taken)")
    print_db_snapshot("DB before LLM (write test):")
    print("   Expect: Vitamin C 18:00 is pending; LLM should call log_medication_taken.")
    print("   Guardrails: TOOL_NAMES whitelist, HH:MM time, non-empty medication_name,")
    print("   mark_medication_taken only updates matching rows (updated: true/false).\n")

    # Record the starting state: if the dose is already taken (e.g. the reset
    # was skipped after an earlier run), a "taken" reading afterwards proves
    # nothing about the LLM's write.
    taken_before = medication_is_taken("Vitamin C", "18:00")
    if taken_before:
        print("   WARNING: Vitamin C 18:00 is already marked taken — write check is inconclusive.\n")

    with patch("llm.client.execute_tool", side_effect=print_tool_trace):
        write_reply = client.chat(
            "I just took my evening Vitamin C 500mg dose scheduled for 18:00. "
            "Please log it as taken in my medication log.",
            use_tools=True,
        )

    print("   LLM final answer:")
    print(f"   {write_reply}\n")
    print_db_snapshot("DB after LLM (write test):")

    write_failed = False
    if taken_before:
        print("   [SKIP] DB verify: Vitamin C 18:00 was already taken before the LLM call.")
    elif medication_is_taken("Vitamin C", "18:00"):
        print("   [PASS] DB verify: Vitamin C 18:00 is now marked taken.")
    else:
        write_failed = True
        print("   [FAIL] DB verify: Vitamin C 18:00 is still pending.")
        print("     Check [TOOL CALLED] above — validation may have rejected bad args.")

    print("\nAll eval checks completed.")
    # Surface the failed verification to the shell/CI instead of exiting 0.
    if write_failed:
        sys.exit(1)


# Run the evaluation only when this module is executed as a script
# (e.g. via ``python -m``), not when it is imported.
if __name__ == "__main__":
    main()