# Commit d103096 (verified): US-aware Capture/Ledger + wire in the reasoning LLM agent
# (Qwen3-8B via vLLM/Modal) with router fallback
"""Test the vLLM reasoning agent end-to-end against the live Modal endpoint.

Warms up the endpoint (a cold start downloads and loads the model), then runs a range of
questions through the real Agent loop and prints the tools the model chose plus its answers.
"""
| import json | |
| import os | |
| import sys | |
| import time | |
| import urllib.request | |
# Best effort: switch stdout to UTF-8 so the non-ASCII characters this script
# prints ("…", "¿") do not fail on consoles with a narrower default encoding.
# Any failure (e.g. a stdout object without reconfigure()) is ignored on purpose.
try:
    sys.stdout.reconfigure(encoding="utf-8")
except Exception:
    pass

# Base URL of the served model's /v1 API; also used directly by warmup() and main().
BASE = "https://carloscmoracd--pa-agent-llm-serve.modal.run/v1"

# Exported before the ``src`` imports below — presumably the project reads
# these PA_LLM_* settings to pick its LLM backend (confirm against src.config).
os.environ.update(
    {
        "PA_LLM_MODE": "openai",
        "PA_LLM_ENDPOINT": BASE,
        "PA_LLM_MODEL": "pa-agent",
    }
)
| from src.agent import Agent, ToolContext, build_tools | |
| from src.agent.serving import OpenAIToolClient | |
| from src.finetune import RuleClassifier | |
| from src.ledger import Ledger | |
| from src.retrieval import Retriever | |
| from src import config | |
def warmup(max_wait=600, *, poll_interval=15, request_timeout=30):
    """Poll the endpoint's ``/models`` route until it answers or time runs out.

    Args:
        max_wait: Maximum number of seconds to keep polling.
        poll_interval: Seconds to pause after a failed attempt. The pause is
            clamped to the time remaining, so the function never sleeps past
            ``max_wait``.
        request_timeout: Per-request timeout in seconds handed to ``urlopen``.

    Returns:
        ``True`` as soon as one request succeeds and its body parses as JSON
        (the model ids found under ``"data"`` are printed); ``False`` if no
        attempt succeeded within ``max_wait`` seconds.
    """
    print("Warming up endpoint (cold start can take a few minutes)…", flush=True)
    # Monotonic clock: the elapsed-time budget must not jump if the system
    # wall clock is adjusted while we wait.
    start = time.monotonic()
    deadline = start + max_wait
    while time.monotonic() < deadline:
        try:
            req = urllib.request.Request(
                BASE + "/models",
                headers={"Authorization": "Bearer EMPTY"},
            )
            with urllib.request.urlopen(req, timeout=request_timeout) as r:
                data = json.loads(r.read())
            ids = [m["id"] for m in data.get("data", [])]
        except Exception as e:
            # Deliberately broad: while the endpoint is starting, any failure
            # (connection error, HTTP error, timeout, malformed body) simply
            # means "not ready yet", so report it and retry.
            print(f" …not ready ({int(time.monotonic()-start)}s): {type(e).__name__}", flush=True)
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Never sleep beyond the deadline (or for a negative duration).
            time.sleep(max(0.0, min(poll_interval, remaining)))
        else:
            print(f" ready after {int(time.monotonic()-start)}s. models: {ids}", flush=True)
            return True
    return False
def main():
    """Warm up the endpoint, seed a sample ledger, and run the demo questions.

    Returns early (after printing a notice) when ``warmup()`` reports that the
    endpoint never became reachable. Otherwise each question is run through a
    fresh ``Agent`` and the tools it used, the elapsed time and the first 400
    characters of its answer are printed. A failure on one question is
    printed and does not stop the remaining ones.
    """
    if not warmup():
        print("Endpoint never came up.")
        return

    # In-memory ledger seeded with two income and two expense records.
    ledger = Ledger(":memory:")
    ledger.record_income("2024-05-03", "Branding", 18000, iva_rate="0.16")
    ledger.record_income(
        "2024-05-17",
        "Website",
        27000,
        iva_rate="0.16",
        isr_retenido="2700",
        iva_retenido="2880",
    )
    ledger.record_expense("2024-05-05", "Adobe", 1300, iva_rate="0.16")
    ledger.record_expense("2024-05-12", "Office rent", 4000, iva_rate="0.16")

    context = ToolContext(
        ledger,
        retriever=Retriever.from_corpus_dir(config.REGULATION_DIR),
        classifier=RuleClassifier(),
        country="MX",
    )
    tools = build_tools(context)

    # Each prompt starts with bracketed tags (e.g. "[mx][en][2024-05]"); how
    # they are interpreted is decided downstream and is not visible here.
    questions = (
        "[mx][en][2024-05] How much VAT do I owe this month?",
        "[mx][en][2024-05] Which tax regime suits me better?",
        "[mx][es][2024-05] ¿Puedo deducir mi laptop nueva?",
        "[us][en][2024-05] How much federal tax will I owe this year?",
        "[us][en][2024-05] What is the QBI deduction and do I qualify?",
        "[mx][en][2024-05] Explain what RESICO is in one sentence.",
    )

    for question in questions:
        print("\n" + "=" * 70)
        print("Q:", question)
        started = time.time()
        try:
            # A new client and agent per question, capped at five steps.
            agent = Agent(OpenAIToolClient(BASE), tools, max_steps=5)
            trace = agent.run(question)
            chosen = [step.tool for step in trace.steps]
            elapsed = time.time() - started
            print(f"tools used: {chosen} ({elapsed:.1f}s)")
            answer = trace.final_answer or ""
            # Flatten newlines and truncate so each answer stays on one line.
            print("A:", answer.replace("\n", " ")[:400])
        except Exception as exc:
            print("ERROR:", repr(exc)[:300])


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()