post-audit / audit_client.py
pasternake's picture
feat(ui): light/dark theme toggle (PR #7)
b90345a verified
Raw
History Blame Contribute Delete
7.27 kB
"""HTTP client for Modal audit endpoint with mock fallback."""
from __future__ import annotations
import json
import os
import urllib.error
import urllib.request
from typing import Any
from merge import parse_llm_json
from prompts import FEW_SHOT_ASSISTANT, build_messages
def get_modal_url() -> str | None:
return os.environ.get("MODAL_AUDIT_URL") or os.environ.get("MODAL_AUDIT_ENDPOINT")
def get_modal_timeout() -> float:
    """Return the timeout, in seconds, for requests to the Modal endpoint.

    The value is read from ``MODAL_AUDIT_TIMEOUT``. An unset or blank
    variable yields the default of 300 seconds.

    Raises:
        ValueError: if the variable is set to text that is not a number.
    """
    # A cold Modal container loads the quantized model (and on a fresh deploy
    # pulls the GGUF), which can push the first request past a minute. Generous default.
    raw = os.environ.get("MODAL_AUDIT_TIMEOUT", "").strip()
    if not raw:
        # A blank value (e.g. `MODAL_AUDIT_TIMEOUT=` in an env file) means
        # "not configured", not "crash on float('')".
        return 300.0
    try:
        return float(raw)
    except ValueError as exc:
        raise ValueError(
            f"MODAL_AUDIT_TIMEOUT must be a number of seconds, got {raw!r}"
        ) from exc
def get_ollama_model() -> str | None:
"""Local Ollama model tag (e.g. 'gemma4:e4b'); enables the local-LLM path."""
return os.environ.get("OLLAMA_MODEL")
def get_ollama_url() -> str:
    """Return the base URL of the Ollama server.

    Reads ``OLLAMA_URL``; when the variable is unset or blank the default
    ``http://localhost:11434`` is returned. Surrounding whitespace in a
    configured value is stripped.
    """
    # A blank value must not leak through: an empty base URL would make the
    # request target an invalid relative URL instead of the local server.
    configured = os.environ.get("OLLAMA_URL", "").strip()
    return configured or "http://localhost:11434"
def get_ollama_timeout() -> float:
    """Return the timeout, in seconds, for requests to the Ollama server.

    The value is read from ``OLLAMA_TIMEOUT``. An unset or blank variable
    yields the default of 300 seconds.

    Raises:
        ValueError: if the variable is set to text that is not a number.
    """
    # Local CPU/Metal inference is slow: a full audit is ~2.5 min on an M1 with
    # gemma3:4b, and the first (cold) call adds model-load time. Generous default.
    raw = os.environ.get("OLLAMA_TIMEOUT", "").strip()
    if not raw:
        # A blank value (e.g. `OLLAMA_TIMEOUT=` in an env file) means
        # "not configured", not "crash on float('')".
        return 300.0
    try:
        return float(raw)
    except ValueError as exc:
        raise ValueError(
            f"OLLAMA_TIMEOUT must be a number of seconds, got {raw!r}"
        ) from exc
def backend_label() -> str:
    """Describe, for display, which inference backend is currently configured.

    A configured Modal URL takes precedence over a configured Ollama model;
    when neither is set the label names the mock and the variables that
    would enable a live backend.
    """
    if get_modal_url():
        return "live Gemma 4 E4B on Modal"
    ollama_tag = get_ollama_model()
    return (
        f"local Ollama ({ollama_tag})"
        if ollama_tag
        else "mock LLM (set MODAL_AUDIT_URL or OLLAMA_MODEL)"
    )
def call_modal_audit(
    platform: str,
    goal: str,
    audience: str,
    post: str,
    *,
    timeout: float | None = None,
) -> dict[str, Any]:
    """Dispatch to a backend: Modal endpoint, local Ollama, or deterministic mock.

    Backend selection, in order: a configured Modal URL, then a configured
    Ollama model, then the built-in mock response.

    Args:
        platform: Forwarded unchanged to the selected backend.
        goal: Forwarded unchanged to the selected backend.
        audience: Forwarded unchanged to the selected backend.
        post: Forwarded unchanged to the selected backend.
        timeout: Request timeout in seconds. ``None`` lets the selected
            backend use its own default (``get_modal_timeout()`` for Modal;
            ``_call_ollama`` resolves its own). Not used by the mock.

    Returns:
        The audit result. For Modal, the decoded JSON reply; when that reply
        contains a ``"raw"`` key, its value is passed through
        ``parse_llm_json`` and that result is returned instead.

    Raises:
        RuntimeError: Modal answered with an HTTP error status; the message
            carries the status code and response body.
        urllib.error.URLError: The Modal endpoint could not be reached
            (propagated unchanged from ``urlopen``).
    """
    url = get_modal_url()
    if not url:
        if get_ollama_model():
            # Forward the caller's timeout rather than dropping it; None lets
            # _call_ollama fall back to the Ollama-specific default.
            return _call_ollama(platform, goal, audience, post, timeout=timeout)
        return _mock_llm_response(platform, goal, audience, post)

    # Resolve the Modal default only once Modal is known to be the backend,
    # so Modal timeout configuration cannot affect the other paths.
    if timeout is None:
        timeout = get_modal_timeout()

    payload = json.dumps(
        {
            "platform": platform,
            "goal": goal,
            "audience": audience,
            "post": post,
        }
    ).encode("utf-8")
    # Modal serves a single fastapi_endpoint at the root of its URL — no path suffix.
    headers = {"Content-Type": "application/json"}
    token = os.environ.get("MODAL_AUDIT_TOKEN")
    if token:
        # Optional shared secret, sent only when MODAL_AUDIT_TOKEN is set.
        headers["X-Audit-Token"] = token
    req = urllib.request.Request(
        url.rstrip("/"),
        data=payload,
        headers=headers,
        method="POST",
    )
    try:
        with urllib.request.urlopen(req, timeout=timeout) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except urllib.error.HTTPError as exc:
        body = exc.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Modal HTTP {exc.code}: {body}") from exc
    if "raw" in data:
        # The endpoint returned the model's unparsed text; parse it here.
        return parse_llm_json(data["raw"])
    return data
def _ollama_chat(model: str, messages: list[dict[str, str]], timeout: float) -> str:
    """Send one non-streaming request to Ollama's /api/chat and return the reply text.

    The request asks for JSON-formatted output and sends the options
    ``temperature=0`` and ``num_predict=2048``.

    Returns:
        The ``message.content`` field of the decoded response, or ``""``
        when either key is absent.

    Raises:
        RuntimeError: Ollama answered with an HTTP error status, or the
            server could not be reached.
    """
    request_payload = {
        "model": model,
        "messages": messages,
        "stream": False,
        "format": "json",  # constrain output to valid JSON
        "options": {"temperature": 0, "num_predict": 2048},
    }
    encoded = json.dumps(request_payload).encode("utf-8")
    endpoint = get_ollama_url().rstrip("/") + "/api/chat"
    request = urllib.request.Request(
        endpoint,
        data=encoded,
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    try:
        with urllib.request.urlopen(request, timeout=timeout) as response:
            reply = json.loads(response.read().decode("utf-8"))
    except urllib.error.HTTPError as http_err:
        # HTTPError is a URLError subclass, so it must be handled first.
        detail = http_err.read().decode("utf-8", errors="replace")
        raise RuntimeError(f"Ollama HTTP {http_err.code}: {detail}") from http_err
    except urllib.error.URLError as url_err:
        raise RuntimeError(
            f"Cannot reach Ollama at {get_ollama_url()} — is it running? ({url_err.reason})"
        ) from url_err
    message = reply.get("message", {})
    return message.get("content", "")
def _call_ollama(
    platform: str,
    goal: str,
    audience: str,
    post: str,
    *,
    timeout: float | None = None,
) -> dict[str, Any]:
    """Audit a post with the locally configured Ollama model.

    Builds the chat messages with ``build_messages``, sends them through
    ``_ollama_chat`` and parses the reply with ``parse_llm_json``. If the
    first reply cannot be parsed, the request is repeated once with an extra
    user message demanding JSON only; a second parse failure propagates.

    Args:
        platform, goal, audience, post: Passed to ``build_messages``.
        timeout: Seconds allowed per request; ``None`` uses
            ``get_ollama_timeout()``. The first call may be slow (model load).
    """
    per_request_timeout = get_ollama_timeout() if timeout is None else timeout
    model = get_ollama_model()
    conversation = build_messages(platform, goal, audience, post)
    first_reply = _ollama_chat(model, conversation, per_request_timeout)
    try:
        return parse_llm_json(first_reply)
    except ValueError:  # also covers json.JSONDecodeError, a ValueError subclass
        # One retry with an explicit instruction appended to the conversation.
        reminder = {
            "role": "user",
            "content": "Return ONLY valid JSON matching the schema. No other text.",
        }
        second_reply = _ollama_chat(model, conversation + [reminder], per_request_timeout)
        return parse_llm_json(second_reply)
def _mock_llm_response(
    platform: str,
    goal: str,
    audience: str,
    post: str,
) -> dict[str, Any]:
    """Return a canned audit result without contacting any model.

    Only ``post`` influences the outcome. When its lowercase form contains
    "link in bio" or "webinar", the decoded ``FEW_SHOT_ASSISTANT`` JSON is
    returned; every other post gets the same fixed low-scoring report, built
    fresh on each call.
    """
    del platform, goal, audience  # part of the signature, unused by the mock

    text = post.lower()
    if any(marker in text for marker in ("link in bio", "webinar")):
        return json.loads(FEW_SHOT_ASSISTANT)

    # (key, score, rationale) for each scored dimension of the generic report.
    dimension_rows = (
        ("hook", 2, "Opening lacks a clear stake or benefit."),
        ("clarity", 2, "Multiple topics mixed in one dump."),
        ("audienceFit", 3, "Jargon may fit insiders but structure is rough."),
        ("goalService", 2, "Does not clearly drive the stated goal actions."),
        ("cta", 2, "Call to action is logistics-only or missing deadline."),
    )
    # (code, message) for each warning; severity and source are constant.
    warning_rows = (
        ("MIXED_MESSAGES", "Artifact, task, and logistics appear mixed."),
        ("NO_CLEAR_CTA", "No explicit personal action with a deadline."),
    )

    brief_check = {
        "status": "ok",
        "inferred": {
            "goal": "Unclear from post — needs editor review",
            "audience": "In-context colleagues",
        },
        "gaps": [],
    }
    goal_alignment = {
        "overall": 40,
        "cappedBy": [],
        "dimensions": [
            {"key": key, "score": score, "rationale": rationale}
            for key, score, rationale in dimension_rows
        ],
        "summary": "Mock audit (set MODAL_AUDIT_URL for live Gemma 4 E4B). Post needs structure and a clearer CTA.",
    }
    warnings = [
        {"code": code, "severity": "warning", "source": "llm", "message": message}
        for code, message in warning_rows
    ]
    rewrite_hints = [
        "Lead with why the reader should act now.",
        "Separate artifact, task, and logistics into sections.",
        "Add one explicit CTA with a deadline.",
    ]
    return {
        "briefCheck": brief_check,
        "auditReport": {
            "goalAlignment": goal_alignment,
            "warnings": warnings,
            "rewriteHints": rewrite_hints,
        },
    }