noticecheck/scripts/generate_example_cache.py
Abid Ali Awan
Refactor and rebrand app as NoticeCheck
6cf9641
Raw
History Blame Contribute Delete
3.6 kB
"""Generate bundled example assessments with the configured model endpoint."""
from __future__ import annotations
import base64
import json
import mimetypes
import sys
from datetime import date
from pathlib import Path
ROOT = Path(__file__).resolve().parents[1]
sys.path.insert(0, str(ROOT))
from app.config import EXAMPLE_CACHE_PATH, model_config # noqa: E402
from app.model_endpoint import call_model # noqa: E402
# Sample notice messages keyed by example id. main() passes each value as the
# ``text`` input when generating the cached assessment for that id.
TEXT_EXAMPLES = {
    "text-courier": (
        "PAKISTAN POST: Your parcel address is incomplete. Pay Rs. 85 today at "
        "http://pakpost-delivery.xyz or the parcel will be destroyed."
    ),
    "text-fbr": (
        "FBR REFUND: You are eligible for Rs 42,500. Submit your CNIC and bank "
        "card details at the link today to receive payment."
    ),
    "text-bank": (
        "HBL Security: Your account will be suspended. Share the OTP sent to "
        "your phone with our support team immediately."
    ),
}
# Example image files under ROOT/static, keyed by example id. main() converts
# each file to a base64 data URL and passes it as the ``image`` input.
# NOTE(review): "example-trafic.png" is spelled with a single "f" -- confirm it
# matches the file name on disk before changing it.
IMAGE_EXAMPLES = {
    "image-courier": ROOT / "static" / "example-courier.jpeg",
    "image-mobile": ROOT / "static" / "example-mobile.png",
    "image-traffic": ROOT / "static" / "example-trafic.png",
}
def image_data_url(path: Path) -> str:
    """Return the contents of ``path`` encoded as a base64 ``data:`` URL.

    The media type is guessed from the file name alone; when no type can be
    guessed, ``application/octet-stream`` is used instead.
    """
    guessed_type, _ = mimetypes.guess_type(path.name)
    media_type = guessed_type if guessed_type else "application/octet-stream"
    payload = base64.b64encode(path.read_bytes()).decode("ascii")
    return "data:" + media_type + ";base64," + payload
def quality_issue(example_id: str, assessment: dict[str, object]) -> str:
    """Return a description of the first quality problem found, or ``""``.

    Checks, in order:

    1. the safe next steps mention "social media";
    2. the explanation contains "in the future" or "in the past";
    3. for the ``image-traffic`` example only, the safe next steps mention
       "fbr" or "nadra".

    All comparisons are case-insensitive substring matches.

    Args:
        example_id: Identifier of the example being assessed.
        assessment: Model output; must contain ``simple_explanation`` and
            ``safe_next_steps`` (an iterable of steps, or a single string).

    Returns:
        A short human-readable issue description, or an empty string when
        none of the checks fire.

    Raises:
        KeyError: If either required key is missing from ``assessment``.
    """
    explanation = str(assessment["simple_explanation"]).lower()

    steps = assessment["safe_next_steps"]
    if isinstance(steps, str):
        # A bare string would be iterated character by character, producing
        # "s o c i a l ..." after the join and silently defeating every
        # phrase check below. Treat it as one step instead.
        steps = [steps]
    next_steps = " ".join(
        str(item) for item in steps  # type: ignore[union-attr]
    ).lower()

    if "social media" in next_steps:
        return "safe next steps recommend social media"
    if any(phrase in explanation for phrase in ("in the future", "in the past")):
        return "explanation makes an unsupported date comparison"
    if example_id == "image-traffic" and any(
        name in next_steps for name in ("fbr", "nadra")
    ):
        return "traffic fine advice names an unrelated authority"
    return ""
def generate_assessment(
    example_id: str,
    *,
    text: str = "",
    image: str = "",
) -> dict[str, object]:
    """Ask the model for an assessment, retrying until it passes quality checks.

    The model is called up to three times with the same ``text`` and ``image``
    inputs. The first assessment for which ``quality_issue`` reports nothing
    is returned; progress is printed for every attempt.

    Raises:
        RuntimeError: If all three attempts fail the quality checks. The
            message carries the issue reported for the final attempt.
    """
    issue = ""
    for attempt_number in (1, 2, 3):
        candidate = call_model(text, image)
        issue = quality_issue(example_id, candidate)
        if issue:
            print(f"{example_id}: retrying after attempt {attempt_number}: {issue}")
            continue
        print(f"{example_id}: accepted on attempt {attempt_number}")
        return candidate
    raise RuntimeError(f"{example_id} failed cache quality checks: {issue}")
def main() -> None:
    """Generate an assessment for every bundled example and write the cache.

    Text examples are processed first, then image examples. The results are
    written to ``EXAMPLE_CACHE_PATH`` as indented, ASCII-escaped JSON together
    with the model configuration fields and today's date.
    """
    config = model_config()

    examples: dict[str, dict[str, object]] = {}
    for example_id, text in TEXT_EXAMPLES.items():
        examples[example_id] = generate_assessment(example_id, text=text)
    for example_id, path in IMAGE_EXAMPLES.items():
        data_url = image_data_url(path)
        examples[example_id] = generate_assessment(example_id, image=data_url)

    document = {
        "model_repo": config.repo_id,
        "model_name": config.filename,
        "endpoint": config.source,
        "endpoint_type": "In-process llama.cpp runtime",
        "generated_at": date.today().isoformat(),
        "examples": examples,
    }

    serialized = json.dumps(document, indent=2, ensure_ascii=True)
    EXAMPLE_CACHE_PATH.write_text(serialized + "\n", encoding="utf-8")
    print(f"Generated {len(examples)} assessments in {EXAMPLE_CACHE_PATH}")
# Regenerate the cache only when this file is run as a script, not on import.
if __name__ == "__main__":
    main()