noticecheck

Running on Zero

noticecheck / scripts /generate_example_cache.py

Abid Ali Awan

Refactor and rebrand app as NoticeCheck

6cf9641 17 days ago

3.6 kB

	"""Generate bundled example assessments with the configured model endpoint."""

	from __future__ import annotations

	import base64
	import json
	import mimetypes
	import sys
	from datetime import date
	from pathlib import Path

	ROOT = Path(__file__).resolve().parents[1]
	sys.path.insert(0, str(ROOT))

	from app.config import EXAMPLE_CACHE_PATH, model_config # noqa: E402
	from app.model_endpoint import call_model # noqa: E402

	TEXT_EXAMPLES = {
	"text-courier": (
	"PAKISTAN POST: Your parcel address is incomplete. Pay Rs. 85 today at "
	"http://pakpost-delivery.xyz or the parcel will be destroyed."
	),
	"text-fbr": (
	"FBR REFUND: You are eligible for Rs 42,500. Submit your CNIC and bank "
	"card details at the link today to receive payment."
	),
	"text-bank": (
	"HBL Security: Your account will be suspended. Share the OTP sent to "
	"your phone with our support team immediately."
	),
	}

	IMAGE_EXAMPLES = {
	"image-courier": ROOT / "static" / "example-courier.jpeg",
	"image-mobile": ROOT / "static" / "example-mobile.png",
	"image-traffic": ROOT / "static" / "example-trafic.png",
	}


	def image_data_url(path: Path) -> str:
	mime_type = mimetypes.guess_type(path.name)[0] or "application/octet-stream"
	encoded = base64.b64encode(path.read_bytes()).decode("ascii")
	return f"data:{mime_type};base64,{encoded}"


	def quality_issue(example_id: str, assessment: dict[str, object]) -> str:
	explanation = str(assessment["simple_explanation"]).lower()
	next_steps = " ".join(
	str(item) for item in assessment["safe_next_steps"] # type: ignore[union-attr]
	).lower()
	if "social media" in next_steps:
	return "safe next steps recommend social media"
	if any(phrase in explanation for phrase in ("in the future", "in the past")):
	return "explanation makes an unsupported date comparison"
	if example_id == "image-traffic" and any(
	name in next_steps for name in ("fbr", "nadra")
	):
	return "traffic fine advice names an unrelated authority"
	return ""


	def generate_assessment(
	example_id: str,
	*,
	text: str = "",
	image: str = "",
	) -> dict[str, object]:
	last_issue = ""
	for attempt in range(1, 4):
	assessment = call_model(text, image)
	last_issue = quality_issue(example_id, assessment)
	if not last_issue:
	print(f"{example_id}: accepted on attempt {attempt}")
	return assessment
	print(f"{example_id}: retrying after attempt {attempt}: {last_issue}")
	raise RuntimeError(f"{example_id} failed cache quality checks: {last_issue}")


	def main() -> None:
	config = model_config()
	examples = {
	example_id: generate_assessment(example_id, text=text)
	for example_id, text in TEXT_EXAMPLES.items()
	}
	examples.update(
	{
	example_id: generate_assessment(
	example_id,
	image=image_data_url(path),
	)
	for example_id, path in IMAGE_EXAMPLES.items()
	}
	)

	document = {
	"model_repo": config.repo_id,
	"model_name": config.filename,
	"endpoint": config.source,
	"endpoint_type": "In-process llama.cpp runtime",
	"generated_at": date.today().isoformat(),
	"examples": examples,
	}
	EXAMPLE_CACHE_PATH.write_text(
	json.dumps(document, indent=2, ensure_ascii=True) + "\n",
	encoding="utf-8",
	)
	print(f"Generated {len(examples)} assessments in {EXAMPLE_CACHE_PATH}")


	if __name__ == "__main__":
	main()