"""Download the ONNX models for Phase A (semantic embeddings + LM detectors).

  models/minilm/       all-MiniLM-L6-v2 (semantic paraphrase signal S2)
  models/gpt2/         GPT-2 124M quantized (Binoculars observer + perplexity)
  models/distilgpt2/   distilGPT-2 quantized (Binoculars performer)

All quantized int8 — total ~250MB, CPU-friendly, no torch.
"""
import os
import sys

import requests

ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODELS = os.path.join(ROOT, "models")
UA = {"User-Agent": "plagcheck-edu/0.1"}
HF = "https://huggingface.co"

# An existing local file must be larger than this (in bytes) to count as
# already downloaded; anything smaller is fetched again.
MIN_VALID_BYTES = 10_000

# Each entry: (repo, remote file alternatives, local dir, local name).
# Alternatives are tried in order; the first one that downloads wins.
DOWNLOADS = [
    ("sentence-transformers/all-MiniLM-L6-v2",
     ["onnx/model_quint8_avx2.onnx", "onnx/model.onnx"],
     "minilm", "model.onnx"),
    ("sentence-transformers/all-MiniLM-L6-v2",
     ["tokenizer.json"],
     "minilm", "tokenizer.json"),
    ("Xenova/gpt2",
     ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
     "gpt2", "model.onnx"),
    ("Xenova/gpt2",
     ["tokenizer.json"],
     "gpt2", "tokenizer.json"),
    ("Xenova/distilgpt2",
     ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
     "distilgpt2", "model.onnx"),
]


def _discard(path):
    """Remove *path* if it exists; a missing file is not an error."""
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def _download(url, tmp, label):
    """Stream *url* into the file *tmp*, printing progress under *label*.

    Returns True when a complete body was written, False when the server
    answered with a non-200 status or the body was shorter/longer than
    announced.  Network errors propagate as ``requests.RequestException``.
    """
    # The context manager releases the connection on every exit path,
    # including the early return for non-200 responses.
    with requests.get(url, headers=UA, stream=True, timeout=60) as resp:
        if resp.status_code != 200:
            print(f"[miss] {url} -> HTTP {resp.status_code}")
            return False

        total = int(resp.headers.get("content-length") or 0)
        done = 0
        with open(tmp, "wb") as out:
            for chunk in resp.iter_content(1 << 20):
                out.write(chunk)
                done += len(chunk)
                if total:
                    pct = 100 * done / total
                    print(f"\r[get ] {label} {pct:5.1f}% "
                          f"({done/1e6:.0f}/{total/1e6:.0f} MB)",
                          end="", flush=True)
        print()

        # Content-Length counts bytes on the wire, while iter_content yields
        # decoded bytes, so the comparison is only meaningful when the body
        # was sent without a Content-Encoding.
        encoded = resp.headers.get("content-encoding")
        if total and not encoded and done != total:
            print(f"[fail] {url}: incomplete download "
                  f"({done} of {total} bytes)")
            return False
    return True


def fetch(repo, remote_alts, subdir, name):
    """Ensure ``models/<subdir>/<name>`` exists, downloading it if needed.

    Args:
        repo: Hugging Face repository id, e.g. ``"Xenova/gpt2"``.
        remote_alts: Remote file paths inside the repo, tried in order until
            one downloads successfully.
        subdir: Directory under ``MODELS`` that receives the file.
        name: Local file name inside ``subdir``.

    Returns:
        True if the file was already present or was downloaded, False if
        every alternative failed.
    """
    dest_dir = os.path.join(MODELS, subdir)
    os.makedirs(dest_dir, exist_ok=True)
    dest = os.path.join(dest_dir, name)
    if os.path.exists(dest) and os.path.getsize(dest) > MIN_VALID_BYTES:
        print(f"[skip] {subdir}/{name} already present "
              f"({os.path.getsize(dest)/1e6:.1f} MB)")
        return True

    # Download to a side file and rename at the end, so an interrupted
    # transfer never leaves a partial file under the final name.
    tmp = dest + ".part"
    label = f"{subdir}/{name}"
    for remote in remote_alts:
        url = f"{HF}/{repo}/resolve/main/{remote}"
        try:
            if not _download(url, tmp, label):
                continue
            os.replace(tmp, dest)
        except requests.RequestException as exc:
            print(f"[fail] {url}: {exc}")
            continue
        finally:
            # No-op after a successful rename; otherwise drops the leftover
            # partial file from a failed or rejected attempt.
            _discard(tmp)
        print(f"[done] {subdir}/{name} <- {remote} "
              f"({os.path.getsize(dest)/1e6:.1f} MB)")
        return True
    return False


def main():
    """Fetch every entry in ``DOWNLOADS``; return a process exit code."""
    ok = True
    for repo, alts, subdir, name in DOWNLOADS:
        # fetch() is called first so a failure does not stop later downloads.
        ok = fetch(repo, alts, subdir, name) and ok
    print("ALL MODELS READY" if ok else "SOME MODELS MISSING")
    return 0 if ok else 1


if __name__ == "__main__":
    sys.exit(main())