Spaces:
Sleeping
Sleeping
| """Download the ONNX models for Phase A (semantic embeddings + LM detectors). | |
| models/minilm/ all-MiniLM-L6-v2 (semantic paraphrase signal S2) | |
| models/gpt2/ GPT-2 124M quantized (Binoculars observer + perplexity) | |
| models/distilgpt2/ distilGPT-2 quantized (Binoculars performer) | |
| All quantized int8 — total ~250MB, CPU-friendly, no torch. | |
| """ | |
| import os | |
| import sys | |
| import requests | |
# Two directory levels above this file; downloaded models live in <ROOT>/models.
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
MODELS = os.path.join(ROOT, "models")

# Request headers sent with every download, and the base URL of the model hub.
UA = {"User-Agent": "plagcheck-edu/0.1"}
HF = "https://huggingface.co"

# One entry per local file:
#   (repo, remote candidates, local subdirectory, local file name)
# Candidates are tried in order and the first one that downloads is used.
DOWNLOADS = [
    (
        "sentence-transformers/all-MiniLM-L6-v2",
        ["onnx/model_quint8_avx2.onnx", "onnx/model.onnx"],
        "minilm",
        "model.onnx",
    ),
    (
        "sentence-transformers/all-MiniLM-L6-v2",
        ["tokenizer.json"],
        "minilm",
        "tokenizer.json",
    ),
    (
        "Xenova/gpt2",
        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
        "gpt2",
        "model.onnx",
    ),
    (
        "Xenova/gpt2",
        ["tokenizer.json"],
        "gpt2",
        "tokenizer.json",
    ),
    (
        "Xenova/distilgpt2",
        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
        "distilgpt2",
        "model.onnx",
    ),
]
def fetch(repo, remote_alts, subdir, name):
    """Download one file from the hub into ``MODELS/<subdir>/<name>``.

    Args:
        repo: Hub repository id, e.g. ``"Xenova/gpt2"``.
        remote_alts: Candidate paths inside the repository, tried in order;
            the first one that downloads successfully is kept.
        subdir: Directory under ``MODELS`` that receives the file.
        name: Local file name to store the download under.

    Returns:
        True if the file was already present or was downloaded, False if
        every candidate failed.
    """
    dest_dir = os.path.join(MODELS, subdir)
    os.makedirs(dest_dir, exist_ok=True)
    dest = os.path.join(dest_dir, name)

    # Anything larger than 10 kB is treated as a finished earlier download.
    if os.path.exists(dest) and os.path.getsize(dest) > 10_000:
        print(f"[skip] {subdir}/{name} already present "
              f"({os.path.getsize(dest)/1e6:.1f} MB)")
        return True

    # Data is streamed to a sibling ".part" file and renamed only once the
    # transfer is complete, so `dest` never holds a half-written model.
    tmp = dest + ".part"
    for remote in remote_alts:
        url = f"{HF}/{repo}/resolve/main/{remote}"
        try:
            # The context manager releases the streamed connection on every
            # path, including the non-200 `continue` below.
            with requests.get(url, headers=UA, stream=True, timeout=60) as r:
                if r.status_code != 200:
                    print(f"[miss] {url} -> HTTP {r.status_code}")
                    continue
                total = int(r.headers.get("content-length") or 0)
                # With a Content-Encoding the header counts compressed bytes
                # while iter_content yields decoded ones, so sizes only
                # compare meaningfully when no encoding was applied.
                encoded = bool(r.headers.get("content-encoding"))
                done = 0
                with open(tmp, "wb") as f:
                    for chunk in r.iter_content(1 << 20):
                        f.write(chunk)
                        done += len(chunk)
                        if total:
                            pct = 100 * done / total
                            print(f"\r[get ] {subdir}/{name} {pct:5.1f}% "
                                  f"({done/1e6:.0f}/{total/1e6:.0f} MB)",
                                  end="", flush=True)
            print()
            if total and not encoded and done != total:
                # A short body must not be promoted: the size-based skip
                # check above would accept it on every later run.
                print(f"[fail] {url}: incomplete download "
                      f"({done}/{total} bytes)")
                continue
            os.replace(tmp, dest)
            print(f"[done] {subdir}/{name} <- {remote} "
                  f"({os.path.getsize(dest)/1e6:.1f} MB)")
            return True
        except requests.RequestException as exc:
            print(f"[fail] {url}: {exc}")
        finally:
            # After a successful os.replace the temp file no longer exists;
            # on any other exit, drop the partial data.
            if os.path.exists(tmp):
                os.remove(tmp)
    return False
if __name__ == "__main__":
    # Build the full result list first so every entry is attempted even
    # after an earlier one has failed (no short-circuiting).
    results = [fetch(*entry) for entry in DOWNLOADS]
    ok = all(results)
    print("ALL MODELS READY" if ok else "SOME MODELS MISSING")
    # Exit status 0 only when every file is available.
    sys.exit(0 if ok else 1)