Plaiglab / scripts /get_models.py
SanidhyaDhangar's picture
PlaigLab — Hugging Face Space (Docker) clean deploy
ebebfe8
Raw
History Blame Contribute Delete
2.97 kB
"""Download the ONNX models for Phase A (semantic embeddings + LM detectors).
models/minilm/ all-MiniLM-L6-v2 (semantic paraphrase signal S2)
models/gpt2/ GPT-2 124M quantized (Binoculars observer + perplexity)
models/distilgpt2/ distilGPT-2 quantized (Binoculars performer)
All quantized int8 — total ~250MB, CPU-friendly, no torch.
"""
import os
import sys
import requests
# Absolute location of this script; ROOT is the parent of the directory
# that contains it.
_SCRIPT_PATH = os.path.abspath(__file__)
ROOT = os.path.dirname(os.path.dirname(_SCRIPT_PATH))

# Every downloaded file is stored somewhere under ROOT/models.
MODELS = os.path.join(ROOT, "models")

# HTTP headers sent with each download request.
UA = {"User-Agent": "plagcheck-edu/0.1"}

# Base URL that repository file URLs are built from.
HF = "https://huggingface.co"
# Download manifest. Each entry is a 4-tuple:
#   (hub repo id, remote path alternatives, local subdir, local file name)
# The remote alternatives are listed in priority order: the quantized
# export first, the plain export as a fallback.
DOWNLOADS = [
    (
        "sentence-transformers/all-MiniLM-L6-v2",
        ["onnx/model_quint8_avx2.onnx", "onnx/model.onnx"],
        "minilm",
        "model.onnx",
    ),
    (
        "sentence-transformers/all-MiniLM-L6-v2",
        ["tokenizer.json"],
        "minilm",
        "tokenizer.json",
    ),
    (
        "Xenova/gpt2",
        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
        "gpt2",
        "model.onnx",
    ),
    (
        "Xenova/gpt2",
        ["tokenizer.json"],
        "gpt2",
        "tokenizer.json",
    ),
    # NOTE(review): no tokenizer.json entry exists for distilgpt2 --
    # presumably the gpt2 tokenizer is reused; confirm against the loader.
    (
        "Xenova/distilgpt2",
        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
        "distilgpt2",
        "model.onnx",
    ),
]
def fetch(repo, remote_alts, subdir, name):
    """Download one file from a Hugging Face repo into ``MODELS/<subdir>``.

    The paths in ``remote_alts`` are tried in order against
    ``{HF}/{repo}/resolve/main/``; the first one that downloads completely
    is saved as ``MODELS/<subdir>/<name>``. Data is streamed into a
    ``.part`` file next to the destination and only renamed into place once
    the transfer has finished, so the final path never holds a partial file.

    Args:
        repo: Repository id, e.g. ``"Xenova/gpt2"``.
        remote_alts: Candidate file paths inside the repo, in priority order.
        subdir: Directory under ``MODELS`` to store the file in (created if
            it does not exist).
        name: Local file name.

    Returns:
        True if the file was already present (larger than 10 kB) or was
        downloaded successfully; False if every alternative failed.
    """
    dest_dir = os.path.join(MODELS, subdir)
    os.makedirs(dest_dir, exist_ok=True)
    dest = os.path.join(dest_dir, name)

    # An existing file above 10 kB is treated as a finished earlier download.
    if os.path.exists(dest) and os.path.getsize(dest) > 10_000:
        print(f"[skip] {subdir}/{name} already present "
              f"({os.path.getsize(dest)/1e6:.1f} MB)")
        return True

    tmp = dest + ".part"
    for remote in remote_alts:
        url = f"{HF}/{repo}/resolve/main/{remote}"
        try:
            # The context manager releases the streamed connection on every
            # path out of the block, including the non-200 `continue`.
            with requests.get(url, headers=UA, stream=True, timeout=60) as r:
                if r.status_code != 200:
                    print(f"[miss] {url} -> HTTP {r.status_code}")
                    continue
                total = int(r.headers.get("content-length") or 0)
                # With a Content-Encoding the header counts encoded bytes
                # while iter_content yields decoded ones, so the two numbers
                # are only comparable for unencoded bodies.
                encoded = bool(r.headers.get("content-encoding"))
                done = 0
                with open(tmp, "wb") as f:
                    for chunk in r.iter_content(1 << 20):
                        f.write(chunk)
                        done += len(chunk)
                        if total:
                            pct = 100 * done / total
                            print(f"\r[get ] {subdir}/{name} {pct:5.1f}% "
                                  f"({done/1e6:.0f}/{total/1e6:.0f} MB)",
                                  end="", flush=True)
            if total:
                # Terminate the in-place progress line.
                print()
            if total and not encoded and done != total:
                # The connection ended early without raising; do not promote
                # a truncated file to the final name.
                print(f"[fail] {url}: incomplete download "
                      f"({done}/{total} bytes)")
                continue
            os.replace(tmp, dest)
            print(f"[done] {subdir}/{name} <- {remote} "
                  f"({os.path.getsize(dest)/1e6:.1f} MB)")
            return True
        except requests.RequestException as exc:
            print(f"[fail] {url}: {exc}")
        finally:
            # After a successful os.replace the temp file is already gone;
            # in every other case remove the leftover partial download.
            if os.path.exists(tmp):
                os.remove(tmp)
    return False
if __name__ == "__main__":
    # Attempt every manifest entry, even after an earlier one has failed,
    # so a single run fetches as much as possible.
    results = [
        fetch(repo, alts, subdir, name)
        for repo, alts, subdir, name in DOWNLOADS
    ]
    everything_ready = all(results)

    if everything_ready:
        print("ALL MODELS READY")
    else:
        print("SOME MODELS MISSING")

    # Exit status 0 only when every file is available locally.
    sys.exit(0 if everything_ready else 1)