Spaces:

SanidhyaDhangar
/

Plaiglab

Sleeping

App Files Files Community

Plaiglab / scripts /get_models.py

SanidhyaDhangar

PlaigLab — Hugging Face Space (Docker) clean deploy

ebebfe8 11 days ago

Raw

History Blame Contribute Delete

2.97 kB

	"""Download the ONNX models for Phase A (semantic embeddings + LM detectors).

	models/minilm/ all-MiniLM-L6-v2 (semantic paraphrase signal S2)
	models/gpt2/ GPT-2 124M quantized (Binoculars observer + perplexity)
	models/distilgpt2/ distilGPT-2 quantized (Binoculars performer)

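	Quantized int8 files are preferred — total ~250MB, CPU-friendly, no torch.
	If a quantized file is not available, the unquantized ONNX export is
	downloaded instead.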
	"""
	import os
	import sys

	import requests

	# Absolute path of this script; ROOT is two directory levels above the file.
	_HERE = os.path.abspath(__file__)
	ROOT = os.path.dirname(os.path.dirname(_HERE))
	# Every model file is stored beneath <ROOT>/models.
	MODELS = os.path.join(ROOT, "models")
	# Headers sent with each download request.
	UA = {"User-Agent": "plagcheck-edu/0.1"}

	# Base URL that repository file URLs are built from.
	HF = "https://huggingface.co"

	# One entry per local file:
	#   (repo id,
	#    candidate remote paths, tried in order until one downloads,
	#    sub-directory of MODELS,
	#    local file name)
	DOWNLOADS = [
	    (
	        "sentence-transformers/all-MiniLM-L6-v2",
	        ["onnx/model_quint8_avx2.onnx", "onnx/model.onnx"],
	        "minilm",
	        "model.onnx",
	    ),
	    (
	        "sentence-transformers/all-MiniLM-L6-v2",
	        ["tokenizer.json"],
	        "minilm",
	        "tokenizer.json",
	    ),
	    (
	        "Xenova/gpt2",
	        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
	        "gpt2",
	        "model.onnx",
	    ),
	    (
	        "Xenova/gpt2",
	        ["tokenizer.json"],
	        "gpt2",
	        "tokenizer.json",
	    ),
	    (
	        "Xenova/distilgpt2",
	        ["onnx/decoder_model_quantized.onnx", "onnx/decoder_model.onnx"],
	        "distilgpt2",
	        "model.onnx",
	    ),
	]


	def _download(url, dest, label):
	    """Stream *url* into *dest* through a temporary ``.part`` file.

	    Args:
	        url: Fully qualified URL to download.
	        dest: Final path of the file on disk.
	        label: Short name shown in the progress line.

	    Returns:
	        True when the file was fully written and moved to *dest*;
	        False on a non-200 status, a request error, or a short read.
	    """
	    tmp = dest + ".part"
	    try:
	        # The context manager closes the streamed response on every exit
	        # path, so the connection is released even when we bail out early.
	        with requests.get(url, headers=UA, stream=True, timeout=60) as r:
	            if r.status_code != 200:
	                print(f"[miss] {url} -> HTTP {r.status_code}")
	                return False
	            total = int(r.headers.get("content-length") or 0)
	            done = 0
	            with open(tmp, "wb") as f:
	                for chunk in r.iter_content(1 << 20):
	                    f.write(chunk)
	                    done += len(chunk)
	                    if total:
	                        pct = 100 * done / total
	                        print(f"\r[get ] {label} {pct:5.1f}% "
	                              f"({done/1e6:.0f}/{total/1e6:.0f} MB)",
	                              end="", flush=True)
	            print()
	            # Content-Length counts encoded bytes, so the comparison is only
	            # meaningful when the body was not content-encoded (e.g. gzip).
	            encoded = r.headers.get("content-encoding")
	            if total and not encoded and done < total:
	                print(f"[fail] {url}: incomplete download "
	                      f"({done} of {total} bytes)")
	                return False
	        os.replace(tmp, dest)
	        return True
	    except requests.RequestException as exc:
	        print(f"[fail] {url}: {exc}")
	        return False
	    finally:
	        # After a successful os.replace the temp file no longer exists;
	        # on any failure this removes the partial download.
	        if os.path.exists(tmp):
	            try:
	                os.remove(tmp)
	            except OSError:
	                pass


	def fetch(repo, remote_alts, subdir, name):
	    """Download one file into ``MODELS/<subdir>/<name>``.

	    An existing destination file larger than 10 000 bytes is kept and the
	    download is skipped. Otherwise each path in *remote_alts* is tried in
	    order and the first one that downloads completely is used.

	    Args:
	        repo: Repository id used to build the download URL.
	        remote_alts: Candidate file paths inside the repository.
	        subdir: Directory under ``MODELS`` that receives the file.
	        name: Local file name.

	    Returns:
	        True if the file is present afterwards, False if every candidate
	        failed.
	    """
	    dest_dir = os.path.join(MODELS, subdir)
	    os.makedirs(dest_dir, exist_ok=True)
	    dest = os.path.join(dest_dir, name)
	    if os.path.exists(dest) and os.path.getsize(dest) > 10_000:
	        print(f"[skip] {subdir}/{name} already present "
	              f"({os.path.getsize(dest)/1e6:.1f} MB)")
	        return True
	    for remote in remote_alts:
	        url = f"{HF}/{repo}/resolve/main/{remote}"
	        if _download(url, dest, f"{subdir}/{name}"):
	            print(f"[done] {subdir}/{name} <- {remote} "
	                  f"({os.path.getsize(dest)/1e6:.1f} MB)")
	            return True
	    return False


	if __name__ == "__main__":
	    # Attempt every entry before reporting, so one failure does not stop
	    # the remaining downloads from being tried.
	    outcomes = [fetch(*entry) for entry in DOWNLOADS]
	    all_ok = all(outcomes)
	    print("ALL MODELS READY" if all_ok else "SOME MODELS MISSING")
	    # Exit status 0 when everything is present, 1 otherwise.
	    sys.exit(0 if all_ok else 1)