# MetaLATTE-demo / app.py — Gradio Space entry point.
# (Hugging Face Space file header: uploaded by yinuozhang,
#  commit 660dc20 "try to fix storage", 3.83 kB)
# ---- BOOTSTRAP: keep storage under control on Spaces ----
import os, shutil, subprocess

# 1) Put ALL caches in /data so they're manageable & persistent.
# IMPORTANT: these env vars must be exported BEFORE huggingface_hub is
# imported — the library resolves HF_HOME / HF_HUB_CACHE at import time,
# so setting them after the import silently has no effect.
os.makedirs("/data/.cache", exist_ok=True)
os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/data/.cache/huggingface/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/huggingface/transformers")
os.environ.setdefault("DATASETS_CACHE", "/data/.cache/huggingface/datasets")

from huggingface_hub import scan_cache_dir, snapshot_download

# 2) Prune cached HF revisions to reclaim disk; everything this app needs
#    is re-downloaded right below, so wiping the cache is safe.
try:
    cache = scan_cache_dir(os.environ["HF_HUB_CACHE"])
    # Revisions hang off each cached repo (HFCacheInfo has no .revisions),
    # delete_revisions() takes commit hashes as *args, and it returns a
    # DeleteCacheStrategy that must be execute()d to actually free space.
    stale_hashes = [
        rev.commit_hash
        for repo in cache.repos
        for rev in repo.revisions
    ]
    if stale_hashes:
        cache.delete_revisions(*stale_hashes).execute()
except Exception as e:
    # Best-effort: a fresh Space has no cache yet — never crash on startup.
    print(f"[cache prune] skipped: {e}")

# (Optional) light guard: trim pip wheel cache
try:
    subprocess.run(["pip", "cache", "purge"], check=False)
except Exception:
    pass
# ---- END BOOTSTRAP ----
import gradio as gr
import sys
import pandas as pd
from transformers import AutoTokenizer, AutoModel, AutoConfig
# If you want fully reproducible rebuilds, set these in Space → Settings → Variables
# (or leave blank to use latest)
# Repo IDs on the Hugging Face Hub: the MetaLATTE weights and the ESM-2
# tokenizer it was trained with.
MODEL_ID = "ChatterjeeLab/MetaLATTE"
TOKENIZER_ID = "facebook/esm2_t33_650M_UR50D"
# Optional commit-hash pins; empty string means "use the latest revision".
MODEL_REV = os.getenv("MODEL_REV", "") # e.g. "a1b2c3d"
TOKENIZER_REV = os.getenv("TOKENIZER_REV", "") # e.g. "9f8e7d6"
# Prefer downloading *exactly* what you need to /data and load locally.
# This avoids multiple revision copies over time.
def maybe_snapshot(repo_id, revision, allow_patterns):
    """Download only the files matching *allow_patterns* from *repo_id*.

    Files land in the HF hub cache (redirected to /data by the bootstrap),
    and the resolved local snapshot directory is returned so callers can
    load with ``local_files_only=True`` on later rebuilds.

    Args:
        repo_id: Hub repository id, e.g. "ChatterjeeLab/MetaLATTE".
        revision: Commit hash / branch to pin; empty string means latest.
        allow_patterns: Glob patterns restricting which files to fetch.

    Returns:
        str: path to the local snapshot directory.
    """
    # NOTE: the legacy `ignore_regex` kwarg was removed from
    # snapshot_download (passing it raises TypeError on current
    # huggingface_hub); `allow_patterns` alone keeps the download tight.
    kw = {"repo_id": repo_id}
    if revision:
        # Only pin when a revision was explicitly configured.
        kw["revision"] = revision
    return snapshot_download(allow_patterns=allow_patterns, **kw)
# Download tokenizer files only (small)
# Patterns cover every tokenizer format transformers may need
# (fast JSON, BPE vocab/merges, sentencepiece, tiktoken).
esm_local = maybe_snapshot(
    TOKENIZER_ID, TOKENIZER_REV,
    allow_patterns=[
        "tokenizer.json","tokenizer_config.json","vocab.*","merges.*",
        "special_tokens_map.json","*.model","tokenizer*.txt","spiece.*","*.tiktoken"
    ]
)
# Download MetaLATTE (weights + config only)
metalatte_local = maybe_snapshot(
    MODEL_ID, MODEL_REV,
    allow_patterns=["*.json","*.safetensors","*.bin","*.model","*.txt"] # keep it tight
)
# Add the current directory to the system path for your custom code
# (configuration.py / modeling_metalatte.py live alongside this app).
metalatte_path = '.'
sys.path.insert(0, metalatte_path)
# Import the custom configuration and model
from configuration import MetaLATTEConfig
from modeling_metalatte import MultitaskProteinModel
# Register the custom architecture so AutoConfig/AutoModel can resolve
# the "metalatte" model_type from the downloaded config.json.
AutoConfig.register("metalatte", MetaLATTEConfig)
AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
# Load from the local snapshot dirs (avoids re-downloading on rebuilds)
tokenizer = AutoTokenizer.from_pretrained(esm_local, local_files_only=True)
config = AutoConfig.from_pretrained(metalatte_local, local_files_only=True)
model = AutoModel.from_pretrained(metalatte_local, config=config, local_files_only=True)
def predict(sequence):
    """Predict metal-binding labels for a single protein sequence.

    Args:
        sequence: Amino-acid sequence string entered by the user.

    Returns:
        pandas.DataFrame: one row with one column per metal label,
        containing '✓' where the model predicts binding and '' otherwise.
    """
    inputs = tokenizer(sequence, return_tensors="pt")
    # model.predict is the custom MultitaskProteinModel API; it returns
    # per-label probabilities and hard 0/1 predictions for the batch.
    raw_probs, predictions = model.predict(**inputs)
    id2label = config.id2label
    results = {}
    for i, pred in enumerate(predictions[0]):
        # Only the hard prediction is surfaced in the UI; the per-label
        # probability was previously computed here but never used, so the
        # dead per-element .item() work has been removed.
        results[id2label[i]] = '✓' if pred == 1 else ''
    return pd.DataFrame([results])
# Wire the predictor into a simple Gradio UI: one free-text sequence box
# in, one table out whose columns mirror the model's label set.
iface = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, placeholder="Enter protein sequence here..."),
    outputs=gr.Dataframe(headers=list(config.id2label.values())),
    title="MetaLATTE: Metal Binding Prediction",
    description="Enter a protein sequence to predict its metal binding properties."
)
iface.launch()