yinuozhang committed on
Commit
660dc20
·
verified ·
1 Parent(s): 7116be1

try to fix storage

Browse files
Files changed (1) hide show
  1. app.py +65 -5
app.py CHANGED
@@ -1,20 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import gradio as gr
2
  import sys
3
  import pandas as pd
4
  from transformers import AutoTokenizer, AutoModel, AutoConfig
5
- # Add the current directory to the system path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  metalatte_path = '.'
7
  sys.path.insert(0, metalatte_path)
8
 
9
  # Import the custom configuration and model
10
  from configuration import MetaLATTEConfig
11
- from modeling_metalatte import MultitaskProteinModel
12
  AutoConfig.register("metalatte", MetaLATTEConfig)
13
  AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)
14
 
15
- tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")
16
- config = AutoConfig.from_pretrained("ChatterjeeLab/MetaLATTE")
17
- model = AutoModel.from_pretrained("ChatterjeeLab/MetaLATTE", config=config)
 
 
18
 
19
  def predict(sequence):
20
  inputs = tokenizer(sequence, return_tensors="pt")
 
1
# ---- BOOTSTRAP: keep storage under control on Spaces ----
import os
import shutil
import subprocess

# 1) Put ALL caches in /data so they're manageable & persistent.
#    This must run BEFORE huggingface_hub / transformers are imported: those
#    libraries resolve their cache locations from the environment once, at
#    import time, so setting the variables afterwards has no effect on where
#    default-cache downloads are written.
os.makedirs("/data/.cache", exist_ok=True)
os.environ.setdefault("XDG_CACHE_HOME", "/data/.cache")
os.environ.setdefault("HF_HOME", "/data/.cache/huggingface")
os.environ.setdefault("HF_HUB_CACHE", "/data/.cache/huggingface/hub")
os.environ.setdefault("TRANSFORMERS_CACHE", "/data/.cache/huggingface/transformers")
os.environ.setdefault("DATASETS_CACHE", "/data/.cache/huggingface/datasets")

# Deliberately imported after the environment is configured (see above).
from huggingface_hub import scan_cache_dir, snapshot_download  # noqa: E402

# 2) Prune stale HF cache revisions. A revision is considered stale when no
#    ref (branch/tag) points at it any more. Revisions explicitly pinned via
#    MODEL_REV / TOKENIZER_REV are kept so they are not re-downloaded on every
#    restart. Best-effort: any failure (e.g. the cache does not exist yet) is
#    reported and startup continues.
try:
    cache = scan_cache_dir(os.environ["HF_HUB_CACHE"])
    pinned = tuple(
        pin
        for pin in (os.getenv("MODEL_REV", ""), os.getenv("TOKENIZER_REV", ""))
        if pin
    )
    stale_hashes = []
    for repo in cache.repos:
        for rev in repo.revisions:
            if rev.refs:
                continue  # still referenced by a branch or tag
            if rev.commit_hash.startswith(pinned):
                continue  # explicitly pinned by the Space configuration
            stale_hashes.append(rev.commit_hash)
    if stale_hashes:
        # delete_revisions() only builds a plan; execute() performs the removal.
        strategy = cache.delete_revisions(*stale_hashes)
        print(f"[cache prune] freeing {strategy.expected_freed_size_str}")
        strategy.execute()
except Exception as e:
    print(f"[cache prune] skipped: {e}")

# (Optional) light guard: trim pip wheel cache. Best-effort as well; a missing
# pip executable must not prevent the app from starting.
try:
    subprocess.run(["pip", "cache", "purge"], check=False)
except Exception as e:
    print(f"[pip cache purge] skipped: {e}")
# ---- END BOOTSTRAP ----
26
+
27
  import gradio as gr
28
  import sys
29
  import pandas as pd
30
  from transformers import AutoTokenizer, AutoModel, AutoConfig
31
+
32
# Hub repositories this app loads at startup.
MODEL_ID = "ChatterjeeLab/MetaLATTE"
TOKENIZER_ID = "facebook/esm2_t33_650M_UR50D"

# Optional revision pins, read from the environment (on Spaces: Settings →
# Variables). Set them for fully reproducible rebuilds; an empty string means
# "no pin", i.e. use the latest revision.
MODEL_REV = os.environ.get("MODEL_REV", "")  # e.g. "a1b2c3d"
TOKENIZER_REV = os.environ.get("TOKENIZER_REV", "")  # e.g. "9f8e7d6"
38
+
39
+ # Prefer downloading *exactly* what you need to /data and load locally.
40
+ # This avoids multiple revision copies over time.
41
def maybe_snapshot(repo_id, revision, allow_patterns):
    """Download a filtered snapshot of a Hub repo and return its local folder.

    Args:
        repo_id: Hub repository id, e.g. ``"ChatterjeeLab/MetaLATTE"``.
        revision: Branch, tag or commit hash to pin. A falsy value (empty
            string / ``None``) leaves the revision unset so the library's
            default is used.
        allow_patterns: Glob patterns selecting which repo files to fetch.

    Returns:
        The local snapshot directory as returned by ``snapshot_download``.
    """
    # NOTE: the removed ``ignore_regex`` keyword must not be passed here;
    # current huggingface_hub releases reject it with a TypeError.
    kw = dict(repo_id=repo_id, allow_patterns=allow_patterns)
    if revision:
        kw["revision"] = revision
    # Pass the cache location explicitly so the download lands in the
    # configured hub cache even if huggingface_hub was imported before the
    # environment variable was set.
    cache_dir = os.environ.get("HF_HUB_CACHE")
    if cache_dir:
        kw["cache_dir"] = cache_dir
    return snapshot_download(**kw)
47
+
48
# Tokenizer: fetch only the (small) tokenizer / vocabulary files.
_TOKENIZER_FILE_PATTERNS = [
    "tokenizer.json",
    "tokenizer_config.json",
    "vocab.*",
    "merges.*",
    "special_tokens_map.json",
    "*.model",
    "tokenizer*.txt",
    "spiece.*",
    "*.tiktoken",
]
esm_local = maybe_snapshot(
    TOKENIZER_ID,
    TOKENIZER_REV,
    allow_patterns=_TOKENIZER_FILE_PATTERNS,
)

# MetaLATTE: weights + config only (keep the pattern list tight).
# NOTE(review): both "*.safetensors" and "*.bin" are allowed; if the repo ships
# the weights in both formats, both will be downloaded - confirm this is wanted.
_MODEL_FILE_PATTERNS = ["*.json", "*.safetensors", "*.bin", "*.model", "*.txt"]
metalatte_local = maybe_snapshot(
    MODEL_ID,
    MODEL_REV,
    allow_patterns=_MODEL_FILE_PATTERNS,
)
62
+
63
# Make the app's own modules (imported below) resolvable by putting the
# current directory at the front of the import path.
metalatte_path = '.'
sys.path.insert(0, metalatte_path)

# Import the custom configuration and model, then register them so the Auto*
# factories can resolve the "metalatte" model type.
from configuration import MetaLATTEConfig
from modeling_metalatte import MultitaskProteinModel

AutoConfig.register("metalatte", MetaLATTEConfig)
AutoModel.register(MetaLATTEConfig, MultitaskProteinModel)

# Load strictly from the local snapshot directories; local_files_only=True
# prevents any further network access / re-download at this point.
_offline = {"local_files_only": True}
tokenizer = AutoTokenizer.from_pretrained(esm_local, **_offline)
config = AutoConfig.from_pretrained(metalatte_local, **_offline)
model = AutoModel.from_pretrained(metalatte_local, config=config, **_offline)
77
+
78
 
79
  def predict(sequence):
80
  inputs = tokenizer(sequence, return_tensors="pt")