MetiMiester committed on
Commit
2c954bb
·
verified ·
1 Parent(s): 03e120f

Update download_assets.py

Browse files
Files changed (1) hide show
  1. download_assets.py +112 -119
download_assets.py CHANGED
@@ -1,119 +1,112 @@
1
- # download_assets.py — fetch models from a Google Drive FOLDER (no zips)
2
- # Expects a folder with subfolders: Text/, Image/, Audio/
3
- # Env: GDRIVE_FOLDER_ID=<your folder id>
4
- # Coder: Amir
5
-
6
- import os, sys, shutil, pathlib
7
- from typing import Optional
8
- from filelock import FileLock
9
- import gdown
10
-
11
- BASE = pathlib.Path(__file__).resolve().parent
12
- CACHE_DIR = BASE / "cache"
13
- DL_DIR = CACHE_DIR / "gdrive_folder" # where we download the Drive folder
14
-
15
- TARGETS = {
16
- "text": {
17
- "dst": BASE / "Text",
18
- "must": ["config.json", "tokenizer.json", "model.safetensors", "vocab.json"],
19
- "folder_name": "Text",
20
- },
21
- "image": {
22
- "dst": BASE / "Image",
23
- "must": ["resnet_safety_classifier.pth", "clip_safety_classifier.pth"],
24
- "folder_name": "Image",
25
- },
26
- "audio": {
27
- "dst": BASE / "Audio",
28
- "must": ["text_pipeline_balanced.joblib"],
29
- "folder_name": "Audio",
30
- },
31
- }
32
-
33
- def _ready(kind: str) -> bool:
34
- info = TARGETS[kind]
35
- d = info["dst"]
36
- return d.exists() and all((d / m).exists() for m in info["must"])
37
-
38
- def _find_subdir(root: pathlib.Path, name: str) -> Optional[pathlib.Path]:
39
- lname = name.lower()
40
- for p in root.rglob("*"):
41
- if p.is_dir() and p.name.lower() == lname:
42
- return p
43
- return None
44
-
45
- def _merge_copy(src: pathlib.Path, dst: pathlib.Path):
46
- dst.mkdir(parents=True, exist_ok=True)
47
- for root, dirs, files in os.walk(src):
48
- rroot = pathlib.Path(root)
49
- rel = rroot.relative_to(src)
50
- # create subdirs
51
- for d in dirs:
52
- (dst / rel / d).mkdir(parents=True, exist_ok=True)
53
- # copy files
54
- for f in files:
55
- s = rroot / f
56
- t = dst / rel / f
57
- t.parent.mkdir(parents=True, exist_ok=True)
58
- shutil.copy2(s, t)
59
-
60
- def _download_folder(folder_id: str):
61
- DL_DIR.mkdir(parents=True, exist_ok=True)
62
- lock = FileLock(str(DL_DIR) + ".lock")
63
- with lock:
64
- # If already downloaded once, skip re-download
65
- if any(DL_DIR.iterdir()):
66
- print("[assets] Drive folder already present; skipping fresh download.")
67
- return
68
- print(f"[assets] downloading Drive folder {folder_id} …")
69
- gdown.download_folder(
70
- id=folder_id,
71
- output=str(DL_DIR),
72
- quiet=False,
73
- use_cookies=False,
74
- remaining_ok=True,
75
- )
76
-
77
- def ensure_all_assets():
78
- # If everything already exists, return fast
79
- if all(_ready(k) for k in TARGETS):
80
- print("[assets] all bundles already present.")
81
- return
82
-
83
- folder_id = os.getenv("GDRIVE_FOLDER_ID", "").strip()
84
- if not folder_id:
85
- raise RuntimeError("[assets] Please set GDRIVE_FOLDER_ID (top-level Drive folder with Text/Image/Audio)")
86
-
87
- _download_folder(folder_id)
88
-
89
- # Drive may nest one extra directory level; detect the top
90
- # Pick the largest immediate subdir if needed, else use DL_DIR itself
91
- root = DL_DIR
92
- # If there is exactly one child dir and it contains our subfolders, use it
93
- subdirs = [p for p in DL_DIR.iterdir() if p.is_dir()]
94
- if len(subdirs) == 1:
95
- maybe = subdirs[0]
96
- if all(_find_subdir(maybe, TARGETS[k]["folder_name"]) for k in TARGETS):
97
- root = maybe
98
-
99
- # For each target, locate its folder and merge-copy into app dirs
100
- for kind, info in TARGETS.items():
101
- if _ready(kind):
102
- print(f"[assets] {kind}: already ready.")
103
- continue
104
-
105
- src_dir = _find_subdir(root, info["folder_name"])
106
- if not src_dir:
107
- raise RuntimeError(f"[assets] {kind}: could not find '{info['folder_name']}' folder in Drive download")
108
-
109
- print(f"[assets] {kind}: copying from {src_dir} → {info['dst']}")
110
- _merge_copy(src_dir, info["dst"])
111
-
112
- # sanity check
113
- missing = [m for m in info["must"] if not (info["dst"] / m).exists()]
114
- if missing:
115
- raise RuntimeError(f"[assets] {kind}: missing files after copy → {missing}")
116
-
117
- print(f"[assets] {kind}: ready.")
118
-
119
- print("[assets] all bundles ready.")
 
1
+ # download_assets.py — fetch models from a Google Drive FOLDER (no zips)
2
+ # Expects a folder with subfolders: Text/, Image/, Audio/
3
+ # Env: GDRIVE_FOLDER_ID=<your folder id>
4
+ # Author: Amir
5
+
6
+ import os, sys, shutil, pathlib
7
+ from typing import Optional
8
+ from filelock import FileLock
9
+ import gdown
10
+
11
# Directory containing this script; every other path is derived from it.
BASE = pathlib.Path(__file__).resolve().parent
CACHE_DIR = BASE / "cache"
# Local directory the Drive folder is downloaded into.
DL_DIR = CACHE_DIR / "gdrive_folder"

# Files each bundle must contain, keyed by the bundle's folder name.
_REQUIRED_FILES = {
    "Text": ["config.json", "tokenizer.json", "model.safetensors", "vocab.json"],
    "Image": ["resnet_safety_classifier.pth", "clip_safety_classifier.pth"],
    "Audio": ["text_pipeline_balanced.joblib"],
}

# One entry per bundle kind (the lower-cased folder name):
#   "dst"         - directory next to this script that receives the files
#   "must"        - file names that have to exist in "dst"
#   "folder_name" - name of the bundle's folder inside the download
TARGETS = {
    folder.lower(): {
        "dst": BASE / folder,
        "must": files,
        "folder_name": folder,
    }
    for folder, files in _REQUIRED_FILES.items()
}
32
+
33
def _ready(kind: str) -> bool:
    """Tell whether the bundle *kind* is fully installed.

    A bundle counts as ready when its destination directory exists and every
    file listed under its ``"must"`` key exists inside that directory.

    Raises:
        KeyError: if *kind* is not a key of ``TARGETS``.
    """
    spec = TARGETS[kind]
    target_dir = spec["dst"]
    if not target_dir.exists():
        return False
    for required in spec["must"]:
        if not (target_dir / required).exists():
            return False
    return True
37
+
38
def _find_subdir(root: pathlib.Path, name: str) -> Optional[pathlib.Path]:
    """Return the shallowest directory below *root* named *name*, ignoring case.

    The tree is searched one level at a time and the entries of every
    directory are visited in sorted order. The result therefore does not
    depend on the order in which the filesystem lists entries, and a match
    close to *root* always wins over a same-named directory nested deeper
    (for example ``Image/text`` when looking for ``Text``).

    Args:
        root: Directory to search below; *root* itself is never returned.
        name: Directory name to look for, compared case-insensitively.

    Returns:
        The matching directory, or ``None`` when nothing matches or *root*
        cannot be listed.
    """
    wanted = name.lower()
    level = [root]
    while level:
        next_level = []
        for parent in level:
            try:
                children = sorted(c for c in parent.iterdir() if c.is_dir())
            except OSError:
                # Missing or unreadable directory: nothing to find below it.
                continue
            for child in children:
                if child.name.lower() == wanted:
                    return child
            # Symlinked directories may match but are not descended into,
            # so a link cycle cannot make the search loop forever.
            next_level.extend(c for c in children if not c.is_symlink())
        level = next_level
    return None
44
+
45
def _merge_copy(src: pathlib.Path, dst: pathlib.Path):
    """Copy the tree under *src* into *dst*, merging with what is already there.

    Directories are created as needed, files are copied with
    ``shutil.copy2`` and replace same-named files in *dst*; anything in *dst*
    that has no counterpart in *src* is left untouched.
    """
    dst.mkdir(parents=True, exist_ok=True)
    for current, subdir_names, file_names in os.walk(src):
        source_dir = pathlib.Path(current)
        # Mirror the position of the walked directory underneath dst.
        target_dir = dst / source_dir.relative_to(src)
        for subdir in subdir_names:
            (target_dir / subdir).mkdir(parents=True, exist_ok=True)
        for file_name in file_names:
            destination = target_dir / file_name
            destination.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(source_dir / file_name, destination)
57
+
58
def _download_folder(folder_id: str):
    """Download the Drive folder *folder_id* into ``DL_DIR`` unless it is cached.

    The download runs while holding a ``FileLock`` on ``<DL_DIR>.lock``.

    The cached copy is trusted only if an earlier download finished
    successfully, which is recorded by an empty marker file next to
    ``DL_DIR`` (``<DL_DIR>.complete``). Checking only that the directory is
    non-empty would treat the leftovers of an interrupted or failed download
    as a valid cache and skip the download on every later run. A non-empty
    directory without the marker (including one written before the marker
    existed) is therefore downloaded again.

    Args:
        folder_id: Google Drive folder id passed to ``gdown.download_folder``.
    """
    DL_DIR.mkdir(parents=True, exist_ok=True)
    done_marker = DL_DIR.with_name(DL_DIR.name + ".complete")
    lock = FileLock(str(DL_DIR) + ".lock")
    with lock:
        if done_marker.exists() and any(DL_DIR.iterdir()):
            print("[assets] Drive folder already present; skipping fresh download.")
            return
        print(f"[assets] downloading Drive folder {folder_id} …")
        downloaded = gdown.download_folder(
            id=folder_id,
            output=str(DL_DIR),
            quiet=False,
            use_cookies=False,
            remaining_ok=True,
        )
        if downloaded is None:
            # gdown reports failure by returning None. Keep whatever arrived
            # (the caller validates the required files) but leave the marker
            # unset so the next run downloads again.
            print("[assets] Drive download did not complete; it will be retried on the next run.")
            return
        done_marker.touch()
73
+
74
def ensure_all_assets():
    """Make sure every bundle in ``TARGETS`` is installed next to this script.

    Returns immediately when all bundles are already complete. Otherwise the
    Drive folder named by the ``GDRIVE_FOLDER_ID`` environment variable is
    downloaded (or reused from the cache) and each incomplete bundle is
    copied from its folder in the download into its destination directory.

    Raises:
        RuntimeError: if ``GDRIVE_FOLDER_ID`` is unset or blank, if a bundle's
            folder cannot be found in the download, or if required files are
            still missing after copying.
    """
    # Fast path: nothing to do when no bundle is incomplete.
    if not any(not _ready(kind) for kind in TARGETS):
        print("[assets] all bundles already present.")
        return

    folder_id = os.getenv("GDRIVE_FOLDER_ID", "").strip()
    if not folder_id:
        raise RuntimeError("[assets] Please set GDRIVE_FOLDER_ID (top-level Drive folder with Text/Image/Audio)")

    _download_folder(folder_id)

    # The download may be wrapped in a single extra directory; search from
    # inside it when it holds every bundle folder.
    search_root = DL_DIR
    child_dirs = [entry for entry in DL_DIR.iterdir() if entry.is_dir()]
    if len(child_dirs) == 1:
        (only_child,) = child_dirs
        wanted_folders = (spec["folder_name"] for spec in TARGETS.values())
        if all(_find_subdir(only_child, folder) for folder in wanted_folders):
            search_root = only_child

    for kind, spec in TARGETS.items():
        if _ready(kind):
            print(f"[assets] {kind}: already ready.")
            continue

        folder = spec["folder_name"]
        destination = spec["dst"]

        located = _find_subdir(search_root, folder)
        if not located:
            raise RuntimeError(f"[assets] {kind}: could not find '{folder}' folder in Drive download")

        print(f"[assets] {kind}: copying from {located} → {destination}")
        _merge_copy(located, destination)

        # Confirm the copy actually delivered every required file.
        missing = [required for required in spec["must"] if not (destination / required).exists()]
        if missing:
            raise RuntimeError(f"[assets] {kind}: missing files after copy → {missing}")

        print(f"[assets] {kind}: ready.")

    print("[assets] all bundles ready.")