NimrodDev commited on
Commit
23a9a72
·
1 Parent(s): fa064b7
Dockerfile CHANGED
@@ -14,6 +14,10 @@ RUN mkdir -p /code/.cache && chmod 777 /code/.cache
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
 
 
 
 
17
  # copy app code
18
  COPY . .
19
 
 
14
  COPY requirements.txt .
15
  RUN pip install --no-cache-dir -r requirements.txt
16
 
17
+ # ---- NEW: pre-download embedding model (build-time, online) ----
18
+ COPY pre_download.py .
19
+ RUN HF_HOME=/code/.cache python pre_download.py
20
+
21
  # copy app code
22
  COPY . .
23
 
create_json.py DELETED
@@ -1,11 +0,0 @@
1
- # create_json.py
2
- from datasets import load_dataset
3
- import json
4
-
5
- ds = load_dataset("NimrodDev/LD_Events2", revision="refs/convert/parquet", split="train")
6
- plain = [{"text": row["text"]} for row in ds if row.get("text")]
7
-
8
- with open("ld_events_text.json", "w", encoding="utf-8") as f:
9
- json.dump(plain, f, ensure_ascii=False, indent=2)
10
-
11
- print("Saved", len(plain), "rows to ld_events_text.json")
 
 
 
 
 
 
 
 
 
 
 
 
fetch_text.py DELETED
@@ -1,13 +0,0 @@
1
- # fetch_text.py –- runs ONCE during HF build (on-line)
2
- import requests, json, os
3
-
4
- URL = "https://datasets-server.huggingface.co/rows"
5
- params = dict(dataset="NimrodDev/LD_Events2", config="default", split="train", offset=0, length=1000)
6
- rows = requests.get(URL, params=params, timeout=60).json()["rows"]
7
-
8
- plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]
9
-
10
- with open("ld_events_text.json", "w", encoding="utf-8") as f:
11
- json.dump(plain, f, ensure_ascii=False, indent=2)
12
-
13
- print("Fetched & saved", len(plain), "rows to ld_events_text.json")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
install_cache.sh DELETED
@@ -1,15 +0,0 @@
1
- #!/bin/bash
2
- # install_cache.sh –– runs ONCE during HF build (online) β†’ caches into ./.cache
3
- set -e
4
- python - <<'PY'
5
- from datasets import load_dataset
6
- from sentence_transformers import SentenceTransformer
7
-
8
- # 1. download plain text dataset (online, build-time only)
9
- ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train")
10
- print("✓ Dataset cached at build time")
11
-
12
- # 2. download embedding model (online, build-time only)
13
- SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
14
- print("✓ Embedding model cached at build time")
15
- PY
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cache_ds.py β†’ pre_download.py RENAMED
@@ -1,12 +1,4 @@
1
- # cache_ds.py
2
- # cache_ds.py –- build-time pre-cache (runs ONCE on HF builder)
3
- from datasets import load_dataset
4
  from sentence_transformers import SentenceTransformer
5
-
6
- # 1. plain JSON dataset – zero custom features
7
- ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train", keep_in_memory=True)
8
- print("✓ Text dataset cached at build time")
9
-
10
- # 2. embedding model
11
  SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
12
  print("✓ Embedding model cached at build time")
 
1
+ # pre_download.py
 
 
2
  from sentence_transformers import SentenceTransformer
 
 
 
 
 
 
3
  SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
4
  print("✓ Embedding model cached at build time")