cmc
Browse files- Dockerfile +4 -0
- create_json.py +0 -11
- fetch_text.py +0 -13
- install_cache.sh +0 -15
- cache_ds.py → pre_download.py +1 -9
Dockerfile
CHANGED
|
@@ -14,6 +14,10 @@ RUN mkdir -p /code/.cache && chmod 777 /code/.cache
|
|
| 14 |
COPY requirements.txt .
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# copy app code
|
| 18 |
COPY . .
|
| 19 |
|
|
|
|
| 14 |
COPY requirements.txt .
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
+
# ---- NEW: pre-download embedding model (build-time, online) ----
|
| 18 |
+
COPY pre_download.py .
|
| 19 |
+
RUN HF_HOME=/code/.cache python pre_download.py
|
| 20 |
+
|
| 21 |
# copy app code
|
| 22 |
COPY . .
|
| 23 |
|
create_json.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
# create_json.py
|
| 2 |
-
from datasets import load_dataset
|
| 3 |
-
import json
|
| 4 |
-
|
| 5 |
-
ds = load_dataset("NimrodDev/LD_Events2", revision="refs/convert/parquet", split="train")
|
| 6 |
-
plain = [{"text": row["text"]} for row in ds if row.get("text")]
|
| 7 |
-
|
| 8 |
-
with open("ld_events_text.json", "w", encoding="utf-8") as f:
|
| 9 |
-
json.dump(plain, f, ensure_ascii=False, indent=2)
|
| 10 |
-
|
| 11 |
-
print("Saved", len(plain), "rows to ld_events_text.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fetch_text.py
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
# fetch_text.py — runs ONCE during HF build (online)
|
| 2 |
-
import requests, json, os
|
| 3 |
-
|
| 4 |
-
URL = "https://datasets-server.huggingface.co/rows"
|
| 5 |
-
params = dict(dataset="NimrodDev/LD_Events2", config="default", split="train", offset=0, length=1000)
|
| 6 |
-
rows = requests.get(URL, params=params, timeout=60).json()["rows"]
|
| 7 |
-
|
| 8 |
-
plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]
|
| 9 |
-
|
| 10 |
-
with open("ld_events_text.json", "w", encoding="utf-8") as f:
|
| 11 |
-
json.dump(plain, f, ensure_ascii=False, indent=2)
|
| 12 |
-
|
| 13 |
-
print("Fetched & saved", len(plain), "rows to ld_events_text.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
install_cache.sh
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
# install_cache.sh — runs ONCE during HF build (online) — caches into ./.cache
|
| 3 |
-
set -e
|
| 4 |
-
python - <<'PY'
|
| 5 |
-
from datasets import load_dataset
|
| 6 |
-
from sentence_transformers import SentenceTransformer
|
| 7 |
-
|
| 8 |
-
# 1. download plain text dataset (online, build-time only)
|
| 9 |
-
ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train")
|
| 10 |
-
print("✅ Dataset cached at build time")
|
| 11 |
-
|
| 12 |
-
# 2. download embedding model (online, build-time only)
|
| 13 |
-
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 14 |
-
print("✅ Embedding model cached at build time")
|
| 15 |
-
PY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cache_ds.py → pre_download.py
RENAMED
|
@@ -1,12 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
-
# cache_ds.py — build-time pre-cache (runs ONCE on HF builder)
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
-
|
| 6 |
-
# 1. plain JSON dataset β zero custom features
|
| 7 |
-
ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train", keep_in_memory=True)
|
| 8 |
-
print("✅ Text dataset cached at build time")
|
| 9 |
-
|
| 10 |
-
# 2. embedding model
|
| 11 |
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 12 |
print("✅ Embedding model cached at build time")
|
|
|
|
| 1 |
+
# pre_download.py
|
|
|
|
|
|
|
| 2 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 4 |
print("✅ Embedding model cached at build time")
|