cmc
Browse files- Dockerfile +4 -0
- create_json.py +0 -11
- fetch_text.py +0 -13
- install_cache.sh +0 -15
- cache_ds.py → pre_download.py +1 -9
Dockerfile
CHANGED
|
@@ -14,6 +14,10 @@ RUN mkdir -p /code/.cache && chmod 777 /code/.cache
|
|
| 14 |
COPY requirements.txt .
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
# copy app code
|
| 18 |
COPY . .
|
| 19 |
|
|
|
|
| 14 |
COPY requirements.txt .
|
| 15 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 16 |
|
| 17 |
+
# ---- NEW: pre-download embedding model (build-time, online) ----
|
| 18 |
+
COPY pre_download.py .
|
| 19 |
+
RUN HF_HOME=/code/.cache python pre_download.py
|
| 20 |
+
|
| 21 |
# copy app code
|
| 22 |
COPY . .
|
| 23 |
|
create_json.py
DELETED
|
@@ -1,11 +0,0 @@
|
|
| 1 |
-
# create_json.py
|
| 2 |
-
from datasets import load_dataset
|
| 3 |
-
import json
|
| 4 |
-
|
| 5 |
-
ds = load_dataset("NimrodDev/LD_Events2", revision="refs/convert/parquet", split="train")
|
| 6 |
-
plain = [{"text": row["text"]} for row in ds if row.get("text")]
|
| 7 |
-
|
| 8 |
-
with open("ld_events_text.json", "w", encoding="utf-8") as f:
|
| 9 |
-
json.dump(plain, f, ensure_ascii=False, indent=2)
|
| 10 |
-
|
| 11 |
-
print("Saved", len(plain), "rows to ld_events_text.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
fetch_text.py
DELETED
|
@@ -1,13 +0,0 @@
|
|
| 1 |
-
# fetch_text.py — runs ONCE during HF build (online)
|
| 2 |
-
import requests, json, os
|
| 3 |
-
|
| 4 |
-
URL = "https://datasets-server.huggingface.co/rows"
|
| 5 |
-
params = dict(dataset="NimrodDev/LD_Events2", config="default", split="train", offset=0, length=1000)
|
| 6 |
-
rows = requests.get(URL, params=params, timeout=60).json()["rows"]
|
| 7 |
-
|
| 8 |
-
plain = [{"text": r["row"]["text"]} for r in rows if r["row"].get("text")]
|
| 9 |
-
|
| 10 |
-
with open("ld_events_text.json", "w", encoding="utf-8") as f:
|
| 11 |
-
json.dump(plain, f, ensure_ascii=False, indent=2)
|
| 12 |
-
|
| 13 |
-
print("Fetched & saved", len(plain), "rows to ld_events_text.json")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
install_cache.sh
DELETED
|
@@ -1,15 +0,0 @@
|
|
| 1 |
-
#!/bin/bash
|
| 2 |
-
# install_cache.sh — runs ONCE during HF build (online) — caches into ./.cache
|
| 3 |
-
set -e
|
| 4 |
-
python - <<'PY'
|
| 5 |
-
from datasets import load_dataset
|
| 6 |
-
from sentence_transformers import SentenceTransformer
|
| 7 |
-
|
| 8 |
-
# 1. download plain text dataset (online, build-time only)
|
| 9 |
-
ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train")
|
| 10 |
-
print("✅ Dataset cached at build time")
|
| 11 |
-
|
| 12 |
-
# 2. download embedding model (online, build-time only)
|
| 13 |
-
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 14 |
-
print("✅ Embedding model cached at build time")
|
| 15 |
-
PY
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
cache_ds.py → pre_download.py
RENAMED
|
@@ -1,12 +1,4 @@
|
|
| 1 |
-
#
|
| 2 |
-
# cache_ds.py — build-time pre-cache (runs ONCE on HF builder)
|
| 3 |
-
from datasets import load_dataset
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
-
|
| 6 |
-
# 1. plain JSON dataset β zero custom features
|
| 7 |
-
ds = load_dataset("NimrodDev/LD_Events_TEXT", split="train", keep_in_memory=True)
|
| 8 |
-
print("✅ Text dataset cached at build time")
|
| 9 |
-
|
| 10 |
-
# 2. embedding model
|
| 11 |
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 12 |
print("✅ Embedding model cached at build time")
|
|
|
|
| 1 |
+
# pre_download.py
|
|
|
|
|
|
|
| 2 |
from sentence_transformers import SentenceTransformer
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")
|
| 4 |
print("✅ Embedding model cached at build time")
|