cmc

Files changed:
- Dockerfile    +16 -1
- cache_ds.py   +5  -0
- rag.py        +14 -4
Dockerfile
CHANGED

@@ -1,12 +1,27 @@
 FROM python:3.11-slim
 
-# install
+# install build deps
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential \
     && rm -rf /var/lib/apt/lists/*
 
 WORKDIR /code
+
+# install python deps
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
+
+# ---- NEW: pre-cache dataset into /code/.cache (cache dirs must be set before the caching RUN) ----
+ENV HF_DATASETS_CACHE=/code/.cache
+ENV TRANSFORMERS_CACHE=/code/.cache
+COPY cache_ds.py .
+RUN python cache_ds.py
+
+# serve offline from the pre-cached dir (read-only at runtime)
+ENV HF_DATASETS_OFFLINE=1
+ENV TRANSFORMERS_OFFLINE=1
+
+# copy app code
 COPY . .
+
 CMD ["gunicorn", "app:app", "-b", "0.0.0.0:7860"]
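A quick way to confirm the cache is actually baked in is to load the dataset with networking disabled. The script below is a hypothetical helper (verify_cache.py is not part of this commit) and assumes the image was built as above; run it inside the container:

# verify_cache.py -- hypothetical build sanity check, not part of this commit
import os

# set the offline flags *before* importing datasets so they take effect
os.environ["HF_DATASETS_OFFLINE"] = "1"
os.environ["HF_DATASETS_CACHE"] = "/code/.cache"

from datasets import load_dataset

ds = load_dataset("NimrodDev/LD_Events2", revision="refs/convert/parquet", split="train")
print(f"OK: {len(ds)} rows served from /code/.cache")

If this prints a row count instead of raising a connection error, the runtime ENV flags in the Dockerfile are safe to rely on.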
cache_ds.py
ADDED

@@ -0,0 +1,5 @@
+# cache_ds.py
+from datasets import load_dataset
+# downloads & caches the parquet branch into /code/.cache
+load_dataset("NimrodDev/LD_Events2", revision="refs/convert/parquet", split="train")
+print("Dataset cached.")
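Note that this script inherits its cache location from the environment at the moment it runs, which is why the Dockerfile exports HF_DATASETS_CACHE before the RUN. A sketch of an ordering-independent variant, using the documented cache_dir parameter of load_dataset:

# cache_ds.py (variant) -- pin the cache path in code instead of relying on ENV
from datasets import load_dataset

load_dataset(
    "NimrodDev/LD_Events2",
    revision="refs/convert/parquet",
    split="train",
    cache_dir="/code/.cache",  # write here regardless of HF_DATASETS_CACHE
)
print("Dataset cached.")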
rag.py
CHANGED

@@ -94,20 +94,30 @@ def _fallback_answer(company: str, intent: str) -> str:
 # ------------------------------------------------------------------
-# RAM-ONLY DOCUMENT LOADER – PARQUET BRANCH
+# RAM-ONLY DOCUMENT LOADER – OFF-LINE / PRE-CACHED
 # ------------------------------------------------------------------
 def load_texts() -> List[str]:
-    ds = load_dataset(HF_DS, revision="refs/convert/parquet", split="train")
+    # offline + trust_remote_code=False -> no write, no download
+    ds = load_dataset(
+        HF_DS,
+        revision="refs/convert/parquet",
+        split="train",
+        trust_remote_code=False,
+        keep_in_memory=True,  # force RAM, no disk writes
+    )
     return [row["text"] for row in ds if row.get("text")]
 
 # ------------------------------------------------------------------
-# SINGLE-BUILD VECTOR STORE
+# SINGLE-BUILD VECTOR STORE (cached for the life of the worker)
 # ------------------------------------------------------------------
 @lru_cache(maxsize=1)
 def get_vectorstore() -> FAISS:
     texts = load_texts()
     splitter = RecursiveCharacterTextSplitter(chunk_size=600, chunk_overlap=50)
     docs = splitter.create_documents(texts, metadatas=[{"source": HF_DS}] * len(texts))
+
+    # point sentence-transformers at the model pre-cached inside the image
+    os.environ["HF_HOME"] = "/code/.cache"  # <-- NEW
     embeddings = HuggingFaceEmbeddings(model_name=EMBED_MODEL)
-    return FAISS.from_documents(docs, embeddings)
+    return FAISS.from_documents(docs, embeddings)  # built ONCE per worker
 
 # ------------------------------------------------------------------
 # LLM
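For reference, a minimal usage sketch of the cached store; the query string and k value are illustrative, and get_vectorstore comes from the code above:

# hypothetical caller -- the first call builds the FAISS index, later calls hit the lru_cache
retriever = get_vectorstore().as_retriever(search_kwargs={"k": 4})
for doc in retriever.get_relevant_documents("recent layoff events"):
    print(doc.metadata["source"], "->", doc.page_content[:80])

Depending on the installed LangChain version, retriever.invoke("...") may be preferred over get_relevant_documents.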