Zubaish committed on
Commit
40676fe
·
1 Parent(s): f09a853
Files changed (3) hide show
  1. Dockerfile +10 -7
  2. download_models.py +11 -0
  3. ingest.py +9 -34
Dockerfile CHANGED
@@ -2,21 +2,24 @@ FROM python:3.10-slim
2
 
3
  WORKDIR /app
4
 
5
- # Install system dependencies for git and PDF processing
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
 
8
  # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
- # Copy project files
13
- COPY app.py rag.py ingest.py config.py ./
14
- COPY frontend ./frontend
15
 
16
- # CRITICAL: Build the knowledge base during the Docker build process
 
 
 
 
 
17
  RUN python ingest.py
18
 
19
- # Hugging Face Spaces standard port
20
  EXPOSE 7860
21
-
22
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
2
 
3
  WORKDIR /app
4
 
5
+ # Install system dependencies
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
 
8
  # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
+ # Copy all project files
13
+ COPY . .
 
14
 
15
+ # ---------------------------------------------------------
16
+ # PRE-BUILD PHASE
17
+ # This downloads models and processes PDFs during the build.
18
+ # This prevents httpx.ReadTimeout errors at runtime.
19
+ # ---------------------------------------------------------
20
+ RUN python download_models.py
21
  RUN python ingest.py
22
 
23
+ # Hugging Face Space setup
24
  EXPOSE 7860
 
25
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
download_models.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # download_models.py
2
+ from transformers import pipeline
3
+ from langchain_huggingface import HuggingFaceEmbeddings
4
+ from config import EMBEDDING_MODEL, LLM_MODEL
5
+
6
+ print("⏳ Pre-downloading models...")
7
+ # Download Embedding Model
8
+ HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
+ # Download LLM
10
+ pipeline("text-generation", model=LLM_MODEL)
11
+ print("✅ Models downloaded successfully")
ingest.py CHANGED
@@ -8,61 +8,36 @@ from langchain_chroma import Chroma
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
11
- # 1. Clean Environment
12
- if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
13
- if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
-
16
- print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
17
  dataset = load_dataset(HF_DATASET_REPO, split="train")
18
 
19
  pdf_paths = []
20
  for i, row in enumerate(dataset):
21
  pdf_feature = row.get("pdf")
22
-
23
- # Determine Source Path
24
- # HF PdfFolder datasets store the local path in the 'path' key of the feature
25
  src_path = None
26
- if isinstance(pdf_feature, dict) and "path" in pdf_feature:
27
- src_path = pdf_feature["path"]
28
- elif hasattr(pdf_feature, "filename"):
29
- src_path = pdf_feature.filename
30
 
31
  if src_path and os.path.exists(src_path):
32
  dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
33
  shutil.copy(src_path, dest_path)
34
  pdf_paths.append(dest_path)
35
- print(f"✅ Cached: doc_{i}.pdf")
36
- else:
37
- print(f"⚠️ Could not resolve path for doc_{i}, skipping.")
38
 
39
- # 2. Process Documents
40
- print(f"📄 Processing {len(pdf_paths)} documents...")
41
  docs = []
42
  for p in pdf_paths:
43
  try:
44
  loader = PyPDFLoader(p)
45
  docs.extend(loader.load())
46
- except Exception as e:
47
- print(f"❌ Error reading {p}: {e}")
48
-
49
- if not docs:
50
- print("❌ CRITICAL: No documents were successfully loaded.")
51
- return
52
 
53
- # 3. Chunk and Embed
54
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
55
- splits = splitter.split_documents(docs)
56
 
57
- print(f"🧠 Indexing {len(splits)} chunks...")
58
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
59
-
60
- Chroma.from_documents(
61
- documents=splits,
62
- embedding=embeddings,
63
- persist_directory=CHROMA_DIR
64
- )
65
- print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
66
 
67
  if __name__ == "__main__":
68
  run_ingestion()
 
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
 
 
 
11
  os.makedirs(KB_DIR, exist_ok=True)
12
+ # Load dataset - this will use the cached version from build phase
 
13
  dataset = load_dataset(HF_DATASET_REPO, split="train")
14
 
15
  pdf_paths = []
16
  for i, row in enumerate(dataset):
17
  pdf_feature = row.get("pdf")
18
+ # Access local path directly from HF cache
 
 
19
  src_path = None
20
+ if isinstance(pdf_feature, dict): src_path = pdf_feature.get("path")
21
+ elif hasattr(pdf_feature, 'filename'): src_path = pdf_feature.filename
 
 
22
 
23
  if src_path and os.path.exists(src_path):
24
  dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
25
  shutil.copy(src_path, dest_path)
26
  pdf_paths.append(dest_path)
 
 
 
27
 
 
 
28
  docs = []
29
  for p in pdf_paths:
30
  try:
31
  loader = PyPDFLoader(p)
32
  docs.extend(loader.load())
33
+ except Exception as e: print(f"❌ Error: {e}")
 
 
 
 
 
34
 
35
+ if not docs: return
 
 
36
 
37
+ splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
38
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
39
+ Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
40
+ print("✅ KB Initialized")
 
 
 
 
 
41
 
42
  if __name__ == "__main__":
43
  run_ingestion()