Commit 19d8cbd · "update"
Author: Zubaish
Parent(s): 9edda50
Dockerfile CHANGED

@@ -1,24 +1,10 @@
 FROM python:3.10-slim
-
 WORKDIR /app
-
-# Install system dependencies
 RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
-
-# Install Python requirements
 COPY requirements.txt .
 RUN pip install --no-cache-dir -r requirements.txt
-
-# Copy everything (including your config and scripts)
 COPY . .
-
-# ---------------------------------------------------------
-# PRE-BUILD PHASE
-# We run these in the container so they are "baked into" the image.
-# ---------------------------------------------------------
 RUN python download_models.py
 RUN python ingest.py
-
-# Hugging Face Space setup
 EXPOSE 7860
 CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
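The two RUN steps bake the model weights and the Chroma index into the image, so the Space starts without downloading anything at runtime. download_models.py is not part of this commit; below is a minimal sketch of what it presumably does, pre-caching the two models named in config.py (the script's actual contents are an assumption, only the model IDs come from the repo):

# download_models.py -- hypothetical sketch; the real script is not in this commit.
# Pulls both models into the image's HF cache at build time.
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

from config import EMBEDDING_MODEL, LLM_MODEL

SentenceTransformer(EMBEDDING_MODEL)              # caches all-MiniLM-L6-v2
AutoTokenizer.from_pretrained(LLM_MODEL)          # caches the flan-t5-small tokenizer
AutoModelForSeq2SeqLM.from_pretrained(LLM_MODEL)  # caches the flan-t5-small weights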
config.py CHANGED

@@ -1,36 +1,15 @@
-# config.py
-# Central configuration for HubRAG (HF Space safe)
 import os
 
-# -----------------------------
-# Path Configuration
-# -----------------------------
-# Using absolute paths ensures the app finds the DB built in Dockerfile
 BASE_DIR = "/app"
-
-# Hugging Face Dataset
 HF_DATASET_REPO = "Zubaish/hubrag-kb"
 HF_TOKEN = os.getenv("HF_TOKEN")
 
-# Vector Store Path
 CHROMA_DIR = os.path.join(BASE_DIR, "chroma_db")
-
-# Knowledge Base (Temp PDF storage)
 KB_DIR = os.path.join(BASE_DIR, "kb")
 
-# -----------------------------
-# Model Configuration
-# -----------------------------
-# Small, fast, CPU-safe for free-tier Spaces
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 LLM_MODEL = "google/flan-t5-small"
-
-# LLM Task type: 'text-generation' is more universally supported
-# than 'text2text-generation' in some transformers versions.
 LLM_TASK = "text-generation"
 
-# -----------------------------
-# Text splitting
-# -----------------------------
 CHUNK_SIZE = 1000
 CHUNK_OVERLAP = 100
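One caveat in these settings: google/flan-t5-small is an encoder-decoder (seq2seq) model, which transformers serves through the "text2text-generation" pipeline rather than "text-generation", so the hard-coded LLM_TASK may not match the model. A sketch of deriving the task from the model's architecture instead (infer_llm_task is a hypothetical helper, not part of this commit):

# Hypothetical helper: pick the pipeline task from the model config, since
# seq2seq models (flan-t5) and decoder-only models expect different tasks.
from transformers import AutoConfig

def infer_llm_task(model_name: str) -> str:
    cfg = AutoConfig.from_pretrained(model_name)
    return "text2text-generation" if cfg.is_encoder_decoder else "text-generation"

print(infer_llm_task("google/flan-t5-small"))  # -> text2text-generation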
ingest.py CHANGED

@@ -1,6 +1,4 @@
-
-import os
-import shutil
+import os, shutil
 from datasets import load_dataset
 from langchain_community.document_loaders import Docx2txtLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -9,64 +7,35 @@ from langchain_chroma import Chroma
 from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
 
 def run_ingestion():
-    # 1. Clean directories
     if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
 
-    print(f"⬇️ Loading
-
-    # Use standard load without extra flags that cause ValueErrors
-    dataset = load_dataset(HF_DATASET_REPO, split="train")
+    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
+    dataset = load_dataset(HF_DATASET_REPO, split="train", decode=False)
 
     docs = []
-    # Loop through the rows to find paths to files
     for i, row in enumerate(dataset):
-
-
-
-        src_path = None
-        if isinstance(file_info, dict):
-            src_path = file_info.get("path")
-        elif isinstance(file_info, str):
-            src_path = file_info
+        file_item = row.get("docx") or row.get("file")
+        src_path = file_item.get("path") if isinstance(file_item, dict) else None
 
-        if src_path and
-
-
-
-
-
-
-
-
-            loader = Docx2txtLoader(dest_path)
-            docs.extend(loader.load())
-            print(f"✅ Successfully loaded: doc_{i}.docx")
-        except Exception as e:
-            print(f"❌ Loader error on doc_{i}: {e}")
-        else:
-            print(f"⏭️ Skipping non-docx file: {src_path}")
+        if src_path and src_path.lower().endswith(".docx"):
+            dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
+            shutil.copy(src_path, dest_path)
+            try:
+                loader = Docx2txtLoader(dest_path)
+                docs.extend(loader.load())
+                print(f"✅ Loaded: doc_{i}.docx")
+            except Exception as e:
+                print(f"❌ Error loading doc_{i}: {e}")
 
     if not docs:
-        print("❌ CRITICAL: No .docx documents found.
+        print("❌ CRITICAL: No .docx documents found.")
         return
 
-
-    splitter = RecursiveCharacterTextSplitter(
-        chunk_size=CHUNK_SIZE,
-        chunk_overlap=CHUNK_OVERLAP
-    )
-    splits = splitter.split_documents(docs)
-
-    # 3. Embedding and Storage
-    print(f"🧠 Indexing {len(splits)} chunks...")
+    splits = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP).split_documents(docs)
     embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
-    Chroma.from_documents(
-        documents=splits,
-        embedding=embeddings,
-        persist_directory=CHROMA_DIR
-    )
+    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
     print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
 
 if __name__ == "__main__":
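Two things worth flagging in the new ingestion path. First, datasets.load_dataset does not appear to take a top-level decode flag in the releases I know of (decoding is normally toggled per column via the feature type), so that argument is worth verifying against the installed version. Second, since ingestion runs at build time, a quick smoke test catches an empty index before deploy. A minimal sketch, assuming the modules above are importable from /app (the test itself is not part of this commit):

# Hypothetical smoke test: rebuild the index, then run one similarity query
# to confirm documents actually landed in Chroma.
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from config import CHROMA_DIR, EMBEDDING_MODEL
from ingest import run_ingestion

run_ingestion()
db = Chroma(persist_directory=CHROMA_DIR,
            embedding_function=HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL))
hits = db.similarity_search("test query", k=1)
print(f"{len(hits)} hit(s); index is non-empty" if hits else "index is empty")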
rag.py CHANGED

@@ -1,35 +1,26 @@
-# rag.py
 import os
 from transformers import pipeline
 from langchain_huggingface import HuggingFaceEmbeddings
 from langchain_chroma import Chroma
-from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
+from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
 
 embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
 
-
-if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
+if os.path.exists(CHROMA_DIR) and os.path.isdir(CHROMA_DIR):
     vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
-    print("✅ Vector DB
+    print("✅ Vector DB loaded")
 else:
     vectordb = None
-    print("⚠️ Vector DB missing")
 
-
-qa_pipeline = pipeline(
-    task="text-generation",
-    model=LLM_MODEL,
-    max_new_tokens=256,
-    trust_remote_code=True
-)
+qa_pipeline = pipeline(task=LLM_TASK, model=LLM_MODEL, max_new_tokens=256, trust_remote_code=True)
 
 def ask_rag_with_status(question: str):
     if vectordb is None:
-        return "Knowledge base not initialized.", "ERROR"
+        return "Knowledge base not initialized. Check build logs.", "ERROR"
 
     docs = vectordb.similarity_search(question, k=3)
     context = "\n\n".join(d.page_content for d in docs)
-    prompt = f"
+    prompt = f"Answer using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
 
     result = qa_pipeline(prompt)
     answer = result[0]["generated_text"].split("Answer:")[-1].strip()
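The Dockerfile's CMD points uvicorn at an app:app object, but app.py is not in this commit. A minimal FastAPI sketch of the likely wiring (the /ask route, its field names, and the assumption that ask_rag_with_status's success path returns an (answer, status) pair like its error path does are all hypothetical):

# app.py -- hypothetical sketch of the ASGI app the Dockerfile's CMD expects.
from fastapi import FastAPI
from pydantic import BaseModel

from rag import ask_rag_with_status

app = FastAPI()

class Query(BaseModel):
    question: str

@app.post("/ask")
def ask(query: Query):
    # Delegate to the RAG helper and surface both the answer and its status.
    answer, status = ask_rag_with_status(query.question)
    return {"answer": answer, "status": status}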