Zubaish committed on
Commit
6d3d36d
·
1 Parent(s): 40676fe
Files changed (3) hide show
  1. Dockerfile +2 -3
  2. ingest.py +26 -7
  3. rag.py +7 -9
Dockerfile CHANGED
@@ -9,13 +9,12 @@ RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
- # Copy all project files
13
  COPY . .
14
 
15
  # ---------------------------------------------------------
16
  # PRE-BUILD PHASE
17
- # This downloads models and processes PDFs during the build.
18
- # This prevents httpx.ReadTimeout errors at runtime.
19
  # ---------------------------------------------------------
20
  RUN python download_models.py
21
  RUN python ingest.py
 
# Install dependencies first so this layer is cached across code-only rebuilds
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy everything (including your config and scripts)
COPY . .

# ---------------------------------------------------------
# PRE-BUILD PHASE
# We run these in the container so they are "baked into" the image.
# ---------------------------------------------------------
RUN python download_models.py
RUN python ingest.py
ingest.py CHANGED
@@ -8,15 +8,20 @@ from langchain_chroma import Chroma
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
 
 
 
11
  os.makedirs(KB_DIR, exist_ok=True)
12
- # Load dataset - this will use the cached version from build phase
 
13
  dataset = load_dataset(HF_DATASET_REPO, split="train")
14
 
15
  pdf_paths = []
16
  for i, row in enumerate(dataset):
17
  pdf_feature = row.get("pdf")
18
- # Access local path directly from HF cache
19
  src_path = None
 
 
20
  if isinstance(pdf_feature, dict): src_path = pdf_feature.get("path")
21
  elif hasattr(pdf_feature, 'filename'): src_path = pdf_feature.filename
22
 
@@ -24,20 +29,34 @@ def run_ingestion():
24
  dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
25
  shutil.copy(src_path, dest_path)
26
  pdf_paths.append(dest_path)
 
27
 
 
28
  docs = []
29
  for p in pdf_paths:
30
  try:
31
  loader = PyPDFLoader(p)
32
  docs.extend(loader.load())
33
- except Exception as e: print(f"❌ Error: {e}")
 
 
 
 
 
34
 
35
- if not docs: return
 
36
 
37
- splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
38
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
39
- Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
40
- print("✅ KB Initialized")
 
 
 
 
 
 
41
 
42
  if __name__ == "__main__":
43
  run_ingestion()
 
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
def run_ingestion():
    """Build the knowledge base: fetch PDFs from the HF dataset, split them
    into chunks, and persist a Chroma vector store at CHROMA_DIR.

    Reads config constants KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL and
    CHROMA_DIR. Intended to run once during the Docker image build.
    """
    # Clean setup for the container: wipe any stale KB / vector DB so a
    # rebuild always starts from a consistent state.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    pdf_paths = []
    for i, row in enumerate(dataset):
        pdf_feature = row.get("pdf")
        src_path = None

        # Access the cached file path from Hugging Face's internal download.
        # The feature may surface as a dict or as a file-like object.
        if isinstance(pdf_feature, dict):
            src_path = pdf_feature.get("path")
        elif hasattr(pdf_feature, 'filename'):
            src_path = pdf_feature.filename

        # Guard: src_path can still be None (unrecognized feature type) or
        # point at a file that was never materialized — skip rather than
        # crash in shutil.copy.
        if src_path and os.path.exists(src_path):
            dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
            shutil.copy(src_path, dest_path)
            pdf_paths.append(dest_path)
            print(f"✅ Cached: doc_{i}.pdf")

    print(f"📄 Processing {len(pdf_paths)} documents...")
    docs = []
    for p in pdf_paths:
        try:
            loader = PyPDFLoader(p)
            docs.extend(loader.load())
        except Exception as e:
            # Best-effort: one corrupt PDF should not abort the whole build.
            print(f"❌ Error loading {p}: {e}")

    if not docs:
        print("❌ Error: No documents successfully loaded.")
        return

    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = splitter.split_documents(docs)

    print(f"🧠 Building Vector DB at {CHROMA_DIR}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # This creates the physical folder and files
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    print("✅ Ingestion complete.")


if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -7,32 +7,30 @@ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
  # 1. Initialize Embeddings
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
- # 2. Load Vector DB (Load only)
11
- if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
 
12
  vectordb = Chroma(
13
  persist_directory=CHROMA_DIR,
14
  embedding_function=embeddings
15
  )
16
  print(f"✅ Vector DB loaded from {CHROMA_DIR}")
17
  else:
 
18
  vectordb = None
19
- print("❌ Vector DB directory is missing or empty")
20
 
21
  # 3. LLM Pipeline
22
  qa_pipeline = pipeline("text-generation", model=LLM_MODEL, max_new_tokens=256)
23
 
24
  def ask_rag_with_status(question: str):
25
  if vectordb is None:
26
- return "The knowledge base is not initialized.", "ERROR"
27
 
28
  docs = vectordb.similarity_search(question, k=3)
29
- if not docs:
30
- return "No relevant information found.", "NO_MATCH"
31
-
32
  context = "\n\n".join(d.page_content for d in docs)
33
- prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"
34
 
35
  result = qa_pipeline(prompt)
36
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()
37
 
38
- return answer, ["Context retrieved", "OK"]
 
# 1. Initialize Embeddings
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load Vector DB
# Ensure CHROMA_DIR is exactly the same as in ingest.py
if os.path.exists(CHROMA_DIR) and len(os.listdir(CHROMA_DIR)) > 0:
    vectordb = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings
    )
    print(f"✅ Vector DB loaded from {CHROMA_DIR}")
else:
    print(f"⚠️ Vector DB not found at {CHROMA_DIR}")
    vectordb = None

# 3. LLM Pipeline
qa_pipeline = pipeline("text-generation", model=LLM_MODEL, max_new_tokens=256)


def ask_rag_with_status(question: str):
    """Answer *question* from the vector DB; return an (answer, status) pair.

    Status is the string "ERROR"/"NO_MATCH" on failure paths, or a list of
    progress messages on success (matches the existing caller contract).
    """
    if vectordb is None:
        return "The knowledge base is not initialized. Please check build logs.", "ERROR"

    docs = vectordb.similarity_search(question, k=3)
    # Fix: without this guard an empty retrieval yields an empty context and
    # the LLM answers from nothing (hallucination risk).
    if not docs:
        return "No relevant information found.", "NO_MATCH"

    context = "\n\n".join(d.page_content for d in docs)
    prompt = f"Use the context to answer Gandhi related questions.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    result = qa_pipeline(prompt)
    # The pipeline echoes the prompt; keep only the text after the final
    # "Answer:" marker.
    answer = result[0]["generated_text"].split("Answer:")[-1].strip()

    return answer, ["Context retrieved", "Answer generated"]