Zubaish committed on
Commit
6d3e4d2
·
1 Parent(s): ebecac1
Files changed (3) hide show
  1. Dockerfile +8 -1
  2. ingest.py +53 -31
  3. rag.py +19 -12
Dockerfile CHANGED
@@ -1,15 +1,22 @@
1
  FROM python:3.10-slim
 
2
  WORKDIR /app
 
 
3
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
4
 
 
5
  COPY requirements.txt .
6
  RUN pip install --no-cache-dir -r requirements.txt
7
 
 
8
  COPY app.py rag.py ingest.py config.py ./
9
  COPY frontend ./frontend
10
 
11
- # This will now succeed because requirements.txt has langchain-chroma
12
  RUN python ingest.py
13
 
 
14
  EXPOSE 7860
 
15
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
1
  FROM python:3.10-slim
2
+
3
  WORKDIR /app
4
+
5
+ # Install system dependencies for git and PDF processing
6
  RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
7
 
8
+ # Install Python requirements
9
  COPY requirements.txt .
10
  RUN pip install --no-cache-dir -r requirements.txt
11
 
12
+ # Copy project files
13
  COPY app.py rag.py ingest.py config.py ./
14
  COPY frontend ./frontend
15
 
16
+ # CRITICAL: Build the knowledge base during the Docker build process
17
  RUN python ingest.py
18
 
19
+ # Hugging Face Spaces standard port
20
  EXPOSE 7860
21
+
22
  CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
ingest.py CHANGED
@@ -2,37 +2,59 @@ import os
2
  from datasets import load_dataset
3
  from langchain_community.document_loaders import PyPDFLoader
4
  from langchain_text_splitters import RecursiveCharacterTextSplitter
5
- from langchain_community.embeddings import HuggingFaceEmbeddings
6
  from langchain_chroma import Chroma
7
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
8
 
9
- os.makedirs(KB_DIR, exist_ok=True)
10
-
11
- print("⬇️ Downloading PDFs from HF Dataset...")
12
- dataset = load_dataset(HF_DATASET_REPO, split="train")
13
-
14
- pdf_paths = []
15
- for row in dataset:
16
- path = os.path.join(KB_DIR, row["file_name"])
17
- with open(path, "wb") as f:
18
- f.write(row["file"])
19
- pdf_paths.append(path)
20
-
21
- print("📄 Loading documents...")
22
- docs = []
23
- for p in pdf_paths:
24
- docs.extend(PyPDFLoader(p).load())
25
-
26
- splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
27
- splits = splitter.split_documents(docs)
28
-
29
- print("🧠 Creating embeddings...")
30
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
31
-
32
- Chroma.from_documents(
33
- splits,
34
- embedding=embeddings,
35
- persist_directory=CHROMA_DIR
36
- )
37
-
38
- print("✅ Ingestion complete")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import os

from datasets import load_dataset
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR


def run_ingestion():
    """Download PDFs from the HF dataset, chunk them, and build the Chroma DB.

    Intended to run at Docker build time (see Dockerfile `RUN python ingest.py`)
    so the image ships with a populated vector store in ``CHROMA_DIR``.

    Raises:
        RuntimeError: if no text chunks could be produced, so a broken build
            fails loudly instead of shipping an empty knowledge base.
    """
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    # Debug: surface the dataset schema in the build logs.
    print(f"📊 Dataset columns: {dataset.column_names}")

    pdf_paths = []
    for i, row in enumerate(dataset):
        # Flexible column mapping for the filename.
        fname = row.get("file_name") or row.get("filename") or f"document_{i}.pdf"

        # Explicit None checks (not `or`): empty bytes are falsy, and
        # `row.get("file") or row.get("pdf")` would silently discard them.
        pdf_data = row.get("file")
        if pdf_data is None:
            pdf_data = row.get("pdf")

        if pdf_data is None:
            print(f"⚠️ Skipping row {i}: No PDF data found.")
            continue

        path = os.path.join(KB_DIR, fname)
        with open(path, "wb") as f:
            # HF binary features may wrap the raw bytes in a {"bytes": ...} dict.
            if isinstance(pdf_data, dict) and "bytes" in pdf_data:
                f.write(pdf_data["bytes"])
            else:
                f.write(pdf_data)
        pdf_paths.append(path)

    print(f"📄 Processing {len(pdf_paths)} PDFs...")
    docs = []
    for p in pdf_paths:
        try:
            docs.extend(PyPDFLoader(p).load())
        except Exception as e:
            # Best-effort: one corrupt PDF must not abort the whole build.
            print(f"❌ Error loading {p}: {e}")

    splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
    splits = splitter.split_documents(docs)

    # Chroma.from_documents fails on an empty input; fail here with a clear
    # message so the Docker build log points at the real problem (the dataset).
    if not splits:
        raise RuntimeError(
            "No document chunks produced — check the dataset contents "
            f"of {HF_DATASET_REPO}."
        )

    print("🧠 Creating embeddings and Vector DB...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print(f"✅ Ingestion complete. DB saved to {CHROMA_DIR}")


if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -1,35 +1,42 @@
1
- # rag.py
2
  import os
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
- from langchain_chroma import Chroma # This requires langchain-chroma package
6
  from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
 
8
  # 1. Initialize Embeddings
9
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
10
 
11
- # 2. Load Vector DB
12
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
13
  vectordb = Chroma(
14
  persist_directory=CHROMA_DIR,
15
  embedding_function=embeddings
16
  )
17
- print(f"✅ Vector DB loaded")
18
  else:
 
19
  vectordb = None
20
- print(f"⚠️ Vector DB not found")
21
 
22
- # 3. LLM Pipeline
23
- qa_pipeline = pipeline(task="text-generation", model=LLM_MODEL, max_new_tokens=256)
 
 
 
 
24
 
25
  def ask_rag_with_status(question: str):
26
  if vectordb is None:
27
- return "Knowledge base is empty.", "NO_KB"
28
 
29
  docs = vectordb.similarity_search(question, k=3)
30
- context = "\n\n".join(d.page_content for d in docs)
 
31
 
32
- prompt = f"Use the context to answer.\nContext:\n{context}\nQuestion:\n{question}\nAnswer:"
33
- result = qa_pipeline(prompt)
34
 
35
- return result[0]["generated_text"].split("Answer:")[-1].strip(), "OK"
 
 
 
 
 
import os

from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR

# 1. Initialize Embeddings
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load Vector DB (load only — ingest.py builds it at image-build time)
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
    print("✅ Vector DB loaded successfully")
else:
    print(f"❌ Vector DB NOT found at {CHROMA_DIR}")
    vectordb = None

# 3. LLM Pipeline (CPU safe)
qa_pipeline = pipeline(
    task="text-generation",
    model=LLM_MODEL,
    max_new_tokens=256,
)


def ask_rag_with_status(question: str):
    """Answer *question* using retrieved context from the vector store.

    Returns:
        tuple[str, str]: ``(answer, status)`` where ``status`` is one of
        ``"OK"``, ``"NO_KB"`` (vector DB missing) or ``"NO_MATCH"``
        (no relevant documents). The status is a string code on EVERY
        path — the previous version returned a list on success, which
        broke callers comparing status codes.
    """
    if vectordb is None:
        return "Knowledge base is empty. Technical error during build.", "NO_KB"

    docs = vectordb.similarity_search(question, k=3)
    if not docs:
        return "I couldn't find any relevant information in the documents.", "NO_MATCH"

    context = "\n\n".join(d.page_content for d in docs)
    prompt = f"Use the context to answer.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"

    result = qa_pipeline(prompt)
    # The model echoes the prompt; keep only the text after the final "Answer:".
    answer = result[0]["generated_text"].split("Answer:")[-1].strip()

    return answer, "OK"