Zubaish committed on
Commit
06629cc
·
1 Parent(s): 6d3e4d2
Files changed (2) hide show
  1. ingest.py +16 -22
  2. rag.py +9 -13
ingest.py CHANGED
@@ -10,43 +10,37 @@ def run_ingestion():
10
  os.makedirs(KB_DIR, exist_ok=True)
11
 
12
  print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
13
- dataset = load_dataset(HF_DATASET_REPO, split="train")
 
14
 
15
- # Debug: Print column names to logs
16
- print(f"📊 Dataset columns: {dataset.column_names}")
17
-
18
  pdf_paths = []
19
  for i, row in enumerate(dataset):
20
- # Flexible column mapping
21
- fname = row.get("file_name") or row.get("filename") or f"document_{i}.pdf"
22
- pdf_data = row.get("file") or row.get("pdf")
23
-
24
- if pdf_data is None:
25
- print(f"⚠️ Skipping row {i}: No PDF data found.")
26
  continue
27
 
28
  path = os.path.join(KB_DIR, fname)
29
  with open(path, "wb") as f:
30
- # Handle HF dataset format (bytes vs dict)
31
- if isinstance(pdf_data, dict) and "bytes" in pdf_data:
32
- f.write(pdf_data["bytes"])
33
  else:
34
- f.write(pdf_data)
35
  pdf_paths.append(path)
36
 
37
- print(f"📄 Processing {len(pdf_paths)} PDFs...")
38
  docs = []
39
  for p in pdf_paths:
40
- try:
41
- loader = PyPDFLoader(p)
42
- docs.extend(loader.load())
43
- except Exception as e:
44
- print(f"❌ Error loading {p}: {e}")
45
 
46
  splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
47
  splits = splitter.split_documents(docs)
48
 
49
- print("🧠 Creating embeddings and Vector DB...")
50
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
51
 
52
  Chroma.from_documents(
@@ -54,7 +48,7 @@ def run_ingestion():
54
  embedding=embeddings,
55
  persist_directory=CHROMA_DIR
56
  )
57
- print(f"✅ Ingestion complete. DB saved to {CHROMA_DIR}")
58
 
59
  if __name__ == "__main__":
60
  run_ingestion()
 
10
  os.makedirs(KB_DIR, exist_ok=True)
11
 
12
  print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
13
+ # decode(False) prevents the library from turning bytes into pdfplumber objects
14
+ dataset = load_dataset(HF_DATASET_REPO, split="train").with_format("binary").decode(False)
15
 
 
 
 
16
  pdf_paths = []
17
  for i, row in enumerate(dataset):
18
+ # Determine filename and raw data column
19
+ fname = row.get("filename") or row.get("file_name") or f"doc_{i}.pdf"
20
+ # Access the raw 'bytes' from the 'pdf' column
21
+ pdf_feature = row.get("pdf")
22
+
23
+ if pdf_feature is None:
24
  continue
25
 
26
  path = os.path.join(KB_DIR, fname)
27
  with open(path, "wb") as f:
28
+ if isinstance(pdf_feature, dict) and "bytes" in pdf_feature:
29
+ f.write(pdf_feature["bytes"])
 
30
  else:
31
+ f.write(pdf_feature)
32
  pdf_paths.append(path)
33
 
34
+ print(f"📄 Processing {len(pdf_paths)} documents...")
35
  docs = []
36
  for p in pdf_paths:
37
+ loader = PyPDFLoader(p)
38
+ docs.extend(loader.load())
 
 
 
39
 
40
  splitter = RecursiveCharacterTextSplitter(chunk_size=800, chunk_overlap=100)
41
  splits = splitter.split_documents(docs)
42
 
43
+ print("🧠 Initializing Vector DB...")
44
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
45
 
46
  Chroma.from_documents(
 
48
  embedding=embeddings,
49
  persist_directory=CHROMA_DIR
50
  )
51
+ print(f"✅ Ingestion complete. Data saved to {CHROMA_DIR}")
52
 
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -7,36 +7,32 @@ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
  # 1. Initialize Embeddings
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
- # 2. Load Vector DB (Load only, do not recreate)
11
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
12
  vectordb = Chroma(
13
  persist_directory=CHROMA_DIR,
14
  embedding_function=embeddings
15
  )
16
- print(f"✅ Vector DB loaded successfully")
17
  else:
18
- print(f"❌ Vector DB NOT found at {CHROMA_DIR}")
19
  vectordb = None
 
20
 
21
- # 3. LLM Pipeline (CPU Safe)
22
- qa_pipeline = pipeline(
23
- task="text-generation",
24
- model=LLM_MODEL,
25
- max_new_tokens=256
26
- )
27
 
28
  def ask_rag_with_status(question: str):
29
  if vectordb is None:
30
- return "Knowledge base is empty. Technical error during build.", "NO_KB"
31
 
32
  docs = vectordb.similarity_search(question, k=3)
33
  if not docs:
34
- return "I couldn't find any relevant information in the documents.", "NO_MATCH"
35
 
36
  context = "\n\n".join(d.page_content for d in docs)
37
- prompt = f"Use the context to answer.\n\nContext:\n{context}\n\nQuestion:\n{question}\n\nAnswer:"
38
 
39
  result = qa_pipeline(prompt)
40
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()
41
 
42
- return answer, ["Context retrieved", "Response generated"]
 
# 1. Initialize Embeddings
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load Vector DB (Load only — ingestion builds it; here we only attach
#    to an existing persisted store, never recreate it).
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
    print(f"✅ Vector DB loaded from {CHROMA_DIR}")
else:
    # Missing or empty directory: leave vectordb unset so callers can
    # report a clean error instead of querying a broken store.
    vectordb = None
    print("❌ Vector DB directory is missing or empty")

# 3. LLM Pipeline
# NOTE(review): model is loaded eagerly at import time — confirm this is
# acceptable for the deployment's startup budget.
qa_pipeline = pipeline("text-generation", model=LLM_MODEL, max_new_tokens=256)
def ask_rag_with_status(question: str):
    """Answer *question* using retrieval-augmented generation.

    Retrieves the top-3 most similar chunks from the Chroma store, builds a
    context prompt, and generates an answer with the module-level
    ``qa_pipeline``.

    Returns:
        tuple: ``(answer_text, status)``. On the error paths ``status`` is a
        short code string (``"ERROR"``, ``"NO_MATCH"``); on success it is a
        list of progress strings.
        NOTE(review): the status element's type differs between failure (str)
        and success (list) — confirm callers handle both shapes.
    """
    if vectordb is None:
        return "The knowledge base is not initialized.", "ERROR"

    # Retrieve the 3 most relevant chunks for this question.
    docs = vectordb.similarity_search(question, k=3)
    if not docs:
        return "No relevant information found.", "NO_MATCH"

    context = "\n\n".join(d.page_content for d in docs)
    prompt = f"Context:\n{context}\n\nQuestion: {question}\n\nAnswer:"

    result = qa_pipeline(prompt)
    # text-generation pipelines echo the prompt in "generated_text"; keep only
    # the text after the final "Answer:" marker.
    answer = result[0]["generated_text"].split("Answer:")[-1].strip()

    return answer, ["Context retrieved", "OK"]