Zubaish committed on
Commit
11f1809
·
1 Parent(s): 6d3d36d
Files changed (2) hide show
  1. ingest.py +37 -18
  2. rag.py +5 -1
ingest.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import shutil
3
  from datasets import load_dataset
@@ -8,55 +9,73 @@ from langchain_chroma import Chroma
8
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
  def run_ingestion():
11
- # Clean setup for the container
12
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
13
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
 
16
  print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
 
17
  dataset = load_dataset(HF_DATASET_REPO, split="train")
18
 
19
  pdf_paths = []
 
20
  for i, row in enumerate(dataset):
21
- pdf_feature = row.get("pdf")
22
- src_path = None
23
 
24
- # Access the cached file path from Hugging Face's internal download
25
- if isinstance(pdf_feature, dict): src_path = pdf_feature.get("path")
26
- elif hasattr(pdf_feature, 'filename'): src_path = pdf_feature.filename
27
 
28
- if src_path and os.path.exists(src_path):
29
- dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
30
- shutil.copy(src_path, dest_path)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  pdf_paths.append(dest_path)
32
- print(f"✅ Cached: doc_{i}.pdf")
 
 
33
 
34
- print(f"📄 Processing {len(pdf_paths)} documents...")
35
  docs = []
36
  for p in pdf_paths:
37
  try:
38
  loader = PyPDFLoader(p)
39
  docs.extend(loader.load())
40
  except Exception as e:
41
- print(f"❌ Error loading {p}: {e}")
42
 
43
  if not docs:
44
- print("❌ Error: No documents successfully loaded.")
45
  return
46
 
 
47
  splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
48
  splits = splitter.split_documents(docs)
49
 
50
- print(f"🧠 Building Vector DB at {CHROMA_DIR}...")
51
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
52
 
53
- # This creates the physical folder and files
54
  Chroma.from_documents(
55
- documents=splits,
56
- embedding=embeddings,
57
  persist_directory=CHROMA_DIR
58
  )
59
- print("✅ Ingestion complete.")
60
 
61
  if __name__ == "__main__":
62
  run_ingestion()
 
1
+ # ingest.py
2
  import os
3
  import shutil
4
  from datasets import load_dataset
 
9
  from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
10
 
11
def run_ingestion():
    """Download PDFs from the HF dataset, extract text, and build the Chroma index.

    Side effects: deletes and recreates KB_DIR and CHROMA_DIR on every run,
    copies each dataset PDF into KB_DIR as doc_<i>.pdf, then persists a new
    vector store at CHROMA_DIR. Returns None; exits early (after logging)
    if no text could be extracted.
    """
    # Start from a clean slate so stale documents/embeddings never leak
    # into the rebuilt index.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    pdf_paths = []
    # Depending on how the dataset was built, row["pdf"] may be a path string,
    # a dict carrying "path" or "bytes", or a specialized HF PDF object that
    # exposes .filename — handle each shape explicitly.
    for i, row in enumerate(dataset):
        pdf_item = row.get("pdf")
        filename = f"doc_{i}.pdf"
        dest_path = os.path.join(KB_DIR, filename)

        try:
            if isinstance(pdf_item, str) and os.path.exists(pdf_item):
                # Plain path string on disk.
                shutil.copy(pdf_item, dest_path)
            elif isinstance(pdf_item, dict) and pdf_item.get("path"):
                # Dict with a cached-file path (common HF layout).
                shutil.copy(pdf_item["path"], dest_path)
            elif isinstance(pdf_item, dict) and pdf_item.get("bytes"):
                # Dict carrying the raw PDF bytes inline.
                with open(dest_path, "wb") as f:
                    f.write(pdf_item["bytes"])
            elif hasattr(pdf_item, "filename"):
                # Fallback for specialized HF PDF objects.
                shutil.copy(pdf_item.filename, dest_path)
            else:
                print(f"⚠️ Could not find a valid path for document {i}")
                continue

            pdf_paths.append(dest_path)
            # Fix: previously printed the literal text "(unknown)" — an
            # f-string with no placeholder — instead of the actual filename.
            print(f"✅ Extracted: {filename}")
        except Exception as e:
            # Best-effort per-document: one bad PDF must not abort the run.
            print(f"❌ Failed to extract doc_{i}: {e}")

    print(f"📄 Loading {len(pdf_paths)} documents into LangChain...")
    docs = []
    for p in pdf_paths:
        try:
            loader = PyPDFLoader(p)
            docs.extend(loader.load())
        except Exception as e:
            print(f"❌ PyPDFLoader error on {p}: {e}")

    if not docs:
        print("❌ CRITICAL: No text could be extracted from PDFs.")
        return

    # Chunking: 1000-char windows with 100-char overlap.
    splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
    splits = splitter.split_documents(docs)

    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Persists the index to disk at CHROMA_DIR.
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")


if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -20,7 +20,11 @@ else:
20
  vectordb = None
21
 
22
  # 3. LLM Pipeline
23
- qa_pipeline = pipeline("text-generation", model=LLM_MODEL, max_new_tokens=256)
 
 
 
 
24
 
25
  def ask_rag_with_status(question: str):
26
  if vectordb is None:
 
20
  vectordb = None
21
 
22
  # 3. LLM Pipeline
23
+ qa_pipeline = pipeline(
24
+ task="text2text-generation", # Fixed task type for T5 models
25
+ model=LLM_MODEL,
26
+ max_new_tokens=256
27
+ )
28
 
29
  def ask_rag_with_status(question: str):
30
  if vectordb is None: