Zubaish committed on
Commit
d557fa1
·
1 Parent(s): 4ed3f0a
Files changed (3) hide show
  1. ingest.py +35 -54
  2. rag.py +11 -14
  3. requirements.txt +1 -2
ingest.py CHANGED
@@ -2,80 +2,61 @@
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
- from langchain_community.document_loaders import PyPDFLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
- from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
10
 
11
  def run_ingestion():
12
- # Clean and create directories
13
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
14
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
15
  os.makedirs(KB_DIR, exist_ok=True)
16
 
17
- print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
18
- # For PDF folders, we want to access the files directly
19
  dataset = load_dataset(HF_DATASET_REPO, split="train")
20
 
21
- pdf_paths = []
22
- # In PdfFolder, row['pdf'] is often a dictionary or a path object
23
  for i, row in enumerate(dataset):
24
- pdf_item = row.get("pdf")
 
 
25
 
26
- # Determine the filename
27
- filename = f"doc_{i}.pdf"
28
- dest_path = os.path.join(KB_DIR, filename)
29
-
30
- try:
31
- # Handle if pdf_item is a path string
32
- if isinstance(pdf_item, str) and os.path.exists(pdf_item):
33
- shutil.copy(pdf_item, dest_path)
34
- # Handle if pdf_item is a dictionary with a 'path' (Common in HF)
35
- elif isinstance(pdf_item, dict) and pdf_item.get("path"):
36
- shutil.copy(pdf_item["path"], dest_path)
37
- # Handle if pdf_item is a dictionary with 'bytes'
38
- elif isinstance(pdf_item, dict) and pdf_item.get("bytes"):
39
- with open(dest_path, "wb") as f:
40
- f.write(pdf_item["bytes"])
41
- # Fallback for specialized HF PDF objects
42
- elif hasattr(pdf_item, 'filename'):
43
- shutil.copy(pdf_item.filename, dest_path)
44
- else:
45
- print(f"⚠️ Could not find a valid path for document {i}")
46
- continue
47
-
48
- pdf_paths.append(dest_path)
49
- print(f"✅ Extracted: {filename}")
50
- except Exception as e:
51
- print(f"❌ Failed to extract doc_{i}: {e}")
52
-
53
- print(f"📄 Loading {len(pdf_paths)} documents into LangChain...")
54
- docs = []
55
- for p in pdf_paths:
56
- try:
57
- loader = PyPDFLoader(p)
58
- docs.extend(loader.load())
59
- except Exception as e:
60
- print(f"❌ PyPDFLoader error on {p}: {e}")
61
 
62
  if not docs:
63
- print("❌ CRITICAL: No text could be extracted from PDFs.")
64
  return
65
 
66
- # Chunking
67
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
68
- splits = splitter.split_documents(docs)
 
 
69
 
70
  print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
71
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
72
-
73
- Chroma.from_documents(
74
- documents=splits,
75
- embedding=embeddings,
76
- persist_directory=CHROMA_DIR
77
- )
78
- print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")
79
 
80
  if __name__ == "__main__":
81
  run_ingestion()
 
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
+ from langchain_community.document_loaders import Docx2txtLoader, TextLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
def run_ingestion():
    """Rebuild the knowledge base from scratch.

    Downloads the Hugging Face dataset at ``HF_DATASET_REPO``, copies each
    source file into ``KB_DIR``, loads it with an extension-appropriate
    loader, splits the text into chunks, and indexes the chunks into a
    fresh Chroma store at ``CHROMA_DIR``.
    """
    # Start from a clean slate so stale documents/vectors never survive a rebuild.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset files from {HF_DATASET_REPO}...")
    # This works for folders of files (txt, docx, etc.)
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    docs = []
    for i, row in enumerate(dataset):
        # HF folder-style datasets expose the file under a key named after
        # the builder ('docx', 'text', ...) — presumably one of these three;
        # verify against the actual dataset schema.
        src_path = _resolve_source_path(
            row.get("docx") or row.get("text") or row.get("file")
        )
        if not (src_path and os.path.exists(src_path)):
            # Robustness fix: report skipped rows instead of dropping them
            # silently (every other failure path here prints a message).
            print(f"⚠️ Row {i}: no usable file path found, skipping")
            continue

        ext = os.path.splitext(src_path)[1].lower()
        dest_path = os.path.join(KB_DIR, f"doc_{i}{ext}")
        shutil.copy(src_path, dest_path)

        # Load based on extension.
        try:
            if ext == ".docx":
                loader = Docx2txtLoader(dest_path)
            else:
                loader = TextLoader(dest_path, encoding="utf-8")
            docs.extend(loader.load())
            print(f"✅ Loaded: doc_{i}{ext}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"❌ Loader error on doc_{i}: {e}")

    if not docs:
        print("❌ CRITICAL: No documents were successfully loaded.")
        return

    # Process and index.
    splits = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    ).split_documents(docs)

    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")


def _resolve_source_path(file_feature):
    """Return the local filesystem path carried by a dataset file feature.

    The feature is either a dict holding a local ``path`` entry or a bare
    path string; returns None when it is absent or has an unknown shape.
    """
    if isinstance(file_feature, dict):
        return file_feature.get("path")
    if isinstance(file_feature, str):
        return file_feature
    return None


if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -1,41 +1,38 @@
 
1
  import os
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
6
 
7
  # 1. Initialize Embeddings
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
  # 2. Load Vector DB
11
- # Ensure CHROMA_DIR is exactly the same as in ingest.py
12
- if os.path.exists(CHROMA_DIR) and len(os.listdir(CHROMA_DIR)) > 0:
13
- vectordb = Chroma(
14
- persist_directory=CHROMA_DIR,
15
- embedding_function=embeddings
16
- )
17
  print(f"✅ Vector DB loaded from {CHROMA_DIR}")
18
  else:
19
- print(f"⚠️ Vector DB not found at {CHROMA_DIR}")
20
  vectordb = None
 
21
 
22
  # 3. LLM Pipeline
23
  qa_pipeline = pipeline(
24
- task="text-generation", # Changed back from text2text-generation
25
- model=LLM_MODEL,
26
  max_new_tokens=256,
27
- trust_remote_code=True # Added for better compatibility
28
  )
29
 
30
  def ask_rag_with_status(question: str):
31
  if vectordb is None:
32
- return "The knowledge base is not initialized. Please check build logs.", "ERROR"
33
 
34
  docs = vectordb.similarity_search(question, k=3)
35
  context = "\n\n".join(d.page_content for d in docs)
36
- prompt = f"Use the context to answer Gandhi related questions.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"
37
 
38
  result = qa_pipeline(prompt)
 
39
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()
40
-
41
  return answer, ["Context retrieved", "Answer generated"]
 
# rag.py
import os
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# 1. Embeddings — must be the same model ingest.py used to build the index.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load the vector DB persisted by ingest.py (CHROMA_DIR must match).
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print(f"✅ Vector DB loaded from {CHROMA_DIR}")
else:
    vectordb = None
    print("⚠️ Vector DB not found")

# 3. LLM pipeline.
# Fix: use the configured LLM_TASK (it was imported above but never used,
# leaving the task hard-coded to "text-generation"). Seq2seq models such as
# T5 need "text2text-generation", which is exactly what a config-driven
# task setting is for.
qa_pipeline = pipeline(
    task=LLM_TASK,
    model=LLM_MODEL,
    max_new_tokens=256,
    trust_remote_code=True,  # required by some custom model repos
)
26
 
27
def ask_rag_with_status(question: str):
    """Answer *question* using retrieval over the Chroma knowledge base.

    Returns a ``(answer, status)`` tuple. On success ``status`` is a list of
    completed pipeline steps; when the vector DB is unavailable it is the
    string ``"ERROR"``. NOTE(review): the two status shapes differ (list vs
    str) — kept as-is since callers may rely on it; confirm before unifying.
    """
    if vectordb is None:
        return "Knowledge base not initialized.", "ERROR"

    # Retrieve the three most similar chunks and stitch them into one prompt.
    retrieved = vectordb.similarity_search(question, k=3)
    context = "\n\n".join(chunk.page_content for chunk in retrieved)
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    generation = qa_pipeline(prompt)[0]["generated_text"]
    # Text-generation pipelines echo the prompt, so keep only the text that
    # follows the final "Answer:" marker.
    answer = generation.split("Answer:")[-1].strip()
    return answer, ["Context retrieved", "Answer generated"]
requirements.txt CHANGED
@@ -9,8 +9,7 @@ langchain-chroma
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
- pypdf
13
- pdfplumber
14
  transformers>=4.39.0
15
  huggingface_hub
16
  datasets
 
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
+ docx2txt # New: For .docx support
 
13
  transformers>=4.39.0
14
  huggingface_hub
15
  datasets