Zubaish committed on
Commit
069ee5c
·
1 Parent(s): d557fa1
Files changed (3) hide show
  1. ingest.py +29 -35
  2. rag.py +12 -12
  3. requirements.txt +2 -1
ingest.py CHANGED
@@ -1,61 +1,55 @@
1
- # ingest.py
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
- from langchain_community.document_loaders import Docx2txtLoader, TextLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
- from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
  def run_ingestion():
12
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
13
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
 
16
- print(f"⬇️ Loading dataset files from {HF_DATASET_REPO}...")
17
- # This works for folders of files (txt, docx, etc.)
18
- dataset = load_dataset(HF_DATASET_REPO, split="train")
19
 
20
  docs = []
21
- for i, row in enumerate(dataset):
22
- # Hugging Face provides the local path to the file in the feature dictionary
23
- # The key is usually the file extension (e.g., 'docx' or 'text')
24
- file_feature = row.get("docx") or row.get("text") or row.get("file")
25
 
26
- src_path = None
27
- if isinstance(file_feature, dict): src_path = file_feature.get("path")
28
- elif isinstance(file_feature, str): src_path = file_feature # If it's just a path string
29
-
30
- if src_path and os.path.exists(src_path):
31
  ext = os.path.splitext(src_path)[1].lower()
32
- dest_path = os.path.join(KB_DIR, f"doc_{i}{ext}")
33
- shutil.copy(src_path, dest_path)
34
 
35
- # Load based on extension
36
- try:
37
- if ext == ".docx":
 
 
38
  loader = Docx2txtLoader(dest_path)
39
- else:
40
- loader = TextLoader(dest_path, encoding="utf-8")
41
- docs.extend(loader.load())
42
- print(f" Loaded: doc_{i}{ext}")
43
- except Exception as e:
44
- print(f" Loader error on doc_{i}: {e}")
45
 
46
  if not docs:
47
- print("❌ CRITICAL: No documents were successfully loaded.")
48
  return
49
 
50
- # Process and Index
51
- splits = RecursiveCharacterTextSplitter(
52
- chunk_size=CHUNK_SIZE,
53
- chunk_overlap=CHUNK_OVERLAP
54
- ).split_documents(docs)
55
-
56
- print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
57
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
58
- Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
 
 
 
 
 
59
  print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
60
 
61
  if __name__ == "__main__":
 
 
1
  import os
2
  import shutil
3
  from datasets import load_dataset
4
+ from langchain_community.document_loaders import Docx2txtLoader
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_chroma import Chroma
8
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
def run_ingestion(chunk_size=1000, chunk_overlap=100):
    """Rebuild the knowledge base from the Hugging Face dataset repo.

    Wipes KB_DIR and CHROMA_DIR, downloads the dataset, copies every .docx
    file into KB_DIR, splits the loaded documents into overlapping chunks,
    and indexes them into a fresh Chroma collection at CHROMA_DIR.

    Args:
        chunk_size: Maximum characters per chunk (default keeps the
            previously hard-coded value).
        chunk_overlap: Characters of overlap between consecutive chunks
            (default keeps the previously hard-coded value).
    """
    # Start from a clean slate so stale documents/embeddings never linger.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    # decode=False is the key to preventing the pdfplumber error
    ds = load_dataset(HF_DATASET_REPO, split="train", decode=False)

    docs = []
    for i, row in enumerate(ds):
        # The 'pdf' or 'docx' column in a folder dataset contains a dict with 'path'
        file_data = row.get("pdf") or row.get("docx") or row.get("file")
        src_path = file_data.get("path") if isinstance(file_data, dict) else None

        # Only existing .docx files are processed; the exists() guard keeps
        # shutil.copy from raising on a dangling path.
        if src_path and os.path.exists(src_path) \
                and os.path.splitext(src_path)[1].lower() == ".docx":
            dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
            shutil.copy(src_path, dest_path)
            try:
                loader = Docx2txtLoader(dest_path)
                docs.extend(loader.load())
                print(f"✅ Loaded .docx: doc_{i}")
            except Exception as e:
                # Best-effort: one unreadable file should not abort ingestion.
                print(f" Error loading doc_{i}: {e}")
        else:
            print(f"⏭️ Skipping non-docx or incompatible row {i}")

    if not docs:
        print("❌ No .docx documents were loaded.")
        return

    splits = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
54
 
55
  if __name__ == "__main__":
rag.py CHANGED
@@ -3,25 +3,21 @@ import os
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
7
 
8
- # 1. Initialize Embeddings
9
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
10
 
11
- # 2. Load Vector DB
12
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
13
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
14
- print(f"✅ Vector DB loaded from {CHROMA_DIR}")
15
  else:
16
  vectordb = None
17
- print("⚠️ Vector DB not found")
18
 
19
- # 3. LLM Pipeline
20
  qa_pipeline = pipeline(
21
- task="text-generation",
22
  model=LLM_MODEL,
23
  max_new_tokens=256,
24
- trust_remote_code=True # Vital for T5 compatibility
25
  )
26
 
27
  def ask_rag_with_status(question: str):
@@ -30,9 +26,13 @@ def ask_rag_with_status(question: str):
30
 
31
  docs = vectordb.similarity_search(question, k=3)
32
  context = "\n\n".join(d.page_content for d in docs)
33
- prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
 
 
34
 
35
  result = qa_pipeline(prompt)
36
- # Correctly parse Seq2Seq output
37
- answer = result[0]["generated_text"].split("Answer:")[-1].strip()
38
- return answer, ["Context retrieved", "Answer generated"]
 
 
 
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
+ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
 
 
8
# Embedding model shared by indexing and query-time retrieval.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the persisted Chroma collection when one exists; otherwise leave
# vectordb as None so callers can detect that ingestion has not run yet.
vectordb = None
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
 
14
 
15
# NOTE(review): the prompt code below is written for Flan-T5, a seq2seq
# model, but the "text-generation" pipeline task only supports causal LMs
# and raises for T5-family checkpoints. Omitting the task lets transformers
# infer the correct one (e.g. "text2text-generation") from the model
# config — TODO confirm against the actual value of LLM_MODEL.
qa_pipeline = pipeline(
    model=LLM_MODEL,
    max_new_tokens=256,
    trust_remote_code=True,
)
22
 
23
  def ask_rag_with_status(question: str):
 
26
 
27
  docs = vectordb.similarity_search(question, k=3)
28
  context = "\n\n".join(d.page_content for d in docs)
29
+
30
+ # Simple prompt for Flan-T5
31
+ prompt = f"Answer the question using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
32
 
33
  result = qa_pipeline(prompt)
34
+ # Flan-T5 often repeats the prompt, so we clean it
35
+ out = result[0]["generated_text"]
36
+ answer = out.split("Answer:")[-1].strip()
37
+
38
+ return answer, ["Success"]
requirements.txt CHANGED
@@ -9,7 +9,8 @@ langchain-chroma
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
- docx2txt # New: For .docx support
 
13
  transformers>=4.39.0
14
  huggingface_hub
15
  datasets
 
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
+ docx2txt
13
+ pdfplumber
14
  transformers>=4.39.0
15
  huggingface_hub
16
  datasets