Zubaish committed on
Commit
1b7f800
·
1 Parent(s): 069ee5c
Files changed (2) hide show
  1. ingest.py +23 -21
  2. rag.py +8 -10
ingest.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import os
2
  import shutil
3
  from datasets import load_dataset
@@ -12,45 +13,46 @@ def run_ingestion():
12
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
13
  os.makedirs(KB_DIR, exist_ok=True)
14
 
15
- print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
16
- # decode=False is the key to preventing the pdfplumber error
17
- ds = load_dataset(HF_DATASET_REPO, split="train", decode=False)
 
 
18
 
19
  docs = []
20
- for i, row in enumerate(ds):
21
- # The 'pdf' or 'docx' column in a folder dataset contains a dict with 'path'
22
- file_data = row.get("pdf") or row.get("docx") or row.get("file")
23
 
24
- if isinstance(file_data, dict) and file_data.get("path"):
25
- src_path = file_data["path"]
 
 
 
26
  ext = os.path.splitext(src_path)[1].lower()
27
-
28
- # We only want to process .docx files now
29
  if ext == ".docx":
30
  dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
31
  shutil.copy(src_path, dest_path)
32
  try:
33
  loader = Docx2txtLoader(dest_path)
34
  docs.extend(loader.load())
35
- print(f"✅ Loaded .docx: doc_{i}")
36
  except Exception as e:
37
- print(f"❌ Error loading doc_{i}: {e}")
38
  else:
39
- print(f"⏭️ Skipping non-docx or incompatible row {i}")
40
 
41
  if not docs:
42
- print("❌ No .docx documents were loaded.")
43
  return
44
 
 
45
  splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
46
- embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
47
 
48
- Chroma.from_documents(
49
- documents=splits,
50
- embedding=embeddings,
51
- persist_directory=CHROMA_DIR
52
- )
53
- print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
54
 
55
  if __name__ == "__main__":
56
  run_ingestion()
 
1
+ # ingest.py
2
  import os
3
  import shutil
4
  from datasets import load_dataset
 
13
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
 
16
+ print(f"⬇️ Loading raw files from {HF_DATASET_REPO}...")
17
+
18
+ # We load only the file paths to avoid the specialized PDF decoder errors
19
+ # This works for any file extension in your repo
20
+ dataset = load_dataset(HF_DATASET_REPO, split="train", ignore_verifications=True)
21
 
22
  docs = []
23
+ for i, row in enumerate(dataset):
24
+ # In a folder dataset, the 'file' or extension-named column contains path info
25
+ file_item = row.get("docx") or row.get("file") or row.get("pdf")
26
 
27
+ src_path = None
28
+ if isinstance(file_item, dict): src_path = file_item.get("path")
29
+ elif isinstance(file_item, str): src_path = file_item
30
+
31
+ if src_path and os.path.exists(src_path):
32
  ext = os.path.splitext(src_path)[1].lower()
 
 
33
  if ext == ".docx":
34
  dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
35
  shutil.copy(src_path, dest_path)
36
  try:
37
  loader = Docx2txtLoader(dest_path)
38
  docs.extend(loader.load())
39
+ print(f"✅ Extracted docx: doc_{i}")
40
  except Exception as e:
41
+ print(f"❌ Error parsing doc_{i}: {e}")
42
  else:
43
+ print(f"⏭️ Skipping non-docx or missing path at row {i}")
44
 
45
  if not docs:
46
+ print("❌ CRITICAL: No .docx documents were loaded.")
47
  return
48
 
49
+ # Chunk and Embed
50
  splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
51
+ print(f"🧠 Indexing {len(splits)} chunks...")
52
 
53
+ embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
54
+ Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
55
+ print(f"✅ Knowledge base initialized successfully.")
 
 
 
56
 
57
  if __name__ == "__main__":
58
  run_ingestion()
rag.py CHANGED
@@ -7,17 +7,20 @@ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
 
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
 
10
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
11
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
 
12
  else:
13
  vectordb = None
 
14
 
15
- # Using 'text-generation' is safer across different Hub environments
16
  qa_pipeline = pipeline(
17
- "text-generation",
18
  model=LLM_MODEL,
19
  max_new_tokens=256,
20
- trust_remote_code=True
21
  )
22
 
23
  def ask_rag_with_status(question: str):
@@ -26,13 +29,8 @@ def ask_rag_with_status(question: str):
26
 
27
  docs = vectordb.similarity_search(question, k=3)
28
  context = "\n\n".join(d.page_content for d in docs)
29
-
30
- # Simple prompt for Flan-T5
31
- prompt = f"Answer the question using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
32
 
33
  result = qa_pipeline(prompt)
34
- # Flan-T5 often repeats the prompt, so we clean it
35
- out = result[0]["generated_text"]
36
- answer = out.split("Answer:")[-1].strip()
37
-
38
  return answer, ["Success"]
 
7
 
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
+ # Load database created in build phase
11
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
12
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
13
+ print("✅ Vector DB ready")
14
  else:
15
  vectordb = None
16
+ print("⚠️ Vector DB missing")
17
 
18
+ # Use generic text-generation for broadest compatibility
19
  qa_pipeline = pipeline(
20
+ task="text-generation",
21
  model=LLM_MODEL,
22
  max_new_tokens=256,
23
+ trust_remote_code=True
24
  )
25
 
26
  def ask_rag_with_status(question: str):
 
29
 
30
  docs = vectordb.similarity_search(question, k=3)
31
  context = "\n\n".join(d.page_content for d in docs)
32
+ prompt = f"Using the context, answer correctly.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
 
 
33
 
34
  result = qa_pipeline(prompt)
35
+ answer = result[0]["generated_text"].split("Answer:")[-1].strip()
 
 
 
36
  return answer, ["Success"]