Zubaish committed on
Commit
9edda50
·
1 Parent(s): 1b7f800
Files changed (1) hide show
  1. ingest.py +34 -19
ingest.py CHANGED
@@ -6,53 +6,68 @@ from langchain_community.document_loaders import Docx2txtLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
- from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
10
 
11
def run_ingestion():
    """Rebuild the local knowledge base from the Hugging Face dataset repo.

    Downloads the dataset, copies every ``.docx`` file into ``KB_DIR``,
    extracts its text, chunks the text, and persists the embeddings in a
    Chroma store at ``CHROMA_DIR``. Any existing ``KB_DIR`` / ``CHROMA_DIR``
    contents are wiped first so the run is idempotent.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # Wipe previous state. ignore_errors=True handles "does not exist" and
    # avoids the race between an exists() check and the removal (EAFP).
    shutil.rmtree(KB_DIR, ignore_errors=True)
    shutil.rmtree(CHROMA_DIR, ignore_errors=True)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading raw files from {HF_DATASET_REPO}...")

    # We load only the file paths to avoid the specialized PDF decoder errors.
    # NOTE: `ignore_verifications=True` was removed from `datasets` (replaced
    # by `verification_mode`); passing it raises a ValueError, so load with
    # the standard arguments instead.
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    docs = []
    for i, row in enumerate(dataset):
        # In a folder dataset, the 'file' or extension-named column contains
        # the path info — either a dict with a 'path' key or a plain string.
        file_item = row.get("docx") or row.get("file") or row.get("pdf")

        src_path = None
        if isinstance(file_item, dict):
            src_path = file_item.get("path")
        elif isinstance(file_item, str):
            src_path = file_item

        if src_path and os.path.exists(src_path):
            ext = os.path.splitext(src_path)[1].lower()

            if ext == ".docx":
                dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
                shutil.copy(src_path, dest_path)

                try:
                    loader = Docx2txtLoader(dest_path)
                    docs.extend(loader.load())
                    print(f"✅ Extracted docx: doc_{i}")
                except Exception as e:
                    # Best-effort: one unparseable file must not abort the run.
                    print(f"❌ Error parsing doc_{i}: {e}")
        else:
            print(f"⏭️ Skipping non-docx or missing path at row {i}")

    if not docs:
        print("❌ CRITICAL: No .docx documents were loaded.")
        return

    # Chunk and Embed
    splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
    print(f"🧠 Indexing {len(splits)} chunks...")

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
    print(f"✅ Knowledge base initialized successfully.")
 
 
 
 
56
 
57
# Script entry point: rebuild the knowledge base when run directly.
if __name__ == "__main__":
    run_ingestion()
 
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
def run_ingestion():
    """Rebuild the local knowledge base from the Hugging Face dataset repo.

    Pipeline: (1) clean the working directories, (2) download the dataset
    and copy every ``.docx`` file into ``KB_DIR``, extracting its text,
    (3) chunk the text and persist embeddings in a Chroma store at
    ``CHROMA_DIR``. Existing state is wiped first, so the run is idempotent.

    Returns:
        None. Progress and errors are reported via ``print``.
    """
    # 1. Clean directories
    # ignore_errors=True handles "does not exist" and avoids the race
    # between an os.path.exists() check and the removal (EAFP over LBYL).
    shutil.rmtree(KB_DIR, ignore_errors=True)
    shutil.rmtree(CHROMA_DIR, ignore_errors=True)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading files from {HF_DATASET_REPO}...")

    # Use standard load without extra flags that cause ValueErrors
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    docs = []
    # Loop through the rows to find paths to files
    for i, row in enumerate(dataset):
        # We check common keys used by HF for file paths; the value may be
        # a dict carrying a 'path' key or a plain path string.
        file_info = row.get("docx") or row.get("file") or row.get("pdf")

        src_path = None
        if isinstance(file_info, dict):
            src_path = file_info.get("path")
        elif isinstance(file_info, str):
            src_path = file_info

        if src_path and os.path.exists(src_path):
            ext = os.path.splitext(src_path)[1].lower()

            # ONLY process .docx files to avoid the PDF error
            if ext == ".docx":
                dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
                shutil.copy(src_path, dest_path)

                try:
                    loader = Docx2txtLoader(dest_path)
                    docs.extend(loader.load())
                    print(f"✅ Successfully loaded: doc_{i}.docx")
                except Exception as e:
                    # Best-effort: one unparseable file must not abort the run.
                    print(f"❌ Loader error on doc_{i}: {e}")
            else:
                print(f"⏭️ Skipping non-docx file: {src_path}")

    if not docs:
        print("❌ CRITICAL: No .docx documents found. Ensure your dataset has .docx files.")
        return

    # 2. Chunking
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP
    )
    splits = splitter.split_documents(docs)

    # 3. Embedding and Storage
    print(f"🧠 Indexing {len(splits)} chunks...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR
    )
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
71
 
72
# Script entry point: rebuild the knowledge base when run directly.
if __name__ == "__main__":
    run_ingestion()