AsyncRAG

Sleeping

App Files Files Community

Zubaish committed on Jan 28

Commit

f09a853

1 Parent(s): ce847a1

update

Browse files

Files changed (1) hide show

ingest.py +18 -15

ingest.py CHANGED Viewed

@@ -8,50 +8,53 @@ from langchain_chroma import Chroma
 from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
 def run_ingestion():
-    # Clean up previous runs
     if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
     print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
-    # Load normally without extra format arguments
     dataset = load_dataset(HF_DATASET_REPO, split="train")
     pdf_paths = []
     for i, row in enumerate(dataset):
-        # Hugging Face PDF folders usually store the decoded PDF in a column named 'pdf'
         pdf_feature = row.get("pdf")
-        # If it's a dict, it usually has a 'path' to the actual file on disk
-        # This is the most efficient way to get the file
-        if isinstance(pdf_feature, dict) and pdf_feature.get("path"):
             src_path = pdf_feature["path"]
             dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
             shutil.copy(src_path, dest_path)
             pdf_paths.append(dest_path)
         else:
-            # Fallback: if we can't find a path, skip or try bytes
-            print(f"⚠️ Could not find file path for row {i}, skipping.")
-    print(f"📄 Loading and splitting {len(pdf_paths)} documents...")
     docs = []
     for p in pdf_paths:
         try:
-            # Using pypdf-based loader
             loader = PyPDFLoader(p)
             docs.extend(loader.load())
         except Exception as e:
-            print(f"⚠️ Error reading {p}: {e}")
     if not docs:
-        print("❌ No documents were successfully loaded.")
         return
-    # Standard text splitting
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     splits = splitter.split_documents(docs)
-    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
     embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
     Chroma.from_documents(
@@ -59,7 +62,7 @@ def run_ingestion():
         embedding=embeddings,
         persist_directory=CHROMA_DIR
     )
-    print(f"✅ Ingestion complete. DB saved to {CHROMA_DIR}")
 if __name__ == "__main__":
     run_ingestion()

 from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
 def run_ingestion():
+    # 1. Clean Environment
     if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
     print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
     dataset = load_dataset(HF_DATASET_REPO, split="train")
     pdf_paths = []
     for i, row in enumerate(dataset):
         pdf_feature = row.get("pdf")
+        # Determine Source Path
+        # HF PdfFolder datasets store the local path in the 'path' key of the feature
+        src_path = None
+        if isinstance(pdf_feature, dict) and "path" in pdf_feature:
             src_path = pdf_feature["path"]
+        elif hasattr(pdf_feature, "filename"):
+            src_path = pdf_feature.filename
+        if src_path and os.path.exists(src_path):
             dest_path = os.path.join(KB_DIR, f"doc_{i}.pdf")
             shutil.copy(src_path, dest_path)
             pdf_paths.append(dest_path)
+            print(f"✅ Cached: doc_{i}.pdf")
         else:
+            print(f"⚠️ Could not resolve path for doc_{i}, skipping.")
+    # 2. Process Documents
+    print(f"📄 Processing {len(pdf_paths)} documents...")
     docs = []
     for p in pdf_paths:
         try:
             loader = PyPDFLoader(p)
             docs.extend(loader.load())
         except Exception as e:
+            print(f"❌ Error reading {p}: {e}")
     if not docs:
+        print("❌ CRITICAL: No documents were successfully loaded.")
         return
+    # 3. Chunk and Embed
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     splits = splitter.split_documents(docs)
+    print(f"🧠 Indexing {len(splits)} chunks...")
     embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
     Chroma.from_documents(
         embedding=embeddings,
         persist_directory=CHROMA_DIR
     )
+    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
 if __name__ == "__main__":
     run_ingestion()