Zubaish committed
Commit · 11f1809
Parent(s): 6d3d36d
update

ingest.py CHANGED
@@ -1,3 +1,4 @@
+# ingest.py
 import os
 import shutil
 from datasets import load_dataset
@@ -8,55 +9,73 @@ from langchain_chroma import Chroma
 from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
 
 def run_ingestion():
-    # Clean
+    # Clean and create directories
     if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
 
     print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
+    # For PDF folders, we want to access the files directly
     dataset = load_dataset(HF_DATASET_REPO, split="train")
 
     pdf_paths = []
+    # In PdfFolder, row['pdf'] is often a dictionary or a path object
     for i, row in enumerate(dataset):
-
-        src_path = None
+        pdf_item = row.get("pdf")
 
-        # …
-        …
-        …
+        # Determine the filename
+        filename = f"doc_{i}.pdf"
+        dest_path = os.path.join(KB_DIR, filename)
 
-        …
-        …
-        …
+        try:
+            # Handle if pdf_item is a path string
+            if isinstance(pdf_item, str) and os.path.exists(pdf_item):
+                shutil.copy(pdf_item, dest_path)
+            # Handle if pdf_item is a dictionary with a 'path' (Common in HF)
+            elif isinstance(pdf_item, dict) and pdf_item.get("path"):
+                shutil.copy(pdf_item["path"], dest_path)
+            # Handle if pdf_item is a dictionary with 'bytes'
+            elif isinstance(pdf_item, dict) and pdf_item.get("bytes"):
+                with open(dest_path, "wb") as f:
+                    f.write(pdf_item["bytes"])
+            # Fallback for specialized HF PDF objects
+            elif hasattr(pdf_item, 'filename'):
+                shutil.copy(pdf_item.filename, dest_path)
+            else:
+                print(f"⚠️ Could not find a valid path for document {i}")
+                continue
+
             pdf_paths.append(dest_path)
-            print(f"✅ …
+            print(f"✅ Extracted: {filename}")
+        except Exception as e:
+            print(f"❌ Failed to extract doc_{i}: {e}")
 
-    print(f"📄 …
+    print(f"📄 Loading {len(pdf_paths)} documents into LangChain...")
     docs = []
     for p in pdf_paths:
         try:
             loader = PyPDFLoader(p)
             docs.extend(loader.load())
         except Exception as e:
-            print(f"❌ …
+            print(f"❌ PyPDFLoader error on {p}: {e}")
 
     if not docs:
-        print("❌ …
+        print("❌ CRITICAL: No text could be extracted from PDFs.")
         return
 
+    # Chunking
     splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
     splits = splitter.split_documents(docs)
 
-    print(f"🧠 …
+    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
     embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
 
-    # This creates the physical folder and files
     Chroma.from_documents(
-        documents=splits,
-        embedding=embeddings,
+        documents=splits,
+        embedding=embeddings,
         persist_directory=CHROMA_DIR
     )
-    print("✅ …
+    print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")
 
 if __name__ == "__main__":
     run_ingestion()
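A note on the new extraction loop above: the isinstance branching is there because, depending on the datasets version and how the repo is laid out, row["pdf"] can arrive as a plain path string, a {"path": ..., "bytes": ...} dict, or a decoded PDF object. If the goal is only to get the raw .pdf files into KB_DIR, a simpler route worth considering is to skip feature decoding entirely and fetch the files with huggingface_hub. A minimal sketch, assuming the dataset repo stores *.pdf files directly; the helper name is ours, while snapshot_download and its parameters are standard huggingface_hub API:

# Sketch: fetch raw PDFs from the dataset repo without going through
# datasets feature decoding. Assumes *.pdf files exist in the repo.
from huggingface_hub import snapshot_download

from config import HF_DATASET_REPO, KB_DIR

def download_raw_pdfs() -> str:
    # repo_type="dataset" is required because HF_DATASET_REPO is a dataset
    # repo, not a model repo; allow_patterns limits the download to PDFs.
    return snapshot_download(
        repo_id=HF_DATASET_REPO,
        repo_type="dataset",
        allow_patterns=["*.pdf", "**/*.pdf"],
        local_dir=KB_DIR,
    )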
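On the indexing step: chunk_size=1000 with chunk_overlap=100 means adjacent chunks share 100 characters, so text that straddles a chunk boundary still appears intact in at least one chunk. Chroma.from_documents with a persist_directory writes the index to disk as a side effect, which is what lets rag.py reopen it later without re-embedding. A minimal sketch of that reopen-and-query path, assuming the same config values and that HuggingFaceEmbeddings is imported from langchain_huggingface (the diff does not show ingest.py's import lines for it):

# Sketch: reopen the persisted index and run a similarity search.
# Assumes EMBEDDING_MODEL and CHROMA_DIR match the ingestion-time values.
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

from config import CHROMA_DIR, EMBEDDING_MODEL

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

# k=3 and the query are illustrative placeholders.
for doc in vectordb.similarity_search("example question", k=3):
    print(doc.metadata.get("source"), doc.page_content[:80])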
rag.py CHANGED

@@ -20,7 +20,11 @@ else:
     vectordb = None
 
 # 3. LLM Pipeline
-qa_pipeline = pipeline( …
+qa_pipeline = pipeline(
+    task="text2text-generation",  # Fixed task type for T5 models
+    model=LLM_MODEL,
+    max_new_tokens=256
+)
 
 def ask_rag_with_status(question: str):
     if vectordb is None: