Zubaish committed on
Commit
d557fa1
·
1 Parent(s): 4ed3f0a
Files changed (3) hide show
  1. ingest.py +35 -54
  2. rag.py +11 -14
  3. requirements.txt +1 -2
ingest.py CHANGED
@@ -2,80 +2,61 @@
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
- from langchain_community.document_loaders import PyPDFLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
- from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
10
 
11
  def run_ingestion():
12
- # Clean and create directories
13
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
14
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
15
  os.makedirs(KB_DIR, exist_ok=True)
16
 
17
- print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
18
- # For PDF folders, we want to access the files directly
19
  dataset = load_dataset(HF_DATASET_REPO, split="train")
20
 
21
- pdf_paths = []
22
- # In PdfFolder, row['pdf'] is often a dictionary or a path object
23
  for i, row in enumerate(dataset):
24
- pdf_item = row.get("pdf")
 
 
25
 
26
- # Determine the filename
27
- filename = f"doc_{i}.pdf"
28
- dest_path = os.path.join(KB_DIR, filename)
29
-
30
- try:
31
- # Handle if pdf_item is a path string
32
- if isinstance(pdf_item, str) and os.path.exists(pdf_item):
33
- shutil.copy(pdf_item, dest_path)
34
- # Handle if pdf_item is a dictionary with a 'path' (Common in HF)
35
- elif isinstance(pdf_item, dict) and pdf_item.get("path"):
36
- shutil.copy(pdf_item["path"], dest_path)
37
- # Handle if pdf_item is a dictionary with 'bytes'
38
- elif isinstance(pdf_item, dict) and pdf_item.get("bytes"):
39
- with open(dest_path, "wb") as f:
40
- f.write(pdf_item["bytes"])
41
- # Fallback for specialized HF PDF objects
42
- elif hasattr(pdf_item, 'filename'):
43
- shutil.copy(pdf_item.filename, dest_path)
44
- else:
45
- print(f"⚠️ Could not find a valid path for document {i}")
46
- continue
47
-
48
- pdf_paths.append(dest_path)
49
- print(f"✅ Extracted: {filename}")
50
- except Exception as e:
51
- print(f"❌ Failed to extract doc_{i}: {e}")
52
-
53
- print(f"📄 Loading {len(pdf_paths)} documents into LangChain...")
54
- docs = []
55
- for p in pdf_paths:
56
- try:
57
- loader = PyPDFLoader(p)
58
- docs.extend(loader.load())
59
- except Exception as e:
60
- print(f"❌ PyPDFLoader error on {p}: {e}")
61
 
62
  if not docs:
63
- print("❌ CRITICAL: No text could be extracted from PDFs.")
64
  return
65
 
66
- # Chunking
67
- splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
68
- splits = splitter.split_documents(docs)
 
 
69
 
70
  print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
71
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
72
-
73
- Chroma.from_documents(
74
- documents=splits,
75
- embedding=embeddings,
76
- persist_directory=CHROMA_DIR
77
- )
78
- print(f"✅ Knowledge base initialized successfully at {CHROMA_DIR}")
79
 
80
  if __name__ == "__main__":
81
  run_ingestion()
 
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
+ from langchain_community.document_loaders import Docx2txtLoader, TextLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
def run_ingestion():
    """Rebuild the knowledge base from scratch.

    Downloads the Hugging Face dataset at ``HF_DATASET_REPO``, copies each
    source file into ``KB_DIR``, loads it with an extension-appropriate
    loader, splits the text into chunks, and indexes the chunks into a
    fresh Chroma store at ``CHROMA_DIR``.
    """
    # Start from a clean slate so stale documents/vectors never survive a rebuild.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset files from {HF_DATASET_REPO}...")
    # This works for folders of files (txt, docx, etc.)
    dataset = load_dataset(HF_DATASET_REPO, split="train")

    docs = []
    for i, row in enumerate(dataset):
        # HF folder-style datasets expose the file under a key named after
        # the builder ('docx', 'text', ...) — presumably one of these three;
        # verify against the actual dataset schema.
        src_path = _resolve_source_path(
            row.get("docx") or row.get("text") or row.get("file")
        )
        if not (src_path and os.path.exists(src_path)):
            # Robustness fix: report skipped rows instead of dropping them
            # silently (every other failure path here prints a message).
            print(f"⚠️ Row {i}: no usable file path found, skipping")
            continue

        ext = os.path.splitext(src_path)[1].lower()
        dest_path = os.path.join(KB_DIR, f"doc_{i}{ext}")
        shutil.copy(src_path, dest_path)

        # Load based on extension.
        try:
            if ext == ".docx":
                loader = Docx2txtLoader(dest_path)
            else:
                loader = TextLoader(dest_path, encoding="utf-8")
            docs.extend(loader.load())
            print(f"✅ Loaded: doc_{i}{ext}")
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole run.
            print(f"❌ Loader error on doc_{i}: {e}")

    if not docs:
        print("❌ CRITICAL: No documents were successfully loaded.")
        return

    # Process and index.
    splits = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
    ).split_documents(docs)

    print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")


def _resolve_source_path(file_feature):
    """Return the local filesystem path carried by a dataset file feature.

    The feature is either a dict holding a local ``path`` entry or a bare
    path string; returns None when it is absent or has an unknown shape.
    """
    if isinstance(file_feature, dict):
        return file_feature.get("path")
    if isinstance(file_feature, str):
        return file_feature
    return None


if __name__ == "__main__":
    run_ingestion()
rag.py CHANGED
@@ -1,41 +1,38 @@
 
1
  import os
2
  from transformers import pipeline
3
  from langchain_huggingface import HuggingFaceEmbeddings
4
  from langchain_chroma import Chroma
5
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
6
 
7
  # 1. Initialize Embeddings
8
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
9
 
10
  # 2. Load Vector DB
11
- # Ensure CHROMA_DIR is exactly the same as in ingest.py
12
- if os.path.exists(CHROMA_DIR) and len(os.listdir(CHROMA_DIR)) > 0:
13
- vectordb = Chroma(
14
- persist_directory=CHROMA_DIR,
15
- embedding_function=embeddings
16
- )
17
  print(f"✅ Vector DB loaded from {CHROMA_DIR}")
18
  else:
19
- print(f"⚠️ Vector DB not found at {CHROMA_DIR}")
20
  vectordb = None
 
21
 
22
  # 3. LLM Pipeline
23
  qa_pipeline = pipeline(
24
- task="text-generation", # Changed back from text2text-generation
25
- model=LLM_MODEL,
26
  max_new_tokens=256,
27
- trust_remote_code=True # Added for better compatibility
28
  )
29
 
30
  def ask_rag_with_status(question: str):
31
  if vectordb is None:
32
- return "The knowledge base is not initialized. Please check build logs.", "ERROR"
33
 
34
  docs = vectordb.similarity_search(question, k=3)
35
  context = "\n\n".join(d.page_content for d in docs)
36
- prompt = f"Use the context to answer Gandhi related questions.\nContext:\n{context}\n\nQuestion: {question}\n\nAnswer:"
37
 
38
  result = qa_pipeline(prompt)
 
39
  answer = result[0]["generated_text"].split("Answer:")[-1].strip()
40
-
41
  return answer, ["Context retrieved", "Answer generated"]
 
# rag.py
import os
from transformers import pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK

# 1. Embeddings — must be the same model ingest.py used to build the index.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# 2. Load the vector DB persisted by ingest.py (CHROMA_DIR must match).
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
    print(f"✅ Vector DB loaded from {CHROMA_DIR}")
else:
    vectordb = None
    print("⚠️ Vector DB not found")

# 3. LLM pipeline.
# Fix: use the configured LLM_TASK (it was imported above but never used,
# leaving the task hard-coded to "text-generation"). Seq2seq models such as
# T5 need "text2text-generation", which is exactly what a config-driven
# task setting is for.
qa_pipeline = pipeline(
    task=LLM_TASK,
    model=LLM_MODEL,
    max_new_tokens=256,
    trust_remote_code=True,  # required by some custom model repos
)
26
 
27
def ask_rag_with_status(question: str):
    """Answer *question* using retrieval over the Chroma knowledge base.

    Returns a ``(answer, status)`` tuple. On success ``status`` is a list of
    completed pipeline steps; when the vector DB is unavailable it is the
    string ``"ERROR"``. NOTE(review): the two status shapes differ (list vs
    str) — kept as-is since callers may rely on it; confirm before unifying.
    """
    if vectordb is None:
        return "Knowledge base not initialized.", "ERROR"

    # Retrieve the three most similar chunks and stitch them into one prompt.
    retrieved = vectordb.similarity_search(question, k=3)
    context = "\n\n".join(chunk.page_content for chunk in retrieved)
    prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"

    generation = qa_pipeline(prompt)[0]["generated_text"]
    # Text-generation pipelines echo the prompt, so keep only the text that
    # follows the final "Answer:" marker.
    answer = generation.split("Answer:")[-1].strip()
    return answer, ["Context retrieved", "Answer generated"]
requirements.txt CHANGED
@@ -9,8 +9,7 @@ langchain-chroma
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
- pypdf
13
- pdfplumber
14
  transformers>=4.39.0
15
  huggingface_hub
16
  datasets
 
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
+ docx2txt # New: For .docx support
 
13
  transformers>=4.39.0
14
  huggingface_hub
15
  datasets