Zubaish committed on
Commit
069ee5c
·
1 Parent(s): d557fa1
Files changed (3) hide show
  1. ingest.py +29 -35
  2. rag.py +12 -12
  3. requirements.txt +2 -1
ingest.py CHANGED
@@ -1,61 +1,55 @@
1
- # ingest.py
2
  import os
3
  import shutil
4
  from datasets import load_dataset
5
- from langchain_community.document_loaders import Docx2txtLoader, TextLoader
6
  from langchain_text_splitters import RecursiveCharacterTextSplitter
7
  from langchain_huggingface import HuggingFaceEmbeddings
8
  from langchain_chroma import Chroma
9
- from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP
10
 
11
  def run_ingestion():
12
  if os.path.exists(KB_DIR): shutil.rmtree(KB_DIR)
13
  if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
14
  os.makedirs(KB_DIR, exist_ok=True)
15
 
16
- print(f"⬇️ Loading dataset files from {HF_DATASET_REPO}...")
17
- # This works for folders of files (txt, docx, etc.)
18
- dataset = load_dataset(HF_DATASET_REPO, split="train")
19
 
20
  docs = []
21
- for i, row in enumerate(dataset):
22
- # Hugging Face provides the local path to the file in the feature dictionary
23
- # The key is usually the file extension (e.g., 'docx' or 'text')
24
- file_feature = row.get("docx") or row.get("text") or row.get("file")
25
 
26
- src_path = None
27
- if isinstance(file_feature, dict): src_path = file_feature.get("path")
28
- elif isinstance(file_feature, str): src_path = file_feature # If it's just a path string
29
-
30
- if src_path and os.path.exists(src_path):
31
  ext = os.path.splitext(src_path)[1].lower()
32
- dest_path = os.path.join(KB_DIR, f"doc_{i}{ext}")
33
- shutil.copy(src_path, dest_path)
34
 
35
- # Load based on extension
36
- try:
37
- if ext == ".docx":
 
 
38
  loader = Docx2txtLoader(dest_path)
39
- else:
40
- loader = TextLoader(dest_path, encoding="utf-8")
41
- docs.extend(loader.load())
42
- print(f" Loaded: doc_{i}{ext}")
43
- except Exception as e:
44
- print(f" Loader error on doc_{i}: {e}")
45
 
46
  if not docs:
47
- print("❌ CRITICAL: No documents were successfully loaded.")
48
  return
49
 
50
- # Process and Index
51
- splits = RecursiveCharacterTextSplitter(
52
- chunk_size=CHUNK_SIZE,
53
- chunk_overlap=CHUNK_OVERLAP
54
- ).split_documents(docs)
55
-
56
- print(f"🧠 Indexing {len(splits)} chunks into ChromaDB...")
57
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
58
- Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
 
 
 
 
 
59
  print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
60
 
61
  if __name__ == "__main__":
 
 
1
  import os
2
  import shutil
3
  from datasets import load_dataset
4
+ from langchain_community.document_loaders import Docx2txtLoader
5
  from langchain_text_splitters import RecursiveCharacterTextSplitter
6
  from langchain_huggingface import HuggingFaceEmbeddings
7
  from langchain_chroma import Chroma
8
+ from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR
9
 
10
def run_ingestion(chunk_size=1000, chunk_overlap=100):
    """Rebuild the knowledge base from the Hugging Face dataset repo.

    Wipes KB_DIR and CHROMA_DIR, downloads the dataset, copies every .docx
    file into KB_DIR, splits the loaded documents into overlapping chunks,
    and indexes them into a fresh Chroma collection at CHROMA_DIR.

    Args:
        chunk_size: Maximum characters per chunk (default keeps the
            previously hard-coded value).
        chunk_overlap: Characters of overlap between consecutive chunks
            (default keeps the previously hard-coded value).
    """
    # Start from a clean slate so stale documents/embeddings never linger.
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)

    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
    # decode=False is the key to preventing the pdfplumber error
    ds = load_dataset(HF_DATASET_REPO, split="train", decode=False)

    docs = []
    for i, row in enumerate(ds):
        # The 'pdf' or 'docx' column in a folder dataset contains a dict with 'path'
        file_data = row.get("pdf") or row.get("docx") or row.get("file")
        src_path = file_data.get("path") if isinstance(file_data, dict) else None

        # Only existing .docx files are processed; the exists() guard keeps
        # shutil.copy from raising on a dangling path.
        if src_path and os.path.exists(src_path) \
                and os.path.splitext(src_path)[1].lower() == ".docx":
            dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
            shutil.copy(src_path, dest_path)
            try:
                loader = Docx2txtLoader(dest_path)
                docs.extend(loader.load())
                print(f"✅ Loaded .docx: doc_{i}")
            except Exception as e:
                # Best-effort: one unreadable file should not abort ingestion.
                print(f" Error loading doc_{i}: {e}")
        else:
            print(f"⏭️ Skipping non-docx or incompatible row {i}")

    if not docs:
        print("❌ No .docx documents were loaded.")
        return

    splits = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    ).split_documents(docs)

    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
54
 
55
  if __name__ == "__main__":
rag.py CHANGED
@@ -3,25 +3,21 @@ import os
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
- from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR, LLM_TASK
7
 
8
- # 1. Initialize Embeddings
9
  embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
10
 
11
- # 2. Load Vector DB
12
  if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
13
  vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
14
- print(f"✅ Vector DB loaded from {CHROMA_DIR}")
15
  else:
16
  vectordb = None
17
- print("⚠️ Vector DB not found")
18
 
19
- # 3. LLM Pipeline
20
  qa_pipeline = pipeline(
21
- task="text-generation",
22
  model=LLM_MODEL,
23
  max_new_tokens=256,
24
- trust_remote_code=True # Vital for T5 compatibility
25
  )
26
 
27
  def ask_rag_with_status(question: str):
@@ -30,9 +26,13 @@ def ask_rag_with_status(question: str):
30
 
31
  docs = vectordb.similarity_search(question, k=3)
32
  context = "\n\n".join(d.page_content for d in docs)
33
- prompt = f"Context: {context}\n\nQuestion: {question}\nAnswer:"
 
 
34
 
35
  result = qa_pipeline(prompt)
36
- # Correctly parse Seq2Seq output
37
- answer = result[0]["generated_text"].split("Answer:")[-1].strip()
38
- return answer, ["Context retrieved", "Answer generated"]
 
 
 
3
  from transformers import pipeline
4
  from langchain_huggingface import HuggingFaceEmbeddings
5
  from langchain_chroma import Chroma
6
+ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
7
 
 
8
# Embedding model shared by indexing and query-time retrieval.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

# Load the persisted Chroma collection when one exists; otherwise leave
# vectordb as None so callers can detect that ingestion has not run yet.
vectordb = None
if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
 
14
 
15
# NOTE(review): the prompt code below is written for Flan-T5, a seq2seq
# model, but the "text-generation" pipeline task only supports causal LMs
# and raises for T5-family checkpoints. Omitting the task lets transformers
# infer the correct one (e.g. "text2text-generation") from the model
# config — TODO confirm against the actual value of LLM_MODEL.
qa_pipeline = pipeline(
    model=LLM_MODEL,
    max_new_tokens=256,
    trust_remote_code=True,
)
22
 
23
  def ask_rag_with_status(question: str):
 
26
 
27
  docs = vectordb.similarity_search(question, k=3)
28
  context = "\n\n".join(d.page_content for d in docs)
29
+
30
+ # Simple prompt for Flan-T5
31
+ prompt = f"Answer the question using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
32
 
33
  result = qa_pipeline(prompt)
34
+ # Flan-T5 often repeats the prompt, so we clean it
35
+ out = result[0]["generated_text"]
36
+ answer = out.split("Answer:")[-1].strip()
37
+
38
+ return answer, ["Success"]
requirements.txt CHANGED
@@ -9,7 +9,8 @@ langchain-chroma
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
- docx2txt # New: For .docx support
 
13
  transformers>=4.39.0
14
  huggingface_hub
15
  datasets
 
9
  langchain-text-splitters==0.2.4
10
  chromadb==0.5.5
11
  sentence-transformers
12
+ docx2txt
13
+ pdfplumber
14
  transformers>=4.39.0
15
  huggingface_hub
16
  datasets