AsyncRAG

Sleeping

App Files Community

Zubaish committed on Jan 28

Commit

1b7f800

1 Parent(s): 069ee5c

update

Browse files

Files changed (2) hide show

ingest.py +23 -21
rag.py +8 -10

ingest.py CHANGED Viewed

@@ -1,3 +1,4 @@
 import os
 import shutil
 from datasets import load_dataset
@@ -12,45 +13,46 @@ def run_ingestion():
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
-    print(f"⬇️ Loading dataset from {HF_DATASET_REPO}...")
-    # decode=False is the key to preventing the pdfplumber error
-    ds = load_dataset(HF_DATASET_REPO, split="train", decode=False)
     docs = []
-    for i, row in enumerate(ds):
-        # The 'pdf' or 'docx' column in a folder dataset contains a dict with 'path'
-        file_data = row.get("pdf") or row.get("docx") or row.get("file")
-        if isinstance(file_data, dict) and file_data.get("path"):
-            src_path = file_data["path"]
             ext = os.path.splitext(src_path)[1].lower()
-            # We only want to process .docx files now
             if ext == ".docx":
                 dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
                 shutil.copy(src_path, dest_path)
                 try:
                     loader = Docx2txtLoader(dest_path)
                     docs.extend(loader.load())
-                    print(f"✅ Loaded .docx: doc_{i}")
                 except Exception as e:
-                    print(f"❌ Error loading doc_{i}: {e}")
         else:
-            print(f"⏭️ Skipping non-docx or incompatible row {i}")
     if not docs:
-        print("❌ No .docx documents were loaded.")
         return
     splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
-    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
-    Chroma.from_documents(
-        documents=splits,
-        embedding=embeddings,
-        persist_directory=CHROMA_DIR
-    )
-    print(f"✅ Knowledge base initialized at {CHROMA_DIR}")
 if __name__ == "__main__":
     run_ingestion()

+# ingest.py
 import os
 import shutil
 from datasets import load_dataset
     if os.path.exists(CHROMA_DIR): shutil.rmtree(CHROMA_DIR)
     os.makedirs(KB_DIR, exist_ok=True)
+    print(f"⬇️ Loading raw files from {HF_DATASET_REPO}...")
+    # We load only the file paths to avoid the specialized PDF decoder errors
+    # This works for any file extension in your repo
+    dataset = load_dataset(HF_DATASET_REPO, split="train", ignore_verifications=True)
     docs = []
+    for i, row in enumerate(dataset):
+        # In a folder dataset, the 'file' or extension-named column contains path info
+        file_item = row.get("docx") or row.get("file") or row.get("pdf")
+        src_path = None
+        if isinstance(file_item, dict): src_path = file_item.get("path")
+        elif isinstance(file_item, str): src_path = file_item
+        if src_path and os.path.exists(src_path):
             ext = os.path.splitext(src_path)[1].lower()
             if ext == ".docx":
                 dest_path = os.path.join(KB_DIR, f"doc_{i}.docx")
                 shutil.copy(src_path, dest_path)
                 try:
                     loader = Docx2txtLoader(dest_path)
                     docs.extend(loader.load())
+                    print(f"✅ Extracted docx: doc_{i}")
                 except Exception as e:
+                    print(f"❌ Error parsing doc_{i}: {e}")
         else:
+            print(f"⏭️ Skipping non-docx or missing path at row {i}")
     if not docs:
+        print("❌ CRITICAL: No .docx documents were loaded.")
         return
+    # Chunk and Embed
     splits = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100).split_documents(docs)
+    print(f"🧠 Indexing {len(splits)} chunks...")
+    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+    Chroma.from_documents(documents=splits, embedding=embeddings, persist_directory=CHROMA_DIR)
+    print(f"✅ Knowledge base initialized successfully.")
 if __name__ == "__main__":
     run_ingestion()

rag.py CHANGED Viewed

@@ -7,17 +7,20 @@ from config import EMBEDDING_MODEL, LLM_MODEL, CHROMA_DIR
 embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
 if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
     vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
 else:
     vectordb = None
-# Using 'text-generation' is safer across different Hub environments
 qa_pipeline = pipeline(
-    "text-generation",
     model=LLM_MODEL,
     max_new_tokens=256,
-    trust_remote_code=True
 )
 def ask_rag_with_status(question: str):
@@ -26,13 +29,8 @@ def ask_rag_with_status(question: str):
     docs = vectordb.similarity_search(question, k=3)
     context = "\n\n".join(d.page_content for d in docs)
-    # Simple prompt for Flan-T5
-    prompt = f"Answer the question using the context.\nContext: {context}\nQuestion: {question}\nAnswer:"
     result = qa_pipeline(prompt)
-    # Flan-T5 often repeats the prompt, so we clean it
-    out = result[0]["generated_text"]
-    answer = out.split("Answer:")[-1].strip()
     return answer, ["Success"]

 embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
+# Load database created in build phase
 if os.path.exists(CHROMA_DIR) and os.listdir(CHROMA_DIR):
     vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)
+    print("✅ Vector DB ready")
 else:
     vectordb = None
+    print("⚠️ Vector DB missing")
+# Use generic text-generation for broadest compatibility
 qa_pipeline = pipeline(
+    task="text-generation",
     model=LLM_MODEL,
     max_new_tokens=256,
+    trust_remote_code=True
 )
 def ask_rag_with_status(question: str):
     docs = vectordb.similarity_search(question, k=3)
     context = "\n\n".join(d.page_content for d in docs)
+    prompt = f"Using the context, answer correctly.\n\nContext: {context}\n\nQuestion: {question}\n\nAnswer:"
     result = qa_pipeline(prompt)
+    answer = result[0]["generated_text"].split("Answer:")[-1].strip()
     return answer, ["Success"]