Zubaish committed
Commit · 2194516
Parent(s): e8fa82e
update
Browse files:
- download_models.py +9 -6
- ingest.py +54 -12
- requirements.txt +2 -1
download_models.py
CHANGED
@@ -1,10 +1,13 @@
-
+# download_models.py
+from transformers import AutoModelForCausalLM, AutoTokenizer
 from langchain_huggingface import HuggingFaceEmbeddings
-from config import EMBEDDING_MODEL, LLM_MODEL
+from config import EMBEDDING_MODEL, LLM_MODEL
 
-print("⏳
-# Cache Embedding Model
+print("⏳ Downloading Embedding Model...")
 HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
-
-
+
+print(f"⏳ Downloading LLM: {LLM_MODEL}...")
+# Direct download to cache
+AutoTokenizer.from_pretrained(LLM_MODEL)
+AutoModelForCausalLM.from_pretrained(LLM_MODEL)
 print("✅ Models cached successfully")
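
Pre-fetching both models at build time means the runtime container can load them entirely from the local Hugging Face cache, with no network access at startup. A minimal sketch of the matching runtime load, assuming the same config module; the pipeline wrapper and max_new_tokens value are illustrative assumptions, not part of this commit:

# runtime-load sketch — illustrative only, not part of this commit
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_huggingface import HuggingFaceEmbeddings
from config import EMBEDDING_MODEL, LLM_MODEL

# Both calls resolve from the cache populated by download_models.py,
# so no network round-trip is needed when the app starts.
tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
model = AutoModelForCausalLM.from_pretrained(LLM_MODEL)
llm = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)

embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
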
ingest.py
CHANGED
@@ -1,4 +1,5 @@
-import os
+import os
+import shutil
 from huggingface_hub import hf_hub_download, list_repo_files
 from langchain_community.document_loaders import Docx2txtLoader
 from langchain_text_splitters import RecursiveCharacterTextSplitter
@@ -7,34 +8,75 @@ from langchain_chroma import Chroma
 from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
 
 def run_ingestion():
-
-
+    # 1. Environment Cleanup & Setup
+    # Using absolute paths from config (e.g., /app/kb and /app/chroma_db)
+    if os.path.exists(KB_DIR):
+        shutil.rmtree(KB_DIR)
+    if os.path.exists(CHROMA_DIR):
+        shutil.rmtree(CHROMA_DIR)
+
     os.makedirs(KB_DIR, exist_ok=True)
+    os.makedirs(CHROMA_DIR, exist_ok=True)
 
-    print(f"⬇️
+    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
 
     try:
+        # 2. Direct File Download (Bypassing load_dataset to avoid PDF errors)
+        # This only fetches .docx files to keep your Gandhi ji knowledge base clean
         all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
         docx_files = [f for f in all_files if f.lower().endswith(".docx")]
 
+        if not docx_files:
+            print("❌ Error: No .docx files found in the dataset repository.")
+            return
+
         docs = []
         for file_name in docx_files:
-
-
+            print(f"📂 Downloading {file_name}...")
+            # Download to HF cache first
+            temp_path = hf_hub_download(
+                repo_id=HF_DATASET_REPO,
+                filename=file_name,
+                repo_type="dataset",
+                token=HF_TOKEN
+            )
+            # Copy to our predictable /app/kb directory
+            local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
+            shutil.copy(temp_path, local_docx)
+
+            # 3. Load text from Docx (ignores images automatically)
+            loader = Docx2txtLoader(local_docx)
             docs.extend(loader.load())
-            print(f"✅
+            print(f"✅ Text extracted from: {file_name}")
 
         if not docs:
-            print("❌
+            print("❌ Error: Extracted document list is empty.")
             return
 
-
+        # 4. Text Splitting (Optimized for RAG context windows)
+        text_splitter = RecursiveCharacterTextSplitter(
+            chunk_size=CHUNK_SIZE,
+            chunk_overlap=CHUNK_OVERLAP,
+            add_start_index=True
+        )
+        splits = text_splitter.split_documents(docs)
+        print(f"✂️ Split into {len(splits)} text chunks.")
+
+        # 5. Embedding & Vector Store Creation
+        print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
         embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
-
-
+
+        # Save to the persistent directory specified in config (/app/chroma_db)
+        print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
+        Chroma.from_documents(
+            documents=splits,
+            embedding=embeddings,
+            persist_directory=CHROMA_DIR
+        )
+        print(f"✨ Knowledge base fully initialized and saved.")
 
     except Exception as e:
-        print(f"❌
+        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
 
 if __name__ == "__main__":
     run_ingestion()
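
Once run_ingestion() completes, the vector store persisted to CHROMA_DIR can be reopened by the retrieval side of the app. A minimal sketch of that consumer, assuming the same config values; the query string and k=3 are illustrative choices, not part of this commit:

# retrieval sketch — illustrative only, not part of this commit
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import CHROMA_DIR, EMBEDDING_MODEL

# The embedding model must match the one used at ingestion time,
# otherwise query vectors and stored vectors live in different spaces.
embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)
# Reopen the store that ingest.py wrote to the persistent directory
vectordb = Chroma(persist_directory=CHROMA_DIR, embedding_function=embeddings)

# Fetch the chunks most similar to a question (k is an arbitrary choice here)
for doc in vectordb.similarity_search("What did Gandhi say about truth?", k=3):
    print(doc.metadata.get("source"), doc.page_content[:80])
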
requirements.txt
CHANGED
@@ -10,8 +10,9 @@ langchain-text-splitters==0.2.4
 chromadb==0.5.5
 sentence-transformers
 docx2txt
-pdfplumber
 transformers>=4.39.0
+accelerate  # Added for Qwen support
+bitsandbytes  # Added for memory efficiency
 huggingface_hub
 datasets
 torch
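
The two new dependencies complement each other: accelerate enables device_map="auto" placement of model layers across available devices, and bitsandbytes enables loading quantized weights. This commit only adds the packages; a minimal sketch of the kind of memory-efficient 4-bit load they make possible, with illustrative settings that are not part of this commit:

# quantized-load sketch — illustrative only, not part of this commit
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from config import LLM_MODEL

# bitsandbytes: 4-bit NF4 storage cuts weight memory roughly 4x vs fp16
# (requires a CUDA GPU)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL)
# accelerate: device_map="auto" spreads layers over GPU and CPU memory
model = AutoModelForCausalLM.from_pretrained(
    LLM_MODEL,
    quantization_config=bnb_config,
    device_map="auto",
)
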