"""Knowledge-base ingestion script.

Downloads .docx files from a Hugging Face dataset repository, extracts their
text, splits it into chunks, and persists a Chroma vector store for RAG.
"""
import os
import shutil
from huggingface_hub import hf_hub_download, list_repo_files
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN
def _reset_directories():
    """Wipe and recreate KB_DIR and CHROMA_DIR so every run starts from a clean slate."""
    for path in (KB_DIR, CHROMA_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)


def _download_and_load(docx_files):
    """Download each .docx from the dataset repo into KB_DIR and extract its text.

    Args:
        docx_files: repo-relative paths of the .docx files to fetch.

    Returns:
        A list of LangChain ``Document`` objects, one or more per file.
    """
    docs = []
    for file_name in docx_files:
        print(f"📂 Downloading {file_name}...")
        # hf_hub_download lands in the HF cache; copy into KB_DIR so the
        # knowledge base lives at a predictable absolute path.
        temp_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=file_name,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
        shutil.copy(temp_path, local_docx)
        # Docx2txtLoader extracts plain text only (embedded images are skipped).
        docs.extend(Docx2txtLoader(local_docx).load())
        print(f"✅ Text extracted from: {file_name}")
    return docs


def _build_vector_store(docs):
    """Split documents into chunks, embed them, and persist the Chroma store."""
    # Chunk sizes are tuned for RAG context windows; start indices are kept
    # so retrieved chunks can be traced back to their source offset.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
    )
    splits = text_splitter.split_documents(docs)
    print(f"✂️ Split into {len(splits)} text chunks.")

    print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Persist to the directory specified in config (e.g. /app/chroma_db).
    print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print("✨ Knowledge base fully initialized and saved.")


def run_ingestion():
    """Rebuild the knowledge base end-to-end.

    Cleans KB_DIR and CHROMA_DIR, downloads every .docx from the configured
    Hugging Face dataset repository (bypassing ``load_dataset`` to avoid PDF
    parsing errors), extracts text, and persists a Chroma vector store.

    Raises:
        Exception: any failure during download/processing is logged and
            re-raised so the process exits non-zero instead of reporting
            success on a broken ingestion.
    """
    _reset_directories()
    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
    try:
        # Direct file download keeps only .docx content in the knowledge base.
        all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        docx_files = [f for f in all_files if f.lower().endswith(".docx")]
        if not docx_files:
            print("❌ Error: No .docx files found in the dataset repository.")
            return

        docs = _download_and_load(docx_files)
        if not docs:
            print("❌ Error: Extracted document list is empty.")
            return

        _build_vector_store(docs)
    except Exception as e:
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
        # Propagate: swallowing here made failed ingestion runs exit 0.
        raise
# Script entry point: rebuild the knowledge base when run directly.
if __name__ == "__main__":
    run_ingestion()