File size: 3,237 Bytes
2194516
 
19be3af
069ee5c
d4bb434
6d3e4d2
d4bb434
19be3af
b713a11
6d3e4d2
2194516
 
 
 
 
 
 
6d3e4d2
2194516
6d3d36d
2194516
3ad751d
19be3af
2194516
 
19be3af
 
 
2194516
 
 
 
19be3af
e8fa82e
2194516
 
 
 
 
 
 
 
 
 
 
 
 
 
19be3af
2194516
1b7f800
19be3af
2194516
19be3af
6d3d36d
2194516
 
 
 
 
 
 
 
 
 
 
19be3af
2194516
 
 
 
 
 
 
 
 
19be3af
 
2194516
6d3e4d2
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
import shutil
import traceback

from huggingface_hub import hf_hub_download, list_repo_files
from langchain_chroma import Chroma
from langchain_community.document_loaders import Docx2txtLoader
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter

from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN

def _reset_directories():
    """Wipe and recreate the knowledge-base and vector-store directories.

    Ensures every ingestion run starts from a clean slate so stale documents
    or embeddings from a previous run cannot leak into the new index.
    Paths come from config (e.g. /app/kb and /app/chroma_db).
    """
    for path in (KB_DIR, CHROMA_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)


def _fetch_and_load(file_name):
    """Download one .docx file from the HF dataset repo and extract its text.

    Args:
        file_name: Repo-relative filename of the .docx to fetch.

    Returns:
        list: LangChain Document objects produced by Docx2txtLoader
        (images in the .docx are ignored by the loader).
    """
    print(f"📂 Downloading {file_name}...")
    # hf_hub_download saves into the HF cache; copy the file into KB_DIR so
    # downstream code has a predictable, stable path.
    temp_path = hf_hub_download(
        repo_id=HF_DATASET_REPO,
        filename=file_name,
        repo_type="dataset",
        token=HF_TOKEN,
    )
    local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
    shutil.copy(temp_path, local_docx)

    docs = Docx2txtLoader(local_docx).load()
    print(f"✅ Text extracted from: {file_name}")
    return docs


def _build_vector_store(docs):
    """Split documents into chunks, embed them, and persist a Chroma index.

    Args:
        docs: Non-empty list of LangChain Document objects.
    """
    # Chunk sizes are tuned in config for RAG context windows;
    # add_start_index records each chunk's offset for later citation.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
    )
    splits = text_splitter.split_documents(docs)
    print(f"✂️ Split into {len(splits)} text chunks.")

    print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    # Persist to the directory specified in config (e.g. /app/chroma_db).
    print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print("✨ Knowledge base fully initialized and saved.")


def run_ingestion():
    """Rebuild the Chroma vector store from .docx files in the HF dataset repo.

    Pipeline: clean local directories -> enumerate and download every .docx
    (files are listed directly instead of using load_dataset, which chokes on
    PDFs) -> extract text -> chunk -> embed -> persist to CHROMA_DIR.

    Errors are reported to stdout (with a full traceback) rather than raised,
    so a failed run never crashes a hosting container's startup sequence.
    """
    _reset_directories()

    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")

    try:
        # Only .docx files are ingested, keeping the knowledge base clean.
        all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        docx_files = [f for f in all_files if f.lower().endswith(".docx")]

        if not docx_files:
            print("❌ Error: No .docx files found in the dataset repository.")
            return

        docs = []
        for file_name in docx_files:
            docs.extend(_fetch_and_load(file_name))

        if not docs:
            print("❌ Error: Extracted document list is empty.")
            return

        _build_vector_store(docs)

    except Exception as e:
        # Print the full stack trace: "CRITICAL" failures in a batch script
        # are undebuggable from the message string alone.
        traceback.print_exc()
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")

# Entry point: allow running this module directly as a one-shot ingestion job.
if __name__ == "__main__":
    run_ingestion()