import os
import shutil
import traceback

from huggingface_hub import hf_hub_download, list_repo_files
from langchain_community.document_loaders import Docx2txtLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma

from config import (
    KB_DIR,
    HF_DATASET_REPO,
    EMBEDDING_MODEL,
    CHROMA_DIR,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
    HF_TOKEN,
)


def _reset_directories():
    """Delete and recreate the knowledge-base and vector-store directories.

    A stale ``KB_DIR`` or ``CHROMA_DIR`` from a previous run would pollute
    retrieval with outdated chunks, so both are wiped before ingestion.
    Paths come from config (e.g. /app/kb and /app/chroma_db).
    """
    if os.path.exists(KB_DIR):
        shutil.rmtree(KB_DIR)
    if os.path.exists(CHROMA_DIR):
        shutil.rmtree(CHROMA_DIR)
    os.makedirs(KB_DIR, exist_ok=True)
    os.makedirs(CHROMA_DIR, exist_ok=True)


def _download_and_load_docs():
    """Download every .docx file from the HF dataset repo and extract its text.

    Only ``.docx`` files are fetched (bypassing ``load_dataset`` to avoid PDF
    errors). Each file is copied from the HF cache into ``KB_DIR`` and loaded
    with ``Docx2txtLoader`` (which ignores embedded images).

    Returns:
        ``None`` if the repository contains no .docx files (an error message
        has already been printed), otherwise a list of LangChain ``Document``
        objects (possibly empty if extraction yielded nothing).
    """
    all_files = list_repo_files(
        repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN
    )
    docx_files = [f for f in all_files if f.lower().endswith(".docx")]
    if not docx_files:
        print("❌ Error: No .docx files found in the dataset repository.")
        return None

    docs = []
    for file_name in docx_files:
        print(f"📂 Downloading {file_name}...")
        # Download lands in the HF cache first; copy it to the predictable
        # KB_DIR so downstream tooling has a stable location.
        temp_path = hf_hub_download(
            repo_id=HF_DATASET_REPO,
            filename=file_name,
            repo_type="dataset",
            token=HF_TOKEN,
        )
        local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
        shutil.copy(temp_path, local_docx)

        docs.extend(Docx2txtLoader(local_docx).load())
        print(f"✅ Text extracted from: {file_name}")
    return docs


def _split_documents(docs):
    """Split documents into overlapping chunks sized for RAG context windows.

    Args:
        docs: list of LangChain ``Document`` objects.

    Returns:
        List of chunk ``Document`` objects, each carrying a start index.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=CHUNK_OVERLAP,
        add_start_index=True,
    )
    splits = text_splitter.split_documents(docs)
    print(f"✂️ Split into {len(splits)} text chunks.")
    return splits


def _build_vector_store(splits):
    """Embed the chunks and persist them as a Chroma database in CHROMA_DIR.

    Args:
        splits: list of chunk ``Document`` objects to embed and store.
    """
    print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
    embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

    print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
    Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=CHROMA_DIR,
    )
    print("✨ Knowledge base fully initialized and saved.")


def run_ingestion():
    """Rebuild the knowledge base end-to-end.

    Pipeline: wipe/recreate directories, download .docx files from the HF
    dataset repo, extract text, split into chunks, embed, and persist a
    Chroma vector store. Errors are reported (with traceback) rather than
    raised, so the script exits cleanly either way.
    """
    _reset_directories()

    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
    try:
        docs = _download_and_load_docs()
        if docs is None:
            # No .docx files in the repo; message already printed.
            return
        if not docs:
            print("❌ Error: Extracted document list is empty.")
            return

        splits = _split_documents(docs)
        _build_vector_store(splits)
    except Exception as e:
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
        # Original swallowed the traceback, making failures hard to
        # diagnose from logs; keep the message but also show where it broke.
        traceback.print_exc()


if __name__ == "__main__":
    run_ingestion()