|
|
import os |
|
|
import shutil |
|
|
from huggingface_hub import hf_hub_download, list_repo_files |
|
|
from langchain_community.document_loaders import Docx2txtLoader |
|
|
from langchain_text_splitters import RecursiveCharacterTextSplitter |
|
|
from langchain_huggingface import HuggingFaceEmbeddings |
|
|
from langchain_chroma import Chroma |
|
|
from config import KB_DIR, HF_DATASET_REPO, EMBEDDING_MODEL, CHROMA_DIR, CHUNK_SIZE, CHUNK_OVERLAP, HF_TOKEN |
|
|
|
|
|
def run_ingestion():
    """Rebuild the local knowledge base and Chroma vector store from scratch.

    Downloads every ``.docx`` file from the Hugging Face dataset repository
    ``HF_DATASET_REPO``, extracts its text, splits the text into overlapping
    chunks, embeds the chunks with ``EMBEDDING_MODEL`` and persists the
    resulting vector index to ``CHROMA_DIR``. Any existing contents of
    ``KB_DIR`` and ``CHROMA_DIR`` are wiped first, so repeated runs always
    produce a fresh, consistent index.

    Returns:
        None. All failures inside the pipeline are caught, reported to
        stdout, and the function returns early.
    """
    # Start from a clean slate: remove previous downloads and any old index,
    # then recreate the empty directories.
    for path in (KB_DIR, CHROMA_DIR):
        if os.path.exists(path):
            shutil.rmtree(path)
        os.makedirs(path, exist_ok=True)

    print(f"⬇️ Listing files in repository: {HF_DATASET_REPO}...")
    try:
        all_files = list_repo_files(repo_id=HF_DATASET_REPO, repo_type="dataset", token=HF_TOKEN)
        docx_files = [f for f in all_files if f.lower().endswith(".docx")]

        if not docx_files:
            print("❌ Error: No .docx files found in the dataset repository.")
            return

        docs = []
        for file_name in docx_files:
            print(f"📄 Downloading {file_name}...")
            temp_path = hf_hub_download(
                repo_id=HF_DATASET_REPO,
                filename=file_name,
                repo_type="dataset",
                token=HF_TOKEN,
            )
            # hf_hub_download lands in the HF cache, not in KB_DIR; copy the
            # file so the knowledge-base folder holds a self-contained
            # snapshot of the source documents.
            local_docx = os.path.join(KB_DIR, os.path.basename(file_name))
            shutil.copy(temp_path, local_docx)

            loader = Docx2txtLoader(local_docx)
            docs.extend(loader.load())
            print(f"✅ Text extracted from: {file_name}")

        if not docs:
            print("❌ Error: Extracted document list is empty.")
            return

        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=CHUNK_SIZE,
            chunk_overlap=CHUNK_OVERLAP,
            add_start_index=True,  # record each chunk's offset in its source doc
        )
        splits = text_splitter.split_documents(docs)
        print(f"✂️ Split into {len(splits)} text chunks.")

        print(f"🧠 Generating embeddings with {EMBEDDING_MODEL}...")
        embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL)

        print(f"💾 Saving Vector Database to {CHROMA_DIR}...")
        # from_documents embeds and persists in one call; the returned store
        # is not needed here since we only build the on-disk index.
        Chroma.from_documents(
            documents=splits,
            embedding=embeddings,
            persist_directory=CHROMA_DIR,
        )
        print("✨ Knowledge base fully initialized and saved.")

    except Exception as e:
        # Top-level boundary for a CLI script: report and return instead of
        # dumping a traceback. NOTE(review): consider re-raising or
        # sys.exit(1) so automation can detect the failure.
        print(f"❌ CRITICAL INGESTION ERROR: {str(e)}")
|
|
|
|
|
# Script entry point: run the full ingestion pipeline when executed directly.
if __name__ == "__main__":
    run_ingestion()