# chatbot_nihe/src/indexing/build_index.py
# Auto Deploy Script — auto deploy from local machine (commit f9b0dca)
import os
import json
import glob
import pickle
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
# Paths. PROJECT_ROOT may be overridden via the CHATBOT_NIHE_ROOT env var
# so the script is not tied to one machine's layout; the default preserves
# the original hard-coded location.
PROJECT_ROOT = os.environ.get("CHATBOT_NIHE_ROOT", "d:/NLP_KMH/Chatbot_NIHE_v2")
CHUNKS_DIR = os.path.join(PROJECT_ROOT, "data/chunks")
INDEX_DIR = os.path.join(PROJECT_ROOT, "data/index")

# Configuration
# A dedicated sentence-embedding model is usually better/faster for RAG
# than an LLM encoder; 'keepitreal/vietnamese-sbert' balances quality and
# speed for Vietnamese text.
MODEL_NAME = 'keepitreal/vietnamese-sbert'
def load_chunks():
    """Load every chunk JSON file from CHUNKS_DIR.

    Returns a list of chunk dicts. Files are processed in sorted order so
    the resulting index is deterministic across runs (glob.glob returns
    files in arbitrary, filesystem-dependent order); unreadable files are
    reported and skipped (best-effort).
    """
    files = sorted(glob.glob(os.path.join(CHUNKS_DIR, "*.json")))
    chunks = []
    print(f"Loading {len(files)} chunks...")
    for fpath in files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                chunks.append(json.load(f))
        except Exception as e:  # deliberate best-effort: skip bad files, keep going
            print(f"Error loading {fpath}: {e}")
    return chunks
def build_index():
    """Build a FAISS cosine-similarity index over all chunk files.

    Reads chunk dicts via load_chunks(), embeds their 'text' fields with a
    SentenceTransformer, L2-normalizes the vectors so inner product equals
    cosine similarity, and writes the FAISS index plus the chunk metadata
    (pickled list, aligned so vector i corresponds to chunks[i]) to
    INDEX_DIR.
    """
    os.makedirs(INDEX_DIR, exist_ok=True)

    # 1. Load data. Drop chunks without a 'text' key up front so the
    #    embedded texts and the pickled metadata stay index-aligned
    #    (previously a malformed chunk would raise KeyError).
    chunks = [c for c in load_chunks() if 'text' in c]
    if not chunks:
        print("No chunks found. Aborting.")
        return
    texts = [c['text'] for c in chunks]

    # 2. Initialize Model
    print(f"Loading embedding model: {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)

    # 3. Generate Embeddings — FAISS requires contiguous float32.
    print("Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings = np.asarray(embeddings, dtype='float32')
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")

    # 4. Create FAISS index. L2-normalizing the vectors makes inner
    #    product equivalent to cosine similarity.
    print("Building FAISS index...")
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)  # Inner Product (~Cosine Sim if normalized)
    index.add(embeddings)

    # 5. Save index + per-vector metadata side by side.
    index_path = os.path.join(INDEX_DIR, "nihe_faiss.index")
    metadata_path = os.path.join(INDEX_DIR, "metadata.pkl")
    print(f"Saving index to {index_path}...")
    faiss.write_index(index, index_path)
    print(f"Saving metadata to {metadata_path}...")
    with open(metadata_path, 'wb') as f:
        pickle.dump(chunks, f)
    print("Indexing complete.")
# Script entry point: build the full index when run directly.
if __name__ == "__main__":
    build_index()