Spaces:
Sleeping
Sleeping
| import os | |
| import json | |
| import glob | |
| import pickle | |
| import numpy as np | |
| import faiss | |
| from sentence_transformers import SentenceTransformer | |
# Paths
# NOTE(review): hard-coded absolute Windows path — consider moving to an
# environment variable or config file so the script is portable.
PROJECT_ROOT = "d:/NLP_KMH/Chatbot_NIHE_v2"
CHUNKS_DIR = os.path.join(PROJECT_ROOT, "data/chunks")   # input: per-chunk *.json files
INDEX_DIR = os.path.join(PROJECT_ROOT, "data/index")     # output: FAISS index + metadata pickle

# Configuration
# Using a dedicated embedding model is better/faster for RAG than LLM encoders usually.
# 'keepitreal/vietnamese-sbert' is a good balance for VN text.
MODEL_NAME = 'keepitreal/vietnamese-sbert'
def load_chunks(chunks_dir=None):
    """Load all chunk JSON files from *chunks_dir*.

    Args:
        chunks_dir: Directory containing per-chunk ``*.json`` files.
            Defaults to the module-level ``CHUNKS_DIR``.

    Returns:
        list: Parsed JSON objects in sorted-filename order. Sorting makes
        the ordering deterministic, so FAISS index rows and the pickled
        metadata stay aligned across rebuilds (``glob`` order alone is
        filesystem-dependent).
    """
    if chunks_dir is None:
        chunks_dir = CHUNKS_DIR
    files = sorted(glob.glob(os.path.join(chunks_dir, "*.json")))
    chunks = []
    print(f"Loading {len(files)} chunks...")
    for fpath in files:
        try:
            with open(fpath, 'r', encoding='utf-8') as f:
                chunks.append(json.load(f))
        except Exception as e:
            # Best-effort: skip unreadable/corrupt files but report them.
            print(f"Error loading {fpath}: {e}")
    return chunks
def build_index():
    """Build a FAISS cosine-similarity index over all chunk texts.

    Reads chunk JSON files via ``load_chunks()``, embeds each chunk's
    ``'text'`` field with the SentenceTransformer ``MODEL_NAME``, and writes:

    - ``INDEX_DIR/nihe_faiss.index`` — FAISS index (inner product over
      L2-normalized vectors, i.e. cosine similarity).
    - ``INDEX_DIR/metadata.pkl`` — the chunk dicts, pickled in the same
      order as the index rows (row ``i`` corresponds to ``chunks[i]``).

    Aborts (returns ``None``) if no chunks are found.
    """
    # exist_ok avoids the exists()/makedirs() race of the check-then-create form.
    os.makedirs(INDEX_DIR, exist_ok=True)

    # 1. Load data
    chunks = load_chunks()
    if not chunks:
        print("No chunks found. Aborting.")
        return
    # Assumes every chunk dict has a 'text' key; a malformed chunk raises KeyError.
    texts = [c['text'] for c in chunks]

    # 2. Initialize model
    print(f"Loading embedding model: {MODEL_NAME}...")
    model = SentenceTransformer(MODEL_NAME)

    # 3. Generate embeddings — FAISS requires contiguous float32.
    # asarray with dtype converts in one step without the extra copy that
    # np.array(...).astype(...) makes.
    print("Generating embeddings...")
    embeddings = model.encode(texts, show_progress_bar=True)
    embeddings = np.asarray(embeddings, dtype='float32')
    dimension = embeddings.shape[1]
    print(f"Embedding dimension: {dimension}")

    # 4. Build the FAISS index.
    # Inner product over L2-normalized vectors equals cosine similarity.
    print("Building FAISS index...")
    faiss.normalize_L2(embeddings)
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)

    # 5. Persist index and metadata side by side so retrieval code can map
    # FAISS hit ids back to the original chunk dicts.
    index_path = os.path.join(INDEX_DIR, "nihe_faiss.index")
    metadata_path = os.path.join(INDEX_DIR, "metadata.pkl")
    print(f"Saving index to {index_path}...")
    faiss.write_index(index, index_path)
    print(f"Saving metadata to {metadata_path}...")
    with open(metadata_path, 'wb') as f:
        pickle.dump(chunks, f)
    print("Indexing complete.")
# Script entry point: rebuild the FAISS index when run directly.
if __name__ == "__main__":
    build_index()