Graduation_Project-v1.2 / rebuild_index_server.py
bat-6's picture
feat: add server-side script to rebuild FAISS index for deployment environments
4c2a767
Raw
History Blame Contribute Delete
1.78 kB
"""
SERVER-SIDE rebuild script (Linux / Azure App Service).
Run this once on the deployed server to rebuild the FAISS index with all-mpnet-base-v2.
Usage:
cd /home/user/app
python rebuild_index_server.py
"""
import os

# These environment variables are assigned before numpy, faiss and
# sentence_transformers are imported further down.
# NOTE(review): presumably the ordering matters because the native math
# libraries read the thread-count settings when they are loaded — confirm.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
# NOTE(review): presumably a workaround for more than one OpenMP runtime being
# loaded into the process — confirm it is still needed on the server.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
import sys

# Put the deployed app directory first on the module search path.
sys.path.insert(0, "/home/user/app")
import numpy as np
import pandas as pd
import faiss
from pathlib import Path
from sentence_transformers import SentenceTransformer
# Name of the sentence-transformers model used to embed each project.
MODEL_NAME = "all-mpnet-base-v2"

# Directory on the deployed server that holds the input and both outputs.
MODELS_DIR = Path("/home/user/app/models")

# Input: project metadata table.
PARQUET_PATH = MODELS_DIR.joinpath("metadata.parquet")
# Output: serialized FAISS index.
INDEX_PATH = MODELS_DIR.joinpath("faiss_index.bin")
# Output: raw embedding matrix saved with numpy.
EMBED_PATH = MODELS_DIR.joinpath("project_embeddings.npy")
# Fail fast when the metadata file is absent: checking first avoids loading
# the embedding model only to crash on the parquet read afterwards.
if not PARQUET_PATH.is_file():
    raise SystemExit(f"[SERVER REBUILD] Metadata file not found: {PARQUET_PATH}")

# Load the embedding model and report the vector size it produces.
print(f"[SERVER REBUILD] Model: {MODEL_NAME}")
model = SentenceTransformer(MODEL_NAME)
dim = model.get_sentence_embedding_dimension()
print(f"[SERVER REBUILD] Embedding dim: {dim}")

# Load the project metadata; each row becomes one vector in the index.
df = pd.read_parquet(PARQUET_PATH)
print(f"[SERVER REBUILD] Loaded {len(df)} projects")
def _field_text(row, key):
    """Return the value stored under *key* in *row*, or "" when it is missing.

    A value counts as missing when the key is absent, when it is a pandas
    missing marker (None, NaN, pd.NA, NaT), or when it is otherwise falsy.
    """
    value = row.get(key, "")
    # NaN is truthy and pd.NA raises on truth-testing, so a plain
    # ``value or ""`` would embed the text "nan" or crash on such cells.
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return ""
    return value or ""


def build_text(row):
    """Build the text that is embedded for one project.

    Args:
        row: Mapping-like object exposing ``.get(key, default)``, such as a
            DataFrame row (``pandas.Series``) or a plain ``dict``. The keys
            ``project_title``, ``abstract`` and ``description`` are read;
            any of them may be absent or hold a missing value.

    Returns:
        The three fields joined as ``"<title>. <abstract>. <description>"``
        with surrounding whitespace stripped. Missing fields contribute an
        empty string, so the separators are still present.
    """
    title = _field_text(row, "project_title")
    abstract = _field_text(row, "abstract")
    description = _field_text(row, "description")
    return f"{title}. {abstract}. {description}".strip()
# Refuse to rebuild from an empty table. This check runs before anything is
# written, so the existing embeddings and index files stay as they were.
if len(df) == 0:
    raise SystemExit(
        "[SERVER REBUILD] No projects found in metadata; "
        "existing index files were left untouched."
    )

# One text per project, in the DataFrame's row order.
texts = df.apply(build_text, axis=1).tolist()

print("[SERVER REBUILD] Generating embeddings...")
embeddings = model.encode(
    texts,
    convert_to_numpy=True,
    # Unit-length vectors make the inner-product index below rank by
    # cosine similarity.
    normalize_embeddings=True,
    show_progress_bar=True,
    batch_size=8,
).astype("float32")
print(f"[SERVER REBUILD] Embeddings shape: {embeddings.shape}")

# Validate before touching disk: the index needs one `dim`-sized vector per text.
if embeddings.shape != (len(texts), dim):
    raise SystemExit(
        f"[SERVER REBUILD] Unexpected embeddings shape {embeddings.shape}; "
        f"expected ({len(texts)}, {dim}). Existing index files were left untouched."
    )

# Build the index in memory first so that a failure here cannot leave a new
# embeddings file next to a stale index.
index = faiss.IndexFlatIP(dim)
index.add(embeddings)

# Persist both artifacts only once the index has been built successfully.
np.save(str(EMBED_PATH), embeddings)
faiss.write_index(index, str(INDEX_PATH))
print(f"[SERVER REBUILD] Done! Index: {index.ntotal} vectors @ {dim}-dim")
print("[SERVER REBUILD] Restart the app process to reload the index.")