"""Build, persist and reload a FAISS similarity index over project texts."""

import logging
from pathlib import Path
from typing import List, Optional

import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer

from Data.database.sql_connector import load_preprocessed_projects

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s | %(levelname)s | %(message)s",
)
logger = logging.getLogger(__name__)

DEFAULT_MODEL = "all-mpnet-base-v2"

# DataFrame columns read (and created as empty strings when missing) by
# ProjectEmbedder.build_index.
TEXT_COL = "clean_text"
TITLE_COL = "project_title"
TECH_COL = "technologies"
FEATURE_COL = "features"

# Artifact locations, anchored two directories above this file's folder.
_PROJECT_ROOT = Path(__file__).resolve().parents[2]
MODEL_DIR = _PROJECT_ROOT / "models"
INDEX_PATH = MODEL_DIR / "faiss_index.bin"
META_PATH = MODEL_DIR / "metadata.parquet"
EMBEDDINGS_PATH = MODEL_DIR / "project_embeddings.npy"


class ProjectEmbedder:
    """Embed project texts and index them with FAISS.

    Attributes:
        model: The loaded ``SentenceTransformer``.
        index: FAISS index, or ``None`` until built or loaded.
        metadata: DataFrame whose row positions equal the FAISS ids, or
            ``None`` until built or loaded.
        embeddings: float32 matrix of the indexed vectors, or ``None`` when
            no embeddings are available for the current index.
    """

    def __init__(self, model_name: str = DEFAULT_MODEL):
        """Load the sentence-transformer model named ``model_name``."""
        logger.info("Loading embedding model: %s", model_name)
        self.model = SentenceTransformer(model_name)
        self.index = None
        self.metadata: Optional[pd.DataFrame] = None
        # Initialised here so later code never has to probe with hasattr().
        self.embeddings: Optional[np.ndarray] = None

    def generate_embeddings(self, texts: List[str], batch_size: int = 64) -> np.ndarray:
        """Encode ``texts`` into L2-normalised float32 vectors.

        Args:
            texts: Strings to embed.
            batch_size: Batch size forwarded to ``SentenceTransformer.encode``.

        Returns:
            The encoder output cast to ``float32``.
        """
        logger.info("Generating embeddings for %s projects...", len(texts))
        vectors = self.model.encode(
            texts,
            batch_size=batch_size,
            show_progress_bar=True,
            convert_to_numpy=True,
            normalize_embeddings=True,
        )
        return vectors.astype("float32")

    def build_index(self, df: pd.DataFrame):
        """Build FAISS cosine index.

        A copy of ``df`` with a fresh 0..n-1 index becomes ``self.metadata``;
        missing title/text/technologies/features columns are added as empty
        strings. Title, text and features are joined with single spaces and
        embedded; because the vectors are normalised, the inner-product index
        ranks by cosine similarity.

        Args:
            df: Project rows to index.

        Raises:
            ValueError: If ``df`` has no rows.
        """
        if len(df) == 0:
            # Fail before running the model: an empty batch has no embedding
            # dimension to size the index with.
            raise ValueError("Cannot build a FAISS index from an empty DataFrame.")

        metadata = df.copy().reset_index(drop=True)
        for col in (TITLE_COL, TEXT_COL, TECH_COL, FEATURE_COL):
            if col not in metadata.columns:
                metadata[col] = ""
        self.metadata = metadata

        rich_texts = (
            metadata[TITLE_COL].fillna("").astype(str)
            + " "
            + metadata[TEXT_COL].fillna("").astype(str)
            + " "
            + metadata[FEATURE_COL].fillna("").astype(str)
        ).tolist()

        embeddings = self.generate_embeddings(rich_texts)
        self.embeddings = embeddings

        dim = embeddings.shape[1]
        base_index = faiss.IndexFlatIP(dim)
        self.index = faiss.IndexIDMap(base_index)

        # Ids are the metadata row positions, so a search hit maps straight
        # back to a row via self.metadata.iloc[id].
        ids = np.arange(len(metadata)).astype("int64")
        self.index.add_with_ids(embeddings, ids)

        logger.info(
            "FAISS index built successfully with %s vectors.", self.index.ntotal
        )

    def save_artifacts(self, folder: str = "models"):
        """Write the index, metadata and (if present) embeddings to ``folder``.

        Args:
            folder: Target directory, created if needed. The default is
                relative to the current working directory.

        Raises:
            RuntimeError: If no index/metadata has been built or loaded.
        """
        if self.index is None or self.metadata is None:
            raise RuntimeError(
                "Nothing to save: call build_index() or load_artifacts() first."
            )

        path = Path(folder)
        path.mkdir(parents=True, exist_ok=True)

        faiss.write_index(self.index, str(path / INDEX_PATH.name))
        self.metadata.to_parquet(path / META_PATH.name, index=False)

        # getattr keeps this safe for instances created without __init__.
        embeddings = getattr(self, "embeddings", None)
        if embeddings is not None:
            np.save(str(path / EMBEDDINGS_PATH.name), embeddings)

        logger.info("Artifacts saved to %s", folder)

    def load_artifacts(self, folder: str = "models"):
        """Load the index, metadata and (if present) embeddings from ``folder``.

        Args:
            folder: Directory previously written by ``save_artifacts``. The
                default is relative to the current working directory.
        """
        path = Path(folder)
        self.index = faiss.read_index(str(path / INDEX_PATH.name))
        self.metadata = pd.read_parquet(path / META_PATH.name)

        # Keep embeddings in step with the loaded index: reload them when the
        # file exists, otherwise drop any left over from an earlier build so a
        # later save cannot pair stale vectors with this index.
        embeddings_file = path / EMBEDDINGS_PATH.name
        if embeddings_file.exists():
            self.embeddings = np.load(str(embeddings_file))
        else:
            self.embeddings = None

        logger.info("Artifacts loaded successfully.")


def train_embedding_engine():
    """Load the preprocessed projects, build the index and save it to MODEL_DIR.

    Returns:
        The ``ProjectEmbedder`` holding the freshly built index.
    """
    logger.info("Loading processed dataset from Azure SQL...")
    df = load_preprocessed_projects()

    engine = ProjectEmbedder()
    engine.build_index(df)
    engine.save_artifacts(str(MODEL_DIR))

    logger.info("Embedding engine completed successfully.")
    return engine