bat-6's picture
feat: implement similarity engine with embedding generation, hybrid ranking, and explainability modules
5e95de5
Raw
History Blame Contribute Delete
6.09 kB
import re
import logging
import yake
import numpy as np
from functools import lru_cache
from pathlib import Path
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")
logger = logging.getLogger(__name__)
MODEL_NAME = "all-mpnet-base-v2"
@lru_cache(maxsize=1)
def _get_embed_model():
logger.info(f"Loading embed model: {MODEL_NAME}")
return SentenceTransformer(MODEL_NAME)
MIN_WORDS = 8
MAX_WORDS = 4000
def normalize_text(text):
if pd.isna(text):
return ""
text = str(text).lower().strip()
text = re.sub(r"http\S+|www\S+|\S+@\S+", " ", text)
text = re.sub(r"[^a-z0-9\+\#\./\- ]", " ", text)
text = re.sub(r"\s+", " ", text)
return text.strip()
def substring_deduplicate(features):
features = sorted(features, key=len, reverse=True)
kept = []
for feat in features:
is_substring = False
for longer_feat in kept:
if feat in longer_feat:
is_substring = True
break
if not is_substring:
kept.append(feat)
return kept
def semantic_deduplicate(features, model, threshold=0.85):
if len(features) <= 1:
return features
embeddings = model.encode(
features,
convert_to_numpy=True,
normalize_embeddings=True
)
kept = []
for i, feat in enumerate(features):
redundant = False
for existing in kept:
sim = cosine_similarity(
embeddings[i].reshape(1, -1),
embeddings[existing].reshape(1, -1)
)[0][0]
if sim >= threshold:
redundant = True
break
if not redundant:
kept.append(i)
return [features[i] for i in kept]
@lru_cache(maxsize=1)
def _get_yake_extractor():
logger.info("Initializing YAKE NLP feature extractor")
return yake.KeywordExtractor(lan="en", n=3, dedupLim=0.9, top=20, features=None)
import json
_feature_db_frequencies = None
def load_feature_frequencies_cache():
global _feature_db_frequencies
if _feature_db_frequencies is None:
try:
from src.similarity_model.semantic_search import load_metadata
df = load_metadata()
from collections import Counter
counter = Counter()
total_docs = len(df)
if total_docs > 0:
for feats in df["features"]:
if isinstance(feats, str):
try:
feats = json.loads(feats)
except:
feats = []
if isinstance(feats, list):
seen = set(str(f).strip().lower() for f in feats)
for f in seen:
if f:
counter[f] += 1
_feature_db_frequencies = {k: v / total_docs for k, v in counter.items()}
else:
_feature_db_frequencies = {}
except Exception:
_feature_db_frequencies = {}
return _feature_db_frequencies
def extract_features(text: str) -> list:
"""
Extracts detailed, multi-word phrases generated purely by YAKE.
Filters out highly generic features appearing in > 15% of indexed projects.
"""
matched = []
try:
kw_extractor = _get_yake_extractor()
yake_results = kw_extractor.extract_keywords(text)
freq_cache = load_feature_frequencies_cache()
max_df_threshold = 0.15 # Filter if keyword appears in > 15% of database
for kw, score in yake_results:
kw_clean = str(kw).strip().lower()
if len(kw_clean.split()) > 1 and kw_clean not in matched:
# Apply IDF filter check
doc_freq = freq_cache.get(kw_clean, 0.0)
if doc_freq <= max_df_threshold:
matched.append(kw_clean)
except Exception as e:
logger.error(f"YAKE extraction failed: {e}")
if not matched:
return []
matched = substring_deduplicate(matched)
return semantic_deduplicate(matched, _get_embed_model(), threshold=0.85)
def preprocess_dataset(df):
logger.info("Starting preprocessing...")
df = df.copy()
df.columns = df.columns.str.strip().str.lower().str.replace(r"\W+", "_", regex=True)
column_mapping = {
"title": "project_title",
"ai_summary": "ai_summary",
"technologies": "technologies",
"keywords": "keywords",
"abstract": "abstract",
"description": "description",
"problem_statement": "problem_statement",
"proposed_solution": "proposed_solution",
"objectives": "objectives",
"category": "category"
}
df = df.rename(columns=column_mapping)
for col in ["project_title", "abstract", "description"]:
if col not in df.columns:
df[col] = ""
df[col] = df[col].fillna("").astype(str)
df["full_content"] = df["project_title"] + ". " + df["abstract"] + ". " + df["description"]
df["clean_text"] = df["full_content"].apply(normalize_text)
before = len(df)
df = df.drop_duplicates(subset=["project_title", "clean_text"]).copy()
logger.info(f"Removed duplicates: {before-len(df)}")
df["word_count"] = df["clean_text"].str.split().str.len()
df = df[df["word_count"].between(MIN_WORDS, MAX_WORDS)].copy()
df.reset_index(drop=True, inplace=True)
logger.info("Extracting features...")
df["features"] = df["clean_text"].apply(extract_features)
df = df[df["features"].apply(len) > 0].copy()
df.reset_index(drop=True, inplace=True)
logger.info(f"Final rows: {len(df)}")
return df