feat: implement semantic search functionality and engine for project similarity analysis
Browse files
src/similarity_model/semantic_search.py
CHANGED
|
@@ -87,9 +87,32 @@ def load_metadata():
|
|
| 87 |
|
| 88 |
mtime = os.path.getmtime(INDEX_PATH)
|
| 89 |
if _cached_metadata is None or _cached_metadata_mtime != mtime:
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
_cached_metadata_mtime = mtime
|
| 94 |
return _cached_metadata
|
| 95 |
|
|
|
|
| 87 |
|
| 88 |
mtime = os.path.getmtime(INDEX_PATH)
|
| 89 |
if _cached_metadata is None or _cached_metadata_mtime != mtime:
|
| 90 |
+
if META_PATH.exists():
|
| 91 |
+
logger.info(f"Loading metadata from local parquet {META_PATH} (syncing with FAISS index mtime: {mtime})...")
|
| 92 |
+
try:
|
| 93 |
+
df = pd.read_parquet(str(META_PATH))
|
| 94 |
+
if "features" in df.columns:
|
| 95 |
+
import json
|
| 96 |
+
def parse_features(x):
|
| 97 |
+
if not isinstance(x, str):
|
| 98 |
+
return x
|
| 99 |
+
try:
|
| 100 |
+
x = json.loads(x)
|
| 101 |
+
if isinstance(x, str):
|
| 102 |
+
x = json.loads(x)
|
| 103 |
+
return x
|
| 104 |
+
except Exception:
|
| 105 |
+
return []
|
| 106 |
+
df["features"] = df["features"].apply(parse_features)
|
| 107 |
+
_cached_metadata = df.reset_index(drop=True)
|
| 108 |
+
except Exception as e:
|
| 109 |
+
logger.warning(f"Failed to read local metadata.parquet: {e}. Falling back to database query...")
|
| 110 |
+
df = load_preprocessed_projects()
|
| 111 |
+
_cached_metadata = df.reset_index(drop=True)
|
| 112 |
+
else:
|
| 113 |
+
logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
|
| 114 |
+
df = load_preprocessed_projects()
|
| 115 |
+
_cached_metadata = df.reset_index(drop=True)
|
| 116 |
_cached_metadata_mtime = mtime
|
| 117 |
return _cached_metadata
|
| 118 |
|
src/similarity_model/similarity_engine.py
CHANGED
|
@@ -291,7 +291,44 @@ def find_similar_projects(
|
|
| 291 |
)
|
| 292 |
|
| 293 |
if not exclude_self:
|
| 294 |
-
# Check
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 295 |
db_match_found = False
|
| 296 |
db_project_id = None
|
| 297 |
db_features = []
|
|
@@ -325,29 +362,13 @@ def find_similar_projects(
|
|
| 325 |
db_features = []
|
| 326 |
except Exception as db_exc:
|
| 327 |
logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
|
| 328 |
-
|
| 329 |
-
# Check local/cached metadata dataframe as fallback
|
| 330 |
-
existing_project = df[
|
| 331 |
-
df["project_title"].str.lower().str.strip()
|
| 332 |
-
==
|
| 333 |
-
title.lower().strip()
|
| 334 |
-
]
|
| 335 |
|
| 336 |
-
if
|
| 337 |
-
logger.info("Exact title match found in preprocess table — returning originality = 0")
|
| 338 |
-
|
| 339 |
-
|
| 340 |
-
|
| 341 |
-
|
| 342 |
-
stored_features = safe_feature_list(matched_row["features"])
|
| 343 |
-
matched_title = matched_row["project_title"]
|
| 344 |
-
matched_abstract = matched_row.get("abstract", "")
|
| 345 |
-
matched_desc = matched_row.get("description", "")
|
| 346 |
-
else:
|
| 347 |
-
project_id = int(db_project_id)
|
| 348 |
-
stored_features = db_features
|
| 349 |
-
matched_abstract = db_abstract
|
| 350 |
-
matched_desc = db_description
|
| 351 |
|
| 352 |
return pd.DataFrame([{
|
| 353 |
"project_id": project_id,
|
|
|
|
| 291 |
)
|
| 292 |
|
| 293 |
if not exclude_self:
|
| 294 |
+
# Check local/cached metadata dataframe first (extremely fast, zero DB/network overhead)
|
| 295 |
+
existing_project = df[
|
| 296 |
+
df["project_title"].str.lower().str.strip()
|
| 297 |
+
==
|
| 298 |
+
title.lower().strip()
|
| 299 |
+
]
|
| 300 |
+
|
| 301 |
+
if len(existing_project) > 0:
|
| 302 |
+
logger.info("Exact title match found in local metadata cache — returning originality = 0")
|
| 303 |
+
matched_row = existing_project.iloc[0]
|
| 304 |
+
project_id = int(matched_row.name)
|
| 305 |
+
stored_features = safe_feature_list(matched_row["features"])
|
| 306 |
+
matched_title = matched_row["project_title"]
|
| 307 |
+
matched_abstract = matched_row.get("abstract", "")
|
| 308 |
+
matched_desc = matched_row.get("description", "")
|
| 309 |
+
|
| 310 |
+
return pd.DataFrame([{
|
| 311 |
+
"project_id": project_id,
|
| 312 |
+
"project_title": matched_title,
|
| 313 |
+
"semantic_score": 1.0,
|
| 314 |
+
"feature_score": 1.0,
|
| 315 |
+
"coverage": 1.0,
|
| 316 |
+
"hybrid_score": 1.0,
|
| 317 |
+
"originality_score": 0.0,
|
| 318 |
+
"confidence_score": 1.0,
|
| 319 |
+
"duplicate_risk": "Very High",
|
| 320 |
+
"shared_features_count": len(stored_features),
|
| 321 |
+
"matched_features": [{"feature_a": f, "feature_b": f, "score": 1.0} for f in stored_features],
|
| 322 |
+
"unique_query_features": [],
|
| 323 |
+
"unique_candidate_features": [],
|
| 324 |
+
"query_features_used": stored_features,
|
| 325 |
+
"query_clean_text": title,
|
| 326 |
+
"candidate_features": stored_features,
|
| 327 |
+
"abstract": matched_abstract,
|
| 328 |
+
"description": matched_desc
|
| 329 |
+
}])
|
| 330 |
+
|
| 331 |
+
# If not found in cache, check database table 'preprocess' directly
|
| 332 |
db_match_found = False
|
| 333 |
db_project_id = None
|
| 334 |
db_features = []
|
|
|
|
| 362 |
db_features = []
|
| 363 |
except Exception as db_exc:
|
| 364 |
logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 365 |
|
| 366 |
+
if db_match_found:
|
| 367 |
+
logger.info("Exact title match found in database 'preprocess' table — returning originality = 0")
|
| 368 |
+
project_id = int(db_project_id)
|
| 369 |
+
stored_features = db_features
|
| 370 |
+
matched_abstract = db_abstract
|
| 371 |
+
matched_desc = db_description
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 372 |
|
| 373 |
return pd.DataFrame([{
|
| 374 |
"project_id": project_id,
|