bat-6 committed on
Commit
eecb5f7
·
1 Parent(s): cf51ebc

feat: implement semantic search functionality and engine for project similarity analysis

Browse files
src/similarity_model/semantic_search.py CHANGED
@@ -87,9 +87,32 @@ def load_metadata():
87
 
88
  mtime = os.path.getmtime(INDEX_PATH)
89
  if _cached_metadata is None or _cached_metadata_mtime != mtime:
90
- logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
91
- df = load_preprocessed_projects()
92
- _cached_metadata = df.reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
93
  _cached_metadata_mtime = mtime
94
  return _cached_metadata
95
 
 
87
 
88
  mtime = os.path.getmtime(INDEX_PATH)
89
  if _cached_metadata is None or _cached_metadata_mtime != mtime:
90
+ if META_PATH.exists():
91
+ logger.info(f"Loading metadata from local parquet {META_PATH} (syncing with FAISS index mtime: {mtime})...")
92
+ try:
93
+ df = pd.read_parquet(str(META_PATH))
94
+ if "features" in df.columns:
95
+ import json
96
+ def parse_features(x):
97
+ if not isinstance(x, str):
98
+ return x
99
+ try:
100
+ x = json.loads(x)
101
+ if isinstance(x, str):
102
+ x = json.loads(x)
103
+ return x
104
+ except Exception:
105
+ return []
106
+ df["features"] = df["features"].apply(parse_features)
107
+ _cached_metadata = df.reset_index(drop=True)
108
+ except Exception as e:
109
+ logger.warning(f"Failed to read local metadata.parquet: {e}. Falling back to database query...")
110
+ df = load_preprocessed_projects()
111
+ _cached_metadata = df.reset_index(drop=True)
112
+ else:
113
+ logger.info(f"Loading metadata from Azure SQL (syncing with FAISS index mtime: {mtime})...")
114
+ df = load_preprocessed_projects()
115
+ _cached_metadata = df.reset_index(drop=True)
116
  _cached_metadata_mtime = mtime
117
  return _cached_metadata
118
 
src/similarity_model/similarity_engine.py CHANGED
@@ -291,7 +291,44 @@ def find_similar_projects(
291
  )
292
 
293
  if not exclude_self:
294
- # Check database table 'preprocess' directly to see if this project title already exists
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
295
  db_match_found = False
296
  db_project_id = None
297
  db_features = []
@@ -325,29 +362,13 @@ def find_similar_projects(
325
  db_features = []
326
  except Exception as db_exc:
327
  logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
328
-
329
- # Check local/cached metadata dataframe as fallback
330
- existing_project = df[
331
- df["project_title"].str.lower().str.strip()
332
- ==
333
- title.lower().strip()
334
- ]
335
 
336
- if len(existing_project) > 0 or db_match_found:
337
- logger.info("Exact title match found in preprocess table — returning originality = 0")
338
-
339
- if len(existing_project) > 0:
340
- matched_row = existing_project.iloc[0]
341
- project_id = int(matched_row.name)
342
- stored_features = safe_feature_list(matched_row["features"])
343
- matched_title = matched_row["project_title"]
344
- matched_abstract = matched_row.get("abstract", "")
345
- matched_desc = matched_row.get("description", "")
346
- else:
347
- project_id = int(db_project_id)
348
- stored_features = db_features
349
- matched_abstract = db_abstract
350
- matched_desc = db_description
351
 
352
  return pd.DataFrame([{
353
  "project_id": project_id,
 
291
  )
292
 
293
  if not exclude_self:
294
+ # Check local/cached metadata dataframe first (extremely fast, zero DB/network overhead)
295
+ existing_project = df[
296
+ df["project_title"].str.lower().str.strip()
297
+ ==
298
+ title.lower().strip()
299
+ ]
300
+
301
+ if len(existing_project) > 0:
302
+ logger.info("Exact title match found in local metadata cache — returning originality = 0")
303
+ matched_row = existing_project.iloc[0]
304
+ project_id = int(matched_row.name)
305
+ stored_features = safe_feature_list(matched_row["features"])
306
+ matched_title = matched_row["project_title"]
307
+ matched_abstract = matched_row.get("abstract", "")
308
+ matched_desc = matched_row.get("description", "")
309
+
310
+ return pd.DataFrame([{
311
+ "project_id": project_id,
312
+ "project_title": matched_title,
313
+ "semantic_score": 1.0,
314
+ "feature_score": 1.0,
315
+ "coverage": 1.0,
316
+ "hybrid_score": 1.0,
317
+ "originality_score": 0.0,
318
+ "confidence_score": 1.0,
319
+ "duplicate_risk": "Very High",
320
+ "shared_features_count": len(stored_features),
321
+ "matched_features": [{"feature_a": f, "feature_b": f, "score": 1.0} for f in stored_features],
322
+ "unique_query_features": [],
323
+ "unique_candidate_features": [],
324
+ "query_features_used": stored_features,
325
+ "query_clean_text": title,
326
+ "candidate_features": stored_features,
327
+ "abstract": matched_abstract,
328
+ "description": matched_desc
329
+ }])
330
+
331
+ # If not found in cache, check database table 'preprocess' directly
332
  db_match_found = False
333
  db_project_id = None
334
  db_features = []
 
362
  db_features = []
363
  except Exception as db_exc:
364
  logger.warning(f"Direct database check of 'preprocess' table failed: {db_exc}")
 
 
 
 
 
 
 
365
 
366
+ if db_match_found:
367
+ logger.info("Exact title match found in database 'preprocess' table — returning originality = 0")
368
+ project_id = int(db_project_id)
369
+ stored_features = db_features
370
+ matched_abstract = db_abstract
371
+ matched_desc = db_description
 
 
 
 
 
 
 
 
 
372
 
373
  return pd.DataFrame([{
374
  "project_id": project_id,