bat-6 committed on
Commit
df662e5
·
1 Parent(s): e24894a

feat: implement hybrid similarity ranking engine with dynamic weighting and originality scoring

Browse files
src/similarity_model/hybrid_ranker.py CHANGED
@@ -1,4 +1,5 @@
1
  import logging
 
2
  from typing import List, Dict, Any
3
 
4
  import pandas as pd
@@ -140,24 +141,37 @@ def get_baseline_similarity():
140
 
141
  def compute_originality(
142
  hybrid_score: float,
143
- unique_query_features: int,
144
- total_query_features: int
145
  ) -> float:
146
  """
147
  Originality Score (0-100).
148
- Base: (1 - calibrated_similarity) * 100.
 
 
 
 
 
 
 
149
  """
 
 
 
150
  hybrid_score = clamp(hybrid_score)
151
  baseline_sim = get_baseline_similarity()
152
-
153
- # Subtraction and Min-Max scaling
154
- calibrated_similarity = max(0.0, (hybrid_score - baseline_sim) / (1.0 - baseline_sim))
155
- originality = 100.0 * (1.0 - calibrated_similarity)
156
-
157
- # Only apply uniqueness bonus when feature comparison was meaningful
158
- if total_query_features > 0 and unique_query_features < total_query_features:
159
- uniqueness_ratio = unique_query_features / total_query_features
160
- originality = min(100.0, originality + (uniqueness_ratio * 10.0))
 
 
 
161
 
162
  return round(max(0.0, min(100.0, originality)), 2)
163
 
 
1
  import logging
2
+ import math
3
  from typing import List, Dict, Any
4
 
5
  import pandas as pd
 
141
 
142
  def compute_originality(
143
  hybrid_score: float,
144
+ unique_query_features: int = 0,
145
+ total_query_features: int = 0
146
  ) -> float:
147
  """
148
  Originality Score (0-100).
149
+
150
+ Uses a shifted sigmoid calibration so that moderate similarity
151
+ (0.40-0.60) triggers strong originality penalties, while truly
152
+ novel projects (sim < 0.30) retain high originality.
153
+
154
+ Sigmoid parameters:
155
+ k = 14 (steepness of the drop-off)
156
+ midpoint = 0.27 (calibrated similarity where originality ≈ 50%)
157
  """
158
+ SIGMOID_K = 14
159
+ SIGMOID_MIDPOINT = 0.27
160
+
161
  hybrid_score = clamp(hybrid_score)
162
  baseline_sim = get_baseline_similarity()
163
+
164
+ # Subtraction and Min-Max scaling (unchanged)
165
+ calibrated_similarity = max(
166
+ 0.0, (hybrid_score - baseline_sim) / (1.0 - baseline_sim)
167
+ )
168
+
169
+ # Sigmoid mapping: converts calibrated_similarity [0, 1]
170
+ sigmoid_output = 1.0 / (
171
+ 1.0 + math.exp(-SIGMOID_K * (calibrated_similarity - SIGMOID_MIDPOINT))
172
+ )
173
+
174
+ originality = 100.0 * (1.0 - sigmoid_output)
175
 
176
  return round(max(0.0, min(100.0, originality)), 2)
177
 
src/similarity_model/similarity_engine.py CHANGED
@@ -1,5 +1,6 @@
1
  import logging
2
  from typing import Dict, Any, List, Optional
 
3
 
4
  import pandas as pd
5
 
@@ -19,6 +20,21 @@ from src.similarity_model import (
19
  risk_label
20
  )
21
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  logging.basicConfig(
23
  level=logging.INFO,
24
  format="%(asctime)s | %(levelname)s | %(message)s"
@@ -404,7 +420,74 @@ def find_similar_projects(
404
  ascending=False
405
  ).reset_index(drop=True)
406
 
407
- # Decaying aggregation over Top-5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
408
  K_val = min(5, len(final_df))
409
  if K_val > 0:
410
  s1 = float(final_df.loc[0, "hybrid_score"])
@@ -414,22 +497,16 @@ def find_similar_projects(
414
  for i in range(1, K_val):
415
  si = float(final_df.loc[i, "hybrid_score"])
416
  density_penalty += np.exp(-lam * i) * si
417
-
418
  aggregated_score = min(1.0, s1 + beta * density_penalty)
419
-
420
  # Recalculate originality based on aggregated similarity score
421
- top_row = final_df.iloc[0]
422
- unique_q_feats = top_row.get("unique_query_features", [])
423
- total_q_feats = len(query_project[FEATURE_COL])
424
-
425
  aggregated_originality = compute_originality(
426
- hybrid_score=aggregated_score,
427
- unique_query_features=len(unique_q_feats),
428
- total_query_features=total_q_feats
429
  )
430
  if aggregated_score >= 0.90:
431
  aggregated_originality = 0.0
432
-
433
  final_df.loc[0, "originality_score"] = aggregated_originality
434
  else:
435
  aggregated_score = 0.0
 
1
  import logging
2
  from typing import Dict, Any, List, Optional
3
+ from functools import lru_cache
4
 
5
  import pandas as pd
6
 
 
20
  risk_label
21
  )
22
 
23
+ # ---------------------------------------------------------------------------
24
+ # Cross-encoder for paraphrase detection (lazy-loaded, cached)
25
+ # ---------------------------------------------------------------------------
26
@lru_cache(maxsize=1)
def _load_cross_encoder():
    """Return the shared cross-encoder, constructing it on the first call.

    The function takes no arguments and is wrapped in ``lru_cache``, so the
    model object built on the first call is handed back on every later call
    instead of being constructed again.
    """
    # Imported inside the function so the dependency is only needed once a
    # cross-encoder is actually requested (presumably to keep module import
    # cheap -- confirm with the author).
    from sentence_transformers import CrossEncoder

    logger.info("Loading cross-encoder: cross-encoder/stsb-distilroberta-base")
    encoder = CrossEncoder(
        "cross-encoder/stsb-distilroberta-base",
        max_length=512,
    )
    return encoder
31
+
32
+ CROSS_ENCODER_THRESHOLD = 0.60 # minimum cross-score to trigger boost
33
+ CROSS_ENCODER_MAX_BOOST = 0.30 # maximum hybrid_score boost from cross-encoder
34
+ WORKFLOW_COVERAGE_THRESH = 0.50 # minimum coverage to trigger workflow penalty
35
+ WORKFLOW_FEATURE_THRESH = 0.45 # minimum feature_score to trigger workflow penalty
36
+ WORKFLOW_MAX_BOOST = 0.10 # maximum hybrid_score boost from workflow overlap
37
+
38
  logging.basicConfig(
39
  level=logging.INFO,
40
  format="%(asctime)s | %(levelname)s | %(message)s"
 
420
  ascending=False
421
  ).reset_index(drop=True)
422
 
423
+ # -----------------------------------------------------------------
424
+ # CROSS-ENCODER RE-SCORING (top-1 candidate only)
425
+ # -----------------------------------------------------------------
426
+ if len(final_df) > 0:
427
+ top_row = final_df.iloc[0]
428
+ candidate_id = int(top_row["project_id"])
429
+ candidate_row = df.loc[candidate_id]
430
+
431
+ # Build full texts for cross-encoder comparison
432
+ query_full = build_raw_text(
433
+ title=title, abstract=abstract, description=description
434
+ )
435
+ candidate_full = build_raw_text(
436
+ title=str(candidate_row.get(TITLE_COL, "")),
437
+ abstract=str(candidate_row.get("abstract", "")),
438
+ description=str(candidate_row.get("description", ""))
439
+ )
440
+
441
+ try:
442
+ cross_encoder = _load_cross_encoder()
443
+ cross_score = float(
444
+ cross_encoder.predict([(query_full, candidate_full)])[0]
445
+ )
446
+ # stsb model already outputs [0, 1] — clamp for safety
447
+ cross_score = max(0.0, min(1.0, cross_score))
448
+ logger.info(
449
+ f"Cross-encoder score (top-1): {cross_score:.4f}"
450
+ )
451
+ except Exception as exc:
452
+ logger.warning(f"Cross-encoder failed, skipping: {exc}")
453
+ cross_score = 0.0
454
+
455
+ # Apply cross-encoder boost if threshold met
456
+ if cross_score >= CROSS_ENCODER_THRESHOLD:
457
+ boost = CROSS_ENCODER_MAX_BOOST * (
458
+ (cross_score - CROSS_ENCODER_THRESHOLD)
459
+ / (1.0 - CROSS_ENCODER_THRESHOLD)
460
+ )
461
+ original_hybrid = float(final_df.loc[0, "hybrid_score"])
462
+ boosted_hybrid = min(1.0, original_hybrid + boost)
463
+ final_df.loc[0, "hybrid_score"] = round(boosted_hybrid, 4)
464
+ logger.info(
465
+ f"Cross-encoder boost: {original_hybrid:.4f} -> {boosted_hybrid:.4f} "
466
+ f"(+{boost:.4f})"
467
+ )
468
+
469
+ # -----------------------------------------------------------------
470
+ # WORKFLOW OVERLAP PENALTY
471
+ # -----------------------------------------------------------------
472
+ top_coverage = float(top_row.get("coverage", 0.0))
473
+ top_feat_score = float(top_row.get("feature_score", 0.0))
474
+
475
+ if (top_coverage >= WORKFLOW_COVERAGE_THRESH
476
+ and top_feat_score >= WORKFLOW_FEATURE_THRESH):
477
+ workflow_boost = (
478
+ WORKFLOW_MAX_BOOST * top_coverage * top_feat_score
479
+ )
480
+ current_hybrid = float(final_df.loc[0, "hybrid_score"])
481
+ boosted_hybrid = min(1.0, current_hybrid + workflow_boost)
482
+ final_df.loc[0, "hybrid_score"] = round(boosted_hybrid, 4)
483
+ logger.info(
484
+ f"Workflow overlap boost: {current_hybrid:.4f} -> "
485
+ f"{boosted_hybrid:.4f} (+{workflow_boost:.4f})"
486
+ )
487
+
488
+ # -----------------------------------------------------------------
489
+ # DECAYING AGGREGATION over Top-5
490
+ # -----------------------------------------------------------------
491
  K_val = min(5, len(final_df))
492
  if K_val > 0:
493
  s1 = float(final_df.loc[0, "hybrid_score"])
 
497
  for i in range(1, K_val):
498
  si = float(final_df.loc[i, "hybrid_score"])
499
  density_penalty += np.exp(-lam * i) * si
500
+
501
  aggregated_score = min(1.0, s1 + beta * density_penalty)
502
+
503
  # Recalculate originality based on aggregated similarity score
 
 
 
 
504
  aggregated_originality = compute_originality(
505
+ hybrid_score=aggregated_score
 
 
506
  )
507
  if aggregated_score >= 0.90:
508
  aggregated_originality = 0.0
509
+
510
  final_df.loc[0, "originality_score"] = aggregated_originality
511
  else:
512
  aggregated_score = 0.0