Spaces:
Sleeping
Sleeping
feat: implement hybrid similarity ranking engine with dynamic weighting and originality scoring
Browse files
src/similarity_model/hybrid_ranker.py
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import logging
|
|
|
|
| 2 |
from typing import List, Dict, Any
|
| 3 |
|
| 4 |
import pandas as pd
|
|
@@ -140,24 +141,37 @@ def get_baseline_similarity():
|
|
| 140 |
|
| 141 |
def compute_originality(
|
| 142 |
hybrid_score: float,
|
| 143 |
-
unique_query_features: int,
|
| 144 |
-
total_query_features: int
|
| 145 |
) -> float:
|
| 146 |
"""
|
| 147 |
Originality Score (0-100).
|
| 148 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
"""
|
|
|
|
|
|
|
|
|
|
| 150 |
hybrid_score = clamp(hybrid_score)
|
| 151 |
baseline_sim = get_baseline_similarity()
|
| 152 |
-
|
| 153 |
-
# Subtraction and Min-Max scaling
|
| 154 |
-
calibrated_similarity = max(
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
| 160 |
-
|
|
|
|
|
|
|
|
|
|
| 161 |
|
| 162 |
return round(max(0.0, min(100.0, originality)), 2)
|
| 163 |
|
|
|
|
| 1 |
import logging
|
| 2 |
+
import math
|
| 3 |
from typing import List, Dict, Any
|
| 4 |
|
| 5 |
import pandas as pd
|
|
|
|
| 141 |
|
| 142 |
def compute_originality(
    hybrid_score: float,
    unique_query_features: int = 0,
    total_query_features: int = 0
) -> float:
    """
    Originality Score (0-100).

    Uses a shifted sigmoid calibration so that moderate similarity
    (0.40-0.60) triggers strong originality penalties, while truly
    novel projects (sim < 0.30) retain high originality.

    Sigmoid parameters:
        k = 14          (steepness of the drop-off)
        midpoint = 0.27 (calibrated similarity where originality ≈ 50%)

    Args:
        hybrid_score: Similarity score; it is passed through ``clamp``
            before use.
        unique_query_features: Not read by this function. Kept (with a
            default) so callers that still pass it keep working.
        total_query_features: Not read by this function. Kept (with a
            default) so callers that still pass it keep working.

    Returns:
        Originality in the range 0.0-100.0, rounded to 2 decimals.
        Higher similarity yields lower originality.
    """
    SIGMOID_K = 14
    SIGMOID_MIDPOINT = 0.27

    hybrid_score = clamp(hybrid_score)
    baseline_sim = get_baseline_similarity()

    # Subtraction and Min-Max scaling: rescale the part of the score that
    # lies above the baseline onto the headroom left between the baseline
    # and 1.0.
    headroom = 1.0 - baseline_sim
    if headroom > 0.0:
        calibrated_similarity = max(
            0.0, (hybrid_score - baseline_sim) / headroom
        )
    else:
        # A baseline of 1.0 (or more) leaves no headroom; dividing would
        # raise ZeroDivisionError or flip the sign. Treat the score as
        # "not above baseline" instead of crashing.
        calibrated_similarity = 0.0

    # Sigmoid mapping: converts calibrated_similarity -> [0, 1].
    # calibrated_similarity is never negative, so the exponent is bounded
    # above by SIGMOID_K * SIGMOID_MIDPOINT and math.exp cannot overflow.
    sigmoid_output = 1.0 / (
        1.0 + math.exp(-SIGMOID_K * (calibrated_similarity - SIGMOID_MIDPOINT))
    )

    originality = 100.0 * (1.0 - sigmoid_output)

    return round(max(0.0, min(100.0, originality)), 2)
|
| 177 |
|
src/similarity_model/similarity_engine.py
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
import logging
|
| 2 |
from typing import Dict, Any, List, Optional
|
|
|
|
| 3 |
|
| 4 |
import pandas as pd
|
| 5 |
|
|
@@ -19,6 +20,21 @@ from src.similarity_model import (
|
|
| 19 |
risk_label
|
| 20 |
)
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
logging.basicConfig(
|
| 23 |
level=logging.INFO,
|
| 24 |
format="%(asctime)s | %(levelname)s | %(message)s"
|
|
@@ -404,7 +420,74 @@ def find_similar_projects(
|
|
| 404 |
ascending=False
|
| 405 |
).reset_index(drop=True)
|
| 406 |
|
| 407 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 408 |
K_val = min(5, len(final_df))
|
| 409 |
if K_val > 0:
|
| 410 |
s1 = float(final_df.loc[0, "hybrid_score"])
|
|
@@ -414,22 +497,16 @@ def find_similar_projects(
|
|
| 414 |
for i in range(1, K_val):
|
| 415 |
si = float(final_df.loc[i, "hybrid_score"])
|
| 416 |
density_penalty += np.exp(-lam * i) * si
|
| 417 |
-
|
| 418 |
aggregated_score = min(1.0, s1 + beta * density_penalty)
|
| 419 |
-
|
| 420 |
# Recalculate originality based on aggregated similarity score
|
| 421 |
-
top_row = final_df.iloc[0]
|
| 422 |
-
unique_q_feats = top_row.get("unique_query_features", [])
|
| 423 |
-
total_q_feats = len(query_project[FEATURE_COL])
|
| 424 |
-
|
| 425 |
aggregated_originality = compute_originality(
|
| 426 |
-
hybrid_score=aggregated_score
|
| 427 |
-
unique_query_features=len(unique_q_feats),
|
| 428 |
-
total_query_features=total_q_feats
|
| 429 |
)
|
| 430 |
if aggregated_score >= 0.90:
|
| 431 |
aggregated_originality = 0.0
|
| 432 |
-
|
| 433 |
final_df.loc[0, "originality_score"] = aggregated_originality
|
| 434 |
else:
|
| 435 |
aggregated_score = 0.0
|
|
|
|
| 1 |
import logging
|
| 2 |
from typing import Dict, Any, List, Optional
|
| 3 |
+
from functools import lru_cache
|
| 4 |
|
| 5 |
import pandas as pd
|
| 6 |
|
|
|
|
| 20 |
risk_label
|
| 21 |
)
|
| 22 |
|
| 23 |
+
# ---------------------------------------------------------------------------
|
| 24 |
+
# Cross-encoder for paraphrase detection (lazy-loaded, cached)
|
| 25 |
+
# ---------------------------------------------------------------------------
|
| 26 |
+
@lru_cache(maxsize=1)
def _load_cross_encoder():
    """
    Build the cross-encoder model used for re-scoring.

    The ``sentence_transformers`` import happens inside the function, so
    the dependency is only needed once this is first called. The
    ``lru_cache(maxsize=1)`` decorator keeps the instance returned by the
    first successful call and hands it back on later calls.

    Returns:
        A ``CrossEncoder`` for ``cross-encoder/stsb-distilroberta-base``
        constructed with ``max_length=512``.
    """
    from sentence_transformers import CrossEncoder

    logger.info("Loading cross-encoder: cross-encoder/stsb-distilroberta-base")
    encoder = CrossEncoder(
        "cross-encoder/stsb-distilroberta-base",
        max_length=512,
    )
    return encoder
|
| 31 |
+
|
| 32 |
+
CROSS_ENCODER_THRESHOLD = 0.60 # minimum cross-encoder score at which the hybrid_score boost applies
|
| 33 |
+
CROSS_ENCODER_MAX_BOOST = 0.30 # maximum hybrid_score boost from cross-encoder
|
| 34 |
+
WORKFLOW_COVERAGE_THRESH = 0.50 # minimum coverage to trigger the workflow-overlap boost
|
| 35 |
+
WORKFLOW_FEATURE_THRESH = 0.45 # minimum feature_score to trigger the workflow-overlap boost
|
| 36 |
+
WORKFLOW_MAX_BOOST = 0.10 # maximum hybrid_score boost from workflow overlap
|
| 37 |
+
|
| 38 |
logging.basicConfig(
|
| 39 |
level=logging.INFO,
|
| 40 |
format="%(asctime)s | %(levelname)s | %(message)s"
|
|
|
|
| 420 |
ascending=False
|
| 421 |
).reset_index(drop=True)
|
| 422 |
|
| 423 |
+
# -----------------------------------------------------------------
|
| 424 |
+
# CROSS-ENCODER RE-SCORING (top-1 candidate only)
|
| 425 |
+
# -----------------------------------------------------------------
|
| 426 |
+
if len(final_df) > 0:
|
| 427 |
+
top_row = final_df.iloc[0]
|
| 428 |
+
candidate_id = int(top_row["project_id"])
|
| 429 |
+
candidate_row = df.loc[candidate_id]
|
| 430 |
+
|
| 431 |
+
# Build full texts for cross-encoder comparison
|
| 432 |
+
query_full = build_raw_text(
|
| 433 |
+
title=title, abstract=abstract, description=description
|
| 434 |
+
)
|
| 435 |
+
candidate_full = build_raw_text(
|
| 436 |
+
title=str(candidate_row.get(TITLE_COL, "")),
|
| 437 |
+
abstract=str(candidate_row.get("abstract", "")),
|
| 438 |
+
description=str(candidate_row.get("description", ""))
|
| 439 |
+
)
|
| 440 |
+
|
| 441 |
+
try:
|
| 442 |
+
cross_encoder = _load_cross_encoder()
|
| 443 |
+
cross_score = float(
|
| 444 |
+
cross_encoder.predict([(query_full, candidate_full)])[0]
|
| 445 |
+
)
|
| 446 |
+
# stsb model already outputs [0, 1] — clamp for safety
|
| 447 |
+
cross_score = max(0.0, min(1.0, cross_score))
|
| 448 |
+
logger.info(
|
| 449 |
+
f"Cross-encoder score (top-1): {cross_score:.4f}"
|
| 450 |
+
)
|
| 451 |
+
except Exception as exc:
|
| 452 |
+
logger.warning(f"Cross-encoder failed, skipping: {exc}")
|
| 453 |
+
cross_score = 0.0
|
| 454 |
+
|
| 455 |
+
# Apply cross-encoder boost if threshold met
|
| 456 |
+
if cross_score >= CROSS_ENCODER_THRESHOLD:
|
| 457 |
+
boost = CROSS_ENCODER_MAX_BOOST * (
|
| 458 |
+
(cross_score - CROSS_ENCODER_THRESHOLD)
|
| 459 |
+
/ (1.0 - CROSS_ENCODER_THRESHOLD)
|
| 460 |
+
)
|
| 461 |
+
original_hybrid = float(final_df.loc[0, "hybrid_score"])
|
| 462 |
+
boosted_hybrid = min(1.0, original_hybrid + boost)
|
| 463 |
+
final_df.loc[0, "hybrid_score"] = round(boosted_hybrid, 4)
|
| 464 |
+
logger.info(
|
| 465 |
+
f"Cross-encoder boost: {original_hybrid:.4f} -> {boosted_hybrid:.4f} "
|
| 466 |
+
f"(+{boost:.4f})"
|
| 467 |
+
)
|
| 468 |
+
|
| 469 |
+
# -----------------------------------------------------------------
|
| 470 |
+
# WORKFLOW OVERLAP BOOST (raises hybrid_score, which lowers originality)
|
| 471 |
+
# -----------------------------------------------------------------
|
| 472 |
+
top_coverage = float(top_row.get("coverage", 0.0))
|
| 473 |
+
top_feat_score = float(top_row.get("feature_score", 0.0))
|
| 474 |
+
|
| 475 |
+
if (top_coverage >= WORKFLOW_COVERAGE_THRESH
|
| 476 |
+
and top_feat_score >= WORKFLOW_FEATURE_THRESH):
|
| 477 |
+
workflow_boost = (
|
| 478 |
+
WORKFLOW_MAX_BOOST * top_coverage * top_feat_score
|
| 479 |
+
)
|
| 480 |
+
current_hybrid = float(final_df.loc[0, "hybrid_score"])
|
| 481 |
+
boosted_hybrid = min(1.0, current_hybrid + workflow_boost)
|
| 482 |
+
final_df.loc[0, "hybrid_score"] = round(boosted_hybrid, 4)
|
| 483 |
+
logger.info(
|
| 484 |
+
f"Workflow overlap boost: {current_hybrid:.4f} -> "
|
| 485 |
+
f"{boosted_hybrid:.4f} (+{workflow_boost:.4f})"
|
| 486 |
+
)
|
| 487 |
+
|
| 488 |
+
# -----------------------------------------------------------------
|
| 489 |
+
# DECAYING AGGREGATION over Top-5
|
| 490 |
+
# -----------------------------------------------------------------
|
| 491 |
K_val = min(5, len(final_df))
|
| 492 |
if K_val > 0:
|
| 493 |
s1 = float(final_df.loc[0, "hybrid_score"])
|
|
|
|
| 497 |
for i in range(1, K_val):
|
| 498 |
si = float(final_df.loc[i, "hybrid_score"])
|
| 499 |
density_penalty += np.exp(-lam * i) * si
|
| 500 |
+
|
| 501 |
aggregated_score = min(1.0, s1 + beta * density_penalty)
|
| 502 |
+
|
| 503 |
# Recalculate originality based on aggregated similarity score
|
|
|
|
|
|
|
|
|
|
|
|
|
| 504 |
aggregated_originality = compute_originality(
|
| 505 |
+
hybrid_score=aggregated_score
|
|
|
|
|
|
|
| 506 |
)
|
| 507 |
if aggregated_score >= 0.90:
|
| 508 |
aggregated_originality = 0.0
|
| 509 |
+
|
| 510 |
final_df.loc[0, "originality_score"] = aggregated_originality
|
| 511 |
else:
|
| 512 |
aggregated_score = 0.0
|