Spaces:
Sleeping
Sleeping
Duplicate concepts major refactor; pipeline refactor; rejection rationale
Browse files- app.py +12 -12
- images/pipeline.png +2 -2
- modules/pipeline.py +66 -34
- modules/semantic_similarity.py +216 -40
app.py
CHANGED
|
@@ -28,8 +28,8 @@ from io import BytesIO
|
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
# Local
|
| 31 |
-
|
| 32 |
-
|
| 33 |
|
| 34 |
config = getconfig("config.cfg")
|
| 35 |
|
|
@@ -69,7 +69,7 @@ def get_azure_deployment():
|
|
| 69 |
def main():
|
| 70 |
# Temporarily set authentication to True for testing
|
| 71 |
if 'authenticated' not in st.session_state:
|
| 72 |
-
st.session_state['authenticated'] =
|
| 73 |
|
| 74 |
if st.session_state['authenticated']:
|
| 75 |
# Remove login success message for testing
|
|
@@ -228,15 +228,15 @@ def main():
|
|
| 228 |
|
| 229 |
|
| 230 |
# Comment out for testing
|
| 231 |
-
else:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
|
| 241 |
|
| 242 |
|
|
|
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
# Local
|
| 31 |
+
from dotenv import load_dotenv
|
| 32 |
+
load_dotenv()
|
| 33 |
|
| 34 |
config = getconfig("config.cfg")
|
| 35 |
|
|
|
|
| 69 |
def main():
|
| 70 |
# Temporarily set authentication to True for testing
|
| 71 |
if 'authenticated' not in st.session_state:
|
| 72 |
+
st.session_state['authenticated'] = True
|
| 73 |
|
| 74 |
if st.session_state['authenticated']:
|
| 75 |
# Remove login success message for testing
|
|
|
|
| 228 |
|
| 229 |
|
| 230 |
# Comment out for testing
|
| 231 |
+
# else:
|
| 232 |
+
# username = st.text_input("Username")
|
| 233 |
+
# password = st.text_input("Password", type="password")
|
| 234 |
+
# if st.button("Login"):
|
| 235 |
+
# if validate_login(username, password):
|
| 236 |
+
# st.session_state['authenticated'] = True
|
| 237 |
+
# st.rerun()
|
| 238 |
+
# else:
|
| 239 |
+
# st.error("Incorrect username or password")
|
| 240 |
|
| 241 |
|
| 242 |
|
images/pipeline.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
modules/pipeline.py
CHANGED
|
@@ -12,7 +12,7 @@ from openpyxl.styles.differential import DifferentialStyle
|
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
# from modules.llm import check_duplicate_concepts
|
| 15 |
-
from modules.semantic_similarity import
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
| 18 |
|
|
@@ -110,6 +110,51 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
| 110 |
return predictions
|
| 111 |
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Main function to process data
|
| 114 |
def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
| 115 |
"""
|
|
@@ -214,22 +259,13 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 214 |
logger.info(f"Loading semantic similarity model on device: {device}")
|
| 215 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 216 |
|
| 217 |
-
# Process duplicate check
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
row['org_renamed'],
|
| 225 |
-
row['scope_txt'],
|
| 226 |
-
df
|
| 227 |
-
)
|
| 228 |
-
duplicate_results.append(result)
|
| 229 |
-
# Update progress bar with each iteration
|
| 230 |
-
progress = (i + 1) / total
|
| 231 |
-
progress_bar.progress(progress)
|
| 232 |
-
df['duplicate_check'] = duplicate_results
|
| 233 |
|
| 234 |
|
| 235 |
logger.info(f"Completed: {model_name}")
|
|
@@ -308,27 +344,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 308 |
else False, axis=1)
|
| 309 |
|
| 310 |
# Predict score
|
| 311 |
-
sector_classes = ['Energy','Transport','Industries']
|
| 312 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
else 'PRE-ASSESSMENT' if sens_level <= x['pred_score'] <= sens_level+1
|
| 323 |
-
else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
|
| 324 |
-
else 'ERROR', axis=1)
|
| 325 |
|
| 326 |
# Reorder columns in final dataframe
|
| 327 |
-
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
|
| 328 |
-
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 329 |
-
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 330 |
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
|
| 331 |
-
'word_length_check', 'pred_score', 'pred_action']
|
| 332 |
|
| 333 |
# Only include columns that exist in the DataFrame
|
| 334 |
final_columns = [col for col in column_order if col in df.columns]
|
|
|
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
# from modules.llm import check_duplicate_concepts
|
| 15 |
+
from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
| 18 |
|
|
|
|
| 110 |
return predictions
|
| 111 |
|
| 112 |
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Helper function to determine pred_action and build rationale
def determine_action(row, sens_level):
    """Derive the predicted action for a single concept row.

    Collects human-readable reasons for ineligibility/rejection, then maps
    the row's predicted score onto an assessment tier.

    Args:
        row: pd.Series for one concept (LANG, ADAPMIT, SECTOR1/SECTOR2,
            word_length_check, pred_score, and optionally concept_count).
        sens_level: numeric sensitivity threshold for the score tiers.

    Returns:
        pd.Series with 'pred_action' (str) and 'action_rationale' (list of str).
    """
    # Eligible sector classes for filtering
    eligible_sectors = ['Energy', 'Transport', 'Industries']
    reasons = []
    verdict = None

    # --- Hard ineligibility checks --------------------------------------
    if 'concept_count' in row.index and row['concept_count'] > 6:
        reasons.append('Multiple concepts same org (>6)')
        verdict = 'INELIGIBLE'

    lang = row['LANG']
    if lang[0:2] != 'en':
        reasons.append(f"Non-English language: ({lang})")
        verdict = 'INELIGIBLE'

    if row['ADAPMIT'] == 'Adaptation':
        reasons.append('Adaptation (not Mitigation)')
        verdict = 'INELIGIBLE'

    row_sectors = [row['SECTOR1'], row['SECTOR2']]
    if not any(s in row_sectors for s in eligible_sectors):
        if row['SECTOR2'] is None:
            reasons.append(f"Ineligible sector ({row['SECTOR1']})")
        else:
            reasons.append(f"Ineligible sectors ({row['SECTOR1']}, {row['SECTOR2']})")
        verdict = 'INELIGIBLE'

    # --- Soft rejection checks (recorded but do not force INELIGIBLE) ---
    if row['word_length_check'] == True:
        reasons.append('Insufficient word count')
    if row['pred_score'] < sens_level:
        reasons.append(f"Score below threshold ({row['pred_score']} < {sens_level})")

    # --- Map to final action ---------------------------------------------
    if verdict != 'INELIGIBLE':
        if reasons:
            verdict = 'REJECT'
        elif sens_level <= row['pred_score'] <= sens_level + 1:
            verdict = 'PRE-ASSESSMENT'
        elif row['pred_score'] > sens_level + 2:
            # NOTE(review): a score of exactly sens_level + 2 falls through to
            # 'ERROR' (matches pre-refactor logic) — confirm this gap is intended.
            verdict = 'FULL-ASSESSMENT'
        else:
            verdict = 'ERROR'

    return pd.Series({'pred_action': verdict, 'action_rationale': reasons})
|
| 156 |
+
|
| 157 |
+
|
| 158 |
# Main function to process data
|
| 159 |
def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
| 160 |
"""
|
|
|
|
| 259 |
logger.info(f"Loading semantic similarity model on device: {device}")
|
| 260 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 261 |
|
| 262 |
+
# Process duplicate check using batched approach for efficiency
|
| 263 |
+
progress_bar.progress(0.1) # Show initial progress
|
| 264 |
+
df['duplicate_check'] = check_duplicate_concepts(
|
| 265 |
+
semantic_model,
|
| 266 |
+
df
|
| 267 |
+
)
|
| 268 |
+
progress_bar.progress(1.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
|
| 271 |
logger.info(f"Completed: {model_name}")
|
|
|
|
| 344 |
else False, axis=1)
|
| 345 |
|
| 346 |
# Predict score
|
|
|
|
| 347 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
|
| 348 |
+
|
| 349 |
+
# Initialize action_rationale column
|
| 350 |
+
df['action_rationale'] = [[] for _ in range(len(df))]
|
| 351 |
+
|
| 352 |
+
# Apply the function to determine action and rationale
|
| 353 |
+
df[['pred_action', 'action_rationale']] = df.apply(lambda x: determine_action(x, sens_level), axis=1, result_type='expand')
|
| 354 |
+
|
| 355 |
+
# Final assessment of duplicate concepts (top scored concept is maintained, others: 'REJECT'. If tie, take the first in the df index)
|
| 356 |
+
df = assess_duplicate_concepts(df)
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
# Reorder columns in final dataframe
|
| 359 |
+
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
|
| 360 |
+
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 361 |
+
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 362 |
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
|
| 363 |
+
'word_length_check', 'pred_score', 'pred_action', 'action_rationale']
|
| 364 |
|
| 365 |
# Only include columns that exist in the DataFrame
|
| 366 |
final_columns = [col for col in column_order if col in df.columns]
|
modules/semantic_similarity.py
CHANGED
|
@@ -1,70 +1,246 @@
|
|
| 1 |
# Semantic similarity-based duplicate detection
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
-
import logging
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from modules.utils import setup_logging
|
| 7 |
|
| 8 |
logger = setup_logging()
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
| 12 |
model: SentenceTransformer,
|
| 13 |
-
concept_id: str,
|
| 14 |
-
organization: str,
|
| 15 |
-
concept_profile: str,
|
| 16 |
df: pd.DataFrame,
|
| 17 |
-
similarity_threshold: float = 0.85
|
| 18 |
-
|
|
|
|
| 19 |
"""
|
| 20 |
-
Check for duplicate concepts within the same organization using semantic similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
Args:
|
| 23 |
model: SentenceTransformer model for computing embeddings
|
| 24 |
-
concept_id: ID of the current concept being checked
|
| 25 |
-
organization: Organization name
|
| 26 |
-
concept_profile: Text description of the concept to check
|
| 27 |
df: DataFrame containing all application data
|
| 28 |
similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 29 |
Recommended values: 0.80 (lenient) to 0.95 (strict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
Returns:
|
| 32 |
-
|
|
|
|
| 33 |
"""
|
|
|
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
|
| 38 |
-
# Get
|
| 39 |
-
|
| 40 |
-
other_concepts = org_concepts['scope_txt'].tolist()
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
[text if text else "" for text in other_concepts],
|
| 55 |
-
convert_to_numpy=True
|
| 56 |
-
)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
other_embeddings
|
| 62 |
-
)[0]
|
| 63 |
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
return True
|
| 70 |
-
return False
|
|
|
|
| 1 |
# Semantic similarity-based duplicate detection
|
| 2 |
+
import numpy as np
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from modules.utils import setup_logging
|
| 7 |
|
| 8 |
logger = setup_logging()
|
| 9 |
|
| 10 |
+
# Text fields to check for duplication
DUPLICATE_CHECK_FIELDS = ['scope_txt', 'tech_txt', 'fin_txt', 'bar_txt']


def check_duplicate_concepts(
    model: SentenceTransformer,
    df: pd.DataFrame,
    similarity_threshold: float = 0.85,
    batch_size: int = 64
) -> pd.Series:
    """Flag concepts that duplicate another concept within the same organization.

    Embeddings for every text field are computed once in batches; a pair of
    concepts counts as a duplicate only when ALL fields in
    DUPLICATE_CHECK_FIELDS meet the similarity threshold simultaneously.

    Args:
        model: SentenceTransformer model for computing embeddings
        df: DataFrame containing all application data
        similarity_threshold: Threshold for considering concepts duplicates (0-1)
            Recommended values: 0.80 (lenient) to 0.95 (strict)
        batch_size: Batch size for embedding computation

    Returns:
        pd.Series of boolean values indexed by df.index, True if concept has a duplicate
    """
    # Default every concept to "no duplicate found".
    flags = pd.Series(False, index=df.index)

    # Embed every field for the whole frame up front — one batched pass per field.
    logger.info("Computing embeddings for all concepts...")
    embeddings_by_field = {
        field: model.encode(
            df[field].fillna("").astype(str).tolist(),
            convert_to_numpy=True,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        for field in DUPLICATE_CHECK_FIELDS
    }
    logger.info("Embeddings computed for all fields")

    for _, group in df.groupby('org_renamed'):
        # A lone concept cannot have an in-org duplicate.
        if len(group) < 2:
            continue

        # Index labels of this org's rows and their positions in df,
        # so we can slice rows out of the precomputed embedding arrays.
        labels = group.index.tolist()
        positions = [df.index.get_loc(lbl) for lbl in labels]
        size = len(positions)

        # Per-field NxN cosine-similarity matrices restricted to this org.
        sims = {
            field: cosine_similarity(embeddings_by_field[field][positions])
            for field in DUPLICATE_CHECK_FIELDS
        }

        for i, row_label in enumerate(labels):
            concept_id = df.loc[row_label, 'id']

            for j in range(size):
                if j == i:
                    continue

                # The pair is a duplicate only if every field clears the threshold.
                pair_sims = {}
                matched = True
                for field in DUPLICATE_CHECK_FIELDS:
                    value = sims[field][i, j]
                    pair_sims[field] = value
                    if value < similarity_threshold:
                        matched = False
                        break

                if matched:
                    other_concept_id = df.loc[labels[j], 'id']
                    logger.info(
                        f"Duplicate found: concept {concept_id} matches concept {other_concept_id} "
                        f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in pair_sims.items())}}})"
                    )
                    flags.loc[row_label] = True
                    break  # One matching partner is enough.

        # Log the concepts in this org that came out clean.
        for row_label in labels:
            if not flags.loc[row_label]:
                concept_id = df.loc[row_label, 'id']
                logger.info(f"No duplicate found for concept {concept_id}")

    return flags
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# def check_duplicate_concepts_semantic(
|
| 116 |
+
# model: SentenceTransformer,
|
| 117 |
+
# concept_id: str,
|
| 118 |
+
# df: pd.DataFrame,
|
| 119 |
+
# similarity_threshold: float = 0.85
|
| 120 |
+
# ) -> bool:
|
| 121 |
+
# """
|
| 122 |
+
# Check for duplicate concepts within the same organization using semantic similarity.
|
| 123 |
+
# Returns True if there exists at least one other concept where ALL text fields
|
| 124 |
+
# simultaneously meet the similarity threshold.
|
| 125 |
+
|
| 126 |
+
# DEPRECATED: Use check_duplicate_concepts() (the batched version above) for better performance.
|
| 127 |
+
|
| 128 |
+
# Args:
|
| 129 |
+
# model: SentenceTransformer model for computing embeddings
|
| 130 |
+
# concept_id: ID of the current concept being checked
|
| 131 |
+
# df: DataFrame containing all application data
|
| 132 |
+
# similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 133 |
+
# Recommended values: 0.80 (lenient) to 0.95 (strict)
|
| 134 |
+
|
| 135 |
+
# Returns:
|
| 136 |
+
# Boolean classification result - True if any single other concept matches
|
| 137 |
+
# on ALL fields simultaneously
|
| 138 |
+
# """
|
| 139 |
+
# # Get the current concept's row
|
| 140 |
+
# current_row = df[df['id'] == concept_id]
|
| 141 |
+
# if len(current_row) == 0:
|
| 142 |
+
# logger.warning(f"Concept ID {concept_id} not found in dataframe")
|
| 143 |
+
# return False
|
| 144 |
+
|
| 145 |
+
# current_row = current_row.iloc[0]
|
| 146 |
+
# organization = current_row['org_renamed']
|
| 147 |
+
|
| 148 |
+
# # Get other concepts from the same organization (excluding current)
|
| 149 |
+
# org_concepts = df[(df['org_renamed'] == organization) & (df['id'] != concept_id)]
|
| 150 |
+
|
| 151 |
+
# # If no other concepts from this organization, return False
|
| 152 |
+
# if len(org_concepts) == 0:
|
| 153 |
+
# return False
|
| 154 |
+
|
| 155 |
+
# # Pre-compute embeddings for current concept's fields
|
| 156 |
+
# current_embeddings = {}
|
| 157 |
+
# for field in DUPLICATE_CHECK_FIELDS:
|
| 158 |
+
# current_text = current_row.get(field, "") or ""
|
| 159 |
+
# current_embeddings[field] = model.encode(current_text, convert_to_numpy=True)
|
| 160 |
+
|
| 161 |
+
# # Check each other concept - ALL fields must match for a single concept
|
| 162 |
+
# for _, other_row in org_concepts.iterrows():
|
| 163 |
+
# all_fields_match_this_concept = True
|
| 164 |
+
# field_sims = {}
|
| 165 |
+
|
| 166 |
+
# for field in DUPLICATE_CHECK_FIELDS:
|
| 167 |
+
# other_text = other_row.get(field, "") or ""
|
| 168 |
+
# other_embedding = model.encode(other_text, convert_to_numpy=True)
|
| 169 |
+
|
| 170 |
+
# similarity = cosine_similarity(
|
| 171 |
+
# current_embeddings[field].reshape(1, -1),
|
| 172 |
+
# other_embedding.reshape(1, -1)
|
| 173 |
+
# )[0][0]
|
| 174 |
+
|
| 175 |
+
# field_sims[field] = similarity
|
| 176 |
+
|
| 177 |
+
# if similarity < similarity_threshold:
|
| 178 |
+
# all_fields_match_this_concept = False
|
| 179 |
+
# break # No need to check remaining fields for this concept
|
| 180 |
+
|
| 181 |
+
# if all_fields_match_this_concept:
|
| 182 |
+
# logger.info(
|
| 183 |
+
# f"Duplicate found: concept {concept_id} matches concept {other_row['id']} "
|
| 184 |
+
# f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in field_sims.items())}}})"
|
| 185 |
+
# )
|
| 186 |
+
# return True
|
| 187 |
+
|
| 188 |
+
# logger.info(f"No duplicate found for concept {concept_id}")
|
| 189 |
+
# return False
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def assess_duplicate_concepts(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Check flagged duplicate concepts within the same organization.
    Get the top ranked concept out of all the duplicates (pred_score) and reject all others.

    Args:
        df: DataFrame containing application data with 'duplicate_check', 'org_renamed',
            'pred_score', 'id', and 'pred_action' columns

    Returns:
        DataFrame with pred_action set to 'REJECT' for duplicate concepts that are not
        the top-scoring concept within their organization
    """
    df_assess = df.copy()

    # Filter to only duplicate-flagged concepts
    duplicates_mask = df_assess['duplicate_check'] == True

    # Get unique organizations that have duplicates
    orgs_with_duplicates = df_assess.loc[duplicates_mask, 'org_renamed'].unique()

    for org in orgs_with_duplicates:
        # Get all duplicate concepts for this organization
        org_duplicates_mask = (df_assess['org_renamed'] == org) & duplicates_mask

        # Sort by pred_score descending. kind='stable' is required for the
        # documented tie-break (lowest original position wins): the default
        # quicksort does NOT preserve input order among equal scores.
        org_duplicates = df_assess.loc[org_duplicates_mask].sort_values(
            by='pred_score',
            ascending=False,
            kind='stable'
        )

        # Top concept is the first one after sorting
        top_concept_id = org_duplicates.iloc[0]['id']

        # Set pred_action to 'REJECT' for all duplicates except the top concept
        reject_mask = org_duplicates_mask & (df_assess['id'] != top_concept_id)
        df_assess.loc[reject_mask, 'pred_action'] = 'REJECT'

        # Append rationale for rejected duplicates (if action_rationale column exists).
        # NOTE: DataFrame.copy() does not deep-copy Python list objects stored in
        # object-dtype cells, so we must assign a NEW list instead of appending
        # in place — otherwise the caller's original df would be mutated too.
        if 'action_rationale' in df_assess.columns:
            reason = f'Lower-scoring duplicate (kept concept {top_concept_id})'
            for idx in df_assess.loc[reject_mask].index:
                rationale = df_assess.at[idx, 'action_rationale']
                if isinstance(rationale, list):
                    df_assess.at[idx, 'action_rationale'] = rationale + [reason]
                else:
                    df_assess.at[idx, 'action_rationale'] = [reason]

        logger.info(
            f"Duplicate assessment for org '{org}': kept concept {top_concept_id}, "
            f"rejected {reject_mask.sum()} duplicate(s)"
        )

    return df_assess
|
|
|
|
|
|