mtyrrell committed on
Commit
b6faf10
·
1 Parent(s): a49bcd0

Duplicate concepts major refactor; pipeline refactor; rejection rationale

Browse files
Files changed (4) hide show
  1. app.py +12 -12
  2. images/pipeline.png +2 -2
  3. modules/pipeline.py +66 -34
  4. modules/semantic_similarity.py +216 -40
app.py CHANGED
@@ -28,8 +28,8 @@ from io import BytesIO
28
  logger = logging.getLogger(__name__)
29
 
30
  # Local
31
- # from dotenv import load_dotenv
32
- # load_dotenv()
33
 
34
  config = getconfig("config.cfg")
35
 
@@ -69,7 +69,7 @@ def get_azure_deployment():
69
  def main():
70
  # Temporarily set authentication to True for testing
71
  if 'authenticated' not in st.session_state:
72
- st.session_state['authenticated'] = False
73
 
74
  if st.session_state['authenticated']:
75
  # Remove login success message for testing
@@ -228,15 +228,15 @@ def main():
228
 
229
 
230
  # Comment out for testing
231
- else:
232
- username = st.text_input("Username")
233
- password = st.text_input("Password", type="password")
234
- if st.button("Login"):
235
- if validate_login(username, password):
236
- st.session_state['authenticated'] = True
237
- st.rerun()
238
- else:
239
- st.error("Incorrect username or password")
240
 
241
 
242
 
 
28
  logger = logging.getLogger(__name__)
29
 
30
  # Local
31
+ from dotenv import load_dotenv
32
+ load_dotenv()
33
 
34
  config = getconfig("config.cfg")
35
 
 
69
  def main():
70
  # Temporarily set authentication to True for testing
71
  if 'authenticated' not in st.session_state:
72
+ st.session_state['authenticated'] = True
73
 
74
  if st.session_state['authenticated']:
75
  # Remove login success message for testing
 
228
 
229
 
230
  # Comment out for testing
231
+ # else:
232
+ # username = st.text_input("Username")
233
+ # password = st.text_input("Password", type="password")
234
+ # if st.button("Login"):
235
+ # if validate_login(username, password):
236
+ # st.session_state['authenticated'] = True
237
+ # st.rerun()
238
+ # else:
239
+ # st.error("Incorrect username or password")
240
 
241
 
242
 
images/pipeline.png CHANGED

Git LFS Details

  • SHA256: c6966ec792e3b749e773a185e441a558bd5abbde42bb61f826fb1321af910928
  • Pointer size: 131 Bytes
  • Size of remote file: 679 kB

Git LFS Details

  • SHA256: b97765fa2ce04f7f2cb0016d4c890beeffdfff39789a8ca1c6cf781ab1cc1d53
  • Pointer size: 131 Bytes
  • Size of remote file: 625 kB
modules/pipeline.py CHANGED
@@ -12,7 +12,7 @@ from openpyxl.styles.differential import DifferentialStyle
12
  from modules.org_count import standardize_organization_names
13
  from modules.utils import clean_text, extract_predicted_labels
14
  # from modules.llm import check_duplicate_concepts
15
- from modules.semantic_similarity import check_duplicate_concepts_semantic
16
  from sentence_transformers import SentenceTransformer
17
  import logging
18
 
@@ -110,6 +110,51 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
110
  return predictions
111
 
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # Main function to process data
114
  def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
115
  """
@@ -214,22 +259,13 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
214
  logger.info(f"Loading semantic similarity model on device: {device}")
215
  semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
216
 
217
- # Process duplicate check with progress tracking
218
- duplicate_results = []
219
- total = len(df)
220
- for i, row in df.iterrows():
221
- result = check_duplicate_concepts_semantic(
222
- semantic_model,
223
- row['id'],
224
- row['org_renamed'],
225
- row['scope_txt'],
226
- df
227
- )
228
- duplicate_results.append(result)
229
- # Update progress bar with each iteration
230
- progress = (i + 1) / total
231
- progress_bar.progress(progress)
232
- df['duplicate_check'] = duplicate_results
233
 
234
 
235
  logger.info(f"Completed: {model_name}")
@@ -308,27 +344,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
308
  else False, axis=1)
309
 
310
  # Predict score
311
- sector_classes = ['Energy','Transport','Industries']
312
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
313
- # labelling logic
314
- df['pred_action'] = df.apply(lambda x:
315
- 'REJECT' if (('concept_count' in df.columns and x['concept_count'] > 6) or
316
- x['LANG'][0:2] != 'en' or
317
- x['ADAPMIT'] == 'Adaptation' or
318
- not any(sector in [x['SECTOR1'], x['SECTOR2']] for sector in sector_classes) or
319
- x['word_length_check'] == True or
320
- x['duplicate_check'] == True or
321
- x['pred_score'] < sens_level)
322
- else 'PRE-ASSESSMENT' if sens_level <= x['pred_score'] <= sens_level+1
323
- else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
324
- else 'ERROR', axis=1)
325
 
326
  # Reorder columns in final dataframe
327
- column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
328
- 'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
329
- 'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
330
  'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
331
- 'word_length_check', 'pred_score', 'pred_action']
332
 
333
  # Only include columns that exist in the DataFrame
334
  final_columns = [col for col in column_order if col in df.columns]
 
12
  from modules.org_count import standardize_organization_names
13
  from modules.utils import clean_text, extract_predicted_labels
14
  # from modules.llm import check_duplicate_concepts
15
+ from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
16
  from sentence_transformers import SentenceTransformer
17
  import logging
18
 
 
110
  return predictions
111
 
112
 
113
+
114
+
115
# Helper function to determine pred_action and build rationale
def determine_action(row, sens_level):
    """
    Classify one concept row into a pred_action and collect the reasons.

    Args:
        row: pandas Series for a single concept; must provide 'LANG',
            'ADAPMIT', 'SECTOR1', 'SECTOR2', 'word_length_check' and
            'pred_score' ('concept_count' is optional).
        sens_level: sensitivity threshold the predicted score is compared to.

    Returns:
        pd.Series with keys 'pred_action' (one of 'INELIGIBLE', 'REJECT',
        'PRE-ASSESSMENT', 'FULL-ASSESSMENT', 'ERROR') and 'action_rationale'
        (list of human-readable reasons; empty when nothing was flagged).
    """
    rationale = []
    action = None
    # Eligible sector classes for filtering
    SECTOR_CLASSES = ['Energy', 'Transport', 'Industries']

    # Label cause of REJECT / INELIGIBLE
    if 'concept_count' in row.index and row['concept_count'] > 6:
        rationale.append('Multiple concepts same org (>6)')
        action = 'INELIGIBLE'
    if row['LANG'][0:2] != 'en':
        rationale.append(f"Non-English language: ({row['LANG']})")
        action = 'INELIGIBLE'
    if row['ADAPMIT'] == 'Adaptation':
        rationale.append('Adaptation (not Mitigation)')
        action = 'INELIGIBLE'
    if not any(sector in [row['SECTOR1'], row['SECTOR2']] for sector in SECTOR_CLASSES):
        # NOTE(review): assumes a missing second sector is stored as None —
        # pandas often holds NaN instead, which would take the two-sector
        # branch below; confirm how SECTOR2 is populated upstream.
        if row['SECTOR2'] is None:
            rationale.append(f"Ineligible sector ({row['SECTOR1']})")
        else:
            rationale.append(f"Ineligible sectors ({row['SECTOR1']}, {row['SECTOR2']})")
        action = 'INELIGIBLE'
    if row['word_length_check'] == True:
        rationale.append('Insufficient word count')
    if row['pred_score'] < sens_level:
        rationale.append(f"Score below threshold ({row['pred_score']} < {sens_level})")

    # Determine action: ineligibility dominates; any other flagged reason
    # means REJECT; otherwise the score is banded against sens_level.
    if action == 'INELIGIBLE':
        pass
    elif rationale:
        action = 'REJECT'
    elif sens_level <= row['pred_score'] <= sens_level + 1:
        action = 'PRE-ASSESSMENT'
    elif row['pred_score'] > sens_level + 2:
        action = 'FULL-ASSESSMENT'
    else:
        # NOTE(review): scores in (sens_level+1, sens_level+2] fall through to
        # here — with integer scores, exactly sens_level+2 yields 'ERROR'.
        # Looks like a gap in the banding; confirm the intended thresholds.
        action = 'ERROR'

    return pd.Series({'pred_action': action, 'action_rationale': rationale})
156
+
157
+
158
  # Main function to process data
159
  def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
160
  """
 
259
  logger.info(f"Loading semantic similarity model on device: {device}")
260
  semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
261
 
262
+ # Process duplicate check using batched approach for efficiency
263
+ progress_bar.progress(0.1) # Show initial progress
264
+ df['duplicate_check'] = check_duplicate_concepts(
265
+ semantic_model,
266
+ df
267
+ )
268
+ progress_bar.progress(1.0)
 
 
 
 
 
 
 
 
 
269
 
270
 
271
  logger.info(f"Completed: {model_name}")
 
344
  else False, axis=1)
345
 
346
  # Predict score
 
347
  df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
348
+
349
+ # Initialize action_rationale column
350
+ df['action_rationale'] = [[] for _ in range(len(df))]
351
+
352
+ # Apply the function to determine action and rationale
353
+ df[['pred_action', 'action_rationale']] = df.apply(lambda x: determine_action(x, sens_level), axis=1, result_type='expand')
354
+
355
+ # Final assessment of duplicate concepts (top scored concept is maintained, others: 'REJECT'. If tie, take the first in the df index)
356
+ df = assess_duplicate_concepts(df)
 
 
 
357
 
358
  # Reorder columns in final dataframe
359
+ column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
360
+ 'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
361
+ 'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
362
  'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
363
+ 'word_length_check', 'pred_score', 'pred_action', 'action_rationale']
364
 
365
  # Only include columns that exist in the DataFrame
366
  final_columns = [col for col in column_order if col in df.columns]
modules/semantic_similarity.py CHANGED
@@ -1,70 +1,246 @@
1
  # Semantic similarity-based duplicate detection
 
2
  import pandas as pd
3
- import logging
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from modules.utils import setup_logging
7
 
8
  logger = setup_logging()
9
 
 
 
10
 
11
- def check_duplicate_concepts_semantic(
 
12
  model: SentenceTransformer,
13
- concept_id: str,
14
- organization: str,
15
- concept_profile: str,
16
  df: pd.DataFrame,
17
- similarity_threshold: float = 0.85
18
- ) -> bool:
 
19
  """
20
- Check for duplicate concepts within the same organization using semantic similarity
 
 
 
 
 
21
 
22
  Args:
23
  model: SentenceTransformer model for computing embeddings
24
- concept_id: ID of the current concept being checked
25
- organization: Organization name
26
- concept_profile: Text description of the concept to check
27
  df: DataFrame containing all application data
28
  similarity_threshold: Threshold for considering concepts duplicates (0-1)
29
  Recommended values: 0.80 (lenient) to 0.95 (strict)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
30
 
31
  Returns:
32
- Boolean classification result
 
33
  """
 
34
 
35
- # Remove current concept from the dataframe
36
- df_check = df[df['id'] != concept_id].copy()
37
 
38
- # Get other concepts from the same organization
39
- org_concepts = df_check[df_check['org_renamed'] == organization]
40
- other_concepts = org_concepts['scope_txt'].tolist()
41
 
42
- # If no other concepts from this organization, return False
43
- if len(other_concepts) == 0:
44
- return False
45
 
46
- # Compute embedding for current concept
47
- current_embedding = model.encode(
48
- concept_profile if concept_profile else "",
49
- convert_to_numpy=True
50
- )
51
 
52
- # Compute embeddings for other concepts
53
- other_embeddings = model.encode(
54
- [text if text else "" for text in other_concepts],
55
- convert_to_numpy=True
56
- )
57
 
58
- # Compute cosine similarities
59
- similarities = cosine_similarity(
60
- current_embedding.reshape(1, -1),
61
- other_embeddings
62
- )[0]
63
 
64
- max_similarity = similarities.max() if len(similarities) > 0 else 0.0
 
 
 
 
 
 
 
65
 
66
- logger.info(f"Duplicate check response for concept ID {concept_id}: max_similarity={max_similarity:.3f}")
 
 
 
67
 
68
- if max_similarity >= similarity_threshold:
69
- return True
70
- return False
 
1
  # Semantic similarity-based duplicate detection
2
+ import numpy as np
3
  import pandas as pd
 
4
  from sentence_transformers import SentenceTransformer
5
  from sklearn.metrics.pairwise import cosine_similarity
6
  from modules.utils import setup_logging
7
 
8
  logger = setup_logging()
9
 
10
# Text fields to check for duplication
DUPLICATE_CHECK_FIELDS = ['scope_txt', 'tech_txt', 'fin_txt', 'bar_txt']


def check_duplicate_concepts(
    model: "SentenceTransformer",
    df: pd.DataFrame,
    similarity_threshold: float = 0.85,
    batch_size: int = 64
) -> pd.Series:
    """
    Check for duplicate concepts within the same organization using semantic similarity.
    Uses batched embedding computation for efficiency.

    A concept is marked as duplicate if there exists at least one other concept
    in the same organization where ALL text fields simultaneously meet the
    similarity threshold.

    Args:
        model: SentenceTransformer model for computing embeddings
        df: DataFrame containing all application data
        similarity_threshold: Threshold for considering concepts duplicates (0-1)
                              Recommended values: 0.80 (lenient) to 0.95 (strict)
        batch_size: Batch size for embedding computation

    Returns:
        pd.Series of boolean values indexed by df.index, True if concept has a duplicate
    """
    # Function-local logger keeps the block self-contained; routes through the
    # same logging configuration the module sets up.
    import logging
    log = logging.getLogger(__name__)

    # Default: no duplicate found.
    results = pd.Series(False, index=df.index)

    # Pre-compute L2-normalized embeddings per field, in batches; with unit
    # rows, cosine similarity reduces to a plain dot product. Zero vectors are
    # left as zero rows so their similarity to anything is 0.
    log.info("Computing embeddings for all concepts...")
    field_embeddings = {}
    for field in DUPLICATE_CHECK_FIELDS:
        texts = df[field].fillna("").astype(str).tolist()
        emb = np.asarray(
            model.encode(
                texts,
                convert_to_numpy=True,
                batch_size=batch_size,
                show_progress_bar=False
            ),
            dtype=np.float64,
        )
        norms = np.linalg.norm(emb, axis=1, keepdims=True)
        norms[norms == 0.0] = 1.0
        field_embeddings[field] = emb / norms
    log.info("Embeddings computed for all fields")

    # Examine each organization independently.
    for _org, org_df in df.groupby('org_renamed'):
        # A single concept cannot have a duplicate.
        if len(org_df) < 2:
            continue

        org_indices = org_df.index.tolist()
        # NOTE(review): assumes df.index values are unique — get_loc returns a
        # slice/array for a duplicated label; confirm index uniqueness upstream.
        org_positions = [df.index.get_loc(idx) for idx in org_indices]
        n_concepts = len(org_positions)

        # NxN cosine-similarity matrix per field (unit rows -> dot product).
        field_sim_matrices = {}
        for field in DUPLICATE_CHECK_FIELDS:
            org_emb = field_embeddings[field][org_positions]
            field_sim_matrices[field] = org_emb @ org_emb.T

        # A pair is a duplicate only when ALL fields clear the threshold.
        pair_match = np.ones((n_concepts, n_concepts), dtype=bool)
        for field in DUPLICATE_CHECK_FIELDS:
            pair_match &= field_sim_matrices[field] >= similarity_threshold
        np.fill_diagonal(pair_match, False)  # a concept never matches itself

        for i in range(n_concepts):
            concept_idx = org_indices[i]
            concept_id = df.loc[concept_idx, 'id']
            if pair_match[i].any():
                # First matching partner, mirroring the original scan order.
                j = int(np.argmax(pair_match[i]))
                other_concept_id = df.loc[org_indices[j], 'id']
                field_sims = {f: field_sim_matrices[f][i, j] for f in DUPLICATE_CHECK_FIELDS}
                log.info(
                    f"Duplicate found: concept {concept_id} matches concept {other_concept_id} "
                    f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in field_sims.items())}}})"
                )
                results.loc[concept_idx] = True
            else:
                log.info(f"No duplicate found for concept {concept_id}")

    return results
113
+
114
+
115
+ # def check_duplicate_concepts_semantic(
116
+ # model: SentenceTransformer,
117
+ # concept_id: str,
118
+ # df: pd.DataFrame,
119
+ # similarity_threshold: float = 0.85
120
+ # ) -> bool:
121
+ # """
122
+ # Check for duplicate concepts within the same organization using semantic similarity.
123
+ # Returns True if there exists at least one other concept where ALL text fields
124
+ # simultaneously meet the similarity threshold.
125
+
126
+ # DEPRECATED: Use check_duplicate_concepts() for better performance.
127
+
128
+ # Args:
129
+ # model: SentenceTransformer model for computing embeddings
130
+ # concept_id: ID of the current concept being checked
131
+ # df: DataFrame containing all application data
132
+ # similarity_threshold: Threshold for considering concepts duplicates (0-1)
133
+ # Recommended values: 0.80 (lenient) to 0.95 (strict)
134
+
135
+ # Returns:
136
+ # Boolean classification result - True if any single other concept matches
137
+ # on ALL fields simultaneously
138
+ # """
139
+ # # Get the current concept's row
140
+ # current_row = df[df['id'] == concept_id]
141
+ # if len(current_row) == 0:
142
+ # logger.warning(f"Concept ID {concept_id} not found in dataframe")
143
+ # return False
144
+
145
+ # current_row = current_row.iloc[0]
146
+ # organization = current_row['org_renamed']
147
+
148
+ # # Get other concepts from the same organization (excluding current)
149
+ # org_concepts = df[(df['org_renamed'] == organization) & (df['id'] != concept_id)]
150
+
151
+ # # If no other concepts from this organization, return False
152
+ # if len(org_concepts) == 0:
153
+ # return False
154
+
155
+ # # Pre-compute embeddings for current concept's fields
156
+ # current_embeddings = {}
157
+ # for field in DUPLICATE_CHECK_FIELDS:
158
+ # current_text = current_row.get(field, "") or ""
159
+ # current_embeddings[field] = model.encode(current_text, convert_to_numpy=True)
160
+
161
+ # # Check each other concept - ALL fields must match for a single concept
162
+ # for _, other_row in org_concepts.iterrows():
163
+ # all_fields_match_this_concept = True
164
+ # field_sims = {}
165
+
166
+ # for field in DUPLICATE_CHECK_FIELDS:
167
+ # other_text = other_row.get(field, "") or ""
168
+ # other_embedding = model.encode(other_text, convert_to_numpy=True)
169
+
170
+ # similarity = cosine_similarity(
171
+ # current_embeddings[field].reshape(1, -1),
172
+ # other_embedding.reshape(1, -1)
173
+ # )[0][0]
174
+
175
+ # field_sims[field] = similarity
176
+
177
+ # if similarity < similarity_threshold:
178
+ # all_fields_match_this_concept = False
179
+ # break # No need to check remaining fields for this concept
180
+
181
+ # if all_fields_match_this_concept:
182
+ # logger.info(
183
+ # f"Duplicate found: concept {concept_id} matches concept {other_row['id']} "
184
+ # f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in field_sims.items())}}})"
185
+ # )
186
+ # return True
187
+
188
+ # logger.info(f"No duplicate found for concept {concept_id}")
189
+ # return False
190
+
191
+
192
def assess_duplicate_concepts(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Resolve flagged duplicate concepts within the same organization.

    For every organization that has duplicate-flagged concepts, the top-ranked
    concept (highest 'pred_score'; on ties, the lowest DataFrame index) keeps
    its current 'pred_action'; every other flagged duplicate is set to
    'REJECT' and, when an 'action_rationale' column exists, gets an
    explanatory entry appended.

    Args:
        df: DataFrame containing application data with 'duplicate_check',
            'org_renamed', 'pred_score', 'id', and 'pred_action' columns
            ('action_rationale' is optional)

    Returns:
        A new DataFrame (the input is not mutated) with pred_action set to
        'REJECT' for duplicate concepts that are not the top-scoring concept
        within their organization
    """
    # Function-local logger keeps the block self-contained; routes through the
    # same logging configuration the module sets up.
    import logging
    log = logging.getLogger(__name__)

    df_assess = df.copy()

    # Filter to only duplicate-flagged concepts
    duplicates_mask = df_assess['duplicate_check'] == True

    # Get unique organizations that have duplicates
    orgs_with_duplicates = df_assess.loc[duplicates_mask, 'org_renamed'].unique()

    for org in orgs_with_duplicates:
        # All duplicate-flagged concepts for this organization
        org_duplicates_mask = (df_assess['org_renamed'] == org) & duplicates_mask

        # Sort by pred_score descending. kind='stable' guarantees the tie
        # rule (lowest original index wins) — the default quicksort does not.
        org_duplicates = df_assess.loc[org_duplicates_mask].sort_values(
            by='pred_score',
            ascending=False,
            kind='stable'
        )

        # Top concept is the first one after sorting
        top_concept_id = org_duplicates.iloc[0]['id']

        # Set pred_action to 'REJECT' for all duplicates except the top concept
        reject_mask = org_duplicates_mask & (df_assess['id'] != top_concept_id)
        df_assess.loc[reject_mask, 'pred_action'] = 'REJECT'

        # Append rationale for rejected duplicates (if action_rationale column exists).
        # Build a NEW list each time: DataFrame.copy() is shallow with respect
        # to the list objects stored in the column, so an in-place .append()
        # would also mutate the caller's original DataFrame.
        if 'action_rationale' in df_assess.columns:
            reason = f'Lower-scoring duplicate (kept concept {top_concept_id})'
            for idx in df_assess.loc[reject_mask].index:
                rationale = df_assess.at[idx, 'action_rationale']
                if isinstance(rationale, list):
                    df_assess.at[idx, 'action_rationale'] = rationale + [reason]
                else:
                    df_assess.at[idx, 'action_rationale'] = [reason]

        log.info(
            f"Duplicate assessment for org '{org}': kept concept {top_concept_id}, "
            f"rejected {reject_mask.sum()} duplicate(s)"
        )

    return df_assess