Spaces:
Sleeping
Sleeping
Duplicate concepts major refactor; pipeline refactor; rejection rationale
Browse files- app.py +12 -12
- images/pipeline.png +2 -2
- modules/pipeline.py +66 -34
- modules/semantic_similarity.py +216 -40
app.py
CHANGED
|
@@ -28,8 +28,8 @@ from io import BytesIO
|
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
# Local
|
| 31 |
-
|
| 32 |
-
|
| 33 |
|
| 34 |
config = getconfig("config.cfg")
|
| 35 |
|
|
@@ -69,7 +69,7 @@ def get_azure_deployment():
|
|
| 69 |
def main():
|
| 70 |
# Temporarily set authentication to True for testing
|
| 71 |
if 'authenticated' not in st.session_state:
|
| 72 |
-
st.session_state['authenticated'] =
|
| 73 |
|
| 74 |
if st.session_state['authenticated']:
|
| 75 |
# Remove login success message for testing
|
|
@@ -228,15 +228,15 @@ def main():
|
|
| 228 |
|
| 229 |
|
| 230 |
# Comment out for testing
|
| 231 |
-
else:
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
|
| 238 |
-
|
| 239 |
-
|
| 240 |
|
| 241 |
|
| 242 |
|
|
|
|
| 28 |
logger = logging.getLogger(__name__)
|
| 29 |
|
| 30 |
# Local
|
| 31 |
+
from dotenv import load_dotenv
|
| 32 |
+
load_dotenv()
|
| 33 |
|
| 34 |
config = getconfig("config.cfg")
|
| 35 |
|
|
|
|
| 69 |
def main():
|
| 70 |
# Temporarily set authentication to True for testing
|
| 71 |
if 'authenticated' not in st.session_state:
|
| 72 |
+
st.session_state['authenticated'] = True
|
| 73 |
|
| 74 |
if st.session_state['authenticated']:
|
| 75 |
# Remove login success message for testing
|
|
|
|
| 228 |
|
| 229 |
|
| 230 |
# Comment out for testing
|
| 231 |
+
# else:
|
| 232 |
+
# username = st.text_input("Username")
|
| 233 |
+
# password = st.text_input("Password", type="password")
|
| 234 |
+
# if st.button("Login"):
|
| 235 |
+
# if validate_login(username, password):
|
| 236 |
+
# st.session_state['authenticated'] = True
|
| 237 |
+
# st.rerun()
|
| 238 |
+
# else:
|
| 239 |
+
# st.error("Incorrect username or password")
|
| 240 |
|
| 241 |
|
| 242 |
|
images/pipeline.png
CHANGED
|
Git LFS Details
|
|
Git LFS Details
|
modules/pipeline.py
CHANGED
|
@@ -12,7 +12,7 @@ from openpyxl.styles.differential import DifferentialStyle
|
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
# from modules.llm import check_duplicate_concepts
|
| 15 |
-
from modules.semantic_similarity import
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
| 18 |
|
|
@@ -110,6 +110,51 @@ def predict_category(df, model_name, progress_bar, repo, profile, multilabel=Fal
|
|
| 110 |
return predictions
|
| 111 |
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
# Main function to process data
|
| 114 |
def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
| 115 |
"""
|
|
@@ -214,22 +259,13 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 214 |
logger.info(f"Loading semantic similarity model on device: {device}")
|
| 215 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 216 |
|
| 217 |
-
# Process duplicate check
|
| 218 |
-
|
| 219 |
-
|
| 220 |
-
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
row['org_renamed'],
|
| 225 |
-
row['scope_txt'],
|
| 226 |
-
df
|
| 227 |
-
)
|
| 228 |
-
duplicate_results.append(result)
|
| 229 |
-
# Update progress bar with each iteration
|
| 230 |
-
progress = (i + 1) / total
|
| 231 |
-
progress_bar.progress(progress)
|
| 232 |
-
df['duplicate_check'] = duplicate_results
|
| 233 |
|
| 234 |
|
| 235 |
logger.info(f"Completed: {model_name}")
|
|
@@ -308,27 +344,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 308 |
else False, axis=1)
|
| 309 |
|
| 310 |
# Predict score
|
| 311 |
-
sector_classes = ['Energy','Transport','Industries']
|
| 312 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
else 'PRE-ASSESSMENT' if sens_level <= x['pred_score'] <= sens_level+1
|
| 323 |
-
else 'FULL-ASSESSMENT' if x['pred_score'] > sens_level+2
|
| 324 |
-
else 'ERROR', axis=1)
|
| 325 |
|
| 326 |
# Reorder columns in final dataframe
|
| 327 |
-
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
|
| 328 |
-
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 329 |
-
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 330 |
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
|
| 331 |
-
'word_length_check', 'pred_score', 'pred_action']
|
| 332 |
|
| 333 |
# Only include columns that exist in the DataFrame
|
| 334 |
final_columns = [col for col in column_order if col in df.columns]
|
|
|
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
# from modules.llm import check_duplicate_concepts
|
| 15 |
+
from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
| 18 |
|
|
|
|
| 110 |
return predictions
|
| 111 |
|
| 112 |
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# Helper function to determine pred_action and build rationale
def determine_action(row, sens_level):
    """Derive the predicted action for a single concept row.

    Collects human-readable reasons for ineligibility/rejection, then maps
    the row's predicted score onto an assessment tier.

    Args:
        row: pd.Series for one concept (LANG, ADAPMIT, SECTOR1/SECTOR2,
            word_length_check, pred_score, and optionally concept_count).
        sens_level: numeric sensitivity threshold for the score tiers.

    Returns:
        pd.Series with 'pred_action' (str) and 'action_rationale' (list of str).
    """
    # Eligible sector classes for filtering
    eligible_sectors = ['Energy', 'Transport', 'Industries']
    reasons = []
    verdict = None

    # --- Hard ineligibility checks --------------------------------------
    if 'concept_count' in row.index and row['concept_count'] > 6:
        reasons.append('Multiple concepts same org (>6)')
        verdict = 'INELIGIBLE'

    lang = row['LANG']
    if lang[0:2] != 'en':
        reasons.append(f"Non-English language: ({lang})")
        verdict = 'INELIGIBLE'

    if row['ADAPMIT'] == 'Adaptation':
        reasons.append('Adaptation (not Mitigation)')
        verdict = 'INELIGIBLE'

    row_sectors = [row['SECTOR1'], row['SECTOR2']]
    if not any(s in row_sectors for s in eligible_sectors):
        if row['SECTOR2'] is None:
            reasons.append(f"Ineligible sector ({row['SECTOR1']})")
        else:
            reasons.append(f"Ineligible sectors ({row['SECTOR1']}, {row['SECTOR2']})")
        verdict = 'INELIGIBLE'

    # --- Soft rejection checks (recorded but do not force INELIGIBLE) ---
    if row['word_length_check'] == True:
        reasons.append('Insufficient word count')
    if row['pred_score'] < sens_level:
        reasons.append(f"Score below threshold ({row['pred_score']} < {sens_level})")

    # --- Map to final action ---------------------------------------------
    if verdict != 'INELIGIBLE':
        if reasons:
            verdict = 'REJECT'
        elif sens_level <= row['pred_score'] <= sens_level + 1:
            verdict = 'PRE-ASSESSMENT'
        elif row['pred_score'] > sens_level + 2:
            # NOTE(review): a score of exactly sens_level + 2 falls through to
            # 'ERROR' (matches pre-refactor logic) — confirm this gap is intended.
            verdict = 'FULL-ASSESSMENT'
        else:
            verdict = 'ERROR'

    return pd.Series({'pred_action': verdict, 'action_rationale': reasons})
|
| 156 |
+
|
| 157 |
+
|
| 158 |
# Main function to process data
|
| 159 |
def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
| 160 |
"""
|
|
|
|
| 259 |
logger.info(f"Loading semantic similarity model on device: {device}")
|
| 260 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 261 |
|
| 262 |
+
# Process duplicate check using batched approach for efficiency
|
| 263 |
+
progress_bar.progress(0.1) # Show initial progress
|
| 264 |
+
df['duplicate_check'] = check_duplicate_concepts(
|
| 265 |
+
semantic_model,
|
| 266 |
+
df
|
| 267 |
+
)
|
| 268 |
+
progress_bar.progress(1.0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 269 |
|
| 270 |
|
| 271 |
logger.info(f"Completed: {model_name}")
|
|
|
|
| 344 |
else False, axis=1)
|
| 345 |
|
| 346 |
# Predict score
|
|
|
|
| 347 |
df['pred_score'] = df.apply(lambda x: round((x['fin_lab2']*2 + x['scope_lab1']*2 + x['scope_lab2']*2 + x['tech_lab1'] + x['tech_lab3'] + x['bar_lab2'] + x['lev_gt_0']+x['lev_maf_scale'])/11*10,0), axis=1)
|
| 348 |
+
|
| 349 |
+
# Initialize action_rationale column
|
| 350 |
+
df['action_rationale'] = [[] for _ in range(len(df))]
|
| 351 |
+
|
| 352 |
+
# Apply the function to determine action and rationale
|
| 353 |
+
df[['pred_action', 'action_rationale']] = df.apply(lambda x: determine_action(x, sens_level), axis=1, result_type='expand')
|
| 354 |
+
|
| 355 |
+
# Final assessment of duplicate concepts (top scored concept is maintained, others: 'REJECT'. If tie, take the first in the df index)
|
| 356 |
+
df = assess_duplicate_concepts(df)
|
|
|
|
|
|
|
|
|
|
| 357 |
|
| 358 |
# Reorder columns in final dataframe
|
| 359 |
+
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
|
| 360 |
+
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 361 |
+
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 362 |
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
|
| 363 |
+
'word_length_check', 'pred_score', 'pred_action', 'action_rationale']
|
| 364 |
|
| 365 |
# Only include columns that exist in the DataFrame
|
| 366 |
final_columns = [col for col in column_order if col in df.columns]
|
modules/semantic_similarity.py
CHANGED
|
@@ -1,70 +1,246 @@
|
|
| 1 |
# Semantic similarity-based duplicate detection
|
|
|
|
| 2 |
import pandas as pd
|
| 3 |
-
import logging
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from modules.utils import setup_logging
|
| 7 |
|
| 8 |
logger = setup_logging()
|
| 9 |
|
|
|
|
|
|
|
| 10 |
|
| 11 |
-
|
|
|
|
| 12 |
model: SentenceTransformer,
|
| 13 |
-
concept_id: str,
|
| 14 |
-
organization: str,
|
| 15 |
-
concept_profile: str,
|
| 16 |
df: pd.DataFrame,
|
| 17 |
-
similarity_threshold: float = 0.85
|
| 18 |
-
|
|
|
|
| 19 |
"""
|
| 20 |
-
Check for duplicate concepts within the same organization using semantic similarity
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 21 |
|
| 22 |
Args:
|
| 23 |
model: SentenceTransformer model for computing embeddings
|
| 24 |
-
concept_id: ID of the current concept being checked
|
| 25 |
-
organization: Organization name
|
| 26 |
-
concept_profile: Text description of the concept to check
|
| 27 |
df: DataFrame containing all application data
|
| 28 |
similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 29 |
Recommended values: 0.80 (lenient) to 0.95 (strict)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
Returns:
|
| 32 |
-
|
|
|
|
| 33 |
"""
|
|
|
|
| 34 |
|
| 35 |
-
#
|
| 36 |
-
|
| 37 |
|
| 38 |
-
# Get
|
| 39 |
-
|
| 40 |
-
other_concepts = org_concepts['scope_txt'].tolist()
|
| 41 |
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
[text if text else "" for text in other_concepts],
|
| 55 |
-
convert_to_numpy=True
|
| 56 |
-
)
|
| 57 |
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
other_embeddings
|
| 62 |
-
)[0]
|
| 63 |
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
-
|
|
|
|
|
|
|
|
|
|
| 67 |
|
| 68 |
-
|
| 69 |
-
return True
|
| 70 |
-
return False
|
|
|
|
| 1 |
# Semantic similarity-based duplicate detection
|
| 2 |
+
import numpy as np
|
| 3 |
import pandas as pd
|
|
|
|
| 4 |
from sentence_transformers import SentenceTransformer
|
| 5 |
from sklearn.metrics.pairwise import cosine_similarity
|
| 6 |
from modules.utils import setup_logging
|
| 7 |
|
| 8 |
logger = setup_logging()
|
| 9 |
|
| 10 |
+
# Text fields to check for duplication
DUPLICATE_CHECK_FIELDS = ['scope_txt', 'tech_txt', 'fin_txt', 'bar_txt']


def check_duplicate_concepts(
    model: SentenceTransformer,
    df: pd.DataFrame,
    similarity_threshold: float = 0.85,
    batch_size: int = 64
) -> pd.Series:
    """Flag concepts that duplicate another concept within the same organization.

    Embeddings for every text field are computed once in batches; a pair of
    concepts counts as a duplicate only when ALL fields in
    DUPLICATE_CHECK_FIELDS meet the similarity threshold simultaneously.

    Args:
        model: SentenceTransformer model for computing embeddings
        df: DataFrame containing all application data
        similarity_threshold: Threshold for considering concepts duplicates (0-1)
            Recommended values: 0.80 (lenient) to 0.95 (strict)
        batch_size: Batch size for embedding computation

    Returns:
        pd.Series of boolean values indexed by df.index, True if concept has a duplicate
    """
    # Default every concept to "no duplicate found".
    flags = pd.Series(False, index=df.index)

    # Embed every field for the whole frame up front — one batched pass per field.
    logger.info("Computing embeddings for all concepts...")
    embeddings_by_field = {
        field: model.encode(
            df[field].fillna("").astype(str).tolist(),
            convert_to_numpy=True,
            batch_size=batch_size,
            show_progress_bar=False,
        )
        for field in DUPLICATE_CHECK_FIELDS
    }
    logger.info("Embeddings computed for all fields")

    for _, group in df.groupby('org_renamed'):
        # A lone concept cannot have an in-org duplicate.
        if len(group) < 2:
            continue

        # Index labels of this org's rows and their positions in df,
        # so we can slice rows out of the precomputed embedding arrays.
        labels = group.index.tolist()
        positions = [df.index.get_loc(lbl) for lbl in labels]
        size = len(positions)

        # Per-field NxN cosine-similarity matrices restricted to this org.
        sims = {
            field: cosine_similarity(embeddings_by_field[field][positions])
            for field in DUPLICATE_CHECK_FIELDS
        }

        for i, row_label in enumerate(labels):
            concept_id = df.loc[row_label, 'id']

            for j in range(size):
                if j == i:
                    continue

                # The pair is a duplicate only if every field clears the threshold.
                pair_sims = {}
                matched = True
                for field in DUPLICATE_CHECK_FIELDS:
                    value = sims[field][i, j]
                    pair_sims[field] = value
                    if value < similarity_threshold:
                        matched = False
                        break

                if matched:
                    other_concept_id = df.loc[labels[j], 'id']
                    logger.info(
                        f"Duplicate found: concept {concept_id} matches concept {other_concept_id} "
                        f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in pair_sims.items())}}})"
                    )
                    flags.loc[row_label] = True
                    break  # One matching partner is enough.

        # Log the concepts in this org that came out clean.
        for row_label in labels:
            if not flags.loc[row_label]:
                concept_id = df.loc[row_label, 'id']
                logger.info(f"No duplicate found for concept {concept_id}")

    return flags
|
| 113 |
+
|
| 114 |
+
|
| 115 |
+
# def check_duplicate_concepts_semantic(
|
| 116 |
+
# model: SentenceTransformer,
|
| 117 |
+
# concept_id: str,
|
| 118 |
+
# df: pd.DataFrame,
|
| 119 |
+
# similarity_threshold: float = 0.85
|
| 120 |
+
# ) -> bool:
|
| 121 |
+
# """
|
| 122 |
+
# Check for duplicate concepts within the same organization using semantic similarity.
|
| 123 |
+
# Returns True if there exists at least one other concept where ALL text fields
|
| 124 |
+
# simultaneously meet the similarity threshold.
|
| 125 |
+
|
| 126 |
+
# DEPRECATED: Use check_duplicate_concepts() (the batched version above) for better performance.
|
| 127 |
+
|
| 128 |
+
# Args:
|
| 129 |
+
# model: SentenceTransformer model for computing embeddings
|
| 130 |
+
# concept_id: ID of the current concept being checked
|
| 131 |
+
# df: DataFrame containing all application data
|
| 132 |
+
# similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 133 |
+
# Recommended values: 0.80 (lenient) to 0.95 (strict)
|
| 134 |
+
|
| 135 |
+
# Returns:
|
| 136 |
+
# Boolean classification result - True if any single other concept matches
|
| 137 |
+
# on ALL fields simultaneously
|
| 138 |
+
# """
|
| 139 |
+
# # Get the current concept's row
|
| 140 |
+
# current_row = df[df['id'] == concept_id]
|
| 141 |
+
# if len(current_row) == 0:
|
| 142 |
+
# logger.warning(f"Concept ID {concept_id} not found in dataframe")
|
| 143 |
+
# return False
|
| 144 |
+
|
| 145 |
+
# current_row = current_row.iloc[0]
|
| 146 |
+
# organization = current_row['org_renamed']
|
| 147 |
+
|
| 148 |
+
# # Get other concepts from the same organization (excluding current)
|
| 149 |
+
# org_concepts = df[(df['org_renamed'] == organization) & (df['id'] != concept_id)]
|
| 150 |
+
|
| 151 |
+
# # If no other concepts from this organization, return False
|
| 152 |
+
# if len(org_concepts) == 0:
|
| 153 |
+
# return False
|
| 154 |
+
|
| 155 |
+
# # Pre-compute embeddings for current concept's fields
|
| 156 |
+
# current_embeddings = {}
|
| 157 |
+
# for field in DUPLICATE_CHECK_FIELDS:
|
| 158 |
+
# current_text = current_row.get(field, "") or ""
|
| 159 |
+
# current_embeddings[field] = model.encode(current_text, convert_to_numpy=True)
|
| 160 |
+
|
| 161 |
+
# # Check each other concept - ALL fields must match for a single concept
|
| 162 |
+
# for _, other_row in org_concepts.iterrows():
|
| 163 |
+
# all_fields_match_this_concept = True
|
| 164 |
+
# field_sims = {}
|
| 165 |
+
|
| 166 |
+
# for field in DUPLICATE_CHECK_FIELDS:
|
| 167 |
+
# other_text = other_row.get(field, "") or ""
|
| 168 |
+
# other_embedding = model.encode(other_text, convert_to_numpy=True)
|
| 169 |
+
|
| 170 |
+
# similarity = cosine_similarity(
|
| 171 |
+
# current_embeddings[field].reshape(1, -1),
|
| 172 |
+
# other_embedding.reshape(1, -1)
|
| 173 |
+
# )[0][0]
|
| 174 |
+
|
| 175 |
+
# field_sims[field] = similarity
|
| 176 |
+
|
| 177 |
+
# if similarity < similarity_threshold:
|
| 178 |
+
# all_fields_match_this_concept = False
|
| 179 |
+
# break # No need to check remaining fields for this concept
|
| 180 |
+
|
| 181 |
+
# if all_fields_match_this_concept:
|
| 182 |
+
# logger.info(
|
| 183 |
+
# f"Duplicate found: concept {concept_id} matches concept {other_row['id']} "
|
| 184 |
+
# f"(sims: {{{', '.join(f'{k}={v:.3f}' for k, v in field_sims.items())}}})"
|
| 185 |
+
# )
|
| 186 |
+
# return True
|
| 187 |
+
|
| 188 |
+
# logger.info(f"No duplicate found for concept {concept_id}")
|
| 189 |
+
# return False
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def assess_duplicate_concepts(
    df: pd.DataFrame
) -> pd.DataFrame:
    """
    Check flagged duplicate concepts within the same organization.
    Get the top ranked concept out of all the duplicates (pred_score) and reject all others.

    Args:
        df: DataFrame containing application data with 'duplicate_check', 'org_renamed',
            'pred_score', 'id', and 'pred_action' columns

    Returns:
        DataFrame with pred_action set to 'REJECT' for duplicate concepts that are not
        the top-scoring concept within their organization
    """
    df_assess = df.copy()

    # Filter to only duplicate-flagged concepts
    duplicates_mask = df_assess['duplicate_check'] == True

    # Get unique organizations that have duplicates
    orgs_with_duplicates = df_assess.loc[duplicates_mask, 'org_renamed'].unique()

    for org in orgs_with_duplicates:
        # Get all duplicate concepts for this organization
        org_duplicates_mask = (df_assess['org_renamed'] == org) & duplicates_mask

        # Sort by pred_score descending. kind='stable' is required for the
        # documented tie-break (lowest original position wins): the default
        # quicksort does NOT preserve input order among equal scores.
        org_duplicates = df_assess.loc[org_duplicates_mask].sort_values(
            by='pred_score',
            ascending=False,
            kind='stable'
        )

        # Top concept is the first one after sorting
        top_concept_id = org_duplicates.iloc[0]['id']

        # Set pred_action to 'REJECT' for all duplicates except the top concept
        reject_mask = org_duplicates_mask & (df_assess['id'] != top_concept_id)
        df_assess.loc[reject_mask, 'pred_action'] = 'REJECT'

        # Append rationale for rejected duplicates (if action_rationale column exists).
        # NOTE: DataFrame.copy() does not deep-copy Python list objects stored in
        # object-dtype cells, so we must assign a NEW list instead of appending
        # in place — otherwise the caller's original df would be mutated too.
        if 'action_rationale' in df_assess.columns:
            reason = f'Lower-scoring duplicate (kept concept {top_concept_id})'
            for idx in df_assess.loc[reject_mask].index:
                rationale = df_assess.at[idx, 'action_rationale']
                if isinstance(rationale, list):
                    df_assess.at[idx, 'action_rationale'] = rationale + [reason]
                else:
                    df_assess.at[idx, 'action_rationale'] = [reason]

        logger.info(
            f"Duplicate assessment for org '{org}': kept concept {top_concept_id}, "
            f"rejected {reject_mask.sum()} duplicate(s)"
        )

    return df_assess
|
|
|
|
|
|