Spaces:

mtyrrell
/

prefilter_app

Sleeping

App Files Files Community

mtyrrell committed on Jan 16

Commit

4fbb7a3

1 Parent(s): c8c24db

innovation classification

Browse files

Files changed (6) hide show

modules/llm.py +21 -37
modules/models.py +12 -4
modules/pipeline.py +22 -10
modules/prompts.py +27 -16
modules/semantic_similarity.py +12 -3
modules/utils.py +4 -1

modules/llm.py CHANGED Viewed

@@ -6,13 +6,14 @@ import torch
 import logging
 from transformers import pipeline
 from modules.utils import setup_logging
-from modules.prompts import prompt_concept
-from modules.models import ConceptClassify
 from openai import OpenAI
 logger = setup_logging()
-def call_structured(client: OpenAI, deployment: str, system_prompt: str, user_prompt: str,
                    response_model: None,
                    logger: logging.Logger) -> Dict[str, Any]:
     """Call Azure OpenAI with structured output"""
@@ -48,43 +49,26 @@ def call_structured(client: OpenAI, deployment: str, system_prompt: str, user_pr
         return None
-# Not used - results sucked
-# def check_duplicate_concepts(client, deployment, concept_id: str, organization: str, concept_profile: str, df) -> bool:
-#     """
-#     Check for duplicate concepts within the same organization using Azure OpenAI
-#     Args:
-#         client: AzureOpenAI client instance
-#         deployment: Azure OpenAI deployment name
-#         concept_id: ID of the current concept being checked
-#         organization: Organization name
-#         concept_profile: Text description of the concept to check
-#         df: DataFrame containing all application data
-#     Returns:
-#         Boolean classification result
-#     """
-#     # Remove current concept from the dataframe
-#     df_check = df[df['id'] != concept_id].copy()
-#     # Get other concepts from the same organization
-#     org_concepts = df_check[df_check['org_renamed'] == organization]
-#     other_concepts = org_concepts['scope_txt'].tolist()
-#     # If no other concepts from this organization, return False
-#     if len(other_concepts) == 0:
-#         return False
-#     logger.info(f"Checking duplicates for concept ID {concept_id} from organization {organization} against {len(other_concepts)} other concept(s).")
-#     logger.info(f"Scope text {concept_profile}")
-#     # Construct prompt
-#     prompt = prompt_concept(concept_profile, other_concepts)
-#     response = call_structured(client, deployment, prompt, concept_profile, ConceptClassify, logger)
-#     check = response['classification']
-#     logger.info(f"Duplicate check response for concept ID {concept_id}: {check}")
-#     if check == "YES":
-#         return True
-#     return False

 import logging
 from transformers import pipeline
 from modules.utils import setup_logging
+from modules.prompts import prompt_innovation
+from modules.models import InnovationClassify
 from openai import OpenAI
+import pandas
 logger = setup_logging()
+def call_structured(client: OpenAI, deployment: str, user_prompt: str,
                    response_model: None,
                    logger: logging.Logger) -> Dict[str, Any]:
     """Call Azure OpenAI with structured output"""
         return None
def _text_or_empty(value) -> str:
    """Return *value* as text, mapping None and float NaN to an empty string."""
    if value is None:
        return ""
    # NaN is the only float that is unequal to itself; an empty spreadsheet
    # cell would otherwise be rendered as the literal text "nan" in the prompt.
    if isinstance(value, float) and value != value:
        return ""
    return str(value)


def classify_innovation(client, deployment, concept_id: str, tech: str, rationale: str) -> Dict:
    """
    Classify level of innovation for given text fields
    Args:
        client: client instance, passed through unchanged to call_structured
        deployment: deployment name, passed through unchanged to call_structured
        concept_id: Identifier of the concept; used only in the log message
        tech: Technology description of the concept
        rationale: Rationale text accompanying the technology description
    Returns:
        The result of call_structured for the InnovationClassify response
        model. call_structured has a code path that returns None, so callers
        should be prepared for a None result.
    """
    # concatenate context; missing values (None / NaN) become empty strings so
    # the model sees an empty field rather than the words "None" or "nan"
    concept = (
        f"Technology: {_text_or_empty(tech)}\n"
        f"Rationale: {_text_or_empty(rationale)}"
    )
    logger.info("Analyzing Innovation for: %s", concept_id)
    # Construct prompt
    prompt = prompt_innovation(concept)
    return call_structured(client, deployment, prompt, InnovationClassify, logger)

modules/models.py CHANGED Viewed

@@ -1,8 +1,16 @@
-from typing import Dict, Any, List, Optional, Literal
-from pydantic import BaseModel, Field
 #===================== Duplicate concepts =====================
-class ConceptClassify(BaseModel):
-    classification: Literal["YES","NO","UNCERTAIN"] = Field(description="Is the concept duplicated in other applications? (yes/no/uncertain)")

+from pydantic import BaseModel, Field, conint
+from typing import Literal, List
 #===================== Innovation classification =====================
class InnovationClassify(BaseModel):
    # Structured-output schema for the innovation classifier. Field order and
    # description strings are part of the schema, so they are kept verbatim.
    # NOTE(review): no class docstring on purpose - pydantic is understood to
    # copy a model docstring into the generated schema; confirm before adding one.

    # One of the four rubric labels.
    classification: Literal[
        "INSUFFICIENT INFO",
        "NOT INNOVATIVE",
        "MODERATELY INNOVATIVE",
        "VERY INNOVATIVE",
    ] = Field(..., description="Overall innovation level using the rubric.")

    # Short free-text justification for the chosen label.
    rationale: str = Field(..., description="1–2 sentences that justify the classification.")

    # Self-reported confidence; the 0-100 range is stated only in the
    # description and is not enforced by validation.
    confidence: int = Field(
        ...,
        description="0–100 confidence in the classification given the detail level in the concept.",
    )

modules/pipeline.py CHANGED Viewed

@@ -11,7 +11,7 @@ from openpyxl.styles import Font, NamedStyle, PatternFill
 from openpyxl.styles.differential import DifferentialStyle
 from modules.org_count import standardize_organization_names
 from modules.utils import clean_text, extract_predicted_labels
-# from modules.llm import check_duplicate_concepts
 from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
 from sentence_transformers import SentenceTransformer
 import logging
@@ -176,6 +176,9 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
         'technology': 'tech_txt',
         'financial': 'fin_txt',
         'barrier': 'bar_txt',
         'maf_funding_requested': 'maf_funding',
         'contributions_public_sector': 'cont_public',
         'contributions_private_sector': 'cont_private',
@@ -214,9 +217,7 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
     # Define models and predictions
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
-    model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK']
-    # model_names_sf = []
-    # model_names = ['ADAPMIT_SCOPE','ADAPMIT_TECH']
     total_predictions = len(model_names) * len(df)
     progress_count = 0
@@ -260,12 +261,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
             semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
             # Process duplicate check using batched approach for efficiency
-            progress_bar.progress(0.1)  # Show initial progress
             df['duplicate_check'] = check_duplicate_concepts(
                 semantic_model,
-                df
             )
-            progress_bar.progress(1.0)
         logger.info(f"Completed: {model_name}")
@@ -356,11 +368,11 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
     df = assess_duplicate_concepts(df)
     # Reorder columns in final dataframe
-    column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
                     'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
                     'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
-                    'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
-                    'word_length_check', 'pred_score', 'pred_action', 'action_rationale']
     # Only include columns that exist in the DataFrame
     final_columns = [col for col in column_order if col in df.columns]

 from openpyxl.styles.differential import DifferentialStyle
 from modules.org_count import standardize_organization_names
 from modules.utils import clean_text, extract_predicted_labels
+from modules.llm import classify_innovation
 from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
 from sentence_transformers import SentenceTransformer
 import logging
         'technology': 'tech_txt',
         'financial': 'fin_txt',
         'barrier': 'bar_txt',
+        'technology_rationale': 'tech_rationale_txt',
+        'project_rationale': 'project_rationale_txt',
+        'project_objectives': 'project_objectives_txt',
         'maf_funding_requested': 'maf_funding',
         'contributions_public_sector': 'cont_public',
         'contributions_private_sector': 'cont_private',
     # Define models and predictions
     model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
+    model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK','INNOVATION_CLASSIFICATION']
     total_predictions = len(model_names) * len(df)
     progress_count = 0
             semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
             # Process duplicate check using batched approach for efficiency
             df['duplicate_check'] = check_duplicate_concepts(
                 semantic_model,
+                df,
+                progress_callback=lambda p: progress_bar.progress(p)
+            )
+        elif model_name == 'INNOVATION_CLASSIFICATION':
+            df['innovation_classification'] = df.apply(
+                lambda x: classify_innovation(
+                    azure_client,
+                    azure_deployment,
+                    x['id'],
+                    x['tech_txt'],
+                    x['tech_rationale_txt']
+                ),
+                axis=1
             )
         logger.info(f"Completed: {model_name}")
     df = assess_duplicate_concepts(df)
     # Reorder columns in final dataframe
+    column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'word_length_check', 'scope_txt', 'tech_txt', 'fin_txt', 'bar_txt','maf_funding', 'cont_public',
                     'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
                     'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
+                    'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm','innovation_classification',
+                    'pred_score', 'pred_action', 'action_rationale']
     # Only include columns that exist in the DataFrame
     final_columns = [col for col in column_order if col in df.columns]

modules/prompts.py CHANGED Viewed

@@ -1,21 +1,32 @@
 # Prompts library
 from typing import List
-def prompt_concept(concept: str, other_concepts: List[str]) -> str:
-    """Generate prompt for classifying concepts by similarity"""
-    prompt = f"""
-    Each organization is allowed to submit up to 6 concepts per year via a web portal. However, in some cases organizations submit the same concept multiple times.
-    This can happen for various reasons. For example, an organization may erroneously submit the same application twice because they lost access to the previous web session somehow.
-    In all such cases, it is not usually the case that the duplicate concepts are verbatim identical. It is more usually the case that there is simply high semantic alignment - i.e. it is the same concept, but there are minor superficial differences between each application.
-    Your task is to review the concept profiles submitted by a particular organization and identify the amount of similarity so that we can in turn identify cases of duplicate concepts.
-    Here is the concept profile for review:
     {concept}
-    Please review this against the following concepts and assess for duplication:
-    {other_concepts}
-    Please conduct your review carefully - however, ensure that you tag all duplicates correctly. Please return your response according to the following structure:
-    """
-    return prompt

 # Prompts library
 from typing import List
+from typing import List
+from textwrap import dedent
def prompt_innovation(concept: str) -> str:
    """Build the prompt that asks the model to rate a concept's innovation level.

    The template is dedented *before* the concept is substituted. Dedenting
    after interpolation (as an f-string would) silently does nothing when the
    concept spans several lines, because its continuation lines start at
    column 0 and remove the common margin that dedent relies on.

    Args:
        concept: Free-text description of the concept; may contain newlines
            and literal braces, which are inserted verbatim.
    Returns:
        The prompt text with no leading indentation on the template lines and
        surrounding whitespace stripped.
    """
    template = dedent("""
    You are reviewing applications for grant funding.
    Task: classify the submitted concept by innovation level using the definition below.
    Definition of innovation:
    Innovative mitigation technologies that have not yet been tested or implemented in the local context.
    Innovation may be novel globally or within the specific country context.
    Examples from our portfolio include tidal stream energy generation, green hydrogen-based production,
    locally manufactured lithium-ion batteries for custom-built e-mobility solutions, or reactive power
    compensation systems to enhance grid stability.
    Classification rubric:
    - NOT INNOVATIVE: already common/deployed in the local context, or standard practice with no clear novelty.
    - MODERATELY INNOVATIVE: established elsewhere but new to the local context, or a meaningful adaptation.
    - VERY INNOVATIVE: novel approach globally or locally with clear differentiation from standard solutions.
    - INSUFFICIENT INFO: not enough detail to classify innovation level.
    If the concept is too vague to judge novelty, choose the lowest classification that is defensible and reflect uncertainty in confidence.
    Concept for review:
    {concept}
    """)
    # Only the template is parsed by str.format; braces inside `concept`
    # are not interpreted.
    return template.format(concept=concept).strip()

modules/semantic_similarity.py CHANGED Viewed

@@ -15,7 +15,8 @@ def check_duplicate_concepts(
     model: SentenceTransformer,
     df: pd.DataFrame,
     similarity_threshold: float = 0.85,
-    batch_size: int = 64
 ) -> pd.Series:
     """
     Check for duplicate concepts within the same organization using semantic similarity.
@@ -31,6 +32,7 @@ def check_duplicate_concepts(
         similarity_threshold: Threshold for considering concepts duplicates (0-1)
                             Recommended values: 0.80 (lenient) to 0.95 (strict)
         batch_size: Batch size for embedding computation
     Returns:
         pd.Series of boolean values indexed by df.index, True if concept has a duplicate
@@ -39,9 +41,11 @@ def check_duplicate_concepts(
     results = pd.Series(False, index=df.index)
     # Pre-compute all embeddings for each field in batches
     logger.info("Computing embeddings for all concepts...")
     field_embeddings = {}
-    for field in DUPLICATE_CHECK_FIELDS:
         texts = df[field].fillna("").astype(str).tolist()
         embeddings = model.encode(
             texts,
@@ -50,12 +54,16 @@ def check_duplicate_concepts(
             show_progress_bar=False
         )
         field_embeddings[field] = embeddings
     logger.info("Embeddings computed for all fields")
     # Group by organization and process each group
     org_groups = df.groupby('org_renamed')
-    for org_name, org_df in org_groups:
         # Skip organizations with only one concept (no duplicates possible)
         if len(org_df) < 2:
             continue
@@ -109,6 +117,7 @@ def check_duplicate_concepts(
                 concept_id = df.loc[concept_idx, 'id']
                 logger.info(f"No duplicate found for concept {concept_id}")
     return results

     model: SentenceTransformer,
     df: pd.DataFrame,
     similarity_threshold: float = 0.85,
+    batch_size: int = 64,
+    progress_callback=None
 ) -> pd.Series:
     """
     Check for duplicate concepts within the same organization using semantic similarity.
         similarity_threshold: Threshold for considering concepts duplicates (0-1)
                             Recommended values: 0.80 (lenient) to 0.95 (strict)
         batch_size: Batch size for embedding computation
+        progress_callback: Optional callback function that takes a float (0-1) to report progress
     Returns:
         pd.Series of boolean values indexed by df.index, True if concept has a duplicate
     results = pd.Series(False, index=df.index)
     # Pre-compute all embeddings for each field in batches
+    # This is the bulk of processing time (~90%)
     logger.info("Computing embeddings for all concepts...")
     field_embeddings = {}
+    total_fields = len(DUPLICATE_CHECK_FIELDS)
+    for field_idx, field in enumerate(DUPLICATE_CHECK_FIELDS):
         texts = df[field].fillna("").astype(str).tolist()
         embeddings = model.encode(
             texts,
             show_progress_bar=False
         )
         field_embeddings[field] = embeddings
+        # Report progress during embedding computation
+        if progress_callback:
+            progress_callback((field_idx + 1) / total_fields)
     logger.info("Embeddings computed for all fields")
     # Group by organization and process each group
     org_groups = df.groupby('org_renamed')
+    total_orgs = len(org_groups)
+    for org_idx, (org_name, org_df) in enumerate(org_groups):
         # Skip organizations with only one concept (no duplicates possible)
         if len(org_df) < 2:
             continue
                 concept_id = df.loc[concept_idx, 'id']
                 logger.info(f"No duplicate found for concept {concept_id}")
     return results

modules/utils.py CHANGED Viewed

@@ -52,6 +52,9 @@ def create_excel():
                'technology',
                'financial',
                'barrier',
                'maf_funding_requested',
                'contributions_public_sector',
                'contributions_private_sector',
@@ -60,7 +63,7 @@ def create_excel():
     sheet.append(columns)  # Appending columns to the first row
     # formatting
-    for c in sheet['A1:K4'][0]:
         c.fill = PatternFill('solid', fgColor = 'bad8e1')
         c.font = Font(bold=True)

                'technology',
                'financial',
                'barrier',
+               'technology_rationale',
+               'project_rationale',
+               'project_objectives',
                'maf_funding_requested',
                'contributions_public_sector',
                'contributions_private_sector',
     sheet.append(columns)  # Appending columns to the first row
     # formatting
+    for c in sheet['A1:N4'][0]:
         c.fill = PatternFill('solid', fgColor = 'bad8e1')
         c.font = Font(bold=True)