Spaces:
Sleeping
Sleeping
innovation classification
Browse files
- modules/llm.py +21 -37
- modules/models.py +12 -4
- modules/pipeline.py +22 -10
- modules/prompts.py +27 -16
- modules/semantic_similarity.py +12 -3
- modules/utils.py +4 -1
modules/llm.py
CHANGED
|
@@ -6,13 +6,14 @@ import torch
|
|
| 6 |
import logging
|
| 7 |
from transformers import pipeline
|
| 8 |
from modules.utils import setup_logging
|
| 9 |
-
from modules.prompts import
|
| 10 |
-
from modules.models import
|
| 11 |
from openai import OpenAI
|
|
|
|
| 12 |
|
| 13 |
logger = setup_logging()
|
| 14 |
|
| 15 |
-
def call_structured(client: OpenAI, deployment: str,
|
| 16 |
response_model: None,
|
| 17 |
logger: logging.Logger) -> Dict[str, Any]:
|
| 18 |
"""Call Azure OpenAI with structured output"""
|
|
@@ -48,43 +49,26 @@ def call_structured(client: OpenAI, deployment: str, system_prompt: str, user_pr
|
|
| 48 |
return None
|
| 49 |
|
| 50 |
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
# Check for duplicate concepts within the same organization using Azure OpenAI
|
| 55 |
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
# organization: Organization name
|
| 61 |
-
# concept_profile: Text description of the concept to check
|
| 62 |
-
# df: DataFrame containing all application data
|
| 63 |
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
-
#
|
| 69 |
-
|
| 70 |
|
| 71 |
-
|
| 72 |
-
#
|
| 73 |
-
|
| 74 |
|
| 75 |
-
|
| 76 |
-
# if len(other_concepts) == 0:
|
| 77 |
-
# return False
|
| 78 |
|
| 79 |
-
|
| 80 |
-
# logger.info(f"Scope text {concept_profile}")
|
| 81 |
-
# # Construct prompt
|
| 82 |
-
# prompt = prompt_concept(concept_profile, other_concepts)
|
| 83 |
-
|
| 84 |
-
# response = call_structured(client, deployment, prompt, concept_profile, ConceptClassify, logger)
|
| 85 |
-
|
| 86 |
-
# check = response['classification']
|
| 87 |
-
# logger.info(f"Duplicate check response for concept ID {concept_id}: {check}")
|
| 88 |
-
# if check == "YES":
|
| 89 |
-
# return True
|
| 90 |
-
# return False
|
|
|
|
| 6 |
import logging
|
| 7 |
from transformers import pipeline
|
| 8 |
from modules.utils import setup_logging
|
| 9 |
+
from modules.prompts import prompt_innovation
|
| 10 |
+
from modules.models import InnovationClassify
|
| 11 |
from openai import OpenAI
|
| 12 |
+
import pandas
|
| 13 |
|
| 14 |
logger = setup_logging()
|
| 15 |
|
| 16 |
+
def call_structured(client: OpenAI, deployment: str, user_prompt: str,
|
| 17 |
response_model: None,
|
| 18 |
logger: logging.Logger) -> Dict[str, Any]:
|
| 19 |
"""Call Azure OpenAI with structured output"""
|
|
|
|
| 49 |
return None
|
| 50 |
|
| 51 |
|
| 52 |
+
def classify_innovation(client, deployment, concept_id: str, tech: str, rationale: str) -> Dict:
|
| 53 |
+
"""
|
| 54 |
+
Classify level of innovation for given text fields
|
|
|
|
| 55 |
|
| 56 |
+
Args:
|
| 57 |
+
client: AzureOpenAI client instance
|
| 58 |
+
deployment: Azure OpenAI deployment name
|
| 59 |
+
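concept_id / tech / rationale: Concept identifier (used in the log message), technology text, and rationale text that are combined into the concept to classify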
|
|
|
|
|
|
|
|
|
|
| 60 |
|
| 61 |
+
Returns:
|
| 62 |
+
Dictionary with classification result
|
| 63 |
+
"""
|
| 64 |
|
| 65 |
+
# concatenate context
|
| 66 |
+
concept = f"Technology: {tech}\nRationale: {rationale}"
|
| 67 |
|
| 68 |
+
logger.info(f"Analyzing Innovation for: {concept_id}")
|
| 69 |
+
# Construct prompt
|
| 70 |
+
prompt = prompt_innovation(concept)
|
| 71 |
|
| 72 |
+
response = call_structured(client, deployment, prompt, InnovationClassify, logger)
|
|
|
|
|
|
|
| 73 |
|
| 74 |
+
return response
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/models.py
CHANGED
|
@@ -1,8 +1,16 @@
|
|
| 1 |
-
from
|
| 2 |
-
from
|
| 3 |
|
| 4 |
#===================== Duplicate concepts =====================
|
| 5 |
|
| 6 |
-
class
|
| 7 |
-
classification: Literal["
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
|
|
|
|
| 1 |
+
from pydantic import BaseModel, Field, conint
|
| 2 |
+
from typing import Literal, List
|
| 3 |
|
| 4 |
#===================== Duplicate concepts =====================
|
| 5 |
|
| 6 |
+
class InnovationClassify(BaseModel):
|
| 7 |
+
classification: Literal["INSUFFICIENT INFO","NOT INNOVATIVE", "MODERATELY INNOVATIVE", "VERY INNOVATIVE"] = Field(
|
| 8 |
+
description="Overall innovation level using the rubric."
|
| 9 |
+
)
|
| 10 |
+
rationale: str = Field(
|
| 11 |
+
description="1–2 sentences that justify the classification."
|
| 12 |
+
)
|
| 13 |
+
confidence: int = Field(
|
| 14 |
+
description="0–100 confidence in the classification given the detail level in the concept."
|
| 15 |
+
)
|
| 16 |
|
modules/pipeline.py
CHANGED
|
@@ -11,7 +11,7 @@ from openpyxl.styles import Font, NamedStyle, PatternFill
|
|
| 11 |
from openpyxl.styles.differential import DifferentialStyle
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
-
|
| 15 |
from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
|
@@ -176,6 +176,9 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 176 |
'technology': 'tech_txt',
|
| 177 |
'financial': 'fin_txt',
|
| 178 |
'barrier': 'bar_txt',
|
|
|
|
|
|
|
|
|
|
| 179 |
'maf_funding_requested': 'maf_funding',
|
| 180 |
'contributions_public_sector': 'cont_public',
|
| 181 |
'contributions_private_sector': 'cont_private',
|
|
@@ -214,9 +217,7 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 214 |
|
| 215 |
# Define models and predictions
|
| 216 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
|
| 217 |
-
model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK']
|
| 218 |
-
# model_names_sf = []
|
| 219 |
-
# model_names = ['ADAPMIT_SCOPE','ADAPMIT_TECH']
|
| 220 |
total_predictions = len(model_names) * len(df)
|
| 221 |
progress_count = 0
|
| 222 |
|
|
@@ -260,12 +261,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 260 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 261 |
|
| 262 |
# Process duplicate check using batched approach for efficiency
|
| 263 |
-
progress_bar.progress(0.1) # Show initial progress
|
| 264 |
df['duplicate_check'] = check_duplicate_concepts(
|
| 265 |
semantic_model,
|
| 266 |
-
df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
)
|
| 268 |
-
progress_bar.progress(1.0)
|
| 269 |
|
| 270 |
|
| 271 |
logger.info(f"Completed: {model_name}")
|
|
@@ -356,11 +368,11 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
|
|
| 356 |
df = assess_duplicate_concepts(df)
|
| 357 |
|
| 358 |
# Reorder columns in final dataframe
|
| 359 |
-
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
|
| 360 |
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 361 |
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 362 |
-
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
|
| 363 |
-
'
|
| 364 |
|
| 365 |
# Only include columns that exist in the DataFrame
|
| 366 |
final_columns = [col for col in column_order if col in df.columns]
|
|
|
|
| 11 |
from openpyxl.styles.differential import DifferentialStyle
|
| 12 |
from modules.org_count import standardize_organization_names
|
| 13 |
from modules.utils import clean_text, extract_predicted_labels
|
| 14 |
+
from modules.llm import classify_innovation
|
| 15 |
from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
|
| 16 |
from sentence_transformers import SentenceTransformer
|
| 17 |
import logging
|
|
|
|
| 176 |
'technology': 'tech_txt',
|
| 177 |
'financial': 'fin_txt',
|
| 178 |
'barrier': 'bar_txt',
|
| 179 |
+
'technology_rationale': 'tech_rationale_txt',
|
| 180 |
+
'project_rationale': 'project_rationale_txt',
|
| 181 |
+
'project_objectives': 'project_objectives_txt',
|
| 182 |
'maf_funding_requested': 'maf_funding',
|
| 183 |
'contributions_public_sector': 'cont_public',
|
| 184 |
'contributions_private_sector': 'cont_private',
|
|
|
|
| 217 |
|
| 218 |
# Define models and predictions
|
| 219 |
model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
|
| 220 |
+
model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK','INNOVATION_CLASSIFICATION']
|
|
|
|
|
|
|
| 221 |
total_predictions = len(model_names) * len(df)
|
| 222 |
progress_count = 0
|
| 223 |
|
|
|
|
| 261 |
semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
|
| 262 |
|
| 263 |
# Process duplicate check using batched approach for efficiency
|
|
|
|
| 264 |
df['duplicate_check'] = check_duplicate_concepts(
|
| 265 |
semantic_model,
|
| 266 |
+
df,
|
| 267 |
+
progress_callback=lambda p: progress_bar.progress(p)
|
| 268 |
+
)
|
| 269 |
+
|
| 270 |
+
elif model_name == 'INNOVATION_CLASSIFICATION':
|
| 271 |
+
df['innovation_classification'] = df.apply(
|
| 272 |
+
lambda x: classify_innovation(
|
| 273 |
+
azure_client,
|
| 274 |
+
azure_deployment,
|
| 275 |
+
x['id'],
|
| 276 |
+
x['tech_txt'],
|
| 277 |
+
x['tech_rationale_txt']
|
| 278 |
+
),
|
| 279 |
+
axis=1
|
| 280 |
)
|
|
|
|
| 281 |
|
| 282 |
|
| 283 |
logger.info(f"Completed: {model_name}")
|
|
|
|
| 368 |
df = assess_duplicate_concepts(df)
|
| 369 |
|
| 370 |
# Reorder columns in final dataframe
|
| 371 |
+
column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'word_length_check', 'scope_txt', 'tech_txt', 'fin_txt', 'bar_txt','maf_funding', 'cont_public',
|
| 372 |
'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
|
| 373 |
'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
|
| 374 |
+
'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm','innovation_classification',
|
| 375 |
+
'pred_score', 'pred_action', 'action_rationale']
|
| 376 |
|
| 377 |
# Only include columns that exist in the DataFrame
|
| 378 |
final_columns = [col for col in column_order if col in df.columns]
|
modules/prompts.py
CHANGED
|
@@ -1,21 +1,32 @@
|
|
| 1 |
# Prompts library
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
-
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
{concept}
|
|
|
|
|
|
|
| 15 |
|
| 16 |
-
Please review this against the following concepts and assess for duplication:
|
| 17 |
-
{other_concepts}
|
| 18 |
-
|
| 19 |
-
Please conduct your review carefully - however, ensure that you tag all duplicates correctly. Please return your response according to the following structure:
|
| 20 |
-
"""
|
| 21 |
-
return prompt
|
|
|
|
| 1 |
# Prompts library
|
| 2 |
from typing import List
|
| 3 |
|
| 4 |
+
from typing import List
|
| 5 |
+
from textwrap import dedent
|
| 6 |
+
|
| 7 |
+
def prompt_innovation(concept: str) -> str:
|
| 8 |
+
return dedent(f"""
|
| 9 |
+
You are reviewing applications for grant funding.
|
| 10 |
+
|
| 11 |
+
Task: classify the submitted concept by innovation level using the definition below.
|
| 12 |
+
|
| 13 |
+
Definition of innovation:
|
| 14 |
+
Innovative mitigation technologies that have not yet been tested or implemented in the local context.
|
| 15 |
+
Innovation may be novel globally or within the specific country context.
|
| 16 |
+
Examples from our portfolio include tidal stream energy generation, green hydrogen-based production,
|
| 17 |
+
locally manufactured lithium-ion batteries for custom-built e-mobility solutions, or reactive power
|
| 18 |
+
compensation systems to enhance grid stability.
|
| 19 |
+
|
| 20 |
+
Classification rubric:
|
| 21 |
+
- NOT INNOVATIVE: already common/deployed in the local context, or standard practice with no clear novelty.
|
| 22 |
+
- MODERATELY INNOVATIVE: established elsewhere but new to the local context, or a meaningful adaptation.
|
| 23 |
+
- VERY INNOVATIVE: novel approach globally or locally with clear differentiation from standard solutions.
|
| 24 |
+
- INSUFFICIENT INFO: not enough detail to classify innovation level.
|
| 25 |
+
|
| 26 |
+
If the concept is too vague to judge novelty, choose the lowest classification that is defensible and reflect uncertainty in confidence.
|
| 27 |
+
|
| 28 |
+
Concept for review:
|
| 29 |
{concept}
|
| 30 |
+
""").strip()
|
| 31 |
+
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
modules/semantic_similarity.py
CHANGED
|
@@ -15,7 +15,8 @@ def check_duplicate_concepts(
|
|
| 15 |
model: SentenceTransformer,
|
| 16 |
df: pd.DataFrame,
|
| 17 |
similarity_threshold: float = 0.85,
|
| 18 |
-
batch_size: int = 64
|
|
|
|
| 19 |
) -> pd.Series:
|
| 20 |
"""
|
| 21 |
Check for duplicate concepts within the same organization using semantic similarity.
|
|
@@ -31,6 +32,7 @@ def check_duplicate_concepts(
|
|
| 31 |
similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 32 |
Recommended values: 0.80 (lenient) to 0.95 (strict)
|
| 33 |
batch_size: Batch size for embedding computation
|
|
|
|
| 34 |
|
| 35 |
Returns:
|
| 36 |
pd.Series of boolean values indexed by df.index, True if concept has a duplicate
|
|
@@ -39,9 +41,11 @@ def check_duplicate_concepts(
|
|
| 39 |
results = pd.Series(False, index=df.index)
|
| 40 |
|
| 41 |
# Pre-compute all embeddings for each field in batches
|
|
|
|
| 42 |
logger.info("Computing embeddings for all concepts...")
|
| 43 |
field_embeddings = {}
|
| 44 |
-
|
|
|
|
| 45 |
texts = df[field].fillna("").astype(str).tolist()
|
| 46 |
embeddings = model.encode(
|
| 47 |
texts,
|
|
@@ -50,12 +54,16 @@ def check_duplicate_concepts(
|
|
| 50 |
show_progress_bar=False
|
| 51 |
)
|
| 52 |
field_embeddings[field] = embeddings
|
|
|
|
|
|
|
|
|
|
| 53 |
logger.info("Embeddings computed for all fields")
|
| 54 |
|
| 55 |
# Group by organization and process each group
|
| 56 |
org_groups = df.groupby('org_renamed')
|
|
|
|
| 57 |
|
| 58 |
-
for org_name, org_df in org_groups:
|
| 59 |
# Skip organizations with only one concept (no duplicates possible)
|
| 60 |
if len(org_df) < 2:
|
| 61 |
continue
|
|
@@ -109,6 +117,7 @@ def check_duplicate_concepts(
|
|
| 109 |
concept_id = df.loc[concept_idx, 'id']
|
| 110 |
logger.info(f"No duplicate found for concept {concept_id}")
|
| 111 |
|
|
|
|
| 112 |
return results
|
| 113 |
|
| 114 |
|
|
|
|
| 15 |
model: SentenceTransformer,
|
| 16 |
df: pd.DataFrame,
|
| 17 |
similarity_threshold: float = 0.85,
|
| 18 |
+
batch_size: int = 64,
|
| 19 |
+
progress_callback=None
|
| 20 |
) -> pd.Series:
|
| 21 |
"""
|
| 22 |
Check for duplicate concepts within the same organization using semantic similarity.
|
|
|
|
| 32 |
similarity_threshold: Threshold for considering concepts duplicates (0-1)
|
| 33 |
Recommended values: 0.80 (lenient) to 0.95 (strict)
|
| 34 |
batch_size: Batch size for embedding computation
|
| 35 |
+
progress_callback: Optional callback function that takes a float (0-1) to report progress
|
| 36 |
|
| 37 |
Returns:
|
| 38 |
pd.Series of boolean values indexed by df.index, True if concept has a duplicate
|
|
|
|
| 41 |
results = pd.Series(False, index=df.index)
|
| 42 |
|
| 43 |
# Pre-compute all embeddings for each field in batches
|
| 44 |
+
# This is the bulk of processing time (~90%)
|
| 45 |
logger.info("Computing embeddings for all concepts...")
|
| 46 |
field_embeddings = {}
|
| 47 |
+
total_fields = len(DUPLICATE_CHECK_FIELDS)
|
| 48 |
+
for field_idx, field in enumerate(DUPLICATE_CHECK_FIELDS):
|
| 49 |
texts = df[field].fillna("").astype(str).tolist()
|
| 50 |
embeddings = model.encode(
|
| 51 |
texts,
|
|
|
|
| 54 |
show_progress_bar=False
|
| 55 |
)
|
| 56 |
field_embeddings[field] = embeddings
|
| 57 |
+
# Report progress during embedding computation
|
| 58 |
+
if progress_callback:
|
| 59 |
+
progress_callback((field_idx + 1) / total_fields)
|
| 60 |
logger.info("Embeddings computed for all fields")
|
| 61 |
|
| 62 |
# Group by organization and process each group
|
| 63 |
org_groups = df.groupby('org_renamed')
|
| 64 |
+
total_orgs = len(org_groups)
|
| 65 |
|
| 66 |
+
for org_idx, (org_name, org_df) in enumerate(org_groups):
|
| 67 |
# Skip organizations with only one concept (no duplicates possible)
|
| 68 |
if len(org_df) < 2:
|
| 69 |
continue
|
|
|
|
| 117 |
concept_id = df.loc[concept_idx, 'id']
|
| 118 |
logger.info(f"No duplicate found for concept {concept_id}")
|
| 119 |
|
| 120 |
+
|
| 121 |
return results
|
| 122 |
|
| 123 |
|
modules/utils.py
CHANGED
|
@@ -52,6 +52,9 @@ def create_excel():
|
|
| 52 |
'technology',
|
| 53 |
'financial',
|
| 54 |
'barrier',
|
|
|
|
|
|
|
|
|
|
| 55 |
'maf_funding_requested',
|
| 56 |
'contributions_public_sector',
|
| 57 |
'contributions_private_sector',
|
|
@@ -60,7 +63,7 @@ def create_excel():
|
|
| 60 |
sheet.append(columns) # Appending columns to the first row
|
| 61 |
|
| 62 |
# formatting
|
| 63 |
-
for c in sheet['A1:
|
| 64 |
c.fill = PatternFill('solid', fgColor = 'bad8e1')
|
| 65 |
c.font = Font(bold=True)
|
| 66 |
|
|
|
|
| 52 |
'technology',
|
| 53 |
'financial',
|
| 54 |
'barrier',
|
| 55 |
+
'technology_rationale',
|
| 56 |
+
'project_rationale',
|
| 57 |
+
'project_objectives',
|
| 58 |
'maf_funding_requested',
|
| 59 |
'contributions_public_sector',
|
| 60 |
'contributions_private_sector',
|
|
|
|
| 63 |
sheet.append(columns) # Appending columns to the first row
|
| 64 |
|
| 65 |
# formatting
|
| 66 |
+
for c in sheet['A1:N4'][0]:
|
| 67 |
c.fill = PatternFill('solid', fgColor = 'bad8e1')
|
| 68 |
c.font = Font(bold=True)
|
| 69 |
|