mtyrrell commited on
Commit
4fbb7a3
·
1 Parent(s): c8c24db

innovation classification

Browse files
modules/llm.py CHANGED
@@ -6,13 +6,14 @@ import torch
6
  import logging
7
  from transformers import pipeline
8
  from modules.utils import setup_logging
9
- from modules.prompts import prompt_concept
10
- from modules.models import ConceptClassify
11
  from openai import OpenAI
 
12
 
13
  logger = setup_logging()
14
 
15
- def call_structured(client: OpenAI, deployment: str, system_prompt: str, user_prompt: str,
16
  response_model: None,
17
  logger: logging.Logger) -> Dict[str, Any]:
18
  """Call Azure OpenAI with structured output"""
@@ -48,43 +49,26 @@ def call_structured(client: OpenAI, deployment: str, system_prompt: str, user_pr
48
  return None
49
 
50
 
51
- # Not used - results sucked
52
- # def check_duplicate_concepts(client, deployment, concept_id: str, organization: str, concept_profile: str, df) -> bool:
53
- # """
54
- # Check for duplicate concepts within the same organization using Azure OpenAI
55
 
56
- # Args:
57
- # client: AzureOpenAI client instance
58
- # deployment: Azure OpenAI deployment name
59
- # concept_id: ID of the current concept being checked
60
- # organization: Organization name
61
- # concept_profile: Text description of the concept to check
62
- # df: DataFrame containing all application data
63
 
64
- # Returns:
65
- # Boolean classification result
66
- # """
67
 
68
- # # Remove current concept from the dataframe
69
- # df_check = df[df['id'] != concept_id].copy()
70
 
71
- # # Get other concepts from the same organization
72
- # org_concepts = df_check[df_check['org_renamed'] == organization]
73
- # other_concepts = org_concepts['scope_txt'].tolist()
74
 
75
- # # If no other concepts from this organization, return False
76
- # if len(other_concepts) == 0:
77
- # return False
78
 
79
- # logger.info(f"Checking duplicates for concept ID {concept_id} from organization {organization} against {len(other_concepts)} other concept(s).")
80
- # logger.info(f"Scope text {concept_profile}")
81
- # # Construct prompt
82
- # prompt = prompt_concept(concept_profile, other_concepts)
83
-
84
- # response = call_structured(client, deployment, prompt, concept_profile, ConceptClassify, logger)
85
-
86
- # check = response['classification']
87
- # logger.info(f"Duplicate check response for concept ID {concept_id}: {check}")
88
- # if check == "YES":
89
- # return True
90
- # return False
 
6
  import logging
7
  from transformers import pipeline
8
  from modules.utils import setup_logging
9
+ from modules.prompts import prompt_innovation
10
+ from modules.models import InnovationClassify
11
  from openai import OpenAI
12
+ import pandas
13
 
14
  logger = setup_logging()
15
 
16
+ def call_structured(client: OpenAI, deployment: str, user_prompt: str,
17
  response_model: None,
18
  logger: logging.Logger) -> Dict[str, Any]:
19
  """Call Azure OpenAI with structured output"""
 
49
  return None
50
 
51
 
52
+ def classify_innovation(client, deployment, concept_id: str, tech: str, rationale: str) -> Dict:
53
+ """
54
+ Classify level of innovation for given text fields
 
55
 
56
+ Args:
57
+ client: AzureOpenAI client instance
58
+ deployment: Azure OpenAI deployment name
59
+ concept_id: ID of the concept being classified
+ tech: Text describing the concept's technology
+ rationale: Text giving the rationale for the technology
 
 
 
60
 
61
+ Returns:
62
+ Dictionary with classification result
63
+ """
64
 
65
+ # concatenate context
66
+ concept = f"Technology: {tech}\nRationale: {rationale}"
67
 
68
+ logger.info(f"Analyzing Innovation for: {concept_id}")
69
+ # Construct prompt
70
+ prompt = prompt_innovation(concept)
71
 
72
+ response = call_structured(client, deployment, prompt, InnovationClassify, logger)
 
 
73
 
74
+ return response
 
 
 
 
 
 
 
 
 
 
 
modules/models.py CHANGED
@@ -1,8 +1,16 @@
1
- from typing import Dict, Any, List, Optional, Literal
2
- from pydantic import BaseModel, Field
3
 
4
  #===================== Duplicate concepts =====================
5
 
6
- class ConceptClassify(BaseModel):
7
- classification: Literal["YES","NO","UNCERTAIN"] = Field(description="Is the concept duplicated in other applications? (yes/no/uncertain)")
 
 
 
 
 
 
 
 
8
 
 
1
+ from pydantic import BaseModel, Field, conint
2
+ from typing import Literal, List
3
 
4
  #===================== Duplicate concepts =====================
5
 
6
+ class InnovationClassify(BaseModel):
7
+ classification: Literal["INSUFFICIENT INFO","NOT INNOVATIVE", "MODERATELY INNOVATIVE", "VERY INNOVATIVE"] = Field(
8
+ description="Overall innovation level using the rubric."
9
+ )
10
+ rationale: str = Field(
11
+ description="1–2 sentences that justify the classification."
12
+ )
13
+ confidence: int = Field(
14
+ description="0–100 confidence in the classification given the detail level in the concept."
15
+ )
16
 
modules/pipeline.py CHANGED
@@ -11,7 +11,7 @@ from openpyxl.styles import Font, NamedStyle, PatternFill
11
  from openpyxl.styles.differential import DifferentialStyle
12
  from modules.org_count import standardize_organization_names
13
  from modules.utils import clean_text, extract_predicted_labels
14
- # from modules.llm import check_duplicate_concepts
15
  from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
16
  from sentence_transformers import SentenceTransformer
17
  import logging
@@ -176,6 +176,9 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
176
  'technology': 'tech_txt',
177
  'financial': 'fin_txt',
178
  'barrier': 'bar_txt',
 
 
 
179
  'maf_funding_requested': 'maf_funding',
180
  'contributions_public_sector': 'cont_public',
181
  'contributions_private_sector': 'cont_private',
@@ -214,9 +217,7 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
214
 
215
  # Define models and predictions
216
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
217
- model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK']
218
- # model_names_sf = []
219
- # model_names = ['ADAPMIT_SCOPE','ADAPMIT_TECH']
220
  total_predictions = len(model_names) * len(df)
221
  progress_count = 0
222
 
@@ -260,12 +261,23 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
260
  semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
261
 
262
  # Process duplicate check using batched approach for efficiency
263
- progress_bar.progress(0.1) # Show initial progress
264
  df['duplicate_check'] = check_duplicate_concepts(
265
  semantic_model,
266
- df
 
 
 
 
 
 
 
 
 
 
 
 
 
267
  )
268
- progress_bar.progress(1.0)
269
 
270
 
271
  logger.info(f"Completed: {model_name}")
@@ -356,11 +368,11 @@ def process_data(uploaded_file, sens_level, azure_client, azure_deployment):
356
  df = assess_duplicate_concepts(df)
357
 
358
  # Reorder columns in final dataframe
359
- column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'scope_txt', 'tech_txt', 'fin_txt', 'maf_funding', 'cont_public',
360
  'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
361
  'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
362
- 'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm',
363
- 'word_length_check', 'pred_score', 'pred_action', 'action_rationale']
364
 
365
  # Only include columns that exist in the DataFrame
366
  final_columns = [col for col in column_order if col in df.columns]
 
11
  from openpyxl.styles.differential import DifferentialStyle
12
  from modules.org_count import standardize_organization_names
13
  from modules.utils import clean_text, extract_predicted_labels
14
+ from modules.llm import classify_innovation
15
  from modules.semantic_similarity import assess_duplicate_concepts, check_duplicate_concepts
16
  from sentence_transformers import SentenceTransformer
17
  import logging
 
176
  'technology': 'tech_txt',
177
  'financial': 'fin_txt',
178
  'barrier': 'bar_txt',
179
+ 'technology_rationale': 'tech_rationale_txt',
180
+ 'project_rationale': 'project_rationale_txt',
181
+ 'project_objectives': 'project_objectives_txt',
182
  'maf_funding_requested': 'maf_funding',
183
  'contributions_public_sector': 'cont_public',
184
  'contributions_private_sector': 'cont_private',
 
217
 
218
  # Define models and predictions
219
  model_names_sf = ['scope_lab1', 'scope_lab2', 'tech_lab1', 'tech_lab3', 'fin_lab2','bar_lab2']
220
+ model_names = model_names_sf + ['ADAPMIT_SCOPE','ADAPMIT_TECH','SECTOR','LANG','DUPLICATE_CHECK','INNOVATION_CLASSIFICATION']
 
 
221
  total_predictions = len(model_names) * len(df)
222
  progress_count = 0
223
 
 
261
  semantic_model = SentenceTransformer('BAAI/bge-m3', device=device)
262
 
263
  # Process duplicate check using batched approach for efficiency
 
264
  df['duplicate_check'] = check_duplicate_concepts(
265
  semantic_model,
266
+ df,
267
+ progress_callback=lambda p: progress_bar.progress(p)
268
+ )
269
+
270
+ elif model_name == 'INNOVATION_CLASSIFICATION':
271
+ df['innovation_classification'] = df.apply(
272
+ lambda x: classify_innovation(
273
+ azure_client,
274
+ azure_deployment,
275
+ x['id'],
276
+ x['tech_txt'],
277
+ x['tech_rationale_txt']
278
+ ),
279
+ axis=1
280
  )
 
281
 
282
 
283
  logger.info(f"Completed: {model_name}")
 
368
  df = assess_duplicate_concepts(df)
369
 
370
  # Reorder columns in final dataframe
371
+ column_order = ['id', 'organization', 'org_renamed', 'concept_count', 'duplicate_check', 'word_length_check', 'scope_txt', 'tech_txt', 'fin_txt', 'bar_txt','maf_funding', 'cont_public',
372
  'cont_private', 'cont_other', 'scope_lab1', 'scope_lab2', 'tech_lab1',
373
  'tech_lab3', 'fin_lab2', 'bar_lab2', 'ADAPMIT_SCOPE', 'ADAPMIT_TECH', 'ADAPMIT', 'SECTOR1',
374
+ 'SECTOR2', 'LANG', 'lev_total', 'lev_gt_0', 'lev_maf_%', 'lev_maf_scale','mitigation_potential', 'cost_effectivness', 'cost_effectivness_norm','innovation_classification',
375
+ 'pred_score', 'pred_action', 'action_rationale']
376
 
377
  # Only include columns that exist in the DataFrame
378
  final_columns = [col for col in column_order if col in df.columns]
modules/prompts.py CHANGED
@@ -1,21 +1,32 @@
1
  # Prompts library
2
  from typing import List
3
 
4
- def prompt_concept(concept: str, other_concepts: List[str]) -> str:
5
- """Generate prompt for classifying concepts by similarity"""
6
- prompt = f"""
7
-
8
- Each organization is allowed to submit up to 6 concepts per year via a web portal. However, in some cases organizations submit the same concept multiple times.
9
- This can happen for various reasons. For example, an organization may erroneously submit the same application twice because they lost access to the previous web session somehow.
10
- In all such cases, it is not usually the case that the duplicate concepts are verbatim identical. It is more usually the case that there is simply high semantic alignment - i.e. it is the same concept, but there are minor superficial differences between each application.
11
- Your task is to review the concept profiles submitted by a particular organization and identify the amount of similarity so that we can in turn identify cases of duplicate concepts.
12
-
13
- Here is the concept profile for review:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  {concept}
 
 
15
 
16
- Please review this against the following concepts and assess for duplication:
17
- {other_concepts}
18
-
19
- Please conduct your review carefully - however, ensure that you tag all duplicates correctly. Please return your response according to the following structure:
20
- """
21
- return prompt
 
1
  # Prompts library
2
  from typing import List
3
 
4
+ from typing import List
5
+ from textwrap import dedent
6
+
7
+ def prompt_innovation(concept: str) -> str:
8
+ return dedent(f"""
9
+ You are reviewing applications for grant funding.
10
+
11
+ Task: classify the submitted concept by innovation level using the definition below.
12
+
13
+ Definition of innovation:
14
+ Innovative mitigation technologies that have not yet been tested or implemented in the local context.
15
+ Innovation may be novel globally or within the specific country context.
16
+ Examples from our portfolio include tidal stream energy generation, green hydrogen-based production,
17
+ locally manufactured lithium-ion batteries for custom-built e-mobility solutions, or reactive power
18
+ compensation systems to enhance grid stability.
19
+
20
+ Classification rubric:
21
+ - NOT INNOVATIVE: already common/deployed in the local context, or standard practice with no clear novelty.
22
+ - MODERATELY INNOVATIVE: established elsewhere but new to the local context, or a meaningful adaptation.
23
+ - VERY INNOVATIVE: novel approach globally or locally with clear differentiation from standard solutions.
24
+ - INSUFFICIENT INFO: not enough detail to classify innovation level.
25
+
26
+ If the concept is too vague to judge novelty, choose the lowest classification that is defensible and reflect uncertainty in confidence.
27
+
28
+ Concept for review:
29
  {concept}
30
+ """).strip()
31
+
32
 
 
 
 
 
 
 
modules/semantic_similarity.py CHANGED
@@ -15,7 +15,8 @@ def check_duplicate_concepts(
15
  model: SentenceTransformer,
16
  df: pd.DataFrame,
17
  similarity_threshold: float = 0.85,
18
- batch_size: int = 64
 
19
  ) -> pd.Series:
20
  """
21
  Check for duplicate concepts within the same organization using semantic similarity.
@@ -31,6 +32,7 @@ def check_duplicate_concepts(
31
  similarity_threshold: Threshold for considering concepts duplicates (0-1)
32
  Recommended values: 0.80 (lenient) to 0.95 (strict)
33
  batch_size: Batch size for embedding computation
 
34
 
35
  Returns:
36
  pd.Series of boolean values indexed by df.index, True if concept has a duplicate
@@ -39,9 +41,11 @@ def check_duplicate_concepts(
39
  results = pd.Series(False, index=df.index)
40
 
41
  # Pre-compute all embeddings for each field in batches
 
42
  logger.info("Computing embeddings for all concepts...")
43
  field_embeddings = {}
44
- for field in DUPLICATE_CHECK_FIELDS:
 
45
  texts = df[field].fillna("").astype(str).tolist()
46
  embeddings = model.encode(
47
  texts,
@@ -50,12 +54,16 @@ def check_duplicate_concepts(
50
  show_progress_bar=False
51
  )
52
  field_embeddings[field] = embeddings
 
 
 
53
  logger.info("Embeddings computed for all fields")
54
 
55
  # Group by organization and process each group
56
  org_groups = df.groupby('org_renamed')
 
57
 
58
- for org_name, org_df in org_groups:
59
  # Skip organizations with only one concept (no duplicates possible)
60
  if len(org_df) < 2:
61
  continue
@@ -109,6 +117,7 @@ def check_duplicate_concepts(
109
  concept_id = df.loc[concept_idx, 'id']
110
  logger.info(f"No duplicate found for concept {concept_id}")
111
 
 
112
  return results
113
 
114
 
 
15
  model: SentenceTransformer,
16
  df: pd.DataFrame,
17
  similarity_threshold: float = 0.85,
18
+ batch_size: int = 64,
19
+ progress_callback=None
20
  ) -> pd.Series:
21
  """
22
  Check for duplicate concepts within the same organization using semantic similarity.
 
32
  similarity_threshold: Threshold for considering concepts duplicates (0-1)
33
  Recommended values: 0.80 (lenient) to 0.95 (strict)
34
  batch_size: Batch size for embedding computation
35
+ progress_callback: Optional callback function that takes a float (0-1) to report progress
36
 
37
  Returns:
38
  pd.Series of boolean values indexed by df.index, True if concept has a duplicate
 
41
  results = pd.Series(False, index=df.index)
42
 
43
  # Pre-compute all embeddings for each field in batches
44
+ # This is the bulk of processing time (~90%)
45
  logger.info("Computing embeddings for all concepts...")
46
  field_embeddings = {}
47
+ total_fields = len(DUPLICATE_CHECK_FIELDS)
48
+ for field_idx, field in enumerate(DUPLICATE_CHECK_FIELDS):
49
  texts = df[field].fillna("").astype(str).tolist()
50
  embeddings = model.encode(
51
  texts,
 
54
  show_progress_bar=False
55
  )
56
  field_embeddings[field] = embeddings
57
+ # Report progress during embedding computation
58
+ if progress_callback:
59
+ progress_callback((field_idx + 1) / total_fields)
60
  logger.info("Embeddings computed for all fields")
61
 
62
  # Group by organization and process each group
63
  org_groups = df.groupby('org_renamed')
64
+ total_orgs = len(org_groups)
65
 
66
+ for org_idx, (org_name, org_df) in enumerate(org_groups):
67
  # Skip organizations with only one concept (no duplicates possible)
68
  if len(org_df) < 2:
69
  continue
 
117
  concept_id = df.loc[concept_idx, 'id']
118
  logger.info(f"No duplicate found for concept {concept_id}")
119
 
120
+
121
  return results
122
 
123
 
modules/utils.py CHANGED
@@ -52,6 +52,9 @@ def create_excel():
52
  'technology',
53
  'financial',
54
  'barrier',
 
 
 
55
  'maf_funding_requested',
56
  'contributions_public_sector',
57
  'contributions_private_sector',
@@ -60,7 +63,7 @@ def create_excel():
60
  sheet.append(columns) # Appending columns to the first row
61
 
62
  # formatting
63
- for c in sheet['A1:K4'][0]:
64
  c.fill = PatternFill('solid', fgColor = 'bad8e1')
65
  c.font = Font(bold=True)
66
 
 
52
  'technology',
53
  'financial',
54
  'barrier',
55
+ 'technology_rationale',
56
+ 'project_rationale',
57
+ 'project_objectives',
58
  'maf_funding_requested',
59
  'contributions_public_sector',
60
  'contributions_private_sector',
 
63
  sheet.append(columns) # Appending columns to the first row
64
 
65
  # formatting
66
+ for c in sheet['A1:N4'][0]:
67
  c.fill = PatternFill('solid', fgColor = 'bad8e1')
68
  c.font = Font(bold=True)
69