heymenn committed on
Commit
06cfe93
·
verified ·
1 Parent(s): fdcb3d0

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +322 -145
app.py CHANGED
@@ -8,14 +8,15 @@ import re
8
  import urllib.parse
9
  import itertools # For generating pairs
10
  import os
 
11
 
12
  # --- Configuration ---
13
  CATEGORY_JSON_PATH = "categories.json"
14
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
15
  MODEL_NAME = 'all-MiniLM-L6-v2'
16
- CATEGORY_SIMILARITY_THRESHOLD = 0.3
17
- MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem
18
- MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs to use for solution search
19
  MAX_SEARCH_REFERENCES_PER_PAIR = 3 # Max references from the API per pair
20
  SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"
21
 
@@ -30,47 +31,83 @@ model = None
30
 
31
 
32
  ###- GOOGLE DRIVE API
33
-
34
- from google.oauth2 import service_account
35
- from googleapiclient.discovery import build
36
- from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
37
-
38
- # Environment variables
39
- FOLDER_ID = os.getenv("FOLDER_ID")
40
  GOOGLE_CREDENTIALS = os.environ.get("GOOGLE_CREDENTIALS")
 
41
 
42
- def create_new_file_in_drive(username, dataframe_to_upload, credentials_json, folder_id):
43
- """Crée un nouveau fichier CSV dans Google Drive à partir d'un DataFrame Pandas."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
- creds_dict = json.loads(credentials_json)
 
 
 
 
46
 
47
- # Charger les informations d'identification du compte de service
48
- creds = service_account.Credentials.from_service_account_info(creds_dict)
 
49
 
50
- # Construire le service API Drive
51
- service = build('drive', 'v3', credentials=creds)
52
 
53
- # Convertir le DataFrame en fichier CSV en mémoire
54
- csv_buffer = io.BytesIO()
55
- dataframe_to_upload.to_csv(csv_buffer, index=False, sep=';', encoding='utf-8')
56
- csv_buffer.seek(0)
 
57
 
58
- # Créer les métadonnées du fichier
59
- filename = f"rating-results-{username}.csv"
60
- file_metadata = {
61
- 'name': filename,
62
- 'parents': [folder_id]
63
- }
64
 
65
- # Télécharger le fichier CSV sur Google Drive
66
- media = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)
67
- file = service.files().create(body=file_metadata, media_body=media, fields='id').execute()
68
 
69
- print(f"File '{filename}' created successfully.")
 
 
70
 
 
 
 
 
71
 
72
  ###-
73
-
74
 
75
  # --- Load Data and Model (Load once at startup) ---
76
  def load_data_and_model():
@@ -79,7 +116,7 @@ def load_data_and_model():
79
  print("Loading data and model...")
80
  try:
81
  # Load Categories
82
- with open(CATEGORY_JSON_PATH, 'r') as f:
83
  categories_data = json.load(f)["Category"]
84
  category_names = list(categories_data.keys())
85
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
@@ -87,7 +124,12 @@ def load_data_and_model():
87
 
88
  # Load Technologies
89
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
90
- technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
 
 
 
 
 
91
  technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
92
  # Add a unique ID if 'technology' name isn't unique or for easier embedding mapping
93
  technologies_df['tech_id'] = technologies_df.index
@@ -104,7 +146,6 @@ def load_data_and_model():
104
 
105
  # Pre-compute technology description embeddings
106
  print("Computing technology description embeddings...")
107
- # Ensure descriptions are strings, handle potential errors during embedding
108
  valid_descriptions = technologies_df['description_clean'].tolist()
109
  technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True, show_progress_bar=True)
110
  print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
@@ -119,86 +160,154 @@ def load_data_and_model():
119
  # --- Helper Functions ---
120
 
121
  def find_best_category(problem_description):
122
- """Finds the most relevant category using pre-computed embeddings."""
 
 
 
123
  if not problem_description or not category_names or category_embeddings is None:
124
  return None, 0.0
125
  try:
126
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
127
  cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
128
  best_score, best_idx = torch.max(cosine_scores, dim=0)
129
- if best_score.item() >= CATEGORY_SIMILARITY_THRESHOLD:
130
- return category_names[best_idx.item()], best_score.item()
131
- else:
132
- return None, best_score.item()
 
 
 
 
 
133
  except Exception as e:
134
  print(f"Error during category finding: {e}")
135
- return None, 0.0
136
 
137
- def find_relevant_technologies(category_name, problem_description):
 
138
  """
139
- Filters technologies by category, calculates similarity with the problem using
140
- pre-computed embeddings, sorts, and returns the top results.
 
141
  """
142
- relevant_tech_data = []
143
- if not category_name or technologies_df.empty or technology_embeddings is None or not problem_description:
 
144
  return pd.DataFrame()
145
 
146
  try:
147
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
148
 
149
- # Filter by category first
150
  for index, row in technologies_df.iterrows():
151
- tech_categories = [cat.strip() for cat in str(row['category']).split(',')]
152
- if category_name in tech_categories:
153
- tech_id = row['tech_id'] # Use the index/id
154
- # Retrieve pre-computed embedding
155
- tech_embedding = technology_embeddings[tech_id]
156
- # Calculate similarity score with the problem
157
- similarity_score = util.pytorch_cos_sim(problem_embedding, tech_embedding)[0][0].item()
158
- relevant_tech_data.append((row, similarity_score))
159
-
160
- relevant_tech_data.sort(key=lambda item: item[1], reverse=True)
161
-
162
- if not relevant_tech_data:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
163
  return pd.DataFrame()
164
 
165
- sorted_rows = [item[0] for item in relevant_tech_data]
166
- scores = [item[1] for item in relevant_tech_data]
 
 
 
 
 
 
 
 
 
 
 
167
 
168
- relevant_df = pd.DataFrame(sorted_rows).reset_index(drop=True) # Reset index after potential filtering
169
- relevant_df['similarity_score_problem'] = scores # Score relative to problem
170
 
171
- return relevant_df.head(MAX_TECHNOLOGIES_TO_SHOW)
 
172
 
173
  except Exception as e:
174
  print(f"Error during technology finding/scoring: {e}")
 
 
175
  return pd.DataFrame()
176
 
177
 
178
  def find_top_technology_pairs(relevant_technologies_df):
179
  """
180
- Calculates similarity between pairs of relevant technologies and returns the top pairs.
 
 
181
  """
182
  if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
 
183
  return []
184
 
185
  pairs_with_scores = []
186
- # Use tech_id (index) to reliably get embeddings
 
 
 
 
 
187
  tech_ids = relevant_technologies_df['tech_id'].tolist()
 
 
 
188
 
189
- # Generate unique pairs of indices (tech_ids)
190
- for idx_a, idx_b in itertools.combinations(tech_ids, 2):
191
  try:
192
  # Retrieve pre-computed embeddings using the original index (tech_id)
193
- embedding_a = technology_embeddings[idx_a]
194
- embedding_b = technology_embeddings[idx_b]
 
 
 
 
 
 
 
 
 
 
 
 
195
 
196
  # Calculate inter-technology similarity
197
  inter_similarity = util.pytorch_cos_sim(embedding_a, embedding_b)[0][0].item()
198
 
199
- # Get technology names corresponding to the indices
200
- tech_name_a = technologies_df.loc[idx_a, 'technology']
201
- tech_name_b = technologies_df.loc[idx_b, 'technology']
202
 
203
  # Clean names for display/use
204
  clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
@@ -207,17 +316,20 @@ def find_top_technology_pairs(relevant_technologies_df):
207
  pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
208
 
209
  except IndexError:
210
- print(f"Warning: Could not find pre-computed embedding for index {idx_a} or {idx_b}. Skipping pair.")
211
- continue
212
  except Exception as e:
213
- print(f"Error calculating similarity for pair ({idx_a}, {idx_b}): {e}")
214
- continue
 
 
215
 
216
 
217
  # Sort pairs by inter-similarity score (descending)
218
  pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
219
 
220
  # Return the top K pairs
 
221
  return pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH]
222
 
223
 
@@ -227,7 +339,12 @@ def search_solutions_for_pairs(problem_description, top_pairs):
227
  """
228
  results = {} # Store results keyed by the pair tuple
229
  if not top_pairs or not problem_description:
230
- return "No technology pairs identified or problem description missing, cannot search for solutions."
 
 
 
 
 
231
 
232
  headers = {'accept': 'application/json'}
233
 
@@ -237,68 +354,96 @@ def search_solutions_for_pairs(problem_description, top_pairs):
237
 
238
  if not tech_a_name or not tech_b_name: continue # Skip if names are invalid
239
 
240
- # Construct query for the API - include both tech names and "patent" / "research"
241
- # Keep problem description concise
242
- query = f'{problem_description} using {tech_a_name} AND {tech_b_name} patent OR research paper'
 
 
 
243
 
244
  params = {
245
  'query': query,
246
  'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
247
  }
248
- encoded_params = urllib.parse.urlencode(params)
249
  full_url = f"{SEARCH_API_URL}?{encoded_params}"
250
 
251
  pair_key = f"{tech_a_name} + {tech_b_name}" # Key for storing results
252
- print(f"Calling API for pair ({pair_key}): POST {full_url}")
253
 
254
  try:
255
- response = requests.post(full_url, headers=headers, timeout=30)
256
- response.raise_for_status()
257
- api_response = response.json()
 
 
 
 
 
 
 
 
 
258
 
259
  search_results = []
260
- # --- Adapt based on actual API response ---
261
  if isinstance(api_response, list):
262
- search_results = api_response
263
  elif isinstance(api_response, dict) and 'results' in api_response and isinstance(api_response['results'], list):
264
- search_results = api_response['results']
 
 
 
265
  else:
266
- print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response: {api_response}")
 
 
 
 
267
  # --- End adaptation ---
268
 
 
 
 
 
 
 
 
 
 
 
269
  results[pair_key] = {
270
  "score": pair_score, # Store pair score for context
271
- "links": [
272
- {'title': r.get('title', 'N/A'), 'link': r.get('url', '#')}
273
- for r in search_results if isinstance(r, dict)
274
- ]
275
  }
276
 
 
 
 
 
 
 
277
  except requests.exceptions.RequestException as e:
278
  print(f"Error calling search API for pair '{pair_key}': {e}")
279
- results[pair_key] = {"score": pair_score, "error": f"API Error: {e}"}
280
- except json.JSONDecodeError:
281
- err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
282
- print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
283
- results[pair_key] = {"score": pair_score, "error": err_msg}
284
  except Exception as e:
285
- err_msg = f"Unexpected Error: {e}"
286
- print(f"Unexpected error during API call for pair '{pair_key}': {e}")
287
- results[pair_key] = {"score": pair_score, "error": err_msg}
 
 
288
 
289
 
290
  # Format results for display
291
  output = f"### Potential Solutions & Patents (Found using Top {len(results)} Technology Pairs):\n\n"
292
  if not results:
293
- output += "No search results could be retrieved from the API for the technology pairs."
294
  return output
295
 
296
- # Optionally sort results display by pair score, though they should be roughly sorted already
297
- # sorted_results = sorted(results.items(), key=lambda item: item[1].get('score', 0), reverse=True)
298
-
299
- for pair_key, search_data in results.items(): # Use results directly as find_top_technology_pairs already sorted
300
  pair_score = search_data.get('score', 0.0)
301
- output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.2f})\n"
302
 
303
  if "error" in search_data:
304
  output += f"- *Search failed: {search_data['error']}*\n"
@@ -306,15 +451,16 @@ def search_solutions_for_pairs(problem_description, top_pairs):
306
  links = search_data["links"]
307
  if links:
308
  for link_info in links:
309
- href = link_info.get('link', '#')
310
- if not href.startswith(('http://', 'https://')):
311
- href = '#'
312
- output += f"- [{link_info.get('title', 'N/A')}]({href})\n"
 
313
  else:
314
  output += "- *No specific results found by the API for this technology pair.*\n"
315
  else:
316
- output += "- *Unknown search result state.*\n" # Should not happen
317
- output += "\n"
318
 
319
  return output
320
 
@@ -323,55 +469,74 @@ def process_problem(problem_description):
323
  """
324
  Main function called by Gradio interface. Orchestrates the process.
325
  """
 
326
  if not problem_description:
327
  return "Please enter a problem description."
328
 
329
- # 1. Categorize Problem
330
- category_name, cat_score = find_best_category(problem_description)
331
  if category_name:
332
- category_output = f"**Identified Category:** {category_name} (Similarity Score: {cat_score:.2f})"
 
333
  else:
334
- category_output = f"**Could not confidently identify a relevant category.** (Highest score: {cat_score:.2f})"
 
335
 
336
- # 2. Find Relevant Technologies (relative to problem)
337
- relevant_technologies_df = find_relevant_technologies(category_name, problem_description)
 
 
338
 
339
  tech_output = ""
340
  if not relevant_technologies_df.empty:
341
- tech_output += f"### Top {len(relevant_technologies_df)} Relevant Technologies (based on similarity to problem):\n\n"
 
342
  for _, row in relevant_technologies_df.iterrows():
343
- # Clean name for display
344
- tech_name = re.sub(r'^- Title\s*:\s*', '', str(row['technology'])).strip()
345
- tech_output += f"- **{tech_name}** (Problem Relevance: {row['similarity_score_problem']:.2f})\n"
346
- # Optionally show description again, or omit for brevity
347
- # desc_lines = str(row['description']).split('<br>')
348
- # cleaned_desc = "\n".join([line.strip() for line in desc_lines if line.strip()])
349
- # tech_output += f" Description: {cleaned_desc[:100]}...\n" # Truncated description
 
 
350
  tech_output += "\n---\n" # Add separator
351
- elif category_name:
352
- tech_output = f"No specific technologies found listed under the '{category_name}' category.\n\n---\n"
353
  else:
354
- tech_output = "No relevant technologies could be identified.\n\n---\n"
355
 
356
 
357
- # 3. Find Top Technology Pairs (based on inter-similarity)
358
  top_pairs = find_top_technology_pairs(relevant_technologies_df)
 
359
 
360
  pairs_output = ""
361
  if top_pairs:
362
- pairs_output += f"### Top {len(top_pairs)} Technology Pairs (based on inter-similarity, used for search):\n\n"
363
- for pair_names, score in top_pairs:
364
- pairs_output += f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.2f})\n"
365
- pairs_output += "\n---\n"
366
- else:
367
- pairs_output = "Could not identify relevant technology pairs for search.\n\n---\n"
368
-
369
- # 4. Search for Solutions using Pairs
 
 
 
370
  solution_output = search_solutions_for_pairs(problem_description, top_pairs)
 
371
 
372
  # 5. Combine Outputs for Gradio
373
- final_output = f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n{pairs_output}\n{solution_output}"
 
 
 
 
 
 
 
374
 
 
375
  return final_output
376
 
377
  # --- Create Gradio Interface ---
@@ -391,24 +556,36 @@ if interface_enabled:
391
  fn=process_problem,
392
  inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here... e.g., 'Develop low-latency communication protocols for 6G networks'"),
393
  outputs=gr.Markdown(label="Analysis and Potential Solutions"),
394
- title="Technical Problem Analyzer v3 (Technology Pairs Search)",
395
- description="Enter a technical problem. The app categorizes it, finds relevant technologies, identifies the most similar *pairs* of technologies, and searches for patents/research using these pairs via API.",
 
 
 
 
 
 
396
  examples=[
397
  ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
398
  ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
399
  ["Develop low-latency communication protocols for 6G networks"],
400
- ["Design efficient routing algorithms for large scale mesh networks in smart cities"]
 
 
401
  ],
402
  allow_flagging='never',
 
 
403
  )
404
  else:
405
  # Provide a dummy interface indicating failure
406
  def error_fn():
407
- return "Application failed to initialize. Please check the logs."
408
  iface = gr.Interface(fn=error_fn, inputs=[], outputs=gr.Markdown(), title="Initialization Failed")
409
 
410
 
411
  # --- Launch the App ---
412
  if __name__ == "__main__":
413
  print("Launching Gradio app...")
 
 
414
  iface.launch()
 
8
  import urllib.parse
9
  import itertools # For generating pairs
10
  import os
11
+ import io # Required for Google Drive upload
12
 
13
  # --- Configuration ---
14
  CATEGORY_JSON_PATH = "categories.json"
15
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
16
  MODEL_NAME = 'all-MiniLM-L6-v2'
17
+ CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match
18
+ MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
19
+ MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
20
  MAX_SEARCH_REFERENCES_PER_PAIR = 3 # Max references from the API per pair
21
  SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"
22
 
 
31
 
32
 
###- GOOGLE DRIVE API
# Google Drive upload is optional. It is enabled only when the
# GOOGLE_CREDENTIALS environment variable is set AND the Google client
# libraries can be imported; otherwise a no-op stub is installed so that
# callers of create_new_file_in_drive never hit a NameError.
GOOGLE_CREDENTIALS = os.environ.get("GOOGLE_CREDENTIALS")
FOLDER_ID = os.getenv("FOLDER_ID")  # Optional: Folder ID for uploads

# Why the upload is disabled (stays None while it is enabled); the stub
# below reports it.
_DRIVE_DISABLED_REASON = None

# Only import Google libraries if credentials are potentially available
if GOOGLE_CREDENTIALS:
    try:
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
        GOOGLE_API_AVAILABLE = True
        print("Google API libraries loaded.")
    except ImportError:
        print("Warning: Google API libraries not found. Google Drive upload will be disabled.")
        GOOGLE_API_AVAILABLE = False
        _DRIVE_DISABLED_REASON = "Google API libraries not found."
else:
    print("Warning: GOOGLE_CREDENTIALS environment variable not set. Google Drive upload will be disabled.")
    GOOGLE_API_AVAILABLE = False
    _DRIVE_DISABLED_REASON = "Credentials not configured."


if GOOGLE_API_AVAILABLE:
    def create_new_file_in_drive(username, dataframe_to_upload, credentials_json_str, folder_id):
        """
        Create a new CSV file in Google Drive from a pandas DataFrame.

        Args:
            username: Used to build the file name ``rating-results-<username>.csv``.
            dataframe_to_upload: DataFrame serialized as ';'-separated CSV.
            credentials_json_str: Service-account credentials as a JSON string.
            folder_id: Parent Drive folder ID; when falsy, no parent is set.

        Returns:
            The ID of the created file, or None if the credentials are empty
            or invalid JSON, or if the upload raises any exception.
        """
        print(f"Attempting to upload results for user: {username}")
        if not credentials_json_str:
            print("Error: Google Credentials JSON string is empty.")
            return None
        if not folder_id:
            # Deliberately not fatal: the upload is still attempted without a parent.
            print("Warning: Google Drive FOLDER_ID not specified. Upload might fail or go to root.")

        try:
            creds_dict = json.loads(credentials_json_str)
        except json.JSONDecodeError as e:
            print(f"Error decoding Google Credentials JSON: {e}")
            return None

        try:
            # Load the service-account credentials
            creds = service_account.Credentials.from_service_account_info(creds_dict)

            # Build the Drive API service
            service = build('drive', 'v3', credentials=creds)

            # Serialize the DataFrame to an in-memory CSV file.
            # 'utf-8-sig' writes a BOM so that Excel detects the encoding.
            csv_buffer = io.BytesIO()
            dataframe_to_upload.to_csv(csv_buffer, index=False, sep=';', encoding='utf-8-sig')
            csv_buffer.seek(0)

            # Build the file metadata
            filename = f"rating-results-{username}.csv"  # Consider adding a timestamp
            file_metadata = {'name': filename}
            if folder_id:
                file_metadata['parents'] = [folder_id]

            # Upload the CSV file to Google Drive
            media = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)
            created = service.files().create(
                body=file_metadata,
                media_body=media,
                fields='id, name, webViewLink',
            ).execute()

            print(f"File '{created.get('name')}' created successfully in Google Drive. ID: {created.get('id')}")
            print(f"Link: {created.get('webViewLink')}")  # Optional: print link
            return created.get('id')

        except Exception as e:
            # Best-effort upload: report the failure and let the app continue.
            print(f"Error during Google Drive upload: {e}")
            return None
else:
    def create_new_file_in_drive(*args, **kwargs):
        """No-op stand-in used when Google Drive upload is disabled; returns None."""
        print(f"Google Drive upload skipped: {_DRIVE_DISABLED_REASON}")
        return None
108
 
109
  ###-
110
+
111
 
112
  # --- Load Data and Model (Load once at startup) ---
113
  def load_data_and_model():
 
116
  print("Loading data and model...")
117
  try:
118
  # Load Categories
119
+ with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f: # Specify encoding
120
  categories_data = json.load(f)["Category"]
121
  category_names = list(categories_data.keys())
122
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
 
124
 
125
  # Load Technologies
126
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
127
+ # Clean column names (remove leading/trailing spaces)
128
+ technologies_df.columns = technologies_df.columns.str.strip()
129
+ # Ensure required columns exist
130
+ if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
131
+ raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
132
+ technologies_df['category'] = technologies_df.get('category', '').fillna('').astype(str) # Use .get for optional category
133
  technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
134
  # Add a unique ID if 'technology' name isn't unique or for easier embedding mapping
135
  technologies_df['tech_id'] = technologies_df.index
 
146
 
147
  # Pre-compute technology description embeddings
148
  print("Computing technology description embeddings...")
 
149
  valid_descriptions = technologies_df['description_clean'].tolist()
150
  technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True, show_progress_bar=True)
151
  print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
 
160
  # --- Helper Functions ---
161
 
def find_best_category(problem_description):
    """
    Find the category most similar to the problem description.

    The problem text is encoded with the shared model and compared, by cosine
    similarity, against the pre-computed category embeddings. The best match
    is returned regardless of its score (it is informational only); the third
    element tells whether the score reaches CATEGORY_SIMILARITY_THRESHOLD.

    Args:
        problem_description: Free-text description of the problem.

    Returns:
        A 3-tuple ``(category_name, score, is_confident)``. When the
        description, the category names, or the category embeddings are
        missing, or when scoring raises, returns ``(None, 0.0, False)``.
    """
    # Every exit path returns three values so callers can unpack uniformly.
    if not problem_description or not category_names or category_embeddings is None:
        return None, 0.0, False
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        best_score, best_idx = torch.max(cosine_scores, dim=0)

        best_category_name = category_names[best_idx.item()]
        best_category_score = best_score.item()

        # Confidence only flags how strongly to suggest the match.
        is_confident = best_category_score >= CATEGORY_SIMILARITY_THRESHOLD

        return best_category_name, best_category_score, is_confident

    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
185
 
186
+ # --- MODIFIED FUNCTION ---
def find_relevant_technologies(problem_description):
    """
    Rank all technologies by similarity to the problem and return the top ones.

    The score is the cosine similarity between the encoded problem description
    and each pre-computed technology description embedding, looked up through
    the row's ``tech_id``. Category is not used for filtering here.

    Args:
        problem_description: Free-text description of the problem.

    Returns:
        A DataFrame holding the data of the best-scoring rows of
        ``technologies_df`` plus a ``similarity_score_problem`` column, sorted
        by descending score and limited to MAX_TECHNOLOGIES_TO_SHOW rows.
        An empty DataFrame when inputs are missing, nothing could be scored,
        or an error occurs.
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()

    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Loop-invariant: add the batch dimension to the problem embedding once.
        if problem_embedding.ndim == 1:
            problem_embedding = problem_embedding.unsqueeze(0)

        num_embeddings = technology_embeddings.shape[0]
        scored = []  # (similarity score, row data) for every scorable technology

        for _, row in technologies_df.iterrows():
            tech_id = row['tech_id']  # Use the pre-assigned index/id

            # Reject ids outside the tensor; a negative id would otherwise
            # wrap around silently and score the wrong technology.
            if tech_id < 0 or tech_id >= num_embeddings:
                print(f"Warning: tech_id {tech_id} is out of bounds for technology_embeddings (shape: {technology_embeddings.shape}). Skipping.")
                continue

            tech_embedding = technology_embeddings[tech_id]
            if tech_embedding.ndim == 1:
                tech_embedding = tech_embedding.unsqueeze(0)

            similarity_score = util.pytorch_cos_sim(problem_embedding, tech_embedding)[0][0].item()
            scored.append((similarity_score, row.to_dict()))

        if not scored:
            print("No technologies found or scored.")
            return pd.DataFrame()

        # Highest similarity first; the sort is stable, so ties keep file order.
        scored.sort(key=lambda item: item[0], reverse=True)
        top = scored[:MAX_TECHNOLOGIES_TO_SHOW]
        if not top:
            # Only reachable when MAX_TECHNOLOGIES_TO_SHOW is 0.
            return pd.DataFrame()

        relevant_df = pd.DataFrame([data for _, data in top])
        relevant_df['similarity_score_problem'] = [score for score, _ in top]
        return relevant_df

    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        import traceback
        traceback.print_exc()  # Print full traceback for debugging
        return pd.DataFrame()
262
 
263
 
264
  def find_top_technology_pairs(relevant_technologies_df):
265
  """
266
+ Calculates similarity between pairs of the identified relevant technologies
267
+ (which were selected based on problem similarity) and returns the top pairs.
268
+ Uses pre-computed embeddings.
269
  """
270
  if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
271
+ # print("Warning: Not enough relevant technologies (<2) or embeddings missing for pairing.")
272
  return []
273
 
274
  pairs_with_scores = []
275
+ # Use tech_id (which should be the original index) to reliably get embeddings
276
+ # Check if 'tech_id' column exists in the relevant_technologies_df
277
+ if 'tech_id' not in relevant_technologies_df.columns:
278
+ print("Error: 'tech_id' column missing in relevant_technologies_df. Cannot proceed with pairing.")
279
+ return []
280
+
281
  tech_ids = relevant_technologies_df['tech_id'].tolist()
282
+ # Create a mapping from tech_id back to the technology name in the relevant subset for easy lookup
283
+ tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values, index=relevant_technologies_df['tech_id']).to_dict()
284
+
285
 
286
+ # Generate unique pairs of tech_ids from the relevant list
287
+ for id_a, id_b in itertools.combinations(tech_ids, 2):
288
  try:
289
  # Retrieve pre-computed embeddings using the original index (tech_id)
290
+ # Add boundary checks again just in case
291
+ if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
292
+ print(f"Warning: tech_id {id_a} or {id_b} out of bounds for embeddings. Skipping pair.")
293
+ continue
294
+
295
+ embedding_a = technology_embeddings[id_a]
296
+ embedding_b = technology_embeddings[id_b]
297
+
298
+ # Ensure embeddings are 1D or correctly shaped for cos_sim
299
+ if embedding_a.ndim > 1: embedding_a = embedding_a.squeeze()
300
+ if embedding_b.ndim > 1: embedding_b = embedding_b.squeeze()
301
+ if embedding_a.ndim == 0 or embedding_b.ndim == 0: # Check if squeeze resulted in 0-dim tensor
302
+ print(f"Warning: Invalid embedding dimension after squeeze for pair ({id_a}, {id_b}). Skipping.")
303
+ continue
304
 
305
  # Calculate inter-technology similarity
306
  inter_similarity = util.pytorch_cos_sim(embedding_a, embedding_b)[0][0].item()
307
 
308
+ # Get technology names using the mapping created earlier
309
+ tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
310
+ tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
311
 
312
  # Clean names for display/use
313
  clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
 
316
  pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
317
 
318
  except IndexError:
319
+ print(f"Warning: Could not find pre-computed embedding for index {id_a} or {id_b}. Skipping pair.")
320
+ continue
321
  except Exception as e:
322
+ print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
323
+ import traceback
324
+ traceback.print_exc()
325
+ continue
326
 
327
 
328
  # Sort pairs by inter-similarity score (descending)
329
  pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
330
 
331
  # Return the top K pairs
332
+ # print(f"Top pairs identified: {pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH]}") # Debug print
333
  return pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH]
334
 
335
 
 
339
  """
340
  results = {} # Store results keyed by the pair tuple
341
  if not top_pairs or not problem_description:
342
+ # Provide a more informative message if no pairs were generated
343
+ if not top_pairs:
344
+ return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n"
345
+ else: # problem_description must be missing
346
+ return "Problem description is missing. Cannot search for solutions.\n"
347
+
348
 
349
  headers = {'accept': 'application/json'}
350
 
 
354
 
355
  if not tech_a_name or not tech_b_name: continue # Skip if names are invalid
356
 
357
+ # Construct query for the API
358
+ # Focus query on tech combination and context (patent/research)
359
+ # Keep problem description out of the API query unless the API is designed for it
360
+ # query = f'"{tech_a_name}" AND "{tech_b_name}" patent OR research paper OR application'
361
+ # More targeted query:
362
+ query = f'Combining "{tech_a_name}" and "{tech_b_name}" for applications related to "{problem_description[:100]}..."' # Use snippet of problem
363
 
364
  params = {
365
  'query': query,
366
  'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
367
  }
368
+ encoded_params = urllib.parse.urlencode(params, quote_via=urllib.parse.quote) # Ensure proper encoding
369
  full_url = f"{SEARCH_API_URL}?{encoded_params}"
370
 
371
  pair_key = f"{tech_a_name} + {tech_b_name}" # Key for storing results
372
+ print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query: {query}") # Log query separately
373
 
374
  try:
375
+ # Using POST as originally indicated. NOTE: requests' `params=` argument encodes these values into the URL query string, not the request body (use `data=` or `json=` to send a body).
376
+ # If API expects GET, change to requests.get(full_url, headers=headers)
377
+ response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45) # Increased timeout
378
+ response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
379
+
380
+ try:
381
+ api_response = response.json()
382
+ except json.JSONDecodeError:
383
+ err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
384
+ print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
385
+ results[pair_key] = {"score": pair_score, "error": err_msg}
386
+ continue # Skip to next pair
387
 
388
  search_results = []
389
+ # --- Adapt based on actual API response structure ---
390
  if isinstance(api_response, list):
391
+ search_results = api_response # Assumes list of dicts like {'title': '...', 'url': '...'}
392
  elif isinstance(api_response, dict) and 'results' in api_response and isinstance(api_response['results'], list):
393
+ search_results = api_response['results']
394
+ elif isinstance(api_response, dict) and 'references' in api_response and isinstance(api_response['references'], list):
395
+ # Handle potential alternative key name
396
+ search_results = api_response['references']
397
  else:
398
+ print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response: {api_response}")
399
+ # Attempt to extract links if possible, otherwise mark as no results
400
+ # This part needs adjustment based on observed API responses
401
+ search_results = [] # Default to empty if format unknown
402
+
403
  # --- End adaptation ---
404
 
405
+ valid_links = []
406
+ for r in search_results:
407
+ if isinstance(r, dict):
408
+ title = r.get('title', 'N/A')
409
+ url = r.get('url', r.get('link')) # Check for 'url' or 'link'
410
+ if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
411
+ valid_links.append({'title': title, 'link': url})
412
+ elif url:
413
+ print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
414
+
415
  results[pair_key] = {
416
  "score": pair_score, # Store pair score for context
417
+ "links": valid_links
 
 
 
418
  }
419
 
420
+ except requests.exceptions.Timeout:
421
+ print(f"Error: API call timed out for pair '{pair_key}'")
422
+ results[pair_key] = {"score": pair_score, "error": "API Timeout"}
423
+ except requests.exceptions.HTTPError as e:
424
+ print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
425
+ results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
426
  except requests.exceptions.RequestException as e:
427
  print(f"Error calling search API for pair '{pair_key}': {e}")
428
+ results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
 
 
 
 
429
  except Exception as e:
430
+ err_msg = f"Unexpected Error during API call: {e}"
431
+ print(f"Unexpected error during API call for pair '{pair_key}': {e}")
432
+ import traceback
433
+ traceback.print_exc()
434
+ results[pair_key] = {"score": pair_score, "error": err_msg}
435
 
436
 
437
  # Format results for display
438
  output = f"### Potential Solutions & Patents (Found using Top {len(results)} Technology Pairs):\n\n"
439
  if not results:
440
+ output += "No search results could be retrieved from the API for the generated technology pairs."
441
  return output
442
 
443
+ # Display results in the order they were searched (already sorted by pair score)
444
+ for pair_key, search_data in results.items():
 
 
445
  pair_score = search_data.get('score', 0.0)
446
+ output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n" # More precision
447
 
448
  if "error" in search_data:
449
  output += f"- *Search failed: {search_data['error']}*\n"
 
451
  links = search_data["links"]
452
  if links:
453
  for link_info in links:
454
+ # Ensure title is a string before replacing
455
+ title_str = str(link_info.get('title', 'N/A'))
456
+ # Basic sanitization for Markdown display
457
+ title_sanitized = title_str.replace('[','(').replace(']',')')
458
+ output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
459
  else:
460
  output += "- *No specific results found by the API for this technology pair.*\n"
461
  else:
462
+ output += "- *Unknown search result state.*\n"
463
+ output += "\n" # Add space between pairs
464
 
465
  return output
466
 
 
469
  """
470
  Main function called by Gradio interface. Orchestrates the process.
471
  """
472
+ print(f"\n--- Processing request for: '{problem_description[:100]}...' ---") # Log start
473
  if not problem_description:
474
  return "Please enter a problem description."
475
 
476
+ # 1. Categorize Problem (Informational)
477
+ category_name, cat_score, is_confident = find_best_category(problem_description)
478
  if category_name:
479
+ confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
480
+ category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
481
  else:
482
+ category_output = "**Could not identify a matching category.**"
483
+ print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")
484
 
485
+ # 2. Find Relevant Technologies (relative to problem, across ALL categories)
486
+ # Pass only the problem description now
487
+ relevant_technologies_df = find_relevant_technologies(problem_description)
488
+ print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
489
 
490
  tech_output = ""
491
  if not relevant_technologies_df.empty:
492
+ # Modify the header to clarify the selection criteria
493
+ tech_output += f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"
494
  for _, row in relevant_technologies_df.iterrows():
495
+ # Clean name for display
496
+ tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
497
+ problem_relevance = row.get('similarity_score_problem', 0.0)
498
+ tech_output += f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n" # More precision
499
+ # Optionally show original category for info
500
+ original_cats = str(row.get('category', 'Unknown')).strip()
501
+ if original_cats:
502
+ tech_output += f" *Original Category listed as: {original_cats}*\n"
503
+
504
  tech_output += "\n---\n" # Add separator
 
 
505
  else:
506
+ tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
507
 
508
 
509
+ # 3. Find Top Technology Pairs (based on inter-similarity among the relevant ones)
510
  top_pairs = find_top_technology_pairs(relevant_technologies_df)
511
+ print(f"Identified {len(top_pairs)} top technology pairs for searching.")
512
 
513
  pairs_output = ""
514
  if top_pairs:
515
+ # Clarify the source of pairs
516
+ pairs_output += f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"
517
+ for pair_names, score in top_pairs:
518
+ pairs_output += f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n" # More precision
519
+ pairs_output += "\n---\n"
520
+ # Don't add output if no pairs found, the search function will handle this
521
+ # else:
522
+ # pairs_output = "Could not identify relevant technology pairs for search (need >= 2 relevant technologies).\n\n---\n"
523
+
524
+ # 4. Search for Solutions using the Top Pairs
525
+ # Pass the original problem description for context if needed by the search function
526
  solution_output = search_solutions_for_pairs(problem_description, top_pairs)
527
+ print("API search for solutions completed.")
528
 
529
  # 5. Combine Outputs for Gradio
530
+ final_output = (
531
+ f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
532
+ f"{category_output}\n\n"
533
+ f"{tech_output}"
534
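+ # Show the pairs section only when pairs were found; otherwise emit a fallback notice.
+ # NOTE(review): the fallback literal on the next line contains backslashes inside an f-string replacement field -- a SyntaxError before Python 3.12, and on 3.12+ '\\n' renders a literal backslash-n rather than a newline; hoist the fallback text into a variable.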
535
+ f"{pairs_output if top_pairs else 'No technology pairs identified to search with.\\n\\n---\\n'}"
536
+ f"{solution_output}"
537
+ )
538
 
539
+ print("--- Processing finished ---")
540
  return final_output
541
 
542
  # --- Create Gradio Interface ---
 
556
  fn=process_problem,
557
  inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here... e.g., 'Develop low-latency communication protocols for 6G networks'"),
558
  outputs=gr.Markdown(label="Analysis and Potential Solutions"),
559
+ title="Technical Problem Analyzer v4 (Cross-Category Relevance)",
560
+ description=(
561
+ "Enter a technical problem. The app:\n"
562
+ "1. Identifies the best matching **category** (for informational purposes).\n"
563
+ "2. Finds the **most relevant technologies** based *directly on your problem description* (across all categories).\n"
564
+ "3. Identifies **promising pairs** among these relevant technologies based on their similarity to each other.\n"
565
+ "4. Searches for **patents/research** using these pairs via an external API."
566
+ ),
567
  examples=[
568
  ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
569
  ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
570
  ["Develop low-latency communication protocols for 6G networks"],
571
+ ["Design efficient routing algorithms for large scale mesh networks in smart cities"],
572
+ ["Create biodegradable packaging material from agricultural waste"], # Example crossing categories potentially
573
+ ["Develop a method for real-time traffic prediction using heterogeneous data sources"]
574
  ],
575
  allow_flagging='never',
576
+ # Add theme for better visuals if desired
577
+ # theme=gr.themes.Soft()
578
  )
579
  else:
580
  # Provide a dummy interface indicating failure
581
  def error_fn():
582
+ return "Application failed to initialize. Please check the logs for errors (e.g., missing files or model issues)."
583
  iface = gr.Interface(fn=error_fn, inputs=[], outputs=gr.Markdown(), title="Initialization Failed")
584
 
585
 
586
  # --- Launch the App ---
587
  if __name__ == "__main__":
588
  print("Launching Gradio app...")
589
+ # Consider adding share=True for public link if running on appropriate infra
590
+ # debug=True can be helpful during development
591
  iface.launch()