Spaces:
Runtime error
Runtime error
File size: 28,838 Bytes
dcf3971 20436a6 dcf3971 20436a6 fdcb3d0 06cfe93 dcf3971 20436a6 06cfe93 20436a6 dcf3971 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c 06cfe93 7ce161c dcf3971 20436a6 06cfe93 20436a6 06cfe93 20436a6 dcf3971 06cfe93 20436a6 d6ed968 dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 06cfe93 20436a6 d6ed968 dcf3971 06cfe93 d6ed968 06cfe93 20436a6 d6ed968 06cfe93 d6ed968 06cfe93 d6ed968 06cfe93 20436a6 dcf3971 d6ed968 20436a6 dcf3971 06cfe93 dcf3971 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 d6ed968 20436a6 d6ed968 20436a6 06cfe93 d455ce2 d6ed968 20436a6 d6ed968 06cfe93 d6ed968 20436a6 06cfe93 d6ed968 06cfe93 d6ed968 06cfe93 20436a6 06cfe93 20436a6 06cfe93 d6ed968 06cfe93 20436a6 d6ed968 06cfe93 20436a6 06cfe93 20436a6 d6ed968 06cfe93 d6ed968 20436a6 06cfe93 20436a6 06cfe93 dcf3971 20436a6 dcf3971 06cfe93 dcf3971 06cfe93 20436a6 06cfe93 20436a6 06cfe93 d6ed968 20436a6 06cfe93 dcf3971 20436a6 dcf3971 06cfe93 dcf3971 d6ed968 dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 06cfe93 d6ed968 06cfe93 dcf3971 20436a6 dcf3971 06cfe93 dcf3971 06cfe93 05c88fd 06cfe93 20436a6 dcf3971 06cfe93 20436a6 dcf3971 06cfe93 20436a6 06cfe93 20436a6 06cfe93 05c88fd 06cfe93 05c88fd 06cfe93 20436a6 06cfe93 dcf3971 05c88fd 06cfe93 05c88fd 06cfe93 05c88fd 06cfe93 dcf3971 05c88fd 06cfe93 dcf3971 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 06cfe93 20436a6 dcf3971 06cfe93 d6ed968 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 
115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 | import gradio as gr
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools # For generating pairs
import os
import io # Required for Google Drive upload
# --- Configuration ---
# Input data files, loaded once at startup by load_data_and_model().
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
# Sentence-transformers model used for ALL embeddings (categories + technologies).
MODEL_NAME = 'all-MiniLM-L6-v2'
CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match as "confident"
MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 3 # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"
# --- Global Variables ---
# Module-level caches populated by load_data_and_model(); read by the helpers below.
categories_data = {}
category_names = []
category_embeddings = None
technologies_df = pd.DataFrame()
technology_embeddings = None # Will store pre-computed embeddings for descriptions
model = None
###- GOOGLE DRIVE API
# Check if running in an environment where Google Credentials are set.
# Drive upload is strictly optional; without credentials (or without the
# Google client libraries) the app keeps working and upload becomes a no-op.
GOOGLE_CREDENTIALS = os.environ.get("GOOGLE_CREDENTIALS")
FOLDER_ID = os.getenv("FOLDER_ID")  # Optional: Folder ID for uploads

# Only import Google libraries if credentials are potentially available
if GOOGLE_CREDENTIALS:
    try:
        from google.oauth2 import service_account
        from googleapiclient.discovery import build
        from googleapiclient.http import MediaIoBaseDownload, MediaIoBaseUpload
        GOOGLE_API_AVAILABLE = True
        print("Google API libraries loaded.")
    except ImportError:
        print("Warning: Google API libraries not found. Google Drive upload will be disabled.")
        GOOGLE_API_AVAILABLE = False
else:
    print("Warning: GOOGLE_CREDENTIALS environment variable not set. Google Drive upload will be disabled.")
    GOOGLE_API_AVAILABLE = False

if not GOOGLE_API_AVAILABLE:
    # BUG FIX: the no-op stub used to be defined only on the missing-credentials
    # path, so "credentials set but libraries missing" left the name undefined
    # and any later call raised NameError. Define the stub for EVERY
    # unavailable path; the real implementation below replaces it when the
    # Google API is usable.
    def create_new_file_in_drive(*args, **kwargs):
        print("Google Drive upload skipped: Credentials not configured.")
        return None
if GOOGLE_API_AVAILABLE:
    def create_new_file_in_drive(username, dataframe_to_upload, credentials_json_str, folder_id):
        """Create a new CSV file in Google Drive from a pandas DataFrame.

        Args:
            username: Used only to build the output filename.
            dataframe_to_upload: DataFrame serialized as ';'-separated CSV.
            credentials_json_str: Service-account credentials as a JSON string.
            folder_id: Destination Drive folder ID; if falsy, the file goes to
                the service account's root.

        Returns:
            The created file's Drive ID, or None on any failure.
        """
        print(f"Attempting to upload results for user: {username}")
        if not credentials_json_str:
            print("Error: Google Credentials JSON string is empty.")
            return None
        if not folder_id:
            # Deliberately non-fatal: upload proceeds without a parent folder.
            print("Warning: Google Drive FOLDER_ID not specified. Upload might fail or go to root.")
            # Decide if you want to default to root or fail
            # return None # Option: Fail if no folder ID
        try:
            creds_dict = json.loads(credentials_json_str)
        except json.JSONDecodeError as e:
            print(f"Error decoding Google Credentials JSON: {e}")
            return None
        try:
            # Load the service-account credentials
            creds = service_account.Credentials.from_service_account_info(creds_dict)
            # Build the Drive API client
            service = build('drive', 'v3', credentials=creds)
            # Serialize the DataFrame to an in-memory CSV buffer (no temp file)
            csv_buffer = io.BytesIO()
            # Ensure UTF-8 encoding, especially with BOM for Excel compatibility if needed
            dataframe_to_upload.to_csv(csv_buffer, index=False, sep=';', encoding='utf-8-sig')
            csv_buffer.seek(0)
            # Build the file metadata
            filename = f"rating-results-{username}.csv" # Consider adding a timestamp
            file_metadata = {'name': filename}
            if folder_id:
                file_metadata['parents'] = [folder_id]
            # Upload the CSV file to Google Drive
            media = MediaIoBaseUpload(csv_buffer, mimetype='text/csv', resumable=True)
            file = service.files().create(body=file_metadata, media_body=media, fields='id, name, webViewLink').execute()
            print(f"File '{file.get('name')}' created successfully in Google Drive. ID: {file.get('id')}")
            print(f"Link: {file.get('webViewLink')}") # Optional: print link
            return file.get('id')
        except Exception as e:
            print(f"Error during Google Drive upload: {e}")
            # Consider more specific error handling (e.g., authentication errors)
            return None
###-
# --- Load Data and Model (Load once at startup) ---
def load_data_and_model():
    """Load the category JSON, the technology spreadsheet, and the
    sentence-transformer model, then pre-compute all embeddings into the
    module-level globals declared above.

    Raises:
        FileNotFoundError: If either input file is missing.
        ValueError: If the technology sheet lacks required columns.
    """
    global categories_data, category_names, category_embeddings
    global technologies_df, technology_embeddings, model
    print("Loading data and model...")
    try:
        # Load Categories
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f: # Specify encoding
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # One embeddable text per category: "name: keyword1, keyword2, ..."
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")
        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        # Clean column names (remove leading/trailing spaces)
        technologies_df.columns = technologies_df.columns.str.strip()
        # Ensure required columns exist
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # NOTE(review): if the 'category' column is absent, .get returns the ''
        # str default and the chained .fillna would raise AttributeError —
        # confirm the sheet always carries this column.
        technologies_df['category'] = technologies_df.get('category', '').fillna('').astype(str) # Use .get for optional category
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        # Positional ID (row index); used to index into technology_embeddings later.
        technologies_df['tech_id'] = technologies_df.index
        print(f"Loaded {len(technologies_df)} technologies.")
        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True, show_progress_bar=True)
        print("Category embeddings computed.")
        # Pre-compute technology description embeddings (row i of the tensor
        # corresponds to tech_id i in technologies_df).
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True, show_progress_bar=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
    except FileNotFoundError as e:
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        raise e
    except Exception as e:
        print(f"ERROR loading data or model: {e}")
        raise e
# --- Helper Functions ---
def find_best_category(problem_description):
    """
    Find the most relevant category for *problem_description* using the
    pre-computed category embeddings. Informational only — the result is not
    used to filter technologies.

    Returns:
        Tuple of (best_category_name or None, best_score, is_confident).
    """
    # Guard clause; `or` short-circuits, so an empty description never touches
    # the module globals.
    if not problem_description or not category_names or category_embeddings is None:
        # BUG FIX: this path previously returned a 2-tuple while every other
        # path (and the caller, which unpacks three values) uses a 3-tuple,
        # causing a ValueError on empty input.
        return None, 0.0, False
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        # Return the best category regardless of threshold, but indicate confidence
        best_category_name = category_names[best_idx.item()]
        best_category_score = best_score.item()
        # Decide if the match is confident enough to strongly suggest
        is_confident = best_category_score >= CATEGORY_SIMILARITY_THRESHOLD
        return best_category_name, best_category_score, is_confident
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
# --- Technology relevance (cross-category) ---
def find_relevant_technologies(problem_description):
    """
    Score ALL technologies against the problem description (category is not
    used for filtering) and return the top MAX_TECHNOLOGIES_TO_SHOW as a
    DataFrame with an added 'similarity_score_problem' column.

    Returns:
        pandas.DataFrame — top technologies sorted by relevance (may be empty).
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        if problem_embedding.ndim == 1:
            problem_embedding = problem_embedding.unsqueeze(0)  # batch dimension for cos_sim
        # Only rows with a matching pre-computed embedding can be scored
        # (replaces the old per-row bounds check; tech_id i == embedding row i).
        n_scored = min(len(technologies_df), technology_embeddings.shape[0])
        if n_scored < len(technologies_df):
            print(f"Warning: only {n_scored} of {len(technologies_df)} technologies have embeddings; extra rows skipped.")
        if n_scored == 0:
            print("No technologies found or scored.")
            return pd.DataFrame()
        # PERF: one batched cosine-similarity call replaces the previous
        # Python loop that invoked pytorch_cos_sim once per technology.
        scores = util.pytorch_cos_sim(problem_embedding, technology_embeddings[:n_scored])[0]
        k = min(MAX_TECHNOLOGIES_TO_SHOW, n_scored)
        top_scores, top_indices = torch.topk(scores, k)
        # tech_id was assigned from the default RangeIndex, so positional
        # selection matches embedding rows.
        relevant_df = technologies_df.iloc[top_indices.tolist()].reset_index(drop=True)
        relevant_df['similarity_score_problem'] = [s.item() for s in top_scores]
        return relevant_df  # Top N technologies by similarity to the problem
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        import traceback
        traceback.print_exc()  # Full traceback for debugging
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df):
    """
    Rank pairs of the already-selected relevant technologies by how similar
    their descriptions are to EACH OTHER, using pre-computed embeddings.

    Returns:
        List of ((name_a, name_b), inter_similarity) tuples, best first,
        truncated to MAX_TECHNOLOGY_PAIRS_TO_SEARCH. Empty list when fewer
        than two technologies (or no embeddings) are available.
    """
    # Need at least two technologies and the embedding tensor to pair anything.
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df. Cannot proceed with pairing.")
        return []
    candidate_ids = relevant_technologies_df['tech_id'].tolist()
    # Map tech_id -> display name for the subset we are pairing.
    id_to_name = pd.Series(relevant_technologies_df['technology'].values, index=relevant_technologies_df['tech_id']).to_dict()
    scored_pairs = []
    for id_a, id_b in itertools.combinations(candidate_ids, 2):
        try:
            # Defensive bounds check before indexing the embedding tensor.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds for embeddings. Skipping pair.")
                continue
            vec_a = technology_embeddings[id_a]
            vec_b = technology_embeddings[id_b]
            # Normalize shapes: cos_sim wants vectors, not stray batch dims.
            if vec_a.ndim > 1:
                vec_a = vec_a.squeeze()
            if vec_b.ndim > 1:
                vec_b = vec_b.squeeze()
            if vec_a.ndim == 0 or vec_b.ndim == 0:
                print(f"Warning: Invalid embedding dimension after squeeze for pair ({id_a}, {id_b}). Skipping.")
                continue
            inter_similarity = util.pytorch_cos_sim(vec_a, vec_b)[0][0].item()
            raw_name_a = id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            raw_name_b = id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            display_a = re.sub(r'^- Title\s*:\s*', '', str(raw_name_a)).strip()
            display_b = re.sub(r'^- Title\s*:\s*', '', str(raw_name_b)).strip()
            scored_pairs.append(((display_a, display_b), inter_similarity))
        except IndexError:
            print(f"Warning: Could not find pre-computed embedding for index {id_a} or {id_b}. Skipping pair.")
            continue
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            import traceback
            traceback.print_exc()
            continue
    # Best inter-similarity first; keep only the top K for the API search.
    scored_pairs.sort(key=lambda entry: entry[1], reverse=True)
    return scored_pairs[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH]
def search_solutions_for_pairs(problem_description, top_pairs):
    """
    Search for solutions/patents for each technology pair via the external
    search API and format the findings as Markdown.

    Args:
        problem_description: The user's problem text (embedded in each query).
        top_pairs: List of ((tech_a, tech_b), inter_similarity) tuples, as
            produced by find_top_technology_pairs.

    Returns:
        A Markdown string: per-pair result links, per-pair error notes, or an
        explanatory message when nothing could be searched.
    """
    results = {}  # Keyed by "tech_a + tech_b"; insertion order == pair ranking
    if not top_pairs or not problem_description:
        # Provide a more informative message if no pairs were generated
        if not top_pairs:
            return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n"
        else: # problem_description must be missing
            return "Problem description is missing. Cannot search for solutions.\n"
    headers = {'accept': 'application/json'}
    for pair_names, pair_score in top_pairs:
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue # Skip if names are invalid
        # Query combines both technology names with the problem context.
        query = f'Combining {tech_a_name} and {tech_b_name} for applications related to {problem_description}' # Use snippet of problem
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        # CLEANUP: removed the unused urllib-encoded URL that was built here —
        # requests encodes `params` itself and the manual string was dead code.
        pair_key = f"{tech_a_name} + {tech_b_name}" # Key for storing results
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query: {query}") # Log query separately
        try:
            # NOTE: with requests, `params=` goes into the URL query string
            # even on POST (the request body stays empty) — the previous
            # comment claiming body transport was inaccurate.
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45) # Generous timeout
            response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
            try:
                api_response = response.json()
            except json.JSONDecodeError:
                err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
                print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
                results[pair_key] = {"score": pair_score, "error": err_msg}
                continue # Skip to next pair
            # Normalize the response to a list of result dicts; the API has
            # been observed to answer with a bare list, 'results', or
            # 'references'. Unknown shapes fall through to "no results".
            search_results = []
            if isinstance(api_response, list):
                search_results = api_response # Assumes list of dicts like {'title': '...', 'url': '...'}
            elif isinstance(api_response, dict) and 'results' in api_response and isinstance(api_response['results'], list):
                search_results = api_response['results']
            elif isinstance(api_response, dict) and 'references' in api_response and isinstance(api_response['references'], list):
                search_results = api_response['references']
            else:
                print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response: {api_response}")
                search_results = [] # Default to empty if format unknown
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link')) # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {
                "score": pair_score, # Store pair score for context
                "links": valid_links
            }
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            import traceback
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}
    # Format results for display
    output = f"### Potential Solutions & Patents (Found using Top {len(results)} Technology Pairs):\n\n"
    if not results:
        output += "No search results could be retrieved from the API for the generated technology pairs."
        return output
    # Display results in the order they were searched (already sorted by pair score)
    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n" # More precision
        if "error" in search_data:
            output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    # Ensure title is a string before replacing
                    title_str = str(link_info.get('title', 'N/A'))
                    # Basic sanitization so titles can't break the Markdown link
                    title_sanitized = title_str.replace('[','(').replace(']',')')
                    output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            output += "- *Unknown search result state.*\n"
        output += "\n" # Add space between pairs
    return output
# --- Main Processing Function ---
def process_problem(problem_description):
    """
    Gradio entry point. Pipeline: categorize the problem (informational),
    rank technologies against it, pick the most mutually-similar technology
    pairs, query the solution-search API, and assemble a Markdown report.
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---") # Log start
    if not problem_description:
        return "Please enter a problem description."
    # Step 1: best-matching category (informational only).
    best_cat, best_cat_score, confident = find_best_category(problem_description)
    if best_cat:
        confidence_label = "(Confident Match)" if confident else "(Possible Match)"
        category_section = f"**Best Matching Category:** {best_cat} {confidence_label} (Similarity Score: {best_cat_score:.3f})"
    else:
        category_section = "**Could not identify a matching category.**"
    print(f"Category identified: {best_cat} (Score: {best_cat_score:.3f}, Confident: {confident})")
    # Step 2: technologies ranked directly against the problem text.
    ranked_df = find_relevant_technologies(problem_description)
    print(f"Found {len(ranked_df)} relevant technologies based on problem similarity.")
    if ranked_df.empty:
        tech_section = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
    else:
        tech_lines = [f"### Top {len(ranked_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"]
        for _, tech_row in ranked_df.iterrows():
            display_name = re.sub(r'^- Title\s*:\s*', '', str(tech_row.get('technology', 'N/A'))).strip()
            relevance = tech_row.get('similarity_score_problem', 0.0)
            tech_lines.append(f"- **{display_name}** (Problem Relevance: {relevance:.3f})\n")
            listed_category = str(tech_row.get('category', 'Unknown')).strip()
            if listed_category:
                tech_lines.append(f" *Original Category listed as: {listed_category}*\n")
        tech_lines.append("\n---\n") # Section separator
        tech_section = "".join(tech_lines)
    # Step 3: most mutually-similar pairs among those technologies.
    top_pairs = find_top_technology_pairs(ranked_df)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    if top_pairs:
        pair_lines = [f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"]
        pair_lines.extend(
            f"- **{names[0]} + {names[1]}** (Inter-Similarity: {inter_score:.3f})\n"
            for names, inter_score in top_pairs
        )
        pair_lines.append("\n---\n")
        pairs_section = "".join(pair_lines)
    else:
        pairs_section = "No technology pairs identified to search with.\n\n---\n"
    # Step 4: external API search using the selected pairs.
    solutions_section = search_solutions_for_pairs(problem_description, top_pairs)
    print("API search for solutions completed.")
    # Step 5: stitch all sections into the final Markdown report.
    report = (
        f'## Analysis Results for: "{problem_description[:150]}..."\n\n'
        f"{category_section}\n\n"
        f"{tech_section}"
        f"{pairs_section}"
        f"{solutions_section}"
    )
    print("--- Processing finished ---")
    return report
# --- Create Gradio Interface ---
print("Setting up Gradio interface...")
# Load data only once when the script starts; any failure disables the real UI.
try:
    load_data_and_model()
    interface_enabled = True
except Exception as e:
    print(f"FATAL: Failed to initialize application. Error: {e}")
    interface_enabled = False
# Only create interface if initialization succeeded
if interface_enabled:
    iface = gr.Interface(
        fn=process_problem,
        inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here... e.g., 'Develop low-latency communication protocols for 6G networks'"),
        outputs=gr.Markdown(label="Analysis and Potential Solutions"),
        title="Technical Problem Analyzer v4 (Cross-Category Relevance)",
        description=(
            "Enter a technical problem. The app:\n"
            "1. Identifies the best matching **category** (for informational purposes).\n"
            "2. Finds the **most relevant technologies** based *directly on your problem description* (across all categories).\n"
            "3. Identifies **promising pairs** among these relevant technologies based on their similarity to each other.\n"
            "4. Searches for **patents/research** using these pairs via an external API."
        ),
        examples=[
            ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
            ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
            ["Develop low-latency communication protocols for 6G networks"],
            ["Design efficient routing algorithms for large scale mesh networks in smart cities"],
            ["Create biodegradable packaging material from agricultural waste"], # Example crossing categories potentially
            ["Develop a method for real-time traffic prediction using heterogeneous data sources"]
        ],
        allow_flagging='never',
        # Add theme for better visuals if desired
        # theme=gr.themes.Soft()
    )
else:
    # Fallback interface shown when startup failed (e.g. missing data files),
    # so the Space still renders something instead of crashing on import.
    def error_fn():
        return "Application failed to initialize. Please check the logs for errors (e.g., missing files or model issues)."
    iface = gr.Interface(fn=error_fn, inputs=[], outputs=gr.Markdown(), title="Initialization Failed")
# --- Launch the App ---
if __name__ == "__main__":
    print("Launching Gradio app...")
    # Consider adding share=True for public link if running on appropriate infra
    # debug=True can be helpful during development
    iface.launch()