Spaces:
Runtime error
Runtime error
File size: 21,518 Bytes
9be7b6e dcf3971 20436a6 dcf3971 20436a6 9be7b6e fdcb3d0 9be7b6e dcf3971 20436a6 06cfe93 2a63273 20436a6 9be7b6e 20436a6 9be7b6e 7ce161c 9be7b6e 7ce161c 9be7b6e 7ce161c 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 06cfe93 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 9be7b6e 06cfe93 20436a6 d6ed968 9be7b6e dcf3971 9be7b6e d6ed968 9be7b6e d6ed968 06cfe93 9be7b6e d6ed968 06cfe93 20436a6 dcf3971 d6ed968 9be7b6e 20436a6 06cfe93 9be7b6e 06cfe93 20436a6 06cfe93 20436a6 9be7b6e 06cfe93 9be7b6e 06cfe93 20436a6 9be7b6e 20436a6 06cfe93 20436a6 06cfe93 20436a6 9be7b6e 20436a6 9be7b6e d6ed968 9be7b6e d6ed968 20436a6 9be7b6e d6ed968 9be7b6e d6ed968 20436a6 d6ed968 9be7b6e d6ed968 9be7b6e d6ed968 06cfe93 20436a6 9be7b6e d6ed968 9be7b6e 20436a6 d6ed968 06cfe93 9be7b6e d6ed968 06cfe93 d6ed968 20436a6 06cfe93 9be7b6e 20436a6 06cfe93 dcf3971 9be7b6e dcf3971 06cfe93 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e d6ed968 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 06cfe93 dcf3971 06cfe93 d6ed968 9be7b6e 06cfe93 20436a6 dcf3971 06cfe93 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e 20436a6 06cfe93 05c88fd 06cfe93 9be7b6e 06cfe93 9be7b6e 06cfe93 dcf3971 9be7b6e 06cfe93 05c88fd 06cfe93 05c88fd 9be7b6e 05c88fd 9be7b6e 05c88fd 9be7b6e 05c88fd 06cfe93 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 |
import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools # For generating pairs
import os
import io # Keep for potential future use (e.g., local download)
import traceback # Keep for error logging
# -- Fix SSL error
# NOTE(review): globally forcing requests' CA bundle is a workaround for SSL
# verification failures in the hosting environment. This path is Linux
# (Debian/Ubuntu) specific — confirm it exists on the deployment target.
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"  # JSON shaped as {"Category": {name: [keywords, ...]}}
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"  # Must provide 'technology' and 'description' columns
MODEL_NAME = 'all-MiniLM-L6-v2'  # Sentence-transformers model used for all embeddings
CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5 # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"  # External paper/patent search endpoint
# --- Removed Google Drive Config ---
# --- Global Variables (will be managed by Streamlit's caching) ---
# These are loaded once via the cached function below
# --- Removed Google Drive API Setup ---
# --- Removed Google Drive Function ---
# --- Load Data and Model (Cached) ---
@st.cache_resource # Cache the model and embeddings
def load_data_and_model():
    """Load data files and the Sentence Transformer model once.

    Returns:
        tuple: (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings) on success.
        None: on any failure; an error is also surfaced in the Streamlit UI.
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # Embed each category as "name: kw1, kw2, ..." so keywords inform the match.
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")
        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # BUG FIX: DataFrame.get('category', '') returns the *string* '' when the
        # column is absent, and ''.fillna('') raises AttributeError. Handle the
        # missing-column case explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index # Use index as unique ID
        print(f"Loaded {len(technologies_df)} technologies.")
        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")
        # Pre-compute technology description embeddings (row order matches the
        # DataFrame, so tech_id can index directly into this tensor).
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None # Indicate failure
# --- Helper Functions (unchanged, use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Return (name, score, confident) for the category closest to the problem.

    Uses the pre-computed category embeddings; `confident` is True when the
    best cosine similarity reaches CATEGORY_SIMILARITY_THRESHOLD.
    Returns (None, 0.0, False) on missing inputs or any internal error.
    """
    # Guard clauses: nothing to match, or nothing to match against.
    if not problem_description or not category_names or category_embeddings is None:
        return None, 0.0, False
    try:
        query_vec = model.encode(problem_description, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_vec, category_embeddings)[0]
        top_idx = int(torch.argmax(scores).item())
        top_score = float(scores[top_idx].item())
        return (
            category_names[top_idx],
            top_score,
            top_score >= CATEGORY_SIMILARITY_THRESHOLD,
        )
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem statement.

    Returns a copy of `technologies_df` restricted to the
    MAX_TECHNOLOGIES_TO_SHOW highest-scoring rows, with an added
    'similarity_score_problem' column. Returns an empty DataFrame when inputs
    are missing or an error occurs.
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        query_vec = model.encode(problem_description, convert_to_tensor=True)
        # One batched cosine-similarity call covers every technology at once.
        scores = util.pytorch_cos_sim(query_vec, technology_embeddings)[0]
        scored_df = technologies_df.copy()
        scored_df['similarity_score_problem'] = scores.cpu().numpy()
        # Keep only the strongest matches.
        return scored_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Rank pairs of the relevant technologies by inter-description similarity.

    Returns a list of ((tech_name_a, tech_name_b), similarity) tuples with at
    most MAX_TECHNOLOGY_PAIRS_TO_SEARCH entries: the most-similar pairs plus
    the two least-similar ones (a "min-max" mix so the downstream search
    covers both closely-related and contrasting combinations). Returns [] when
    fewer than two technologies or no embeddings are available.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    pairs_with_scores = []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values,
                                index=relevant_technologies_df['tech_id']).to_dict()
    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Boundary checks: tech_id indexes directly into the embeddings tensor.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0), embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue
    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # BUG FIX: the original combined overlapping slices
    # ([:MAX-2] followed by [MAX-3:]), which duplicated the pair at index
    # MAX-3 and returned *every* remaining pair instead of capping the list,
    # triggering an unbounded number of downstream API calls. Select the top
    # (MAX-2) most-similar pairs plus the 2 least-similar ones instead.
    if len(pairs_with_scores) <= MAX_TECHNOLOGY_PAIRS_TO_SEARCH:
        return pairs_with_scores
    top_count = max(MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2, 0)
    bottom_count = MAX_TECHNOLOGY_PAIRS_TO_SEARCH - top_count
    return pairs_with_scores[:top_count] + pairs_with_scores[-bottom_count:]
def search_solutions_for_pairs(problem_description, top_pairs):
    """Searches for solutions/patents using pairs of technologies via the API.

    One POST per pair is sent to SEARCH_API_URL; failures are recorded
    per-pair and never raised to the caller.

    Args:
        problem_description: Free-text problem statement; only the first 100
            characters are folded into each search query.
        top_pairs: List of ((tech_a, tech_b), inter_similarity) tuples, as
            produced by find_top_technology_pairs.

    Returns:
        (markdown_text, results) where results maps "tech_a + tech_b" to
        {"score": float, "links": [...]} on success or
        {"score": float, "error": str} on failure.
    """
    results = {}
    if not top_pairs:
        # Return value modified for clarity
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results
    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"
    for pair_info in top_pairs:
        pair_names, pair_score = pair_info
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name: continue
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...' # Keep query focused
        # NOTE(review): the query is sent as URL query parameters on a POST —
        # presumably what this endpoint expects; confirm it does not want a JSON body.
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json() # Assume JSON response
            search_results = []
            # --- Adapt based on actual API response structure ---
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                # Try common keys for results lists
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list): # Another possibility
                    search_results = api_response['links']
                else: # Check if the dict itself contains title/url
                    if 'title' in api_response and ('url' in api_response or 'link' in api_response):
                        search_results = [api_response] # Wrap it in a list
                    else:
                        print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # --- End adaptation ---
            # Keep only entries with a well-formed absolute http(s) URL.
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link')) # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except json.JSONDecodeError:
            # NOTE(review): with requests >= 2.27, response.json() raises
            # requests.exceptions.JSONDecodeError, which also subclasses
            # RequestException — the handler above would then catch it first,
            # making this branch unreachable. Verify the installed requests
            # version if the specific JSON-error message matters.
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}
    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results # Return formatted string and raw results dict
    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    title_str = str(link_info.get('title', 'N/A'))
                    # Square brackets would break the markdown link syntax.
                    title_sanitized = title_str.replace('[','(').replace(']',')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"
    return api_output, results # Return formatted string and raw results dict
# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """Run the full analysis pipeline for one problem description.

    Steps: categorize the problem, rank technologies by relevance, select
    technology pairs, and query the external search API for solutions.
    Returns (formatted_markdown, relevant_technologies_df).
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # This case should ideally be handled before calling process_problem
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # 1. Categorize the problem.
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if not category_name:
        category_output = "**Could not identify a matching category.**"
    else:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # 2. Rank technologies by similarity to the problem.
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
    if relevant_technologies_df.empty:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
    else:
        tech_parts = [f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_parts.append(f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n")
            if original_cats:
                tech_parts.append(f" *Original Category listed as: {original_cats}*\n")
        tech_parts.append("\n---\n")
        tech_output = "".join(tech_parts)

    # 3. Select promising pairs among the relevant technologies.
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    pairs_output = ""
    if top_pairs:
        pair_parts = [f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"]
        for pair_names, score in top_pairs:
            pair_parts.append(f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n")
        pair_parts.append("\n---\n")
        pairs_output = "".join(pair_parts)

    # 4. Query the external API using the selected pairs.
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)
    print("API search for solutions completed.")

    # 5. Assemble the final markdown report.
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text
    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df
# --- Streamlit UI ---
def main():
    """Streamlit entry point: render the UI and drive the analysis pipeline."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
Enter a technical problem. The app will:
1. Identify the best matching **category** (for informational purposes).
2. Find the **most relevant technologies** based *directly on your problem description*.
3. Identify **promising pairs** among these relevant technologies based on their similarity.
4. Search for **patents/research** using these pairs via an external API.
"""
    )
    # Load data and model (cached)
    loaded_data = load_data_and_model()
    if loaded_data is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop() # Stop execution if loading failed
    # Example problems (optional)
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    selected_example = st.selectbox("Select an example or enter your own below:", [""] + examples)
    # User input
    problem_description_input = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=selected_example # Use selected example if chosen
    )
    # Button to trigger analysis
    analyze_button = st.button("Analyze Problem")
    if analyze_button and problem_description_input:
        with st.spinner("Analyzing problem and searching for solutions..."):
            # Run the main processing function
            analysis_output, relevant_tech_df = process_problem(problem_description_input, loaded_data)
        # Display results
        st.markdown("---") # Separator
        st.markdown(analysis_output) # Display formatted text results
        # --- Removed Google Drive Upload Section ---
        # You could potentially add other actions here using relevant_tech_df,
        # like displaying it as a table or offering a local download.
        # Example: Display relevant technologies table
        if not relevant_tech_df.empty:
            st.markdown("---")
            st.subheader("Relevant Technologies Data")
            st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
    elif analyze_button and not problem_description_input:
        st.warning("Please enter a problem description.")
# --- Run the App ---
if __name__ == "__main__":
    main()  # Launch the UI when executed via `streamlit run <this file>`
|