# heymenn's picture
# Update app.py
# 9be7b6e verified
import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools # For generating pairs
import os
import io # Keep for potential future use (e.g., local download)
import traceback # Keep for error logging
# -- Fix SSL error: point requests at the system CA bundle (Debian/Ubuntu path)
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"  # JSON file: {"Category": {name: [keywords, ...]}}
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"  # Excel sheet with 'technology' and 'description' columns
MODEL_NAME = 'all-MiniLM-L6-v2'  # Sentence-Transformers model used for all embeddings
CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5 # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"  # External patent/paper search endpoint
# --- Removed Google Drive Config ---
# --- Global Variables (will be managed by Streamlit's caching) ---
# These are loaded once via the cached function below
# --- Removed Google Drive API Setup ---
# --- Removed Google Drive Function ---
# --- Load Data and Model (Cached) ---
@st.cache_resource  # Cache the model, dataframes and embeddings across Streamlit reruns
def load_data_and_model():
    """Loads data files and the Sentence Transformer model once.

    Returns:
        A tuple (model, categories_data, category_names, category_embeddings,
        technologies_df, technology_embeddings) on success, or None when a
        file is missing or loading fails (errors are surfaced via st.error).
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories: {"Category": {name: [keywords, ...], ...}}
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # Embed "<name>: kw1, kw2, ..." so both the label and its keywords contribute.
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")
        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # BUG FIX: DataFrame.get('category', '') returns the plain string ''
        # when the column is absent, and str has no .fillna() -> AttributeError.
        # Handle the missing-column case explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index  # Row index doubles as a stable unique ID
        print(f"Loaded {len(technologies_df)} technologies.")
        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")
        # Pre-compute technology description embeddings (row order matches tech_id)
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None  # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None  # Indicate failure
# --- Helper Functions (unchanged, use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Match a problem description against the pre-computed category embeddings.

    Returns a (category_name, score, is_confident) tuple; the name is None and
    the score 0.0 when inputs are unusable or the encoding step fails.
    """
    # Bail out early when any required input is missing.
    if category_embeddings is None or not category_names or not problem_description:
        return None, 0.0, False
    try:
        query_vector = model.encode(problem_description, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_vector, category_embeddings)[0]
        top_score, top_index = torch.max(scores, dim=0)
        score_value = top_score.item()
        winner = category_names[top_index.item()]
        # Confidence is a simple threshold on the cosine similarity.
        return winner, score_value, score_value >= CATEGORY_SIMILARITY_THRESHOLD
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem and keep the top N.

    Returns a copy of the technologies dataframe restricted to the
    MAX_TECHNOLOGIES_TO_SHOW rows with the highest 'similarity_score_problem',
    or an empty dataframe when inputs are missing or scoring fails.
    """
    if not problem_description or technologies_df.empty or technology_embeddings is None:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        query_vector = model.encode(problem_description, convert_to_tensor=True)
        # One matrix operation scores the problem against every description.
        scores = util.pytorch_cos_sim(query_vector, technology_embeddings)[0]
        scored_df = technologies_df.copy()
        scored_df['similarity_score_problem'] = scores.cpu().numpy()
        # nlargest both sorts and truncates to the display limit.
        return scored_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Calculates similarity between pairs of relevant technologies.

    Builds every unordered pair of the relevant technologies, scores each pair
    by the cosine similarity of its two description embeddings, and returns at
    most MAX_TECHNOLOGY_PAIRS_TO_SEARCH pairs: the highest-scoring ones plus
    the two lowest-scoring ones (a "min/max" mix so the downstream search
    covers both closely related and contrasting combinations).

    Returns:
        A list of ((name_a, name_b), inter_similarity) tuples.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    pairs_with_scores = []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values, index=relevant_technologies_df['tech_id']).to_dict()
    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Guard against ids that fall outside the embedding matrix.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0), embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue
    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # BUG FIX: the previous slices ([:MAX-2] then [MAX-3:]) overlapped, which
    # duplicated the pair at index MAX-3 and returned ALL remaining pairs
    # instead of capping the result at MAX_TECHNOLOGY_PAIRS_TO_SEARCH.
    # Use disjoint slices: top (MAX - 2) pairs plus the 2 lowest-scoring of
    # the remainder, which never exceeds the configured maximum.
    top_count = max(MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2, 0)
    head = pairs_with_scores[:top_count]
    tail = pairs_with_scores[top_count:][-2:]
    return head + tail
def search_solutions_for_pairs(problem_description, top_pairs):
    """Searches for solutions/patents using pairs of technologies via the API.

    For each ((name_a, name_b), score) pair, POSTs a focused query to
    SEARCH_API_URL and normalises the response into {'title', 'link'} dicts.

    Returns:
        (markdown_text, results_dict) where results_dict maps
        "TechA + TechB" -> {"score": float, "links": [...]} on success or
        {"score": float, "error": str} on failure.
    """
    results = {}
    if not top_pairs:
        # Return value modified for clarity
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results
    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"
    for pair_info in top_pairs:
        pair_names, pair_score = pair_info
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue
        # Keep the query focused: only the first 100 chars of the problem.
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...'
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json()  # Assume JSON response
            search_results = []
            # Normalise the response: the API may return a bare list, a dict
            # holding the list under one of several keys, or a single item.
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list):
                    search_results = api_response['links']
                elif 'title' in api_response and ('url' in api_response or 'link' in api_response):
                    search_results = [api_response]  # Single result dict: wrap it in a list
                else:
                    print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # Keep only results that carry a plausible absolute http(s) URL.
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link'))  # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        # BUG FIX: this handler must precede RequestException. With
        # requests >= 2.27, response.json() raises
        # requests.exceptions.JSONDecodeError, which subclasses BOTH
        # json.JSONDecodeError AND RequestException, so listing the generic
        # RequestException clause first made this branch unreachable.
        except json.JSONDecodeError:
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}
    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results  # Return formatted string and raw results dict
    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    title_str = str(link_info.get('title', 'N/A'))
                    # Square brackets would break the markdown link syntax.
                    title_sanitized = title_str.replace('[', '(').replace(']', ')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"
    return api_output, results  # Return formatted string and raw results dict
# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """Run the full analysis pipeline for one problem description.

    Orchestrates categorisation, technology ranking, pair selection and the
    external solution search, then assembles a single markdown report.

    Returns:
        (report_text, relevant_technologies_dataframe)
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # Callers should normally catch this before invoking us.
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # Step 1: best-matching category (informational only).
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if category_name:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    else:
        category_output = "**Could not identify a matching category.**"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # Step 2: technologies ranked by similarity to the problem itself.
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
    if relevant_technologies_df.empty:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
    else:
        tech_lines = [f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_lines.append(f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n")
            if original_cats:
                tech_lines.append(f" *Original Category listed as: {original_cats}*\n")
        tech_lines.append("\n---\n")
        tech_output = "".join(tech_lines)

    # Step 3: pairs among the relevant technologies.
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    pairs_output = ""
    if top_pairs:
        pair_lines = [f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"]
        for pair_names, score in top_pairs:
            pair_lines.append(f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n")
        pair_lines.append("\n---\n")
        pairs_output = "".join(pair_lines)

    # Step 4: external search for solutions using those pairs.
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)
    print("API search for solutions completed.")

    # Step 5: assemble the final markdown report.
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text
    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df
# --- Streamlit UI ---
def main():
    """Streamlit entry point: builds the page and drives the analysis pipeline."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
Enter a technical problem. The app will:
1. Identify the best matching **category** (for informational purposes).
2. Find the **most relevant technologies** based *directly on your problem description*.
3. Identify **promising pairs** among these relevant technologies based on their similarity.
4. Search for **patents/research** using these pairs via an external API.
"""
    )
    # Heavy resources come from the cached loader; abort the page on failure.
    resources = load_data_and_model()
    if resources is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop()

    # Offer a few canned problems the user can pick instead of typing.
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    example_choice = st.selectbox("Select an example or enter your own below:", [""] + examples)

    # Free-text input, pre-filled with the selected example (if any).
    problem_text = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=example_choice
    )

    run_clicked = st.button("Analyze Problem")
    if run_clicked:
        if not problem_text:
            st.warning("Please enter a problem description.")
        else:
            with st.spinner("Analyzing problem and searching for solutions..."):
                report_text, relevant_tech_df = process_problem(problem_text, resources)
            st.markdown("---")  # Separator
            st.markdown(report_text)
            # Show the underlying data table alongside the markdown report.
            if not relevant_tech_df.empty:
                st.markdown("---")
                st.subheader("Relevant Technologies Data")
                st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
# --- Run the App --- (standard script entry guard; no-op when imported)
if __name__ == "__main__":
    main()