# heymenn's picture
# Update app.py
# 9be7b6e verified
import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools # For generating pairs
import os
import io # Keep for potential future use (e.g., local download)
import traceback # Keep for error logging
# -- Fix SSL error: point requests at the system CA bundle (Debian/Ubuntu path)
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"  # JSON file: {"Category": {name: [keywords, ...]}}
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"  # Excel sheet with 'technology' and 'description' columns
MODEL_NAME = 'all-MiniLM-L6-v2'  # Sentence-Transformers model used for all embeddings
CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5 # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"  # External patent/paper search endpoint
# --- Removed Google Drive Config ---
# --- Global Variables (will be managed by Streamlit's caching) ---
# These are loaded once via the cached function below
# --- Removed Google Drive API Setup ---
# --- Removed Google Drive Function ---
# --- Load Data and Model (Cached) ---
@st.cache_resource  # Cache the model, dataframes and embeddings across Streamlit reruns
def load_data_and_model():
    """Loads data files and the Sentence Transformer model once.

    Returns:
        A tuple (model, categories_data, category_names, category_embeddings,
        technologies_df, technology_embeddings) on success, or None when a
        file is missing or loading fails (errors are surfaced via st.error).
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories: {"Category": {name: [keywords, ...], ...}}
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # Embed "<name>: kw1, kw2, ..." so both the label and its keywords contribute.
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")
        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # BUG FIX: DataFrame.get('category', '') returns the plain string ''
        # when the column is absent, and str has no .fillna() -> AttributeError.
        # Handle the missing-column case explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index  # Row index doubles as a stable unique ID
        print(f"Loaded {len(technologies_df)} technologies.")
        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")
        # Pre-compute technology description embeddings (row order matches tech_id)
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None  # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None  # Indicate failure
# --- Helper Functions (unchanged, use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Match a problem description against the pre-computed category embeddings.

    Returns a (category_name, score, is_confident) tuple; the name is None and
    the score 0.0 when inputs are unusable or the encoding step fails.
    """
    # Bail out early when any required input is missing.
    if category_embeddings is None or not category_names or not problem_description:
        return None, 0.0, False
    try:
        query_vector = model.encode(problem_description, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_vector, category_embeddings)[0]
        top_score, top_index = torch.max(scores, dim=0)
        score_value = top_score.item()
        winner = category_names[top_index.item()]
        # Confidence is a simple threshold on the cosine similarity.
        return winner, score_value, score_value >= CATEGORY_SIMILARITY_THRESHOLD
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem and keep the top N.

    Returns a copy of the technologies dataframe restricted to the
    MAX_TECHNOLOGIES_TO_SHOW rows with the highest 'similarity_score_problem',
    or an empty dataframe when inputs are missing or scoring fails.
    """
    if not problem_description or technologies_df.empty or technology_embeddings is None:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        query_vector = model.encode(problem_description, convert_to_tensor=True)
        # One matrix operation scores the problem against every description.
        scores = util.pytorch_cos_sim(query_vector, technology_embeddings)[0]
        scored_df = technologies_df.copy()
        scored_df['similarity_score_problem'] = scores.cpu().numpy()
        # nlargest both sorts and truncates to the display limit.
        return scored_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Calculates similarity between pairs of relevant technologies.

    Builds every unordered pair of the relevant technologies, scores each pair
    by the cosine similarity of its two description embeddings, and returns at
    most MAX_TECHNOLOGY_PAIRS_TO_SEARCH pairs: the highest-scoring ones plus
    the two lowest-scoring ones (a "min/max" mix so the downstream search
    covers both closely related and contrasting combinations).

    Returns:
        A list of ((name_a, name_b), inter_similarity) tuples.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    pairs_with_scores = []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values, index=relevant_technologies_df['tech_id']).to_dict()
    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Guard against ids that fall outside the embedding matrix.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0), embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue
    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # BUG FIX: the previous slices ([:MAX-2] then [MAX-3:]) overlapped, which
    # duplicated the pair at index MAX-3 and returned ALL remaining pairs
    # instead of capping the result at MAX_TECHNOLOGY_PAIRS_TO_SEARCH.
    # Use disjoint slices: top (MAX - 2) pairs plus the 2 lowest-scoring of
    # the remainder, which never exceeds the configured maximum.
    top_count = max(MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2, 0)
    head = pairs_with_scores[:top_count]
    tail = pairs_with_scores[top_count:][-2:]
    return head + tail
def search_solutions_for_pairs(problem_description, top_pairs):
    """Searches for solutions/patents using pairs of technologies via the API.

    For each ((name_a, name_b), score) pair, POSTs a focused query to
    SEARCH_API_URL and normalises the response into {'title', 'link'} dicts.

    Returns:
        (markdown_text, results_dict) where results_dict maps
        "TechA + TechB" -> {"score": float, "links": [...]} on success or
        {"score": float, "error": str} on failure.
    """
    results = {}
    if not top_pairs:
        # Return value modified for clarity
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results
    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"
    for pair_info in top_pairs:
        pair_names, pair_score = pair_info
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue
        # Keep the query focused: only the first 100 chars of the problem.
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...'
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json()  # Assume JSON response
            search_results = []
            # Normalise the response: the API may return a bare list, a dict
            # holding the list under one of several keys, or a single item.
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list):
                    search_results = api_response['links']
                elif 'title' in api_response and ('url' in api_response or 'link' in api_response):
                    search_results = [api_response]  # Single result dict: wrap it in a list
                else:
                    print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # Keep only results that carry a plausible absolute http(s) URL.
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link'))  # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        # BUG FIX: this handler must precede RequestException. With
        # requests >= 2.27, response.json() raises
        # requests.exceptions.JSONDecodeError, which subclasses BOTH
        # json.JSONDecodeError AND RequestException, so listing the generic
        # RequestException clause first made this branch unreachable.
        except json.JSONDecodeError:
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}
    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results  # Return formatted string and raw results dict
    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    title_str = str(link_info.get('title', 'N/A'))
                    # Square brackets would break the markdown link syntax.
                    title_sanitized = title_str.replace('[', '(').replace(']', ')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"
    return api_output, results  # Return formatted string and raw results dict
# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """Run the full analysis pipeline for one problem description.

    Orchestrates categorisation, technology ranking, pair selection and the
    external solution search, then assembles a single markdown report.

    Returns:
        (report_text, relevant_technologies_dataframe)
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # Callers should normally catch this before invoking us.
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # Step 1: best-matching category (informational only).
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if category_name:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    else:
        category_output = "**Could not identify a matching category.**"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # Step 2: technologies ranked by similarity to the problem itself.
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
    if relevant_technologies_df.empty:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
    else:
        tech_lines = [f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_lines.append(f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n")
            if original_cats:
                tech_lines.append(f" *Original Category listed as: {original_cats}*\n")
        tech_lines.append("\n---\n")
        tech_output = "".join(tech_lines)

    # Step 3: pairs among the relevant technologies.
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    pairs_output = ""
    if top_pairs:
        pair_lines = [f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"]
        for pair_names, score in top_pairs:
            pair_lines.append(f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n")
        pair_lines.append("\n---\n")
        pairs_output = "".join(pair_lines)

    # Step 4: external search for solutions using those pairs.
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)
    print("API search for solutions completed.")

    # Step 5: assemble the final markdown report.
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text
    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df
# --- Streamlit UI ---
def main():
    """Streamlit entry point: builds the page and drives the analysis pipeline."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
Enter a technical problem. The app will:
1. Identify the best matching **category** (for informational purposes).
2. Find the **most relevant technologies** based *directly on your problem description*.
3. Identify **promising pairs** among these relevant technologies based on their similarity.
4. Search for **patents/research** using these pairs via an external API.
"""
    )
    # Heavy resources come from the cached loader; abort the page on failure.
    resources = load_data_and_model()
    if resources is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop()

    # Offer a few canned problems the user can pick instead of typing.
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    example_choice = st.selectbox("Select an example or enter your own below:", [""] + examples)

    # Free-text input, pre-filled with the selected example (if any).
    problem_text = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=example_choice
    )

    run_clicked = st.button("Analyze Problem")
    if run_clicked:
        if not problem_text:
            st.warning("Please enter a problem description.")
        else:
            with st.spinner("Analyzing problem and searching for solutions..."):
                report_text, relevant_tech_df = process_problem(problem_text, resources)
            st.markdown("---")  # Separator
            st.markdown(report_text)
            # Show the underlying data table alongside the markdown report.
            if not relevant_tech_df.empty:
                st.markdown("---")
                st.subheader("Relevant Technologies Data")
                st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
# --- Run the App --- (standard script entry guard; no-op when imported)
if __name__ == "__main__":
    main()