import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools  # For generating pairs
import os
import io  # Keep for potential future use (e.g., local download)
import traceback  # Keep for error logging

# -- Fix SSL error: point requests at the system CA bundle.
# Guarded with an existence check: hard-coding this Linux-only path
# unconditionally would break every HTTPS call on other platforms.
_CA_BUNDLE_PATH = '/etc/ssl/certs/ca-certificates.crt'
if os.path.exists(_CA_BUNDLE_PATH):
    os.environ['REQUESTS_CA_BUNDLE'] = _CA_BUNDLE_PATH

# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
MODEL_NAME = 'all-MiniLM-L6-v2'
CATEGORY_SIMILARITY_THRESHOLD = 0.3  # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8  # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5  # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5  # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"


# --- Load Data and Model (Cached) ---
@st.cache_resource  # Cache the model and embeddings across Streamlit reruns
def load_data_and_model():
    """Load data files and the Sentence Transformer model once.

    Returns:
        Tuple of (model, categories_data, category_names, category_embeddings,
        technologies_df, technology_embeddings) on success, or None on failure
        (errors are reported via st.error and stdout).
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # One embedding text per category: "<name>: <keyword, keyword, ...>"
        category_texts = [f"{name}: {', '.join(keywords)}"
                          for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")

        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # NOTE: DataFrame.get('category', '') would return the *string* default
        # when the column is missing, and ''.fillna() raises AttributeError —
        # so check for the column explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index  # Use index as unique ID
        print(f"Loaded {len(technologies_df)} technologies.")

        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")

        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")

        # Pre-compute technology description embeddings.
        # Row order matches technologies_df, so tech_id (the index) can be used
        # to index directly into this tensor later.
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")

        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None  # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None  # Indicate failure


# --- Helper Functions (use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Find the most relevant category using pre-computed embeddings.

    Returns:
        (category_name, score, is_confident). category_name is None and score
        0.0 when inputs are missing or an error occurs. is_confident is True
        when score >= CATEGORY_SIMILARITY_THRESHOLD.
    """
    if not problem_description or not category_names or category_embeddings is None:
        return None, 0.0, False
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        best_category_name = category_names[best_idx.item()]
        best_category_score = best_score.item()
        is_confident = best_category_score >= CATEGORY_SIMILARITY_THRESHOLD
        return best_category_name, best_category_score, is_confident
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False


def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem description.

    Returns:
        DataFrame of the top MAX_TECHNOLOGIES_TO_SHOW technologies with an
        added 'similarity_score_problem' column, or an empty DataFrame on
        missing inputs / error.
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Efficiently calculate all similarities at once
        cosine_scores = util.pytorch_cos_sim(problem_embedding, technology_embeddings)[0]
        # Add scores to a copy so the cached DataFrame is never mutated
        temp_df = technologies_df.copy()
        temp_df['similarity_score_problem'] = cosine_scores.cpu().numpy()  # Move scores to CPU and numpy
        # Sort by similarity and get top N
        relevant_df = temp_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
        return relevant_df
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()  # Print full traceback for debugging
        return pd.DataFrame()


def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Score all pairs of relevant technologies by inter-similarity.

    Returns:
        List of ((name_a, name_b), similarity) tuples, sorted by similarity
        descending, containing at most MAX_TECHNOLOGY_PAIRS_TO_SEARCH entries:
        the highest-similarity pairs plus the two lowest-similarity ones
        (a "min/max" sample). Empty list when fewer than 2 technologies.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []

    pairs_with_scores = []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values,
                                index=relevant_technologies_df['tech_id']).to_dict()

    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Boundary checks: tech_id indexes directly into the embedding tensor
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0),
                                                    embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip a leading "- Title :" artifact from the source spreadsheet
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue

    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # "Min/max" selection capped at MAX_TECHNOLOGY_PAIRS_TO_SEARCH.
    # BUGFIX: the previous slicing ([:MAX-2] + [MAX-3:]) duplicated the pair at
    # index MAX-3 and appended *every* remaining pair, ignoring the cap.
    if len(pairs_with_scores) <= MAX_TECHNOLOGY_PAIRS_TO_SEARCH:
        return pairs_with_scores
    return (pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2]
            + pairs_with_scores[-2:])


def search_solutions_for_pairs(problem_description, top_pairs):
    """Search for solutions/patents using pairs of technologies via the API.

    Args:
        problem_description: free-text problem statement (truncated in query).
        top_pairs: list of ((name_a, name_b), score) from find_top_technology_pairs.

    Returns:
        (formatted_markdown, raw_results) where raw_results maps
        "<name_a> + <name_b>" to {"score": float, "links": [...]} or
        {"score": float, "error": str}.
    """
    results = {}
    if not top_pairs:
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results

    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"

    for pair_names, pair_score in top_pairs:
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue
        # Keep query focused: truncate the problem description to 100 chars
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...'
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json()  # Assume JSON response
            search_results = []
            # --- Adapt based on actual API response structure ---
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                # Try common keys for results lists
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list):  # Another possibility
                    search_results = api_response['links']
                else:
                    # Check if the dict itself contains title/url
                    if 'title' in api_response and ('url' in api_response or 'link' in api_response):
                        search_results = [api_response]  # Wrap it in a list
                    else:
                        print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # --- End adaptation ---

            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link'))  # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except json.JSONDecodeError:
            # 'response' is guaranteed bound here: decoding only happens after a
            # successful requests.post call.
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}

    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results  # Return formatted string and raw results dict

    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    # Sanitize square brackets so titles don't break markdown links
                    title_str = str(link_info.get('title', 'N/A'))
                    title_sanitized = title_str.replace('[', '(').replace(']', ')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"

    return api_output, results  # Return formatted string and raw results dict


# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """
    Main function called by Streamlit interface. Orchestrates the process.

    Returns:
        (final_output, relevant_technologies_df): the formatted markdown
        report AND the relevant technologies DataFrame.
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # This case should ideally be handled before calling process_problem
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # 1. Categorize Problem (informational only — does not filter technologies)
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if category_name:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    else:
        category_output = "**Could not identify a matching category.**"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # 2. Find Relevant Technologies
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")

    tech_output = ""
    if not relevant_technologies_df.empty:
        tech_output += f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_output += f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n"
            if original_cats:
                tech_output += f"  *Original Category listed as: {original_cats}*\n"
        tech_output += "\n---\n"
    else:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"

    # 3. Find Top Technology Pairs
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")

    pairs_output = ""
    if top_pairs:
        pairs_output += f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"
        for pair_names, score in top_pairs:
            pairs_output += f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n"
        pairs_output += "\n---\n"
    # No 'else' needed here, handled in final assembly

    # 4. Search for Solutions using the Top Pairs
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)  # Ignore raw results dict here
    print("API search for solutions completed.")

    # 5. Combine Outputs
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text

    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df


# --- Streamlit UI ---
def main():
    """Streamlit entry point: renders the UI and wires up the analysis."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
        Enter a technical problem. The app will:
        1. Identify the best matching **category** (for informational purposes).
        2. Find the **most relevant technologies** based *directly on your problem description*.
        3. Identify **promising pairs** among these relevant technologies based on their similarity.
        4. Search for **patents/research** using these pairs via an external API.
        """
    )

    # Load data and model (cached)
    loaded_data = load_data_and_model()
    if loaded_data is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop()  # Stop execution if loading failed

    # Example problems (optional)
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    selected_example = st.selectbox("Select an example or enter your own below:", [""] + examples)

    # User input
    problem_description_input = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=selected_example  # Use selected example if chosen
    )

    # Button to trigger analysis
    analyze_button = st.button("Analyze Problem")

    if analyze_button and problem_description_input:
        with st.spinner("Analyzing problem and searching for solutions..."):
            # Run the main processing function
            analysis_output, relevant_tech_df = process_problem(problem_description_input, loaded_data)

        # Display results
        st.markdown("---")  # Separator
        st.markdown(analysis_output)  # Display formatted text results

        # Display relevant technologies table
        if not relevant_tech_df.empty:
            st.markdown("---")
            st.subheader("Relevant Technologies Data")
            st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
    elif analyze_button and not problem_description_input:
        st.warning("Please enter a problem description.")


# --- Run the App ---
if __name__ == "__main__":
    main()