# NOTE(review): the text "Spaces: Runtime error" below this file's origin is
# Hugging Face Spaces page residue captured with the source, not program code.
| import streamlit as st | |
| import pandas as pd | |
| import json | |
| from sentence_transformers import SentenceTransformer, util | |
| import torch | |
| import requests | |
| import re | |
| import urllib.parse | |
| import itertools # For generating pairs | |
| import os | |
| import io # Keep for potential future use (e.g., local download) | |
| import traceback # Keep for error logging | |
# -- Fix SSL error: point the `requests` library at the system CA bundle.
# NOTE(review): this path is Debian/Ubuntu-specific — confirm it exists on the deploy image.
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"  # JSON shaped as {"Category": {name: [keywords, ...]}}
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"  # Must contain 'technology' and 'description' columns
MODEL_NAME = 'all-MiniLM-L6-v2'  # SentenceTransformer checkpoint used for all embeddings
CATEGORY_SIMILARITY_THRESHOLD = 0.3  # Threshold for *displaying* the best category match as "confident"
MAX_TECHNOLOGIES_TO_SHOW = 8  # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5  # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5  # Max references requested from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"  # External paper/patent search endpoint
# --- Removed Google Drive Config ---
# --- Global Variables (will be managed by Streamlit's caching) ---
# These are loaded once via the cached function below
# --- Removed Google Drive API Setup ---
# --- Removed Google Drive Function ---
# --- Load Data and Model (Cached) ---
# Cache the model and embeddings across Streamlit reruns.
# BUGFIX: the surrounding comments promised caching, but no decorator was
# present, so the model and both embedding matrices were recomputed on
# every user interaction.
@st.cache_resource
def load_data_and_model():
    """Load data files and the Sentence Transformer model once.

    Returns:
        tuple: (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings) on success.
        None: if a file is missing or any loading step fails (the error is
              also surfaced in the Streamlit UI).
    """
    print("Attempting to load data and model...")
    try:
        # Load categories: {"Category": {name: [keywords, ...], ...}}
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # One embeddable text per category: "name: kw1, kw2, ..."
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")

        # Load technologies spreadsheet.
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # BUGFIX: DataFrame.get() returns the plain default ('') when the
        # column is absent, and str has no .fillna() — the original crashed
        # with AttributeError for spreadsheets without a 'category' column.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index  # Row index doubles as a stable unique ID
        print(f"Loaded {len(technologies_df)} technologies.")

        # Load Sentence Transformer model.
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")

        # Pre-compute embeddings once so each request only encodes the query.
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")

        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None  # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None  # Indicate failure
# --- Helper Functions (unchanged, use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Return (category_name, score, is_confident) for the closest category.

    Uses the pre-computed category embeddings; falls back to
    (None, 0.0, False) for empty inputs or any runtime failure.
    """
    inputs_missing = (
        not problem_description
        or not category_names
        or category_embeddings is None
    )
    if inputs_missing:
        return None, 0.0, False
    try:
        query_vec = model.encode(problem_description, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_vec, category_embeddings)[0]
        top_score, top_idx = torch.max(scores, dim=0)
        name = category_names[top_idx.item()]
        score = top_score.item()
        # Confidence is purely presentational — see CATEGORY_SIMILARITY_THRESHOLD.
        return name, score, score >= CATEGORY_SIMILARITY_THRESHOLD
    except Exception as exc:
        print(f"Error during category finding: {exc}")
        return None, 0.0, False
def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem text.

    Returns the top MAX_TECHNOLOGIES_TO_SHOW rows of ``technologies_df``
    with an added 'similarity_score_problem' column, or an empty DataFrame
    when inputs are missing or scoring fails.
    (Removed the unused ``all_tech_data`` local from the original.)
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # One vectorized pass scores the problem against every description.
        cosine_scores = util.pytorch_cos_sim(problem_embedding, technology_embeddings)[0]
        # Work on a copy so the (cached) source frame is never mutated.
        scored_df = technologies_df.copy()
        scored_df['similarity_score_problem'] = cosine_scores.cpu().numpy()  # Move scores to CPU and numpy
        # Sort by similarity and keep the top N.
        return scored_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()  # Print full traceback for debugging
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Rank pairs of the relevant technologies by their inter-similarity.

    Returns up to MAX_TECHNOLOGY_PAIRS_TO_SEARCH entries shaped
    ``((name_a, name_b), score)``, mixing the most-similar and the
    least-similar pairs so the downstream search covers both obvious
    and unusual technology combinations.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []

    pairs_with_scores = []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(
        relevant_technologies_df['technology'].values,
        index=relevant_technologies_df['tech_id']
    ).to_dict()

    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # tech_id indexes directly into the embeddings tensor — bounds check.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity.
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0), embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue

    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # BUGFIX: the original combined the slices [:MAX-2] and [MAX-3:], which
    # overlap — one pair was duplicated and almost ALL pairs were returned
    # instead of at most MAX_TECHNOLOGY_PAIRS_TO_SEARCH. Keep the strongest
    # (MAX-2) pairs plus the weakest 2, without overlap, capped at MAX.
    if len(pairs_with_scores) <= MAX_TECHNOLOGY_PAIRS_TO_SEARCH:
        return pairs_with_scores
    strongest = pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2]
    weakest = pairs_with_scores[-2:]
    return strongest + weakest
def search_solutions_for_pairs(problem_description, top_pairs):
    """Search for solutions/patents using pairs of technologies via the API.

    Args:
        problem_description: user's problem text (truncated to 100 chars
            inside each query to keep it focused).
        top_pairs: list of ((name_a, name_b), inter_similarity) tuples.

    Returns:
        (markdown_text, results) where ``results`` maps "A + B" to a dict
        with 'score' plus either 'links' (list of {'title','link'}) or
        'error' (human-readable failure description).
    """
    results = {}
    if not top_pairs:
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results

    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"

    for pair_names, pair_score in top_pairs:
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...'  # Keep query focused
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json()  # Assume JSON response
            search_results = []
            # --- Adapt based on actual API response structure ---
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                # Try common keys for results lists.
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list):  # Another possibility
                    search_results = api_response['links']
                elif 'title' in api_response and ('url' in api_response or 'link' in api_response):
                    # The dict itself looks like a single result — wrap it.
                    search_results = [api_response]
                else:
                    print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # --- End adaptation ---
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link'))  # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        # BUGFIX: with requests >= 2.27, response.json() raises
        # requests.exceptions.JSONDecodeError, which subclasses BOTH
        # json.JSONDecodeError AND RequestException. The original listed
        # RequestException first, making this handler unreachable — catch
        # the JSON-decoding failure before the generic request failure.
        except json.JSONDecodeError:
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}

    # Format results for display.
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results  # Return formatted string and raw results dict

    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    # Square brackets in a title would break markdown link syntax.
                    title_str = str(link_info.get('title', 'N/A'))
                    title_sanitized = title_str.replace('[', '(').replace(']', ')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"
    return api_output, results  # Return formatted string and raw results dict
# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """
    Main function called by the Streamlit interface. Orchestrates the process:
    categorize the problem, rank technologies, pair them, query the search
    API, and assemble one markdown report.

    Returns:
        (final_output, relevant_technologies_df) — the formatted report text
        and the DataFrame of top technologies (may be empty).

    (Removed the unused ``display_tech_list`` local from the original.)
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # Loading failures are surfaced earlier in the UI; this is a safety net.
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # 1. Categorize the problem (informational only).
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if category_name:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    else:
        category_output = "**Could not identify a matching category.**"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # 2. Rank technologies by similarity to the problem.
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
    if not relevant_technologies_df.empty:
        tech_output = f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_output += f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n"
            if original_cats:
                tech_output += f" *Original Category listed as: {original_cats}*\n"
        tech_output += "\n---\n"
    else:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"

    # 3. Pair the relevant technologies by inter-similarity.
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    pairs_output = ""
    if top_pairs:
        pairs_output += f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"
        for pair_names, score in top_pairs:
            pairs_output += f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n"
        pairs_output += "\n---\n"
    # The no-pairs message is added during final assembly below.

    # 4. Search for solutions using the top pairs.
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)  # Ignore raw results dict here
    print("API search for solutions completed.")

    # 5. Combine outputs into one markdown report.
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text
    print("--- Processing finished ---")
    return final_output, relevant_technologies_df
# --- Streamlit UI ---
def main():
    """Render the Streamlit page and wire the analysis pipeline to it."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
        Enter a technical problem. The app will:
        1. Identify the best matching **category** (for informational purposes).
        2. Find the **most relevant technologies** based *directly on your problem description*.
        3. Identify **promising pairs** among these relevant technologies based on their similarity.
        4. Search for **patents/research** using these pairs via an external API.
        """
    )

    # Fetch the cached data/model bundle; stop rendering if it failed.
    bundle = load_data_and_model()
    if bundle is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop()

    # Optional example problems the user can pick from.
    st.subheader("Example Problems:")
    example_problems = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    chosen_example = st.selectbox("Select an example or enter your own below:", [""] + example_problems)

    # Free-text input, pre-filled with the chosen example if any.
    problem_text = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=chosen_example,
    )

    if st.button("Analyze Problem"):
        if not problem_text:
            st.warning("Please enter a problem description.")
        else:
            with st.spinner("Analyzing problem and searching for solutions..."):
                report_md, relevant_tech_df = process_problem(problem_text, bundle)
            st.markdown("---")  # Separator
            st.markdown(report_md)  # Formatted text results
            # Show the underlying relevant-technologies table when available.
            if not relevant_tech_df.empty:
                st.markdown("---")
                st.subheader("Relevant Technologies Data")
                st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])

# --- Run the App ---
if __name__ == "__main__":
    main()