Spaces:
Runtime error
Runtime error
| import gradio as gr | |
| import pandas as pd | |
| import json | |
| from sentence_transformers import SentenceTransformer, util | |
| import torch | |
| from duckduckgo_search import DDGS | |
| import re | |
# --- Configuration ---
# Input data files; relative paths are resolved against the working directory.
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"

# Sentence-transformer checkpoint used to embed categories and problem text.
MODEL_NAME = "all-MiniLM-L6-v2"

# Minimum cosine similarity a category must reach to count as a match.
SIMILARITY_THRESHOLD = 0.3

# Number of web search hits requested for each technology.
MAX_SEARCH_RESULTS_PER_TECH = 3
# --- Load Data and Model (Load once at startup) ---
# This runs at import time so that every request reuses the parsed data, the
# model, and the pre-computed category embeddings instead of rebuilding them.
print("Loading data and model...")
try:
    # Load Categories.
    # The file must hold a top-level "Category" object whose items are
    # (category name, list of keyword strings) pairs.
    # JSON is UTF-8 by specification, so the encoding is stated explicitly
    # rather than inherited from the platform locale.
    with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
        categories_data = json.load(f)["Category"]

    # Prepare category texts for embedding (Category Name + Keywords).
    category_names = list(categories_data.keys())
    category_texts = [
        f"{name}: {', '.join(keywords)}"
        for name, keywords in categories_data.items()
    ]
    print(f"Loaded {len(category_names)} categories.")

    # Load Technologies.
    technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
    # Normalise the category column: missing cells become '' and every value
    # is a string, so later comma-splitting never sees NaN.
    technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
    print(f"Loaded {len(technologies_df)} technologies.")

    # Load Sentence Transformer Model.
    model = SentenceTransformer(MODEL_NAME)
    print(f"Loaded Sentence Transformer model: {MODEL_NAME}")

    # Pre-compute category embeddings once; each query is compared to these.
    print("Computing category embeddings...")
    category_embeddings = model.encode(category_texts, convert_to_tensor=True)
    print("Category embeddings computed.")
except FileNotFoundError as e:
    print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
    # The app cannot work without its data files: abort startup. A bare
    # ``raise`` re-raises the active exception with its traceback unchanged.
    raise
except Exception as e:
    print(f"ERROR loading data or model: {e}")
    raise
# --- Helper Functions ---
def find_best_category(problem_description):
    """
    Find the most relevant category for a problem description.

    The description is embedded with the module-level sentence-transformer
    model and scored by cosine similarity against the pre-computed category
    embeddings.

    Args:
        problem_description: Free-text description of the problem.

    Returns:
        A ``(category_name, score)`` tuple for the best-scoring category when
        its similarity is at least ``SIMILARITY_THRESHOLD``. ``(None, None)``
        is returned when no category reaches the threshold, when the
        description is empty, when no categories are loaded, or when
        embedding/scoring raises (the error is printed, not propagated).
    """
    # Every path must return a 2-tuple: the caller unpacks the result into
    # (category_name, score), so a bare ``None`` here would raise TypeError.
    if not problem_description or not category_names:
        return None, None

    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Row 0 holds the similarities of the single query to every category.
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        score = best_score.item()
        if score >= SIMILARITY_THRESHOLD:
            return category_names[best_idx.item()], score
        return None, None  # No category met the threshold
    except Exception as e:
        # Best effort: a failed categorisation is reported as "no category"
        # so the rest of the pipeline can still produce output.
        print(f"Error during category finding: {e}")
        return None, None
def find_relevant_technologies(category_name):
    """
    Select the technologies that are listed under the given category.

    A technology's ``category`` cell may name several categories separated by
    commas (e.g. ``"Cat1, Cat2"``); the row matches when any of them, with
    surrounding whitespace stripped, equals ``category_name`` exactly.

    Args:
        category_name: Category to filter by; may be ``None`` or empty.

    Returns:
        A DataFrame with the matching rows of ``technologies_df`` (original
        index, columns and dtypes kept). An empty, column-less DataFrame is
        returned when ``category_name`` is falsy, when ``technologies_df`` is
        empty, or when no row matches.
    """
    if not category_name or technologies_df.empty:
        return pd.DataFrame()  # Nothing to filter by, or nothing to filter

    def _lists_category(cell):
        # Split the cell on commas and compare each stripped entry.
        return category_name in (part.strip() for part in str(cell).split(','))

    # A boolean mask keeps the source columns' dtypes; rebuilding a frame from
    # iterrows() Series would coerce each row to a single dtype.
    mask = technologies_df['category'].map(_lists_category).astype(bool)
    if not mask.any():
        return pd.DataFrame()  # Return empty if no matches

    # Copy so callers can modify the result without touching the shared frame.
    return technologies_df[mask].copy()
def _format_search_results(results):
    """Render ``{technology: [hit, ...]}`` search hits as a Markdown section."""
    parts = ["### Potential Solutions & Resources:\n\n"]
    if not results:
        parts.append("No search results found.")
        return "".join(parts)

    for tech, links in results.items():
        parts.append(f"**For Technology: {tech}**\n")
        if links:
            for link in links:
                # Tolerate hits with a missing title or URL instead of
                # raising KeyError while formatting.
                href = str(link.get('href') or '')
                title = str(link.get('title') or href or 'Untitled')
                # Brackets in the title or parentheses/spaces in the URL
                # would terminate the Markdown link early.
                title = title.replace('[', '\\[').replace(']', '\\]')
                href = href.replace(' ', '%20').replace('(', '%28').replace(')', '%29')
                parts.append(f"- [{title}]({href})\n")
        else:
            parts.append("- *No specific results found for this technology combination.*\n")
        parts.append("\n")
    return "".join(parts)


def search_solutions(problem_description, technologies):
    """
    Search DuckDuckGo for solutions that combine the problem and each technology.

    Args:
        problem_description: Free-text problem statement; only its first 100
            characters (after whitespace is collapsed and double quotes are
            removed) are placed in each query.
        technologies: DataFrame with a ``technology`` column naming the
            technologies to search for.

    Returns:
        A Markdown string listing, per technology, links to the search hits.
        If ``technologies`` is empty, a short notice is returned instead. If
        the search raises, the error is printed and a message describing it is
        returned; results gathered before the failure are discarded.
    """
    if technologies.empty:
        return "No relevant technologies found to search for solutions."

    # Newlines and embedded double quotes would break the quoted phrase in
    # the query, so normalise the description once, then limit its length.
    description_snippet = " ".join(str(problem_description).split()).replace('"', '')[:100]
    title_prefix = re.compile(r'^- Title\s*:\s*')

    results = {}
    try:
        # Clean the names first and de-duplicate them (order preserved) so
        # that two raw names that clean to the same text are searched once.
        # Missing names are dropped rather than searched as the text "nan".
        cleaned_names = (
            title_prefix.sub('', str(raw_name)).strip().replace('"', '')
            for raw_name in technologies['technology'].dropna().unique()
        )
        tech_names = [name for name in dict.fromkeys(cleaned_names) if name]

        with DDGS() as ddgs:
            for tech_name in tech_names:
                query = f'"{description_snippet}" using "{tech_name}" solution OR tutorial OR implementation'
                print(f"Searching for: {query}")
                # Each hit is a dict such as {'title': ..., 'href': ..., 'body': ...}.
                # An empty list records that nothing was found for this technology.
                results[tech_name] = list(
                    ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH) or []
                )
    except Exception as e:
        print(f"Error during web search: {e}")
        return f"An error occurred during the search: {e}"

    return _format_search_results(results)
# --- Main Processing Function ---
def process_problem(problem_description):
    """
    Analyse a problem description; this is the function the Gradio interface calls.

    Steps: pick the best-matching category, list the technologies filed under
    it, then search the web for solutions that use those technologies. When no
    category is matched the later steps still run with an empty technology
    set, so the output explains what could not be found.

    Args:
        problem_description: Text entered by the user.

    Returns:
        One Markdown string. The interface declares a single Markdown output,
        so every path, including the empty-input prompt, returns exactly one
        value.
    """
    # Treat whitespace-only input like empty input.
    if not problem_description or not str(problem_description).strip():
        return "Please enter a problem description."

    # 1. Categorize Problem
    category_name, score = find_best_category(problem_description)
    if category_name:
        category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        category_output = "**Could not confidently identify a relevant category.**"

    # 2. Find Relevant Technologies (category_name is None if nothing matched)
    relevant_technologies_df = find_relevant_technologies(category_name)
    if not relevant_technologies_df.empty:
        sections = ["### Relevant Technologies:\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            # Descriptions may use '<br>' as a line separator; turn each
            # non-blank piece into its own trimmed line.
            desc_lines = str(row['description']).split('<br>')
            cleaned_desc = "\n".join(line.strip() for line in desc_lines if line.strip())
            sections.append(
                f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
            )
        tech_output = "".join(sections)
    elif category_name:
        tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
    else:
        tech_output = "No relevant technologies could be identified as no category was matched."

    # 3. Search for Solutions
    solution_output = search_solutions(problem_description, relevant_technologies_df)

    # 4. Combine the three parts into the single Markdown document shown to the user.
    return f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
# --- Create Gradio Interface ---
print("Setting up Gradio interface...")
iface = gr.Interface(
    fn=process_problem,
    inputs=gr.Textbox(
        lines=5,
        label="Enter Technical Problem Description",
        placeholder="Describe your technical challenge or requirement here...",
    ),
    # A single Markdown pane receives the whole report. To show category,
    # technologies and search results separately, pass a list of three
    # gr.Markdown components here and have the handler return three values.
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),
    title="Technical Problem Analyzer",
    description=(
        "Enter a technical problem. The application will attempt to categorize it, "
        "find relevant technologies from a predefined list, and search for "
        "potential online solutions using those technologies."
    ),
    # Each example is a one-element list: one value per input component.
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."],
    ],
    allow_flagging='never',  # Optional: disable flagging
    # A theme can be applied here, e.g. theme=gr.themes.Soft()
)

# --- Launch the App ---
# Start the server only when this file is executed as a script.
if __name__ == "__main__":
    print("Launching Gradio app...")
    # Pass share=True to launch() to create a public link.
    iface.launch()