# app.py — Hugging Face Space "Technical Problem Analyzer" (revision dcf3971)
import gradio as gr
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
from duckduckgo_search import DDGS
import re
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
MODEL_NAME = 'all-MiniLM-L6-v2'  # A good general-purpose sentence transformer
SIMILARITY_THRESHOLD = 0.3  # Minimum cosine similarity for a category match
MAX_SEARCH_RESULTS_PER_TECH = 3  # Web results fetched per technology

# --- Load Data and Model (runs once at import/startup) ---
print("Loading data and model...")
try:
    # Load category definitions; expected shape: {"Category": {name: [keyword, ...], ...}}
    with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
        categories_data = json.load(f)["Category"]
    # Embed each category as "Name: kw1, kw2, ..." so keywords inform the similarity.
    category_names = list(categories_data.keys())
    category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
    print(f"Loaded {len(category_names)} categories.")
    # Load technologies spreadsheet; the 'category' cell may list several
    # categories like "Cat1, Cat2" (split later in find_relevant_technologies).
    technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
    # Normalize the category column: NaN -> '' and force string dtype.
    technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
    print(f"Loaded {len(technologies_df)} technologies.")
    # Load the Sentence Transformer model.
    model = SentenceTransformer(MODEL_NAME)
    print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
    # Pre-compute category embeddings once so each request only embeds the query.
    print("Computing category embeddings...")
    category_embeddings = model.encode(category_texts, convert_to_tensor=True)
    print("Category embeddings computed.")
except FileNotFoundError as e:
    print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
    # Startup cannot proceed without the data files; bare raise keeps the traceback.
    raise
except Exception as e:
    print(f"ERROR loading data or model: {e}")
    raise
# --- Helper Functions ---
def find_best_category(problem_description):
    """Return ``(category_name, score)`` for the best-matching category.

    Computes cosine similarity between the sentence embedding of
    *problem_description* and the pre-computed category embeddings.

    Args:
        problem_description: Free-text problem statement.

    Returns:
        ``(name, score)`` when the best category clears
        ``SIMILARITY_THRESHOLD``; ``(None, None)`` for empty input, no
        match above the threshold, or an encoding error.
    """
    # BUGFIX: always return a 2-tuple — callers unpack two values, so the
    # old single-`None` return on empty input raised TypeError at the caller.
    if not problem_description or not category_names:
        return None, None
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Cosine similarity of the query against every category text.
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        # Highest score and its index.
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        if best_score.item() >= SIMILARITY_THRESHOLD:
            return category_names[best_idx.item()], best_score.item()
        return None, None  # No category met the threshold
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, None
def find_relevant_technologies(category_name, df=None):
    """Return the rows whose 'category' column contains *category_name*.

    A 'category' cell may list several categories ("Cat1, Cat2"); a row
    matches when *category_name* equals one comma-separated entry exactly
    (after stripping surrounding whitespace).

    Args:
        category_name: Category to filter on; falsy values match nothing.
        df: Technologies DataFrame to search. Defaults to the module-level
            ``technologies_df`` loaded at startup (parameter added for
            testability; default preserves the original behavior).

    Returns:
        A (possibly empty) DataFrame of matching rows.
    """
    if df is None:
        df = technologies_df
    if not category_name or df.empty:
        return pd.DataFrame()  # Nothing to match against
    # Vectorized membership test instead of an iterrows() append loop:
    # split each cell on commas, strip, and compare against the exact name.
    mask = df['category'].astype(str).apply(
        lambda cell: category_name in [c.strip() for c in cell.split(',')]
    )
    # Preserve original behavior: a bare empty DataFrame when nothing matches.
    return df[mask].copy() if mask.any() else pd.DataFrame()
def search_solutions(problem_description, technologies):
    """Search DuckDuckGo for resources combining the problem and each technology.

    Args:
        problem_description: Free-text problem statement (truncated to 100
            characters inside the query to keep it a reasonable length).
        technologies: DataFrame with a 'technology' column.

    Returns:
        A Markdown string of result links grouped per technology, or a
        plain message when there is nothing to search / a search error.
    """
    if technologies.empty:
        return "No relevant technologies found to search for solutions."
    results = {}
    try:
        with DDGS() as ddgs:
            for tech_name in technologies['technology'].unique():  # de-duplicated names
                # Strip a "- Title :" prefix some spreadsheet cells carry.
                clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
                if not clean_tech_name:
                    continue  # Skip rows whose name is empty after cleaning
                query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation'
                print(f"Searching for: {query}")
                # Each result is a dict: {'title': ..., 'href': ..., 'body': ...}.
                # An empty list marks "no results" for this technology.
                results[clean_tech_name] = list(
                    ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH)
                )
    except Exception as e:
        print(f"Error during web search: {e}")
        return f"An error occurred during the search: {e}"

    # Format results for display.
    output = "### Potential Solutions & Resources:\n\n"
    if not results:
        output += "No search results found."
        return output
    for tech, links in results.items():
        output += f"**For Technology: {tech}**\n"
        if links:
            for link in links:
                output += f"- [{link['title']}]({link['href']})\n"
        else:
            output += "- *No specific results found for this technology combination.*\n"
        output += "\n"
    return output
# --- Main Processing Function ---
def process_problem(problem_description):
    """Gradio entry point: categorize, list technologies, search for solutions.

    Args:
        problem_description: User-supplied text from the input textbox.

    Returns:
        One Markdown string combining the identified category, the matching
        technologies, and the web-search results — the interface declares a
        single ``gr.Markdown`` output, so exactly one value must be returned.
    """
    if not problem_description:
        # BUGFIX: the interface has ONE output component; returning the old
        # 3-tuple here made Gradio fail on the empty-input path.
        return "Please enter a problem description."

    # 1. Categorize the problem via semantic similarity.
    category_name, score = find_best_category(problem_description)
    if category_name:
        category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        category_output = "**Could not confidently identify a relevant category.**"
        # Proceed anyway; the downstream steps handle a missing category.

    # 2. Filter technologies for the category (None yields an empty result).
    relevant_technologies_df = find_relevant_technologies(category_name)
    if not relevant_technologies_df.empty:
        tech_output = "### Relevant Technologies:\n\n"
        for _, row in relevant_technologies_df.iterrows():
            # Descriptions may embed <br> separators; re-join them as clean lines.
            desc_lines = str(row['description']).split('<br>')
            cleaned_desc = "\n".join([line.strip() for line in desc_lines if line.strip()])
            tech_output += f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
    elif category_name:
        tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
    else:
        tech_output = "No relevant technologies could be identified as no category was matched."

    # 3. Web search combining the problem with each technology.
    solution_output = search_solutions(problem_description, relevant_technologies_df)

    # 4. One combined Markdown string for the single output component.
    return f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
# --- Create Gradio Interface ---
print("Setting up Gradio interface...")
iface = gr.Interface(
    fn=process_problem,
    inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),  # single combined Markdown output
    # process_problem returns one Markdown string; to split the display into
    # three panels, declare three outputs and return three values instead:
    # outputs=[
    #     gr.Markdown(label="Identified Category"),
    #     gr.Markdown(label="Relevant Technologies"),
    #     gr.Markdown(label="Potential Solutions (Search Results)")
    # ],
    title="Technical Problem Analyzer",
    description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies from a predefined list, and search for potential online solutions using those technologies.",
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."]
    ],
    allow_flagging='never',  # disable the flagging button
    # theme=gr.themes.Soft()  # optional: apply a theme
)

# --- Launch the App ---
if __name__ == "__main__":
    print("Launching Gradio app...")
    # share=True would create a public link (on Hugging Face Spaces the app is already hosted).
    iface.launch()