# app.py — Hugging Face Space "Technical Problem Analyzer" (revision dcf3971)
import gradio as gr
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
from duckduckgo_search import DDGS
import re
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
MODEL_NAME = 'all-MiniLM-L6-v2'  # A good general-purpose sentence transformer
SIMILARITY_THRESHOLD = 0.3  # Minimum cosine similarity for a category match
MAX_SEARCH_RESULTS_PER_TECH = 3  # Web results fetched per technology

# --- Load Data and Model (runs once at import/startup) ---
print("Loading data and model...")
try:
    # Load category definitions; expected shape: {"Category": {name: [keyword, ...], ...}}
    with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
        categories_data = json.load(f)["Category"]
    # Embed each category as "Name: kw1, kw2, ..." so keywords inform the similarity.
    category_names = list(categories_data.keys())
    category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
    print(f"Loaded {len(category_names)} categories.")
    # Load technologies spreadsheet; the 'category' cell may list several
    # categories like "Cat1, Cat2" (split later in find_relevant_technologies).
    technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
    # Normalize the category column: NaN -> '' and force string dtype.
    technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
    print(f"Loaded {len(technologies_df)} technologies.")
    # Load the Sentence Transformer model.
    model = SentenceTransformer(MODEL_NAME)
    print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
    # Pre-compute category embeddings once so each request only embeds the query.
    print("Computing category embeddings...")
    category_embeddings = model.encode(category_texts, convert_to_tensor=True)
    print("Category embeddings computed.")
except FileNotFoundError as e:
    print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
    # Startup cannot proceed without the data files; bare raise keeps the traceback.
    raise
except Exception as e:
    print(f"ERROR loading data or model: {e}")
    raise
# --- Helper Functions ---
def find_best_category(problem_description):
    """Return ``(category_name, score)`` for the best-matching category.

    Computes cosine similarity between the sentence embedding of
    *problem_description* and the pre-computed category embeddings.

    Args:
        problem_description: Free-text problem statement.

    Returns:
        ``(name, score)`` when the best category clears
        ``SIMILARITY_THRESHOLD``; ``(None, None)`` for empty input, no
        match above the threshold, or an encoding error.
    """
    # BUGFIX: always return a 2-tuple — callers unpack two values, so the
    # old single-`None` return on empty input raised TypeError at the caller.
    if not problem_description or not category_names:
        return None, None
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Cosine similarity of the query against every category text.
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        # Highest score and its index.
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        if best_score.item() >= SIMILARITY_THRESHOLD:
            return category_names[best_idx.item()], best_score.item()
        return None, None  # No category met the threshold
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, None
def find_relevant_technologies(category_name, df=None):
    """Return the rows whose 'category' column contains *category_name*.

    A 'category' cell may list several categories ("Cat1, Cat2"); a row
    matches when *category_name* equals one comma-separated entry exactly
    (after stripping surrounding whitespace).

    Args:
        category_name: Category to filter on; falsy values match nothing.
        df: Technologies DataFrame to search. Defaults to the module-level
            ``technologies_df`` loaded at startup (parameter added for
            testability; default preserves the original behavior).

    Returns:
        A (possibly empty) DataFrame of matching rows.
    """
    if df is None:
        df = technologies_df
    if not category_name or df.empty:
        return pd.DataFrame()  # Nothing to match against
    # Vectorized membership test instead of an iterrows() append loop:
    # split each cell on commas, strip, and compare against the exact name.
    mask = df['category'].astype(str).apply(
        lambda cell: category_name in [c.strip() for c in cell.split(',')]
    )
    # Preserve original behavior: a bare empty DataFrame when nothing matches.
    return df[mask].copy() if mask.any() else pd.DataFrame()
def search_solutions(problem_description, technologies):
    """Search DuckDuckGo for resources combining the problem and each technology.

    Args:
        problem_description: Free-text problem statement (truncated to 100
            characters inside the query to keep it a reasonable length).
        technologies: DataFrame with a 'technology' column.

    Returns:
        A Markdown string of result links grouped per technology, or a
        plain message when there is nothing to search / a search error.
    """
    if technologies.empty:
        return "No relevant technologies found to search for solutions."
    results = {}
    try:
        with DDGS() as ddgs:
            for tech_name in technologies['technology'].unique():  # de-duplicated names
                # Strip a "- Title :" prefix some spreadsheet cells carry.
                clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
                if not clean_tech_name:
                    continue  # Skip rows whose name is empty after cleaning
                query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation'
                print(f"Searching for: {query}")
                # Each result is a dict: {'title': ..., 'href': ..., 'body': ...}.
                # An empty list marks "no results" for this technology.
                results[clean_tech_name] = list(
                    ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH)
                )
    except Exception as e:
        print(f"Error during web search: {e}")
        return f"An error occurred during the search: {e}"

    # Format results for display.
    output = "### Potential Solutions & Resources:\n\n"
    if not results:
        output += "No search results found."
        return output
    for tech, links in results.items():
        output += f"**For Technology: {tech}**\n"
        if links:
            for link in links:
                output += f"- [{link['title']}]({link['href']})\n"
        else:
            output += "- *No specific results found for this technology combination.*\n"
        output += "\n"
    return output
# --- Main Processing Function ---
def process_problem(problem_description):
    """Gradio entry point: categorize, list technologies, search for solutions.

    Args:
        problem_description: User-supplied text from the input textbox.

    Returns:
        One Markdown string combining the identified category, the matching
        technologies, and the web-search results — the interface declares a
        single ``gr.Markdown`` output, so exactly one value must be returned.
    """
    if not problem_description:
        # BUGFIX: the interface has ONE output component; returning the old
        # 3-tuple here made Gradio fail on the empty-input path.
        return "Please enter a problem description."

    # 1. Categorize the problem via semantic similarity.
    category_name, score = find_best_category(problem_description)
    if category_name:
        category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        category_output = "**Could not confidently identify a relevant category.**"
        # Proceed anyway; the downstream steps handle a missing category.

    # 2. Filter technologies for the category (None yields an empty result).
    relevant_technologies_df = find_relevant_technologies(category_name)
    if not relevant_technologies_df.empty:
        tech_output = "### Relevant Technologies:\n\n"
        for _, row in relevant_technologies_df.iterrows():
            # Descriptions may embed <br> separators; re-join them as clean lines.
            desc_lines = str(row['description']).split('<br>')
            cleaned_desc = "\n".join([line.strip() for line in desc_lines if line.strip()])
            tech_output += f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
    elif category_name:
        tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
    else:
        tech_output = "No relevant technologies could be identified as no category was matched."

    # 3. Web search combining the problem with each technology.
    solution_output = search_solutions(problem_description, relevant_technologies_df)

    # 4. One combined Markdown string for the single output component.
    return f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
# --- Create Gradio Interface ---
print("Setting up Gradio interface...")
iface = gr.Interface(
    fn=process_problem,
    inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),  # single combined Markdown output
    # process_problem returns one Markdown string; to split the display into
    # three panels, declare three outputs and return three values instead:
    # outputs=[
    #     gr.Markdown(label="Identified Category"),
    #     gr.Markdown(label="Relevant Technologies"),
    #     gr.Markdown(label="Potential Solutions (Search Results)")
    # ],
    title="Technical Problem Analyzer",
    description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies from a predefined list, and search for potential online solutions using those technologies.",
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."]
    ],
    allow_flagging='never',  # disable the flagging button
    # theme=gr.themes.Soft()  # optional: apply a theme
)

# --- Launch the App ---
if __name__ == "__main__":
    print("Launching Gradio app...")
    # share=True would create a public link (on Hugging Face Spaces the app is already hosted).
    iface.launch()