heymenn committed on
Commit
d6ed968
·
verified ·
1 Parent(s): fd5072d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +156 -92
app.py CHANGED
@@ -3,15 +3,18 @@ import pandas as pd
3
  import json
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
- from duckduckgo_search import DDGS
7
  import re
 
8
 
9
  # --- Configuration ---
10
  CATEGORY_JSON_PATH = "categories.json"
11
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
12
  MODEL_NAME = 'all-MiniLM-L6-v2' # A good general-purpose sentence transformer
13
- SIMILARITY_THRESHOLD = 0.3 # Adjust as needed
14
- MAX_SEARCH_RESULTS_PER_TECH = 3
 
 
15
 
16
  # --- Load Data and Model (Load once at startup) ---
17
  print("Loading data and model...")
@@ -19,15 +22,15 @@ try:
19
  # Load Categories
20
  with open(CATEGORY_JSON_PATH, 'r') as f:
21
  categories_data = json.load(f)["Category"]
22
- # Prepare category texts for embedding (Category Name + Keywords)
23
  category_names = list(categories_data.keys())
24
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
25
  print(f"Loaded {len(category_names)} categories.")
26
 
27
  # Load Technologies
28
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
29
- # Clean the technology category column - handle potential NaN and ensure string type
30
  technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
 
 
31
  print(f"Loaded {len(technologies_df)} technologies.")
32
 
33
  # Load Sentence Transformer Model
@@ -39,9 +42,14 @@ try:
39
  category_embeddings = model.encode(category_texts, convert_to_tensor=True)
40
  print("Category embeddings computed.")
41
 
 
 
 
 
 
 
42
  except FileNotFoundError as e:
43
  print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
44
- # Optionally raise the error or exit if critical files are missing
45
  raise e
46
  except Exception as e:
47
  print(f"ERROR loading data or model: {e}")
@@ -54,165 +62,221 @@ def find_best_category(problem_description):
54
  Finds the most relevant category for the problem description using semantic similarity.
55
  """
56
  if not problem_description or not category_names:
57
- return None
58
 
59
  try:
60
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
61
- # Compute cosine similarities
62
  cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
63
-
64
- # Find the highest score and its index
65
  best_score, best_idx = torch.max(cosine_scores, dim=0)
66
 
67
- if best_score.item() >= SIMILARITY_THRESHOLD:
68
  return category_names[best_idx.item()], best_score.item()
69
  else:
70
- return None, None # No category met the threshold
71
  except Exception as e:
72
  print(f"Error during category finding: {e}")
73
- return None, None
74
 
75
- def find_relevant_technologies(category_name):
76
  """
77
- Filters the technologies DataFrame based on the identified category.
78
- Handles categories listed like "Cat1, Cat2".
79
  """
80
- if not category_name or technologies_df.empty:
81
- return pd.DataFrame() # Return empty DataFrame if no category or data
 
 
 
 
82
 
83
- relevant_tech = []
84
- # Iterate through the DataFrame safely
85
- for index, row in technologies_df.iterrows():
86
- # Split the 'category' string by comma and strip whitespace
87
- tech_categories = [cat.strip() for cat in str(row['category']).split(',')]
88
- if category_name in tech_categories:
89
- relevant_tech.append(row)
 
 
 
 
90
 
91
- if not relevant_tech:
92
- return pd.DataFrame() # Return empty if no matches
93
 
94
- return pd.DataFrame(relevant_tech)
 
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- def search_solutions(problem_description, technologies):
 
98
  """
99
- Searches DuckDuckGo for solutions combining the problem and technologies.
100
  """
101
  results = {}
102
- if technologies.empty:
103
- return "No relevant technologies found to search for solutions."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- try:
106
- with DDGS() as ddgs:
107
- for tech_name in technologies['technology'].unique(): # Use unique names
108
- # Clean up tech_name if it has extra info (like title prefixes)
109
- # Simple cleaning - might need adjustment based on actual data
110
- clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
111
- if not clean_tech_name: continue # Skip if name is empty after cleaning
112
-
113
- query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation' # Limit query length
114
- print(f"Searching for: {query}")
115
- search_results = []
116
- for i, result in enumerate(ddgs.text(query, max_results=MAX_SEARCH_RESULTS_PER_TECH)):
117
- search_results.append(result) # result is a dict {'title': ..., 'href': ..., 'body': ...}
118
-
119
- if search_results:
120
- results[clean_tech_name] = search_results
121
- else:
122
- results[clean_tech_name] = [] # Indicate no results found for this tech
123
-
124
- except Exception as e:
125
- print(f"Error during web search: {e}")
126
- return f"An error occurred during the search: {e}"
127
 
128
  # Format results for display
129
- output = "### Potential Solutions & Resources:\n\n"
130
  if not results:
131
- output += "No search results found."
132
  return output
133
 
134
- for tech, links in results.items():
135
  output += f"**For Technology: {tech}**\n"
136
- if links:
137
- for link in links:
138
- output += f"- [{link['title']}]({link['href']})\n" #{link['body'][:100]}...\n" # Optionally add body snippet
139
- else:
140
- output += "- *No specific results found for this technology combination.*\n"
 
 
 
 
 
 
 
141
  output += "\n"
142
 
143
  return output
144
 
 
145
  # --- Main Processing Function ---
146
  def process_problem(problem_description):
147
  """
148
  Main function called by Gradio interface.
149
- Orchestrates the categorization, technology finding, and solution searching.
150
  """
151
  if not problem_description:
152
- return "Please enter a problem description.", "", ""
153
 
154
  # 1. Categorize Problem
155
  category_name, score = find_best_category(problem_description)
156
  if category_name:
157
  category_output = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
158
  else:
159
- category_output = "**Could not confidently identify a relevant category.**"
160
- # Return early if no category is found? Or proceed with empty tech? Let's proceed for now.
161
- # return category_output, "No category identified, cannot find technologies.", "No category identified, cannot search solutions."
 
 
162
 
163
- # 2. Find Relevant Technologies
164
- relevant_technologies_df = find_relevant_technologies(category_name) # Pass None if category not found
165
  if not relevant_technologies_df.empty:
166
- tech_output = "### Relevant Technologies:\n\n"
167
  for _, row in relevant_technologies_df.iterrows():
168
  # Clean up the description for better display
169
- # Assuming description format like "- Title : ... \n - Purpose : ..."
170
- desc_lines = str(row['description']).split('<br>') # Split by <br> if present
171
  cleaned_desc = "\n".join([line.strip() for line in desc_lines if line.strip()])
172
- tech_output += f"**Technology:** {row['technology']}\n**Description:**\n{cleaned_desc}\n\n---\n"
 
 
 
173
  elif category_name:
174
  tech_output = f"No specific technologies found listed under the '{category_name}' category in the provided data."
175
  else:
176
  tech_output = "No relevant technologies could be identified as no category was matched."
177
 
178
 
179
- # 3. Search for Solutions
180
- solution_output = search_solutions(problem_description, relevant_technologies_df)
181
 
182
  # 4. Combine Outputs for Gradio
183
- # Using Markdown for better formatting
184
  final_output = f"## Analysis Results\n\n{category_output}\n\n{tech_output}\n\n{solution_output}"
185
 
186
- # Gradio currently works best returning separate components if you define multiple outputs.
187
- # Let's return a single formatted Markdown string for simplicity here.
188
- # If you define 3 Markdown outputs in gr.Interface, you'd return: category_output, tech_output, solution_output
189
  return final_output
190
 
191
-
192
  # --- Create Gradio Interface ---
193
  print("Setting up Gradio interface...")
194
  iface = gr.Interface(
195
  fn=process_problem,
196
  inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
197
- outputs=gr.Markdown(label="Analysis and Potential Solutions"), # Single Markdown output
198
- # If using multiple outputs:
199
- # outputs=[
200
- # gr.Markdown(label="Identified Category"),
201
- # gr.Markdown(label="Relevant Technologies"),
202
- # gr.Markdown(label="Potential Solutions (Search Results)")
203
- # ],
204
- title="Technical Problem Analyzer",
205
- description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies from a predefined list, and search for potential online solutions using those technologies.",
206
  examples=[
207
  ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
208
  ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
209
  ["Develop a secure authentication method for a distributed IoT network without a central server."]
210
  ],
211
- allow_flagging='never', # Optional: disable flagging
212
- # theme=gr.themes.Soft() # Optional: Apply a theme
213
  )
214
 
215
  # --- Launch the App ---
216
  if __name__ == "__main__":
217
  print("Launching Gradio app...")
218
- iface.launch() # Share=True to create a public link (requires login on Hugging Face Spaces)
 
3
  import json
4
  from sentence_transformers import SentenceTransformer, util
5
  import torch
6
+ import requests # Use requests for API calls
7
  import re
8
+ import urllib.parse # To encode URL parameters
9
 
10
  # --- Configuration ---
11
  CATEGORY_JSON_PATH = "categories.json"
12
  TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
13
  MODEL_NAME = 'all-MiniLM-L6-v2' # A good general-purpose sentence transformer
14
+ CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for matching category
15
+ MAX_TECHNOLOGIES_TO_SHOW = 8 # Enhancement 1: Limit displayed technologies
16
+ MAX_SEARCH_REFERENCES_PER_TECH = 3 # Max references from the search API
17
+ SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search" # Enhancement 3: New API endpoint
18
 
19
  # --- Load Data and Model (Load once at startup) ---
20
  print("Loading data and model...")
 
22
  # Load Categories
23
  with open(CATEGORY_JSON_PATH, 'r') as f:
24
  categories_data = json.load(f)["Category"]
 
25
  category_names = list(categories_data.keys())
26
  category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
27
  print(f"Loaded {len(category_names)} categories.")
28
 
29
  # Load Technologies
30
  technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
 
31
  technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
32
+ # Pre-process description for embedding (use description column directly)
33
+ technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
34
  print(f"Loaded {len(technologies_df)} technologies.")
35
 
36
  # Load Sentence Transformer Model
 
42
  category_embeddings = model.encode(category_texts, convert_to_tensor=True)
43
  print("Category embeddings computed.")
44
 
45
+ # Pre-compute technology description embeddings (Optional but speeds up repeated calculations)
46
+ # print("Computing technology description embeddings...")
47
+ # technology_desc_embeddings = model.encode(technologies_df['description_clean'].tolist(), convert_to_tensor=True, show_progress_bar=True)
48
+ # print("Technology description embeddings computed.")
49
+ # NOTE: If pre-computing tech embeddings, adjust find_relevant_technologies to use them by index
50
+
51
  except FileNotFoundError as e:
52
  print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
 
53
  raise e
54
  except Exception as e:
55
  print(f"ERROR loading data or model: {e}")
 
62
  Finds the most relevant category for the problem description using semantic similarity.
63
  """
64
  if not problem_description or not category_names:
65
+ return None, 0.0
66
 
67
  try:
68
  problem_embedding = model.encode(problem_description, convert_to_tensor=True)
 
69
  cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
 
 
70
  best_score, best_idx = torch.max(cosine_scores, dim=0)
71
 
72
+ if best_score.item() >= CATEGORY_SIMILARITY_THRESHOLD:
73
  return category_names[best_idx.item()], best_score.item()
74
  else:
75
+ return None, best_score.item() # Return score even if below threshold
76
  except Exception as e:
77
  print(f"Error during category finding: {e}")
78
+ return None, 0.0
79
 
80
def find_relevant_technologies(category_name, problem_description):
    """
    Filter technologies by category, rank them by semantic similarity to the
    problem description, and return the top matches.

    Args:
        category_name: Category identified for the problem (may be None).
        problem_description: Free-text problem statement used for scoring.

    Returns:
        pd.DataFrame: Up to MAX_TECHNOLOGIES_TO_SHOW rows with an added
        'similarity_score' column, sorted descending by score. Empty
        DataFrame when category/data/description is missing or on error.
    """
    if not category_name or technologies_df.empty or not problem_description:
        return pd.DataFrame()  # Nothing to filter or score against

    try:
        # Select rows whose comma-separated 'category' field contains the category.
        matched_rows = [
            row for _, row in technologies_df.iterrows()
            if category_name in (cat.strip() for cat in str(row['category']).split(','))
        ]
        if not matched_rows:
            return pd.DataFrame()

        problem_embedding = model.encode(problem_description, convert_to_tensor=True)

        # Batch-encode all candidate descriptions in ONE model.encode call
        # instead of one call per row inside the loop — identical scores,
        # far fewer forward passes through the transformer.
        descriptions = [row['description_clean'] for row in matched_rows]
        scores = [0.0] * len(matched_rows)  # rows with empty descriptions keep 0.0
        to_encode = [(i, d) for i, d in enumerate(descriptions) if d]
        if to_encode:
            tech_embeddings = model.encode([d for _, d in to_encode], convert_to_tensor=True)
            sims = util.pytorch_cos_sim(problem_embedding, tech_embeddings)[0]
            for (i, _), sim in zip(to_encode, sims):
                scores[i] = sim.item()

        relevant_df = pd.DataFrame(matched_rows)
        relevant_df['similarity_score'] = scores
        # kind='stable' preserves original order among equal scores, matching
        # the behavior of Python's stable list.sort on (row, score) tuples.
        relevant_df = relevant_df.sort_values('similarity_score', ascending=False, kind='stable')

        # Limit the number of technologies shown to the caller.
        return relevant_df.head(MAX_TECHNOLOGIES_TO_SHOW)

    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        return pd.DataFrame()  # Fail soft: caller treats empty as "no matches"
125
 
126
+
127
def search_solutions_api(problem_description, technologies):
    """
    Search for solution references via the external search API (one POST per
    technology) and format everything as a Markdown section.

    Args:
        problem_description: Free-text problem statement; truncated to 100
            characters when building the query.
        technologies: DataFrame with a 'technology' column (one API call per row).

    Returns:
        str: Markdown-formatted results, or an explanatory message when there
        is nothing to search for.
    """
    results = {}
    if technologies.empty or not problem_description:
        return "No relevant technologies found or problem description missing, cannot search for solutions."

    headers = {'accept': 'application/json'}

    for index, tech_row in technologies.iterrows():
        tech_name = tech_row['technology']
        # Strip a possible "- Title :" prefix left over from the source data.
        clean_tech_name = re.sub(r'^- Title\s*:\s*', '', str(tech_name)).strip()
        if not clean_tech_name:
            continue  # Skip rows with no usable name after cleaning

        query = f'"{problem_description[:100]}" using "{clean_tech_name}" solution OR tutorial OR implementation'

        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_TECH
        }
        full_url = f"{SEARCH_API_URL}?{urllib.parse.urlencode(params)}"
        print(f"Calling API: POST {full_url}")

        try:
            # The endpoint expects POST with query parameters in the URL
            # (matching its curl example), even though that is GET-like.
            response = requests.post(full_url, headers=headers, timeout=30)
            response.raise_for_status()  # 4xx/5xx -> RequestException branch below
            api_response = response.json()

            # Accept either a bare list of results or {'results': [...]}.
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict) and isinstance(api_response.get('results'), list):
                search_results = api_response['results']
            else:
                search_results = []
                print(f"Warning: Unexpected API response format for tech '{clean_tech_name}'. Response: {api_response}")

            # Keep only dict entries and coerce fields to str here so the
            # formatting pass below cannot crash on a null/non-string 'link'
            # (str.startswith would raise on None).
            results[clean_tech_name] = [
                {'title': str(r.get('title') or 'N/A'), 'link': str(r.get('link') or '#')}
                for r in search_results if isinstance(r, dict)
            ]

        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for tech '{clean_tech_name}': {e}")
            results[clean_tech_name] = f"API Error: {e}"  # string marks a failed search
        except json.JSONDecodeError:
            print(f"Error decoding JSON response for tech '{clean_tech_name}'. Status: {response.status_code}, Response text: {response.text[:200]}")
            results[clean_tech_name] = "API Error: Invalid JSON response."
        except Exception as e:
            print(f"Unexpected error during API call for tech '{clean_tech_name}': {e}")
            results[clean_tech_name] = f"Unexpected Error: {e}"

    # Format results for display
    output = "### Potential Solutions & Resources (via API):\n\n"
    if not results:
        output += "No search results could be retrieved from the API."
        return output

    for tech, search_data in results.items():
        output += f"**For Technology: {tech}**\n"
        if isinstance(search_data, list):
            if search_data:
                for link_info in search_data:
                    # Only emit real http(s) links so the Markdown stays valid.
                    href = link_info.get('link', '#')
                    if not href.startswith(('http://', 'https://')):
                        href = '#'
                    output += f"- [{link_info.get('title', 'N/A')}]({href})\n"
            else:
                output += "- *No specific results found by the API for this technology combination.*\n"
        else:  # An error-message string was stored for this technology
            output += f"- *Search failed: {search_data}*\n"
        output += "\n"

    return output
217
 
218
+
219
  # --- Main Processing Function ---
220
def process_problem(problem_description):
    """
    Gradio entry point: categorize the problem, rank relevant technologies,
    search for solutions via the API, and return one Markdown report.
    """
    if not problem_description:
        return "Please enter a problem description."

    # 1. Categorize the problem text
    category_name, score = find_best_category(problem_description)
    if category_name:
        category_md = f"**Identified Category:** {category_name} (Similarity Score: {score:.2f})"
    else:
        category_md = f"**Could not confidently identify a relevant category.** (Highest score: {score:.2f})"

    # 2. Rank technologies against the problem (None category yields empty df)
    tech_df = find_relevant_technologies(category_name, problem_description)
    if tech_df.empty:
        if category_name:
            tech_md = f"No specific technologies found listed under the '{category_name}' category in the provided data."
        else:
            tech_md = "No relevant technologies could be identified as no category was matched."
    else:
        # Accumulate fragments and join once rather than repeated +=
        parts = [f"### Relevant Technologies (Top {len(tech_df)} based on relevance to problem):\n\n"]
        for _, row in tech_df.iterrows():
            # Descriptions may embed <br> separators; re-flow them as lines
            fragments = str(row['description']).split('<br>')
            description = "\n".join([frag.strip() for frag in fragments if frag.strip()])
            parts.append(f"**Technology:** {row['technology']}\n")
            parts.append(f"**Relevance Score:** {row['similarity_score']:.2f}\n")
            parts.append(f"**Description:**\n{description}\n\n---\n")
        tech_md = "".join(parts)

    # 3. Query the search API for solution references
    solution_md = search_solutions_api(problem_description, tech_df)

    # 4. Single combined Markdown document for the Gradio output
    return f"## Analysis Results\n\n{category_md}\n\n{tech_md}\n\n{solution_md}"
262
 
 
263
# --- Create Gradio Interface ---
# Single-input / single-output UI: one textbox in, one Markdown report out.
print("Setting up Gradio interface...")
iface = gr.Interface(
    fn=process_problem,  # returns one combined Markdown string
    inputs=gr.Textbox(lines=5, label="Enter Technical Problem Description", placeholder="Describe your technical challenge or requirement here..."),
    outputs=gr.Markdown(label="Analysis and Potential Solutions"),
    title="Technical Problem Analyzer v2",
    description="Enter a technical problem. The application will attempt to categorize it, find relevant technologies (showing top matches with relevance scores), and search for potential online solutions using a dedicated API.",
    # Example prompts shown under the input box; each inner list is one input set.
    examples=[
        ["How can I establish reliable communication between low-orbit satellites for continuous global monitoring?"],
        ["Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning."],
        ["Develop a secure authentication method for a distributed IoT network without a central server."]
    ],
    allow_flagging='never',  # hide the flagging button entirely
)

# --- Launch the App ---
# Guarded so importing this module (e.g. in tests) does not start a server.
if __name__ == "__main__":
    print("Launching Gradio app...")
    iface.launch()