Spaces:
Runtime error
Runtime error
File size: 21,518 Bytes
9be7b6e dcf3971 20436a6 dcf3971 20436a6 9be7b6e fdcb3d0 9be7b6e dcf3971 20436a6 06cfe93 2a63273 20436a6 9be7b6e 20436a6 9be7b6e 7ce161c 9be7b6e 7ce161c 9be7b6e 7ce161c 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 06cfe93 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e dcf3971 06cfe93 dcf3971 06cfe93 dcf3971 9be7b6e 06cfe93 20436a6 d6ed968 9be7b6e dcf3971 9be7b6e d6ed968 9be7b6e d6ed968 06cfe93 9be7b6e d6ed968 06cfe93 20436a6 dcf3971 d6ed968 9be7b6e 20436a6 06cfe93 9be7b6e 06cfe93 20436a6 06cfe93 20436a6 9be7b6e 06cfe93 9be7b6e 06cfe93 20436a6 9be7b6e 20436a6 06cfe93 20436a6 06cfe93 20436a6 9be7b6e 20436a6 9be7b6e d6ed968 9be7b6e d6ed968 20436a6 9be7b6e d6ed968 9be7b6e d6ed968 20436a6 d6ed968 9be7b6e d6ed968 9be7b6e d6ed968 06cfe93 20436a6 9be7b6e d6ed968 9be7b6e 20436a6 d6ed968 06cfe93 9be7b6e d6ed968 06cfe93 d6ed968 20436a6 06cfe93 9be7b6e 20436a6 06cfe93 dcf3971 9be7b6e dcf3971 06cfe93 20436a6 9be7b6e 20436a6 9be7b6e 20436a6 9be7b6e d6ed968 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e dcf3971 06cfe93 dcf3971 06cfe93 d6ed968 9be7b6e 06cfe93 20436a6 dcf3971 06cfe93 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e 20436a6 9be7b6e 20436a6 06cfe93 05c88fd 06cfe93 9be7b6e 06cfe93 9be7b6e 06cfe93 dcf3971 9be7b6e 06cfe93 05c88fd 06cfe93 05c88fd 9be7b6e 05c88fd 9be7b6e 05c88fd 9be7b6e 05c88fd 06cfe93 9be7b6e 20436a6 9be7b6e dcf3971 9be7b6e dcf3971 9be7b6e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 |
import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools # For generating pairs
import os
import io # Keep for potential future use (e.g., local download)
import traceback # Keep for error logging
# -- Fix SSL error
# NOTE(review): globally forcing requests' CA bundle is a workaround for SSL
# verification failures in the hosting environment. This path is Linux
# (Debian/Ubuntu) specific — confirm it exists on the deployment target.
os.environ['REQUESTS_CA_BUNDLE'] = '/etc/ssl/certs/ca-certificates.crt'
# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"  # JSON shaped as {"Category": {name: [keywords, ...]}}
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"  # Must provide 'technology' and 'description' columns
MODEL_NAME = 'all-MiniLM-L6-v2'  # Sentence-transformers model used for all embeddings
CATEGORY_SIMILARITY_THRESHOLD = 0.3 # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8 # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5 # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5 # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"  # External paper/patent search endpoint
# --- Removed Google Drive Config ---
# --- Global Variables (will be managed by Streamlit's caching) ---
# These are loaded once via the cached function below
# --- Removed Google Drive API Setup ---
# --- Removed Google Drive Function ---
# --- Load Data and Model (Cached) ---
@st.cache_resource # Cache the model and embeddings
def load_data_and_model():
    """Load data files and the Sentence Transformer model once.

    Returns:
        tuple: (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings) on success.
        None: on any failure; an error is also surfaced in the Streamlit UI.
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # Embed each category as "name: kw1, kw2, ..." so keywords inform the match.
        category_texts = [f"{name}: {', '.join(keywords)}" for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")
        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # BUG FIX: DataFrame.get('category', '') returns the *string* '' when the
        # column is absent, and ''.fillna('') raises AttributeError. Handle the
        # missing-column case explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index # Use index as unique ID
        print(f"Loaded {len(technologies_df)} technologies.")
        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")
        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")
        # Pre-compute technology description embeddings (row order matches the
        # DataFrame, so tech_id can index directly into this tensor).
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")
        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None # Indicate failure
# --- Helper Functions (unchanged, use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Return (name, score, confident) for the category closest to the problem.

    Uses the pre-computed category embeddings; `confident` is True when the
    best cosine similarity reaches CATEGORY_SIMILARITY_THRESHOLD.
    Returns (None, 0.0, False) on missing inputs or any internal error.
    """
    # Guard clauses: nothing to match, or nothing to match against.
    if not problem_description or not category_names or category_embeddings is None:
        return None, 0.0, False
    try:
        query_vec = model.encode(problem_description, convert_to_tensor=True)
        scores = util.pytorch_cos_sim(query_vec, category_embeddings)[0]
        top_idx = int(torch.argmax(scores).item())
        top_score = float(scores[top_idx].item())
        return (
            category_names[top_idx],
            top_score,
            top_score >= CATEGORY_SIMILARITY_THRESHOLD,
        )
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False
def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem statement.

    Returns a copy of `technologies_df` restricted to the
    MAX_TECHNOLOGIES_TO_SHOW highest-scoring rows, with an added
    'similarity_score_problem' column. Returns an empty DataFrame when inputs
    are missing or an error occurs.
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        query_vec = model.encode(problem_description, convert_to_tensor=True)
        # One batched cosine-similarity call covers every technology at once.
        scores = util.pytorch_cos_sim(query_vec, technology_embeddings)[0]
        scored_df = technologies_df.copy()
        scored_df['similarity_score_problem'] = scores.cpu().numpy()
        # Keep only the strongest matches.
        return scored_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()
        return pd.DataFrame()
def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Rank pairs of the relevant technologies by inter-description similarity.

    Returns a list of ((tech_name_a, tech_name_b), similarity) tuples with at
    most MAX_TECHNOLOGY_PAIRS_TO_SEARCH entries: the most-similar pairs plus
    the two least-similar ones (a "min-max" mix so the downstream search
    covers both closely-related and contrasting combinations). Returns [] when
    fewer than two technologies or no embeddings are available.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    pairs_with_scores = []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values,
                                index=relevant_technologies_df['tech_id']).to_dict()
    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Boundary checks: tech_id indexes directly into the embeddings tensor.
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0), embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip the "- Title :" prefix some spreadsheet rows carry.
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue
    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # BUG FIX: the original combined overlapping slices
    # ([:MAX-2] followed by [MAX-3:]), which duplicated the pair at index
    # MAX-3 and returned *every* remaining pair instead of capping the list,
    # triggering an unbounded number of downstream API calls. Select the top
    # (MAX-2) most-similar pairs plus the 2 least-similar ones instead.
    if len(pairs_with_scores) <= MAX_TECHNOLOGY_PAIRS_TO_SEARCH:
        return pairs_with_scores
    top_count = max(MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2, 0)
    bottom_count = MAX_TECHNOLOGY_PAIRS_TO_SEARCH - top_count
    return pairs_with_scores[:top_count] + pairs_with_scores[-bottom_count:]
def search_solutions_for_pairs(problem_description, top_pairs):
    """Searches for solutions/patents using pairs of technologies via the API.

    One POST per pair is sent to SEARCH_API_URL; failures are recorded
    per-pair and never raised to the caller.

    Args:
        problem_description: Free-text problem statement; only the first 100
            characters are folded into each search query.
        top_pairs: List of ((tech_a, tech_b), inter_similarity) tuples, as
            produced by find_top_technology_pairs.

    Returns:
        (markdown_text, results) where results maps "tech_a + tech_b" to
        {"score": float, "links": [...]} on success or
        {"score": float, "error": str} on failure.
    """
    results = {}
    if not top_pairs:
        # Return value modified for clarity
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results
    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"
    for pair_info in top_pairs:
        pair_names, pair_score = pair_info
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name: continue
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...' # Keep query focused
        # NOTE(review): the query is sent as URL query parameters on a POST —
        # presumably what this endpoint expects; confirm it does not want a JSON body.
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json() # Assume JSON response
            search_results = []
            # --- Adapt based on actual API response structure ---
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                # Try common keys for results lists
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list): # Another possibility
                    search_results = api_response['links']
                else: # Check if the dict itself contains title/url
                    if 'title' in api_response and ('url' in api_response or 'link' in api_response):
                        search_results = [api_response] # Wrap it in a list
                    else:
                        print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # --- End adaptation ---
            # Keep only entries with a well-formed absolute http(s) URL.
            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link')) # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except json.JSONDecodeError:
            # NOTE(review): with requests >= 2.27, response.json() raises
            # requests.exceptions.JSONDecodeError, which also subclasses
            # RequestException — the handler above would then catch it first,
            # making this branch unreachable. Verify the installed requests
            # version if the specific JSON-error message matters.
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}
    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results # Return formatted string and raw results dict
    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    title_str = str(link_info.get('title', 'N/A'))
                    # Square brackets would break the markdown link syntax.
                    title_sanitized = title_str.replace('[','(').replace(']',')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"
    return api_output, results # Return formatted string and raw results dict
# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """Run the full analysis pipeline for one problem description.

    Steps: categorize the problem, rank technologies by relevance, select
    technology pairs, and query the external search API for solutions.
    Returns (formatted_markdown, relevant_technologies_df).
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # This case should ideally be handled before calling process_problem
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # 1. Categorize the problem.
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if not category_name:
        category_output = "**Could not identify a matching category.**"
    else:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # 2. Rank technologies by similarity to the problem.
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")
    if relevant_technologies_df.empty:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"
    else:
        tech_parts = [f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"]
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_parts.append(f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n")
            if original_cats:
                tech_parts.append(f" *Original Category listed as: {original_cats}*\n")
        tech_parts.append("\n---\n")
        tech_output = "".join(tech_parts)

    # 3. Select promising pairs among the relevant technologies.
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")
    pairs_output = ""
    if top_pairs:
        pair_parts = [f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"]
        for pair_names, score in top_pairs:
            pair_parts.append(f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n")
        pair_parts.append("\n---\n")
        pairs_output = "".join(pair_parts)

    # 4. Query the external API using the selected pairs.
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)
    print("API search for solutions completed.")

    # 5. Assemble the final markdown report.
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text
    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df
# --- Streamlit UI ---
def main():
    """Streamlit entry point: render the UI and drive the analysis pipeline."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
Enter a technical problem. The app will:
1. Identify the best matching **category** (for informational purposes).
2. Find the **most relevant technologies** based *directly on your problem description*.
3. Identify **promising pairs** among these relevant technologies based on their similarity.
4. Search for **patents/research** using these pairs via an external API.
"""
    )
    # Load data and model (cached)
    loaded_data = load_data_and_model()
    if loaded_data is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop() # Stop execution if loading failed
    # Example problems (optional)
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    selected_example = st.selectbox("Select an example or enter your own below:", [""] + examples)
    # User input
    problem_description_input = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=selected_example # Use selected example if chosen
    )
    # Button to trigger analysis
    analyze_button = st.button("Analyze Problem")
    if analyze_button and problem_description_input:
        with st.spinner("Analyzing problem and searching for solutions..."):
            # Run the main processing function
            analysis_output, relevant_tech_df = process_problem(problem_description_input, loaded_data)
        # Display results
        st.markdown("---") # Separator
        st.markdown(analysis_output) # Display formatted text results
        # --- Removed Google Drive Upload Section ---
        # You could potentially add other actions here using relevant_tech_df,
        # like displaying it as a table or offering a local download.
        # Example: Display relevant technologies table
        if not relevant_tech_df.empty:
            st.markdown("---")
            st.subheader("Relevant Technologies Data")
            st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
    elif analyze_button and not problem_description_input:
        st.warning("Please enter a problem description.")
# --- Run the App ---
if __name__ == "__main__":
    main()  # Launch the UI when executed via `streamlit run <this file>`
|