import streamlit as st
import pandas as pd
import json
from sentence_transformers import SentenceTransformer, util
import torch
import requests
import re
import urllib.parse
import itertools  # For generating pairs
import os
import io  # Keep for potential future use (e.g., local download)
import traceback  # Keep for error logging

# -- Fix SSL error: point requests at the system CA bundle.
# Guarded with an existence check: hard-coding this Linux-only path
# unconditionally would break every HTTPS call on other platforms.
_CA_BUNDLE_PATH = '/etc/ssl/certs/ca-certificates.crt'
if os.path.exists(_CA_BUNDLE_PATH):
    os.environ['REQUESTS_CA_BUNDLE'] = _CA_BUNDLE_PATH

# --- Configuration ---
CATEGORY_JSON_PATH = "categories.json"
TECHNOLOGY_EXCEL_PATH = "technologies.xlsx"
MODEL_NAME = 'all-MiniLM-L6-v2'
CATEGORY_SIMILARITY_THRESHOLD = 0.3  # Threshold for *displaying* the best category match
MAX_TECHNOLOGIES_TO_SHOW = 8  # Max technologies relevant to the problem (selected across ALL categories)
MAX_TECHNOLOGY_PAIRS_TO_SEARCH = 5  # Max pairs (from the relevant tech) to use for solution search
MAX_SEARCH_REFERENCES_PER_PAIR = 5  # Max references from the API per pair
SEARCH_API_URL = "https://ychkhan-ptt-endpoints.hf.space/search"


# --- Load Data and Model (Cached) ---
@st.cache_resource  # Cache the model and embeddings across Streamlit reruns
def load_data_and_model():
    """Load data files and the Sentence Transformer model once.

    Returns:
        Tuple of (model, categories_data, category_names, category_embeddings,
        technologies_df, technology_embeddings) on success, or None on failure
        (errors are reported via st.error and stdout).
    """
    print("Attempting to load data and model...")
    try:
        # Load Categories
        with open(CATEGORY_JSON_PATH, 'r', encoding='utf-8') as f:
            categories_data = json.load(f)["Category"]
        category_names = list(categories_data.keys())
        # One embedding text per category: "<name>: <keyword, keyword, ...>"
        category_texts = [f"{name}: {', '.join(keywords)}"
                          for name, keywords in categories_data.items()]
        print(f"Loaded {len(category_names)} categories.")

        # Load Technologies
        technologies_df = pd.read_excel(TECHNOLOGY_EXCEL_PATH)
        technologies_df.columns = technologies_df.columns.str.strip()
        if 'technology' not in technologies_df.columns or 'description' not in technologies_df.columns:
            raise ValueError("Missing required columns 'technology' or 'description' in technologies.xlsx")
        # NOTE: DataFrame.get('category', '') would return the *string* default
        # when the column is missing, and ''.fillna() raises AttributeError —
        # so check for the column explicitly instead.
        if 'category' in technologies_df.columns:
            technologies_df['category'] = technologies_df['category'].fillna('').astype(str)
        else:
            technologies_df['category'] = ''
        technologies_df['description_clean'] = technologies_df['description'].fillna('').astype(str)
        technologies_df['tech_id'] = technologies_df.index  # Use index as unique ID
        print(f"Loaded {len(technologies_df)} technologies.")

        # Load Sentence Transformer Model
        model = SentenceTransformer(MODEL_NAME)
        print(f"Loaded Sentence Transformer model: {MODEL_NAME}")

        # Pre-compute category embeddings
        print("Computing category embeddings...")
        category_embeddings = model.encode(category_texts, convert_to_tensor=True)
        print("Category embeddings computed.")

        # Pre-compute technology description embeddings.
        # Row order matches technologies_df, so tech_id (the index) can be used
        # to index directly into this tensor later.
        print("Computing technology description embeddings...")
        valid_descriptions = technologies_df['description_clean'].tolist()
        technology_embeddings = model.encode(valid_descriptions, convert_to_tensor=True)
        print(f"Technology description embeddings computed (shape: {technology_embeddings.shape}).")

        return (model, categories_data, category_names, category_embeddings,
                technologies_df, technology_embeddings)
    except FileNotFoundError as e:
        st.error(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' are in the same directory as the script.")
        print(f"ERROR: File not found - {e}. Please ensure '{CATEGORY_JSON_PATH}' and '{TECHNOLOGY_EXCEL_PATH}' exist.")
        return None  # Indicate failure
    except Exception as e:
        st.error(f"ERROR loading data or model: {e}")
        print(f"ERROR loading data or model: {e}")
        traceback.print_exc()
        return None  # Indicate failure


# --- Helper Functions (use loaded_data) ---
def find_best_category(problem_description, model, category_names, category_embeddings):
    """Find the most relevant category using pre-computed embeddings.

    Returns:
        (category_name, score, is_confident). category_name is None and score
        0.0 when inputs are missing or an error occurs. is_confident is True
        when score >= CATEGORY_SIMILARITY_THRESHOLD.
    """
    if not problem_description or not category_names or category_embeddings is None:
        return None, 0.0, False
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        cosine_scores = util.pytorch_cos_sim(problem_embedding, category_embeddings)[0]
        best_score, best_idx = torch.max(cosine_scores, dim=0)
        best_category_name = category_names[best_idx.item()]
        best_category_score = best_score.item()
        is_confident = best_category_score >= CATEGORY_SIMILARITY_THRESHOLD
        return best_category_name, best_category_score, is_confident
    except Exception as e:
        print(f"Error during category finding: {e}")
        return None, 0.0, False


def find_relevant_technologies(problem_description, model, technologies_df, technology_embeddings):
    """Score every technology description against the problem description.

    Returns:
        DataFrame of the top MAX_TECHNOLOGIES_TO_SHOW technologies with an
        added 'similarity_score_problem' column, or an empty DataFrame on
        missing inputs / error.
    """
    if technologies_df.empty or technology_embeddings is None or not problem_description:
        print("Warning: Technologies DF, embeddings, or problem description missing.")
        return pd.DataFrame()
    try:
        problem_embedding = model.encode(problem_description, convert_to_tensor=True)
        # Efficiently calculate all similarities at once
        cosine_scores = util.pytorch_cos_sim(problem_embedding, technology_embeddings)[0]
        # Add scores to a copy so the cached DataFrame is never mutated
        temp_df = technologies_df.copy()
        temp_df['similarity_score_problem'] = cosine_scores.cpu().numpy()  # Move scores to CPU and numpy
        # Sort by similarity and get top N
        relevant_df = temp_df.nlargest(MAX_TECHNOLOGIES_TO_SHOW, 'similarity_score_problem')
        return relevant_df
    except Exception as e:
        print(f"Error during technology finding/scoring: {e}")
        traceback.print_exc()  # Print full traceback for debugging
        return pd.DataFrame()


def find_top_technology_pairs(relevant_technologies_df, technology_embeddings):
    """Score all pairs of relevant technologies by inter-similarity.

    Returns:
        List of ((name_a, name_b), similarity) tuples, sorted by similarity
        descending, containing at most MAX_TECHNOLOGY_PAIRS_TO_SEARCH entries:
        the highest-similarity pairs plus the two lowest-similarity ones
        (a "min/max" sample). Empty list when fewer than 2 technologies.
    """
    if relevant_technologies_df.empty or len(relevant_technologies_df) < 2 or technology_embeddings is None:
        return []
    if 'tech_id' not in relevant_technologies_df.columns:
        print("Error: 'tech_id' column missing in relevant_technologies_df.")
        return []

    pairs_with_scores = []
    tech_ids = relevant_technologies_df['tech_id'].tolist()
    tech_id_to_name = pd.Series(relevant_technologies_df['technology'].values,
                                index=relevant_technologies_df['tech_id']).to_dict()

    for id_a, id_b in itertools.combinations(tech_ids, 2):
        try:
            # Boundary checks: tech_id indexes directly into the embedding tensor
            if id_a >= technology_embeddings.shape[0] or id_b >= technology_embeddings.shape[0]:
                print(f"Warning: tech_id {id_a} or {id_b} out of bounds. Skipping pair.")
                continue
            embedding_a = technology_embeddings[id_a]
            embedding_b = technology_embeddings[id_b]
            # Calculate inter-technology similarity
            inter_similarity = util.pytorch_cos_sim(embedding_a.unsqueeze(0),
                                                    embedding_b.unsqueeze(0))[0][0].item()
            tech_name_a = tech_id_to_name.get(id_a, f"Unknown Tech (ID:{id_a})")
            tech_name_b = tech_id_to_name.get(id_b, f"Unknown Tech (ID:{id_b})")
            # Strip a leading "- Title :" artifact from the source spreadsheet
            clean_tech_name_a = re.sub(r'^- Title\s*:\s*', '', str(tech_name_a)).strip()
            clean_tech_name_b = re.sub(r'^- Title\s*:\s*', '', str(tech_name_b)).strip()
            pairs_with_scores.append(((clean_tech_name_a, clean_tech_name_b), inter_similarity))
        except Exception as e:
            print(f"Error calculating similarity for pair ({id_a}, {id_b}): {e}")
            traceback.print_exc()
            continue

    pairs_with_scores.sort(key=lambda item: item[1], reverse=True)
    # "Min/max" selection capped at MAX_TECHNOLOGY_PAIRS_TO_SEARCH.
    # BUGFIX: the previous slicing ([:MAX-2] + [MAX-3:]) duplicated the pair at
    # index MAX-3 and appended *every* remaining pair, ignoring the cap.
    if len(pairs_with_scores) <= MAX_TECHNOLOGY_PAIRS_TO_SEARCH:
        return pairs_with_scores
    return (pairs_with_scores[:MAX_TECHNOLOGY_PAIRS_TO_SEARCH - 2]
            + pairs_with_scores[-2:])


def search_solutions_for_pairs(problem_description, top_pairs):
    """Search for solutions/patents using pairs of technologies via the API.

    Args:
        problem_description: free-text problem statement (truncated in query).
        top_pairs: list of ((name_a, name_b), score) from find_top_technology_pairs.

    Returns:
        (formatted_markdown, raw_results) where raw_results maps
        "<name_a> + <name_b>" to {"score": float, "links": [...]} or
        {"score": float, "error": str}.
    """
    results = {}
    if not top_pairs:
        return "No relevant technology pairs were identified (need at least 2 relevant technologies). Cannot search for solutions.\n", results
    if not problem_description:
        return "Problem description is missing. Cannot search for solutions.\n", results

    headers = {'accept': 'application/json'}
    api_output = f"### Potential Solutions & Patents (Found using Top {len(top_pairs)} Technology Pairs):\n\n"

    for pair_names, pair_score in top_pairs:
        tech_a_name, tech_b_name = pair_names
        if not tech_a_name or not tech_b_name:
            continue
        # Keep query focused: truncate the problem description to 100 chars
        query = f'research paper or patent on {tech_a_name} and {tech_b_name} related to {problem_description[:100]}...'
        params = {
            'query': query,
            'max_references': MAX_SEARCH_REFERENCES_PER_PAIR
        }
        pair_key = f"{tech_a_name} + {tech_b_name}"
        print(f"Calling API for pair ({pair_key}): POST {SEARCH_API_URL} with query snippet: {query[:100]}...")
        try:
            response = requests.post(SEARCH_API_URL, headers=headers, params=params, timeout=45)
            response.raise_for_status()
            api_response = response.json()  # Assume JSON response
            search_results = []
            # --- Adapt based on actual API response structure ---
            if isinstance(api_response, list):
                search_results = api_response
            elif isinstance(api_response, dict):
                # Try common keys for results lists
                if 'results' in api_response and isinstance(api_response.get('results'), list):
                    search_results = api_response['results']
                elif 'references' in api_response and isinstance(api_response.get('references'), list):
                    search_results = api_response['references']
                elif 'links' in api_response and isinstance(api_response.get('links'), list):  # Another possibility
                    search_results = api_response['links']
                else:
                    # Check if the dict itself contains title/url
                    if 'title' in api_response and ('url' in api_response or 'link' in api_response):
                        search_results = [api_response]  # Wrap it in a list
                    else:
                        print(f"Warning: Unexpected API response format for pair '{pair_key}'. Response keys: {list(api_response.keys())}")
            else:
                print(f"Warning: Unexpected API response type for pair '{pair_key}'. Type: {type(api_response)}")
            # --- End adaptation ---

            valid_links = []
            for r in search_results:
                if isinstance(r, dict):
                    title = r.get('title', 'N/A')
                    url = r.get('url', r.get('link'))  # Check for 'url' or 'link'
                    if url and isinstance(url, str) and url.startswith(('http://', 'https://')):
                        valid_links.append({'title': title, 'link': url})
                    elif url:
                        print(f"Warning: Invalid or missing URL for result '{title}' in pair '{pair_key}': {url}")
            results[pair_key] = {"score": pair_score, "links": valid_links}
        except requests.exceptions.Timeout:
            print(f"Error: API call timed out for pair '{pair_key}'")
            results[pair_key] = {"score": pair_score, "error": "API Timeout"}
        except requests.exceptions.HTTPError as e:
            print(f"Error: HTTP Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API HTTP Error: {e.response.status_code}"}
        except requests.exceptions.RequestException as e:
            print(f"Error calling search API for pair '{pair_key}': {e}")
            results[pair_key] = {"score": pair_score, "error": f"API Request Error: {e}"}
        except json.JSONDecodeError:
            # 'response' is guaranteed bound here: decoding only happens after a
            # successful requests.post call.
            err_msg = f"API Error: Invalid JSON response. Status: {response.status_code}, Response text: {response.text[:200]}"
            print(f"Error decoding JSON response for pair '{pair_key}'. {err_msg}")
            results[pair_key] = {"score": pair_score, "error": err_msg}
        except Exception as e:
            err_msg = f"Unexpected Error during API call: {e}"
            print(f"Unexpected error during API call for pair '{pair_key}': {e}")
            traceback.print_exc()
            results[pair_key] = {"score": pair_score, "error": err_msg}

    # Format results for display
    if not results:
        api_output += "No search results could be retrieved from the API for the generated technology pairs."
        return api_output, results  # Return formatted string and raw results dict

    for pair_key, search_data in results.items():
        pair_score = search_data.get('score', 0.0)
        api_output += f"**For Technology Pair: {pair_key}** (Inter-Similarity Score: {pair_score:.3f})\n"
        if "error" in search_data:
            api_output += f"- *Search failed: {search_data['error']}*\n"
        elif "links" in search_data:
            links = search_data["links"]
            if links:
                for link_info in links:
                    # Sanitize square brackets so titles don't break markdown links
                    title_str = str(link_info.get('title', 'N/A'))
                    title_sanitized = title_str.replace('[', '(').replace(']', ')')
                    api_output += f"- [{title_sanitized}]({link_info.get('link', '#')})\n"
            else:
                api_output += "- *No specific results found by the API for this technology pair.*\n"
        else:
            api_output += "- *Unknown search result state.*\n"
        api_output += "\n"

    return api_output, results  # Return formatted string and raw results dict


# --- Main Processing Function ---
def process_problem(problem_description, loaded_data):
    """
    Main function called by Streamlit interface. Orchestrates the process.

    Returns:
        (final_output, relevant_technologies_df): the formatted markdown
        report AND the relevant technologies DataFrame.
    """
    print(f"\n--- Processing request for: '{problem_description[:100]}...' ---")
    if not loaded_data:
        # This case should ideally be handled before calling process_problem
        return "Error: Model and data not loaded.", pd.DataFrame()
    (model, categories_data, category_names, category_embeddings,
     technologies_df, technology_embeddings) = loaded_data

    # 1. Categorize Problem (informational only — does not filter technologies)
    category_name, cat_score, is_confident = find_best_category(
        problem_description, model, category_names, category_embeddings)
    if category_name:
        confidence_text = "(Confident Match)" if is_confident else "(Possible Match)"
        category_output = f"**Best Matching Category:** {category_name} {confidence_text} (Similarity Score: {cat_score:.3f})"
    else:
        category_output = "**Could not identify a matching category.**"
    print(f"Category identified: {category_name} (Score: {cat_score:.3f}, Confident: {is_confident})")

    # 2. Find Relevant Technologies
    relevant_technologies_df = find_relevant_technologies(
        problem_description, model, technologies_df, technology_embeddings)
    print(f"Found {len(relevant_technologies_df)} relevant technologies based on problem similarity.")

    tech_output = ""
    if not relevant_technologies_df.empty:
        tech_output += f"### Top {len(relevant_technologies_df)} Most Relevant Technologies (selected based on similarity to your problem):\n\n"
        for _, row in relevant_technologies_df.iterrows():
            tech_name = re.sub(r'^- Title\s*:\s*', '', str(row.get('technology', 'N/A'))).strip()
            problem_relevance = row.get('similarity_score_problem', 0.0)
            original_cats = str(row.get('category', 'Unknown')).strip()
            tech_output += f"- **{tech_name}** (Problem Relevance: {problem_relevance:.3f})\n"
            if original_cats:
                tech_output += f"  *Original Category listed as: {original_cats}*\n"
        tech_output += "\n---\n"
    else:
        tech_output = "Could not identify any relevant technologies based on the problem description.\n\n---\n"

    # 3. Find Top Technology Pairs
    top_pairs = find_top_technology_pairs(relevant_technologies_df, technology_embeddings)
    print(f"Identified {len(top_pairs)} top technology pairs for searching.")

    pairs_output = ""
    if top_pairs:
        pairs_output += f"### Top {len(top_pairs)} Technology Pairs (selected from the relevant technologies above, based on their inter-similarity):\n\n"
        for pair_names, score in top_pairs:
            pairs_output += f"- **{pair_names[0]} + {pair_names[1]}** (Inter-Similarity: {score:.3f})\n"
        pairs_output += "\n---\n"
    # No 'else' needed here, handled in final assembly

    # 4. Search for Solutions using the Top Pairs
    solution_output_text, _ = search_solutions_for_pairs(problem_description, top_pairs)  # Ignore raw results dict here
    print("API search for solutions completed.")

    # 5. Combine Outputs
    final_output = (
        f"## Analysis Results for: \"{problem_description[:150]}...\"\n\n"
        f"{category_output}\n\n"
        f"{tech_output}"
    )
    if top_pairs:
        final_output += pairs_output
    else:
        final_output += "No technology pairs identified (need >= 2 relevant technologies to form pairs).\n\n---\n"
    final_output += solution_output_text

    print("--- Processing finished ---")
    # Return both the formatted text and the DataFrame (might be useful later)
    return final_output, relevant_technologies_df


# --- Streamlit UI ---
def main():
    """Streamlit entry point: renders the UI and wires up the analysis."""
    st.set_page_config(page_title="Technical Problem Analyzer", layout="wide")
    st.title("🔧 Technical Problem Analyzer v4 (Local Streamlit)")
    st.markdown(
        """
        Enter a technical problem. The app will:
        1. Identify the best matching **category** (for informational purposes).
        2. Find the **most relevant technologies** based *directly on your problem description*.
        3. Identify **promising pairs** among these relevant technologies based on their similarity.
        4. Search for **patents/research** using these pairs via an external API.
        """
    )

    # Load data and model (cached)
    loaded_data = load_data_and_model()
    if loaded_data is None:
        st.error("Application initialization failed. Check logs for details.")
        st.stop()  # Stop execution if loading failed

    # Example problems (optional)
    st.subheader("Example Problems:")
    examples = [
        "How can I establish reliable communication between low-orbit satellites for continuous global monitoring?",
        "Need a system to automatically detect anomalies in sensor data from industrial machinery using machine learning.",
        "Develop low-latency communication protocols for 6G networks",
        "Design efficient routing algorithms for large scale mesh networks in smart cities",
        "Create biodegradable packaging material from agricultural waste",
        "Develop a method for real-time traffic prediction using heterogeneous data sources"
    ]
    selected_example = st.selectbox("Select an example or enter your own below:", [""] + examples)

    # User input
    problem_description_input = st.text_area(
        "Enter Technical Problem Description:",
        height=150,
        placeholder="Describe your technical challenge or requirement here...",
        value=selected_example  # Use selected example if chosen
    )

    # Button to trigger analysis
    analyze_button = st.button("Analyze Problem")

    if analyze_button and problem_description_input:
        with st.spinner("Analyzing problem and searching for solutions..."):
            # Run the main processing function
            analysis_output, relevant_tech_df = process_problem(problem_description_input, loaded_data)

        # Display results
        st.markdown("---")  # Separator
        st.markdown(analysis_output)  # Display formatted text results

        # Display relevant technologies table
        if not relevant_tech_df.empty:
            st.markdown("---")
            st.subheader("Relevant Technologies Data")
            st.dataframe(relevant_tech_df[['technology', 'description', 'category', 'similarity_score_problem']])
    elif analyze_button and not problem_description_input:
        st.warning("Please enter a problem description.")


# --- Run the App ---
if __name__ == "__main__":
    main()