import numpy as np
import pandas as pd
import streamlit as st
import sentence_transformers
from fuzzywuzzy import fuzz
from google.genai import Client, types
import json
from datasets import load_dataset, Dataset
from huggingface_hub import hf_hub_download
import os
from datasets import load_dataset


# API key for the Gemini client; None when the environment variable is unset.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
# NOTE(review): HF_HOME is assigned after the Hugging Face libraries have
# already been imported above. Those libraries presumably resolve their cache
# locations at import time - confirm that this assignment actually takes effect.
os.environ["HF_HOME"] = "/home/user/huggingface"


@st.cache_resource
def _load_embedding_model():
    """Instantiate the sentence-embedding model once per server process.

    Streamlit re-executes this script on every user interaction; caching the
    model as a resource avoids rebuilding it on each rerun.
    """
    return sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


model = _load_embedding_model()
embeddings = []

# Deliberately not cached: send_to_dataset pushes new rows to this same
# repository, and a cached copy would presumably not reflect them.
dataset = load_dataset("heymenn/Technologies", split="train")


# Header of the markdown results table; it is completed with a placeholder row
# further down when no search has been run yet in the session.
markdown = "| Score   | Technology                                      | Purpose         |\n"
markdown += "|---------|-------------------------------------------------|---------------------------|\n"


def search_and_retrieve(user_input, dataset):
    """Rank the dataset rows against ``user_input`` and publish the results.

    Every row is scored with a weighted blend of two signals: the similarity
    (as computed by ``model.similarity``) between the row's stored embedding
    and the embedding of ``user_input``, and the fuzzy token-set ratio between
    ``user_input`` and the row's name.

    Nothing is returned; the outcome is written to ``st.session_state``:

    * ``best_result`` - markdown sentence describing the best match.
    * ``top_5_results`` - markdown table of the five highest-scoring rows.
    * ``show_generate_button`` - ``True`` when the best score is below 0.7.

    Args:
        user_input: Free-text technology title typed by the user.
        dataset: Iterable of rows exposing the ``"name"``, ``"purpose"`` and
            ``"embeddings"`` keys.
    """
    alpha = 0.6  # weight of the embedding similarity; the rest goes to the fuzzy score
    generate_threshold = 0.7  # below this best score, generation is offered

    user_embedding = model.encode(user_input)

    results = []
    for row in dataset:
        name = row["name"]
        # model.similarity yields a tensor-like [[value]]; reduce it to a plain
        # float right away so sorting, comparing and formatting all operate on
        # ordinary numbers.
        cosim = float(model.similarity(row["embeddings"], user_embedding)[0][0])
        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100
        results.append({
            "score": alpha * cosim + (1 - alpha) * fuzzy_score,
            "technology": name,
            "purpose": row["purpose"],
        })

    top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[:5]

    table_lines = [
        "| Score   | Technology                                      | Purpose         |\n",
        "|---------|-------------------------------------------------|---------------------------|\n",
    ]
    table_lines.extend(
        f"| {item['score']:.4f}  | {item['technology']} | {item['purpose']} |\n"
        for item in top_5
    )

    if top_5:
        # sorted() is stable, so the first entry is the earliest row holding
        # the highest score.
        best = top_5[0]
        best_score = best["score"]
        markdown_max = f"**{best['technology']}** have been found with a confidence score of **{best_score:.4f}**"
    else:
        # Empty dataset: nothing to rank, so show a placeholder row instead of
        # failing while formatting a score that does not exist.
        best_score = 0.0
        table_lines.append("| N/A  | N/A | N/A |\n")
        markdown_max = "#### 🙄 No technology found in the dataset"

    st.session_state.best_result = markdown_max
    st.session_state.top_5_results = "".join(table_lines)
    st.session_state.show_generate_button = best_score < generate_threshold

st.title("Search technologies from a dataset")

tech = st.text_input("Technology title 👇", placeholder="e.g Virtual Private Network", key="tech_input")

# Seed the session state on the first run of a session so the display code
# below always finds every key it reads.
if 'best_result' not in st.session_state:
    st.session_state.best_result = "#### 🙄 No search have been made yet"
if 'top_5_results' not in st.session_state:
    # Placeholder row shown under the table header until a search is run.
    markdown += "| N/A  | N/A | N/A |\n"
    st.session_state.top_5_results = markdown
if 'show_generate_button' not in st.session_state:
    st.session_state.show_generate_button = False
if 'generate_answer' not in st.session_state:
    st.session_state.generate_answer = False
if 'generate_text' not in st.session_state:
    st.session_state.generate_text = ""

# The lambda defers reading the text input until the button is clicked.
# search_and_retrieve takes (user_input, dataset), so the loaded dataset is
# passed as the second argument.
st.button("Search 🔍", on_click=lambda: search_and_retrieve(st.session_state.tech_input, dataset))

# Display the results stored in the session state by the search callback
st.markdown(f"{st.session_state.best_result}")
st.markdown(f"{st.session_state.top_5_results}")

# Generation is offered only when the search callback flagged a weak best match.
if st.session_state.show_generate_button:
    # generate_tech and the "instructions" key are resolved when the button is
    # clicked, not when this line runs.
    st.button("Generate your technology", on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.instructions))
    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")


def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe a technology as a single JSON object.

    A prompt is built from the user's technology title and optional extra
    instructions and sent to the ``gemini-2.5-flash`` model with the Google
    Search tool enabled. When the response carries text, it is stored in
    ``st.session_state.generate_text`` and ``st.session_state.generate_answer``
    is set to ``True``; otherwise the session state is left untouched.

    The prompt asks for exactly one valid JSON object because the generated
    text is later parsed as JSON before being added to the dataset.

    Args:
        user_input: Technology title typed by the user.
        user_instructions: Optional free-text guidance inserted in the prompt.
    """
    prompt = f"""
    # ROLE

    You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

    # OBJECTIVE

    Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
    Create a complete JSON object according to the schema below.
    Your final output must be a single, valid JSON document containing a technology you created.
    The technology should be described with sentences.

    # INSTRUCTIONS & RULES

    1.  **JSON Object Output**: Your entire response MUST be a single, valid JSON object starting with `{{` and ending with `}}`.
    Do not include any explanatory text before or after the JSON.
    2.  **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
    3.  **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
    Do not use single keywords.
    4.  **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

    # JSON SCHEMA & EXAMPLE

    Your output must be a JSON object matching this structure. Note how `purpose` and `problem_types_solved` contain full sentences.

    {{"name": "Generative Watermarking",
      "purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source.",
      "problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes.",
      "advantages": "Way faster to generate by an AI",
      "limitations": "Takes a lot of computational time to generate",
      "domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
    }}

    Take into account those additionnal informations if there is any:
    {user_instructions}
    ---
    ***NOW, BEGIN THE TASK.***

    <USER_INPUT>
    {user_input}
    </USER_INPUT>
    """

    client = Client(api_key=GEMINI_API_KEY)

    # Let the model ground its answer with Google Search
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    # Only publish an answer when the model actually returned some text.
    if response.text:
        st.session_state.generate_answer = True
        st.session_state.generate_text = response.text


def send_to_dataset(data, dataset):
    """Parse the generated technology and push it to the Hub dataset.

    The first JSON object found in ``data`` is decoded, an embedding of its
    string representation is stored under the ``"embeddings"`` key, the
    resulting item is appended to ``dataset`` and the updated dataset is
    pushed to the ``heymenn/Technologies`` repository.

    Args:
        data: Text produced by the generation step, expected to contain a
            JSON object (possibly surrounded by other text).
        dataset: Dataset object providing ``add_item``; the object returned
            by ``add_item`` must provide ``push_to_hub``.

    Raises:
        ValueError: If ``data`` contains no ``{`` or the text starting there
            is not valid JSON (``json.JSONDecodeError`` is a subclass of
            ``ValueError``).
    """
    # Line breaks are removed so that raw newlines inside string values do
    # not make the JSON invalid.
    flattened = data.replace('\n', '')
    start = flattened.find("{")
    if start == -1:
        raise ValueError("No JSON object found in the generated text")

    # raw_decode parses one complete JSON value starting at `start` and
    # ignores whatever follows it, so closing braces inside string values or
    # nested objects do not truncate the payload.
    json_data, _ = json.JSONDecoder().raw_decode(flattened, start)

    data_embedding = model.encode(str(json_data))
    json_data["embeddings"] = data_embedding

    updated_dataset = dataset.add_item(json_data)
    updated_dataset.push_to_hub("heymenn/Technologies")


# Once a technology has been generated, display it and offer to push it to
# the dataset.
if st.session_state.generate_answer:
    st.markdown(st.session_state.generate_text)

    def _push_generated_entry():
        # The generated text is read from the session state at click time.
        send_to_dataset(st.session_state.generate_text, dataset)

    st.button("Send to dataset", on_click=_push_generated_entry)