# Source: Hugging Face Space (status when captured: Sleeping)
import json
import os

# The Hugging Face libraries resolve their cache locations when they are first
# imported, so HF_HOME must be exported before any of them is loaded;
# assigning it after the imports (as before) has no effect on them.
os.environ["HF_HOME"] = "/home/user/huggingface"

import numpy as np  # noqa: E402
import pandas as pd  # noqa: E402
import sentence_transformers  # noqa: E402
import streamlit as st  # noqa: E402
from datasets import Dataset, load_dataset  # noqa: E402
from fuzzywuzzy import fuzz  # noqa: E402
from google.genai import Client, types  # noqa: E402
from huggingface_hub import hf_hub_download  # noqa: E402

# API key for the Gemini client; None when the variable is not set.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")


@st.cache_resource
def _load_embedding_model():
    """Build the sentence-embedding model once and share it across reruns.

    Streamlit re-executes this script on every interaction; caching the model
    as a resource avoids re-instantiating it each time.
    """
    return sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


# Embedding model used both for queries and for newly generated records.
model = _load_embedding_model()
embeddings = []
# Technology records searched by the app ("train" split of the hub dataset).
dataset = load_dataset("heymenn/Technologies", split="train")

# Header of the markdown results table; the UI appends a placeholder row to it
# before the first search.
markdown = "| Score | Technology | Purpose |\n"
markdown += "|---------|-------------------------------------------------|---------------------------|\n"
def search_and_retrieve(user_input, dataset, *, alpha=0.6, top_k=5, confidence_threshold=0.7):
    """Score every row of ``dataset`` against ``user_input`` and publish the results.

    Each row receives a combined score: ``alpha`` times the embedding
    similarity between the row's stored embedding and the encoded query, plus
    ``1 - alpha`` times the fuzzy token-set ratio (scaled to 0..1) between the
    query and the row's name.

    Nothing is returned. The outcome is written to ``st.session_state``:

    * ``best_result``: markdown sentence describing the best match,
    * ``top_5_results``: markdown table of the ``top_k`` highest scores,
    * ``show_generate_button``: True when the best score is below
      ``confidence_threshold`` (or when nothing matched at all).

    Args:
        user_input: Text typed by the user.
        dataset: Iterable of rows exposing the ``"name"``, ``"purpose"`` and
            ``"embeddings"`` keys.
        alpha: Weight given to the embedding similarity.
        top_k: Number of rows listed in the results table.
        confidence_threshold: Score under which generating a new technology
            is offered.
    """
    user_embedding = model.encode(user_input)

    results = []
    for row in dataset:
        name = row["name"]
        # ``model.similarity`` yields a tensor-like [[value]] for a single
        # pair. Collapsing it to a float right away keeps every later
        # comparison, sort and format on plain numbers.
        similarity = float(model.similarity(row["embeddings"], user_embedding)[0][0])
        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100
        results.append({
            "score": alpha * similarity + (1 - alpha) * fuzzy_score,
            "technology": name,
            "purpose": row["purpose"],
        })

    top_results = sorted(results, key=lambda item: item["score"], reverse=True)[:top_k]

    table_lines = [
        "| Score | Technology | Purpose |\n",
        "|---------|-------------------------------------------------|---------------------------|\n",
    ]
    table_lines.extend(
        f"| {item['score']:.4f} | {item['technology']} | {item['purpose']} |\n"
        for item in top_results
    )
    if not top_results:
        table_lines.append("| N/A | N/A | N/A |\n")

    # ``max`` keeps the first row among ties. A best score that is not
    # strictly positive counts as "no match", as does an empty dataset.
    best = max(results, key=lambda item: item["score"], default=None)
    if best is None or best["score"] <= 0:
        st.session_state.best_result = "No matching technology has been found"
        st.session_state.top_5_results = "".join(table_lines)
        st.session_state.show_generate_button = True
        return

    st.session_state.best_result = (
        f"**{best['technology']}** have been found with a confidence score of **{best['score']:.4f}**"
    )
    st.session_state.top_5_results = "".join(table_lines)
    st.session_state.show_generate_button = best["score"] < confidence_threshold
# --- Search page -------------------------------------------------------------
# NOTE(review): the "π" characters in the labels below look like mis-encoded
# emoji; they are kept as found because the intended glyphs cannot be
# recovered from here.
st.title("Search technologies from a dataset")
tech = st.text_input("Technology title π", placeholder="e.g Virtual Private Network", key="tech_input")

# Seed the session state on the first run so the display code below always
# has something to render.
if 'best_result' not in st.session_state:
    st.session_state.best_result = "#### π No search have been made yet"
if 'top_5_results' not in st.session_state:
    # Table header followed by a single placeholder row.
    st.session_state.top_5_results = markdown + "| N/A | N/A | N/A |\n"
if 'show_generate_button' not in st.session_state:
    st.session_state.show_generate_button = False
if 'generate_answer' not in st.session_state:
    st.session_state.generate_answer = False
if 'generate_text' not in st.session_state:
    st.session_state.generate_text = ""

# The callback reads the query from session state at click time and searches
# the loaded dataset. (It previously passed two undefined names, which made
# every click fail.)
st.button("Search π", on_click=lambda: search_and_retrieve(st.session_state.tech_input, dataset))

# Display results after the function has been called
st.markdown(f"{st.session_state.best_result}")
st.markdown(f"{st.session_state.top_5_results}")

# Offered only when the last search asked for it (low confidence).
# NOTE(review): the instructions field is assumed to belong to this branch;
# the captured source had lost its indentation - confirm.
if st.session_state.show_generate_button:
    st.button("Generate your technology", on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.instructions))
    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")
def generate_tech(user_input, user_instructions):
    """Ask Gemini to describe a technology as a JSON object and store the answer.

    The prompt embeds ``user_input`` and the optional ``user_instructions``,
    and the request enables the Google Search tool. When the response carries
    text, ``st.session_state.generate_answer`` is set to True and the text is
    saved in ``st.session_state.generate_text``; otherwise the session state
    is left untouched. Nothing is returned.

    The prompt asks for a single JSON object and shows a valid JSON example,
    because the stored answer is later parsed as JSON before being added to
    the dataset.

    Args:
        user_input: Technology title typed by the user.
        user_instructions: Optional extra guidance inserted into the prompt.
    """
    prompt = f"""
# ROLE
You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.
# OBJECTIVE
Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.
# INSTRUCTIONS & RULES
1. **JSON Output**: Your entire response MUST be a single, valid JSON object.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.
# JSON SCHEMA & EXAMPLE
Your output must be a JSON object matching this structure. Note how `purpose` and `problem_types_solved` contain full sentences.
{{"name": "Generative Watermarking",
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source.",
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes.",
"advantages": "Way faster to generate by an AI",
"limitations": "Takes a lot of computational time to generate",
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}
Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***
<USER_INPUT>
{user_input}
</USER_INPUT>
"""
    client = Client(api_key=GEMINI_API_KEY)

    # Define the grounding tool
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    # Only publish an answer when the model actually returned text.
    if response.text:
        st.session_state.generate_answer = True
        st.session_state.generate_text = response.text
def _extract_first_json_object(text):
    """Return the first JSON object found in ``text`` as a dict.

    Newlines are removed first (as the original parsing did), then the object
    starting at the first ``{`` is decoded. Decoding stops at that object's
    own closing brace, so nested objects and braces inside string values are
    handled, and any text after the object is ignored.

    Raises:
        json.JSONDecodeError: if ``text`` has no ``{`` or the text starting
            there is not valid JSON.
    """
    flattened = text.replace('\n', '')
    start = flattened.find("{")
    if start == -1:
        raise json.JSONDecodeError("No JSON object found in generated text", flattened, 0)
    parsed, _end = json.JSONDecoder().raw_decode(flattened, start)
    return parsed


def send_to_dataset(data, dataset):
    """Parse a generated technology and push it to the hub dataset.

    The first JSON object in ``data`` is decoded, the string form of that
    record is embedded with the module-level ``model``, the embedding is
    stored under the ``"embeddings"`` key, and the record is appended to
    ``dataset``. The dataset returned by ``add_item`` is then pushed to
    ``heymenn/Technologies``. Nothing is returned.

    Args:
        data: Raw generated text containing a JSON object.
        dataset: Dataset the new record is appended to.

    Raises:
        json.JSONDecodeError: if no valid JSON object can be extracted.
    """
    json_data = _extract_first_json_object(data)
    # The embedding is computed before it is attached, so it covers only the
    # descriptive fields of the record.
    json_data["embeddings"] = model.encode(str(json_data))
    updated_dataset = dataset.add_item(json_data)
    updated_dataset.push_to_hub("heymenn/Technologies")
# Once a generation has produced text, render it and offer to push it to the
# dataset. The callback reads the stored text from session state at click time.
if st.session_state.generate_answer:
    st.markdown(st.session_state.generate_text)
    st.button(
        "Send to dataset",
        on_click=lambda: send_to_dataset(st.session_state.generate_text, dataset),
    )