Spaces:

heymenn
/

Search-Technologies-V2

Sleeping

App Files Files Community

Search-Technologies-V2 / src /streamlit_app.py

heymenn

Update src/streamlit_app.py

cdfed98 verified 7 months ago

raw

history blame contribute delete

7.32 kB

	import numpy as np
	import pandas as pd
	import streamlit as st
	import sentence_transformers
	from fuzzywuzzy import fuzz
	from google.genai import Client, types
	import json
	from datasets import load_dataset, Dataset
	from huggingface_hub import hf_hub_download
	import os
	from datasets import load_dataset


	# --- Configuration and shared resources -------------------------------------
	# Streamlit re-executes this script from top to bottom on every interaction,
	# so anything expensive created here must be cached.

	# API key handed to the Gemini client in generate_tech; None when unset.
	GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
	# NOTE(review): the Hugging Face libraries are imported above this line and
	# appear to resolve HF_HOME at import time, so this assignment may not move
	# their cache directory -- confirm, and set it before the imports if needed.
	os.environ["HF_HOME"] = "/home/user/huggingface"


	@st.cache_resource
	def _load_embedding_model():
	    """Build the sentence-embedding model once and reuse it across reruns."""
	    return sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')


	model = _load_embedding_model()
	embeddings = []  # not referenced anywhere in the visible file

	# Reloaded on every run on purpose: send_to_dataset pushes new rows to the
	# hub, and a cached copy would hide them from later searches.
	dataset = load_dataset("heymenn/Technologies", split="train")


	# Header of the Markdown results table. The pipes must be plain "|": an
	# escaped "\|" is a literal character in Markdown, not a column separator
	# (and "\|" is an invalid escape sequence in a Python string literal).
	markdown = "| Score | Technology | Purpose |\n"
	markdown += "|---------|-------------------------------------------------|---------------------------|\n"


	def _md_cell(value):
	    """Return *value* as text that is safe inside a Markdown table cell."""
	    # An unescaped pipe or a line break would end the cell or the row early.
	    return str(value).replace("|", "\\|").replace("\n", " ")


	def search_and_retrieve(user_input, dataset):
	    """Score every dataset row against the query and publish the results.

	    Each row is scored as ``alpha * similarity + (1 - alpha) * fuzzy`` where
	    ``similarity`` comes from ``model.similarity`` between the row's stored
	    ``embeddings`` and the encoded query, and ``fuzzy`` is
	    ``fuzz.token_set_ratio`` between the query and the row's ``name``,
	    scaled to 0..1.

	    Args:
	        user_input: Text typed by the user.
	        dataset: Iterable of rows exposing ``name``, ``embeddings`` and
	            ``purpose`` keys.

	    Side effects (``st.session_state``):
	        best_result: One-line summary of the highest-scoring row.
	        top_5_results: Markdown table of the five highest-scoring rows.
	        show_generate_button: True when the best score is below 0.7 (or
	            when there is nothing to score).
	    """
	    alpha = 0.6  # weight of the embedding similarity; fuzzy gets 1 - alpha
	    user_embedding = model.encode(user_input)

	    results = []
	    for row in dataset:
	        name = row["name"]
	        cosim = model.similarity(row["embeddings"], user_embedding)
	        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100
	        combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
	        # similarity() yields a tensor-like [[value]]; unwrap it once here so
	        # sorting, formatting and thresholding all work on plain floats.
	        results.append({
	            "score": float(combined_score[0][0]),
	            "technology": name,
	            "purpose": row["purpose"],
	        })

	    # sorted() is stable, so ties keep dataset order and top_5[0] is the
	    # first row that reached the maximum score.
	    top_5 = sorted(results, key=lambda item: item["score"], reverse=True)[:5]

	    table_lines = [
	        "| Score | Technology | Purpose |",
	        "|---------|-------------------------------------------------|---------------------------|",
	    ]
	    table_lines.extend(
	        f"| {item['score']:.4f} | {_md_cell(item['technology'])} | {_md_cell(item['purpose'])} |"
	        for item in top_5
	    )
	    st.session_state.top_5_results = "\n".join(table_lines) + "\n"

	    if not top_5:
	        # Nothing to rank: report it instead of indexing a missing best row,
	        # and let the user generate the technology.
	        st.session_state.best_result = "No technology could be scored: the dataset is empty"
	        st.session_state.show_generate_button = True
	        return

	    best = top_5[0]
	    st.session_state.best_result = (
	        f"{best['technology']} have been found with a confidence score of {best['score']:.4f}"
	    )
	    # A weak best match means the technology is probably missing from the
	    # dataset, so offer to generate it.
	    st.session_state.show_generate_button = best["score"] < 0.7

	# --- Search page -------------------------------------------------------------
	st.title("Search technologies from a dataset")

	tech = st.text_input("Technology title 👇", placeholder="e.g Virtual Private Network", key="tech_input")

	# Seed session state on the first run so the widgets below always have
	# something to display before any search or generation happened.
	if 'best_result' not in st.session_state:
	    st.session_state.best_result = "#### 🙄 No search have been made yet"
	if 'top_5_results' not in st.session_state:
	    # Placeholder row under the module-level table header; plain pipes so
	    # Markdown treats them as column separators.
	    markdown += "| N/A | N/A | N/A |\n"
	    st.session_state.top_5_results = markdown
	if 'show_generate_button' not in st.session_state:
	    st.session_state.show_generate_button = False
	if 'generate_answer' not in st.session_state:
	    st.session_state.generate_answer = False
	if 'generate_text' not in st.session_state:
	    st.session_state.generate_text = ""

	# The lambda defers reading tech_input until the click is processed, so the
	# callback sees the text currently in the box. search_and_retrieve takes
	# (user_input, dataset); it is given the dataset loaded at module level.
	st.button("Search 🔍", on_click=lambda: search_and_retrieve(st.session_state.tech_input, dataset))

	# Display results after the function has been called
	st.markdown(f"{st.session_state.best_result}")
	st.markdown(f"{st.session_state.top_5_results}")

	# Only offered when the last search set show_generate_button.
	if st.session_state.show_generate_button:
	    # generate_tech is defined further down the script; the name is resolved
	    # when the callback runs, not when this line executes.
	    st.button("Generate your technology", on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.instructions))
	    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")


	def generate_tech(user_input, user_instructions):
	    """Ask Gemini to describe a technology as JSON and store the reply.

	    Builds a prompt from the technology title and the optional user
	    instructions, sends it to ``gemini-2.5-flash`` with the Google Search
	    tool enabled, and records the outcome in ``st.session_state``.

	    Args:
	        user_input: Technology title entered by the user.
	        user_instructions: Optional free-text guidance inserted in the prompt.

	    Side effects (``st.session_state``):
	        generate_answer: True when the model returned text, otherwise False.
	        generate_text: The returned text, or "" when there is none.
	    """
	    # The reply is parsed with json.loads in send_to_dataset, so the prompt
	    # asks for exactly one JSON object and shows a syntactically valid one.
	    prompt = f"""
	# ROLE

	You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

	# OBJECTIVE

	Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
	Create a complete JSON object according to the schema below.
	Your final output must be a single, valid JSON document containing a technology you created.
	The technology should be described with sentences.

	# INSTRUCTIONS & RULES

	1. JSON Output: Your entire response MUST be a single JSON code block containing exactly one JSON object.
	Do not include any explanatory text before or after the JSON.
	2. Discover and Iterate: Your primary task is to understand the technology and create a JSON entry for it.
	3. Descriptive Sentences: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
	Do not use single keywords.
	4. Infer Where Necessary: The source material may not contain all details. Infer plausible information based on the context.

	# JSON SCHEMA & EXAMPLE

	Your output must be a single JSON object matching this structure. Note how `purpose` and `problem_types_solved` contain full sentences.

	{{"name": "Generative Watermarking",
	"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source.",
	"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes.",
	"advantages": "Way faster to generate by an AI",
	"limitations": "Takes a lot of computational time to generate",
	"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
	}}

	Take into account those additionnal informations if there is any:
	{user_instructions}
	---
	*NOW, BEGIN THE TASK.*

	<USER_INPUT>
	{user_input}
	</USER_INPUT>
	"""

	    client = Client(api_key=GEMINI_API_KEY)

	    # Let the model ground its answer with Google Search.
	    grounding_tool = types.Tool(google_search=types.GoogleSearch())
	    config = types.GenerateContentConfig(tools=[grounding_tool])

	    response = client.models.generate_content(
	        model="gemini-2.5-flash",
	        contents=prompt,
	        config=config,
	    )

	    generated = response.text
	    # Always overwrite both fields: leaving them untouched on an empty reply
	    # would keep a previous answer on screen, ready to be pushed as if new.
	    st.session_state.generate_answer = bool(generated)
	    st.session_state.generate_text = generated or ""


	def send_to_dataset(data, dataset):
	    """Parse the generated JSON object, embed it and push it to the hub.

	    Args:
	        data: Text containing a JSON object, possibly surrounded by other
	            text such as a Markdown code fence.
	        dataset: Dataset to extend; ``add_item`` is called on it and the
	            result is pushed to ``heymenn/Technologies``.

	    Raises:
	        ValueError: If ``data`` contains no ``{`` or the text starting there
	            is not valid JSON (``json.JSONDecodeError`` is a ``ValueError``).
	    """
	    # Newlines are dropped before parsing, as before, so a value broken
	    # across lines is joined back together.
	    flattened = data.replace('\n', '')
	    start = flattened.find("{")
	    if start == -1:
	        raise ValueError("No JSON object found in the generated text")

	    # raw_decode parses one complete JSON value starting at `start` and
	    # ignores whatever follows. Unlike cutting at the first "}", it copes
	    # with nested objects and with braces inside string values.
	    json_data, _ = json.JSONDecoder().raw_decode(flattened, start)

	    # Embed the record before the vector is attached, so the embedding
	    # covers only the descriptive fields.
	    data_embedding = model.encode(str(json_data))
	    json_data["embeddings"] = data_embedding

	    updated_dataset = dataset.add_item(json_data)
	    updated_dataset.push_to_hub("heymenn/Technologies")


	# Show the generated technology, if any, with a button to publish it.
	if st.session_state["generate_answer"]:

	    def _push_generated_text():
	        """Send the generated text currently in session state to the dataset."""
	        send_to_dataset(st.session_state["generate_text"], dataset)

	    st.markdown(st.session_state["generate_text"])
	    st.button("Send to dataset", on_click=_push_generated_text)