Spaces:
Build error
Build error
ALLOUNE committed on
Commit ·
4d23c18
1
Parent(s): 1589b2d
add app with api and ui
Browse files- Dockerfile +18 -16
- api/main.py +1 -0
- app.py +29 -0
- requirements.txt +11 -2
- src/streamlit_app.py +0 -40
- streamlit_app/app.py +183 -0
Dockerfile
CHANGED
|
@@ -1,21 +1,23 @@
|
|
| 1 |
-
|
|
|
|
| 2 |
|
| 3 |
-
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 11 |
|
| 12 |
-
|
| 13 |
-
COPY
|
| 14 |
|
| 15 |
-
|
|
|
|
|
|
|
| 16 |
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
| 20 |
-
|
| 21 |
-
ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
|
|
|
|
| 1 |
+
# 1. Use an official Python runtime as a parent image
FROM python:3.10-slim

# 2. Set the working directory in the container
WORKDIR /code

# 3. Copy the dependencies file and install them
# This is done in a separate step to leverage Docker's layer caching.
# The dependencies will only be re-installed if requirements.txt changes.
COPY ./requirements.txt /code/requirements.txt
RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt

# 4. Copy the rest of the application's code into the container
COPY . /code/

# 5. Expose the port that the app will run on
# Hugging Face Spaces expect the app to listen on port 7860
EXPOSE 7860

# 6. Define the command to run your app
# This command starts the Uvicorn server which serves your FastAPI app.
# --host 0.0.0.0 makes the server accessible from outside the container.
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
|
|
|
api/main.py
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
"""Minimal API sub-application.

app.py does ``from api.main import api_app`` and mounts it at ``/api``, so this
module MUST export a FastAPI instance named ``api_app``.  The previous content
(a bare ``print("hi")``) defined no such name, which made that import fail at
startup — the "Build error" in the commit.
"""
from fastapi import FastAPI

api_app = FastAPI(title="api")


@api_app.get("/health")
def health() -> dict:
    """Liveness probe so the mounted API answers something at /api/health."""
    return {"status": "ok"}
|
app.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import uvicorn
from fastapi import FastAPI
from starlette.middleware.wsgi import WSGIMiddleware
from starlette.responses import RedirectResponse

# Use this library to make Streamlit WSGI-compatible.
# NOTE(review): this comes from the `streamlit-wsgi` requirement — confirm the
# installed package actually exposes `StreamlitPatcher` with this API.
from streamlit_wsgi import StreamlitPatcher

# Import your FastAPI app.
# NOTE(review): api/main.py currently contains only `print("hi")` and does NOT
# define `api_app`, so this import raises at startup — the likely build error.
from api.main import api_app

# Create and patch the Streamlit app so it can be served as a WSGI sub-app.
sp = StreamlitPatcher()
streamlit_script_path = os.path.join("streamlit_app", "app.py")
streamlit_app = sp.create_app(streamlit_script_path)
sp.patch()  # Apply patches for Streamlit to work correctly

# The main FastAPI app that will serve everything.
app = FastAPI()

# Mount the API and the Streamlit UI under distinct path prefixes.
app.mount('/api', api_app)
app.mount('/ui', WSGIMiddleware(streamlit_app))

# NOTE(review): `uvicorn` is imported but never used here — the server is
# launched by the Dockerfile CMD instead; candidate for removal.


@app.get("/")
def root():
    # Redirect root to the Streamlit UI
    return RedirectResponse(url="/ui")
|
requirements.txt
CHANGED
|
@@ -1,3 +1,12 @@
|
|
| 1 |
-
altair
|
| 2 |
pandas
|
| 3 |
-
streamlit
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
pandas
|
| 2 |
+
streamlit
|
| 3 |
+
numpy
|
| 4 |
+
sentence-transformers
|
| 5 |
+
fuzzywuzzy
|
| 6 |
+
fastapi
|
| 7 |
+
uvicorn[standard]
|
| 8 |
+
requests
|
| 9 |
+
streamlit-wsgi
|
| 10 |
+
huggingface_hub
|
| 11 |
+
datasets
|
| 12 |
+
pydantic
|
src/streamlit_app.py
DELETED
|
@@ -1,40 +0,0 @@
|
|
| 1 |
-
import altair as alt
|
| 2 |
-
import numpy as np
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import streamlit as st
|
| 5 |
-
|
| 6 |
-
"""
|
| 7 |
-
# Welcome to Streamlit!
|
| 8 |
-
|
| 9 |
-
Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
|
| 10 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 11 |
-
forums](https://discuss.streamlit.io).
|
| 12 |
-
|
| 13 |
-
In the meantime, below is an example of what you can do with just a few lines of code:
|
| 14 |
-
"""
|
| 15 |
-
|
| 16 |
-
num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
|
| 17 |
-
num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
|
| 18 |
-
|
| 19 |
-
indices = np.linspace(0, 1, num_points)
|
| 20 |
-
theta = 2 * np.pi * num_turns * indices
|
| 21 |
-
radius = indices
|
| 22 |
-
|
| 23 |
-
x = radius * np.cos(theta)
|
| 24 |
-
y = radius * np.sin(theta)
|
| 25 |
-
|
| 26 |
-
df = pd.DataFrame({
|
| 27 |
-
"x": x,
|
| 28 |
-
"y": y,
|
| 29 |
-
"idx": indices,
|
| 30 |
-
"rand": np.random.randn(num_points),
|
| 31 |
-
})
|
| 32 |
-
|
| 33 |
-
st.altair_chart(alt.Chart(df, height=700, width=700)
|
| 34 |
-
.mark_point(filled=True)
|
| 35 |
-
.encode(
|
| 36 |
-
x=alt.X("x", axis=None),
|
| 37 |
-
y=alt.Y("y", axis=None),
|
| 38 |
-
color=alt.Color("idx", legend=None, scale=alt.Scale()),
|
| 39 |
-
size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
|
| 40 |
-
))
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
streamlit_app/app.py
ADDED
|
@@ -0,0 +1,183 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
import pandas as pd
import streamlit as st
import sentence_transformers
from fuzzywuzzy import fuzz
from google.genai import Client, types
import json
from datasets import load_dataset, Dataset


from huggingface_hub import hf_hub_download

# Fetch the pre-computed embeddings and the technologies table from the
# heymenn/Technologies dataset repository on the Hugging Face Hub.
pickle_path = hf_hub_download(repo_id="heymenn/Technologies", filename="embeddings.pkl", repo_type="dataset")
csv_path = hf_hub_download(repo_id="heymenn/Technologies", filename="technologies.csv", repo_type="dataset")


# Sentence-embedding model used both for user queries and for new entries.
model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
embeddings = []  # NOTE(review): never read anywhere below — candidate for removal.

df_csv = pd.read_csv(csv_path)  # one row per technology; must expose a "name" column
# NOTE(review): the search loop reads column "embbedings" (sic) from this frame —
# confirm that spelling against the actual pickle on the Hub.
df_pickle = pd.read_pickle(pickle_path)

# Markdown table header, reused below as the default "no results yet" table.
markdown = ""
markdown += "| Score | Technology | Similarity Method |\n"
markdown += "|---------|-------------------------------------------------|---------------------------|\n"
def search_and_retrieve(user_input, embeddings_df, technologies_df):
    """Rank every known technology against *user_input* and publish results.

    Scores combine cosine similarity between sentence embeddings (weight 0.6)
    with a fuzzy token-set ratio on the technology name (weight 0.4).  The best
    match and a markdown table of the top 5 are written to st.session_state;
    `show_generate_button` is set when the best score is below 0.7 so the UI
    can offer LLM generation as a fallback.
    """
    user_embedding = model.encode(user_input)
    results = []
    # Named max_result (not `max`) to avoid shadowing the builtin.
    max_result = {"score": 0.0, "technology": "", "type": ""}

    # NOTE(review): column name "embbedings" (sic) must match the pickle file.
    for i, embedding in enumerate(embeddings_df["embbedings"]):
        name = technologies_df.iloc[i]["name"]

        # FIX: model.similarity returns a 1x1 tensor; collapse it to a plain
        # float here so downstream code never needs the fragile [0][0] indexing
        # the original used when formatting scores.
        cosim = float(model.similarity(embedding, user_embedding))
        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100

        alpha = 0.6  # weight of embedding similarity vs. fuzzy name match
        combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
        result = {"score": combined_score, "technology": name, "type": "Cosim + Levenshtein"}
        if combined_score > max_result["score"]:
            max_result = result

        results.append(result)

    top_5 = sorted(results, key=lambda x: x['score'], reverse=True)[:5]

    # Rebuild the markdown table for this search (local; does not touch the
    # module-level default table).
    markdown = ""
    markdown += "| Score | Technology | Similarity Method |\n"
    markdown += "|---------|-------------------------------------------------|---------------------------|\n"
    for item in top_5:
        markdown += f"| {item['score']:.4f} | {item['technology']} | {item['type']} |\n"

    markdown_max = f"**{max_result['technology']}** have been found with a confidence score of **{max_result['score']:.4f}**"

    # Store results in session_state so they survive the Streamlit rerun.
    st.session_state.best_result = markdown_max
    st.session_state.top_5_results = markdown

    # Offer the "generate" fallback only when no convincing match was found.
    st.session_state.show_generate_button = max_result["score"] < 0.7
| 72 |
+
|
| 73 |
+
st.title("Search technologies from a dataset")

tech = st.text_input("Technology title π", placeholder="e.g Virtual Private Network", key="tech_input")

# Seed every session_state slot on first run so the display code below can
# read them unconditionally.
if 'best_result' not in st.session_state:
    st.session_state.best_result = "#### π No search have been made yet"
if 'top_5_results' not in st.session_state:
    # Appends a placeholder row to the module-level header table.
    markdown += f"| N/A | N/A | N/A |\n"
    st.session_state.top_5_results = markdown
if 'show_generate_button' not in st.session_state:
    st.session_state.show_generate_button = False
if 'generate_answer' not in st.session_state:
    st.session_state.generate_answer = False
if 'generate_text' not in st.session_state:
    st.session_state.generate_text = ""

# Pass a lambda function to on_click, which then calls your search_and_retrieve function
st.button("Search π", on_click=lambda: search_and_retrieve(st.session_state.tech_input, df_pickle, df_csv))

# Display results after the function has been called
st.markdown(f"{st.session_state.best_result}")
st.markdown(f"{st.session_state.top_5_results}")

if st.session_state.show_generate_button:
    # NOTE(review): the callback reads st.session_state.instructions, whose
    # widget is only created on the following line — confirm the key exists
    # when the button is clicked on its first render.
    st.button("Generate your technology", on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.instructions))
    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")
|
| 100 |
+
|
| 101 |
+
def generate_tech(user_input, user_instructions):
    """Ask Gemini to invent a structured technology entry for *user_input*.

    The raw model answer is stored in st.session_state.generate_text and
    st.session_state.generate_answer is flagged so the UI renders it.
    *user_instructions* is free-form extra guidance interpolated into the prompt.
    """
    prompt = f"""
# ROLE

You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE

Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES

1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE

Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***

<USER_INPUT>
{user_input}
</USER_INPUT>
"""

    # SECURITY FIX: the Gemini API key was hard-coded here (and is now leaked
    # in the repo history — it must be revoked).  Read it from the environment
    # instead; on HF Spaces configure it as a repository secret.
    import os
    client = Client(api_key=os.environ["GEMINI_API_KEY"])

    # Define the grounding tool
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    if response.text:
        st.session_state.generate_answer = True
        st.session_state.generate_text = response.text
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def send_to_dataset(data, df_csv, df_pickle):
    """Parse the generated JSON blob and append it to the Hub dataset.

    *data* is the raw LLM answer; only the first ``{...}`` object found in it
    is used.  df_csv / df_pickle are kept for interface compatibility with the
    on_click callback but are not used here.
    """
    # FIX: extract the first JSON object and parse it once — the original
    # applied the identical find/replace slicing twice in a row.
    json_text = data[data.find("{"):data.find("}") + 1].replace('\n', '')
    json_data = json.loads(json_text)

    # Embed the whole record so it can be retrieved by the search above.
    # Stored as a plain list: `datasets` features handle lists more reliably
    # than raw numpy arrays.
    data_embedding = model.encode(str(json_data))
    # NOTE(review): the search loop reads column "embbedings" (sic) from the
    # pickle while this writes "embeddings" — confirm which spelling the Hub
    # dataset uses, otherwise newly pushed rows will never be matched.
    json_data["embeddings"] = data_embedding.tolist()

    dataset = load_dataset("heymenn/Technologies", split="train")
    updated_dataset = dataset.add_item(json_data)
    updated_dataset.push_to_hub("heymenn/Technologies")
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
# Render the generated technology and offer to persist it to the Hub dataset.
if st.session_state.generate_answer:
    st.markdown(st.session_state.generate_text)
    st.button("Send to dataset", on_click=lambda: send_to_dataset(st.session_state.generate_text, df_csv, df_pickle))
|