ALLOUNE committed on
Commit
4d23c18
Β·
1 Parent(s): 1589b2d

add app with api and ui

Browse files
Files changed (6) hide show
  1. Dockerfile +18 -16
  2. api/main.py +1 -0
  3. app.py +29 -0
  4. requirements.txt +11 -2
  5. src/streamlit_app.py +0 -40
  6. streamlit_app/app.py +183 -0
Dockerfile CHANGED
@@ -1,21 +1,23 @@
1
- FROM python:3.9-slim
 
2
 
3
- WORKDIR /app
 
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
 
12
- COPY requirements.txt ./
13
- COPY src/ ./src/
14
 
15
- RUN pip3 install -r requirements.txt
 
 
16
 
17
- EXPOSE 8501
18
-
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
-
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
+ # 1. Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
 
4
+ # 2. Set the working directory in the container
5
+ WORKDIR /code
6
 
7
+ # 3. Copy the dependencies file and install them
8
+ # This is done in a separate step to leverage Docker's layer caching.
9
+ # The dependencies will only be re-installed if requirements.txt changes.
10
+ COPY ./requirements.txt /code/requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
12
 
13
+ # 4. Copy the rest of the application's code into the container
14
+ COPY . /code/
15
 
16
+ # 5. Expose the port that the app will run on
17
+ # Hugging Face Spaces expect the app to listen on port 7860
18
+ EXPOSE 7860
19
 
20
+ # 6. Define the command to run your app
21
+ # This command starts the Uvicorn server which serves your FastAPI app.
22
+ # --host 0.0.0.0 makes the server accessible from outside the container.
23
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
api/main.py ADDED
@@ -0,0 +1 @@
 
 
"""Minimal API sub-application; app.py mounts this under /api.

The previous placeholder only printed "hi" and defined no ``api_app``,
so ``from api.main import api_app`` in app.py failed at startup.
"""
from fastapi import FastAPI

api_app = FastAPI(title="Technologies API")


@api_app.get("/")
def root():
    # Simple liveness/landing endpoint for the mounted API.
    return {"message": "hi"}


print("hi")  # kept from the original module so import-time side effects are unchanged
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Top-level ASGI app: serves the FastAPI API under /api and the
Streamlit UI (wrapped as WSGI) under /ui, with / redirecting to /ui."""
import os

import uvicorn
from fastapi import FastAPI
from starlette.middleware.wsgi import WSGIMiddleware
from starlette.responses import RedirectResponse

# streamlit_wsgi makes a Streamlit script runnable as a WSGI app so it can
# be mounted inside FastAPI. NOTE(review): confirm this package name exists
# on PyPI as pinned in requirements.txt.
from streamlit_wsgi import StreamlitPatcher

# api/main.py must expose `api_app` (a FastAPI/ASGI app). Fail fast with a
# clear message instead of an opaque ImportError at container startup.
try:
    from api.main import api_app
except ImportError as err:
    raise RuntimeError(
        "api.main does not expose `api_app`; define a FastAPI instance "
        "named api_app in api/main.py"
    ) from err

# Create and patch the Streamlit app from the UI script.
sp = StreamlitPatcher()
streamlit_script_path = os.path.join("streamlit_app", "app.py")
streamlit_app = sp.create_app(streamlit_script_path)
sp.patch()  # Apply patches for Streamlit to work correctly

# The main FastAPI app that will serve everything.
app = FastAPI()

# Mount the API and the Streamlit UI.
app.mount('/api', api_app)
app.mount('/ui', WSGIMiddleware(streamlit_app))


@app.get("/")
def root():
    """Redirect the bare root URL to the Streamlit UI."""
    return RedirectResponse(url="/ui")
requirements.txt CHANGED
@@ -1,3 +1,12 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
1
  pandas
2
+ streamlit
3
+ numpy
4
+ sentence-transformers
5
+ fuzzywuzzy
6
+ fastapi
7
+ uvicorn[standard]
8
+ requests
9
+ streamlit-wsgi
10
+ huggingface_hub
11
+ datasets
12
+ pydantic
13
+ google-genai
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit_app/app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit UI: search a technology dataset by combined semantic +
fuzzy similarity, optionally generating new entries via Gemini."""
import numpy as np
import pandas as pd
import streamlit as st
import sentence_transformers
from fuzzywuzzy import fuzz
from google.genai import Client, types
import json
from datasets import load_dataset, Dataset

from huggingface_hub import hf_hub_download

# Fetch the precomputed embeddings and the technology catalogue from the Hub.
pickle_path = hf_hub_download(repo_id="heymenn/Technologies", filename="embeddings.pkl", repo_type="dataset")
csv_path = hf_hub_download(repo_id="heymenn/Technologies", filename="technologies.csv", repo_type="dataset")

# Sentence-embedding model used for semantic similarity scoring.
model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# (The unused `embeddings = []` placeholder was removed — nothing read it.)

df_csv = pd.read_csv(csv_path)  # technology metadata — assumes a "name" column; TODO confirm
df_pickle = pd.read_pickle(pickle_path)  # precomputed vectors — assumes an "embbedings" column (sic); TODO confirm

# Markdown header reused as the default/empty results table.
markdown = ""
markdown += "| Score | Technology | Similarity Method |\n"
markdown += "|---------|-------------------------------------------------|---------------------------|\n"
def search_and_retrieve(user_input, embeddings_df, technologies_df):
    """Score every catalogued technology against *user_input* and publish results.

    Blends cosine similarity of sentence embeddings with a fuzzy token-set
    ratio, stores the best match and a top-5 markdown table in
    ``st.session_state``, and enables the "generate" button when the best
    combined score falls below 0.7.
    """
    user_embedding = model.encode(user_input)
    results = []
    best = {"score": 0.0, "technology": "", "type": ""}
    alpha = 0.6  # weight of the semantic score vs. the fuzzy score

    # NOTE: "embbedings" is how the column is actually spelled in the dataset.
    for i, embedding in enumerate(embeddings_df["embbedings"]):
        name = technologies_df.iloc[i]["name"]

        # model.similarity returns a 1x1 tensor; collapse it to a plain float
        # once, so no downstream code needs fragile [0][0] indexing.
        cosim = float(model.similarity(embedding, user_embedding))
        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100

        combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
        result = {"score": combined_score, "technology": name, "type": "Cosim + Levenshtein"}
        if combined_score > best["score"]:
            best = result
        results.append(result)

    top_5 = sorted(results, key=lambda x: x["score"], reverse=True)[:5]

    table = ""
    table += "| Score | Technology | Similarity Method |\n"
    table += "|---------|-------------------------------------------------|---------------------------|\n"
    for item in top_5:
        table += f"| {item['score']:.4f} | {item['technology']} | {item['type']} |\n"

    if best["technology"]:
        st.session_state.best_result = (
            f"**{best['technology']}** have been found with a confidence score of **{best['score']:.4f}**"
        )
    else:
        # Guard: with an empty embeddings table the original crashed on
        # max_result['score'][0][0] (subscripting the int 0).
        st.session_state.best_result = "#### No matching technology could be scored"
    st.session_state.top_5_results = table

    # Offer generation only when no confident match was found.
    st.session_state.show_generate_button = best["score"] < 0.7
st.title("Search technologies from a dataset")

# Text box whose value is mirrored into st.session_state.tech_input.
tech = st.text_input("Technology title 👇", placeholder="e.g Virtual Private Network", key="tech_input")

# One-time initialisation of every session_state slot read below; Streamlit
# re-runs this whole script on each interaction, so every default is guarded.
if 'best_result' not in st.session_state:
    st.session_state.best_result = "#### 🙄 No search have been made yet"
if 'top_5_results' not in st.session_state:
    # Show a placeholder row; do not mutate the module-level `markdown`
    # header in place (the original `markdown += ...` did).
    st.session_state.top_5_results = markdown + "| N/A | N/A | N/A |\n"
if 'show_generate_button' not in st.session_state:
    st.session_state.show_generate_button = False
if 'generate_answer' not in st.session_state:
    st.session_state.generate_answer = False
if 'generate_text' not in st.session_state:
    st.session_state.generate_text = ""

# The search runs in the on_click callback so results land in session_state
# before the script re-runs and renders them.
st.button("Search 🔍", on_click=lambda: search_and_retrieve(st.session_state.tech_input, df_pickle, df_csv))

# Display results after the callback has stored them.
st.markdown(f"{st.session_state.best_result}")
st.markdown(f"{st.session_state.top_5_results}")

if st.session_state.show_generate_button:
    # .get(): the "instructions" widget is created *after* this button, so
    # the key may be absent the first time the callback fires.
    st.button(
        "Generate your technology",
        on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.get("instructions", "")),
    )
    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")
def generate_tech(user_input, user_instructions):
    """Ask Gemini to draft a structured technology entry for *user_input*.

    Stores the raw model answer in ``st.session_state.generate_text`` and
    sets ``generate_answer`` so the UI renders it on the next rerun.
    *user_instructions* is free-form extra guidance injected into the prompt.
    """
    import os

    prompt = f"""
# ROLE

You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE

Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES

1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE

Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***

<USER_INPUT>
{user_input}
</USER_INPUT>
"""

    # SECURITY: the API key used to be hard-coded here, leaking it in the
    # repository history — that key must be revoked. Read it from the
    # environment (e.g. a Space secret) instead.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        st.error("GEMINI_API_KEY is not set; cannot call the generation API.")
        return

    client = Client(api_key=api_key)

    # Ground the generation with Google Search results.
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings.
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    if response.text:
        st.session_state.generate_answer = True
        st.session_state.generate_text = response.text
def send_to_dataset(data, df_csv, df_pickle):
    """Extract the generated JSON entry from *data*, embed it, and push it
    to the heymenn/Technologies dataset on the Hub.

    *df_csv* and *df_pickle* are accepted for interface compatibility with
    the button callback but are not used here.
    """
    # Slice out the JSON object exactly once (the original sliced twice).
    # rfind() for the closing brace so nested objects are not truncated at
    # the *first* '}'.
    start = data.find("{")
    end = data.rfind("}")
    if start == -1 or end <= start:
        st.error("No JSON object found in the generated answer.")
        return
    json_data = json.loads(data[start:end + 1].replace('\n', ''))

    # .tolist(): a raw numpy array is not serialisable by `datasets`/JSON.
    json_data["embeddings"] = model.encode(str(json_data)).tolist()

    dataset = load_dataset("heymenn/Technologies", split="train")
    updated_dataset = dataset.add_item(json_data)
    updated_dataset.push_to_hub("heymenn/Technologies")
# Render the generated proposal and let the user publish it to the dataset.
state = st.session_state
if state.generate_answer:
    st.markdown(state.generate_text)
    st.button("Send to dataset", on_click=lambda: send_to_dataset(state.generate_text, df_csv, df_pickle))