ALLOUNE committed on
Commit
4d23c18
Β·
1 Parent(s): 1589b2d

add app with api and ui

Browse files
Files changed (6) hide show
  1. Dockerfile +18 -16
  2. api/main.py +1 -0
  3. app.py +29 -0
  4. requirements.txt +11 -2
  5. src/streamlit_app.py +0 -40
  6. streamlit_app/app.py +183 -0
Dockerfile CHANGED
@@ -1,21 +1,23 @@
1
- FROM python:3.9-slim
 
2
 
3
- WORKDIR /app
 
4
 
5
- RUN apt-get update && apt-get install -y \
6
- build-essential \
7
- curl \
8
- software-properties-common \
9
- git \
10
- && rm -rf /var/lib/apt/lists/*
11
 
12
- COPY requirements.txt ./
13
- COPY src/ ./src/
14
 
15
- RUN pip3 install -r requirements.txt
 
 
16
 
17
- EXPOSE 8501
18
-
19
- HEALTHCHECK CMD curl --fail http://localhost:8501/_stcore/health
20
-
21
- ENTRYPOINT ["streamlit", "run", "src/streamlit_app.py", "--server.port=8501", "--server.address=0.0.0.0"]
 
1
+ # 1. Use an official Python runtime as a parent image
2
+ FROM python:3.10-slim
3
 
4
+ # 2. Set the working directory in the container
5
+ WORKDIR /code
6
 
7
+ # 3. Copy the dependencies file and install them
8
+ # This is done in a separate step to leverage Docker's layer caching.
9
+ # The dependencies will only be re-installed if requirements.txt changes.
10
+ COPY ./requirements.txt /code/requirements.txt
11
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
 
12
 
13
+ # 4. Copy the rest of the application's code into the container
14
+ COPY . /code/
15
 
16
+ # 5. Expose the port that the app will run on
17
+ # Hugging Face Spaces expect the app to listen on port 7860
18
+ EXPOSE 7860
19
 
20
+ # 6. Define the command to run your app
21
+ # This command starts the Uvicorn server which serves your FastAPI app.
22
+ # --host 0.0.0.0 makes the server accessible from outside the container.
23
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
 
api/main.py ADDED
@@ -0,0 +1 @@
 
 
"""Minimal API sub-application; app.py mounts this under /api.

The previous placeholder only printed "hi" and defined no ``api_app``,
so ``from api.main import api_app`` in app.py failed at startup.
"""
from fastapi import FastAPI

api_app = FastAPI(title="Technologies API")


@api_app.get("/")
def root():
    # Simple liveness/landing endpoint for the mounted API.
    return {"message": "hi"}


print("hi")  # kept from the original module so import-time side effects are unchanged
app.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Top-level ASGI app: serves the FastAPI API under /api and the
Streamlit UI (wrapped as WSGI) under /ui, with / redirecting to /ui."""
import os

import uvicorn
from fastapi import FastAPI
from starlette.middleware.wsgi import WSGIMiddleware
from starlette.responses import RedirectResponse

# streamlit_wsgi makes a Streamlit script runnable as a WSGI app so it can
# be mounted inside FastAPI. NOTE(review): confirm this package name exists
# on PyPI as pinned in requirements.txt.
from streamlit_wsgi import StreamlitPatcher

# api/main.py must expose `api_app` (a FastAPI/ASGI app). Fail fast with a
# clear message instead of an opaque ImportError at container startup.
try:
    from api.main import api_app
except ImportError as err:
    raise RuntimeError(
        "api.main does not expose `api_app`; define a FastAPI instance "
        "named api_app in api/main.py"
    ) from err

# Create and patch the Streamlit app from the UI script.
sp = StreamlitPatcher()
streamlit_script_path = os.path.join("streamlit_app", "app.py")
streamlit_app = sp.create_app(streamlit_script_path)
sp.patch()  # Apply patches for Streamlit to work correctly

# The main FastAPI app that will serve everything.
app = FastAPI()

# Mount the API and the Streamlit UI.
app.mount('/api', api_app)
app.mount('/ui', WSGIMiddleware(streamlit_app))


@app.get("/")
def root():
    """Redirect the bare root URL to the Streamlit UI."""
    return RedirectResponse(url="/ui")
requirements.txt CHANGED
@@ -1,3 +1,12 @@
1
- altair
2
  pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
1
  pandas
2
+ streamlit
3
+ numpy
4
+ sentence-transformers
5
+ fuzzywuzzy
6
+ fastapi
7
+ uvicorn[standard]
8
+ requests
9
+ streamlit-wsgi
10
+ huggingface_hub
11
+ datasets
12
+ pydantic
13
+ google-genai
src/streamlit_app.py DELETED
@@ -1,40 +0,0 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
4
- import streamlit as st
5
-
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
streamlit_app/app.py ADDED
@@ -0,0 +1,183 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Streamlit UI: search a technology dataset by combined semantic +
fuzzy similarity, optionally generating new entries via Gemini."""
import numpy as np
import pandas as pd
import streamlit as st
import sentence_transformers
from fuzzywuzzy import fuzz
from google.genai import Client, types
import json
from datasets import load_dataset, Dataset

from huggingface_hub import hf_hub_download

# Fetch the precomputed embeddings and the technology catalogue from the Hub.
pickle_path = hf_hub_download(repo_id="heymenn/Technologies", filename="embeddings.pkl", repo_type="dataset")
csv_path = hf_hub_download(repo_id="heymenn/Technologies", filename="technologies.csv", repo_type="dataset")

# Sentence-embedding model used for semantic similarity scoring.
model = sentence_transformers.SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# (The unused `embeddings = []` placeholder was removed — nothing read it.)

df_csv = pd.read_csv(csv_path)  # technology metadata — assumes a "name" column; TODO confirm
df_pickle = pd.read_pickle(pickle_path)  # precomputed vectors — assumes an "embbedings" column (sic); TODO confirm

# Markdown header reused as the default/empty results table.
markdown = ""
markdown += "| Score | Technology | Similarity Method |\n"
markdown += "|---------|-------------------------------------------------|---------------------------|\n"
def search_and_retrieve(user_input, embeddings_df, technologies_df):
    """Score every catalogued technology against *user_input* and publish results.

    Blends cosine similarity of sentence embeddings with a fuzzy token-set
    ratio, stores the best match and a top-5 markdown table in
    ``st.session_state``, and enables the "generate" button when the best
    combined score falls below 0.7.
    """
    user_embedding = model.encode(user_input)
    results = []
    best = {"score": 0.0, "technology": "", "type": ""}
    alpha = 0.6  # weight of the semantic score vs. the fuzzy score

    # NOTE: "embbedings" is how the column is actually spelled in the dataset.
    for i, embedding in enumerate(embeddings_df["embbedings"]):
        name = technologies_df.iloc[i]["name"]

        # model.similarity returns a 1x1 tensor; collapse it to a plain float
        # once, so no downstream code needs fragile [0][0] indexing.
        cosim = float(model.similarity(embedding, user_embedding))
        fuzzy_score = fuzz.token_set_ratio(user_input, name) / 100

        combined_score = alpha * cosim + (1 - alpha) * fuzzy_score
        result = {"score": combined_score, "technology": name, "type": "Cosim + Levenshtein"}
        if combined_score > best["score"]:
            best = result
        results.append(result)

    top_5 = sorted(results, key=lambda x: x["score"], reverse=True)[:5]

    table = ""
    table += "| Score | Technology | Similarity Method |\n"
    table += "|---------|-------------------------------------------------|---------------------------|\n"
    for item in top_5:
        table += f"| {item['score']:.4f} | {item['technology']} | {item['type']} |\n"

    if best["technology"]:
        st.session_state.best_result = (
            f"**{best['technology']}** have been found with a confidence score of **{best['score']:.4f}**"
        )
    else:
        # Guard: with an empty embeddings table the original crashed on
        # max_result['score'][0][0] (subscripting the int 0).
        st.session_state.best_result = "#### No matching technology could be scored"
    st.session_state.top_5_results = table

    # Offer generation only when no confident match was found.
    st.session_state.show_generate_button = best["score"] < 0.7
st.title("Search technologies from a dataset")

# Text box whose value is mirrored into st.session_state.tech_input.
tech = st.text_input("Technology title 👇", placeholder="e.g Virtual Private Network", key="tech_input")

# One-time initialisation of every session_state slot read below; Streamlit
# re-runs this whole script on each interaction, so every default is guarded.
if 'best_result' not in st.session_state:
    st.session_state.best_result = "#### 🙄 No search have been made yet"
if 'top_5_results' not in st.session_state:
    # Show a placeholder row; do not mutate the module-level `markdown`
    # header in place (the original `markdown += ...` did).
    st.session_state.top_5_results = markdown + "| N/A | N/A | N/A |\n"
if 'show_generate_button' not in st.session_state:
    st.session_state.show_generate_button = False
if 'generate_answer' not in st.session_state:
    st.session_state.generate_answer = False
if 'generate_text' not in st.session_state:
    st.session_state.generate_text = ""

# The search runs in the on_click callback so results land in session_state
# before the script re-runs and renders them.
st.button("Search 🔍", on_click=lambda: search_and_retrieve(st.session_state.tech_input, df_pickle, df_csv))

# Display results after the callback has stored them.
st.markdown(f"{st.session_state.best_result}")
st.markdown(f"{st.session_state.top_5_results}")

if st.session_state.show_generate_button:
    # .get(): the "instructions" widget is created *after* this button, so
    # the key may be absent the first time the callback fires.
    st.button(
        "Generate your technology",
        on_click=lambda: generate_tech(st.session_state.tech_input, st.session_state.get("instructions", "")),
    )
    instructions = st.text_input("Optional: add instructions to the generation", placeholder="Be more oriented towards the cybersecurity domain", key="instructions")
def generate_tech(user_input, user_instructions):
    """Ask Gemini to draft a structured technology entry for *user_input*.

    Stores the raw model answer in ``st.session_state.generate_text`` and
    sets ``generate_answer`` so the UI renders it on the next rerun.
    *user_instructions* is free-form extra guidance injected into the prompt.
    """
    import os

    prompt = f"""
# ROLE

You are a meticulous senior technical analyst and technology scout. Your task is to generate a technology into a structured JSON object.

# OBJECTIVE

Analyze the provided `<USER_INPUT>`. Identify what is technology discussed, focus on the highest level of the technology.
Create a complete JSON object according to the schema below.
Your final output must be a single, valid JSON document containing a technology you created.
The technology should be described with sentences.

# INSTRUCTIONS & RULES

1. **JSON List Output**: Your entire response MUST be a single JSON code block starting with a hyphen (`-`) to denote a list.
Do not include any explanatory text before or after the JSON.
2. **Discover and Iterate**: Your primary task is to understand the technology and create a JSON entry for it.
3. **Descriptive Sentences**: You MUST write clear, full sentences that describe the technology's abilities and the issues it resolves.
Do not use single keywords.
4. **Infer Where Necessary**: The source material may not contain all details. Infer plausible information based on the context.

# YAML SCHEMA & EXAMPLE

Your output must be a list of YAML objects matching this structure. Note how `functional_capabilities` and `problem_types_solved` contain full sentences.

{{"name": "Generative Watermarking"
"purpose": "Add an invisible, machine-readable tags to content generated by AI models and enables the tracing and authentication of digital media to its source."
"problem_types_solved": "Helps to combat digital misinformation by providing a method to verify content authenticity and addresses the erosion of trust in digital media caused by the proliferation of deepfakes."
"advantages": "Way faster to generate by an AI"
"limitations": "Takes a lot of computational time to generate"
"domain_tags": "Present in the domains of : AI ethics, cybersecurity, digital media, content moderation"
}}

Take into account those additionnal informations if there is any:
{user_instructions}
---
***NOW, BEGIN THE TASK.***

<USER_INPUT>
{user_input}
</USER_INPUT>
"""

    # SECURITY: the API key used to be hard-coded here, leaking it in the
    # repository history — that key must be revoked. Read it from the
    # environment (e.g. a Space secret) instead.
    api_key = os.environ.get("GEMINI_API_KEY")
    if not api_key:
        st.error("GEMINI_API_KEY is not set; cannot call the generation API.")
        return

    client = Client(api_key=api_key)

    # Ground the generation with Google Search results.
    grounding_tool = types.Tool(
        google_search=types.GoogleSearch()
    )

    # Configure generation settings.
    config = types.GenerateContentConfig(
        tools=[grounding_tool]
    )

    response = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=prompt,
        config=config,
    )

    if response.text:
        st.session_state.generate_answer = True
        st.session_state.generate_text = response.text
def send_to_dataset(data, df_csv, df_pickle):
    """Extract the generated JSON entry from *data*, embed it, and push it
    to the heymenn/Technologies dataset on the Hub.

    *df_csv* and *df_pickle* are accepted for interface compatibility with
    the button callback but are not used here.
    """
    # Slice out the JSON object exactly once (the original sliced twice).
    # rfind() for the closing brace so nested objects are not truncated at
    # the *first* '}'.
    start = data.find("{")
    end = data.rfind("}")
    if start == -1 or end <= start:
        st.error("No JSON object found in the generated answer.")
        return
    json_data = json.loads(data[start:end + 1].replace('\n', ''))

    # .tolist(): a raw numpy array is not serialisable by `datasets`/JSON.
    json_data["embeddings"] = model.encode(str(json_data)).tolist()

    dataset = load_dataset("heymenn/Technologies", split="train")
    updated_dataset = dataset.add_item(json_data)
    updated_dataset.push_to_hub("heymenn/Technologies")
# Render the generated proposal and let the user publish it to the dataset.
state = st.session_state
if state.generate_answer:
    st.markdown(state.generate_text)
    st.button("Send to dataset", on_click=lambda: send_to_dataset(state.generate_text, df_csv, df_pickle))