Gabriele Tuccio committed on
Commit
62cdc78
·
1 Parent(s): 641d64f
Files changed (2) hide show
  1. app.py +83 -0
  2. requirements.txt +114 -0
app.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ import numpy as np
4
+ from supabase import create_client
5
+ from sentence_transformers import SentenceTransformer
6
+ import faiss
7
+ import gradio as gr
8
+
9
+ # Configura Supabase
10
+ SUPABASE_URL = os.getenv("SUPABASE_URL")
11
+ SUPABASE_KEY = os.getenv("SUPABASE_KEY")
12
+
13
+ EMBEDDING_MODEL = "all-MiniLM-L6-v2"
14
+
15
+ # Crea client per Supabase
16
+ supabase = create_client(SUPABASE_URL, SUPABASE_KEY)
17
+
18
+ # Carica il modello per gli embedding
19
+ model = SentenceTransformer(EMBEDDING_MODEL)
20
+
21
# Fetch the paper corpus from Supabase.
def load_database(supabase_client):
    """Fetch every row of the 'papers' table and return it as a DataFrame.

    Args:
        supabase_client: a Supabase client exposing .table(...).select(...).execute().

    Returns:
        pandas.DataFrame with one row per paper (columns as stored in Supabase).
    """
    rows = supabase_client.table("papers").select("*").execute().data
    return pd.DataFrame(rows)
30
+
31
# Build the vector-search index.
def create_faiss_index(embeddings):
    """Build a flat (exact) L2-distance FAISS index over the given vectors.

    Args:
        embeddings: sequence of equal-length numeric vectors.

    Returns:
        faiss.IndexFlatL2 populated with the vectors, ready for .search().
    """
    vectors = np.array(embeddings, dtype=np.float32)
    index = faiss.IndexFlatL2(vectors.shape[1])  # exact L2 search, no training needed
    index.add(vectors)
    return index
37
+
38
# Run a semantic search over the papers table.
def search_papers(query, supabase_client):
    """Return up to the 3 papers most similar to *query* as (title, url) pairs.

    Args:
        query: free-text search string.
        supabase_client: Supabase client passed through to load_database().

    Returns:
        list of (title, url) tuples, nearest first; at most 3 entries.

    NOTE(review): the full table is re-fetched and the FAISS index rebuilt on
    every call — consider caching both if the corpus is non-trivial.
    """
    df = load_database(supabase_client)
    index = create_faiss_index(df["embedding"].tolist())

    # Embed the query as a single-row float32 matrix, as FAISS expects.
    query_embedding = model.encode(query).reshape(1, -1).astype(np.float32)
    _, indices = index.search(query_embedding, 3)  # top-3 nearest neighbours

    # FAISS pads the result with -1 when the index holds fewer than k vectors;
    # df.iloc[-1] would silently return the LAST row, so drop invalid ids.
    valid = [int(i) for i in indices[0] if i >= 0]
    results = df.iloc[valid]
    return [(row["title"], row["url"]) for _, row in results.iterrows()]
50
+
51
+
52
# Build the HTML result list for the Gradio UI.
def gradio_interface(query):
    """Gradio callback: search for *query* and render the hits as an HTML list.

    Args:
        query: free-text search string from the textbox.

    Returns:
        str: an HTML <ul> of linked paper titles.
    """
    import html  # stdlib; local import to keep the top of the file unchanged

    results = search_papers(query, supabase)
    html_output = "<ul style='list-style-type: none; padding-left: 0;'>"

    for title, url in results:
        # Escape DB-supplied values so a stray quote or tag in a title/URL
        # cannot break the markup or inject script into the page.
        safe_url = html.escape(url, quote=True)
        safe_title = html.escape(title)
        html_output += f"""
        <li style="margin-bottom: 15px;">
            <a href='{safe_url}' target='_blank' style='text-decoration: none; color: #007bff; font-size: 18px; font-weight: bold;'>
                {safe_title}
            </a>
        </li>
        """

    html_output += "</ul>"

    return html_output
69
+
70
# Gradio interface definition.
# NOTE(review): live=True re-runs the search (full table fetch + FAISS index
# rebuild) on every keystroke — confirm this is acceptable for the corpus size.
iface = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Textbox(label="Inserisci la tua query di ricerca", placeholder="Es. 'Deep Learning for NLP'", lines=1),
    outputs=gr.HTML(label="Articoli correlati", elem_id="output-section"),
    live=True,
    title="Ricerca articoli arXiv",
    description="Inserisci una query per trovare articoli correlati.",
)


# Launch the UI only when executed as a script (not on import).
if __name__ == "__main__":
    iface.launch()
requirements.txt ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohappyeyeballs==2.4.8
3
+ aiohttp==3.11.13
4
+ aiosignal==1.3.2
5
+ altair==5.5.0
6
+ annotated-types==0.7.0
7
+ anyio==4.8.0
8
+ arxiv==2.1.3
9
+ async-timeout==5.0.1
10
+ attrs==25.1.0
11
+ blinker==1.9.0
12
+ cachetools==5.5.2
13
+ certifi==2025.1.31
14
+ charset-normalizer==3.4.1
15
+ click==8.1.8
16
+ deprecation==2.1.0
17
+ exceptiongroup==1.2.2
18
+ faiss-cpu==1.10.0
19
+ fastapi==0.115.11
20
+ feedparser==6.0.11
21
+ ffmpy==0.5.0
22
+ filelock==3.17.0
23
+ frozenlist==1.5.0
24
+ fsspec==2025.2.0
25
+ gitdb==4.0.12
26
+ GitPython==3.1.44
27
+ gotrue==2.11.4
28
+ gradio==5.20.0
29
+ gradio_client==1.7.2
30
+ groovy==0.1.2
31
+ h11==0.14.0
32
+ h2==4.2.0
33
+ hpack==4.1.0
34
+ httpcore==1.0.7
35
+ httpx==0.28.1
36
+ huggingface-hub==0.29.1
37
+ hyperframe==6.1.0
38
+ idna==3.10
39
+ Jinja2==3.1.5
40
+ joblib==1.4.2
41
+ jsonschema==4.23.0
42
+ jsonschema-specifications==2024.10.1
43
+ markdown-it-py==3.0.0
44
+ MarkupSafe==2.1.5
45
+ mdurl==0.1.2
46
+ mpmath==1.3.0
47
+ multidict==6.1.0
48
+ narwhals==1.29.0
49
+ networkx==3.4.2
50
+ numpy==2.2.3
51
+ orjson==3.10.15
52
+ packaging==24.2
53
+ pandas==2.2.3
54
+ pillow==11.1.0
55
+ postgrest==0.19.3
56
+ propcache==0.3.0
57
+ protobuf==5.29.3
58
+ psycopg2-binary==2.9.10
59
+ pyaes==1.6.1
60
+ pyarrow==19.0.1
61
+ pyasn1==0.6.1
62
+ pydantic==2.10.6
63
+ pydantic_core==2.27.2
64
+ pydeck==0.9.1
65
+ pydub==0.25.1
66
+ Pygments==2.19.1
67
+ PyMuPDF==1.25.3
68
+ python-dateutil==2.9.0.post0
69
+ python-multipart==0.0.20
70
+ pytz==2025.1
71
+ PyYAML==6.0.2
72
+ realtime==2.4.1
73
+ referencing==0.36.2
74
+ regex==2024.11.6
75
+ requests==2.32.3
76
+ rich==13.9.4
77
+ rpds-py==0.23.1
78
+ rsa==4.9
79
+ ruff==0.9.9
80
+ safehttpx==0.1.6
81
+ safetensors==0.5.3
82
+ scikit-learn==1.6.1
83
+ scipy==1.15.2
84
+ semantic-version==2.10.0
85
+ sentence-transformers==3.4.1
86
+ sgmllib3k==1.0.0
87
+ shellingham==1.5.4
88
+ six==1.17.0
89
+ smmap==5.0.2
90
+ sniffio==1.3.1
91
+ starlette==0.46.0
92
+ storage3==0.11.3
93
+ streamlit==1.42.2
94
+ StrEnum==0.4.15
95
+ supabase==2.13.0
96
+ supafunc==0.9.3
97
+ sympy==1.13.1
98
+ Telethon==1.39.0
99
+ tenacity==9.0.0
100
+ threadpoolctl==3.5.0
101
+ tokenizers==0.21.0
102
+ toml==0.10.2
103
+ tomlkit==0.13.2
104
+ torch==2.6.0
105
+ tornado==6.4.2
106
+ tqdm==4.67.1
107
+ transformers==4.49.0
108
+ typer==0.15.2
109
+ typing_extensions==4.12.2
110
+ tzdata==2025.1
111
+ urllib3==2.3.0
112
+ uvicorn==0.34.0
113
+ websockets==14.2
114
+ yarl==1.18.3