import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import hf_hub_download

st.set_page_config(page_title="ArXiv Expert Finder", page_icon="🔬", layout="wide")
st.title("ArXiv Expert Finder")

@st.cache_resource
def load_model():
    # Cache the embedding model across Streamlit reruns so it is loaded only once.
    return SentenceTransformer("intfloat/multilingual-e5-large-instruct", trust_remote_code=True)

@st.cache_data
def load_data():
    # Download the paper metadata (authors, titles, abstracts) from the Hub.
    parquet_path = hf_hub_download(
        repo_id="jadenhoch/jina-embeddings-v4",
        filename="arxiv_2025_zstd.parquet",
        repo_type="space"
    )

    # Download the precomputed corpus embeddings that correspond row-for-row
    # to the metadata above.
    npy_path = hf_hub_download(
        repo_id="jadenhochh/multilingual-e5-large-instruct_2",
        filename="corpus_embeddings_multilingual-e5-large-instruct_2.npy",
        repo_type="dataset"
    )

    return pd.read_parquet(parquet_path), np.load(npy_path)

model = load_model()
df, corpus_embeddings = load_data()
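
# Optional sanity check (assumption: each embedding row corresponds to one
# metadata row; a mismatch usually means the parquet and .npy files come from
# different embedding runs).
assert len(df) == len(corpus_embeddings), "Metadata and embedding counts differ"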

top_k = st.sidebar.slider("Number of results", 1, 20, 6)
query = st.text_area("🔍 Enter text:", height=200)

if st.button("Search") and query:
    
    # Embed the query with the same model that produced the corpus embeddings.
    query_emb = model.encode(query, convert_to_tensor=True, normalize_embeddings=True)

    # Cosine-similarity search over the precomputed corpus embeddings.
    results = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]

    for rank, hit in enumerate(results, 1):
        idx = hit["corpus_id"]
        st.markdown(f"### {rank} | Similarity Score: {hit['score']:.4f} | Index: {idx}")
        st.write(f"**Authors:** {df.iloc[idx]['authors']}")
        st.write(f"**Title:** {df.iloc[idx]['title']}")
        with st.expander("Abstract"):
            st.write(df.iloc[idx]['abstract'])
        st.divider()
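
# To run this app locally (assuming the file is saved as app.py):
#   pip install streamlit pandas numpy sentence-transformers huggingface_hub
#   streamlit run app.py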