"""Streamlit app: semantic search over arXiv 2025 papers ("Expert Finder").

Downloads paper metadata (parquet) and a precomputed corpus embedding matrix
from the Hugging Face Hub, embeds a free-text query with a
sentence-transformers model, and shows the top-k most similar papers.
"""

import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import hf_hub_download
import os

st.set_page_config(page_title="ArXiv Expert Finder", page_icon="🔬", layout="wide")
st.title("ArXiv Expert Finder")


@st.cache_resource
def load_model():
    """Load the embedding model once per process (cached across reruns).

    Returns:
        SentenceTransformer: the query encoder.
    """
    # trust_remote_code: the Hub repo ships custom modeling code that must
    # be executed to instantiate this model.
    return SentenceTransformer("bflhc/MoD-Embedding", trust_remote_code=True)


@st.cache_data
def load_data():
    """Download and return the paper metadata and corpus embeddings.

    Returns:
        tuple[pd.DataFrame, np.ndarray]: metadata (expects 'authors',
        'title', 'abstract' columns) aligned row-for-row with the
        precomputed embedding matrix.
    """
    parquet_path = hf_hub_download(
        repo_id="jadenhoch/jina-embeddings-v4",
        filename="arxiv_2025_zstd.parquet",
        repo_type="space",
    )
    npy_path = hf_hub_download(
        repo_id="jadenhoch/MoD-Embedding",
        filename="corpus_embeddingsTensor_MOD.npy",
        repo_type="dataset",
    )
    return pd.read_parquet(parquet_path), np.load(npy_path)


model = load_model()
df, corpus_embeddings = load_data()

top_k = st.sidebar.slider("Number of results", 1, 20, 6)
query = st.text_area("🔍 Text eingeben:", height=200)

if st.button("Suchen") and query:
    query_emb = model.encode(query)
    # semantic_search returns one hit list per query; we issue a single query.
    results = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
    for rank, hit in enumerate(results, 1):
        idx = hit["corpus_id"]
        row = df.iloc[idx]  # hoisted: one positional lookup instead of three
        st.markdown(f"### {rank} | Similarity Score: {hit['score']:.4f} | Index: {idx}")
        st.write(f"**Autoren:** {row['authors']}")
        st.write(f"**Titel:** {row['title']}")
        with st.expander("Abstract"):
            st.write(row['abstract'])
        st.divider()