# MoD-Embedding / src/streamlit_app.py
# Uploaded by jadenhochh; commit 40d82a2 (verified): "Update src/streamlit_app.py"
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import hf_hub_download
import os
# Configure the browser tab (title, favicon) and the wide layout, then render
# the page heading.
st.set_page_config(
    page_title="ArXiv Expert Finder",
    # The icon literal was the mis-decoded UTF-8 byte sequence of the
    # microscope emoji; restored so the favicon is a valid single emoji.
    page_icon="🔬",
    layout="wide",
)
st.title("ArXiv Expert Finder")
@st.cache_resource
def load_model() -> SentenceTransformer:
    """Load the ``bflhc/MoD-Embedding`` sentence encoder.

    The function is wrapped in ``st.cache_resource``, so Streamlit caches the
    returned model object instead of constructing it again on every call.

    Returns:
        The ``SentenceTransformer`` instance built from the model id.
    """
    model_id = "bflhc/MoD-Embedding"
    # NOTE(review): trust_remote_code=True lets code shipped inside the model
    # repository execute locally; confirm the repository is trusted (and
    # consider pinning a revision) before deploying.
    encoder = SentenceTransformer(model_id, trust_remote_code=True)
    return encoder
@st.cache_data
def load_data() -> tuple[pd.DataFrame, np.ndarray]:
    """Fetch the paper table and the corpus embeddings from the Hugging Face Hub.

    Two files are downloaded via ``hf_hub_download`` (which returns a local
    file path): a Parquet file from a Space repository and a ``.npy`` file
    from a dataset repository. Both are then read into memory.

    Returns:
        A ``(papers, embeddings)`` pair: the DataFrame read from the Parquet
        file and the object returned by ``np.load`` for the ``.npy`` file.

    NOTE(review): presumably row ``i`` of the embeddings corresponds to row
    ``i`` of the DataFrame -- the two files come from different repositories,
    so confirm they are aligned.
    NOTE(review): ``st.cache_data`` hands out a copy of the cached value on
    each call; if the embedding matrix is large, ``st.cache_resource`` may be
    cheaper -- verify before changing.
    """
    metadata_file = hf_hub_download(
        repo_id="jadenhoch/jina-embeddings-v4",
        filename="arxiv_2025_zstd.parquet",
        repo_type="space",
    )
    embeddings_file = hf_hub_download(
        repo_id="jadenhoch/MoD-Embedding",
        filename="corpus_embeddingsTensor_MOD.npy",
        repo_type="dataset",
    )

    papers = pd.read_parquet(metadata_file)
    embeddings = np.load(embeddings_file)
    return papers, embeddings
# Obtain the encoder and the corpus (paper table + embedding matrix) from the
# loader functions defined above.
model = load_model()
df, corpus_embeddings = load_data()

# Sidebar control: how many hits to show (min 1, max 20, default 6).
top_k = st.sidebar.slider("Number of results", 1, 20, 6)
# The label's emoji was a mis-decoded byte sequence; restored to the
# magnifying-glass emoji.
query = st.text_area("🔍 Text eingeben:", height=200)

if st.button("Suchen"):
    # Treat whitespace-only input like empty input instead of embedding it,
    # and tell the user why nothing happened.
    if not query.strip():
        st.warning("Bitte zuerst einen Text eingeben.")
    else:
        # Encode the query exactly as typed, then rank the corpus against it.
        # semantic_search returns one result list per query; there is a
        # single query here, hence the [0].
        query_emb = model.encode(query)
        results = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]

        for rank, hit in enumerate(results, 1):
            idx = hit["corpus_id"]
            # Look the row up once rather than once per displayed field.
            # NOTE(review): assumes corpus_embeddings rows and df rows share
            # the same positional order -- confirm against the data files.
            row = df.iloc[idx]

            st.markdown(f"### {rank} | Similarity Score: {hit['score']:.4f} | Index: {idx}")
            st.write(f"**Autoren:** {row['authors']}")
            st.write(f"**Titel:** {row['title']}")
            with st.expander("Abstract"):
                st.write(row['abstract'])
            st.divider()