# ArXiv Expert Finder — Streamlit app (semantic search over precomputed arXiv embeddings)
import streamlit as st
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import hf_hub_download
import os

# Page config must be the first Streamlit call in the script.
# NOTE(review): page_icon "π¬" looks like a mojibake'd emoji (possibly a
# microscope glyph) from the original source — confirm and restore if so.
st.set_page_config(page_title="ArXiv Expert Finder", page_icon="π¬", layout="wide")
st.title("ArXiv Expert Finder")
@st.cache_resource
def load_model():
    """Load and cache the sentence-embedding model.

    Decorated with ``st.cache_resource`` so the model is instantiated once
    per server process; without it, Streamlit re-runs the whole script on
    every widget interaction and would reload the model each time.

    Returns:
        SentenceTransformer: the embedding model used for query encoding.
    """
    # trust_remote_code is required because the repo ships custom model code.
    return SentenceTransformer("bflhc/MoD-Embedding", trust_remote_code=True)
@st.cache_data
def load_data():
    """Download and cache the arXiv metadata table and corpus embeddings.

    Decorated with ``st.cache_data`` so the (potentially large) downloads
    and parsing happen once instead of on every Streamlit rerun.

    Returns:
        tuple[pd.DataFrame, np.ndarray]: paper metadata (authors/title/
        abstract columns are read downstream) and the embedding matrix.
        Assumes embedding row i corresponds to DataFrame row i — TODO
        confirm the two artifacts are aligned.
    """
    # Metadata parquet is hosted in a Space repo (note the repo_type).
    parquet_path = hf_hub_download(
        repo_id="jadenhoch/jina-embeddings-v4",
        filename="arxiv_2025_zstd.parquet",
        repo_type="space",
    )
    # Precomputed corpus embeddings live in a separate dataset repo.
    npy_path = hf_hub_download(
        repo_id="jadenhoch/MoD-Embedding",
        filename="corpus_embeddingsTensor_MOD.npy",
        repo_type="dataset",
    )
    return pd.read_parquet(parquet_path), np.load(npy_path)
# ---- Search UI: embed the query and rank the corpus by similarity ----
model = load_model()
df, corpus_embeddings = load_data()

top_k = st.sidebar.slider("Number of results", 1, 20, 6)
# NOTE(review): the "π" label prefix looks like a mojibake'd emoji (possibly
# a magnifying glass) — confirm against the original source before changing.
query = st.text_area("π Text eingeben:", height=200)

if st.button("Suchen") and query:
    query_emb = model.encode(query)
    # semantic_search returns one result list per query; we pass a single
    # query, so take element [0].
    results = util.semantic_search(query_emb, corpus_embeddings, top_k=top_k)[0]
    for rank, hit in enumerate(results, 1):
        idx = hit["corpus_id"]
        row = df.iloc[idx]  # single positional lookup instead of three
        st.markdown(f"### {rank} | Similarity Score: {hit['score']:.4f} | Index: {idx}")
        st.write(f"**Autoren:** {row['authors']}")
        st.write(f"**Titel:** {row['title']}")
        with st.expander("Abstract"):
            st.write(row['abstract'])
        st.divider()