import io
import re

import joblib
import pandas as pd
import requests
import streamlit as st
from nltk.stem import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity


@st.cache_resource
def load_data():
    """Download the pre-computed TF-IDF artifacts and the processed dataset."""
    matrix_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_matrix.pkl')
    matrix_res.raise_for_status()
    vectorizer_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_vectorizer.pkl')
    vectorizer_res.raise_for_status()
    tfidf_matrix = joblib.load(io.BytesIO(matrix_res.content))
    tfidf_vectorizer = joblib.load(io.BytesIO(vectorizer_res.content))
    df = pd.read_csv("https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/clean_processed_dataset.csv")
    return tfidf_matrix, tfidf_vectorizer, df


tfidf_matrix, tfidf_vectorizer, df = load_data()

st.title("Arxiv Expert Finder")
st.sidebar.header("Query")

user_query = st.text_input("Enter search text", "")

if user_query:
    # Remove numbers and special characters, convert to lowercase
    user_query = re.sub(r'[^a-zA-Z\s]', ' ', user_query).lower()
    # Stem words so the query matches the stemmed corpus vocabulary
    stemmer = PorterStemmer()
    user_query = " ".join(stemmer.stem(word) for word in user_query.split())

num_experts = st.sidebar.number_input("Number of experts", min_value=1, max_value=10, value=5, step=1)

if user_query:
    # Rank documents by cosine similarity between the query vector and the corpus
    similarities = cosine_similarity(tfidf_vectorizer.transform([user_query]), tfidf_matrix).flatten()
    top_results = (
        pd.Series(similarities, index=df.index)
        .sort_values(ascending=False)
        .loc[lambda x: x >= 0.1]  # drop weak matches below the similarity threshold
        .head(num_experts)
    )
    for rank, (idx, score) in enumerate(top_results.items(), 1):
        row = df.loc[idx]
        st.write(f"**Rank:** {rank} | **Similarity Score:** {score:.4f} | **Index:** {idx}")
        st.write(f"**Authors:** {row['authors']}")
        st.write(f"**Title:** {row['title']}")
        with st.expander("Show abstract"):
            st.write(row['abstract'])
        st.divider()
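

# --- Offline artifact creation (reference sketch; never called by the app) ---
# A minimal sketch, assuming the TF-IDF artifacts hosted on Hugging Face were
# built with scikit-learn's TfidfVectorizer over the dataset's 'abstract'
# column, using the same regex cleaning and Porter stemming that the app
# applies to queries above. The function name, the local csv_path, the output
# filenames, and the 'abstract' column are assumptions that mirror what the
# app loads, not a confirmed record of how the files were actually produced.
def build_tfidf_artifacts(csv_path: str = "clean_processed_dataset.csv") -> None:
    from sklearn.feature_extraction.text import TfidfVectorizer

    corpus_df = pd.read_csv(csv_path)  # assumed local copy of the dataset
    stemmer = PorterStemmer()

    def preprocess(text: str) -> str:
        # Same normalization as the query path: strip non-letters, lowercase, stem
        text = re.sub(r'[^a-zA-Z\s]', ' ', str(text)).lower()
        return " ".join(stemmer.stem(word) for word in text.split())

    vectorizer = TfidfVectorizer()
    matrix = vectorizer.fit_transform(corpus_df['abstract'].map(preprocess))
    joblib.dump(matrix, "tfidf_matrix.pkl")
    joblib.dump(vectorizer, "tfidf_vectorizer.pkl")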