Spaces:
Running
Running
| import streamlit as st | |
| import pandas as pd | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import joblib | |
| import requests | |
| import io | |
| import re | |
| from nltk.stem import PorterStemmer | |
| import nltk | |
| nltk.download('stopwords') | |
@st.cache_resource
def load_data():
    """Download the TF-IDF artifacts and the paper dataset.

    Streamlit re-executes the whole script on every widget interaction, so
    the result is cached with ``st.cache_resource``: the downloads happen
    once per server process and later reruns reuse the same objects.
    Because the cached objects are shared between reruns and sessions,
    callers must treat them as read-only.

    Returns:
        A tuple ``(tfidf_matrix, tfidf_vectorizer, df)``: the two objects
        unpickled from ``tfidf_matrix.pkl`` and ``tfidf_vectorizer.pkl``,
        and the DataFrame parsed from ``clean_processed_dataset.csv``.

    Raises:
        requests.HTTPError: if either pickle download returns an error
            status.
        requests.Timeout: if the server stops responding.
    """
    # Seconds to wait for the connection / for each chunk of the response;
    # without a timeout a stalled server would hang the app indefinitely.
    timeout = 60

    matrix_res = requests.get(
        'https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_matrix.pkl',
        timeout=timeout,
    )
    vectorizer_res = requests.get(
        'https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_vectorizer.pkl',
        timeout=timeout,
    )
    # Fail with a clear HTTP error instead of handing an HTML error page to
    # the unpickler.
    matrix_res.raise_for_status()
    vectorizer_res.raise_for_status()

    # SECURITY NOTE(review): joblib.load unpickles the payload, which can
    # execute arbitrary code. This is only safe while the remote dataset
    # repository is trusted and under our control.
    tfidf_matrix = joblib.load(io.BytesIO(matrix_res.content))
    tfidf_vectorizer = joblib.load(io.BytesIO(vectorizer_res.content))

    df = pd.read_csv("https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/clean_processed_dataset.csv")
    return tfidf_matrix, tfidf_vectorizer, df
# Papers whose cosine similarity to the query is below this value are not
# shown at all.
MIN_SIMILARITY = 0.1


def _normalize_query(text):
    """Return *text* with non-letters removed, lowercased and Porter-stemmed.

    NOTE(review): presumably this mirrors the preprocessing applied to the
    corpus before the TF-IDF vectorizer was fitted -- confirm against the
    training pipeline.
    """
    # Every character that is not an ASCII letter or whitespace becomes a
    # space, so digits and punctuation act as word separators.
    letters_only = re.sub(r'[^a-zA-Z\s]', ' ', text).lower()
    stemmer = PorterStemmer()
    return " ".join(stemmer.stem(word) for word in letters_only.split())


tfidf_matrix, tfidf_vectorizer, df = load_data()

st.title("Arxiv Expert Finder")
st.sidebar.header("Query")

user_query = st.text_input("Suchtext eingeben", "")
# Remember whether the user typed anything, because normalization can
# reduce a non-empty input (e.g. only digits) to an empty string.
query_entered = bool(user_query)
if user_query:
    user_query = _normalize_query(user_query)

num_experts = st.sidebar.number_input("Anzahl Experten", min_value=1, max_value=10, value=5, step=1)

if user_query:
    similarities = cosine_similarity(tfidf_vectorizer.transform([user_query]), tfidf_matrix).flatten()
    scores = pd.Series(similarities, index=df.index)
    # Apply the threshold first so only the surviving candidates are sorted.
    top_results = (
        scores[scores >= MIN_SIMILARITY]
        .sort_values(ascending=False)
        .head(num_experts)
    )

    if top_results.empty:
        # Tell the user the search ran instead of leaving the page blank.
        st.info("Keine passenden Experten gefunden.")

    for rank, (idx, score) in enumerate(top_results.items(), 1):
        row = df.loc[idx]
        st.write(f"**Rank:** {rank} | **Similarity Score:** {score:.4f} | **Index:** {idx}")
        st.write(f"**Autoren:** {row['authors']}")
        st.write(f"**Titel:** {row['title']}")
        with st.expander("Abstract anzeigen"):
            st.write(row['abstract'])
        st.divider()
elif query_entered:
    # The input contained no ASCII letters, so nothing is left to search for.
    st.info("Die Suchanfrage enthält keine verwertbaren Wörter.")