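# Streamlit app: Arxiv Expert Finder.
# Loads a precomputed TF-IDF matrix and vectorizer from the Hugging Face Hub,
# cleans the user's query with the same preprocessing applied to the corpus,
# and ranks papers/authors by cosine similarity between query and abstracts.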
import streamlit as st
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import requests
import io
import re
from nltk.stem import PorterStemmer
import nltk
from functools import lru_cache
from nltk.corpus import stopwords
# Download NLTK stopwords (no-op if already present) and set up the preprocessing helpers.
nltk.download('stopwords')
stemmer = PorterStemmer()
stop = set(stopwords.words("english"))
re_words = re.compile(r"[a-z]+")  # keep lowercase alphabetic tokens only
@st.cache_resource
def load_data():
    # Fetch the precomputed TF-IDF artifacts and the cleaned dataset from the Hugging Face Hub.
    matrix_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_matrix.pkl')
    vectorizer_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_vectorizer.pkl')
    tfidf_matrix = joblib.load(io.BytesIO(matrix_res.content))
    tfidf_vectorizer = joblib.load(io.BytesIO(vectorizer_res.content))
    df = pd.read_csv("https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/clean_processed_dataset.csv")
    return tfidf_matrix, tfidf_vectorizer, df
tfidf_matrix, tfidf_vectorizer, df = load_data()
st.title("Arxiv Expert Finder")
st.sidebar.header("Query")
@lru_cache(maxsize=200_000)
def stem_cached(w: str) -> str:
    return stemmer.stem(w)
def text_reinigen_fast(text: str) -> str:
    # Lowercase, keep alphabetic tokens, drop stopwords, and stem,
    # mirroring the preprocessing used to build the TF-IDF matrix.
    if not isinstance(text, str) or not text:
        return ""
    words = re_words.findall(text.lower())
    return " ".join(stem_cached(w) for w in words if w not in stop)

user_query = st.text_input("Enter search text", "")
user_query = text_reinigen_fast(user_query)
num_experts = st.sidebar.number_input("Number of experts", min_value=1, max_value=10, value=5, step=1)
if user_query:
    # Cosine similarity between the query vector and every document in the TF-IDF matrix.
    similarities = cosine_similarity(tfidf_vectorizer.transform([user_query]), tfidf_matrix).flatten()
    # Keep only matches with similarity >= 0.1 and show the top-ranked results.
    top_results = pd.Series(similarities, index=df.index) \
        .sort_values(ascending=False) \
        .loc[lambda x: x >= 0.1] \
        .head(num_experts)
    for rank, (idx, score) in enumerate(top_results.items(), 1):
        row = df.loc[idx]
        st.write(f"**Rank:** {rank} | **Similarity Score:** {score:.4f} | **Index:** {idx}")
        st.write(f"**Authors:** {row['authors']}")
        st.write(f"**Title:** {row['title']}")
        with st.expander("Show abstract"):
            st.write(row['abstract'])
        st.divider()