# Streamlit app: find Arxiv "experts" by TF-IDF cosine similarity to a free-text query.
import streamlit as st
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import requests 
import io
import re
from nltk.stem import PorterStemmer
import nltk
from functools import lru_cache
from nltk.corpus import stopwords

# One-time corpus fetch; no-op if 'stopwords' is already present locally.
nltk.download('stopwords')

# Shared text-normalization resources, built once at import time.
stemmer = PorterStemmer()
stop = set(stopwords.words("english"))  # English stopwords, as a set for O(1) membership tests
re_words = re.compile(r"[a-z]+")  # matches runs of lowercase letters; input is lowercased first

@st.cache_resource
def load_data():
    """Download and cache the TF-IDF artifacts and the paper metadata.

    Cached by Streamlit across reruns/sessions, so the network fetches
    happen only once per server process.

    Returns:
        tuple: (tfidf_matrix, tfidf_vectorizer, df) — the precomputed
        document TF-IDF matrix, the fitted vectorizer, and the paper
        metadata DataFrame (authors/title/abstract columns are used below).
    """
    # timeout prevents the app from hanging forever on a stalled connection;
    # raise_for_status ensures an HTTP error page is not silently unpickled.
    matrix_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_matrix.pkl', timeout=60)
    matrix_res.raise_for_status()
    vectorizer_res = requests.get('https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/tfidf_vectorizer.pkl', timeout=60)
    vectorizer_res.raise_for_status()

    # NOTE(review): joblib.load unpickles arbitrary objects — acceptable only
    # because the Hugging Face repo is our own trusted artifact store.
    tfidf_matrix = joblib.load(io.BytesIO(matrix_res.content))
    tfidf_vectorizer = joblib.load(io.BytesIO(vectorizer_res.content))
    df = pd.read_csv("https://huggingface.co/datasets/jadenhochh/TF_IDF/resolve/main/clean_processed_dataset.csv")

    return tfidf_matrix, tfidf_vectorizer, df

# Materialize the (cached) model artifacts once at the top of every rerun.
tfidf_matrix, tfidf_vectorizer, df = load_data()

# Page chrome.
st.title("Arxiv Expert Finder")
st.sidebar.header("Query")

@lru_cache(maxsize=200_000)
def stem_cached(w: str) -> str:
    """Return the Porter stem of *w*, memoized (stemming is the hot path)."""
    stemmed = stemmer.stem(w)
    return stemmed

def text_reinigen_fast(text: str) -> str:
    """Normalize *text* for TF-IDF lookup.

    Lowercases, extracts alphabetic tokens, drops English stopwords, and
    Porter-stems each remaining token. Non-strings and empty strings map
    to the empty string.
    """
    if not (isinstance(text, str) and text):
        return ""
    stems = []
    for token in re_words.findall(text.lower()):
        if token in stop:
            continue
        stems.append(stem_cached(token))
    return " ".join(stems)

# Raw query from the main page; normalized in place so it matches the
# preprocessing the TF-IDF corpus was built with.
user_query = st.text_input("Suchtext eingeben", "")

user_query = text_reinigen_fast(user_query)

# How many top matches to show (sidebar control, 1-10).
num_experts = st.sidebar.number_input("Anzahl Experten", min_value=1, max_value=10, value=5, step=1)

if user_query:
    # Cosine similarity between the query vector and every document row.
    similarities = cosine_similarity(tfidf_vectorizer.transform([user_query]), tfidf_matrix).flatten()

    # Rank by similarity, drop weak matches (< 0.1 relevance floor),
    # keep the requested number of results.
    top_results = pd.Series(similarities, index=df.index) \
        .sort_values(ascending=False) \
        .loc[lambda x: x >= 0.1] \
        .head(num_experts)

    # Render one card per match; assumes df has 'authors', 'title',
    # 'abstract' columns (read directly below).
    for rank, (idx, score) in enumerate(top_results.items(), 1):
        row = df.loc[idx]
        st.write(f"**Rank:** {rank} | **Similarity Score:** {score:.4f} | **Index:** {idx}")
        st.write(f"**Autoren:** {row['authors']}")
        st.write(f"**Titel:** {row['title']}")
        with st.expander("Abstract anzeigen"):
            st.write(row['abstract'])
        st.divider()