Spaces:
Runtime error
Runtime error
v1.1.0
Browse files
- app.py +2 -3
- lrt/clustering/clustering_pipeline.py +16 -13
- lrt/clustering/config.py +1 -1
- lrt/lrt.py +10 -13
- lrt/utils/dimension_reduction.py +17 -0
- lrt/utils/functions.py +5 -1
- lrt_instance/instances.py +2 -1
- scripts/tests/lrt_test_run.py +1 -1
- setup.py +1 -1
- widgets/body.py +16 -3
- widgets/sidebar.py +28 -8
app.py
CHANGED
|
@@ -1,12 +1,11 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from widgets import *
|
| 3 |
-
from lrt_instance import *
|
| 4 |
|
| 5 |
|
| 6 |
# [](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool)
|
| 7 |
|
| 8 |
# sidebar content
|
| 9 |
-
platforms, number_papers,start_year,end_year,
|
| 10 |
|
| 11 |
# body head
|
| 12 |
with st.form("my_form",clear_on_submit=False):
|
|
@@ -26,7 +25,7 @@ with st.form("my_form",clear_on_submit=False):
|
|
| 26 |
|
| 27 |
if submitted:
|
| 28 |
# body
|
| 29 |
-
render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year,
|
| 30 |
# '''
|
| 31 |
# bar = (
|
| 32 |
# Bar()
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from widgets import *
|
|
|
|
| 3 |
|
| 4 |
|
| 5 |
# [](https://github.com/Mondkuchen/idp_LiteratureResearch_Tool)
|
| 6 |
|
| 7 |
# sidebar content
|
| 8 |
+
platforms, number_papers,start_year,end_year, clustering_params = render_sidebar()
|
| 9 |
|
| 10 |
# body head
|
| 11 |
with st.form("my_form",clear_on_submit=False):
|
|
|
|
| 25 |
|
| 26 |
if submitted:
|
| 27 |
# body
|
| 28 |
+
render_body(platforms, number_papers, 5, query_input, show_preview,start_year,end_year, clustering_params)
|
| 29 |
# '''
|
| 30 |
# bar = (
|
| 31 |
# Bar()
|
lrt/clustering/clustering_pipeline.py
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
from typing import List
|
| 2 |
from .config import BaselineConfig, Configuration
|
| 3 |
from ..utils import __create_model__
|
| 4 |
-
|
| 5 |
from sklearn.cluster import KMeans
|
| 6 |
-
|
| 7 |
from .clusters import ClusterList
|
| 8 |
|
| 9 |
class ClusterPipeline:
|
|
@@ -15,7 +15,7 @@ class ClusterPipeline:
|
|
| 15 |
|
| 16 |
def __setup__(self, config:Configuration):
|
| 17 |
self.PTM = __create_model__(config.plm)
|
| 18 |
-
self.dimension_reduction = __create_model__(config.dimension_reduction)
|
| 19 |
self.clustering = __create_model__(config.clustering)
|
| 20 |
self.keywords_extraction = __create_model__(config.keywords_extraction)
|
| 21 |
|
|
@@ -38,9 +38,11 @@ class ClusterPipeline:
|
|
| 38 |
if self.dimension_reduction is None:
|
| 39 |
return embeddings
|
| 40 |
print(f'>>> start dimension reduction...')
|
|
|
|
| 41 |
print(f'>>> finished dimension reduction...')
|
|
|
|
| 42 |
|
| 43 |
-
def __3_clustering__(self, embeddings, return_cluster_centers = False,
|
| 44 |
'''
|
| 45 |
|
| 46 |
:param embeddings: Nxd
|
|
@@ -51,13 +53,14 @@ class ClusterPipeline:
|
|
| 51 |
else:
|
| 52 |
print(f'>>> start clustering...')
|
| 53 |
model = KMeans()
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
#
|
| 60 |
-
|
|
|
|
| 61 |
|
| 62 |
labels, cluster_centers = self.clustering(embeddings, k=best_k)
|
| 63 |
clusters = ClusterList(best_k)
|
|
@@ -90,11 +93,11 @@ class ClusterPipeline:
|
|
| 90 |
return clusters
|
| 91 |
|
| 92 |
|
| 93 |
-
def __call__(self, documents: List[str],
|
| 94 |
print(f'>>> pipeline starts...')
|
| 95 |
x = self.__1_generate_word_embeddings__(documents)
|
| 96 |
x = self.__2_dimenstion_reduction__(x)
|
| 97 |
-
clusters = self.__3_clustering__(x,
|
| 98 |
outputs = self.__4_keywords_extraction__(clusters, documents)
|
| 99 |
print(f'>>> pipeline finished!\n')
|
| 100 |
return outputs
|
|
|
|
| 1 |
from typing import List
|
| 2 |
from .config import BaselineConfig, Configuration
|
| 3 |
from ..utils import __create_model__
|
| 4 |
+
import numpy as np
|
| 5 |
from sklearn.cluster import KMeans
|
| 6 |
+
from yellowbrick.cluster import KElbowVisualizer
|
| 7 |
from .clusters import ClusterList
|
| 8 |
|
| 9 |
class ClusterPipeline:
|
|
|
|
| 15 |
|
| 16 |
def __setup__(self, config:Configuration):
|
| 17 |
self.PTM = __create_model__(config.plm)
|
| 18 |
+
self.dimension_reduction = __create_model__(config.dimension_reduction)
|
| 19 |
self.clustering = __create_model__(config.clustering)
|
| 20 |
self.keywords_extraction = __create_model__(config.keywords_extraction)
|
| 21 |
|
|
|
|
| 38 |
if self.dimension_reduction is None:
|
| 39 |
return embeddings
|
| 40 |
print(f'>>> start dimension reduction...')
|
| 41 |
+
embeddings = self.dimension_reduction.dimension_reduction(embeddings)
|
| 42 |
print(f'>>> finished dimension reduction...')
|
| 43 |
+
return embeddings
|
| 44 |
|
| 45 |
+
def __3_clustering__(self, embeddings, return_cluster_centers = False, max_k: int =10):
|
| 46 |
'''
|
| 47 |
|
| 48 |
:param embeddings: Nxd
|
|
|
|
| 53 |
else:
|
| 54 |
print(f'>>> start clustering...')
|
| 55 |
model = KMeans()
|
| 56 |
+
visualizer = KElbowVisualizer(
|
| 57 |
+
model, k=(2, max_k+1), metric='silhouette', timings=False, locate_elbow=False
|
| 58 |
+
)
|
| 59 |
+
|
| 60 |
+
visualizer.fit(embeddings)
|
| 61 |
+
# visualizer.show()
|
| 62 |
+
best_k = visualizer.k_values_[np.argmax(np.array(visualizer.k_scores_))]
|
| 63 |
+
print(f'>>> The best K is {best_k}.')
|
| 64 |
|
| 65 |
labels, cluster_centers = self.clustering(embeddings, k=best_k)
|
| 66 |
clusters = ClusterList(best_k)
|
|
|
|
| 93 |
return clusters
|
| 94 |
|
| 95 |
|
| 96 |
+
def __call__(self, documents: List[str], max_k:int):
|
| 97 |
print(f'>>> pipeline starts...')
|
| 98 |
x = self.__1_generate_word_embeddings__(documents)
|
| 99 |
x = self.__2_dimenstion_reduction__(x)
|
| 100 |
+
clusters = self.__3_clustering__(x,max_k=max_k)
|
| 101 |
outputs = self.__4_keywords_extraction__(clusters, documents)
|
| 102 |
print(f'>>> pipeline finished!\n')
|
| 103 |
return outputs
|
lrt/clustering/config.py
CHANGED
|
@@ -8,4 +8,4 @@ class Configuration:
|
|
| 8 |
|
| 9 |
class BaselineConfig(Configuration):
|
| 10 |
def __init__(self):
|
| 11 |
-
super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
|
|
|
|
| 8 |
|
| 9 |
class BaselineConfig(Configuration):
|
| 10 |
def __init__(self):
|
| 11 |
+
super().__init__('''all-mpnet-base-v2''', 'none', 'kmeans-euclidean', 'keyphrase-transformer')
|
lrt/lrt.py
CHANGED
|
@@ -46,8 +46,8 @@ class LiteratureResearchTool:
|
|
| 46 |
num_papers: int,
|
| 47 |
start_year: int,
|
| 48 |
end_year: int,
|
|
|
|
| 49 |
platforms: List[str] = ['IEEE', 'Arxiv', 'Paper with Code'],
|
| 50 |
-
best_k: int = 5,
|
| 51 |
loading_ctx_manager = None,
|
| 52 |
):
|
| 53 |
|
|
@@ -55,9 +55,9 @@ class LiteratureResearchTool:
|
|
| 55 |
for platform in platforms:
|
| 56 |
if loading_ctx_manager:
|
| 57 |
with loading_ctx_manager():
|
| 58 |
-
clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,
|
| 59 |
else:
|
| 60 |
-
clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,
|
| 61 |
|
| 62 |
clusters.sort()
|
| 63 |
yield clusters,articles
|
|
@@ -69,7 +69,7 @@ class LiteratureResearchTool:
|
|
| 69 |
num_papers: int,
|
| 70 |
start_year: int,
|
| 71 |
end_year: int,
|
| 72 |
-
|
| 73 |
) -> (ClusterList,ArticleList):
|
| 74 |
|
| 75 |
@st.cache(hash_funcs={Tokenizer: Tokenizer.__hash__},allow_output_mutation=True)
|
|
@@ -78,12 +78,11 @@ class LiteratureResearchTool:
|
|
| 78 |
num_papers: int,
|
| 79 |
start_year: int,
|
| 80 |
end_year: int,
|
| 81 |
-
best_k: int = 5
|
| 82 |
):
|
| 83 |
articles = ArticleList.parse_ieee_articles(
|
| 84 |
self.literature_search.ieee(query, start_year, end_year, num_papers)) # ArticleList
|
| 85 |
abstracts = articles.getAbstracts() # List[str]
|
| 86 |
-
clusters = self.cluster_pipeline(abstracts,
|
| 87 |
clusters = self.__postprocess_clusters__(clusters)
|
| 88 |
return clusters, articles
|
| 89 |
|
|
@@ -91,12 +90,11 @@ class LiteratureResearchTool:
|
|
| 91 |
def arxiv_process(
|
| 92 |
query: str,
|
| 93 |
num_papers: int,
|
| 94 |
-
best_k: int = 5
|
| 95 |
):
|
| 96 |
articles = ArticleList.parse_arxiv_articles(
|
| 97 |
self.literature_search.arxiv(query, num_papers)) # ArticleList
|
| 98 |
abstracts = articles.getAbstracts() # List[str]
|
| 99 |
-
clusters = self.cluster_pipeline(abstracts,
|
| 100 |
clusters = self.__postprocess_clusters__(clusters)
|
| 101 |
return clusters, articles
|
| 102 |
|
|
@@ -104,21 +102,20 @@ class LiteratureResearchTool:
|
|
| 104 |
def pwc_process(
|
| 105 |
query: str,
|
| 106 |
num_papers: int,
|
| 107 |
-
best_k: int = 5
|
| 108 |
):
|
| 109 |
articles = ArticleList.parse_pwc_articles(
|
| 110 |
self.literature_search.paper_with_code(query, num_papers)) # ArticleList
|
| 111 |
abstracts = articles.getAbstracts() # List[str]
|
| 112 |
-
clusters = self.cluster_pipeline(abstracts,
|
| 113 |
clusters = self.__postprocess_clusters__(clusters)
|
| 114 |
return clusters, articles
|
| 115 |
|
| 116 |
if platforn_name == 'IEEE':
|
| 117 |
-
return ieee_process(query,num_papers,start_year,end_year
|
| 118 |
elif platforn_name == 'Arxiv':
|
| 119 |
-
return arxiv_process(query,num_papers
|
| 120 |
elif platforn_name == 'Paper with Code':
|
| 121 |
-
return pwc_process(query,num_papers
|
| 122 |
else:
|
| 123 |
raise RuntimeError('This platform is not supported. Please open an issue on the GitHub.')
|
| 124 |
|
|
|
|
| 46 |
num_papers: int,
|
| 47 |
start_year: int,
|
| 48 |
end_year: int,
|
| 49 |
+
max_k: int,
|
| 50 |
platforms: List[str] = ['IEEE', 'Arxiv', 'Paper with Code'],
|
|
|
|
| 51 |
loading_ctx_manager = None,
|
| 52 |
):
|
| 53 |
|
|
|
|
| 55 |
for platform in platforms:
|
| 56 |
if loading_ctx_manager:
|
| 57 |
with loading_ctx_manager():
|
| 58 |
+
clusters, articles = self.__platformPipeline__(platform,query,num_papers,start_year,end_year,max_k)
|
| 59 |
else:
|
| 60 |
+
clusters, articles = self.__platformPipeline__(platform, query, num_papers, start_year, end_year,max_k)
|
| 61 |
|
| 62 |
clusters.sort()
|
| 63 |
yield clusters,articles
|
|
|
|
| 69 |
num_papers: int,
|
| 70 |
start_year: int,
|
| 71 |
end_year: int,
|
| 72 |
+
max_k: int
|
| 73 |
) -> (ClusterList,ArticleList):
|
| 74 |
|
| 75 |
@st.cache(hash_funcs={Tokenizer: Tokenizer.__hash__},allow_output_mutation=True)
|
|
|
|
| 78 |
num_papers: int,
|
| 79 |
start_year: int,
|
| 80 |
end_year: int,
|
|
|
|
| 81 |
):
|
| 82 |
articles = ArticleList.parse_ieee_articles(
|
| 83 |
self.literature_search.ieee(query, start_year, end_year, num_papers)) # ArticleList
|
| 84 |
abstracts = articles.getAbstracts() # List[str]
|
| 85 |
+
clusters = self.cluster_pipeline(abstracts,max_k)
|
| 86 |
clusters = self.__postprocess_clusters__(clusters)
|
| 87 |
return clusters, articles
|
| 88 |
|
|
|
|
| 90 |
def arxiv_process(
|
| 91 |
query: str,
|
| 92 |
num_papers: int,
|
|
|
|
| 93 |
):
|
| 94 |
articles = ArticleList.parse_arxiv_articles(
|
| 95 |
self.literature_search.arxiv(query, num_papers)) # ArticleList
|
| 96 |
abstracts = articles.getAbstracts() # List[str]
|
| 97 |
+
clusters = self.cluster_pipeline(abstracts,max_k)
|
| 98 |
clusters = self.__postprocess_clusters__(clusters)
|
| 99 |
return clusters, articles
|
| 100 |
|
|
|
|
| 102 |
def pwc_process(
|
| 103 |
query: str,
|
| 104 |
num_papers: int,
|
|
|
|
| 105 |
):
|
| 106 |
articles = ArticleList.parse_pwc_articles(
|
| 107 |
self.literature_search.paper_with_code(query, num_papers)) # ArticleList
|
| 108 |
abstracts = articles.getAbstracts() # List[str]
|
| 109 |
+
clusters = self.cluster_pipeline(abstracts,max_k)
|
| 110 |
clusters = self.__postprocess_clusters__(clusters)
|
| 111 |
return clusters, articles
|
| 112 |
|
| 113 |
if platforn_name == 'IEEE':
|
| 114 |
+
return ieee_process(query,num_papers,start_year,end_year)
|
| 115 |
elif platforn_name == 'Arxiv':
|
| 116 |
+
return arxiv_process(query,num_papers)
|
| 117 |
elif platforn_name == 'Paper with Code':
|
| 118 |
+
return pwc_process(query,num_papers)
|
| 119 |
else:
|
| 120 |
raise RuntimeError('This platform is not supported. Please open an issue on the GitHub.')
|
| 121 |
|
lrt/utils/dimension_reduction.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.decomposition import PCA as pca
|
| 2 |
+
|
| 3 |
+
|
| 4 |
+
class BaseDimensionReduction:
    """Interface for dimension-reduction back-ends.

    Concrete back-ends subclass this and override
    :meth:`dimension_reduction`.
    """

    def dimension_reduction(self, X):
        """Return a reduced representation of ``X``.

        :param X: the data to reduce; the accepted layout is defined by the
            concrete subclass.
        :raises NotImplementedError: always -- the base class provides no
            implementation.
        """
        raise NotImplementedError()
|
| 7 |
+
|
| 8 |
+
class PCA(BaseDimensionReduction):
    """Dimension reduction that delegates to the estimator imported as ``pca``."""

    def __init__(self, n_components: float = 0.8, *args, **kwargs) -> None:
        """Create the wrapped estimator.

        :param n_components: forwarded as the first positional argument of
            ``pca``. The default ``0.8`` is a float, so the annotation is
            ``float`` (which also admits ints for type checkers).
            NOTE(review): scikit-learn treats a float in (0, 1) as a target
            explained-variance ratio -- confirm that is the intent here.
        :param args: extra positional arguments forwarded to ``pca``.
        :param kwargs: extra keyword arguments forwarded to ``pca``.
        """
        super().__init__()
        self.pca = pca(n_components, *args, **kwargs)

    def dimension_reduction(self, X):
        """Fit the wrapped estimator on ``X`` and return ``X`` transformed.

        The estimator is re-fitted on every call, and the fitted
        ``n_components_`` is printed before the transform is applied.

        :param X: data passed to both ``fit`` and ``transform``.
        :return: whatever the wrapped estimator's ``transform`` returns.
        """
        self.pca.fit(X=X)
        print(f'>>> The reduced dimension is {self.pca.n_components_}.')
        return self.pca.transform(X)
|
lrt/utils/functions.py
CHANGED
|
@@ -5,6 +5,7 @@ import torch
|
|
| 5 |
from sklearn.cluster import KMeans
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
|
| 7 |
from inference_hf import InferenceHF
|
|
|
|
| 8 |
|
| 9 |
class Template:
|
| 10 |
def __init__(self):
|
|
@@ -14,7 +15,7 @@ class Template:
|
|
| 14 |
'all-mpnet-base-v2':'''sentence-transformers/all-mpnet-base-v2'''
|
| 15 |
}
|
| 16 |
self.dimension_reduction = {
|
| 17 |
-
'pca':
|
| 18 |
'vae': None,
|
| 19 |
'cnn': None
|
| 20 |
}
|
|
@@ -55,6 +56,9 @@ def __create_model__(model_ckpt):
|
|
| 55 |
)
|
| 56 |
return tmp[0].cpu().detach().numpy(), tmp[1].cpu().detach().numpy()
|
| 57 |
return ret
|
|
|
|
|
|
|
|
|
|
| 58 |
|
| 59 |
elif model_ckpt =='kmeans-euclidean':
|
| 60 |
def ret(x,k):
|
|
|
|
| 5 |
from sklearn.cluster import KMeans
|
| 6 |
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM,Text2TextGenerationPipeline
|
| 7 |
from inference_hf import InferenceHF
|
| 8 |
+
from .dimension_reduction import PCA
|
| 9 |
|
| 10 |
class Template:
|
| 11 |
def __init__(self):
|
|
|
|
| 15 |
'all-mpnet-base-v2':'''sentence-transformers/all-mpnet-base-v2'''
|
| 16 |
}
|
| 17 |
self.dimension_reduction = {
|
| 18 |
+
'pca': PCA,
|
| 19 |
'vae': None,
|
| 20 |
'cnn': None
|
| 21 |
}
|
|
|
|
| 56 |
)
|
| 57 |
return tmp[0].cpu().detach().numpy(), tmp[1].cpu().detach().numpy()
|
| 58 |
return ret
|
| 59 |
+
elif model_ckpt == 'pca':
|
| 60 |
+
pca = template.dimension_reduction[model_ckpt](0.8)
|
| 61 |
+
return pca
|
| 62 |
|
| 63 |
elif model_ckpt =='kmeans-euclidean':
|
| 64 |
def ret(x,k):
|
lrt_instance/instances.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
| 1 |
from lrt import LiteratureResearchTool
|
|
|
|
| 2 |
|
| 3 |
-
baseline_lrt = LiteratureResearchTool()
|
|
|
|
| 1 |
from lrt import LiteratureResearchTool
|
| 2 |
+
from lrt.clustering.config import *
|
| 3 |
|
| 4 |
+
baseline_lrt = LiteratureResearchTool()
|
scripts/tests/lrt_test_run.py
CHANGED
|
@@ -10,7 +10,7 @@ if __name__ == '__main__':
|
|
| 10 |
from lrt.utils import ArticleList
|
| 11 |
config = Configuration(
|
| 12 |
plm= 'all-mpnet-base-v2',
|
| 13 |
-
dimension_reduction='
|
| 14 |
clustering='kmeans-euclidean',
|
| 15 |
# keywords_extraction='KeyBartAdapter'
|
| 16 |
keywords_extraction= 'keyphrase-transformer'
|
|
|
|
| 10 |
from lrt.utils import ArticleList
|
| 11 |
config = Configuration(
|
| 12 |
plm= 'all-mpnet-base-v2',
|
| 13 |
+
dimension_reduction='pca',
|
| 14 |
clustering='kmeans-euclidean',
|
| 15 |
# keywords_extraction='KeyBartAdapter'
|
| 16 |
keywords_extraction= 'keyphrase-transformer'
|
setup.py
CHANGED
|
@@ -21,7 +21,7 @@ requirements = [
|
|
| 21 |
|
| 22 |
setup(
|
| 23 |
name="LiteratureResearchTool",
|
| 24 |
-
version="1.
|
| 25 |
author="Tao Xiang",
|
| 26 |
author_email="tao.xiang@tum.de",
|
| 27 |
description="A tool for literature research and analysis",
|
|
|
|
| 21 |
|
| 22 |
setup(
|
| 23 |
name="LiteratureResearchTool",
|
| 24 |
+
version="1.1.0",
|
| 25 |
author="Tao Xiang",
|
| 26 |
author_email="tao.xiang@tum.de",
|
| 27 |
description="A tool for literature research and analysis",
|
widgets/body.py
CHANGED
|
@@ -1,7 +1,8 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
from api_ import ArxivQuery, IEEEQuery, PaperWithCodeQuery
|
| 3 |
from lrt.clustering.clusters import SingleCluster
|
| 4 |
-
from lrt import
|
|
|
|
| 5 |
from lrt_instance import *
|
| 6 |
# from pyecharts.charts import Bar
|
| 7 |
# from pyecharts import options as opts
|
|
@@ -54,7 +55,7 @@ We have found following papers for you! (displaying 5 papers for each literature
|
|
| 54 |
|
| 55 |
paperInGeneral.markdown(paperInGeneral_md)
|
| 56 |
|
| 57 |
-
def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year,
|
| 58 |
|
| 59 |
tmp = st.empty()
|
| 60 |
if query_input != '':
|
|
@@ -66,7 +67,19 @@ def render_body(platforms, num_papers, num_papers_preview, query_input, show_pre
|
|
| 66 |
|
| 67 |
|
| 68 |
# lrt results
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
for i,plat in enumerate(platforms):
|
| 71 |
clusters, articles = next(generator)
|
| 72 |
st.markdown(f'''# {i+1} {plat} Results''')
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
from api_ import ArxivQuery, IEEEQuery, PaperWithCodeQuery
|
| 3 |
from lrt.clustering.clusters import SingleCluster
|
| 4 |
+
from lrt.clustering.config import Configuration
|
| 5 |
+
from lrt import ArticleList, LiteratureResearchTool
|
| 6 |
from lrt_instance import *
|
| 7 |
# from pyecharts.charts import Bar
|
| 8 |
# from pyecharts import options as opts
|
|
|
|
| 55 |
|
| 56 |
paperInGeneral.markdown(paperInGeneral_md)
|
| 57 |
|
| 58 |
+
def render_body(platforms, num_papers, num_papers_preview, query_input, show_preview:bool,start_year,end_year, clustering_params: dict):
|
| 59 |
|
| 60 |
tmp = st.empty()
|
| 61 |
if query_input != '':
|
|
|
|
| 67 |
|
| 68 |
|
| 69 |
# lrt results
|
| 70 |
+
## baseline
|
| 71 |
+
if clustering_params['dimension_reduction'] == 'none':
|
| 72 |
+
model = baseline_lrt
|
| 73 |
+
else:
|
| 74 |
+
config = Configuration(
|
| 75 |
+
plm= '''all-mpnet-base-v2''',
|
| 76 |
+
dimension_reduction= clustering_params['dimension_reduction'],
|
| 77 |
+
clustering= 'kmeans-euclidean',
|
| 78 |
+
keywords_extraction='keyphrase-transformer'
|
| 79 |
+
)
|
| 80 |
+
model = LiteratureResearchTool(config)
|
| 81 |
+
|
| 82 |
+
generator = model(query_input,num_papers,start_year,end_year,max_k=clustering_params['max_k'],platforms=platforms)
|
| 83 |
for i,plat in enumerate(platforms):
|
| 84 |
clusters, articles = next(generator)
|
| 85 |
st.markdown(f'''# {i+1} {plat} Results''')
|
widgets/sidebar.py
CHANGED
|
@@ -3,6 +3,12 @@ import datetime
|
|
| 3 |
# from .utils import PACKAGE_ROOT
|
| 4 |
|
| 5 |
def render_sidebar():
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
sidebar_markdown = f'''
|
| 7 |
|
| 8 |
<center>
|
|
@@ -14,16 +20,14 @@ def render_sidebar():
|
|
| 14 |
|
| 15 |
|
| 16 |
<code>
|
| 17 |
-
v1.
|
| 18 |
</code>
|
| 19 |
|
| 20 |
|
| 21 |
</center>
|
| 22 |
|
| 23 |
|
| 24 |
-
|
| 25 |
-
<a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
|
| 26 |
-
</center>
|
| 27 |
|
| 28 |
---
|
| 29 |
|
|
@@ -50,7 +54,7 @@ def render_sidebar():
|
|
| 50 |
|
| 51 |
|
| 52 |
st.sidebar.markdown('## Choose the max number of papers to search')
|
| 53 |
-
number_papers=st.sidebar.slider('number',
|
| 54 |
|
| 55 |
st.sidebar.markdown('## Choose the start year of publication')
|
| 56 |
this_year = datetime.date.today().year
|
|
@@ -59,7 +63,23 @@ def render_sidebar():
|
|
| 59 |
st.sidebar.markdown('## Choose the end year of publication')
|
| 60 |
end_year = st.sidebar.slider('year end:', 2000, this_year, this_year, 1)
|
| 61 |
|
| 62 |
-
|
| 63 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
|
| 65 |
-
return platforms, number_papers, start_year, end_year,
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
# from .utils import PACKAGE_ROOT
|
| 4 |
|
| 5 |
def render_sidebar():
|
| 6 |
+
icons = f'''
|
| 7 |
+
<center>
|
| 8 |
+
<a href="https://github.com/Mondkuchen/idp_LiteratureResearch_Tool"><img src = "https://cdn-icons-png.flaticon.com/512/733/733609.png" width="23"></img></a> <a href="mailto:xiang.tao@outlook.de"><img src="https://cdn-icons-png.flaticon.com/512/646/646094.png" alt="email" width = "27" ></a>
|
| 9 |
+
</center>
|
| 10 |
+
'''
|
| 11 |
+
|
| 12 |
sidebar_markdown = f'''
|
| 13 |
|
| 14 |
<center>
|
|
|
|
| 20 |
|
| 21 |
|
| 22 |
<code>
|
| 23 |
+
v1.1.0
|
| 24 |
</code>
|
| 25 |
|
| 26 |
|
| 27 |
</center>
|
| 28 |
|
| 29 |
|
| 30 |
+
{icons}
|
|
|
|
|
|
|
| 31 |
|
| 32 |
---
|
| 33 |
|
|
|
|
| 54 |
|
| 55 |
|
| 56 |
st.sidebar.markdown('## Choose the max number of papers to search')
|
| 57 |
+
number_papers=st.sidebar.slider('number', 10, 200, 20, 5)
|
| 58 |
|
| 59 |
st.sidebar.markdown('## Choose the start year of publication')
|
| 60 |
this_year = datetime.date.today().year
|
|
|
|
| 63 |
st.sidebar.markdown('## Choose the end year of publication')
|
| 64 |
end_year = st.sidebar.slider('year end:', 2000, this_year, this_year, 1)
|
| 65 |
|
| 66 |
+
|
| 67 |
+
with st.sidebar:
|
| 68 |
+
st.markdown('## Adjust clustering hyperparameters')
|
| 69 |
+
with st.expander('Clustering Hyperparameters'):
|
| 70 |
+
dr = st.selectbox('1) Dimension Reduction', options=['none', 'pca'], index=0)
|
| 71 |
+
tmp = min(number_papers,15)
|
| 72 |
+
max_k = st.slider('2) Max number of clusters', 2,tmp , tmp//2)
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
st.markdown('---')
|
| 76 |
+
st.markdown(icons,unsafe_allow_html=True)
|
| 77 |
+
st.markdown('''<center>copyright@2022</center>''',unsafe_allow_html=True)
|
| 78 |
+
|
| 79 |
+
# st.sidebar.markdown('## Choose the number of clusters')
|
| 80 |
+
# k = st.sidebar.slider('number',1,10,3)
|
| 81 |
|
| 82 |
+
return platforms, number_papers, start_year, end_year, dict(
|
| 83 |
+
dimension_reduction= dr,
|
| 84 |
+
max_k = max_k
|
| 85 |
+
)
|