Spaces:
Sleeping
Sleeping
Commit
·
0bfcadb
1
Parent(s):
bd9233d
ádd
Browse files
.DS_Store
CHANGED
|
Binary files a/.DS_Store and b/.DS_Store differ
|
|
|
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import spacy
|
|
|
|
| 4 |
import gradio as gr
|
| 5 |
import umap
|
| 6 |
from sklearn.cluster import OPTICS
|
|
@@ -27,15 +28,13 @@ ARTICLE = r"""<center>
|
|
| 27 |
|
| 28 |
def load_data(fileobj):
|
| 29 |
"""Load dataset (keep only 500 rows for efficiency)"""
|
| 30 |
-
data = pd.read_csv(fileobj, on_bad_lines='skip', nrows=500)
|
| 31 |
assert "text" in data.columns, "The data must have a column named 'text'"
|
| 32 |
return data[['text']]
|
| 33 |
|
| 34 |
|
| 35 |
def run_nlp_processing(data):
|
| 36 |
"""As reference for standard NLP processing"""
|
| 37 |
-
import os
|
| 38 |
-
|
| 39 |
# NLP processing
|
| 40 |
docs = []
|
| 41 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
|
|
@@ -130,7 +129,7 @@ with blocks:
|
|
| 130 |
)
|
| 131 |
in_file = gr.File()
|
| 132 |
gr.Markdown("## Inspect the data")
|
| 133 |
-
in_data = gr.Dataframe()
|
| 134 |
submit_button = gr.Button("Run BERTopic!")
|
| 135 |
gr.Examples(inputs=in_file, examples=EXAMPLES)
|
| 136 |
with gr.Column():
|
|
@@ -139,7 +138,7 @@ with blocks:
|
|
| 139 |
"Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
|
| 140 |
)
|
| 141 |
gr.Markdown("## Processed Text")
|
| 142 |
-
out_dataset = gr.Dataframe()
|
| 143 |
gr.Markdown("## Embedding + Projection + Clustering")
|
| 144 |
embedding_plot = gr.Plot(label="BERTopic projections")
|
| 145 |
gr.Markdown("## Extracted Topics")
|
|
@@ -147,7 +146,7 @@ with blocks:
|
|
| 147 |
gr.Markdown(ARTICLE)
|
| 148 |
# event listeners
|
| 149 |
in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
|
| 150 |
-
|
| 151 |
# out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
|
| 152 |
|
| 153 |
blocks.launch()
|
|
|
|
| 1 |
import pandas as pd
|
| 2 |
import numpy as np
|
| 3 |
import spacy
|
| 4 |
+
import os
|
| 5 |
import gradio as gr
|
| 6 |
import umap
|
| 7 |
from sklearn.cluster import OPTICS
|
|
|
|
| 28 |
|
| 29 |
def load_data(fileobj):
|
| 30 |
"""Load dataset (keep only 500 rows for efficiency)"""
|
| 31 |
+
data = pd.read_csv(fileobj.name, on_bad_lines='skip', nrows=500)
|
| 32 |
assert "text" in data.columns, "The data must have a column named 'text'"
|
| 33 |
return data[['text']]
|
| 34 |
|
| 35 |
|
| 36 |
def run_nlp_processing(data):
|
| 37 |
"""As reference for standard NLP processing"""
|
|
|
|
|
|
|
| 38 |
# NLP processing
|
| 39 |
docs = []
|
| 40 |
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "ner"])
|
|
|
|
| 129 |
)
|
| 130 |
in_file = gr.File()
|
| 131 |
gr.Markdown("## Inspect the data")
|
| 132 |
+
in_data = gr.Dataframe(max_rows=5)
|
| 133 |
submit_button = gr.Button("Run BERTopic!")
|
| 134 |
gr.Examples(inputs=in_file, examples=EXAMPLES)
|
| 135 |
with gr.Column():
|
|
|
|
| 138 |
"Text -> Word-Piece Tokenization -> BERT-embedding -> UMAP -> HDBSCAN -> Topic"
|
| 139 |
)
|
| 140 |
gr.Markdown("## Processed Text")
|
| 141 |
+
out_dataset = gr.Dataframe(max_rows=5)
|
| 142 |
gr.Markdown("## Embedding + Projection + Clustering")
|
| 143 |
embedding_plot = gr.Plot(label="BERTopic projections")
|
| 144 |
gr.Markdown("## Extracted Topics")
|
|
|
|
| 146 |
gr.Markdown(ARTICLE)
|
| 147 |
# event listeners
|
| 148 |
in_file = in_file.upload(inputs=in_file, outputs=in_data, fn=load_data)
|
| 149 |
+
submit_button.click(inputs=in_data, outputs=out_dataset, fn=run_bert_tokenization)
|
| 150 |
# out_dataset.change(inputs=out_dataset, outputs=embedding_plot, fn=run_bertopic)
|
| 151 |
|
| 152 |
blocks.launch()
|