Spaces:
Runtime error
Runtime error
add sample
Browse files
app.py
CHANGED
|
@@ -1,15 +1,7 @@
|
|
| 1 |
import streamlit as st
|
| 2 |
import requests
|
| 3 |
-
import os
|
| 4 |
|
| 5 |
-
enable_xorbits =
|
| 6 |
-
|
| 7 |
-
if enable_xorbits:
|
| 8 |
-
import xorbits
|
| 9 |
-
xorbits.init()
|
| 10 |
-
import xorbits.pandas as pd
|
| 11 |
-
else:
|
| 12 |
-
import pandas as pd
|
| 13 |
|
| 14 |
st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
|
| 15 |
st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
|
|
@@ -25,28 +17,39 @@ st.sidebar.header("Please Paste The HF Dataset Name Here:")
|
|
| 25 |
|
| 26 |
#@st.cache_data
|
| 27 |
def load_dataset(j, name, fraction):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
|
| 29 |
-
if not os.path.exists('train.gzip'):
|
| 30 |
with st.spinner('Downloading file from remote server'):
|
| 31 |
import pandas
|
| 32 |
train_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'train']
|
| 33 |
train_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in train_urls], ignore_index=True)
|
| 34 |
-
train_dataset.to_parquet('train.gzip')
|
| 35 |
|
| 36 |
-
if not os.path.exists('test.gzip'):
|
| 37 |
with st.spinner('Downloading file from remote server'):
|
| 38 |
import pandas
|
| 39 |
test_urls = [f['url'] for f in j['parquet_files'] if f['config'] == name and f['split'] == 'validation']
|
| 40 |
test_dataset = pandas.concat([pandas.read_parquet(url, engine='pyarrow') for url in test_urls], ignore_index=True)
|
| 41 |
-
test_dataset.to_parquet('test.gzip')
|
| 42 |
|
| 43 |
-
train_dataset = pd.read_parquet('train.gzip', engine='pyarrow')
|
|
|
|
| 44 |
|
| 45 |
-
|
|
|
|
|
|
|
| 46 |
|
| 47 |
dataset = {
|
| 48 |
-
"train": train_dataset
|
| 49 |
-
"test": test_dataset
|
| 50 |
}
|
| 51 |
|
| 52 |
return dataset
|
|
@@ -351,9 +354,9 @@ data was heavily used in their benchmark datasets.
|
|
| 351 |
|
| 352 |
def process_data(df):
|
| 353 |
minhashes = {}
|
| 354 |
-
for idx,
|
| 355 |
minhash = MinHash(num_perm=128)
|
| 356 |
-
for d in ngrams(
|
| 357 |
s = "".join(d).encode('utf-8')
|
| 358 |
minhash.update(s)
|
| 359 |
minhashes[idx] = minhash
|
|
|
|
| 1 |
import streamlit as st
|
| 2 |
import requests
|
|
|
|
| 3 |
|
| 4 |
+
# Feature flag read by load_dataset(): when True the cached parquet files are
# read through xorbits.pandas (after xorbits.init()); when False plain pandas
# is used instead.
enable_xorbits = True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
# Configure the Streamlit page (browser title, icon, wide layout) before any
# other UI element is emitted.
st.set_page_config(page_title="Analyzing Text Corpus on Hugging Face", page_icon=":bar_chart:", layout="wide")
|
| 7 |
# Heading displayed in the sidebar.
st.sidebar.title('A Tool for Analyzing Text Corpus on Hugging Face')
|
|
|
|
| 17 |
|
| 18 |
#@st.cache_data
|
| 19 |
def load_dataset(j, name, fraction):
    """Load the train and validation splits of a dataset config, sampled.

    Each split is downloaded once and cached in the working directory as
    ``<name>-train.gzip`` / ``<name>-test.gzip``. Despite the suffix these are
    parquet files written with ``to_parquet`` defaults. Later calls read the
    cached files instead of downloading again.

    Args:
        j: Mapping with a ``'parquet_files'`` list whose entries carry
            ``'url'``, ``'config'`` and ``'split'`` keys (presumably the
            Hugging Face parquet listing for the dataset; confirm with caller).
        name: Config name used to select entries and to build cache file names.
        fraction: Passed to ``DataFrame.sample(frac=...)`` for both splits.

    Returns:
        dict with keys ``"train"`` and ``"test"``; the ``"test"`` entry is
        built from the ``'validation'`` split.

    Raises:
        ValueError: If a split that must be downloaded has no parquet files
            listed for ``name``.
    """
    import os

    if enable_xorbits:
        import xorbits
        # NOTE(review): this runs on every call; confirm that repeated
        # initialisation is harmless when Streamlit reruns the script.
        xorbits.init()
        import xorbits.pandas as pd
    else:
        import pandas as pd

    train_path = '%s-train.gzip' % name
    test_path = '%s-test.gzip' % name

    # Download only the splits that are not cached yet, train first.
    for split, path in (('train', train_path), ('validation', test_path)):
        if not os.path.exists(path):
            with st.spinner('Downloading file from remote server'):
                _download_split(j, name, split, path)

    train_dataset = pd.read_parquet(train_path, engine='pyarrow')
    test_dataset = pd.read_parquet(test_path, engine='pyarrow')

    if enable_xorbits:
        # rebalance() returns a new DataFrame rather than working in place,
        # so the result has to be kept for the call to have any effect.
        train_dataset = train_dataset.rebalance()
        test_dataset = test_dataset.rebalance()

    return {
        "train": train_dataset.sample(frac=fraction),
        "test": test_dataset.sample(frac=fraction),
    }


def _download_split(j, name, split, path):
    """Fetch every parquet shard of one split and write it to ``path``."""
    # Plain pandas is used for the download regardless of enable_xorbits.
    import pandas

    urls = [
        f['url']
        for f in j['parquet_files']
        if f['config'] == name and f['split'] == split
    ]
    if not urls:
        # pandas.concat([]) would otherwise fail with an opaque
        # "No objects to concatenate" error.
        raise ValueError(
            "No parquet files found for config %r, split %r" % (name, split)
        )
    shards = [pandas.read_parquet(url, engine='pyarrow') for url in urls]
    pandas.concat(shards, ignore_index=True).to_parquet(path)
|
|
|
|
| 354 |
|
| 355 |
def process_data(df):
|
| 356 |
minhashes = {}
|
| 357 |
+
for idx, text in enumerate(df['text']):
|
| 358 |
minhash = MinHash(num_perm=128)
|
| 359 |
+
for d in ngrams(text, 13):
|
| 360 |
s = "".join(d).encode('utf-8')
|
| 361 |
minhash.update(s)
|
| 362 |
minhashes[idx] = minhash
|