Spaces:
Runtime error
Runtime error
Mariusz Kossakowski committed on
Commit ·
acb3b1d
1
Parent(s): 1d713ee
Generalize dashboard for multiple datasets
Browse files- app.py +96 -84
- data/{dev.csv → validation.csv} +0 -0
app.py
CHANGED
|
@@ -1,22 +1,26 @@
|
|
| 1 |
import re
|
| 2 |
-
from typing import Dict, List
|
| 3 |
|
|
|
|
| 4 |
import pandas as pd
|
| 5 |
import plotly.figure_factory as ff
|
| 6 |
import plotly.graph_objects as go
|
|
|
|
| 7 |
import streamlit as st
|
| 8 |
from unidecode import unidecode
|
| 9 |
|
| 10 |
-
|
| 11 |
|
| 12 |
-
DATA_SPLITS = ["train", "dev", "test"]
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
|
| 18 |
|
| 19 |
-
def flatten_list(main_list:
|
| 20 |
return [item for sublist in main_list for item in sublist]
|
| 21 |
|
| 22 |
|
|
@@ -28,54 +32,78 @@ def count_num_of_words(text: str) -> int:
|
|
| 28 |
return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
|
| 29 |
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
|
| 33 |
header = st.container()
|
| 34 |
description = st.container()
|
| 35 |
dataset_statistics = st.container()
|
| 36 |
-
class_distribution = st.container()
|
| 37 |
|
| 38 |
with header:
|
| 39 |
-
st.title(
|
| 40 |
|
| 41 |
with description:
|
| 42 |
st.header("Dataset description")
|
| 43 |
-
|
| 44 |
-
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
| 45 |
-
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
| 46 |
-
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
| 47 |
-
we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
|
| 48 |
-
contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
|
| 49 |
-
or many more. In all these situations, you will need to conclude the contract, but there is a high probability
|
| 50 |
-
that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
|
| 51 |
-
businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
|
| 52 |
-
requiring consumers to accept.
|
| 53 |
-
|
| 54 |
-
Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
|
| 55 |
-
clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
|
| 56 |
-
situation of imbalance between the duties and rights of the parties.
|
| 57 |
-
|
| 58 |
-
On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
|
| 59 |
-
we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
|
| 60 |
-
learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
|
| 61 |
-
agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
|
| 62 |
-
analyze contracts and understand what they agree upon.
|
| 63 |
-
"""
|
| 64 |
-
st.write(desc)
|
| 65 |
-
|
| 66 |
-
st.header("Dataset statistics")
|
| 67 |
|
| 68 |
with dataset_statistics:
|
|
|
|
| 69 |
st.subheader("Number of samples in each data split")
|
| 70 |
metrics_df = pd.DataFrame.from_dict(
|
| 71 |
{
|
| 72 |
"Train": DATA_DICT["train"].shape[0],
|
| 73 |
-
"
|
| 74 |
"Test": DATA_DICT["test"].shape[0],
|
| 75 |
"Total": sum(
|
| 76 |
[
|
| 77 |
DATA_DICT["train"].shape[0],
|
| 78 |
-
DATA_DICT["
|
| 79 |
DATA_DICT["test"].shape[0],
|
| 80 |
]
|
| 81 |
),
|
|
@@ -84,61 +112,44 @@ with dataset_statistics:
|
|
| 84 |
).reset_index()
|
| 85 |
metrics_df.columns = ["Subset", "Number of samples"]
|
| 86 |
st.dataframe(metrics_df)
|
| 87 |
-
latex_df = pd.DataFrame([metrics_df.style.to_latex()])
|
| 88 |
-
st.button(
|
| 89 |
-
label="Copy table to LaTeX",
|
| 90 |
-
on_click=latex_df.to_clipboard(index=False, header=False),
|
| 91 |
-
key="copy_metrics_df",
|
| 92 |
-
)
|
| 93 |
|
| 94 |
-
|
| 95 |
-
|
|
|
|
|
|
|
| 96 |
st.subheader("Class distribution in each subset")
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
[
|
| 102 |
-
|
| 103 |
-
for k, df in DATA_DICT.items()
|
| 104 |
-
]
|
| 105 |
-
)
|
| 106 |
-
.reset_index()
|
| 107 |
-
.rename({"index": "split_name"}, axis=1)
|
| 108 |
-
)
|
| 109 |
-
barchart_class_dist = go.Figure(
|
| 110 |
-
data=[
|
| 111 |
-
go.Bar(
|
| 112 |
-
name="BEZPIECZNE_POSTANOWIENIE_UMOWNE",
|
| 113 |
-
x=DATA_SPLITS,
|
| 114 |
-
y=hist["BEZPIECZNE_POSTANOWIENIE_UMOWNE"].values,
|
| 115 |
-
),
|
| 116 |
-
go.Bar(
|
| 117 |
-
name="KLAUZULA_ABUZYWNA",
|
| 118 |
-
x=DATA_SPLITS,
|
| 119 |
-
y=hist["KLAUZULA_ABUZYWNA"].values,
|
| 120 |
-
),
|
| 121 |
]
|
| 122 |
)
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
| 127 |
-
|
| 128 |
-
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
for _ in range(10):
|
| 132 |
-
st.text("")
|
| 133 |
-
st.dataframe(hist)
|
| 134 |
-
latex_df_class_dist = pd.DataFrame([hist.style.to_latex()])
|
| 135 |
-
st.button(
|
| 136 |
-
label="Copy table to LaTeX",
|
| 137 |
-
on_click=latex_df_class_dist.to_clipboard(header=False, index=False),
|
| 138 |
-
key="copy_class_dist_df",
|
| 139 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 140 |
|
| 141 |
# Number of words per observation
|
|
|
|
| 142 |
hist_data_num_words = [
|
| 143 |
df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
|
| 144 |
]
|
|
@@ -155,6 +166,7 @@ with class_distribution:
|
|
| 155 |
st.plotly_chart(fig_num_words, use_container_width=True)
|
| 156 |
|
| 157 |
# Number of characters per observation
|
|
|
|
| 158 |
hist_data_num_characters = [
|
| 159 |
df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
|
| 160 |
]
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
|
| 3 |
+
from datasets import load_dataset
|
| 4 |
import pandas as pd
|
| 5 |
import plotly.figure_factory as ff
|
| 6 |
import plotly.graph_objects as go
|
| 7 |
+
import pyperclip
|
| 8 |
import streamlit as st
|
| 9 |
from unidecode import unidecode
|
| 10 |
|
| 11 |
+
DATA_SPLITS = ["train", "validation", "test"]
|
| 12 |
|
|
|
|
| 13 |
|
| 14 |
+
def load_data() -> dict[str, pd.DataFrame]:
    """Read every CSV split from ``data/`` into a DataFrame.

    Returns a mapping from split name ("train"/"validation"/"test", per
    DATA_SPLITS) to its DataFrame, with the "label" column renamed to
    "target" so all datasets share one target-column name.
    """
    splits: dict[str, pd.DataFrame] = {}
    for split_name in DATA_SPLITS:
        frame = pd.read_csv(f"data/{split_name}.csv")
        splits[split_name] = frame.rename({"label": "target"}, axis="columns")
    return splits
|
| 21 |
|
| 22 |
|
| 23 |
+
def flatten_list(main_list: list[list]) -> list:
    """Concatenate the sub-lists of *main_list* into one flat list."""
    flat: list = []
    for sublist in main_list:
        flat.extend(sublist)
    return flat
|
| 25 |
|
| 26 |
|
|
|
|
| 32 |
return len(re.sub(r"[^a-zA-Z ]", "", unidecode(text)).split(" "))
|
| 33 |
|
| 34 |
|
| 35 |
+
# Sidebar control: the chosen dataset drives everything rendered below.
_DATASET_CHOICES = (
    "clarin-pl/polemo2-official",
    "laugustyniak/abusive-clauses-pl",
)
selected_dataset = st.sidebar.selectbox(
    "Choose a dataset to load",
    _DATASET_CHOICES,
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def load_hf_dataset():
|
| 42 |
+
match selected_dataset:
|
| 43 |
+
case "clarin-pl/polemo2-official":
|
| 44 |
+
data = load_dataset("clarin-pl/polemo2-official")
|
| 45 |
+
DATA_DICT = {
|
| 46 |
+
"train": data["train"].to_pandas(),
|
| 47 |
+
"validation": data["validation"].to_pandas(),
|
| 48 |
+
"test": data["test"].to_pandas(),
|
| 49 |
+
}
|
| 50 |
+
DATA_DESCRIPTION = """The PolEmo2.0 is a dataset of online consumer reviews from four domains: medicine,
|
| 51 |
+
hotels, products, and university. It is human-annotated on a level of full reviews and individual
|
| 52 |
+
sentences. Current version (PolEmo 2.0) contains 8,216 reviews having 57,466 sentences. Each text and
|
| 53 |
+
sentence was manually annotated with sentiment in the 2+1 scheme, which gives a total of 197,
|
| 54 |
+
046 annotations. About 85% of the reviews are from the medicine and hotel domains. Each review is
|
| 55 |
+
annotated with four labels: positive, negative, neutral, or ambiguous. """
|
| 56 |
+
case "laugustyniak/abusive-clauses-pl":
|
| 57 |
+
DATA_DICT = load_data()
|
| 58 |
+
DATA_DESCRIPTION = """
|
| 59 |
+
''I have read and agree to the terms and conditions'' is one of the biggest lies on the Internet.
|
| 60 |
+
Consumers rarely read the contracts they are required to accept. We conclude agreements over the Internet daily.
|
| 61 |
+
But do we know the content of these agreements? Do we check potential unfair statements? On the Internet,
|
| 62 |
+
we probably skip most of the Terms and Conditions. However, we must remember that we have concluded many more
|
| 63 |
+
contracts. Imagine that we want to buy a house, a car, send our kids to the nursery, open a bank account,
|
| 64 |
+
or many more. In all these situations, you will need to conclude the contract, but there is a high probability
|
| 65 |
+
that you will not read the entire agreement with proper understanding. European consumer law aims to prevent
|
| 66 |
+
businesses from using so-called ''unfair contractual terms'' in their unilaterally drafted contracts,
|
| 67 |
+
requiring consumers to accept.
|
| 68 |
+
|
| 69 |
+
Our dataset treats ''unfair contractual term'' as the equivalent of an abusive clause. It could be defined as a
|
| 70 |
+
clause that is unilaterally imposed by one of the contract's parties, unequally affecting the other, or creating a
|
| 71 |
+
situation of imbalance between the duties and rights of the parties.
|
| 72 |
+
|
| 73 |
+
On the EU and at the national such as the Polish levels, agencies cannot check possible agreements by hand. Hence,
|
| 74 |
+
we took the first step to evaluate the possibility of accelerating this process. We created a dataset and machine
|
| 75 |
+
learning models to automate potentially abusive clauses detection partially. Consumer protection organizations and
|
| 76 |
+
agencies can use these resources to make their work more effective and efficient. Moreover, consumers can automatically
|
| 77 |
+
analyze contracts and understand what they agree upon.
|
| 78 |
+
"""
|
| 79 |
+
return DATA_DICT, DATA_DESCRIPTION
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
DATA_DICT, DATA_DESCRIPTION = load_hf_dataset()
|
| 83 |
|
| 84 |
header = st.container()
|
| 85 |
description = st.container()
|
| 86 |
dataset_statistics = st.container()
|
|
|
|
| 87 |
|
| 88 |
with header:
|
| 89 |
+
st.title(selected_dataset)
|
| 90 |
|
| 91 |
with description:
|
| 92 |
st.header("Dataset description")
|
| 93 |
+
st.write(DATA_DESCRIPTION)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
with dataset_statistics:
|
| 96 |
+
st.header("Dataset statistics")
|
| 97 |
st.subheader("Number of samples in each data split")
|
| 98 |
metrics_df = pd.DataFrame.from_dict(
|
| 99 |
{
|
| 100 |
"Train": DATA_DICT["train"].shape[0],
|
| 101 |
+
"Validation": DATA_DICT["validation"].shape[0],
|
| 102 |
"Test": DATA_DICT["test"].shape[0],
|
| 103 |
"Total": sum(
|
| 104 |
[
|
| 105 |
DATA_DICT["train"].shape[0],
|
| 106 |
+
DATA_DICT["validation"].shape[0],
|
| 107 |
DATA_DICT["test"].shape[0],
|
| 108 |
]
|
| 109 |
),
|
|
|
|
| 112 |
).reset_index()
|
| 113 |
metrics_df.columns = ["Subset", "Number of samples"]
|
| 114 |
st.dataframe(metrics_df)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 115 |
|
| 116 |
+
latex_df = metrics_df.style.to_latex()
|
| 117 |
+
st.text_area(label="Latex code", value=latex_df)
|
| 118 |
+
|
| 119 |
+
# Class distribution in each subset
|
| 120 |
st.subheader("Class distribution in each subset")
|
| 121 |
+
target_unique_values = DATA_DICT["train"]["target"].unique()
|
| 122 |
+
hist = (
|
| 123 |
+
pd.DataFrame(
|
| 124 |
+
[
|
| 125 |
+
df["target"].value_counts(normalize=True).rename(k)
|
| 126 |
+
for k, df in DATA_DICT.items()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
]
|
| 128 |
)
|
| 129 |
+
.reset_index()
|
| 130 |
+
.rename({"index": "split_name"}, axis=1)
|
| 131 |
+
)
|
| 132 |
+
plot_data = [
|
| 133 |
+
go.Bar(
|
| 134 |
+
name=str(target_unique_values[i]),
|
| 135 |
+
x=DATA_SPLITS,
|
| 136 |
+
y=hist[target_unique_values[i]].values,
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 137 |
)
|
| 138 |
+
for i in range(len(target_unique_values))
|
| 139 |
+
]
|
| 140 |
+
barchart_class_dist = go.Figure(data=plot_data)
|
| 141 |
+
barchart_class_dist.update_layout(
|
| 142 |
+
barmode="group",
|
| 143 |
+
title_text="Barchart - class distribution",
|
| 144 |
+
xaxis_title="Split name",
|
| 145 |
+
yaxis_title="Number of data points",
|
| 146 |
+
)
|
| 147 |
+
st.plotly_chart(barchart_class_dist, use_container_width=True)
|
| 148 |
+
st.dataframe(hist)
|
| 149 |
+
st.text_area(label="Latex code", value=hist.style.to_latex())
|
| 150 |
|
| 151 |
# Number of words per observation
|
| 152 |
+
st.subheader("Number of words per observation in each subset")
|
| 153 |
hist_data_num_words = [
|
| 154 |
df["text"].apply(count_num_of_words) for df in DATA_DICT.values()
|
| 155 |
]
|
|
|
|
| 166 |
st.plotly_chart(fig_num_words, use_container_width=True)
|
| 167 |
|
| 168 |
# Number of characters per observation
|
| 169 |
+
st.subheader("Number of characters per observation in each subset")
|
| 170 |
hist_data_num_characters = [
|
| 171 |
df["text"].apply(count_num_of_characters) for df in DATA_DICT.values()
|
| 172 |
]
|
data/{dev.csv → validation.csv}
RENAMED
|
File without changes
|