Spaces:
Runtime error
Runtime error
Mariusz Kossakowski committed on
Commit ·
f10673c
1
Parent(s): 8eb9cdc
Black formatting
Browse files
clarin_datasets/kpwr_ner_datasets.py
CHANGED
|
@@ -72,7 +72,9 @@ class KpwrNerDataset(DatasetToShow):
|
|
| 72 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
| 73 |
tokens_all = full_dataframe["tokens"].tolist()
|
| 74 |
tokens_all = [x for subarray in tokens_all for x in subarray]
|
| 75 |
-
labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
|
|
|
|
|
|
|
| 76 |
labels_all = [x for subarray in labels_all for x in subarray]
|
| 77 |
|
| 78 |
with dataframe_head:
|
|
@@ -93,9 +95,9 @@ class KpwrNerDataset(DatasetToShow):
|
|
| 93 |
all_labels_from_subset = pd.Series(all_labels_from_subset)
|
| 94 |
class_distribution_dict[subset] = (
|
| 95 |
all_labels_from_subset.value_counts(normalize=True)
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
)
|
| 100 |
|
| 101 |
class_distribution_df = pd.merge(
|
|
@@ -117,7 +119,10 @@ class KpwrNerDataset(DatasetToShow):
|
|
| 117 |
"ner": labels_all,
|
| 118 |
}
|
| 119 |
)
|
| 120 |
-
full_df_unzipped = full_df_unzipped.loc[
|
|
|
|
|
|
|
|
|
|
| 121 |
possible_options = sorted(full_df_unzipped["ner"].unique())
|
| 122 |
with most_common_tokens:
|
| 123 |
st.header("10 most common tokens from selected class (without 'O')")
|
|
|
|
| 72 |
full_dataframe = pd.concat(self.data_dict.values(), axis="rows")
|
| 73 |
tokens_all = full_dataframe["tokens"].tolist()
|
| 74 |
tokens_all = [x for subarray in tokens_all for x in subarray]
|
| 75 |
+
labels_all = pd.concat(self.data_dict_named.values(), axis="rows")[
|
| 76 |
+
"ner"
|
| 77 |
+
].tolist()
|
| 78 |
labels_all = [x for subarray in labels_all for x in subarray]
|
| 79 |
|
| 80 |
with dataframe_head:
|
|
|
|
| 95 |
all_labels_from_subset = pd.Series(all_labels_from_subset)
|
| 96 |
class_distribution_dict[subset] = (
|
| 97 |
all_labels_from_subset.value_counts(normalize=True)
|
| 98 |
+
.sort_index()
|
| 99 |
+
.reset_index()
|
| 100 |
+
.rename({"index": "class", 0: subset}, axis="columns")
|
| 101 |
)
|
| 102 |
|
| 103 |
class_distribution_df = pd.merge(
|
|
|
|
| 119 |
"ner": labels_all,
|
| 120 |
}
|
| 121 |
)
|
| 122 |
+
full_df_unzipped = full_df_unzipped.loc[
|
| 123 |
+
(full_df_unzipped["ner"] != "O")
|
| 124 |
+
& (full_df_unzipped["ner"].str.startswith("I-"))
|
| 125 |
+
]
|
| 126 |
possible_options = sorted(full_df_unzipped["ner"].unique())
|
| 127 |
with most_common_tokens:
|
| 128 |
st.header("10 most common tokens from selected class (without 'O')")
|