Spaces:
Sleeping
Sleeping
Sasha
committed on
Commit
·
4f31875
1
Parent(s):
74e9f8c
catching dataset-specific metrics
Browse files
app.py
CHANGED
|
@@ -31,6 +31,8 @@ tasks= ['classification', 'question answering', 'automatic speech recognition',
|
|
| 31 |
'textual entailment', 'commonsense reasoning', 'summarization']
|
| 32 |
metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
|
| 33 |
|
|
|
|
|
|
|
| 34 |
def find_task(dname):
|
| 35 |
task = None
|
| 36 |
dataset_builder = load_dataset_builder(dataset_name, dataset_config)
|
|
@@ -83,28 +85,29 @@ if dataset_name in metrics:
|
|
| 83 |
code = ''' from datasets import load_metric
|
| 84 |
metric = load_metric(\"'''+dataset_name+'''\")'''
|
| 85 |
st.code(code, language='python')
|
|
|
|
| 86 |
else:
|
| 87 |
st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
|
| 88 |
dedicated_metric = False
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
task = find_task(dataset_name)
|
| 93 |
-
|
| 94 |
-
if task is not None:
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
else:
|
| 107 |
-
|
| 108 |
|
| 109 |
|
| 110 |
#print(dataset_builder.info.task_templates)
|
|
@@ -119,44 +122,45 @@ else:
|
|
| 119 |
#print(dataset_name, dataset_config, dataset_split)
|
| 120 |
|
| 121 |
#print(labels.head())
|
| 122 |
-
if
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
dataset = load_dataset(dataset_name, split=dataset_split)
|
| 126 |
-
|
| 127 |
-
try:
|
| 128 |
-
num_classes = dataset_builder.info.features['label'].num_classes
|
| 129 |
-
labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
|
| 130 |
-
labels = labels.rename(columns={"count_star()": "count"})
|
| 131 |
-
labels.index = dataset_builder.info.features['label'].names
|
| 132 |
-
st.markdown("### Labelled Metrics")
|
| 133 |
-
st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
|
| 134 |
-
#TODO : figure out how to make a label plot
|
| 135 |
-
st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
|
| 136 |
-
total = sum(c for c in labels['count'])
|
| 137 |
-
proportion = [c/total for c in labels['count']]
|
| 138 |
-
#proportion = [0.85, 0.15]
|
| 139 |
-
stdev_dataset= statistics.stdev(proportion)
|
| 140 |
-
if stdev_dataset <= balanced_stdev:
|
| 141 |
-
st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
|
| 142 |
-
st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
|
| 143 |
-
accuracy_code = '''from datasets import load_metric
|
| 144 |
-
metric = load_metric("accuracy")'''
|
| 145 |
-
st.code(accuracy_code, language='python')
|
| 146 |
-
|
| 147 |
else:
|
| 148 |
-
|
| 149 |
-
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
st.markdown("
|
| 157 |
-
|
| 158 |
-
st.
|
| 159 |
-
|
| 160 |
-
|
| 161 |
-
|
| 162 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 31 |
'textual entailment', 'commonsense reasoning', 'summarization']
|
| 32 |
metrics= ['matthews_correlation', 'perplexity', 'meteor', 'code_eval', 'super_glue', 'rouge', 'mauve', 'cer', 'accuracy', 'recall', 'bleurt', 'sari', 'precision', 'mean_iou', 'squad', 'mahalanobis', 'chrf', 'mae', 'squad_v2', 'seqeval', 'cuad', 'wiki_split', 'google_bleu', 'competition_math', 'pearsonr', 'xtreme_s', 'comet', 'gleu', 'spearmanr', 'f1', 'frugalscore', 'bertscore', 'indic_glue', 'mse', 'xnli', 'ter', 'coval', 'wer', 'bleu', 'glue', 'sacrebleu']
|
| 33 |
|
| 34 |
+
dedicated_metric = False
|
| 35 |
+
|
| 36 |
def find_task(dname):
|
| 37 |
task = None
|
| 38 |
dataset_builder = load_dataset_builder(dataset_name, dataset_config)
|
|
|
|
| 85 |
code = ''' from datasets import load_metric
|
| 86 |
metric = load_metric(\"'''+dataset_name+'''\")'''
|
| 87 |
st.code(code, language='python')
|
| 88 |
+
dedicated_metric = True
|
| 89 |
else:
|
| 90 |
st.markdown("This dataset doesn't have a dedicated metric, but that's ok! :wink:")
|
| 91 |
dedicated_metric = False
|
| 92 |
|
| 93 |
+
if dedicated_metric == False:
|
| 94 |
+
st.markdown("### Task-Specific Metrics")
|
| 95 |
+
task = find_task(dataset_name)
|
| 96 |
+
|
| 97 |
+
if task is not None:
|
| 98 |
+
# Tell the user which task was detected; hyphens in the task id are shown as spaces.
st.markdown("The task associated with this dataset is: " + task.replace('-',' '))
|
| 99 |
+
if task == 'automatic-speech-recognition':
|
| 100 |
+
st.markdown('Automatic Speech Recognition has some dedicated metrics such as:')
|
| 101 |
+
st.markdown('[Word Error Rate](https://huggingface.co/metrics/wer)')
|
| 102 |
+
wer_code = '''from datasets import load_metric
|
| 103 |
+
metric = load_metric("wer")'''
|
| 104 |
+
st.code(wer_code, language='python')
|
| 105 |
+
st.markdown('[Character Error Rate](https://huggingface.co/metrics/cer)')
|
| 106 |
+
cer_code = '''from datasets import load_metric
|
| 107 |
+
metric = load_metric("cer")'''
|
| 108 |
+
st.code(cer_code, language='python')
|
| 109 |
+
else:
|
| 110 |
+
st.markdown("The task for this dataset doesn't have any dedicated metrics, but you can still use general ones! :cowboy_hat_face:")
|
| 111 |
|
| 112 |
|
| 113 |
#print(dataset_builder.info.task_templates)
|
|
|
|
| 122 |
#print(dataset_name, dataset_config, dataset_split)
|
| 123 |
|
| 124 |
#print(labels.head())
|
| 125 |
+
if dedicated_metric == False:
|
| 126 |
+
if dataset_name in ['glue','super_glue', 'paws', 'squad_es']:
|
| 127 |
+
dataset = load_dataset(dataset_name, dataset_config, split=dataset_split)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 128 |
else:
|
| 129 |
+
dataset = load_dataset(dataset_name, split=dataset_split)
|
| 130 |
+
|
| 131 |
+
try:
|
| 132 |
+
num_classes = dataset_builder.info.features['label'].num_classes
|
| 133 |
+
labels = query("SELECT COUNT(*) from dataset GROUP BY label").to_pandas()
|
| 134 |
+
labels = labels.rename(columns={"count_star()": "count"})
|
| 135 |
+
labels.index = dataset_builder.info.features['label'].names
|
| 136 |
+
st.markdown("### Labelled Metrics")
|
| 137 |
+
st.markdown("This dataset has "+ str(dataset_builder.info.features['label'].num_classes) + " labels : " + ', '.join(dataset_builder.info.features['label'].names))
|
| 138 |
+
#TODO : figure out how to make a label plot
|
| 139 |
+
st.plotly_chart(px.pie(labels, values = "count", names = labels.index, width=800, height=400))
|
| 140 |
+
total = sum(c for c in labels['count'])
|
| 141 |
+
proportion = [c/total for c in labels['count']]
|
| 142 |
+
#proportion = [0.85, 0.15]
|
| 143 |
+
stdev_dataset= statistics.stdev(proportion)
|
| 144 |
+
if stdev_dataset <= balanced_stdev:
|
| 145 |
+
st.markdown("Since this dataset is well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
|
| 146 |
+
st.markdown('[Accuracy](https://huggingface.co/metrics/accuracy)')
|
| 147 |
+
accuracy_code = '''from datasets import load_metric
|
| 148 |
+
metric = load_metric("accuracy")'''
|
| 149 |
+
st.code(accuracy_code, language='python')
|
| 150 |
+
|
| 151 |
+
else:
|
| 152 |
+
st.markdown("Since this dataset is not well-balanced (with a standard deviation of " + str(round(stdev_dataset,2)) +"), you can look at using:")
|
| 153 |
+
st.markdown('[F1 Score](https://huggingface.co/metrics/f1)')
|
| 154 |
+
accuracy_code = '''from datasets import load_metric
|
| 155 |
+
metric = load_metric("accuracy")'''
|
| 156 |
+
st.code(accuracy_code, language='python')
|
| 157 |
+
st.markdown('Since it takes into account both precision and recall, which works well to evaluate model performance on minority classes.')
|
| 158 |
+
except:
|
| 159 |
+
if task != 'automatic-speech-recognition':
|
| 160 |
+
st.markdown("### Unsupervised Metrics")
|
| 161 |
+
st.markdown("Since this dataset doesn't have any labels, the metrics that you can use for evaluation are:")
|
| 162 |
+
st.markdown('[Perplexity](https://huggingface.co/metrics/perplexity)')
|
| 163 |
+
perplexity_code = '''from datasets import load_metric
|
| 164 |
+
metric = load_metric("perplexity")'''
|
| 165 |
+
st.code(perplexity_code, language='python')
|
| 166 |
+
st.markdown('If you choose a model that was trained on **' + dataset_name + '** and use it to compute perplexity on text generated by your model, this can help determine how similar the two are.')
|