| """ seqeval metric. """ |
|
|
| import importlib |
| from typing import List, Optional, Union |
|
|
| import datasets |
| from seqeval.metrics import accuracy_score, classification_report |
|
|
| import evaluate |
|
|
|
|
_CITATION = """\
@inproceedings{ramshaw-marcus-1995-text,
    title = "Text Chunking using Transformation-Based Learning",
    author = "Ramshaw, Lance and
      Marcus, Mitch",
    booktitle = "Third Workshop on Very Large Corpora",
    year = "1995",
    url = "https://www.aclweb.org/anthology/W95-0107",
}
@misc{seqeval,
    title={{seqeval}: A Python framework for sequence labeling evaluation},
    url={https://github.com/chakki-works/seqeval},
    note={Software available from https://github.com/chakki-works/seqeval},
    author={Hiroki Nakayama},
    year={2018},
}
"""

_DESCRIPTION = """\
seqeval is a Python framework for sequence labeling evaluation.
seqeval can evaluate the performance of chunking tasks such as named-entity recognition,
part-of-speech tagging, semantic role labeling and so on.

This is well-tested by using the Perl script conlleval, which can be used for
measuring the performance of a system that has processed the CoNLL-2000 shared task data.

seqeval supports the following formats:
IOB1
IOB2
IOE1
IOE2
IOBES

See the README.md file at https://github.com/chakki-works/seqeval for more information.
"""

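# For orientation, the same two-token LOC entity under two of the supported
# schemes (illustrative values, not part of the seqeval API):
#   IOB2:  ["B-LOC", "I-LOC", "O"]
#   IOBES: ["B-LOC", "E-LOC", "O"]
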
_KWARGS_DESCRIPTION = """
Produces labeling scores along with their sufficient statistics
from a source against one or more references.

Args:
    predictions: List of List of predicted labels (Estimated targets as returned by a tagger)
    references: List of List of reference labels (Ground truth (correct) target values)
    suffix: True if the IOB tag is placed after the entity type (e.g. "PER-B" rather than "B-PER"), False otherwise.
        default: False
    scheme: Specify target tagging scheme. Should be one of ["IOB1", "IOB2", "IOE1", "IOE2", "IOBES", "BILOU"].
        default: None
    mode: Whether to count correct entity labels with incorrect I/B tags as true positives or not.
        If you want to only count exact matches, pass mode="strict". default: None.
    sample_weight: Array-like of shape (n_samples,), weights for individual samples. default: None
    zero_division: Which value to substitute as a metric value when encountering zero division. Should be one of 0, 1,
        "warn". "warn" acts as 0, but the warning is raised. default: "warn"

Returns:
    'scores': dict. Summary of the scores for overall and per type
        Overall:
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure,
        Per type:
            'precision': precision,
            'recall': recall,
            'f1': F1 score, also known as balanced F-score or F-measure,
            'number': the number of occurrences of the type in the references

Examples:

    >>> predictions = [['O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> references = [['O', 'O', 'O', 'B-MISC', 'I-MISC', 'I-MISC', 'O'], ['B-PER', 'I-PER', 'O']]
    >>> seqeval = evaluate.load("seqeval")
    >>> results = seqeval.compute(predictions=predictions, references=references)
    >>> print(list(results.keys()))
    ['MISC', 'PER', 'overall_precision', 'overall_recall', 'overall_f1', 'overall_accuracy']
    >>> print(results["overall_f1"])
    0.5
    >>> print(results["PER"]["f1"])
    1.0
"""


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class Seqeval(evaluate.Metric):
    def _info(self):
        return evaluate.MetricInfo(
            description=_DESCRIPTION,
            citation=_CITATION,
            homepage="https://github.com/chakki-works/seqeval",
            inputs_description=_KWARGS_DESCRIPTION,
            features=datasets.Features(
                {
                    "predictions": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
                    "references": datasets.Sequence(datasets.Value("string", id="label"), id="sequence"),
                }
            ),
            codebase_urls=["https://github.com/chakki-works/seqeval"],
            reference_urls=["https://github.com/chakki-works/seqeval"],
        )

    def _compute(
        self,
        predictions,
        references,
        suffix: bool = False,
        scheme: Optional[str] = None,
        mode: Optional[str] = None,
        sample_weight: Optional[List[int]] = None,
        zero_division: Union[str, int] = "warn",
    ):
        if scheme is not None:
            try:
                # Resolve the scheme name (e.g. "IOB2") to the matching class
                # in seqeval.scheme.
                scheme_module = importlib.import_module("seqeval.scheme")
                scheme = getattr(scheme_module, scheme)
            except AttributeError:
                raise ValueError(f"Scheme should be one of [IOB1, IOB2, IOE1, IOE2, IOBES, BILOU], got {scheme}")
        report = classification_report(
            y_true=references,
            y_pred=predictions,
            suffix=suffix,
            output_dict=True,
            scheme=scheme,
            mode=mode,
            sample_weight=sample_weight,
            zero_division=zero_division,
        )
        # Keep the per-type entries; report the micro average as the overall
        # score and drop the remaining aggregate rows.
        report.pop("macro avg")
        report.pop("weighted avg")
        overall_score = report.pop("micro avg")

        scores = {
            type_name: {
                "precision": score["precision"],
                "recall": score["recall"],
                "f1": score["f1-score"],
                "number": score["support"],
            }
            for type_name, score in report.items()
        }
        scores["overall_precision"] = overall_score["precision"]
        scores["overall_recall"] = overall_score["recall"]
        scores["overall_f1"] = overall_score["f1-score"]
        scores["overall_accuracy"] = accuracy_score(y_true=references, y_pred=predictions)

        return scores
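

# A minimal usage sketch, not part of the module's public surface: it assumes
# the `evaluate` and `seqeval` packages are installed and that the "seqeval"
# metric script can be loaded (locally or from the Hub).
if __name__ == "__main__":
    seqeval = evaluate.load("seqeval")
    demo_predictions = [["O", "B-PER", "I-PER", "O", "B-LOC"]]
    demo_references = [["O", "B-PER", "I-PER", "O", "B-LOC"]]
    demo_scores = seqeval.compute(
        predictions=demo_predictions,
        references=demo_references,
        mode="strict",
        scheme="IOB2",
    )
    print(demo_scores["overall_f1"])  # 1.0 for this exact match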