Spaces:
Running
Running
| """Dataset management module for NER annotation.""" | |
| from typing import List, Dict, Union, Tuple | |
| import json | |
| import os | |
| import re | |
class DynamicDataset:
    """Manage and navigate through annotated NER dataset examples.

    Keeps a cursor (``self.current``) into ``self.data``; every navigation
    move is clamped to the valid index range [0, data_len - 1].
    """

    def __init__(
        self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.

        Args:
            data: List of examples, each containing tokenized text and NER
                annotations.
        """
        self.data = data
        self.data_len = len(self.data)
        # Cursor starts before the first example so the first call to
        # next_example() lands on index 0.
        self.current = -1
        for example in self.data:
            # Ensure every example carries a validation flag without
            # overwriting one that is already present.
            example.setdefault("validated", False)

    def _clamp(self, index: int) -> int:
        """Return *index* clamped into the valid range [0, data_len - 1]."""
        return max(0, min(index, self.data_len - 1))

    def next_example(self) -> None:
        """Move to the next example, stopping at the last one."""
        self.current = self._clamp(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example, stopping at the first one."""
        self.current = self._clamp(self.current - 1)

    def example_by_id(self, id: int) -> None:
        """Navigate to a specific example by its index.

        Args:
            id: The index of the example to navigate to; out-of-range
                values are clamped.
        """
        # NOTE: the parameter name shadows the ``id`` builtin but is kept
        # for backward compatibility with keyword-argument callers.
        self.current = self._clamp(id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Return the example at the current cursor position."""
        return self.data[self.current]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into word and punctuation tokens.

    Words may contain internal hyphens or underscores (so
    "state-of-the-art" stays one token); every other non-space character
    becomes its own single-character token.

    Args:
        text: The input text to tokenize.

    Returns:
        List of tokens.
    """
    token_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")
    return token_pattern.findall(text)
def join_tokens(tokens: List[str]) -> str:
    """Join tokens into a sentence with natural spacing.

    A space separates consecutive tokens, except that closing punctuation
    is attached directly to the preceding token.

    Args:
        tokens: List of tokens to join.

    Returns:
        Joined text string.
    """
    no_space_before = {",", ".", "!", "?", ":", ";", "..."}
    result = ""
    for piece in tokens:
        if piece in no_space_before:
            # Glue punctuation onto the previous token.
            result = result.rstrip() + piece
        else:
            result = f"{result} {piece}"
    return result.strip()
def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
    """Prepare text for highlighting with NER annotations.

    Args:
        data: Dictionary with "tokenized_text" (list of tokens) and "ner"
            annotations. Each NER entry is indexed as [start, end, label]
            with an inclusive end index (see the ``entity[0] <= idx <=
            entity[1]`` check below) — TODO confirm against the annotation
            format used by callers.

    Returns:
        List of (text_segment, label) tuples, in token order; ``label`` is
        None for segments outside any entity.
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]
    highlighted_text = []
    current_entity = None  # the [start, end, label] span covering idx, if any
    entity_tokens = []  # tokens accumulated for the active entity segment
    normal_tokens = []  # tokens accumulated outside any entity
    for idx, token in enumerate(tokens):
        # Past the end of the active entity (or no entity yet): flush the
        # finished entity segment, then look for an entity starting at idx.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)
        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            # Entering/continuing an entity: flush any pending plain text
            # first so segment order is preserved.
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens), None))
                normal_tokens = []
            # NOTE(review): each token carries a trailing space and the list
            # is later joined with " ", yielding double spaces between
            # tokens — presumably intentional for the highlight widget;
            # confirm before changing.
            entity_tokens.append(token + " ")
        else:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")
    # Flush whichever segment is still open after the last token.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens), None))
    # Strip the space left in front of punctuation by the joining above.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))
    return cleaned_highlighted_text
def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.

    Args:
        data: The dataset to save.
        filepath: Path to save the dataset; parent directories are created
            as needed.
    """
    parent = os.path.dirname(filepath)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
    # which would fail or corrupt on platforms with a non-UTF-8 default.
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)
def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.

    Args:
        filepath: Path to the dataset file.

    Returns:
        The loaded dataset.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 so files written with ensure_ascii=False (non-ASCII
    # bytes) decode correctly regardless of the platform default encoding.
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)