Spaces:
Running
Running
| """Dataset management module for NER annotation.""" | |
| from typing import List, Dict, Union, Tuple | |
| import json | |
| import os | |
| import re | |
class DynamicDataset:
    """Manage and navigate through annotated NER dataset examples.

    Keeps a cursor (``self.current``) into ``self.data``; every navigation
    move is clamped to the valid index range [0, data_len - 1].
    """

    def __init__(
        self, data: List[Dict[str, Union[List[Union[int, str]], bool]]]
    ) -> None:
        """Initialize the dataset with examples.

        Args:
            data: List of examples, each containing tokenized text and NER
                annotations.
        """
        self.data = data
        self.data_len = len(self.data)
        # Cursor starts before the first example so the first call to
        # next_example() lands on index 0.
        self.current = -1
        for example in self.data:
            # Ensure every example carries a validation flag without
            # overwriting one that is already present.
            example.setdefault("validated", False)

    def _clamp(self, index: int) -> int:
        """Return *index* clamped into the valid range [0, data_len - 1]."""
        return max(0, min(index, self.data_len - 1))

    def next_example(self) -> None:
        """Move to the next example, stopping at the last one."""
        self.current = self._clamp(self.current + 1)

    def previous_example(self) -> None:
        """Move to the previous example, stopping at the first one."""
        self.current = self._clamp(self.current - 1)

    def example_by_id(self, id: int) -> None:
        """Navigate to a specific example by its index.

        Args:
            id: The index of the example to navigate to; out-of-range
                values are clamped.
        """
        # NOTE: the parameter name shadows the ``id`` builtin but is kept
        # for backward compatibility with keyword-argument callers.
        self.current = self._clamp(id)

    def validate(self) -> None:
        """Mark the current example as validated."""
        self.data[self.current]["validated"] = True

    def load_current_example(self) -> Dict:
        """Return the example at the current cursor position."""
        return self.data[self.current]
def tokenize_text(text: str) -> List[str]:
    """Split *text* into word and punctuation tokens.

    Words may contain internal hyphens or underscores (so
    "state-of-the-art" stays one token); every other non-space character
    becomes its own single-character token.

    Args:
        text: The input text to tokenize.

    Returns:
        List of tokens.
    """
    token_pattern = re.compile(r"\w+(?:[-_]\w+)*|\S")
    return token_pattern.findall(text)
def join_tokens(tokens: List[str]) -> str:
    """Join tokens into a sentence with natural spacing.

    A space separates consecutive tokens, except that closing punctuation
    is attached directly to the preceding token.

    Args:
        tokens: List of tokens to join.

    Returns:
        Joined text string.
    """
    no_space_before = {",", ".", "!", "?", ":", ";", "..."}
    result = ""
    for piece in tokens:
        if piece in no_space_before:
            # Glue punctuation onto the previous token.
            result = result.rstrip() + piece
        else:
            result = f"{result} {piece}"
    return result.strip()
def prepare_for_highlight(data: Dict) -> List[Tuple[str, str]]:
    """Prepare text for highlighting with NER annotations.

    Args:
        data: Dictionary with "tokenized_text" (list of tokens) and "ner"
            annotations. Each NER entry is indexed as [start, end, label]
            with an inclusive end index (see the ``entity[0] <= idx <=
            entity[1]`` check below) — TODO confirm against the annotation
            format used by callers.

    Returns:
        List of (text_segment, label) tuples, in token order; ``label`` is
        None for segments outside any entity.
    """
    tokens = data["tokenized_text"]
    ner = data["ner"]
    highlighted_text = []
    current_entity = None  # the [start, end, label] span covering idx, if any
    entity_tokens = []  # tokens accumulated for the active entity segment
    normal_tokens = []  # tokens accumulated outside any entity
    for idx, token in enumerate(tokens):
        # Past the end of the active entity (or no entity yet): flush the
        # finished entity segment, then look for an entity starting at idx.
        if current_entity is None or idx > current_entity[1]:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            current_entity = next((entity for entity in ner if entity[0] == idx), None)
        if current_entity and current_entity[0] <= idx <= current_entity[1]:
            # Entering/continuing an entity: flush any pending plain text
            # first so segment order is preserved.
            if normal_tokens:
                highlighted_text.append((" ".join(normal_tokens), None))
                normal_tokens = []
            # NOTE(review): each token carries a trailing space and the list
            # is later joined with " ", yielding double spaces between
            # tokens — presumably intentional for the highlight widget;
            # confirm before changing.
            entity_tokens.append(token + " ")
        else:
            if entity_tokens:
                highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
                entity_tokens = []
            normal_tokens.append(token + " ")
    # Flush whichever segment is still open after the last token.
    if entity_tokens:
        highlighted_text.append((" ".join(entity_tokens), current_entity[2]))
    if normal_tokens:
        highlighted_text.append((" ".join(normal_tokens), None))
    # Strip the space left in front of punctuation by the joining above.
    cleaned_highlighted_text = []
    for text, label in highlighted_text:
        cleaned_text = re.sub(r'\s(?=[,\.!?…:;])', '', text)
        cleaned_highlighted_text.append((cleaned_text, label))
    return cleaned_highlighted_text
def save_dataset(data: List[Dict], filepath: str) -> None:
    """Save the dataset to a JSON file.

    Args:
        data: The dataset to save.
        filepath: Path to save the dataset; parent directories are created
            as needed.
    """
    parent = os.path.dirname(filepath)
    # os.makedirs("") raises FileNotFoundError, so only create the parent
    # when the path actually has a directory component.
    if parent:
        os.makedirs(parent, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits raw non-ASCII characters,
    # which would fail or corrupt on platforms with a non-UTF-8 default.
    with open(filepath, "wt", encoding="utf-8") as file:
        json.dump(data, file, ensure_ascii=False)
def load_dataset(filepath: str) -> List[Dict]:
    """Load a dataset from a JSON file.

    Args:
        filepath: Path to the dataset file.

    Returns:
        The loaded dataset.

    Raises:
        FileNotFoundError: If *filepath* does not exist.
        json.JSONDecodeError: If the file is not valid JSON.
    """
    # Explicit UTF-8 so files written with ensure_ascii=False (non-ASCII
    # bytes) decode correctly regardless of the platform default encoding.
    with open(filepath, "rt", encoding="utf-8") as file:
        return json.load(file)