# lang2logic / lingua / utils / segment.py
# rudaoshi's picture
# new schema
# 685c2c0
"""
convert a token gpgraph to word gpgraph
"""
import re
from lingua.structure.gpgraph import GPGraph, GPGAuxNode, GPGPhraseNode, GPGTextNode, TextGPGraph
from typing import List
def add_prefix_space(words: List[str], sentence: str) -> List[str]:
    """
    Add a space prefix to words if there is a space before them in the sentence.

    Args:
        words: List of words (without spacing information), which are tokenization of sentence
        sentence: The original sentence string
    Returns:
        List of words with space prefixes where appropriate
    Raises:
        ValueError: if a (stripped) word cannot be located in the sentence at or
            after the position where the previous word ended.
    """
    if not words or not sentence:
        return words
    result = []
    sent_idx = 0  # Current search position in sentence (end of the previous match)
    for word in words:
        # Strip any existing spaces from the word for matching
        word_stripped = word.strip()
        if not word_stripped:
            # Whitespace-only / empty token, keep as is
            result.append(word)
            continue
        word_start_pos = sentence.find(word_stripped, sent_idx)
        if word_start_pos == -1:
            raise ValueError(f"Word [{word_stripped}] not found in sentence [{sentence}] starting from position {sent_idx}.")
        # BUGFIX: condition must be `> 0`, not `> 1` — a word starting at index 1
        # can still be preceded by a space at index 0.
        if word_start_pos > 0 and sentence[word_start_pos - 1] == ' ':
            result.append(' ' + word_stripped)
        else:
            result.append(word_stripped)
        sent_idx = word_start_pos + len(word_stripped)
    return result
import string
import spacy
# Blank pipeline: no statistical model is loaded, only the English tokenizer is created.
nlp = spacy.blank("en") # do not load a model, only create the English tokenizer
def split_mixed_word(word: str) -> List[str]:
    """
    Split a word if it contains a mix of digits, characters, and punctuations.

    The word is cut into maximal runs of characters belonging to the same
    class (digit / letter / ASCII punctuation / other, e.g. unicode dashes
    and symbols). Homogeneous words are returned unchanged.

    Args:
        word: A word that may contain mixed content
    Returns:
        List of split parts if the word contains mixed content, otherwise [word]
    """
    if not word:
        return [word]

    def classify(ch: str) -> str:
        # Character class used to decide run boundaries.
        if ch.isdigit():
            return 'digit'
        if ch.isalpha():
            return 'letter'
        if ch in string.punctuation:
            return 'punct'
        # e.g. en-dash, em-dash, other unicode symbols not in string.punctuation
        return 'other'

    # A word made of a single character class never needs splitting.
    if len({classify(ch) for ch in word}) < 2:
        return [word]

    runs: List[str] = []
    run_start = 0
    for idx in range(1, len(word)):
        # Class change between adjacent characters => run boundary.
        if classify(word[idx]) != classify(word[idx - 1]):
            runs.append(word[run_start:idx])
            run_start = idx
    runs.append(word[run_start:])
    return runs if len(runs) > 1 else [word]
def segment(sentence: str) -> List[str]:
    """
    Segment a sentence into words.

    Pipeline:
      1. Tokenize the sentence with the module-level spaCy blank-English tokenizer.
      2. Split any token mixing digits, letters and punctuation into homogeneous runs.
      3. Re-attach a leading space to each word that is preceded by one in `sentence`.

    Args:
        sentence: The input sentence string
    Returns:
        List of words with prefix spaces where appropriate
    """
    # Step 1: tokenize with spaCy (tokenizer only — no model is loaded).
    tokens = [token.text for token in nlp.tokenizer(sentence)]
    # Step 2: break up tokens that contain mixed content (digits, chars, punctuations).
    pieces: List[str] = []
    for token in tokens:
        pieces.extend(split_mixed_word(token))
    # Step 3: restore spacing information relative to the original sentence.
    return add_prefix_space(pieces, sentence)