# lang2logic / lingua / utils / segment.py
# rudaoshi's picture
# new schema
# 685c2c0
"""
convert a token gpgraph to word gpgraph
"""
import re
from lingua.structure.gpgraph import GPGraph, GPGAuxNode, GPGPhraseNode, GPGTextNode, TextGPGraph
from typing import List
def add_prefix_space(words: List[str], sentence: str) -> List[str]:
    """
    Add a space prefix to words if there is a space before them in the sentence.

    Args:
        words: List of words (without spacing information), which are tokenization of sentence
        sentence: The original sentence string
    Returns:
        List of words with space prefixes where appropriate
    Raises:
        ValueError: if a (stripped) word cannot be located in the sentence at or
            after the position where the previous word ended.
    """
    if not words or not sentence:
        return words
    result = []
    sent_idx = 0  # Current search position in sentence (end of the previous match)
    for word in words:
        # Strip any existing spaces from the word for matching
        word_stripped = word.strip()
        if not word_stripped:
            # Whitespace-only / empty token, keep as is
            result.append(word)
            continue
        word_start_pos = sentence.find(word_stripped, sent_idx)
        if word_start_pos == -1:
            raise ValueError(f"Word [{word_stripped}] not found in sentence [{sentence}] starting from position {sent_idx}.")
        # BUGFIX: condition must be `> 0`, not `> 1` — a word starting at index 1
        # can still be preceded by a space at index 0.
        if word_start_pos > 0 and sentence[word_start_pos - 1] == ' ':
            result.append(' ' + word_stripped)
        else:
            result.append(word_stripped)
        sent_idx = word_start_pos + len(word_stripped)
    return result
import string
import spacy
# Blank pipeline: no statistical model is loaded, only the English tokenizer is created.
nlp = spacy.blank("en") # do not load a model, only create the English tokenizer
def split_mixed_word(word: str) -> List[str]:
    """
    Split a word if it contains a mix of digits, characters, and punctuations.

    The word is cut into maximal runs of characters belonging to the same
    class (digit / letter / ASCII punctuation / other, e.g. unicode dashes
    and symbols). Homogeneous words are returned unchanged.

    Args:
        word: A word that may contain mixed content
    Returns:
        List of split parts if the word contains mixed content, otherwise [word]
    """
    if not word:
        return [word]

    def classify(ch: str) -> str:
        # Character class used to decide run boundaries.
        if ch.isdigit():
            return 'digit'
        if ch.isalpha():
            return 'letter'
        if ch in string.punctuation:
            return 'punct'
        # e.g. en-dash, em-dash, other unicode symbols not in string.punctuation
        return 'other'

    # A word made of a single character class never needs splitting.
    if len({classify(ch) for ch in word}) < 2:
        return [word]

    runs: List[str] = []
    run_start = 0
    for idx in range(1, len(word)):
        # Class change between adjacent characters => run boundary.
        if classify(word[idx]) != classify(word[idx - 1]):
            runs.append(word[run_start:idx])
            run_start = idx
    runs.append(word[run_start:])
    return runs if len(runs) > 1 else [word]
def segment(sentence: str) -> List[str]:
    """
    Segment a sentence into words.

    Pipeline:
      1. Tokenize the sentence with the module-level spaCy blank-English tokenizer.
      2. Split any token mixing digits, letters and punctuation into homogeneous runs.
      3. Re-attach a leading space to each word that is preceded by one in `sentence`.

    Args:
        sentence: The input sentence string
    Returns:
        List of words with prefix spaces where appropriate
    """
    # Step 1: tokenize with spaCy (tokenizer only — no model is loaded).
    tokens = [token.text for token in nlp.tokenizer(sentence)]
    # Step 2: break up tokens that contain mixed content (digits, chars, punctuations).
    pieces: List[str] = []
    for token in tokens:
        pieces.extend(split_mixed_word(token))
    # Step 3: restore spacing information relative to the original sentence.
    return add_prefix_space(pieces, sentence)