File size: 2,256 Bytes
92e7042 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 | import re
import spacy
import importlib
import nltk
############################################
### CHANGE THIS LINE TO CHOOSE TOKENIZER ###
# True  -> glimpse_tokenizer delegates to original_tokenizer (nltk.sent_tokenize).
# False -> glimpse_tokenizer uses its own regex chunking plus spaCy sentence splitting.
ORIGINAL_TOKENIZER = False
############################################
# Load the small English spaCy pipeline once at import time, downloading it
# the first time it is missing.
# spacy.load raises OSError when the model package is not installed, so that
# is the only failure that should trigger a download. The previous
# importlib.util.find_spec probe returned None for a missing package instead
# of raising, so it never influenced the control flow and has been removed.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli

    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# A chunk that opens with a review section keyword is kept as one sentence.
# "Weakness(?:es)?" accepts both singular and plural (the former "Weaknesses?"
# missed the singular), and the trailing \b stops ordinary words such as
# "Minority" or "Strengthening" from being mistaken for a header.
_SECTION_HEADER_RE = re.compile(
    r"^(?:Summary|Strengths?|Weakness(?:es)?|Minor)\b", re.IGNORECASE
)
# Numbered ("1.", "2.3.") or dash bullets followed by text are kept whole.
_BULLET_RE = re.compile(r"^(\d+(\.\d+)*\.|-)\s+.+")


def glimpse_tokenizer(text: str) -> list:
    """
    Split text into sentences, handling both paragraphs and structured reviews.

    When ORIGINAL_TOKENIZER is true the call is forwarded to
    original_tokenizer. Otherwise runs of two or more dashes are turned into
    line breaks, the text is split on line breaks, and each non-empty chunk is
    either kept whole (section header or bullet) or split into sentences by
    the module-level spaCy pipeline.

    @param text: The input text to be tokenized
    @return: A list of non-empty, stripped sentences in document order
    """
    if ORIGINAL_TOKENIZER:
        return original_tokenizer(text)

    # Dash separators ("--", "-----", ...) act as chunk boundaries.
    text = re.sub(r"[-]{2,}", "\n", text)

    sentences = []
    for chunk in re.split(r"\n+", text):
        chunk = chunk.strip()
        if not chunk:
            continue

        # Headers and bullets are emitted as a single "sentence" each.
        if _SECTION_HEADER_RE.match(chunk) or _BULLET_RE.match(chunk):
            sentences.append(chunk)
            continue

        # Anything else is free text: let spaCy find the sentence boundaries.
        doc = nlp(chunk)
        stripped = (sent.text.strip() for sent in doc.sents)
        sentences.extend(piece for piece in stripped if piece)
    return sentences
# reuse the original glimpse tokenizer
# def glimpse_tokenizer(text: str) -> list:
# return tokenize_sentences(text)
# Default glimpse tokenizer from the original code
def original_tokenizer(text: str) -> list:
    """
    Split the input text into sentences with NLTK.

    @param text: The text to split; each occurrence of '-----' is replaced
                 by a line break before tokenizing
    @return: The sentences returned by nltk.sent_tokenize, minus empty strings
    """
    normalized = text.replace('-----', '\n')
    # Filter the tokenizer output so no empty string is returned.
    return [piece for piece in nltk.sent_tokenize(normalized) if piece != ""]