"""Sentence tokenizers for review-style text.

Importing this module loads the spaCy ``en_core_web_sm`` pipeline and, if that
model is not installed, downloads it first (network access required).
"""

import importlib
import re

import nltk
import spacy

############################################
### CHANGE THIS LINE TO CHOOSE TOKENIZER ###
ORIGINAL_TOKENIZER = False
############################################

_SPACY_MODEL_NAME = "en_core_web_sm"

try:
    nlp = spacy.load(_SPACY_MODEL_NAME)
except OSError:
    # spaCy reports a missing model package as OSError. Catch only that, so
    # that Ctrl-C, SystemExit and unrelated failures are not silently turned
    # into a model download.
    import spacy.cli

    spacy.cli.download(_SPACY_MODEL_NAME)
    nlp = spacy.load(_SPACY_MODEL_NAME)

# Patterns used by glimpse_tokenizer, compiled once at import time.
# A run of two or more hyphens is treated as a separator line.
_DASH_RUN_RE = re.compile(r"[-]{2,}")
# One or more consecutive newlines delimit chunks.
_LINE_BREAK_RE = re.compile(r"\n+")
# Chunks starting with a review section keyword (case-insensitive).
_SECTION_HEADER_RE = re.compile(
    r"^(Summary|Strengths?|Weaknesses?|Minor)\s*:?", re.IGNORECASE
)
# Numbered items ("1.", "2.3.") or "-" bullets followed by some text.
_LIST_ITEM_RE = re.compile(r"^(\d+(\.\d+)*\.|-)\s+.+")


def glimpse_tokenizer(text: str) -> list:
    """
    Splits the input text into a list of sentence strings.

    When ORIGINAL_TOKENIZER is true, the work is delegated to
    original_tokenizer. Otherwise the text is cut into chunks on dash runs and
    line breaks; chunks that look like section headers or list items are kept
    whole, and every other chunk is split into sentences with spaCy.

    @param text: The input text to be tokenized
    @return: A list of non-empty, stripped sentences in document order
    """
    if ORIGINAL_TOKENIZER:
        return original_tokenizer(text)

    # General-purpose path that handles both natural paragraph text and
    # structured reviews.
    # Turn dash runs into line breaks so they act as chunk boundaries.
    text = _DASH_RUN_RE.sub("\n", text)

    sentences = []
    for raw_chunk in _LINE_BREAK_RE.split(text):
        chunk = raw_chunk.strip()
        if not chunk:
            continue

        # Section headers and bullets become single "sentences".
        if _SECTION_HEADER_RE.match(chunk) or _LIST_ITEM_RE.match(chunk):
            sentences.append(chunk)
            continue

        # Otherwise, apply spaCy sentence splitting and drop blank results.
        for sent in nlp(chunk).sents:
            stripped = sent.text.strip()
            if stripped:
                sentences.append(stripped)

    return sentences


# reuse the original glimpse tokenizer
# def glimpse_tokenizer(text: str) -> list:
#     return tokenize_sentences(text)


# Default glimpse tokenizer from the original code
def original_tokenizer(text: str) -> list:
    """
    Tokenizes the input text into sentences with NLTK.

    Each literal run of five hyphens is replaced by a newline before the text
    is handed to nltk.sent_tokenize.

    @param text: The input text to be tokenized
    @return: A list of tokenized sentences, with empty strings removed
    """
    text = text.replace('-----', '\n')
    # remove empty sentences
    return [sentence for sentence in nltk.sent_tokenize(text) if sentence]