ReView / dependencies /Glimpse_tokenizer.py
Sina1138
Remove dependencies on glimpse repo for interface.
92e7042
import re
import spacy
import importlib
import nltk
############################################
### CHANGE THIS LINE TO CHOOSE TOKENIZER ###
# True  -> glimpse_tokenizer delegates to original_tokenizer (NLTK sent_tokenize).
# False -> glimpse_tokenizer uses its own regex + spaCy sentence splitting.
ORIGINAL_TOKENIZER = False
############################################
# Load the small English spaCy pipeline once at import time; glimpse_tokenizer
# uses it for sentence splitting.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    # spacy.load raises OSError when the model package is not installed.
    # Only that case should trigger a download; any other failure (or a
    # KeyboardInterrupt) must propagate instead of being swallowed.
    import spacy.cli

    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")
# A line that opens with a review-section keyword as a whole word. The trailing
# \b stops ordinary prose such as "Strengthening ..." or "Minority ..." from
# being mistaken for a header and skipped by the sentence splitter.
_SECTION_HEADER_RE = re.compile(
    r"^(?:Summary|Strengths?|Weaknesses?|Minor)\b", re.IGNORECASE
)
# A numbered item ("1.", "2.3.") or a dash bullet, followed by some text.
_BULLET_RE = re.compile(r"^(\d+(\.\d+)*\.|-)\s+.+")


def glimpse_tokenizer(text: str) -> list:
    """
    Splits the input text into sentence-like units.

    When ORIGINAL_TOKENIZER is True the work is delegated to
    original_tokenizer. Otherwise the text is cut on runs of two or more
    hyphens and on line breaks; each non-empty line is then either kept whole
    (section headers and numbered/dash bullets) or split into sentences with
    the spaCy pipeline loaded at module level.

    @param text: The input text to be tokenized
    @return: A list of stripped, non-empty sentence strings in document order
    """
    if ORIGINAL_TOKENIZER:
        return original_tokenizer(text)

    # Treat dash rules ("--", "-----", ...) as line separators.
    text = re.sub(r"[-]{2,}", "\n", text)

    sentences = []
    for chunk in re.split(r"\n+", text):
        chunk = chunk.strip()
        if not chunk:
            continue
        # Section headers and bullets become single "sentences".
        if _SECTION_HEADER_RE.match(chunk) or _BULLET_RE.match(chunk):
            sentences.append(chunk)
            continue
        # Otherwise, apply spaCy sentence splitting and drop blank results.
        for sent in nlp(chunk).sents:
            sentence = sent.text.strip()
            if sentence:
                sentences.append(sentence)
    return sentences
# reuse the original glimpse tokenizer
# def glimpse_tokenizer(text: str) -> list:
# return tokenize_sentences(text)
# Default glimpse tokenizer from the original code
def original_tokenizer(text: str) -> list:
    """
    Splits the input text into sentences using NLTK.

    Every run of five hyphens is first replaced by a newline, then the result
    is passed to nltk.sent_tokenize; empty strings are dropped from the output.

    @param text: The input text to be tokenized
    @return: A list of non-empty sentences
    """
    normalized = text.replace('-----', '\n')
    return [piece for piece in nltk.sent_tokenize(normalized) if piece]