File size: 2,256 Bytes
92e7042
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
import re
import spacy
import importlib
import nltk

############################################
### CHANGE THIS LINE TO CHOOSE TOKENIZER ###
# True  -> glimpse_tokenizer() delegates to original_tokenizer() (NLTK sent_tokenize).
# False -> glimpse_tokenizer() uses its own line/section-aware splitting backed by spaCy.
ORIGINAL_TOKENIZER = False
############################################

# Load the small English spaCy pipeline used by glimpse_tokenizer for sentence
# splitting. spacy.load() raises OSError when the model cannot be found, so
# only that error triggers a one-time download followed by a retry. Catching
# OSError specifically (instead of a bare ``except``) keeps KeyboardInterrupt,
# SystemExit and unrelated failures from being masked by a download attempt.
# The earlier importlib.util.find_spec() call was dropped: it returns None for
# a missing package rather than raising, and its result was never used.
try:
    nlp = spacy.load("en_core_web_sm")
except OSError:
    import spacy.cli
    spacy.cli.download("en_core_web_sm")
    nlp = spacy.load("en_core_web_sm")

def glimpse_tokenizer(text: str) -> list:
    """
    Splits the input text into a list of sentence-like strings.

    When ORIGINAL_TOKENIZER is truthy the call is forwarded unchanged to
    original_tokenizer. Otherwise the text is cut into lines (runs of two or
    more hyphens count as a line break) and each non-empty line is either kept
    whole or split into sentences with the spaCy pipeline ``nlp``.

    A line is kept whole when it begins with one of the words Summary,
    Strength(s), Weakness(es) or Minor (case-insensitive, colon optional), or
    when it looks like a list item ("1.", "2.3." or "-" followed by text).

    @param text: The input text to be tokenized
    @return: A list of stripped, non-empty sentence strings in input order
    """
    # Guard clause: legacy behaviour requested via the module-level switch
    if ORIGINAL_TOKENIZER:
        return original_tokenizer(text)

    section_header = r"^(Summary|Strengths?|Weaknesses?|Minor)\s*:?"
    list_item = r"^(\d+(\.\d+)*\.|-)\s+.+"

    # Treat dash rules ("--", "-----", ...) exactly like a line break
    normalized = re.sub(r"[-]{2,}", "\n", text)

    result = []
    for raw_line in re.split(r"\n+", normalized):
        line = raw_line.strip()
        if not line:
            # Blank or whitespace-only line: nothing to emit
            continue

        # Headers and list items are emitted as a single unit, unsplit
        if re.match(section_header, line, re.IGNORECASE) or re.match(list_item, line):
            result.append(line)
            continue

        # Free-running text: let spaCy find the sentence boundaries
        for span in nlp(line).sents:
            sentence = span.text.strip()
            if sentence:
                result.append(sentence)

    return result
    
# reuse the original glimpse tokenizer
# def glimpse_tokenizer(text: str) -> list:    
#     return tokenize_sentences(text)

# Default glimpse tokenizer from the original code
def original_tokenizer(text: str) -> list:
    """
    Splits the input text into sentences with NLTK.

    Every occurrence of exactly five hyphens ('-----') is replaced by a
    newline before the text is handed to nltk.sent_tokenize; empty strings
    are dropped from the result.

    @param text: The input text to be tokenized
    @return: A list of tokenized sentences
    """
    prepared = text.replace('-----', '\n')

    # Keep only non-empty sentences
    return [piece for piece in nltk.sent_tokenize(prepared) if piece]