File size: 1,437 Bytes
dd6d0d3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
import re
import nltk
from nltk.corpus import stopwords
import Stemmer

# Make sure the NLTK stop word corpus is available locally.
# nltk.data.find() raises LookupError when the resource cannot be located,
# in which case the corpus is downloaded. This runs at import time.
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Module-level singletons used by preprocess():
# - stemmer: a Stemmer instance configured with the 'porter' algorithm.
# - stop_words: the English stop word list, held as a set because
#   preprocess() performs a membership test for every token.
stemmer = Stemmer.Stemmer('porter')
stop_words = set(stopwords.words('english'))

def preprocess(text: str) -> list[str]:
    r"""Normalize raw text into a list of stemmed tokens.

    Pipeline, applied in order:
    1. Case folding: ``.lower()``
    2. Punctuation removal: commas become spaces, then every character
       matching ``[^a-z0-9\s]`` is deleted
    3. Tokenization: split on runs of whitespace
    4. Stop word removal against the module-level ``stop_words`` set
    5. Stemming with the module-level ``stemmer``

    Args:
        text: The raw input string. A falsy value (``""`` or ``None``)
            short-circuits to an empty list.

    Returns:
        The stemmed tokens, in the order they appeared in ``text``.
    """
    if not text:
        return []

    # 1. Case folding
    text = text.lower()

    # 2. Remove punctuation (keeping whitespace for tokenization).
    # Commas are turned into spaces first so "word1,word2" yields two tokens
    # instead of being fused into "word1word2" by the deletion below.
    text = text.replace(',', ' ')
    # Every other character outside a-z, 0-9 and whitespace is deleted, not
    # replaced, so "state-of-the-art" becomes the single token "stateoftheart".
    text = re.sub(r'[^a-z0-9\s]', '', text)

    # 3. Tokenize on whitespace
    tokens = text.split()

    # 4. Stop word removal.
    # NOTE: matching happens after punctuation stripping, so a contraction
    # such as "don't" is compared against the stop list as "dont".
    tokens = [t for t in tokens if t not in stop_words]

    # 5. Stemming
    stemmed_tokens = stemmer.stemWords(tokens)

    return stemmed_tokens

def get_token_positions(tokens: list[str]) -> dict[str, list[int]]:
    """Build a positional index for a token list.

    Args:
        tokens: The token sequence to index.

    Returns:
        A dictionary keyed by each distinct token, whose value is the list of
        zero-based indices at which that token occurs in ``tokens``, in
        ascending order. Keys appear in order of first occurrence.
    """
    index: dict[str, list[int]] = {}
    for offset, term in enumerate(tokens):
        # setdefault creates the position list the first time a term is seen.
        index.setdefault(term, []).append(offset)
    return index