# Spaces: Running
# File size: 1,437 Bytes
# Revision: dd6d0d3
import re
import nltk
from nltk.corpus import stopwords
import Stemmer
# Make sure the NLTK stop-word corpus is available locally. The lookup is
# attempted first so the download only happens when the resource is missing
# (signalled by LookupError).
try:
    nltk.data.find('corpora/stopwords')
except LookupError:
    nltk.download('stopwords')

# Module-level objects reused by every preprocess() call:
# - stemmer: a Porter stemmer from the Stemmer package.
# - stop_words: the English stop-word list, held as a set so the per-token
#   membership test in preprocess() is a hash lookup.
stemmer = Stemmer.Stemmer('porter')
stop_words = set(stopwords.words('english'))
def preprocess(text: str) -> list[str]:
    """Normalize raw text into a list of stemmed tokens.

    Pipeline, applied in this order:

    1. Case folding with ``str.lower``.
    2. Punctuation handling: commas become spaces (so "word1,word2" yields
       two tokens); every other character that is not a lowercase ASCII
       letter, a digit, or whitespace is deleted outright.
    3. Tokenization on runs of whitespace.
    4. Removal of tokens found in the module-level ``stop_words`` set.
    5. Stemming with the module-level ``stemmer``.

    Args:
        text: The raw input string. Any falsy value (e.g. ``""`` or
            ``None``) is accepted and short-circuits the pipeline.

    Returns:
        The stemmed tokens in their original order, or an empty list when
        ``text`` is falsy.
    """
    if not text:
        return []

    # 1. Case folding.
    lowered = text.lower()

    # 2. Commas are the only punctuation treated as a separator; everything
    #    else outside [a-z0-9] and whitespace is removed without leaving a
    #    gap, so "a.b" becomes "ab" and non-ASCII letters are dropped.
    spaced = lowered.replace(',', ' ')
    cleaned = re.sub(r'[^a-z0-9\s]', '', spaced)

    # 3. + 4. Tokenize on whitespace and drop stop words.
    # NOTE(review): apostrophes are already stripped at this point, so a
    # contraction such as "don't" is compared against the stop list as
    # "dont" -- confirm this is the intended matching.
    tokens = [tok for tok in cleaned.split() if tok not in stop_words]

    # 5. Stemming.
    return stemmer.stemWords(tokens)
def get_token_positions(tokens: list[str]) -> dict[str, list[int]]:
    """Map each distinct token to the indices at which it occurs.

    Args:
        tokens: The token sequence to index.

    Returns:
        A dictionary whose keys are the unique tokens, in order of first
        appearance, and whose values are the ascending 0-based positions of
        that token within ``tokens``. An empty input yields an empty dict.
    """
    positions: dict[str, list[int]] = {}
    for index, token in enumerate(tokens):
        # setdefault creates the list on first sight of a token and returns
        # the existing list afterwards.
        positions.setdefault(token, []).append(index)
    return positions
|