Spaces:

sae8d
/

not-google

Running

not-google / indexer /preprocess.py

Upload 35 files

dd6d0d3 verified about 1 month ago

1.44 kB

	import re
	import nltk
	from nltk.corpus import stopwords
	import Stemmer

	# Make sure the NLTK stop-word corpus is available. The lookup is tried
	# first and the download only runs when that lookup raises LookupError,
	# so an already-installed corpus is not fetched again.
	try:
	    nltk.data.find('corpora/stopwords')
	except LookupError:
	    nltk.download('stopwords')

	# Module-level objects built once at import time and used by preprocess():
	# a Porter stemmer and the English stop words held in a set for fast
	# membership tests.
	stemmer = Stemmer.Stemmer('porter')
	stop_words = set(stopwords.words('english'))

	def preprocess(text: str) -> list[str]:
	    r"""Turn raw text into a list of stemmed terms.

	    Pipeline, applied in this order:
	      1. Case folding with ``str.lower()``.
	      2. Punctuation removal: commas become spaces (so "word1,word2"
	         yields two tokens), then every character outside ``[a-z0-9\s]``
	         is deleted. Deleted characters are not replaced by a space, so
	         "state-of-the-art" becomes the single token "stateoftheart", and
	         non-ASCII letters are dropped as well.
	      3. Tokenization with ``str.split()`` (any run of whitespace).
	      4. Stop-word removal against the module-level ``stop_words`` set.
	      5. Stemming with the module-level ``stemmer``.

	    Args:
	        text: Text to process. Any falsy value (``""`` or ``None``)
	            produces an empty list.

	    Returns:
	        The result of ``stemmer.stemWords`` on the tokens that survive
	        stop-word removal, or ``[]`` when no token survives.
	    """
	    if not text:
	        return []

	    # 1. Case folding
	    text = text.lower()

	    # 2. Remove punctuation (keeping whitespace for tokenization).
	    # Commas are turned into spaces first so "word1,word2" does not fuse
	    # into one token when the remaining punctuation is deleted.
	    text = text.replace(',', ' ')
	    text = re.sub(r'[^a-z0-9\s]', '', text)

	    # 3. Tokenize
	    tokens = text.split()

	    # 4. Stop word removal
	    # NOTE(review): apostrophes were deleted in step 2, so a contraction
	    # such as "don't" reaches this check as "dont". Any stop_words entry
	    # that itself contains an apostrophe can therefore never match here.
	    # Confirm against the stop-word list whether that matters.
	    tokens = [t for t in tokens if t not in stop_words]

	    # Nothing left to stem (input was only punctuation, whitespace or
	    # stop words), so skip the stemmer call.
	    if not tokens:
	        return []

	    # 5. Stemming
	    return stemmer.stemWords(tokens)

	def get_token_positions(tokens: list[str]) -> dict[str, list[int]]:
	    """Map each distinct token to the indices at which it occurs.

	    Args:
	        tokens: Token list to index; positions are 0-based offsets into it.

	    Returns:
	        A plain ``dict`` whose keys are the unique tokens, in order of
	        first appearance, and whose values are the ascending positions of
	        that token in ``tokens``. An empty input gives ``{}``.
	    """
	    positions: dict[str, list[int]] = {}
	    for index, token in enumerate(tokens):
	        # setdefault creates the list on first sight of a token and returns
	        # the existing one afterwards, so a single lookup handles both cases.
	        positions.setdefault(token, []).append(index)
	    return positions