miracl-search / preprocessor.py
thoshiths's picture
Upload preprocessor.py with huggingface_hub
53e8c10 verified
"""
preprocessor.py
Multilingual text preprocessing pipeline for the MIRACL corpus (EN, ES, FR, DE).
Handles tokenisation, stopword removal, and Snowball/Porter stemming per language.
Builds the inverted index with TF scores used by the TF-IDF search engine.
Author : Thoshith S
"""
import re
import unicodedata
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)
nltk.download("stopwords", quiet=True)
from nltk.corpus import stopwords as nltk_sw
from nltk.stem import PorterStemmer, SnowballStemmer
# One shared stemmer instance per supported language, created once at import
# time: Porter for English, Snowball for Spanish, French and German.
_en_stemmer = PorterStemmer()
_es_stemmer, _fr_stemmer, _de_stemmer = (
    SnowballStemmer(snowball_name)
    for snowball_name in ("spanish", "french", "german")
)
# ── Stopword lists ────────────────────────────────────────────────────────────
# Each set is the NLTK stopword list for the language plus a few extra
# high-frequency words. Tokens are lower-cased before the stopword check in
# tokenize_latin, so every entry here must be lower-case.
ENGLISH_STOPWORDS = set(nltk_sw.words("english")) | {
    "also", "one", "two", "three", "known", "used", "given",
    "however", "although", "including", "several", "many", "other",
    "first", "second", "new", "old", "made", "may", "like",
}
# The accented entries must be real UTF-8 text ("también", "así", "través");
# mis-encoded variants would never match a token and silently do nothing.
SPANISH_STOPWORDS = set(nltk_sw.words("spanish")) | {
    "también", "así", "sino", "aunque", "durante", "través",
}
FRENCH_STOPWORDS = set(nltk_sw.words("french")) | {
    "aussi", "ainsi", "dont", "lors", "jusqu", "comme",
}
GERMAN_STOPWORDS = set(nltk_sw.words("german")) | {
    "auch", "sowie", "jedoch", "dabei", "durch", "beim",
}
# Language code -> stopword set; looked up by tokenize_latin.
_STOPWORDS = dict(
    en=ENGLISH_STOPWORDS,
    es=SPANISH_STOPWORDS,
    fr=FRENCH_STOPWORDS,
    de=GERMAN_STOPWORDS,
)
# Language code -> stemmer instance. Its keys also define which language codes
# tokenize_for_lang accepts before falling back to "en".
_STEMMERS = dict(
    en=_en_stemmer,
    es=_es_stemmer,
    fr=_fr_stemmer,
    de=_de_stemmer,
)
# ── Regex helpers ─────────────────────────────────────────────────────────────
# URLs (with an http/https scheme, or starting with "www.").
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# Anything that looks like an HTML/XML tag.
_HTML_RE = re.compile(r"<[^>]+>")
# Standalone runs of digits.
_NUM_RE = re.compile(r"\b\d+\b")
# Matches every character that is NOT whitespace, an ASCII letter, or one of
# the accented letters / ligatures used by ES, FR and DE (plus ß).
# IGNORECASE makes the lower-case entries cover their upper-case forms too.
# The class must contain the real UTF-8 letters: a mis-encoded class strips
# genuine accented characters from the text.
_LATIN_RE = re.compile(r"[^a-zàáâäæçèéêëíîïñóôöœùúûüÿß\s]", re.IGNORECASE)
# ── Core tokenizer (Latin-script languages: EN, ES, FR, DE) ───────────────────
def tokenize_latin(text: str, lang: str = "en") -> list[str]:
    """
    Turn raw text into a list of stemmed index terms.

    Steps: NFKC-normalise and lower-case; replace URLs, HTML tags, standalone
    digit runs and every character matched by _LATIN_RE with a space; collapse
    whitespace; tokenise with nltk.word_tokenize; drop tokens shorter than two
    characters or present in the language's stopword set; stem the rest and
    keep only stems of at least two characters.

    A `lang` without an entry in _STOPWORDS / _STEMMERS falls back to the
    English stopword set / Porter stemmer respectively.
    """
    cleaned = unicodedata.normalize("NFKC", text).lower()
    # Order matters: URLs and tags must go before _LATIN_RE removes the
    # punctuation that lets them be recognised.
    for pattern in (_URL_RE, _HTML_RE, _NUM_RE, _LATIN_RE):
        cleaned = pattern.sub(" ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    words = nltk.word_tokenize(cleaned)
    stop_set = _STOPWORDS.get(lang, ENGLISH_STOPWORDS)
    stem = _STEMMERS.get(lang, _en_stemmer).stem

    # Stopwords are checked on the surface form, before stemming.
    stems = (
        stem(word)
        for word in words
        if len(word) >= 2 and word not in stop_set
    )
    return [s for s in stems if len(s) >= 2]
def tokenize_english(text: str) -> list[str]:
    """Tokenise `text` with the English stopword set and Porter stemmer."""
    return tokenize_latin(text, lang="en")
def tokenize_for_lang(text: str, lang: str) -> list[str]:
    """Tokenise `text` for `lang`, treating any code not in _STEMMERS as "en"."""
    if lang not in _STEMMERS:
        lang = "en"
    return tokenize_latin(text, lang)
# ── Document preprocessing ────────────────────────────────────────────────────
def preprocess_document(doc: dict) -> dict:
    """
    Tokenise one corpus document and return its indexable record.

    Reads "language" (default "en"), "text" (default "") and "url"
    (default "") from `doc`; "doc_id" and "title" are required and raise
    KeyError when missing. The returned dict carries the stemmed tokens next
    to the untouched raw text.
    """
    language = doc.get("language", "en")
    body = doc.get("text", "")
    # Tokenise before touching the required keys, as the original does.
    terms = tokenize_for_lang(body, language)
    return dict(
        doc_id=doc["doc_id"],
        title=doc["title"],
        tokens=terms,
        raw_text=body,
        language=language,
        url=doc.get("url", ""),
    )
def preprocess_corpus(corpus: list[dict]) -> list[dict]:
    """
    Run preprocess_document over every document in `corpus`, in order.

    Prints a progress line to stdout every 50 documents and after the last
    one. Progress lines end with a carriage return so each overwrites the
    previous one; a final newline-terminated summary is printed at the end.

    Returns the list of preprocessed documents (empty for an empty corpus).
    """
    print("Preprocessing corpus …")
    total = len(corpus)
    result = []
    for done, doc in enumerate(corpus, start=1):
        result.append(preprocess_document(doc))
        if done % 50 == 0 or done == total:
            print(f" {done}/{total} documents processed", end="\r")
    print(f" {len(result)}/{total} documents processed")
    return result
# ── Inverted index ────────────────────────────────────────────────────────────
def build_inverted_index(preprocessed_corpus: list[dict]) -> tuple[dict, dict, set]:
    """
    Build an inverted index from a preprocessed corpus.

    Each item of `preprocessed_corpus` must provide "doc_id" and "tokens".
    Prints the vocabulary size and total number of postings to stdout.

    Returns:
        inverted_index : {term: [(doc_id, raw_count), …]}, postings in
                         corpus order
        doc_lengths    : {doc_id: number_of_tokens}, clamped to a minimum
                         of 1 so a document with no tokens never has length 0
        vocab          : set of all unique terms (the keys of inverted_index)
    """
    print("Building inverted index …")
    inverted_index: dict[str, list[tuple[str, int]]] = {}
    doc_lengths: dict[str, int] = {}
    for pdoc in preprocessed_corpus:
        doc_id = pdoc["doc_id"]
        tokens = pdoc["tokens"]
        doc_lengths[doc_id] = max(len(tokens), 1)
        # Raw term frequencies for this document, in first-occurrence order.
        tf_counts: dict[str, int] = {}
        for term in tokens:
            tf_counts[term] = tf_counts.get(term, 0) + 1
        for term, count in tf_counts.items():
            inverted_index.setdefault(term, []).append((doc_id, count))
    # Every token ends up as an index key, so the vocabulary is the key set.
    vocab: set[str] = set(inverted_index)
    print(f" Vocabulary size : {len(vocab):,}")
    print(f" Index entries : {sum(len(v) for v in inverted_index.values()):,}")
    return inverted_index, doc_lengths, vocab
if __name__ == "__main__":
    # Smoke test: tokenise one sample sentence per supported language and
    # print the resulting stems. The sentences must be real UTF-8 text so the
    # accent handling (ó, á, ü) is actually exercised.
    for lang, text in [
        ("en", "The quick brown fox jumps over the lazy dog."),
        ("es", "El zorro marrón rápido salta sobre el perro perezoso."),
        ("fr", "Le renard brun rapide saute par-dessus le chien paresseux."),
        ("de", "Der schnelle braune Fuchs springt über den faulen Hund."),
    ]:
        tokens = tokenize_for_lang(text, lang)
        print(f"[{lang}] {tokens}")