Spaces:
Sleeping
Sleeping
"""
preprocessor.py
Multilingual text preprocessing pipeline for the MIRACL corpus (EN, ES, FR, DE).
Handles tokenisation, stopword removal, and Snowball/Porter stemming per language.
Builds the inverted index of raw term-frequency (TF) counts used by the TF-IDF search engine.
Author : Thoshith S
"""
| import re | |
| import unicodedata | |
| import nltk | |
# Fetch the NLTK data packages this module uses (tokenizer models and the
# stopword corpus); quiet=True is passed to suppress downloader output.
for _nltk_pkg in ("punkt", "punkt_tab", "stopwords"):
    nltk.download(_nltk_pkg, quiet=True)
| from nltk.corpus import stopwords as nltk_sw | |
| from nltk.stem import PorterStemmer, SnowballStemmer | |
# One shared stemmer instance per supported language, created at import time:
# Porter for English, Snowball for Spanish, French and German.
_en_stemmer = PorterStemmer()
_es_stemmer, _fr_stemmer, _de_stemmer = (
    SnowballStemmer(snowball_lang)
    for snowball_lang in ("spanish", "french", "german")
)
# ── Stopword lists ──────────────────────────────────────────────────────────
# Each set is NLTK's stopword list for the language, extended with a handful
# of extra high-frequency words. The accented entries must be real Unicode
# letters (not mis-decoded bytes), otherwise they can never equal a token
# produced from NFKC-normalised, lowercased text.
ENGLISH_STOPWORDS = set(nltk_sw.words("english")) | {
    "also", "one", "two", "three", "known", "used", "given",
    "however", "although", "including", "several", "many", "other",
    "first", "second", "new", "old", "made", "may", "like",
}
SPANISH_STOPWORDS = set(nltk_sw.words("spanish")) | {
    "también", "así", "sino", "aunque", "durante", "través",
}
FRENCH_STOPWORDS = set(nltk_sw.words("french")) | {
    "aussi", "ainsi", "dont", "lors", "jusqu", "comme",
}
GERMAN_STOPWORDS = set(nltk_sw.words("german")) | {
    "auch", "sowie", "jedoch", "dabei", "durch", "beim",
}
# Lookup tables keyed by two-letter language code. The tokenizer reads both
# to pick the stopword set and the stemmer for a given language.
_STOPWORDS = dict(
    en=ENGLISH_STOPWORDS,
    es=SPANISH_STOPWORDS,
    fr=FRENCH_STOPWORDS,
    de=GERMAN_STOPWORDS,
)
_STEMMERS = dict(
    en=_en_stemmer,
    es=_es_stemmer,
    fr=_fr_stemmer,
    de=_de_stemmer,
)
# ── Regex helpers ───────────────────────────────────────────────────────────
# URLs starting with http://, https:// or www.
_URL_RE = re.compile(r"https?://\S+|www\.\S+")
# Anything that looks like an HTML/XML tag.
_HTML_RE = re.compile(r"<[^>]+>")
# Standalone runs of digits (digits glued to letters are left for _LATIN_RE).
_NUM_RE = re.compile(r"\b\d+\b")
# Matches every character that is NOT whitespace or an allowed letter, so
# substituting it away keeps only ASCII letters plus the accented letters of
# Spanish, French and German (including ß). IGNORECASE makes the upper-case
# forms of the listed letters allowed as well.
# NOTE(review): the character class was reconstructed from mis-encoded text;
# ä, ö and ß are included per the stated intent (DE coverage) - confirm
# against the upstream file.
_LATIN_RE = re.compile(
    r"[^a-z"
    r"áéíóúüñ"          # Spanish
    r"àâæçèêëîïôùûÿœ"   # French
    r"äöß"              # German
    r"\s]",
    re.IGNORECASE,
)
# ── Core tokenizer (Latin-script languages: EN, ES, FR, DE) ─────────────────
def tokenize_latin(text: str, lang: str = "en") -> list[str]:
    """Turn raw text into a list of stemmed, stopword-free tokens.

    Pipeline: NFKC-normalise and lowercase -> blank out URLs, HTML tags,
    standalone numbers and every character rejected by ``_LATIN_RE`` ->
    collapse whitespace -> ``nltk.word_tokenize`` -> drop tokens shorter than
    two characters or present in the stopword set -> stem -> keep stems of at
    least two characters.

    Args:
        text: Raw input text.
        lang: Language code used to look up the stopword set and stemmer;
            codes missing from the lookup tables fall back to English.

    Returns:
        The surviving stems, in their original order.
    """
    cleaned = unicodedata.normalize("NFKC", text).lower()
    # Each pattern replaces its matches with a single space; the order matters
    # because later patterns would otherwise break up URLs and tags.
    for pattern in (_URL_RE, _HTML_RE, _NUM_RE, _LATIN_RE):
        cleaned = pattern.sub(" ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    words = nltk.word_tokenize(cleaned)
    stopword_set = _STOPWORDS.get(lang, ENGLISH_STOPWORDS)
    stem = _STEMMERS.get(lang, _en_stemmer).stem

    stems = (
        stem(word)
        for word in words
        if len(word) >= 2 and word not in stopword_set
    )
    # Stemming can shrink a word below two characters; discard those too.
    return [s for s in stems if len(s) >= 2]
def tokenize_english(text: str) -> list[str]:
    """Tokenize *text* using the English stopword set and stemmer."""
    return tokenize_latin(text, lang="en")
def tokenize_for_lang(text: str, lang: str) -> list[str]:
    """Tokenize *text* for *lang*, treating codes without a stemmer as English."""
    if lang not in _STEMMERS:
        lang = "en"
    return tokenize_latin(text, lang)
# ── Document preprocessing ──────────────────────────────────────────────────
def preprocess_document(doc: dict) -> dict:
    """Build the preprocessed record for a single document.

    Args:
        doc: Mapping that must contain ``"doc_id"`` and ``"title"``.
            ``"language"`` defaults to ``"en"``, and ``"text"`` and ``"url"``
            default to the empty string.

    Returns:
        A new dict with the keys ``doc_id``, ``title``, ``tokens``,
        ``raw_text``, ``language`` and ``url``.

    Raises:
        KeyError: If ``"doc_id"`` or ``"title"`` is missing from *doc*.
    """
    language = doc.get("language", "en")
    raw_text = doc.get("text", "")
    stems = tokenize_for_lang(raw_text, language)
    return dict(
        doc_id=doc["doc_id"],
        title=doc["title"],
        tokens=stems,
        raw_text=raw_text,
        language=language,
        url=doc.get("url", ""),
    )
def preprocess_corpus(corpus: list[dict]) -> list[dict]:
    """Preprocess every document in *corpus*, printing progress to stdout.

    Args:
        corpus: Raw document dicts accepted by ``preprocess_document``.

    Returns:
        The preprocessed records, in the same order as *corpus*.
    """
    print("Preprocessing corpus …")
    total = len(corpus)
    result = []
    for done, doc in enumerate(corpus, start=1):
        result.append(preprocess_document(doc))
        # Refresh the in-place progress line every 50 documents and at the end;
        # the carriage return lets the next print overwrite it.
        if done % 50 == 0 or done == total:
            print(f" {done}/{total} documents processed", end="\r")
    print(f" {len(result)}/{total} documents processed")
    return result
# ── Inverted index ──────────────────────────────────────────────────────────
def build_inverted_index(preprocessed_corpus: list[dict]) -> tuple[dict, dict, set]:
    """Build an inverted index from a preprocessed corpus.

    Args:
        preprocessed_corpus: Records that each provide a ``"doc_id"`` and a
            ``"tokens"`` list.

    Returns:
        A ``(inverted_index, doc_lengths, vocab)`` tuple where

        * ``inverted_index`` is ``{term: [(doc_id, raw_count), ...]}`` with
          postings in corpus order,
        * ``doc_lengths`` is ``{doc_id: number_of_tokens}``, clamped to a
          minimum of 1 so a document without tokens never gets length 0
          (presumably so callers can divide by it safely),
        * ``vocab`` is the set of all unique terms.

    Raises:
        KeyError: If a record lacks ``"doc_id"`` or ``"tokens"``.
    """
    print("Building inverted index …")
    inverted_index: dict[str, list[tuple[str, int]]] = {}
    doc_lengths: dict[str, int] = {}
    for pdoc in preprocessed_corpus:
        doc_id = pdoc["doc_id"]
        tokens = pdoc["tokens"]
        doc_lengths[doc_id] = max(len(tokens), 1)
        # Raw term counts for this document only.
        tf_counts: dict[str, int] = {}
        for token in tokens:
            tf_counts[token] = tf_counts.get(token, 0) + 1
        for term, count in tf_counts.items():
            inverted_index.setdefault(term, []).append((doc_id, count))
    # Every token seen above became an index key, so the keys are the vocabulary.
    vocab: set[str] = set(inverted_index)
    print(f" Vocabulary size : {len(vocab):,}")
    print(f" Index entries : {sum(len(v) for v in inverted_index.values()):,}")
    return inverted_index, doc_lengths, vocab
if __name__ == "__main__":
    # Smoke test: tokenize one sample sentence per supported language and
    # print the resulting stems.
    for lang, text in [
        ("en", "The quick brown fox jumps over the lazy dog."),
        ("es", "El zorro marrón rápido salta sobre el perro perezoso."),
        ("fr", "Le renard brun rapide saute par-dessus le chien paresseux."),
        ("de", "Der schnelle braune Fuchs springt über den faulen Hund."),
    ]:
        tokens = tokenize_for_lang(text, lang)
        print(f"[{lang}] {tokens}")