Spaces:

thoshiths
/

miracl-search

Sleeping

App Files Files Community

miracl-search / preprocessor.py

thoshiths

Upload preprocessor.py with huggingface_hub

53e8c10 verified about 1 month ago

raw

history blame contribute delete

6.16 kB

	"""
	preprocessor.py
	Multilingual text preprocessing pipeline for the MIRACL corpus (EN, ES, FR, DE).
	Handles tokenisation, stopword removal, and Snowball/Porter stemming per language.
	Builds the inverted index of raw per-document term counts used by the TF-IDF search engine.

	Author : Thoshith S
	"""

	import re
	import unicodedata
	import nltk

	# Fetch the NLTK data packages at import time; quiet=True is passed to every
	# call. "stopwords" backs the stopword sets built below; "punkt"/"punkt_tab"
	# are presumably the models behind nltk.word_tokenize — confirm against the
	# installed NLTK version.
	nltk.download("punkt", quiet=True)
	nltk.download("punkt_tab", quiet=True)
	nltk.download("stopwords", quiet=True)

	# NOTE(review): these imports sit after the download calls; it looks
	# deliberate (data first, then the modules that read it) — confirm before
	# moving them to the top of the file.
	from nltk.corpus import stopwords as nltk_sw
	from nltk.stem import PorterStemmer, SnowballStemmer

	# One stemmer instance per supported language, created once at import time.
	# English uses Porter; Spanish, French and German use Snowball.
	_en_stemmer = PorterStemmer()
	_es_stemmer = SnowballStemmer("spanish")
	_fr_stemmer = SnowballStemmer("french")
	_de_stemmer = SnowballStemmer("german")

	# ── Stopword lists ─────────────────────────────────────────────────────────────
	# Each set is the NLTK stopword list for the language, extended with a few
	# extra high-frequency words. Entries are lowercase because tokens are
	# lowercased before they are compared against these sets.
	ENGLISH_STOPWORDS = set(nltk_sw.words("english")) | {
	    "also", "one", "two", "three", "known", "used", "given",
	    "however", "although", "including", "several", "many", "other",
	    "first", "second", "new", "old", "made", "may", "like",
	}

	SPANISH_STOPWORDS = set(nltk_sw.words("spanish")) | {
	    "también", "así", "sino", "aunque", "durante", "través",
	}

	FRENCH_STOPWORDS = set(nltk_sw.words("french")) | {
	    "aussi", "ainsi", "dont", "lors", "jusqu", "comme",
	}

	GERMAN_STOPWORDS = set(nltk_sw.words("german")) | {
	    "auch", "sowie", "jedoch", "dabei", "durch", "beim",
	}

	# Language code -> stopword set consulted during tokenisation.
	_STOPWORDS = dict(
	    en=ENGLISH_STOPWORDS,
	    es=SPANISH_STOPWORDS,
	    fr=FRENCH_STOPWORDS,
	    de=GERMAN_STOPWORDS,
	)

	# Language code -> stemmer instance; its keys are also the set of language
	# codes that tokenize_for_lang treats as supported.
	_STEMMERS = dict(
	    en=_en_stemmer,
	    es=_es_stemmer,
	    fr=_fr_stemmer,
	    de=_de_stemmer,
	)


	# ── Regex helpers ──────────────────────────────────────────────────────────────
	_URL_RE = re.compile(r"https?://\S+\|www\.\S+")
	_HTML_RE = re.compile(r"<[^>]+>")
	_NUM_RE = re.compile(r"\b\d+\b")
	# Allow letters from Latin + Latin Extended (covers ES/FR/DE accents + ß)
	_LATIN_RE = re.compile(r"[^a-záéíóúüñàâæçèêëîïôùûüÿœæœÄÖÜß\s]", re.IGNORECASE)


	# ── Core tokenizer (Latin-script languages: EN, ES, FR, DE) ───────────────────

	def tokenize_latin(text: str, lang: str = "en") -> list[str]:
	    """
	    Turn raw text into a list of stemmed, stopword-free tokens.

	    Pipeline: NFKC-normalise and lowercase → strip HTML tags, URLs, numbers
	    and non-whitelisted characters → NLTK word tokenisation → drop stopwords
	    and very short tokens → stem.

	    Args:
	        text: Raw input text.
	        lang: Language code ("en", "es", "fr", "de"). Any other value falls
	            back to the English stopword set and the English stemmer.

	    Returns:
	        The stems, in document order. Tokens shorter than two characters,
	        stopwords, and stems shorter than two characters are discarded.
	    """
	    text = unicodedata.normalize("NFKC", text).lower()
	    # Strip tags BEFORE URLs: the URL pattern consumes everything up to the
	    # next whitespace, so on '<a href="http://x.com">label</a>' it would
	    # otherwise swallow the closing quote, the '>' and the link label,
	    # leaving a broken tag fragment behind.
	    text = _HTML_RE.sub(" ", text)
	    text = _URL_RE.sub(" ", text)
	    text = _NUM_RE.sub(" ", text)
	    text = _LATIN_RE.sub(" ", text)
	    text = re.sub(r"\s+", " ", text).strip()

	    tokens = nltk.word_tokenize(text)
	    stops = _STOPWORDS.get(lang, ENGLISH_STOPWORDS)
	    stemmer = _STEMMERS.get(lang, _en_stemmer)

	    result = []
	    for t in tokens:
	        # Stopwords are matched on the surface form, i.e. before stemming.
	        if len(t) < 2 or t in stops:
	            continue
	        stemmed = stemmer.stem(t)
	        if len(stemmed) >= 2:
	            result.append(stemmed)
	    return result


	def tokenize_english(text: str) -> list[str]:
	    """Tokenize *text* with the English settings of tokenize_latin."""
	    return tokenize_latin(text, lang="en")


	def tokenize_for_lang(text: str, lang: str) -> list[str]:
	    """Tokenize *text* for *lang*, treating unsupported codes as English."""
	    # A code counts as supported when a stemmer is registered for it.
	    if lang not in _STEMMERS:
	        lang = "en"
	    return tokenize_latin(text, lang)


	# ── Document preprocessing ─────────────────────────────────────────────────────

	def preprocess_document(doc: dict) -> dict:
	    """
	    Tokenize one corpus document and return its preprocessed record.

	    Args:
	        doc: Mapping with a required "doc_id" and optional "title", "text",
	            "language" and "url" entries. A missing or None "text" is treated
	            as empty; a missing or None "language" is treated as "en".

	    Returns:
	        A dict with keys "doc_id", "title", "tokens", "raw_text", "language"
	        and "url".

	    Raises:
	        KeyError: If "doc_id" is missing.
	    """
	    # `or` (rather than a .get default) also covers keys that are present
	    # but explicitly set to None, which would otherwise crash normalisation.
	    lang = doc.get("language") or "en"
	    text = doc.get("text") or ""
	    return {
	        "doc_id": doc["doc_id"],
	        # Optional, like "url": an untitled document should not abort the run.
	        "title": doc.get("title", ""),
	        "tokens": tokenize_for_lang(text, lang),
	        "raw_text": text,
	        "language": lang,
	        "url": doc.get("url", ""),
	    }


	def preprocess_corpus(corpus: list[dict]) -> list[dict]:
	    """
	    Run preprocess_document over every document, printing progress.

	    Progress is rewritten in place (carriage return) every 50 documents and
	    at the last one; a final summary line ends with a newline.

	    Returns:
	        The preprocessed records, in the same order as *corpus*.
	    """
	    print("Preprocessing corpus …")
	    total = len(corpus)
	    processed = []
	    for done, doc in enumerate(corpus, start=1):
	        processed.append(preprocess_document(doc))
	        at_checkpoint = done % 50 == 0
	        if at_checkpoint or done == total:
	            print(f" {done}/{total} documents processed", end="\r")
	    print(f" {len(processed)}/{total} documents processed")
	    return processed


	# ── Inverted index ─────────────────────────────────────────────────────────────

	def build_inverted_index(preprocessed_corpus: list[dict]) -> tuple[dict, dict, set]:
	    """
	    Build the term → postings index for a preprocessed corpus.

	    Each entry of *preprocessed_corpus* must provide "doc_id" and "tokens".

	    Returns:
	        inverted_index : {term: [(doc_id, raw_count), …]}, postings in
	                         corpus order
	        doc_lengths    : {doc_id: token count, floored at 1}
	        vocab          : set of all unique terms
	    """
	    print("Building inverted index …")
	    postings: dict[str, list[tuple[str, int]]] = {}
	    lengths: dict[str, int] = {}

	    for entry in preprocessed_corpus:
	        doc_id, doc_tokens = entry["doc_id"], entry["tokens"]
	        # An empty document is recorded with length 1 rather than 0.
	        lengths[doc_id] = len(doc_tokens) or 1

	        # Count occurrences of each term within this document.
	        per_doc: dict[str, int] = {}
	        for tok in doc_tokens:
	            per_doc[tok] = per_doc.get(tok, 0) + 1

	        for term, freq in per_doc.items():
	            if term not in postings:
	                postings[term] = []
	            postings[term].append((doc_id, freq))

	    # Every term seen in any document has a postings list, so the
	    # vocabulary is exactly the index's key set.
	    terms: set[str] = set(postings)
	    total_postings = sum(map(len, postings.values()))

	    print(f" Vocabulary size : {len(terms):,}")
	    print(f" Index entries : {total_postings:,}")
	    return postings, lengths, terms


	if __name__ == "__main__":
	    # Smoke test: print the tokens for one sample sentence per language.
	    samples = {
	        "en": "The quick brown fox jumps over the lazy dog.",
	        "es": "El zorro marrón rápido salta sobre el perro perezoso.",
	        "fr": "Le renard brun rapide saute par-dessus le chien paresseux.",
	        "de": "Der schnelle braune Fuchs springt über den faulen Hund.",
	    }
	    for lang, text in samples.items():
	        tokens = tokenize_for_lang(text, lang)
	        print(f"[{lang}] {tokens}")