Spaces:

krislette
/

kataklassifer

Sleeping

App Files Files Community

kataklassifer / src /preprocessor.py

krislette

Initial commit

caf26c9 about 1 month ago

raw

history blame contribute delete

4.23 kB

	"""
	Prepares the raw gairaigo DataFrame for machine learning.

	Processing steps (in order):
	1. Drop exact duplicate (katakana, language) pairs.
	2. Keep only the languages listed in KEEP_LANGUAGES; rows for every other
	   language are dropped (they are not merged into an 'other' class).
	3. Encode string language labels into integer class indices.
	4. Build a TF-IDF character n-gram feature matrix from the katakana strings.

	Why character n-grams?
	Katakana phonetically transcribes the donor word's pronunciation. Different
	donor languages leave distinct phonetic fingerprints in the resulting katakana:
	- German words often end in ルト (-ruto), ンゲ (-nge), or ツ (tsu).
	- French words often contain ージュ (-aju) or アン (-an) nasals.
	- Portuguese words carry パン, タバコ, or イン patterns.
	A model trained on character 2- to 4-gram features can learn these sub-word signals.
	"""

	import pandas as pd
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.preprocessing import LabelEncoder

	from src.constants import decode_language


	# Raw language codes (as stored in the dataset's 'language' column) of the
	# donor languages the classifier distinguishes. Rows whose code is not in
	# this set are discarded by _consolidate_rare_classes during preprocessing.
	KEEP_LANGUAGES = {"eng", "fre", "ger"}


	def preprocess(df: pd.DataFrame):
	    """
	    Deduplicate, filter, and label the gairaigo DataFrame.

	    Steps, in the order they run:
	    1. Drop repeated (katakana, language) pairs so each pair occurs once.
	    2. Pass the unique rows through ``_consolidate_rare_classes`` while the
	       'language' column still holds the dataset's raw codes.
	    3. Replace every remaining code with the value returned by
	       ``decode_language`` (e.g. 'fre' → 'French'), so downstream charts,
	       reports, and CSVs show human-readable names.
	    4. Fit a ``LabelEncoder`` on the decoded names and store each row's
	       integer class index in a new 'label' column.

	    Args:
	        df : Raw DataFrame with columns ['katakana', 'language'].

	    Returns:
	        Tuple of (cleaned_df, label_encoder).
	        cleaned_df is a new frame with the decoded 'language' column and an
	        additional integer 'label' column; the input frame is not modified.
	        label_encoder is fitted and can inverse-transform predictions later.
	    """
	    # Deduplicate first so the same loanword cannot land in both the training
	    # and test sets. The copy guarantees that nothing done below can write
	    # through to the caller's DataFrame.
	    unique_rows = df.drop_duplicates(subset=["katakana", "language"]).copy()

	    # Filtering happens before decoding because the keep-list is expressed
	    # in the dataset's raw codes, not in decoded names.
	    kept_rows = _consolidate_rare_classes(unique_rows)

	    # From here on the 'language' column carries full names instead of codes.
	    decoded_names = kept_rows["language"].apply(decode_language)
	    kept_rows["language"] = decoded_names

	    # Integer targets for scikit-learn; the encoder is returned so callers
	    # can map predicted indices back to language names.
	    encoder = LabelEncoder()
	    kept_rows["label"] = encoder.fit_transform(decoded_names)

	    return kept_rows, encoder


	def build_features(katakana_series: pd.Series, vectorizer: TfidfVectorizer = None):
	    """
	    Turn katakana strings into a sparse TF-IDF matrix of character n-grams.

	    The function has two modes:
	    - Fit mode (``vectorizer`` is None): a new TfidfVectorizer is created,
	      fitted on ``katakana_series``, and used to transform it.
	    - Transform-only mode (``vectorizer`` given): the supplied vectorizer is
	      applied as-is, so the output columns match the matrix it was fitted on.

	    The new vectorizer uses the 'char_wb' analyzer, which builds n-grams
	    inside word boundaries and pads the edges of each word. That keeps
	    patterns at the start or end of a word distinct from the same characters
	    in the middle — useful here because donor-language phonemes often
	    appear at word edges.

	    Args:
	        katakana_series : Series of katakana strings to vectorize.
	        vectorizer : A pre-fitted TfidfVectorizer for transform-only mode.
	            Pass None to fit a new vectorizer on katakana_series.

	    Returns:
	        Tuple of (feature_matrix, vectorizer).
	        feature_matrix is a sparse scipy matrix of shape (n_samples, n_features).
	        vectorizer is the one that produced the matrix (the new one in fit
	        mode, the caller's own object otherwise).
	    """
	    # Transform-only mode: reuse the caller's fitted vectorizer unchanged.
	    if vectorizer is not None:
	        return vectorizer.transform(katakana_series), vectorizer

	    # Fit mode: learn the n-gram vocabulary and IDF weights from this data.
	    fitted_vectorizer = TfidfVectorizer(
	        analyzer="char_wb",  # character n-grams with word-boundary padding
	        ngram_range=(2, 4),  # bigrams, trigrams, and 4-grams
	        sublinear_tf=True,  # use 1 + log(TF) instead of raw TF to dampen outliers
	        min_df=2,  # ignore n-grams that occur in fewer than 2 words
	    )
	    matrix = fitted_vectorizer.fit_transform(katakana_series)
	    return matrix, fitted_vectorizer


	def _consolidate_rare_classes(df: pd.DataFrame, keep_languages=None) -> pd.DataFrame:
	    """
	    Return only the rows whose language code belongs to the keep-set.

	    Despite the name, nothing is merged into an 'other' class: rows whose
	    'language' value is not in the keep-set (including missing values) are
	    dropped entirely. Surviving rows keep their original index labels and
	    column order.

	    Args:
	        df : DataFrame with a 'language' column of raw language codes.
	        keep_languages : Optional collection of codes to retain. Defaults to
	            the module-level KEEP_LANGUAGES when None.

	    Returns:
	        A new, independent DataFrame containing only the retained rows.
	        The input frame is left unmodified.
	    """
	    if keep_languages is None:
	        keep_languages = KEEP_LANGUAGES

	    # Vectorized membership test; missing values never match, so they are
	    # filtered out together with the unwanted languages.
	    keep_mask = df["language"].isin(keep_languages)

	    # .copy() hands the caller a frame it can assign columns on without
	    # touching (or getting chained-assignment warnings about) the input.
	    return df.loc[keep_mask].copy()