Spaces:
Sleeping
Sleeping
File size: 4,226 Bytes
caf26c9 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 | """
Prepares the raw gairaigo DataFrame for machine learning.
Processing steps (in order):
1. Drop exact duplicate (katakana, language) pairs.
2. Keep only the donor languages listed in KEEP_LANGUAGES; rows for all other languages are dropped.
3. Encode string language labels into integer class indices.
4. Build a TF-IDF character n-gram feature matrix from the katakana strings.
Why character n-grams?
Katakana phonetically transcribes the donor word's pronunciation. Different
donor languages leave distinct phonetic fingerprints in the resulting katakana:
- German words often end in ルト (-ruto), ンゲ (-nge), or ツ (tsu).
- French words often contain ージュ (-aju) or アン (-an) nasals.
- Portuguese words carry パン, タバコ, or イン patterns.
A model trained on character 2- to 4-gram features can learn these sub-word signals.
"""
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from src.constants import decode_language
# Donor-language codes that survive filtering. Rows tagged with any other
# language are discarded by _consolidate_rare_classes().
KEEP_LANGUAGES = {
    "eng",
    "fre",
    "ger",
}
def preprocess(df: pd.DataFrame):
    """
    Deduplicate, filter, relabel, and integer-encode the gairaigo DataFrame.

    The steps run in a fixed order:
      1. Exact duplicate (katakana, language) rows are removed, so every
         later step sees each word/language pairing only once.
      2. _consolidate_rare_classes() filters on the raw language codes:
         rows whose code is not in KEEP_LANGUAGES are dropped.
      3. decode_language() is applied to every remaining code. It is
         presumably the ISO 639-2 code -> full-name mapping
         (e.g. 'fre' -> 'French'); confirm against src.constants.
      4. A LabelEncoder is fitted on the decoded labels to produce the
         integer 'label' column.

    Args:
        df : Raw DataFrame with columns ['katakana', 'language'].
             The caller's frame is not modified; all work happens on a copy.

    Returns:
        Tuple of (cleaned_df, label_encoder).
        cleaned_df holds the decoded 'language' column plus an integer
        'label' column of class indices.
        label_encoder is the fitted encoder; use it to inverse-transform
        predicted indices back to language labels.
    """
    key_columns = ["katakana", "language"]

    # Deduplicate on a private copy so a repeated loanword cannot end up on
    # both sides of a later train/test split, then filter while the labels
    # are still raw codes -- the form KEEP_LANGUAGES is written in.
    cleaned = _consolidate_rare_classes(
        df.drop_duplicates(subset=key_columns).copy()
    )

    # Decode before encoding so the encoder's classes are the decoded labels.
    cleaned["language"] = cleaned["language"].apply(decode_language)

    encoder = LabelEncoder()
    cleaned["label"] = encoder.fit_transform(cleaned["language"])
    return cleaned, encoder
def build_features(katakana_series: pd.Series, vectorizer: TfidfVectorizer = None):
    """
    Vectorize katakana strings as TF-IDF weighted character n-grams.

    Two modes, selected by `vectorizer`:
      - None: a new TfidfVectorizer is created, fitted on katakana_series,
        and used to transform it.
      - An existing vectorizer: only transform() is called, so whatever
        vocabulary and weights it already holds are reused as-is.

    A newly created vectorizer uses the 'char_wb' analyzer, which builds
    n-grams inside word boundaries and pads word edges. This keeps
    word-initial and word-final patterns distinct from mid-word ones.

    Args:
        katakana_series : Series of katakana strings to vectorize.
        vectorizer      : A pre-fitted TfidfVectorizer for transform-only
                          mode, or None to fit a new one on katakana_series.

    Returns:
        Tuple of (feature_matrix, vectorizer).
        feature_matrix is a sparse matrix of shape (n_samples, n_features).
        vectorizer is the one that produced it (the new one when None
        was passed in).
    """
    if vectorizer is not None:
        # Transform-only path: the supplied vectorizer is never refitted.
        return vectorizer.transform(katakana_series), vectorizer

    fitted = TfidfVectorizer(
        analyzer="char_wb",   # character n-grams, padded at word boundaries
        ngram_range=(2, 4),   # 2-, 3- and 4-character n-grams
        sublinear_tf=True,    # use 1 + log(TF) instead of raw TF
        min_df=2,             # drop n-grams seen in fewer than 2 strings
    )
    matrix = fitted.fit_transform(katakana_series)
    return matrix, fitted
def _consolidate_rare_classes(df: pd.DataFrame, keep_languages=None) -> pd.DataFrame:
    """
    Keep only the rows whose language code is in the allowed set.

    Despite the name, nothing is merged into an 'other' class: rows for
    languages outside the allowed set are removed outright.

    The input DataFrame is not modified.

    Args:
        df : DataFrame with a 'language' column of raw language codes.
        keep_languages : Iterable of codes to retain. When None (the
            default), the module-level KEEP_LANGUAGES is used.

    Returns:
        A new, independent DataFrame holding only the retained rows, with
        their original index labels. Rows whose language is missing
        (None/NaN) are dropped too, since they match no allowed code.
    """
    if keep_languages is None:
        keep_languages = KEEP_LANGUAGES

    # Select with a boolean mask rather than writing None into the column:
    # the caller's frame stays intact and no per-row Python lambda is needed.
    retained = df["language"].isin(list(keep_languages))
    return df[retained].copy()
|