MEAG-LID
A language identification model for African languages, implemented with Naive Bayes. This is a preliminary model developed in conjunction with an ongoing project with MediaCloud.
Sample usage
import os
import joblib
import pandas as pd
from huggingface_hub import hf_hub_download
# Hugging Face Hub repository the model bundle is downloaded from
# (default `repo_id` of `load_nb_bundle_from_hf`).
REPO_ID = "JessicaOjo/meag_lid"
# Path of the joblib-serialized bundle inside that repository
# (passed to `hf_hub_download` as `filename`).
FILENAME = "model/model.joblib"
# model load logic
def load_nb_bundle_from_hf(repo_id=REPO_ID, filename=None):
    """Download the serialized model bundle from the Hugging Face Hub and load it.

    Args:
        repo_id: Hub repository to download from. Defaults to ``REPO_ID``.
        filename: Path of the joblib file inside the repository. ``None``
            (the default) selects the module-level ``FILENAME``.

    Returns:
        Whatever object ``joblib.load`` deserializes from the downloaded file.
        ``nb_model_predict`` expects it to be a mapping with ``"model"`` and
        ``"vectorizer"`` entries.
    """
    # Resolve the default at call time so the signature stays
    # backward-compatible: existing callers pass at most `repo_id`.
    if filename is None:
        filename = FILENAME

    model_path = hf_hub_download(
        repo_id=repo_id,
        filename=filename,
        repo_type="model",
    )
    # SECURITY: joblib files are pickle-based, so loading one can execute
    # arbitrary code. Only point `repo_id` / `filename` at repositories you
    # trust.
    return joblib.load(model_path)
# model prediction logic
def nb_model_predict(nb_bundle, texts):
    """Run the bundled vectorizer and classifier over ``texts``.

    Args:
        nb_bundle: Mapping holding the classifier under ``"model"`` and the
            fitted vectorizer under ``"vectorizer"``.
        texts: Collection of raw text documents, passed unchanged to the
            vectorizer's ``transform``.

    Returns:
        A ``(labels, probabilities)`` tuple: the classifier's ``predict``
        output followed by its ``predict_proba`` output for the same
        vectorized input.
    """
    classifier, vectorizer = nb_bundle["model"], nb_bundle["vectorizer"]

    # Vectorize once and reuse the features for both classifier calls.
    features = vectorizer.transform(texts)
    return classifier.predict(features), classifier.predict_proba(features)
# generating predictions
def final_media_cloud_nb_generations(infile, output_dir):
    """Predict a language for every unique, non-null text in a CSV file.

    Reads ``infile``, drops rows whose ``text`` value is duplicated (keeping
    the first occurrence) or missing, predicts a language for each remaining
    row, and writes the result to ``media_cloud_predictions.csv`` inside
    ``output_dir`` (created if it does not exist).

    Args:
        infile: CSV file with a ``text`` column. An optional ``language``
            column is only used to print the number of distinct values.
        output_dir: Directory that receives the predictions CSV.

    Returns:
        The filtered DataFrame with an added ``pred_lang`` column.

    Raises:
        KeyError: If the CSV has no ``text`` column.
    """
    os.makedirs(output_dir, exist_ok=True)

    data = pd.read_csv(infile)
    data = data.drop_duplicates(subset=["text"], keep="first")
    data = data.dropna(subset=["text"])
    print(f"Data shape: {data.shape}")
    if "language" in data.columns:
        print(f"Unique languages: {data.language.nunique()}")

    if data.empty:
        # Nothing to classify: skip the model download and avoid handing the
        # classifier an empty batch, but still write the (empty) output file.
        print("No rows left to classify; skipping model download")
        pred = []
    else:
        bundle = load_nb_bundle_from_hf()
        print("Model loaded from Hugging Face")
        # Only the labels are written out; the probabilities are not used.
        pred, _ = nb_model_predict(bundle, data["text"].tolist())

    data["pred_lang"] = pred

    out_path = os.path.join(output_dir, "media_cloud_predictions.csv")
    data.to_csv(out_path, index=False)
    print(f"Saved predictions → {out_path}")
    return data
Inference Providers
NEW
This model isn't deployed by any Inference Provider.
🙋
Ask for provider support