MEAG-LID

Language identification model for African languages, implemented with a Naive Bayes classifier. This is a preliminary model made in conjunction with an ongoing project with MediaCloud.

Sample usage

import os
import joblib
import pandas as pd
from huggingface_hub import hf_hub_download

# Hugging Face Hub repository (of type "model") that hosts the serialized bundle.
REPO_ID = "JessicaOjo/meag_lid"
# Path of the joblib file inside that repository.
FILENAME = "model/model.joblib"

# model load logic
def load_nb_bundle_from_hf(repo_id=REPO_ID):
    """Fetch the serialized model bundle from the Hugging Face Hub and load it.

    Args:
        repo_id: Hub repository to download ``FILENAME`` from. Defaults to
            ``REPO_ID``.

    Returns:
        Whatever object ``joblib.load`` deserializes from the downloaded file.
    """
    # NOTE(security): joblib.load unpickles the file, which can execute
    # arbitrary code. Only point repo_id at repositories you trust.
    return joblib.load(
        hf_hub_download(repo_id=repo_id, filename=FILENAME, repo_type="model")
    )

# model prediction logic
def nb_model_predict(nb_bundle, texts):
    """Classify ``texts`` with the classifier/vectorizer pair in ``nb_bundle``.

    Args:
        nb_bundle: Mapping with a ``"model"`` entry (the classifier) and a
            ``"vectorizer"`` entry (the fitted feature extractor).
        texts: Collection of raw texts passed to the vectorizer's
            ``transform`` method.

    Returns:
        A ``(labels, probabilities)`` tuple: the results of the classifier's
        ``predict`` and ``predict_proba`` on the vectorized texts.
    """
    classifier, vectorizer = nb_bundle["model"], nb_bundle["vectorizer"]

    # Vectorize once and reuse the features for both prediction calls.
    features = vectorizer.transform(texts)
    labels = classifier.predict(features)
    return labels, classifier.predict_proba(features)

# generating predictions
def final_media_cloud_nb_generations(infile, output_dir):
    """Run language identification over a CSV of texts and save the predictions.

    The input is read with pandas, rows with a duplicated or missing ``text``
    value are dropped, and the predicted label for each remaining row is
    stored in a new ``pred_lang`` column. The result is written to
    ``media_cloud_predictions.csv`` inside ``output_dir``.

    Args:
        infile: Input CSV, passed straight to ``pandas.read_csv``. It must
            contain a ``text`` column; a ``language`` column is optional and
            is only used for a summary print.
        output_dir: Directory for the output file; created if it is missing.

    Returns:
        The cleaned DataFrame with the added ``pred_lang`` column.

    Raises:
        KeyError: If the input has no ``text`` column.
    """
    os.makedirs(output_dir, exist_ok=True)

    data = pd.read_csv(infile)
    # Fail early with a readable message instead of the opaque KeyError that
    # drop_duplicates would raise for a missing subset column.
    if "text" not in data.columns:
        raise KeyError(f"Input file {infile!r} has no 'text' column")

    data = data.drop_duplicates(subset=["text"], keep="first")
    data = data.dropna(subset=["text"])

    print(f"Data shape: {data.shape}")
    if "language" in data.columns:
        print(f"Unique languages: {data.language.nunique()}")

    bundle = load_nb_bundle_from_hf()
    print("Model loaded from Hugging Face")

    # Only the predicted labels are stored; the probabilities are discarded.
    pred, _ = nb_model_predict(bundle, data["text"].tolist())
    data["pred_lang"] = pred

    out_path = os.path.join(output_dir, "media_cloud_predictions.csv")
    data.to_csv(out_path, index=False)
    print(f"Saved predictions → {out_path}")

    return data
Downloads last month

-

Downloads are not tracked for this model. How to track
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support