# Hugging Face Space: NIIHAAD/freesound-popularity (status: Running)
# File size: 18,401 bytes

import gradio as gr
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OneHotEncoder
from sklearn.feature_extraction.text import HashingVectorizer
from collections import Counter
import joblib
import freesound
import gensim.downloader as api
from huggingface_hub import hf_hub_download

import xgboost as xgb


# -------- FreeSound API --------
# SECURITY NOTE: an API key committed to source control is effectively public.
# The key is read from the FREESOUND_API_KEY environment variable when it is
# set; the literal below is kept only as a backward-compatible fallback and
# should be rotated and removed from the source.
client = freesound.FreesoundClient()
client.set_token(
    os.environ.get("FREESOUND_API_KEY")
    or "zE9NjEOgUMzH9K7mjiGBaPJiNwJLjSM53LevarRK",
    "token",
)

# Local directory that receives the downloaded sound previews.
dataset_dir = "dataset_audio"
os.makedirs(dataset_dir, exist_ok=True)

class AvgRatingTransformer:
    """Turn raw average ratings into categorical labels.

    A rating of exactly 0, or a missing rating (None/NaN), is treated as
    "no rating" and gets class index 0. Every other rating is binned by
    ``est`` and the resulting bin index is shifted by one, so that index 0
    stays reserved for the unrated case.

    Attributes are kept as ``est`` and ``class_mapping`` because instances of
    this class are loaded from serialized files elsewhere in this module.
    """

    def __init__(self, est, class_mapping=None):
        """Store the discretizer and the class-index -> label mapping.

        Args:
            est: Object exposing ``transform(array of shape (n, 1))`` that
                returns one bin index per row.
            class_mapping: Optional dict from class index to label. Defaults
                to MissedInfo/Low/Medium/High for indices 0..3.
        """
        self.est = est
        if class_mapping is None:
            self.class_mapping = {0: "MissedInfo", 1: "Low", 2: "Medium", 3: "High"}
        else:
            self.class_mapping = class_mapping

    def transform(self, X):
        """Return one label per rating in ``X``.

        Args:
            X: Ratings as a NumPy array, list, or any array-like that converts
                to floats. The input is flattened and never modified.

        Returns:
            A 1-D NumPy array of labels. Class indices that are absent from
            ``class_mapping`` fall back to ``"MissedInfo"``.
        """
        # np.asarray copies list/Series input and turns None into NaN.
        values = np.asarray(X, dtype=float).ravel()

        # NaN != 0 is True, so missing ratings must be excluded explicitly or
        # they would be handed to the discretizer.
        rated = (values != 0) & ~np.isnan(values)

        codes = np.zeros(values.shape, dtype=int)
        if rated.any():
            binned = self.est.transform(values[rated].reshape(-1, 1))
            # +1 keeps class 0 reserved for "no rating".
            codes[rated] = np.asarray(binned).ravel() + 1

        return np.array(
            [self.class_mapping.get(int(code), "MissedInfo") for code in codes]
        )


# -------- Load the saved preprocessing objects --------
# Music: the files are loaded in the order listed and bound to the
# module-level names in that same order.
(
    scaler_samplerate_music,
    scaler_age_days_music,
    username_freq_music,
    est_num_downloads_music,
    avg_rating_transformer_music,
    music_subcategory_cols,
    music_onehot_cols,
    music_onehot_tags,
) = [
    joblib.load(artifact_path)
    for artifact_path in (
        "music/scaler_music_samplerate.joblib",
        "music/scaler_music_age_days_log.joblib",
        "music/username_freq_dict_music.joblib",
        "music/est_num_downloads_music.joblib",
        "music/avg_rating_transformer_music.joblib",
        "music/music_subcategory_cols.joblib",
        "music/music_onehot_cols.joblib",
        "music/music_onehot_tags.joblib",
    )
]


# -------- MODELS --------

# =============================
# Load ML models at runtime
# =============================

# All trained artifacts come from a single Hugging Face Hub model repository
# and are cached in a local directory.
HF_MODEL_REPO = "NIIHAAD/freesound-models"
HF_CACHE_DIR = "models_cache"

os.makedirs(HF_CACHE_DIR, exist_ok=True)


def _load_hub_artifact(filename):
    """Download ``filename`` from the model repository and load it with joblib.

    Args:
        filename: Name of the file inside the Hub repository.

    Returns:
        The object deserialized from the downloaded file.
    """
    local_path = hf_hub_download(
        repo_id=HF_MODEL_REPO,
        repo_type="model",
        filename=filename,
        cache_dir=HF_CACHE_DIR,
    )
    # NOTE(review): joblib.load unpickles the file, so the repository must be
    # trusted.
    return joblib.load(local_path)


# ---- MUSIC MODELS ----
music_model_num_downloads = _load_hub_artifact("music_model_num_downloads.joblib")
music_model_avg_rating = _load_hub_artifact("music_xgb_avg_rating.joblib")
music_avg_rating_le = _load_hub_artifact("music_xgb_avg_rating_label_encoder.joblib")

# ---- EFFECT SOUND MODELS ----
effect_model_num_downloads = _load_hub_artifact("effectSound_model_num_downloads.joblib")
effect_model_avg_rating = _load_hub_artifact("effectSound_xgb_avg_rating.joblib")
effect_avg_rating_le = _load_hub_artifact("effectSound_xgb_avg_rating_label_encoder.joblib")

# Feature-name lists as stored in the repository (may contain duplicates).
music_model_features_raw = _load_hub_artifact("music_model_features_list.joblib")
effect_model_features_raw = _load_hub_artifact("effect_model_features_list.joblib")

# Drop duplicate names while keeping first-occurrence order
# (dict keys preserve insertion order).
music_model_features = list(dict.fromkeys(music_model_features_raw))
effect_model_features = list(dict.fromkeys(effect_model_features_raw))

print(f"Après nettoyage - Music: {len(music_model_features)} features")
print(f"Après nettoyage - Effect: {len(effect_model_features)} features")

# EffectSound: the files are loaded in the order listed and bound to the
# module-level names in that same order.
(
    scaler_samplerate_effect,
    scaler_age_days_effect,
    username_freq_effect,
    est_num_downloads_effect,
    avg_rating_transformer_effect,
    effect_subcategory_cols,
    effect_onehot_cols,
    effect_onehot_tags,
) = [
    joblib.load(artifact_path)
    for artifact_path in (
        "effectSound/scaler_effectSamplerate.joblib",
        "effectSound/scaler_effectSound_age_days_log.joblib",
        "effectSound/username_freq_dict_effectSound.joblib",
        "effectSound/est_num_downloads_effectSound.joblib",
        "effectSound/avg_rating_transformer_effectSound.joblib",
        "effectSound/effectSound_subcategory_cols.joblib",
        "effectSound/effectSound_onehot_cols.joblib",
        "effectSound/effect_onehot_tags.joblib",
    )
]

# GloVe word vectors used to embed the sound description
glove_model = api.load("glove-wiki-gigaword-100")

# Startup diagnostic: size of each feature list and how many duplicate
# names it contains.
_music_feature_count = len(music_model_features)
_music_duplicate_count = _music_feature_count - len(set(music_model_features))
_effect_feature_count = len(effect_model_features)
_effect_duplicate_count = _effect_feature_count - len(set(effect_model_features))

print("--- DIAGNOSTIC DES FEATURES ---")
print(f"Nombre de features Music : {_music_feature_count}")
print(f"Doublons dans Music : {_music_duplicate_count}")
print(f"Nombre de features Effect : {_effect_feature_count}")
print(f"Doublons dans Effect : {_effect_duplicate_count}")
print("-------------------------------")


# ---------------------------
# -------- Fonctions --------

def fetch_sound_metadata(sound_url):
    """Fetch the metadata of a FreeSound sound and try to download its preview.

    Args:
        sound_url: URL of the sound (or a bare numeric ID). The sound ID is
            taken from the last segment of the URL path; surrounding
            whitespace, a trailing slash, a query string and a fragment are
            ignored.

    Returns:
        A one-row ``pandas.DataFrame`` with the sound metadata. ``file_path``
        is ``None`` when the preview download failed.

    Raises:
        ValueError: If no numeric sound ID can be read from ``sound_url``.
    """
    import re
    from urllib.parse import urlparse

    # Only the path is inspected so that "?query" / "#fragment" suffixes do
    # not end up in the segment that is parsed as the ID.
    cleaned_url = str(sound_url).strip()
    url_path = urlparse(cleaned_url).path or cleaned_url
    last_segment = url_path.rstrip("/").split("/")[-1]
    try:
        sound_id = int(last_segment)
    except ValueError:
        raise ValueError(
            f"URL FreeSound invalide, identifiant introuvable : {sound_url!r}"
        ) from None

    sound = client.get_sound(sound_id)

    # Whitespace and path separators in the sound name are replaced so the
    # preview is always written directly inside dataset_dir.
    safe_name = re.sub(r"[\\/\s]", "_", sound.name)
    file_name = f"{safe_name}.mp3"
    file_path = os.path.join(dataset_dir, file_name)
    try:
        sound.retrieve_preview(dataset_dir, file_name)
    except Exception as e:
        # Best effort: the metadata is still returned without a local file.
        print(f"Erreur téléchargement {file_name}: {e}")
        file_path = None

    data = {
        "file_path": file_path,
        "name": sound.name,
        "num_ratings": sound.num_ratings,
        "tags": ",".join(sound.tags) if getattr(sound, "tags", None) else "",
        "username": sound.username,
        "description": sound.description if sound.description else "",
        "created": getattr(sound, "created", ""),
        "license": getattr(sound, "license", ""),
        "num_downloads": getattr(sound, "num_downloads", 0),
        "channels": getattr(sound, "channels", 0),
        "filesize": getattr(sound, "filesize", 0),
        "num_comments": getattr(sound, "num_comments", 0),
        "category_is_user_provided": getattr(sound, "category_is_user_provided", 0),
        "duration": getattr(sound, "duration", 0),
        "avg_rating": getattr(sound, "avg_rating", 0),
        "category": getattr(sound, "category", "Unknown"),
        "subcategory": getattr(sound, "subcategory", "Other"),
        "type": getattr(sound, "type", ""),
        "samplerate": getattr(sound, "samplerate", 0),
    }
    return pd.DataFrame([data])

def description_to_vec(text, model, dim=100):
    """Embed ``text`` as the mean of the vectors of its known words.

    The text is lower-cased and split on whitespace; words that are not in
    ``model`` are skipped.

    Args:
        text: Description to embed. Falsy values give a zero vector.
        model: Mapping-like object supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector returned when nothing can be embedded.

    Returns:
        The element-wise mean of the matched word vectors, or ``np.zeros(dim)``
        when the text is empty or no word is known.
    """
    tokens = text.lower().split() if text else []
    known_vectors = [model[token] for token in tokens if token in model]
    if not known_vectors:
        return np.zeros(dim)
    return np.mean(known_vectors, axis=0)

def preprocess_sound(df):
    """Apply the full preprocessing to a one-row sound metadata frame.

    The duration of the first row selects which set of preprocessing objects
    is used: 0.5-3 s uses the effect-sound objects, 10-60 s uses the music
    objects.

    Args:
        df: One-row DataFrame with the raw metadata columns (``duration``,
            ``username``, ``created``, ``tags``, ``name``, ``description``, ...).
            It is copied, not modified.

    Returns:
        The preprocessed DataFrame, or an error-message string when the
        duration is outside both accepted ranges.
    """
    df = df.copy()
    dur = df["duration"].iloc[0]

    if 0.5 <= dur <= 3:
        scaler_samplerate = scaler_samplerate_effect
        scaler_age = scaler_age_days_effect
        username_freq = username_freq_effect
        est_num_downloads = est_num_downloads_effect
        avg_rating_transformer = avg_rating_transformer_effect
        subcat_cols = effect_subcategory_cols
        onehot_cols = effect_onehot_cols
        onehot_tags = effect_onehot_tags
    elif 10 <= dur <= 60:
        scaler_samplerate = scaler_samplerate_music
        scaler_age = scaler_age_days_music
        username_freq = username_freq_music
        est_num_downloads = est_num_downloads_music
        avg_rating_transformer = avg_rating_transformer_music
        subcat_cols = music_subcategory_cols
        onehot_cols = music_onehot_cols
        onehot_tags = music_onehot_tags
    else:
        return f"❌ Son trop court ou trop long ({dur} sec)"

    # ----------------- Features -----------------
    # Boolean flag stored as 0/1
    df["category_is_user_provided"] = df["category_is_user_provided"].astype(int)

    # Username frequency; usernames missing from the lookup get 0
    df["username_freq"] = df["username"].map(username_freq).fillna(0)

    # Numeric features: log1p, then the fitted sample-rate scaler
    for col in ["num_ratings", "num_comments", "filesize", "duration"]:
        df[col] = np.log1p(df[col])
    df["samplerate"] = scaler_samplerate.transform(df[["samplerate"]])

    # Age in days since creation, log-transformed then scaled
    df["created"] = pd.to_datetime(df["created"], errors="coerce").dt.tz_localize(None)
    df["age_days"] = (pd.Timestamp.now() - df["created"]).dt.days
    df["age_days_log"] = np.log1p(df["age_days"])
    df["age_days_log_scaled"] = scaler_age.transform(df[["age_days_log"]])
    df = df.drop(columns=["created", "age_days", "age_days_log"])

    # num_downloads class from the fitted discretizer
    df["num_downloads_class"] = est_num_downloads.transform(df[["num_downloads"]])

    # avg_rating label
    df["avg_rating"] = avg_rating_transformer.transform(df["avg_rating"].to_numpy())

    # Subcategory one-hot: 1 for the column matching the sound's subcategory,
    # 0 for every other expected column.
    subcat_val = df["subcategory"].iloc[0]
    for col in subcat_cols:
        cat_name = col.replace("subcategory_", "")
        df[col] = 1 if subcat_val == cat_name else 0
    df.drop(columns=["subcategory"], inplace=True)

    # License / category / type one-hot: create every expected column at 0 ...
    for col in onehot_cols:
        if col not in df.columns:
            df[col] = 0

    # ... then switch on the ones matching this sound. Positional access is
    # used so the lookup does not depend on the row label being 0.
    license_val = df["license"].iloc[0]
    category_val = df["category"].iloc[0]
    type_val = df["type"].iloc[0]

    for col_name in [
        f"license_{license_val}",
        f"category_{category_val}",
        f"type_{type_val}",
    ]:
        if col_name in df.columns:
            df[col_name] = 1

    # Text columns default to an empty string when absent
    for col in ["name", "tags", "description"]:
        if col not in df.columns:
            df[col] = ""

    # Tags one-hot: create every expected column at 0 ...
    for col in onehot_tags:
        if col not in df.columns:
            df[col] = 0

    # ... then switch on the columns whose tag is present on this sound.
    raw_tags = df["tags"].iloc[0]
    if isinstance(raw_tags, str) and raw_tags:
        present_tags = set(raw_tags.lower().split(","))
    else:
        present_tags = set()
    for col in onehot_tags:
        tag_name = col.replace("tag_", "").lower()
        if tag_name in present_tags:
            df[col] = 1

    df.drop(columns=["tags"], inplace=True)

    # Name: lower-case, strip the last ".extension", then length + hash vector
    df["name_clean"] = df["name"].astype(str).str.lower().str.rsplit(".", n=1).str[0]
    df = preprocess_name(df, vec_dim=8)
    df.drop(columns=["name", "name_clean"], inplace=True)

    # Description: one column per component of the averaged word vector
    desc_vec = description_to_vec(df["description"].iloc[0], glove_model)
    for i, component in enumerate(desc_vec):
        df[f"description_glove_{i}"] = component
    df.drop(columns=["description"], inplace=True)

    # Remove the raw columns that are not features; some were already dropped
    # above, hence errors="ignore".
    df.drop(
        columns=[
            "license",
            "category",
            "type",
            "created",
            "subcategory",
            "id",
            "num_downloads",
            "file_path",
            "username",
        ],
        inplace=True,
        errors="ignore",
    )

    return df



def xgb_predict_safe(model, X, label_encoder=None):
    """Predict the first row of ``X`` with the model's underlying booster.

    ``X`` is re-indexed on the booster's feature names (missing columns are
    filled with 0.0, extra columns are dropped) before prediction.

    Args:
        model: Fitted model exposing ``get_booster()``.
        X: DataFrame holding at least one row of features.
        label_encoder: Optional class labels, given either as a sequence of
            labels or as an object exposing ``classes_``.

    Returns:
        The raw booster output for the first row when ``label_encoder`` is
        ``None``; otherwise the label of the predicted class, with the class
        index clamped to the valid range.
    """
    booster = model.get_booster()
    booster_features = booster.feature_names
    X_safe = X.reindex(columns=booster_features, fill_value=0.0).astype(np.float32)

    dmatrix = xgb.DMatrix(X_safe.values, feature_names=list(booster_features))
    pred = booster.predict(dmatrix)[0]

    if label_encoder is None:
        return pred

    if np.ndim(pred) > 0:
        # One score per class: pick the most likely class.
        pred_int = int(np.argmax(pred))
    else:
        pred_int = int(round(float(pred)))

    classes = getattr(label_encoder, "classes_", label_encoder)
    # Clamp so an out-of-range index cannot raise.
    pred_int = min(max(pred_int, 0), len(classes) - 1)
    return classes[pred_int]


# -------- Gradio --------
def predict_with_model(model, df_input, feat_list, le=None):
    """Predict the class of the first row of ``df_input``.

    Args:
        model: Fitted model exposing ``get_booster()``.
        df_input: DataFrame of features; it is re-indexed on the booster's
            feature names (missing columns filled with 0.0, extras dropped).
        feat_list: Unused; kept so existing callers keep working.
        le: Optional encoder exposing ``inverse_transform`` used to turn the
            class index into a label.

    Returns:
        The integer class index when ``le`` is ``None``; otherwise the decoded
        label, or ``"Classe inconnue (<index>)"`` when decoding fails.
    """
    booster = model.get_booster()
    booster_feats = booster.feature_names

    # Align the frame on exactly the columns the booster was trained with.
    X_aligned = df_input.reindex(columns=booster_feats, fill_value=0.0).astype(float)
    dmatrix = xgb.DMatrix(X_aligned.values, feature_names=booster_feats)

    preds = booster.predict(dmatrix)
    pred_val = preds[0]

    if len(preds.shape) > 1 and preds.shape[1] > 1:
        # One score per class: take the index of the highest one.
        pred_int = int(np.argmax(pred_val))
    else:
        pred_int = int(round(float(pred_val)))

    if le is None:
        return pred_int

    try:
        return le.inverse_transform([pred_int])[0]
    except Exception:
        # Deliberate best effort: report the undecodable index instead of
        # failing. Unlike a bare except, this lets KeyboardInterrupt and
        # SystemExit propagate.
        return f"Classe inconnue ({pred_int})"


def predict_with_metadata(url):
    """Fetch a FreeSound sound, preprocess it and predict its popularity.

    Args:
        url: URL of the FreeSound sound, as typed by the user.

    Returns:
        A text report with the raw metadata, the preprocessed features and
        the two predictions (download class and average-rating class), or an
        error message when the URL is empty, the sound cannot be fetched, or
        its duration is outside the accepted ranges.
    """
    if not url or not url.strip():
        return "❌ Veuillez entrer une URL FreeSound."

    # 1. Fetch the raw metadata; a bad URL or an API failure is reported to
    # the user instead of surfacing as an unhandled exception.
    try:
        df_raw = fetch_sound_metadata(url)
    except Exception as exc:
        return f"❌ Impossible de récupérer le son FreeSound : {exc}"

    raw_lines = ["=== Métadonnées brutes ==="]
    raw_lines.extend(f"{col}: {df_raw[col].iloc[0]}" for col in df_raw.columns)
    raw_str = "\n".join(raw_lines)

    # 2. Check the duration
    dur = df_raw["duration"].iloc[0]
    if dur < 0.5:
        return raw_str + f"\n\n❌ Son trop court ({dur} sec). Plage acceptée: 0.5-3 ou 10-60 sec"
    elif 3 < dur < 10 or dur > 60:
        return raw_str + f"\n\n❌ Son hors plage ({dur} sec). Plage acceptée: 0.5-3 ou 10-60 sec"

    # 3. Preprocessing. preprocess_sound returns a message string when it
    # rejects the duration (e.g. a NaN duration passes the checks above).
    df_processed = preprocess_sound(df_raw)
    if isinstance(df_processed, str):
        return raw_str + "\n\n" + df_processed

    # The two targets must not be fed to the models as features.
    df_for_model = df_processed.drop(
        columns=["avg_rating", "num_downloads_class"], errors="ignore"
    )

    # 4. Pick the models, feature list and label encoder for this duration
    if 0.5 <= dur <= 3:
        model_nd = effect_model_num_downloads
        model_ar = effect_model_avg_rating
        model_features = effect_model_features
        current_le = effect_avg_rating_le
        sound_type = "EffectSound"
    else:
        model_nd = music_model_num_downloads
        model_ar = music_model_avg_rating
        model_features = music_model_features
        current_le = music_avg_rating_le
        sound_type = "Music"

    # 5. Keep exactly the columns listed for the model, in that order
    df_for_model = df_for_model.reindex(columns=model_features, fill_value=0.0).astype(float)

    # 6. Predictions
    pred_num_downloads_val = predict_with_model(model_nd, df_for_model, model_features)

    # Class index -> label; an unexpected index is shown as-is.
    NUM_DOWNLOADS_MAP = {0: "Low", 1: "Medium", 2: "High"}
    pred_num_downloads = NUM_DOWNLOADS_MAP.get(pred_num_downloads_val, str(pred_num_downloads_val))

    pred_avg_rating = predict_with_model(model_ar, df_for_model, model_features, le=current_le)

    # 7. Preprocessed features, one per line
    processed_lines = ["\n=== Features après preprocessing ==="]
    processed_lines.extend(
        f"{col}: {df_processed[col].iloc[0]}" for col in df_processed.columns
    )
    processed_str = "\n".join(processed_lines)

    # 8. Final report
    prediction_lines = [
        "\n=== Prédictions ===",
        f"Type détecté : {sound_type}",
        f"📥 Num downloads prédit : {pred_num_downloads}",
        f"⭐ Avg rating prédit : {pred_avg_rating}",
    ]
    prediction_str = "\n".join(prediction_lines)

    return raw_str + processed_str + prediction_str


def preprocess_name(df, vec_dim=8):
    """Add name-derived features to a copy of ``df``.

    Reads the ``name_clean`` column and appends ``name_len`` (number of
    characters) plus ``name_vec_0`` .. ``name_vec_{vec_dim - 1}`` produced by
    a ``HashingVectorizer``.

    Args:
        df: DataFrame with a ``name_clean`` text column.
        vec_dim: Number of hashed features to generate.

    Returns:
        A new DataFrame with the extra columns; ``df`` is left untouched.
    """
    result = df.copy()
    result["name_len"] = result["name_clean"].str.len()

    # The vectorizer is stateless, so it can be applied without fitting.
    hasher = HashingVectorizer(n_features=vec_dim, alternate_sign=False, norm=None)
    hashed_names = hasher.transform(result["name_clean"]).toarray()

    vec_columns = [f"name_vec_{k}" for k in range(vec_dim)]
    hashed_df = pd.DataFrame(hashed_names, columns=vec_columns, index=result.index)

    return pd.concat([result, hashed_df], axis=1)


# -------- User interface --------
# One text box for the URL, one button, and one text box for the report.
with gr.Blocks(title="FreeSound Popularity Detector") as demo:
    gr.Markdown("# 🎧 FreeSound Popularity Detector")
    gr.Markdown(
        "Collez l'URL d'un son FreeSound et le preprocessing complet sera appliqué automatiquement."
    )

    url_input = gr.Textbox(label="URL du son FreeSound")
    btn_meta = gr.Button("📊 Prétraiter et afficher features")
    output = gr.Textbox(label="Résultat")

    # Clicking the button runs the whole fetch / preprocess / predict chain.
    btn_meta.click(
        fn=predict_with_metadata,
        inputs=url_input,
        outputs=output,
    )

demo.launch()