update app.py
Browse files
app.py
CHANGED
|
@@ -5,39 +5,51 @@ import numpy as np
|
|
| 5 |
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OneHotEncoder
|
| 6 |
from sklearn.feature_extraction.text import HashingVectorizer
|
| 7 |
from collections import Counter
|
|
|
|
| 8 |
import freesound
|
| 9 |
import gensim.downloader as api
|
| 10 |
|
| 11 |
-
|
| 12 |
# SECURITY: the FreeSound API token must never be committed to the repository.
# It is read from the FREESOUND_API_TOKEN environment variable (on hosted
# deployments, configure it as a secret). The token that used to be hard-coded
# here is exposed in the repository history and should be revoked.
client = freesound.FreesoundClient()
_freesound_token = os.environ.get("FREESOUND_API_TOKEN")
if not _freesound_token:
    raise RuntimeError(
        "FREESOUND_API_TOKEN is not set; define it as an environment "
        "variable/secret before starting the app."
    )
client.set_token(_freesound_token, "token")
|
| 14 |
|
| 15 |
-
# Répertoire dataset
|
| 16 |
dataset_dir = "dataset_audio"
|
| 17 |
os.makedirs(dataset_dir, exist_ok=True)
|
| 18 |
|
| 19 |
-
#
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
|
|
|
| 26 |
def fetch_sound_metadata(sound_url):
|
| 27 |
-
|
| 28 |
sound_id = int(sound_url.rstrip("/").split("/")[-1])
|
| 29 |
sound = client.get_sound(sound_id)
|
| 30 |
-
|
| 31 |
file_name = f"{sound.name.replace(' ', '_')}.mp3"
|
| 32 |
file_path = os.path.join(dataset_dir, file_name)
|
| 33 |
-
|
| 34 |
-
# Télécharger le preview
|
| 35 |
try:
|
| 36 |
sound.retrieve_preview(dataset_dir, file_name)
|
| 37 |
except Exception as e:
|
| 38 |
-
print(f"Erreur téléchargement {file_name}
|
| 39 |
file_path = None
|
| 40 |
-
|
| 41 |
data = {
|
| 42 |
"file_path": file_path,
|
| 43 |
"name": sound.name,
|
|
@@ -57,291 +69,119 @@ def fetch_sound_metadata(sound_url):
|
|
| 57 |
"category": getattr(sound, "category", "Unknown"),
|
| 58 |
"subcategory": getattr(sound, "subcategory", "Other"),
|
| 59 |
"type": getattr(sound, "type", ""),
|
| 60 |
-
"samplerate": getattr(sound, "samplerate", 0)
|
| 61 |
-
"amplitude_peak_ratio":getattr(sound, "amplitude_peak_ratio", ""),
|
| 62 |
-
"beat_count":getattr(sound, "beat_count", "")
|
| 63 |
}
|
| 64 |
return pd.DataFrame([data])
|
| 65 |
|
| 66 |
-
def preprocess_subcategory_ohe(df, seuil=2):
    """One-hot encode ``subcategory`` after pooling rare values into "Other".

    Missing subcategories are replaced by "Other". Any subcategory whose
    share of the rows is strictly below ``seuil`` percent is also replaced by
    "Other". One ``subcategory_<value>`` column is appended per remaining
    value; the ``subcategory`` column itself is kept.

    Args:
        df: DataFrame with a ``subcategory`` column. It is not modified.
        seuil: Minimum frequency, in percent, for a value to keep its own
            column.

    Returns:
        A new DataFrame: the cleaned copy of ``df`` plus the one-hot columns.
    """
    out = df.copy()
    out["subcategory"] = out["subcategory"].fillna("Other")

    # Share of each subcategory, expressed in percent of the rows.
    share_pct = out["subcategory"].value_counts(normalize=True) * 100
    rare_values = share_pct[share_pct < seuil].index
    is_rare = out["subcategory"].isin(rare_values)
    out["subcategory"] = out["subcategory"].mask(is_rare, "Other")

    encoder = OneHotEncoder(sparse_output=False)
    encoded = encoder.fit_transform(out[["subcategory"]])
    column_names = [f"subcategory_{c}" for c in encoder.categories_[0]]
    encoded_df = pd.DataFrame(encoded, columns=column_names, index=out.index)

    return pd.concat([out, encoded_df], axis=1)
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
def preprocess_tags_ohe(df, seuil=1.0):
    """One-hot encode each row's combination of frequent tags.

    ``tags`` is read as a comma-separated string. Tags are lower-cased and
    stripped; a tag is "frequent" when its number of occurrences, divided by
    the number of rows, is at least ``seuil`` percent. Every other tag is
    replaced by "Other". Rows without any tag are encoded as ["Other"].

    Args:
        df: DataFrame with a ``tags`` column. It is not modified.
        seuil: Minimum frequency, in percent of the rows, for a tag to be
            kept as is.

    Returns:
        A new DataFrame with a ``tags_list`` column (sorted, de-duplicated
        tags of the row) and one ``tag_<combination>`` column per distinct
        combination.
    """
    df = df.copy()
    df["tags"] = df["tags"].fillna("")

    # Strip once, up front: the frequency table below is built from stripped
    # tags, so filtering must see the same stripped form. Previously a tag
    # written after ", " kept its leading space and was always sent to
    # "Other".
    df["tags_list"] = (
        df["tags"]
        .str.lower()
        .str.split(",")
        .apply(lambda tags: [t.strip() for t in tags if t.strip()])
    )

    counts = Counter(t for tags in df["tags_list"] for t in tags)
    total = len(df)
    frequent_tags = {
        tag for tag, cnt in counts.items()
        if cnt / total * 100 >= seuil
    }

    def filter_tags(tags):
        kept = {t if t in frequent_tags else "Other" for t in tags}
        # Sorted so that the same set of tags always yields the same joined
        # string (set iteration order is arbitrary and would otherwise
        # create several categories for one combination).
        return sorted(kept) or ["Other"]

    df["tags_list"] = df["tags_list"].apply(filter_tags)

    # NOTE(review): this encodes the whole ";"-joined combination as a single
    # category, i.e. one column per distinct combination rather than one
    # column per tag. Kept as is to preserve the output layout; confirm that
    # a multi-hot encoding is not what was intended.
    tags_for_ohe = (
        df["tags_list"].apply(";".join).to_numpy().reshape(-1, 1)
    )
    ohe = OneHotEncoder(sparse_output=False)
    tags_ohe = ohe.fit_transform(tags_for_ohe)

    cols = [f"tag_{c}" for c in ohe.categories_[0]]
    df_tags = pd.DataFrame(tags_ohe, columns=cols, index=df.index)

    return pd.concat([df, df_tags], axis=1)
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
def preprocess_targets(df):
    """Discretise the target columns ``num_downloads`` and ``avg_rating``.

    * ``num_downloads`` is binned into 3 quantile classes, stored in a new
      ``num_downloads_class`` column.
    * ``avg_rating`` is replaced by a class in 0..3: 0 for rows whose rating
      equals 0, and 1..3 for the quantile bin of the non-zero ratings.

    Args:
        df: DataFrame with ``num_downloads`` and ``avg_rating`` columns. It
            is not modified; a copy is returned, as the other preprocessing
            functions of this file do.

    Returns:
        A new DataFrame with the discretised targets.
    """
    df = df.copy()

    # num_downloads -> 3 quantile classes
    downloads = df["num_downloads"].to_numpy().reshape(-1, 1)
    est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")
    df["num_downloads_class"] = est.fit_transform(downloads).ravel().astype(int)

    # avg_rating -> class 0 for "rating is 0", classes 1..3 for the rest
    mask_non_zero = df["avg_rating"] != 0
    rating_class = pd.Series(0, index=df.index, dtype=int)
    # Skip the fit when there is nothing to bin: fitting on an empty array
    # raises, and every row is already class 0 in that case.
    if mask_non_zero.any():
        ratings = df.loc[mask_non_zero, "avg_rating"].to_numpy().reshape(-1, 1)
        est = KBinsDiscretizer(n_bins=3, encode="ordinal", strategy="quantile")
        rating_class.loc[mask_non_zero] = (
            est.fit_transform(ratings).ravel().astype(int) + 1
        )
    df["avg_rating"] = rating_class

    return df
|
| 128 |
-
|
| 129 |
-
|
| 130 |
def description_to_vec(text, model, dim=100):
|
| 131 |
if not text:
|
| 132 |
return np.zeros(dim)
|
| 133 |
words = text.lower().split()
|
| 134 |
vecs = [model[w] for w in words if w in model]
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
def preprocess_features(df, glove_model=None):
    """Turn raw sound metadata into model features.

    Steps, in order: boolean column to int; one-hot encoding of ``license``,
    ``category`` and ``type``; frequency encoding of ``username``; one-hot
    encoding of ``subcategory`` with values under 2 % pooled into "Other";
    ``log1p`` then standard scaling of the numeric columns; mean GloVe
    embedding of ``description``.

    Args:
        df: Raw metadata DataFrame. It is not modified.
        glove_model: Optional pre-loaded word-vector model supporting
            ``word in model`` and ``model[word]``. When omitted,
            "glove-wiki-gigaword-100" is loaded through gensim's downloader;
            pass a model to avoid reloading it on every call.

    Returns:
        A new DataFrame with the engineered features. ``username``,
        ``subcategory`` and ``description`` are dropped.
    """
    df = df.copy()

    # Boolean column -> int
    df["category_is_user_provided"] = df["category_is_user_provided"].astype(int)

    # Simple categorical columns -> one-hot
    cat_cols = ["license", "category", "type"]
    df[cat_cols] = df[cat_cols].fillna("Unknown")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=False)

    # username -> frequency encoding
    user_freq = df["username"].value_counts(normalize=True)
    df["username_freq"] = df["username"].map(user_freq)
    df.drop(columns=["username"], inplace=True)

    # subcategory -> one-hot, values under 2 % pooled into "Other"
    df["subcategory"] = df["subcategory"].fillna("Other")
    counts = df["subcategory"].value_counts(normalize=True) * 100
    rare_subs = counts[counts < 2].index
    df["subcategory"] = df["subcategory"].mask(
        df["subcategory"].isin(rare_subs), "Other"
    )
    ohe = OneHotEncoder(sparse_output=False)
    subcat_ohe = ohe.fit_transform(df[["subcategory"]])
    subcat_df = pd.DataFrame(
        subcat_ohe,
        columns=[f"subcategory_{c}" for c in ohe.categories_[0]],
        index=df.index,
    )
    df = pd.concat([df, subcat_df], axis=1)
    df.drop(columns=["subcategory"], inplace=True)

    # Numeric columns -> log1p + standard scaling
    numeric_cols = ["num_ratings", "filesize", "duration", "samplerate"]
    for col in numeric_cols:
        df[col] = np.log1p(df[col])
    scaler = StandardScaler()
    df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

    # description -> mean GloVe vector
    if glove_model is None:
        glove_model = api.load("glove-wiki-gigaword-100")
    # 100 matches the default model; a caller-supplied model may expose its
    # own dimension through ``vector_size``.
    dim = getattr(glove_model, "vector_size", 100)

    def _mean_embedding(text):
        # Named differently from the module-level description_to_vec so it
        # does not shadow it.
        if not text:
            return np.zeros(dim)
        vecs = [glove_model[w] for w in text.lower().split() if w in glove_model]
        return np.mean(vecs, axis=0) if vecs else np.zeros(dim)

    desc_vecs = np.vstack(df["description"].fillna("").apply(_mean_embedding))
    desc_cols = [f"description_glove_{i}" for i in range(desc_vecs.shape[1])]
    desc_df = pd.DataFrame(desc_vecs, columns=desc_cols, index=df.index)
    # A single concat instead of one column insertion per dimension.
    df = pd.concat([df.drop(columns=["description"]), desc_df], axis=1)

    return df
|
| 185 |
|
| 186 |
-
def
|
|
|
|
| 187 |
df = df.copy()
|
|
|
|
| 188 |
|
| 189 |
-
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
| 195 |
-
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
df[col] = np.log1p(df[col])
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
# 5. age_days
|
| 210 |
df["created"] = pd.to_datetime(df["created"], errors="coerce")
|
| 211 |
df["age_days"] = (pd.Timestamp.now() - df["created"]).dt.days
|
| 212 |
df["age_days_log"] = np.log1p(df["age_days"])
|
| 213 |
-
df["age_days_log_scaled"] =
|
| 214 |
-
|
| 215 |
-
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
|
| 219 |
-
df.
|
| 220 |
-
|
| 221 |
-
#
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
|
| 225 |
-
#
|
| 226 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
vectorizer = HashingVectorizer(n_features=8, alternate_sign=False, norm=None)
|
| 228 |
name_vec = vectorizer.transform(df["name_clean"])
|
| 229 |
-
|
| 230 |
-
|
| 231 |
-
df.drop(columns=["name",
|
| 232 |
-
|
| 233 |
-
#
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
| 237 |
-
df[desc_cols] = pd.DataFrame(desc_vecs, columns=desc_cols, index=df.index)
|
| 238 |
df.drop(columns=["description"], inplace=True)
|
| 239 |
-
|
| 240 |
-
# 10. Supprimer colonnes inutiles restantes
|
| 241 |
-
cols_to_remove = ["file_path","previews","similar_sounds","comments","geotag","bitrate","bitdepth","is_remix","was_remixed"]
|
| 242 |
-
df = df.drop(columns=[c for c in cols_to_remove if c in df.columns], errors="ignore")
|
| 243 |
-
|
| 244 |
-
return df
|
| 245 |
-
|
| 246 |
-
|
| 247 |
-
def preprocess_features_step1(df):
    """Apply the first, basic feature transformations to a metadata frame.

    Converts ``category_is_user_provided`` to int, one-hot encodes
    ``license``/``category``/``type`` (missing values become "Unknown"),
    replaces ``username`` by its relative frequency, applies ``log1p`` to
    ``num_ratings``/``filesize``/``duration`` and standard-scales
    ``samplerate``.

    Args:
        df: Raw metadata DataFrame. It is not modified.

    Returns:
        A new DataFrame with the transformed columns.
    """
    out = df.copy()

    # 1. Bool -> int
    out["category_is_user_provided"] = out["category_is_user_provided"].astype(int)

    # 2. Simple categoricals -> one-hot
    simple_categoricals = ["license", "category", "type"]
    out[simple_categoricals] = out[simple_categoricals].fillna("Unknown")
    out = pd.get_dummies(out, columns=simple_categoricals, drop_first=False)

    # 3. username -> frequency encoding
    freq_by_user = out["username"].value_counts(normalize=True)
    out["username_freq"] = out["username"].map(freq_by_user)
    out = out.drop(columns=["username"])

    # 4. Simple numeric columns: log1p, and standard scaling for samplerate
    for name in ("num_ratings", "filesize", "duration"):
        out[name] = np.log1p(out[name])
    out[["samplerate"]] = StandardScaler().fit_transform(out[["samplerate"]])

    return out
|
| 272 |
|
| 273 |
-
|
| 274 |
-
# -------- Fonctions --------
|
| 275 |
-
|
| 276 |
def predict_with_metadata(url):
    """Fetch a sound's metadata, discretise its targets and report the columns.

    Debug handler: prints the column names and dtypes to stdout and returns a
    text summary for the UI.

    Args:
        url: FreeSound sound URL typed by the user.

    Returns:
        A prompt to enter a URL when ``url`` is blank, otherwise a summary
        with the column count and dtypes.
    """
    if not url.strip():
        return " Veuillez entrer une URL FreeSound."

    frame = preprocess_targets(fetch_sound_metadata(url))

    print("=== COLONNES AVANT PRÉTRAITEMENT ===")
    print(frame.columns.tolist())
    print(frame.dtypes)

    # The last two parts are joined without a separator, as before.
    summary_parts = [
        "OK – FEATURES STEP 1\n",
        f"Nombre de colonnes : {frame.shape[1]}\n\n",
        f"{frame.dtypes}",
        f"{frame.columns.dtype}",
    ]
    return "".join(summary_parts)
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
def predict_with_audio(url):
    """Placeholder handler for the audio-based prediction.

    Args:
        url: FreeSound sound URL typed by the user.

    Returns:
        A prompt to enter a URL when ``url`` is blank, otherwise a fixed
        example result (no audio analysis is performed here).
    """
    url_is_blank = url.strip() == ""
    return (
        "❌ Veuillez entrer une URL FreeSound."
        if url_is_blank
        else "🎵 Résultat (audio) : SON NON POPULAIRE (exemple)"
    )
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
# -------- Interface Gradio --------
|
| 319 |
|
| 320 |
with gr.Blocks(title="FreeSound Popularity Detector") as demo:
|
| 321 |
gr.Markdown("# 🎧 FreeSound Popularity Detector")
|
| 322 |
-
gr.Markdown("Collez l'URL d'un son FreeSound et
|
| 323 |
-
|
| 324 |
-
url_input = gr.Textbox(
|
| 325 |
-
label="URL du son FreeSound",
|
| 326 |
-
placeholder="https://freesound.org/people/..."
|
| 327 |
-
)
|
| 328 |
-
|
| 329 |
-
with gr.Row():
|
| 330 |
-
btn_meta = gr.Button("📊 Prédire via métadonnées")
|
| 331 |
-
btn_audio = gr.Button("🎼 Prédire via données acoustiques")
|
| 332 |
|
|
|
|
|
|
|
| 333 |
output = gr.Textbox(label="Résultat")
|
| 334 |
|
| 335 |
-
btn_meta.click(
|
| 336 |
-
fn=predict_with_metadata,
|
| 337 |
-
inputs=url_input,
|
| 338 |
-
outputs=output
|
| 339 |
-
)
|
| 340 |
-
|
| 341 |
-
btn_audio.click(
|
| 342 |
-
fn=predict_with_audio,
|
| 343 |
-
inputs=url_input,
|
| 344 |
-
outputs=output
|
| 345 |
-
)
|
| 346 |
|
| 347 |
demo.launch()
|
|
|
|
| 5 |
from sklearn.preprocessing import KBinsDiscretizer, StandardScaler, OneHotEncoder
|
| 6 |
from sklearn.feature_extraction.text import HashingVectorizer
|
| 7 |
from collections import Counter
|
| 8 |
+
import joblib
|
| 9 |
import freesound
|
| 10 |
import gensim.downloader as api
|
| 11 |
|
| 12 |
+
# -------- FreeSound API --------
|
| 13 |
# SECURITY: the FreeSound API token must never be committed to the repository.
# It is read from the FREESOUND_API_TOKEN environment variable (on hosted
# deployments, configure it as a secret). The token that used to be hard-coded
# here is exposed in the repository history and should be revoked.
client = freesound.FreesoundClient()
_freesound_token = os.environ.get("FREESOUND_API_TOKEN")
if not _freesound_token:
    raise RuntimeError(
        "FREESOUND_API_TOKEN is not set; define it as an environment "
        "variable/secret before starting the app."
    )
client.set_token(_freesound_token, "token")
|
| 15 |
|
|
|
|
| 16 |
dataset_dir = "dataset_audio"
|
| 17 |
os.makedirs(dataset_dir, exist_ok=True)
|
| 18 |
|
| 19 |
+
# -------- Charger les objets sauvegardés --------
|
| 20 |
+
# Music
|
| 21 |
+
scaler_samplerate_music = joblib.load("music/scaler_samplerate.joblib")
|
| 22 |
+
scaler_age_days_music = joblib.load("music/scaler_music_age_days_log.joblib")
|
| 23 |
+
username_freq_music = joblib.load("music/username_freq_dict_music.joblib")
|
| 24 |
+
est_num_downloads_music = joblib.load("music/est_num_downloads_music.joblib")
|
| 25 |
+
avg_rating_transformer_music = joblib.load("music/avg_rating_transformer_music.joblib")
|
| 26 |
+
music_subcategory_cols = joblib.load("music/music_subcategory_cols.joblib")
|
| 27 |
+
music_onehot_cols = joblib.load("music/music_onehot_cols.joblib")
|
| 28 |
+
|
| 29 |
+
# EffectSound
|
| 30 |
+
scaler_samplerate_effect = joblib.load("effectSound/scaler_samplerate.joblib")
|
| 31 |
+
scaler_age_days_effect = joblib.load("effectSound/scaler_effectSound_age_days_log.joblib")
|
| 32 |
+
username_freq_effect = joblib.load("effectSound/username_freq_dict_effectSound.joblib")
|
| 33 |
+
est_num_downloads_effect = joblib.load("effectSound/est_num_downloads_effectSound.joblib")
|
| 34 |
+
avg_rating_transformer_effect = joblib.load("effectSound/avg_rating_transformer_effectSound.joblib")
|
| 35 |
+
effect_subcategory_cols = joblib.load("effectSound/effectSound_subcategory_cols.joblib")
|
| 36 |
+
effect_onehot_cols = joblib.load("effectSound/effectSound_onehot_cols.joblib")
|
| 37 |
+
|
| 38 |
+
# GloVe pour description
|
| 39 |
+
glove_model = api.load("glove-wiki-gigaword-100")
|
| 40 |
|
| 41 |
+
# -------- Fonctions --------
|
| 42 |
def fetch_sound_metadata(sound_url):
|
| 43 |
+
"""Télécharge les métadonnées du son FreeSound"""
|
| 44 |
sound_id = int(sound_url.rstrip("/").split("/")[-1])
|
| 45 |
sound = client.get_sound(sound_id)
|
|
|
|
| 46 |
file_name = f"{sound.name.replace(' ', '_')}.mp3"
|
| 47 |
file_path = os.path.join(dataset_dir, file_name)
|
|
|
|
|
|
|
| 48 |
try:
|
| 49 |
sound.retrieve_preview(dataset_dir, file_name)
|
| 50 |
except Exception as e:
|
| 51 |
+
print(f"Erreur téléchargement {file_name}: {e}")
|
| 52 |
file_path = None
|
|
|
|
| 53 |
data = {
|
| 54 |
"file_path": file_path,
|
| 55 |
"name": sound.name,
|
|
|
|
| 69 |
"category": getattr(sound, "category", "Unknown"),
|
| 70 |
"subcategory": getattr(sound, "subcategory", "Other"),
|
| 71 |
"type": getattr(sound, "type", ""),
|
| 72 |
+
"samplerate": getattr(sound, "samplerate", 0)
|
|
|
|
|
|
|
| 73 |
}
|
| 74 |
return pd.DataFrame([data])
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def description_to_vec(text, model, dim=100):
    """Return the mean word vector of ``text``.

    The text is lower-cased and split on whitespace; words absent from
    ``model`` are ignored.

    Args:
        text: Description to embed. Anything that is not a non-empty string
            (``None``, NaN, "") yields a zero vector.
        model: Word-vector lookup supporting ``word in model`` and
            ``model[word]``.
        dim: Length of the zero vector returned when nothing can be
            embedded; it should match the model's vector length.

    Returns:
        A 1-D numpy array: the element-wise mean of the known words'
        vectors, or ``np.zeros(dim)`` when there is none.
    """
    # A missing description can arrive as None or NaN (a float) rather than
    # "", and NaN is truthy, so check the type before calling .lower().
    if not isinstance(text, str) or not text:
        return np.zeros(dim)

    vecs = [model[w] for w in text.lower().split() if w in model]
    if not vecs:
        return np.zeros(dim)
    return np.mean(vecs, axis=0)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
|
| 85 |
+
def preprocess_sound(df):
    """Apply the full inference-time preprocessing to one sound's metadata.

    The duration of the first row selects which set of saved artifacts is
    used: 0.5-3 s uses the "effectSound" artifacts, 10-60 s the "music"
    ones.

    The name hashing and description embedding read the first row only, and
    tag columns are set to 1 for every row, so ``df`` is expected to hold a
    single sound.

    Args:
        df: Metadata DataFrame as built by ``fetch_sound_metadata``. It is
            not modified.

    Returns:
        The processed DataFrame, or an error message (``str``) when the
        duration falls outside both supported ranges. Callers must check
        for the ``str`` case.
    """
    df = df.copy()
    dur = df["duration"].iloc[0]

    # Pick the artifacts fitted on the dataset that matches this duration.
    if 0.5 <= dur <= 3:  # effectSound
        scaler_samplerate = scaler_samplerate_effect
        scaler_age = scaler_age_days_effect
        username_freq = username_freq_effect
        est_num_downloads = est_num_downloads_effect
        avg_rating_transformer = avg_rating_transformer_effect
        subcat_cols = effect_subcategory_cols
        onehot_cols = effect_onehot_cols
    elif 10 <= dur <= 60:  # music
        scaler_samplerate = scaler_samplerate_music
        scaler_age = scaler_age_days_music
        username_freq = username_freq_music
        est_num_downloads = est_num_downloads_music
        avg_rating_transformer = avg_rating_transformer_music
        subcat_cols = music_subcategory_cols
        onehot_cols = music_onehot_cols
    else:
        return f"❌ Son trop court ou trop long ({dur} sec)"

    # ----------------- Features -----------------
    # Boolean flag -> int. astype works element-wise; int(Series) is
    # deprecated in recent pandas and only ever worked for a single row.
    df["category_is_user_provided"] = df["category_is_user_provided"].astype(int)

    # Username frequency; usernames absent from the saved mapping get 0.
    df["username_freq"] = df["username"].map(username_freq).fillna(0)

    # Numeric features
    for col in ["num_ratings", "num_comments", "filesize", "duration"]:
        df[col] = np.log1p(df[col])
    # transform() returns an (n, 1) array; flatten it before storing it as a
    # single column.
    df["samplerate"] = scaler_samplerate.transform(df[["samplerate"]]).ravel()

    # Age in days
    df["created"] = pd.to_datetime(df["created"], errors="coerce")
    df["age_days"] = (pd.Timestamp.now() - df["created"]).dt.days
    df["age_days_log"] = np.log1p(df["age_days"])
    df["age_days_log_scaled"] = scaler_age.transform(df[["age_days_log"]]).ravel()

    # num_downloads
    df["num_downloads_class"] = est_num_downloads.transform(
        df[["num_downloads"]]
    ).ravel()

    # avg_rating
    df["avg_rating"] = avg_rating_transformer.transform(df["avg_rating"].to_numpy())

    # Subcategory and one-hot columns are only created, all set to 0.
    # NOTE(review): the column matching this sound's actual subcategory /
    # license / category / type is never set to 1 — confirm this is intended
    # before feeding these features to a model.
    for col in subcat_cols:
        df[col] = 0
    for col in onehot_cols:
        df[col] = 0

    # Tags: one "tag_<name>" column per distinct tag of the sound.
    tag_lists = df["tags"].fillna("").str.lower().str.split(",")
    all_tags = [t.strip() for sub in tag_lists for t in sub if t.strip() != ""]
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # column order is the same on every run (set iteration order is not).
    for tag in dict.fromkeys(all_tags):
        df[f"tag_{tag.replace(' ', '_')}"] = 1
    df.drop(columns=["tags"], inplace=True)

    # Name: drop the extension, then hash into 8 buckets.
    # n must be passed by keyword: it is keyword-only in pandas 2.x.
    df["name_clean"] = df["name"].str.lower().str.rsplit(".", n=1).str[0]
    vectorizer = HashingVectorizer(n_features=8, alternate_sign=False, norm=None)
    # Densify once instead of on every loop iteration.
    name_row = vectorizer.transform(df["name_clean"]).toarray()[0]
    for i, value in enumerate(name_row):
        df[f"name_vec_{i}"] = value
    df.drop(columns=["name", "name_clean"], inplace=True)

    # Description: mean GloVe vector of the first row's description.
    desc_vec = description_to_vec(df["description"].iloc[0], glove_model)
    for i, value in enumerate(desc_vec):
        df[f"description_glove_{i}"] = value
    df.drop(columns=["description"], inplace=True)

    return df
|
| 167 |
|
| 168 |
+
# -------- Gradio --------
|
|
|
|
|
|
|
| 169 |
def predict_with_metadata(url):
    """Fetch a FreeSound sound's metadata, preprocess it and render the result.

    Args:
        url: FreeSound sound URL typed by the user; may be empty or ``None``.

    Returns:
        The preprocessed features as text, or a user-facing error message
        when the URL is missing or malformed, or when the sound's duration
        is not supported.
    """
    if not url or not url.strip():
        return "❌ Veuillez entrer une URL FreeSound."

    try:
        df = fetch_sound_metadata(url.strip())
    except ValueError:
        # fetch_sound_metadata parses the last path segment with int(); a
        # URL that does not end with a numeric sound id lands here.
        return "❌ URL FreeSound invalide : identifiant du son introuvable."

    result = preprocess_sound(df)
    # preprocess_sound returns an error message (str) instead of a DataFrame
    # when the duration is out of range; pass it through instead of calling
    # .to_string() on it.
    if isinstance(result, str):
        return result
    return result.to_string()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
# Gradio UI: one URL field, one button, one text output.
with gr.Blocks(title="FreeSound Popularity Detector") as demo:
    gr.Markdown("# 🎧 FreeSound Popularity Detector")
    gr.Markdown(
        "Collez l'URL d'un son FreeSound et le preprocessing complet "
        "sera appliqué automatiquement."
    )

    url_input = gr.Textbox(label="URL du son FreeSound")
    btn_meta = gr.Button("📊 Prétraiter et afficher features")
    output = gr.Textbox(label="Résultat")

    # Clicking the button runs the metadata pipeline on the typed URL.
    btn_meta.click(
        fn=predict_with_metadata,
        inputs=url_input,
        outputs=output,
    )

demo.launch()
|
effectSound/scaler_effectSound_age_days_log.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:332ee96e7bca4c412bc0d5ac20c0876d5bf8304142d4fd57d4d5524e03228e61
|
| 3 |
+
size 895
|
effectSound/username_freq_dict_effectSound.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:457517d900b3c05061f398d37b00f8087ae9edb1a4776c7cbc2fc77fa60a4036
|
| 3 |
+
size 209269
|
music/scaler_music_age_days_log.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1941f5f48e21243a939080d9d7a1cedc677e2b0b813a451a50f64d00ce149588
|
| 3 |
+
size 895
|
music/username_freq_dict_music.joblib
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f6ca2078e0e0c5c0d5f871362bba1e787c6860fb547dd1f9f3c4f0f3c366b447
|
| 3 |
+
size 214933
|