freesound-popularity-interfaceTestMetadata

Sleeping

NIIHAAD committed on Feb 2

Commit

7e1871b

1 Parent(s): 27b2989

update app.py

Files changed (1) hide show

app.py CHANGED Viewed

@@ -242,6 +242,32 @@ def preprocess_features_full(df):
     return df
 # -------- Fonctions --------
@@ -251,6 +277,8 @@ def predict_with_metadata(url):
     df = fetch_sound_metadata(url)
     df = preprocess_targets(df)
     #df_raw = df.copy()
     #df = preprocess_targets(df)
     #df = preprocess_features_full(df)  # <-- version complète
@@ -266,13 +294,10 @@ def predict_with_metadata(url):
     print("=== COLONNES AVANT PRÉTRAITEMENT ===")
     print(df.columns.tolist())
     print(df.dtypes)
-    return ("🔹 DONNÉES BRUTES (avant traitement)\n"
-            "----------------------------------\n"
-            f"{df.columns.tolist()}\n\n"
             f"{df.dtypes}"
-            "OK – TARGETS CALCULÉES\n"
-            f"avg_rating (discrétisé) : {df['avg_rating'].iloc[0]}\n"
-            f"num_downloads_class : {df['num_downloads_class'].iloc[0]}"
             )

     return df
+def preprocess_features_step1(df):  # Step-1 feature preprocessing; returns a transformed copy of df.
+    df = df.copy()  # work on a copy so the caller's DataFrame is not modified
+    # 1. Bool -> int
+    df["category_is_user_provided"] = df["category_is_user_provided"].astype(int)  # presumably a bool column without missing values - TODO confirm
+    # 2. Simple categorical columns -> one-hot
+    cat_cols = ["license", "category", "type"]
+    df[cat_cols] = df[cat_cols].fillna("Unknown")  # missing categories become the literal level "Unknown"
+    df = pd.get_dummies(df, columns=cat_cols, drop_first=False)  # drop_first=False keeps one indicator column per level
+    # 3. username -> frequency encoding
+    user_freq = df["username"].value_counts(normalize=True)  # relative frequency of each username within this df
+    df["username_freq"] = df["username"].map(user_freq)  # NOTE(review): frequencies come from df itself, so a one-row df yields 1.0 - confirm intended
+    df.drop(columns=["username"], inplace=True)  # the raw username column is replaced by username_freq
+    # 4. Simple numeric columns
+    num_cols = ["num_ratings", "filesize", "duration"]
+    for col in num_cols:
+        df[col] = np.log1p(df[col])  # log(1 + x), applied in place to each listed column
+    scaler = StandardScaler()  # presumably sklearn.preprocessing.StandardScaler - import not visible here
+    df[["samplerate"]] = scaler.fit_transform(df[["samplerate"]])  # NOTE(review): scaler is fit on this df, not on stored training statistics - confirm intended
+    return df
 # -------- Fonctions --------
     df = fetch_sound_metadata(url)
     df = preprocess_targets(df)
+    df = preprocess_features_step1(df)
     #df_raw = df.copy()
     #df = preprocess_targets(df)
     #df = preprocess_features_full(df)  # <-- version complète
     print("=== COLONNES AVANT PRÉTRAITEMENT ===")
     print(df.columns.tolist())
     print(df.dtypes)
+    return (
+             "OK – FEATURES STEP 1\n"
+            f"Nombre de colonnes : {df.shape[1]}\n\n"
             f"{df.dtypes}"
             )