update app.py
Browse files
app.py
CHANGED
|
@@ -242,6 +242,32 @@ def preprocess_features_full(df):
|
|
| 242 |
return df
|
| 243 |
|
| 244 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
# -------- Fonctions --------
|
| 247 |
|
|
@@ -251,6 +277,8 @@ def predict_with_metadata(url):
|
|
| 251 |
|
| 252 |
df = fetch_sound_metadata(url)
|
| 253 |
df = preprocess_targets(df)
|
|
|
|
|
|
|
| 254 |
#df_raw = df.copy()
|
| 255 |
#df = preprocess_targets(df)
|
| 256 |
#df = preprocess_features_full(df) # <-- version complète
|
|
@@ -266,13 +294,10 @@ def predict_with_metadata(url):
|
|
| 266 |
print("=== COLONNES AVANT PRÉTRAITEMENT ===")
|
| 267 |
print(df.columns.tolist())
|
| 268 |
print(df.dtypes)
|
| 269 |
-
return (
|
| 270 |
-
|
| 271 |
-
f"{df.
|
| 272 |
f"{df.dtypes}"
|
| 273 |
-
"OK – TARGETS CALCULÉES\n"
|
| 274 |
-
f"avg_rating (discrétisé) : {df['avg_rating'].iloc[0]}\n"
|
| 275 |
-
f"num_downloads_class : {df['num_downloads_class'].iloc[0]}"
|
| 276 |
)
|
| 277 |
|
| 278 |
|
|
|
|
| 242 |
return df
|
| 243 |
|
| 244 |
|
| 245 |
+
def preprocess_features_step1(df, scaler=None, user_freq=None):
    """Apply the first stage of feature preprocessing to sound metadata.

    Works on a copy of *df*; the caller's frame is left untouched.

    Steps:
      1. ``category_is_user_provided`` is converted to an integer flag
         (missing values are treated as ``False`` -> 0).
      2. ``license``, ``category`` and ``type`` are one-hot encoded, with
         missing values mapped to the ``"Unknown"`` category.
      3. ``username`` is replaced by ``username_freq`` (frequency encoding).
      4. ``num_ratings``, ``filesize`` and ``duration`` are ``log1p``
         transformed; ``samplerate`` is standardized.

    Args:
        df: DataFrame containing the columns listed above.
        scaler: Optional already-fitted scaler exposing ``transform``. When
            given, it is used as-is so inference rows are scaled with the
            training statistics. When ``None`` a new ``StandardScaler`` is
            fitted on *df* itself (original behavior).
        user_freq: Optional mapping (dict or Series) from username to its
            relative frequency. When given, usernames absent from the mapping
            get ``0.0``. When ``None`` the frequencies are computed from *df*
            itself (original behavior).

    Returns:
        A new, transformed DataFrame.

    Note:
        With the defaults, the scaler and the username frequencies are
        derived from *df* alone. On a one-row frame that makes ``samplerate``
        0.0 and ``username_freq`` 1.0 regardless of the input, so pass the
        fitted ``scaler`` / ``user_freq`` from training when predicting.
        The one-hot columns produced likewise depend on the categories that
        are present in *df*.
    """
    df = df.copy()

    # 1. Bool -> int. A missing flag would make astype(int) raise, so it is
    #    replaced by False first.
    flag = df["category_is_user_provided"]
    df["category_is_user_provided"] = flag.where(flag.notna(), False).astype(int)

    # 2. Simple categoricals -> one-hot.
    cat_cols = ["license", "category", "type"]
    df[cat_cols] = df[cat_cols].fillna("Unknown")
    df = pd.get_dummies(df, columns=cat_cols, drop_first=False)

    # 3. username -> frequency encoding.
    if user_freq is None:
        df["username_freq"] = df["username"].map(
            df["username"].value_counts(normalize=True)
        )
    else:
        # Usernames never seen when the mapping was built have frequency 0.
        df["username_freq"] = df["username"].map(user_freq).fillna(0.0)
    df.drop(columns=["username"], inplace=True)

    # 4. Simple numerics: log1p compresses the heavy right tail.
    num_cols = ["num_ratings", "filesize", "duration"]
    for col in num_cols:
        df[col] = np.log1p(df[col])

    if scaler is None:
        # No fitted scaler supplied: fit on this frame (original behavior).
        df[["samplerate"]] = StandardScaler().fit_transform(df[["samplerate"]])
    else:
        df[["samplerate"]] = scaler.transform(df[["samplerate"]])

    return df
|
| 270 |
+
|
| 271 |
|
| 272 |
# -------- Fonctions --------
|
| 273 |
|
|
|
|
| 277 |
|
| 278 |
df = fetch_sound_metadata(url)
|
| 279 |
df = preprocess_targets(df)
|
| 280 |
+
df = preprocess_features_step1(df)
|
| 281 |
+
|
| 282 |
#df_raw = df.copy()
|
| 283 |
#df = preprocess_targets(df)
|
| 284 |
#df = preprocess_features_full(df) # <-- version complète
|
|
|
|
| 294 |
print("=== COLONNES AVANT PRÉTRAITEMENT ===")
|
| 295 |
print(df.columns.tolist())
|
| 296 |
print(df.dtypes)
|
| 297 |
+
return (
|
| 298 |
+
"OK – FEATURES STEP 1\n"
|
| 299 |
+
f"Nombre de colonnes : {df.shape[1]}\n\n"
|
| 300 |
f"{df.dtypes}"
|
|
|
|
|
|
|
|
|
|
| 301 |
)
|
| 302 |
|
| 303 |
|