freesound-popularity-interfaceTestMetadata

Sleeping

App Files Files Community

IKRAMELHADI committed 14 days ago

Commit

4ad7378

1 Parent(s): c019996

testtest5

Browse files

Files changed (1) hide show

app.py +102 -187

app.py CHANGED Viewed

@@ -2,227 +2,142 @@ import os
 import time
 import requests
 import pandas as pd
 import gradio as gr
-import joblib
 # =========================
 # CONFIG
 # =========================
-FREESOUND_API_BASE = "https://freesound.org/apiv2"
-API_TOKEN = os.getenv("FREESOUND_API_TOKEN", "").strip()
-# Timeout: (connect, read)
 TIMEOUT = (6, 20)
-# Session HTTP réutilisable
 SESSION = requests.Session()
-ADAPTER = requests.adapters.HTTPAdapter(pool_connections=20, pool_maxsize=20, max_retries=0)
-SESSION.mount("https://", ADAPTER)
-SESSION.headers.update({"User-Agent": "freesound-gradio-metadata/1.0"})
 # =========================
-# CHARGE TON MODELE + FEATURES
 # =========================
-# Adapte ces chemins à ton projet
-MODEL_PATH = "model.joblib"
-FEATURES_PATH = "features.txt"  # un fichier avec 1 feature par ligne (ordre = ordre du training)
-if not os.path.exists(MODEL_PATH):
-    raise FileNotFoundError(f"Modèle introuvable: {MODEL_PATH}")
-model = joblib.load(MODEL_PATH)
-if not os.path.exists(FEATURES_PATH):
-    raise FileNotFoundError(f"Liste de features introuvable: {FEATURES_PATH}")
-with open(FEATURES_PATH, "r", encoding="utf-8") as f:
-    FEATURE_NAMES = [line.strip() for line in f if line.strip()]
 # =========================
-# OUTILS
 # =========================
-def safe_get_json(url, headers=None, params=None, attempts=5, backoff=1.7):
-    """
-    GET JSON robuste : retries sur erreurs réseau/5xx/429.
-    """
-    last_err = None
-    for i in range(attempts):
-        try:
-            resp = SESSION.get(url, headers=headers, params=params, timeout=TIMEOUT)
-            # Rate limit
-            if resp.status_code == 429:
-                retry_after = resp.headers.get("Retry-After")
-                wait = float(retry_after) if retry_after and retry_after.isdigit() else (backoff ** i)
-                time.sleep(wait)
-                continue
-            # Server errors
-            if 500 <= resp.status_code < 600:
-                time.sleep(backoff ** i)
-                continue
-            # Auth / Not found / autres erreurs client
-            if resp.status_code == 401:
-                raise RuntimeError("❌ Token FreeSound invalide ou non autorisé (401).")
-            if resp.status_code == 404:
-                raise RuntimeError("❌ Sound introuvable (404).")
-            if resp.status_code >= 400:
-                raise RuntimeError(f"❌ Erreur HTTP {resp.status_code}: {resp.text[:200]}")
-            return resp.json()
-        except (requests.exceptions.ConnectionError,
-                requests.exceptions.Timeout,
-                requests.exceptions.ChunkedEncodingError) as e:
-            last_err = e
-            time.sleep(backoff ** i)
-            continue
-        except Exception as e:
-            # autre exception : on remonte direct
-            raise
-    raise RuntimeError(f"❌ Échec après {attempts} tentatives. Dernière erreur: {repr(last_err)}")
-def fetch_sound_by_id(sound_id: int, fields: str) -> dict:
-    """
-    ✅ Endpoint stable : /sounds/{id}/
-    """
-    if not API_TOKEN:
-        raise RuntimeError("❌ FREESOUND_API_TOKEN manquant (variable d'environnement).")
-    url = f"{FREESOUND_API_BASE}/sounds/{int(sound_id)}/"
-    headers = {"Authorization": f"Token {API_TOKEN}"}
-    params = {"fields": fields}
-    return safe_get_json(url, headers=headers, params=params)
-def flatten_features(ac_analysis: dict) -> dict:
-    """
-    FreeSound renvoie souvent un dict de features (ac_analysis).
-    Ici on aplatit en {feature_name: value} en gardant uniquement
-    les clés directes (et on ignore les structures trop imbriquées).
-    """
-    flat = {}
-    if not isinstance(ac_analysis, dict):
-        return flat
-    for k, v in ac_analysis.items():
-        # garde les nombres simples / bool / str courts
-        if isinstance(v, (int, float, bool)):
-            flat[k] = float(v) if isinstance(v, bool) else v
-        elif isinstance(v, str):
-            # éviter d'injecter des textes énormes
-            flat[k] = v[:200]
-        # si liste/dict: on ignore (ou tu peux custom)
-    return flat
-def build_feature_df(sound_json: dict, wanted_features: list[str]) -> pd.DataFrame:
-    """
-    Construit un DataFrame avec les features réellement utilisées par ton modèle.
-    """
-    ac = sound_json.get("ac_analysis", {}) or {}
-    flat = flatten_features(ac)
-    rows = []
-    for feat in wanted_features:
-        rows.append({"feature": feat, "value": flat.get(feat, None)})
-    return pd.DataFrame(rows)
-def build_model_vector(sound_json: dict, feature_names: list[str]) -> pd.DataFrame:
-    """
-    Construit un X (1 ligne) dans le bon ordre de features.
-    """
-    ac = sound_json.get("ac_analysis", {}) or {}
-    flat = flatten_features(ac)
-    x = {feat: flat.get(feat, None) for feat in feature_names}
-    X = pd.DataFrame([x])
-    # Option: fillna(0) si ton training le faisait (sinon enlève)
-    X = X.fillna(0)
-    return X
-def predict_label(sound_json: dict):
-    X = build_model_vector(sound_json, FEATURE_NAMES)
-    # proba si dispo
-    label = model.predict(X)[0]
-    proba = None
-    if hasattr(model, "predict_proba"):
-        try:
-            proba = float(model.predict_proba(X).max())
-        except Exception:
-            proba = None
-    return label, proba, X
-# =========================
-# GRADIO LOGIC
-# =========================
-DEFAULT_FIELDS = "id,name,username,license,tags,previews,ac_analysis"
-def run(sound_id: str):
-    sound_id = str(sound_id).strip()
-    if not sound_id.isdigit():
-        raise gr.Error("Entre un ID numérique (ex: 123456).")
-    sid = int(sound_id)
-    sound = fetch_sound_by_id(sid, fields=DEFAULT_FIELDS)
-    # Tableau des features utilisées
-    df_features = build_feature_df(sound, FEATURE_NAMES)
-    # Prediction
-    label, proba, X = predict_label(sound)
-    # Infos utiles à afficher
-    title = sound.get("name", "")
-    user = sound.get("username", "")
-    tags = sound.get("tags", [])
-    preview_url = (sound.get("previews", {}) or {}).get("preview-hq-mp3") or (sound.get("previews", {}) or {}).get("preview-lq-mp3")
-    info_md = f"""
-### 🎧 Sound
-- **ID**: `{sid}`
-- **Nom**: {title}
-- **Auteur**: {user}
-- **Tags**: {", ".join(tags[:25])}{' …' if len(tags) > 25 else ''}
-### 🔮 Prédiction
-- **Classe prédite**: **{label}**
-""" + (f"- **Confiance (max proba)**: `{proba:.3f}`\n" if proba is not None else "")
-    audio = preview_url if preview_url else None
-    # Option: montrer aussi le vecteur X (1 ligne) si tu veux
-    # df_x = X.T.reset_index().rename(columns={"index": "feature", 0: "value"})
-    # return info_md, audio, df_features, df_x
-    return info_md, audio, df_features
 # =========================
 # UI
 # =========================
-with gr.Blocks(title="FreeSound ID → Metadata + Prediction") as demo:
-    gr.Markdown("# FreeSound : Métadonnées → Features → Prédiction")
-    with gr.Row():
-        sound_id_in = gr.Textbox(label="Sound ID", placeholder="ex: 123456", scale=2)
-        btn = gr.Button("Récupérer & prédire", scale=1)
-    info_out = gr.Markdown()
-    audio_out = gr.Audio(label="Preview (si dispo)", interactive=False)
-    features_out = gr.Dataframe(label="Features utilisées (valeurs FreeSound)", interactive=False)
-    btn.click(fn=run, inputs=[sound_id_in], outputs=[info_out, audio_out, features_out])
-    sound_id_in.submit(fn=run, inputs=[sound_id_in], outputs=[info_out, audio_out, features_out])
-if __name__ == "__main__":
-    demo.launch()

 import time
 import requests
 import pandas as pd
+import numpy as np
 import gradio as gr
+from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
+from sklearn.feature_extraction.text import HashingVectorizer
+from sklearn.preprocessing import OneHotEncoder
 # =========================
 # CONFIG
 # =========================
+# The FreeSound token is read from the environment (e.g. a Space secret)
+# instead of being committed in source. Any token that has already been
+# committed should be considered leaked and revoked.
+API_TOKEN = os.getenv("FREESOUND_API_TOKEN", "").strip()
+BASE_URL = "https://freesound.org/apiv2"
+# Passed to requests as a (connect, read) timeout tuple, in seconds.
 TIMEOUT = (6, 20)
 SESSION = requests.Session()
+# Attach the Authorization header only when a token is actually configured,
+# so an empty "Token " header is never sent.
+if API_TOKEN:
+    SESSION.headers.update({"Authorization": f"Token {API_TOKEN}"})
 # =========================
+# API FREESOUND
 # =========================
+def fetch_sound(sound_id: int):
+    """Fetch the metadata of a single FreeSound sound.
+
+    Sends a GET request to ``{BASE_URL}/sounds/{sound_id}/`` through the
+    shared ``SESSION`` and returns the decoded JSON body.
+
+    Args:
+        sound_id: Numeric FreeSound sound identifier.
+
+    Returns:
+        The decoded JSON response, restricted to the requested ``fields``.
+
+    Raises:
+        RuntimeError: If no API token is configured, the request fails at the
+            network level, the API answers with a non-200 status, or the
+            response body is not valid JSON.
+    """
+    if not API_TOKEN:
+        raise RuntimeError("FREESOUND_API_TOKEN manquant")
+    # int() guarantees that only a number is interpolated into the URL path.
+    url = f"{BASE_URL}/sounds/{int(sound_id)}/"
+    params = {
+        "fields": (
+            "id,name,username,description,tags,created,"
+            "duration,num_downloads,avg_rating,"
+            "category,subcategory,license,type"
+        )
+    }
+    try:
+        r = SESSION.get(url, params=params, timeout=TIMEOUT)
+    except requests.RequestException as err:
+        # Surface timeouts / connection failures with the same exception type
+        # as HTTP errors so callers only have one failure mode to handle.
+        raise RuntimeError(f"Erreur réseau: {err}") from err
+    if r.status_code != 200:
+        # Include a short excerpt of the body: it carries the API's own reason.
+        raise RuntimeError(f"Erreur API {r.status_code}: {r.text[:200]}")
+    try:
+        return r.json()
+    except ValueError as err:
+        raise RuntimeError("Réponse JSON invalide") from err
 # =========================
+# PREPROCESSING (ONLINE)
 # =========================
+def discretize_num_downloads(x):
+    """Map a download count to a coarse popularity label.
+
+    Args:
+        x: Download count; may be ``None`` or NaN when the value is missing.
+
+    Returns:
+        ``"MissedInfo"`` when the value is missing, ``"Low"`` below 100,
+        ``"Medium"`` below 1000, and ``"High"`` otherwise.
+    """
+    # A missing count must not raise (None) nor fall through every comparison
+    # and end up labelled "High" (NaN); use the same marker as
+    # discretize_avg_rating.
+    if pd.isna(x):
+        return "MissedInfo"
+    if x < 100:
+        return "Low"
+    if x < 1000:
+        return "Medium"
+    return "High"
+def discretize_avg_rating(x):
+    """Map an average rating to a coarse quality label.
+
+    A rating of 0 or a missing value (``None`` / NaN) yields
+    ``"MissedInfo"``. Otherwise the label is ``"Low"`` below 2.5,
+    ``"Medium"`` below 3.8, and ``"High"`` from 3.8 upwards.
+    """
+    if x == 0 or pd.isna(x):
+        return "MissedInfo"
+    # Exclusive upper bounds checked in ascending order; first match wins.
+    for upper_bound, label in ((2.5, "Low"), (3.8, "Medium")):
+        if x < upper_bound:
+            return label
+    return "High"
+def preprocess_metadata(sound: dict):
+    """Turn one raw FreeSound metadata dict into a flat feature dict.
+
+    Missing or null fields are tolerated: numeric fields fall back to 0,
+    text fields to the empty string and tags to an empty list.
+
+    Args:
+        sound: Metadata dict for one sound (the keys read are
+            ``num_downloads``, ``avg_rating``, ``duration``, ``created``,
+            ``username``, ``name``, ``tags``, ``category``, ``subcategory``,
+            ``license`` and ``type``).
+
+    Returns:
+        A dict mapping feature names to values. The set of ``tag_*`` and
+        categorical keys depends on the input, so the schema is not fixed.
+    """
+    out = {}
+    # Null / absent counts are treated as 0, and negatives are clamped, so
+    # that log1p below always receives a value in its domain.
+    num_downloads = max(sound.get("num_downloads") or 0, 0)
+    duration = max(sound.get("duration") or 0, 0)
+    # ---- Targets (debug) ----
+    out["num_downloads_class"] = discretize_num_downloads(num_downloads)
+    out["avg_rating_class"] = discretize_avg_rating(sound.get("avg_rating"))
+    # ---- Numeric features ----
+    out["duration_log"] = np.log1p(duration)
+    out["num_downloads_log"] = np.log1p(num_downloads)
+    # ---- Created -> age_days ----
+    # Parse as UTC and compare against a UTC "now": mixing a timezone-aware
+    # timestamp with a naive one would raise a TypeError.
+    created = pd.to_datetime(sound.get("created"), errors="coerce", utc=True)
+    if pd.notna(created):
+        # Clamp at 0 so a creation date in the future cannot produce a
+        # negative age.
+        age_days = max((pd.Timestamp.now(tz="UTC") - created).days, 0)
+    else:
+        age_days = 0
+    out["age_days_log"] = np.log1p(age_days)
+    # ---- Username length (stands in for a username-frequency feature) ----
+    out["username_len"] = len(sound.get("username") or "")
+    # ---- Name ----
+    name = (sound.get("name") or "").lower()
+    out["name_len"] = len(name)
+    # Hash the name into 8 buckets, one feature per bucket.
+    hv = HashingVectorizer(n_features=8, alternate_sign=False)
+    name_vec = hv.transform([name]).toarray()[0]
+    for i, v in enumerate(name_vec):
+        out[f"name_vec_{i}"] = v
+    # ---- Tags (simple multi-hot), limited to the first 5 ----
+    for t in (sound.get("tags") or [])[:5]:
+        out[f"tag_{t}"] = 1
+    # ---- Categorical fields, one indicator key per observed value ----
+    for col in ["category", "subcategory", "license", "type"]:
+        val = sound.get(col) or "Unknown"
+        out[f"{col}_{val}"] = 1
+    return out
+# =========================
+# PIPELINE GRADIO
+# =========================
+def run(sound_id):
+    """Gradio callback: fetch a sound and show it before/after preprocessing.
+
+    Args:
+        sound_id: Value of the "Sound ID" textbox.
+
+    Returns:
+        A ``(before_df, after_df)`` pair of single-column ("value")
+        DataFrames indexed by field / feature name: the raw API metadata and
+        the output of ``preprocess_metadata``.
+
+    Raises:
+        gr.Error: If the id is not a plain decimal number or the sound cannot
+            be fetched.
+    """
+    raw_id = str(sound_id).strip()
+    # isdecimal() rather than isdigit(): isdigit() also accepts characters
+    # such as superscript digits, which int() then rejects with ValueError.
+    if not raw_id.isdecimal():
+        raise gr.Error("ID invalide")
+    try:
+        sound = fetch_sound(int(raw_id))
+    except (RuntimeError, requests.RequestException) as err:
+        # Report API / network failures as a readable UI error.
+        raise gr.Error(str(err)) from err
+    # Before: raw metadata
+    before_df = pd.DataFrame.from_dict(sound, orient="index", columns=["value"])
+    # After: preprocessed features
+    processed = preprocess_metadata(sound)
+    after_df = pd.DataFrame.from_dict(processed, orient="index", columns=["value"])
+    return before_df, after_df
 # =========================
 # UI
 # =========================
+# Build the Gradio interface: one text input, one button, and two tables
+# showing the metadata before and after preprocessing.
+with gr.Blocks(title="Metadata preprocessing FreeSound") as demo:
+    gr.Markdown("""
+    # 🎧 FreeSound – Prétraitement Metadata
+    **Objectif :** visualiser les features **avant** et **après** preprocessing
+    """)
+    sound_id = gr.Textbox(label="Sound ID", placeholder="ex: 123456")
+    btn = gr.Button("Analyser")
+    with gr.Row():
+        before = gr.Dataframe(label="AVANT preprocessing (brut FreeSound)")
+        after = gr.Dataframe(label="APRÈS preprocessing (features modèle)")
+    btn.click(run, sound_id, [before, after])
+    # Pressing Enter in the textbox triggers the same analysis as the button.
+    sound_id.submit(run, sound_id, [before, after])
+# Launch only when executed as a script, so importing this module (e.g. from
+# a test) does not start a server as a side effect.
+if __name__ == "__main__":
+    demo.launch()