IKRAMELHADI committed on
Commit
4ad7378
·
1 Parent(s): c019996

testtest5

Browse files
Files changed (1) hide show
  1. app.py +102 -187
app.py CHANGED
@@ -2,227 +2,142 @@ import os
2
  import time
3
  import requests
4
  import pandas as pd
 
5
  import gradio as gr
6
- import joblib
 
 
 
7
 
8
  # =========================
9
  # CONFIG
10
  # =========================
11
- FREESOUND_API_BASE = "https://freesound.org/apiv2"
12
- API_TOKEN = os.getenv("FREESOUND_API_TOKEN", "").strip()
13
-
14
- # Timeout: (connect, read)
15
  TIMEOUT = (6, 20)
16
 
17
- # Session HTTP réutilisable
18
  SESSION = requests.Session()
19
- ADAPTER = requests.adapters.HTTPAdapter(pool_connections=20, pool_maxsize=20, max_retries=0)
20
- SESSION.mount("https://", ADAPTER)
21
- SESSION.headers.update({"User-Agent": "freesound-gradio-metadata/1.0"})
22
 
23
  # =========================
24
- # CHARGE TON MODELE + FEATURES
25
  # =========================
26
- # Adapte ces chemins à ton projet
27
- MODEL_PATH = "model.joblib"
28
- FEATURES_PATH = "features.txt" # un fichier avec 1 feature par ligne (ordre = ordre du training)
29
-
30
- if not os.path.exists(MODEL_PATH):
31
- raise FileNotFoundError(f"Modèle introuvable: {MODEL_PATH}")
32
- model = joblib.load(MODEL_PATH)
33
-
34
- if not os.path.exists(FEATURES_PATH):
35
- raise FileNotFoundError(f"Liste de features introuvable: {FEATURES_PATH}")
36
- with open(FEATURES_PATH, "r", encoding="utf-8") as f:
37
- FEATURE_NAMES = [line.strip() for line in f if line.strip()]
 
 
38
 
39
 
40
  # =========================
41
- # OUTILS
42
  # =========================
43
- def safe_get_json(url, headers=None, params=None, attempts=5, backoff=1.7):
44
- """
45
- GET JSON robuste : retries sur erreurs réseau/5xx/429.
46
- """
47
- last_err = None
48
- for i in range(attempts):
49
- try:
50
- resp = SESSION.get(url, headers=headers, params=params, timeout=TIMEOUT)
51
-
52
- # Rate limit
53
- if resp.status_code == 429:
54
- retry_after = resp.headers.get("Retry-After")
55
- wait = float(retry_after) if retry_after and retry_after.isdigit() else (backoff ** i)
56
- time.sleep(wait)
57
- continue
58
-
59
- # Server errors
60
- if 500 <= resp.status_code < 600:
61
- time.sleep(backoff ** i)
62
- continue
63
-
64
- # Auth / Not found / autres erreurs client
65
- if resp.status_code == 401:
66
- raise RuntimeError("❌ Token FreeSound invalide ou non autorisé (401).")
67
- if resp.status_code == 404:
68
- raise RuntimeError("❌ Sound introuvable (404).")
69
- if resp.status_code >= 400:
70
- raise RuntimeError(f"❌ Erreur HTTP {resp.status_code}: {resp.text[:200]}")
71
-
72
- return resp.json()
73
-
74
- except (requests.exceptions.ConnectionError,
75
- requests.exceptions.Timeout,
76
- requests.exceptions.ChunkedEncodingError) as e:
77
- last_err = e
78
- time.sleep(backoff ** i)
79
- continue
80
- except Exception as e:
81
- # autre exception : on remonte direct
82
- raise
83
-
84
- raise RuntimeError(f"❌ Échec après {attempts} tentatives. Dernière erreur: {repr(last_err)}")
85
-
86
-
87
- def fetch_sound_by_id(sound_id: int, fields: str) -> dict:
88
- """
89
- ✅ Endpoint stable : /sounds/{id}/
90
- """
91
- if not API_TOKEN:
92
- raise RuntimeError("❌ FREESOUND_API_TOKEN manquant (variable d'environnement).")
93
-
94
- url = f"{FREESOUND_API_BASE}/sounds/{int(sound_id)}/"
95
- headers = {"Authorization": f"Token {API_TOKEN}"}
96
- params = {"fields": fields}
97
- return safe_get_json(url, headers=headers, params=params)
98
-
99
-
100
- def flatten_features(ac_analysis: dict) -> dict:
101
- """
102
- FreeSound renvoie souvent un dict de features (ac_analysis).
103
- Ici on aplatit en {feature_name: value} en gardant uniquement
104
- les clés directes (et on ignore les structures trop imbriquées).
105
- """
106
- flat = {}
107
- if not isinstance(ac_analysis, dict):
108
- return flat
109
-
110
- for k, v in ac_analysis.items():
111
- # garde les nombres simples / bool / str courts
112
- if isinstance(v, (int, float, bool)):
113
- flat[k] = float(v) if isinstance(v, bool) else v
114
- elif isinstance(v, str):
115
- # éviter d'injecter des textes énormes
116
- flat[k] = v[:200]
117
- # si liste/dict: on ignore (ou tu peux custom)
118
- return flat
119
-
120
-
121
- def build_feature_df(sound_json: dict, wanted_features: list[str]) -> pd.DataFrame:
122
- """
123
- Construit un DataFrame avec les features réellement utilisées par ton modèle.
124
- """
125
- ac = sound_json.get("ac_analysis", {}) or {}
126
- flat = flatten_features(ac)
127
-
128
- rows = []
129
- for feat in wanted_features:
130
- rows.append({"feature": feat, "value": flat.get(feat, None)})
131
- return pd.DataFrame(rows)
132
-
133
-
134
- def build_model_vector(sound_json: dict, feature_names: list[str]) -> pd.DataFrame:
135
- """
136
- Construit un X (1 ligne) dans le bon ordre de features.
137
- """
138
- ac = sound_json.get("ac_analysis", {}) or {}
139
- flat = flatten_features(ac)
140
-
141
- x = {feat: flat.get(feat, None) for feat in feature_names}
142
- X = pd.DataFrame([x])
143
-
144
- # Option: fillna(0) si ton training le faisait (sinon enlève)
145
- X = X.fillna(0)
146
-
147
- return X
148
-
149
-
150
- def predict_label(sound_json: dict):
151
- X = build_model_vector(sound_json, FEATURE_NAMES)
152
-
153
- # proba si dispo
154
- label = model.predict(X)[0]
155
- proba = None
156
- if hasattr(model, "predict_proba"):
157
- try:
158
- proba = float(model.predict_proba(X).max())
159
- except Exception:
160
- proba = None
161
- return label, proba, X
162
 
163
 
164
- # =========================
165
- # GRADIO LOGIC
166
- # =========================
167
- DEFAULT_FIELDS = "id,name,username,license,tags,previews,ac_analysis"
 
 
 
 
 
168
 
169
- def run(sound_id: str):
170
- sound_id = str(sound_id).strip()
171
- if not sound_id.isdigit():
172
- raise gr.Error("Entre un ID numérique (ex: 123456).")
173
 
174
- sid = int(sound_id)
 
175
 
176
- sound = fetch_sound_by_id(sid, fields=DEFAULT_FIELDS)
 
 
177
 
178
- # Tableau des features utilisées
179
- df_features = build_feature_df(sound, FEATURE_NAMES)
 
180
 
181
- # Prediction
182
- label, proba, X = predict_label(sound)
 
 
183
 
184
- # Infos utiles à afficher
185
- title = sound.get("name", "")
186
- user = sound.get("username", "")
187
- tags = sound.get("tags", [])
188
- preview_url = (sound.get("previews", {}) or {}).get("preview-hq-mp3") or (sound.get("previews", {}) or {}).get("preview-lq-mp3")
189
 
190
- info_md = f"""
191
- ### 🎧 Sound
192
- - **ID**: `{sid}`
193
- - **Nom**: {title}
194
- - **Auteur**: {user}
195
- - **Tags**: {", ".join(tags[:25])}{' …' if len(tags) > 25 else ''}
196
 
197
- ### 🔮 Prédiction
198
- - **Classe prédite**: **{label}**
199
- """ + (f"- **Confiance (max proba)**: `{proba:.3f}`\n" if proba is not None else "")
 
200
 
201
- audio = preview_url if preview_url else None
 
 
 
202
 
203
- # Option: montrer aussi le vecteur X (1 ligne) si tu veux
204
- # df_x = X.T.reset_index().rename(columns={"index": "feature", 0: "value"})
205
- # return info_md, audio, df_features, df_x
 
206
 
207
- return info_md, audio, df_features
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
 
210
  # =========================
211
  # UI
212
  # =========================
213
- with gr.Blocks(title="FreeSound ID → Metadata + Prediction") as demo:
214
- gr.Markdown("# FreeSound : Métadonnées → Features → Prédiction")
 
 
 
215
 
216
- with gr.Row():
217
- sound_id_in = gr.Textbox(label="Sound ID", placeholder="ex: 123456", scale=2)
218
- btn = gr.Button("Récupérer & prédire", scale=1)
219
 
220
- info_out = gr.Markdown()
221
- audio_out = gr.Audio(label="Preview (si dispo)", interactive=False)
222
- features_out = gr.Dataframe(label="Features utilisées (valeurs FreeSound)", interactive=False)
223
 
224
- btn.click(fn=run, inputs=[sound_id_in], outputs=[info_out, audio_out, features_out])
225
- sound_id_in.submit(fn=run, inputs=[sound_id_in], outputs=[info_out, audio_out, features_out])
226
 
227
- if __name__ == "__main__":
228
- demo.launch()
 
2
  import time
3
  import requests
4
  import pandas as pd
5
+ import numpy as np
6
  import gradio as gr
7
+ from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
8
+ from sklearn.feature_extraction.text import HashingVectorizer
9
+ from sklearn.preprocessing import OneHotEncoder
10
+
11
 
12
# =========================
# CONFIG
# =========================
# SECURITY: never commit API tokens to source control — the previous
# revision correctly read the token from the environment. Set
# FREESOUND_API_TOKEN in the host/Space secrets.
API_TOKEN = os.getenv("FREESOUND_API_TOKEN", "").strip()
BASE_URL = "https://freesound.org/apiv2"

# (connect timeout, read timeout) in seconds for every Freesound request.
TIMEOUT = (6, 20)

# Reusable HTTP session; the auth header is attached once and sent on
# every request made through SESSION.
SESSION = requests.Session()
SESSION.headers.update({"Authorization": f"Token {API_TOKEN}"})
 
22
 
23
  # =========================
24
+ # API FREESOUND
25
  # =========================
26
def fetch_sound(sound_id: int):
    """Fetch one Freesound sound's metadata by numeric id.

    Returns the parsed JSON payload from /sounds/{id}/; raises
    RuntimeError on any non-200 HTTP status.
    """
    # Request only the fields the preprocessing step actually consumes.
    wanted_fields = (
        "id,name,username,description,tags,created,"
        "duration,num_downloads,avg_rating,"
        "category,subcategory,license,type"
    )
    response = SESSION.get(
        f"{BASE_URL}/sounds/{sound_id}/",
        params={"fields": wanted_fields},
        timeout=TIMEOUT,
    )
    if response.status_code != 200:
        raise RuntimeError(f"Erreur API {response.status_code}")
    return response.json()
40
 
41
 
42
  # =========================
43
+ # PREPROCESSING (ONLINE)
44
  # =========================
45
def discretize_num_downloads(x):
    """Bucket a raw download count into the Low / Medium / High classes.

    Thresholds (100 and 1000) mirror the offline training pipeline.
    """
    # Same comparison order as the training code: <100, then <1000.
    return "Low" if x < 100 else ("Medium" if x < 1000 else "High")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
 
54
def discretize_avg_rating(x):
    """Bucket an average rating into MissedInfo / Low / Medium / High.

    A rating of exactly 0 (or NaN/missing) means the sound was never
    rated, which gets its own "MissedInfo" class.
    """
    if x == 0 or pd.isna(x):
        return "MissedInfo"
    # Ordered upper bounds: the first one that x falls under decides.
    for bound, label in ((2.5, "Low"), (3.8, "Medium")):
        if x < bound:
            return label
    return "High"
63
 
 
 
 
 
64
 
65
def preprocess_metadata(sound: dict):
    """Turn a raw Freesound metadata dict into model-ready features.

    Mirrors the offline feature engineering: log-scaled numerics, a
    hashed representation of the name, multi-hot tags and one-hot
    categorical columns.

    Returns a flat {feature_name: value} dict.

    Robustness fix: every field is read with .get() and a neutral
    default (the categorical loop already did this, but the numeric and
    text fields used direct indexing and raised KeyError whenever the
    API omitted a field).
    """
    out = {}

    # ---- Target classes (kept for debugging/inspection) ----
    out["num_downloads_class"] = discretize_num_downloads(sound.get("num_downloads", 0))
    out["avg_rating_class"] = discretize_avg_rating(sound.get("avg_rating", 0))

    # ---- Numerics (log1p tames heavy-tailed distributions) ----
    out["duration_log"] = np.log1p(sound.get("duration") or 0)
    out["num_downloads_log"] = np.log1p(sound.get("num_downloads") or 0)

    # ---- Upload date → age in days ----
    created = pd.to_datetime(sound.get("created"), errors="coerce")
    age_days = (pd.Timestamp.now() - created).days if pd.notna(created) else 0
    out["age_days_log"] = np.log1p(age_days)

    # ---- Username length (cheap proxy for a username-frequency feature) ----
    username = sound.get("username") or ""
    out["username_len"] = len(username)

    # ---- Sound name: length + 8-dim hashed bag-of-words ----
    name = (sound.get("name") or "").lower()
    out["name_len"] = len(name)

    # HashingVectorizer is stateless (no fitted vocabulary), so building
    # it per call is correct, just slightly wasteful.
    hv = HashingVectorizer(n_features=8, alternate_sign=False)
    name_vec = hv.transform([name]).toarray()[0]
    for i, v in enumerate(name_vec):
        out[f"name_vec_{i}"] = v

    # ---- Tags: simple multi-hot over at most the first 5 tags ----
    for t in (sound.get("tags") or [])[:5]:
        out[f"tag_{t}"] = 1

    # ---- Categorical fields: one-hot with an "Unknown" bucket ----
    for col in ["category", "subcategory", "license", "type"]:
        val = sound.get(col) or "Unknown"
        out[f"{col}_{val}"] = 1

    return out
104
+
105
+
106
+ # =========================
107
+ # PIPELINE GRADIO
108
+ # =========================
109
def run(sound_id):
    """Gradio callback: fetch one sound and return (raw, processed) tables."""
    if not str(sound_id).isdigit():
        raise gr.Error("ID invalide")

    sound = fetch_sound(int(sound_id))

    # Side-by-side comparison: raw API payload vs. engineered features,
    # each rendered as a one-column "value" DataFrame indexed by key.
    raw_view = pd.DataFrame.from_dict(sound, orient="index", columns=["value"])
    feature_view = pd.DataFrame.from_dict(
        preprocess_metadata(sound), orient="index", columns=["value"]
    )
    return raw_view, feature_view
123
 
124
 
125
# =========================
# UI
# =========================
with gr.Blocks(title="Metadata preprocessing FreeSound") as demo:
    gr.Markdown("""
    # 🎧 FreeSound – Prétraitement Metadata
    **Objectif :** visualiser les features **avant** et **après** preprocessing
    """)

    sound_id = gr.Textbox(label="Sound ID", placeholder="ex: 123456")
    btn = gr.Button("Analyser")

    with gr.Row():
        before = gr.Dataframe(label="AVANT preprocessing (brut FreeSound)")
        after = gr.Dataframe(label="APRÈS preprocessing (features modèle)")

    btn.click(run, sound_id, [before, after])

# Guard the launch (as the previous revision did) so importing this
# module does not start a server; running the script — including on
# Hugging Face Spaces, which executes app.py as __main__ — still does.
if __name__ == "__main__":
    demo.launch()