IKRAMELHADI commited on
Commit
49de9df
·
1 Parent(s): 4ad7378
Files changed (1) hide show
  1. app.py +282 -112
app.py CHANGED
@@ -1,143 +1,313 @@
 
 
 
1
  import os
 
2
  import time
3
- import requests
4
- import pandas as pd
 
5
  import numpy as np
 
 
6
  import gradio as gr
7
- from sklearn.preprocessing import KBinsDiscretizer, StandardScaler
8
- from sklearn.feature_extraction.text import HashingVectorizer
9
- from sklearn.preprocessing import OneHotEncoder
10
-
11
-
12
- # =========================
13
- # CONFIG
14
- # =========================
15
- API_TOKEN = "***REDACTED***"  # NOTE(review): a live API credential was committed here — revoke it and load from an environment variable instead
16
- BASE_URL = "https://freesound.org/apiv2"
17
- TIMEOUT = (6, 20)
18
-
19
- SESSION = requests.Session()
20
- SESSION.headers.update({"Authorization": f"Token {API_TOKEN}"})
21
-
22
-
23
- # =========================
24
- # API FREESOUND
25
- # =========================
26
- def fetch_sound(sound_id: int):
27
- url = f"{BASE_URL}/sounds/{sound_id}/"
28
- params = {
29
- "fields": (
30
- "id,name,username,description,tags,created,"
31
- "duration,num_downloads,avg_rating,"
32
- "category,subcategory,license,type"
33
- )
34
- }
35
-
36
- r = SESSION.get(url, params=params, timeout=TIMEOUT)
37
- if r.status_code != 200:
38
- raise RuntimeError(f"Erreur API {r.status_code}")
39
- return r.json()
40
-
41
-
42
- # =========================
43
- # PREPROCESSING (ONLINE)
44
- # =========================
45
- def discretize_num_downloads(x):
46
- if x < 100:
47
- return "Low"
48
- elif x < 1000:
49
- return "Medium"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  else:
51
- return "High"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
 
53
 
54
- def discretize_avg_rating(x):
55
- if x == 0 or pd.isna(x):
56
- return "MissedInfo"
57
- elif x < 2.5:
58
- return "Low"
59
- elif x < 3.8:
60
- return "Medium"
61
- else:
62
- return "High"
63
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
- def preprocess_metadata(sound: dict):
66
- out = {}
 
 
67
 
68
- # ---- Targets (debug) ----
69
- out["num_downloads_class"] = discretize_num_downloads(sound["num_downloads"])
70
- out["avg_rating_class"] = discretize_avg_rating(sound["avg_rating"])
 
 
71
 
72
- # ---- Numériques ----
73
- out["duration_log"] = np.log1p(sound["duration"])
74
- out["num_downloads_log"] = np.log1p(sound["num_downloads"])
 
 
75
 
76
- # ---- Created age_days ----
77
- created = pd.to_datetime(sound["created"], errors="coerce")
78
- age_days = (pd.Timestamp.now() - created).days if pd.notna(created) else 0
79
- out["age_days_log"] = np.log1p(age_days)
 
 
 
 
 
80
 
81
- # ---- Username freq (proxy) ----
82
- out["username_len"] = len(sound["username"]) if sound["username"] else 0
 
 
83
 
84
- # ---- Name ----
85
- name = sound["name"].lower()
86
- out["name_len"] = len(name)
 
 
 
 
87
 
88
- hv = HashingVectorizer(n_features=8, alternate_sign=False)
89
- name_vec = hv.transform([name]).toarray()[0]
90
- for i, v in enumerate(name_vec):
91
- out[f"name_vec_{i}"] = v
92
 
93
- # ---- Tags (simple multi-hot) ----
94
- tags = sound["tags"][:5] # limiter
95
- for t in tags:
96
- out[f"tag_{t}"] = 1
97
 
98
- # ---- Catégories ----
99
- for col in ["category", "subcategory", "license", "type"]:
100
- val = sound.get(col) or "Unknown"
101
- out[f"{col}_{val}"] = 1
102
 
103
- return out
104
 
105
 
106
- # =========================
107
- # PIPELINE GRADIO
108
- # =========================
109
- def run(sound_id):
110
- if not str(sound_id).isdigit():
111
- raise gr.Error("ID invalide")
112
 
113
- sound = fetch_sound(int(sound_id))
 
 
114
 
115
- # AVANT
116
- before_df = pd.DataFrame.from_dict(sound, orient="index", columns=["value"])
 
117
 
118
- # APRÈS
119
- processed = preprocess_metadata(sound)
120
- after_df = pd.DataFrame.from_dict(processed, orient="index", columns=["value"])
121
 
122
- return before_df, after_df
123
 
 
124
 
125
- # =========================
126
- # UI
127
- # =========================
128
- with gr.Blocks(title="Metadata preprocessing FreeSound") as demo:
129
- gr.Markdown("""
130
- # 🎧 FreeSound – Prétraitement Metadata
131
- **Objectif :** visualiser les features **avant** et **après** preprocessing
132
- """)
133
 
134
- sound_id = gr.Textbox(label="Sound ID", placeholder="ex: 123456")
135
- btn = gr.Button("Analyser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
136
 
137
  with gr.Row():
138
- before = gr.Dataframe(label="AVANT preprocessing (brut FreeSound)")
139
- after = gr.Dataframe(label="APRÈS preprocessing (features modèle)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
140
 
141
- btn.click(run, sound_id, [before, after])
 
 
 
 
142
 
143
- demo.launch()
 
 
1
+ # freesound_preprocess_ui.py
2
+ # -*- coding: utf-8 -*-
3
+
4
  import os
5
+ import re
6
  import time
7
+ import urllib.parse
8
+ from typing import Any, Dict, Tuple, Optional, List
9
+
10
  import numpy as np
11
+ import pandas as pd
12
+ import requests
13
  import gradio as gr
14
+
15
+ from sklearn.feature_extraction.text import TfidfVectorizer
16
+ from sklearn.preprocessing import StandardScaler
17
+
18
+
19
+ # ----------------------------
20
+ # Robust network helpers
21
+ # ----------------------------
22
+
23
+ DEFAULT_TIMEOUT = 20
24
+
25
def _session() -> requests.Session:
    """Build a requests session pre-loaded with browser-like default headers."""
    sess = requests.Session()
    default_headers = {
        "User-Agent": "Mozilla/5.0 (freesound-metadata-preprocess/1.0)",
        "Accept": "application/json,text/plain,*/*",
        "Connection": "keep-alive",
    }
    sess.headers.update(default_headers)
    return sess
33
+
34
def fetch_json_with_retry(
    url: str,
    headers: Dict[str, str],
    max_retries: int = 6,
    base_sleep: float = 0.8,
    timeout: int = DEFAULT_TIMEOUT,
) -> Dict[str, Any]:
    """
    Robust JSON GET: retries on 429 (rate limit), 5xx and network failures
    with exponential backoff.

    Fix vs previous version: client errors other than 429 (401 bad token,
    404 unknown id, ...) are NOT retryable — they used to be swallowed by a
    blanket `except Exception` and retried max_retries times (~50 s wasted
    on a simple 404). They now fail fast with the original HTTPError.

    Args:
        url: endpoint to GET.
        headers: extra headers (e.g. the FreeSound Token authorization).
        max_retries: attempts before giving up on retryable failures.
        base_sleep: base of the exponential backoff (seconds).
        timeout: per-request timeout (seconds).

    Returns:
        The decoded JSON payload as a dict.

    Raises:
        requests.HTTPError: for non-retryable 4xx responses.
        RuntimeError: when every retry attempt failed.
    """
    sess = _session()
    last_err: Optional[Exception] = None

    for attempt in range(max_retries):
        try:
            resp = sess.get(url, headers=headers, timeout=timeout)

            # Retryable statuses: rate limit (429) or unstable server (5xx).
            if resp.status_code == 429 or resp.status_code >= 500:
                time.sleep(base_sleep * (2 ** attempt))
                continue

            # Any remaining 4xx is a caller error — retrying cannot help.
            resp.raise_for_status()
            return resp.json()

        except requests.HTTPError:
            # Raised by raise_for_status() on a non-retryable client error.
            raise
        except requests.RequestException as e:
            # Connection reset, timeout, DNS failure, ... — worth retrying.
            last_err = e
            time.sleep(base_sleep * (2 ** attempt))

    raise RuntimeError(f"Échec requête après {max_retries} essais. Dernière erreur: {last_err}")
69
+
70
+
71
+ # ----------------------------
72
+ # URL -> sound_id -> API endpoint
73
+ # ----------------------------
74
+
75
def sound_id_from_freesound_page(url: str) -> int:
    """
    Extract the numeric sound id from a FreeSound sound-page URL:
    https://freesound.org/people/.../sounds/<id>/

    A bare numeric id pasted instead of a URL is accepted as well.

    Raises:
        ValueError: when neither a page URL nor a bare id is recognized.
    """
    candidate = urllib.parse.unquote(url.strip())

    match = re.search(r"freesound\.org\/.*\/sounds\/(\d+)\/?", candidate)
    if match:
        return int(match.group(1))

    # Convenience: the user pasted just the id, not the full page URL.
    if re.fullmatch(r"\d+", candidate):
        return int(candidate)

    raise ValueError("URL non reconnue. Colle l’URL FreeSound du son (page), ex: .../sounds/844708/")
90
+
91
def api_url_from_sound_id(sound_id: int) -> str:
    """Return the FreeSound APIv2 sound-detail endpoint for *sound_id*."""
    return "https://freesound.org/apiv2/sounds/{}/".format(sound_id)
93
+
94
+
95
+ # ----------------------------
96
+ # Preprocessing helpers
97
+ # ----------------------------
98
+
99
def clean_tags(tags: Any) -> str:
    """
    Normalise tags into one space-separated, lowercase string.

    - accepts a list or a raw string (None -> "")
    - percent-decodes (%3B etc.)
    - splits on ';', ',', '|' and whitespace
    - drops tokens shorter than 2 characters
    - removes duplicates while keeping first-seen order
    """
    if tags is None:
        return ""

    raw = " ".join(str(t) for t in tags) if isinstance(tags, list) else str(tags)

    raw = urllib.parse.unquote(raw)
    for sep in (",", ";", "|"):
        raw = raw.replace(sep, " ")
    raw = re.sub(r"\s+", " ", raw).strip().lower()

    deduped: List[str] = []
    seen = set()
    for tok in raw.split(" "):
        # len >= 2 also discards the empty token from splitting "".
        if len(tok) >= 2 and tok not in seen:
            seen.add(tok)
            deduped.append(tok)
    return " ".join(deduped)
130
+
131
def clean_text(x: Any) -> str:
    """Percent-decode, lowercase and collapse whitespace; None becomes ""."""
    if x is None:
        return ""
    decoded = urllib.parse.unquote(str(x))
    return re.sub(r"\s+", " ", decoded.lower()).strip()
139
+
140
def safe_num(x: Any) -> float:
    """Coerce *x* to float; None or unparseable values become 0.0."""
    if x is None:
        return 0.0
    try:
        return float(x)
    except Exception:
        return 0.0
147
+
148
def safe_len_list(x: Any) -> int:
    """Length of *x* when it is a list, 0 for any other type (incl. str)."""
    return len(x) if isinstance(x, list) else 0
152
+
153
+
154
+ # ----------------------------
155
+ # Extract raw features (before)
156
+ # ----------------------------
157
+
158
# Metadata fields requested from the FreeSound API, in display order.
RAW_COLUMNS = [
    "id", "name", "username", "license", "created",
    "description", "tags",
    "duration", "samplerate", "bitrate", "bitdepth", "channels",
    "filesize", "type",
    "num_downloads", "num_ratings", "avg_rating",
]

def extract_raw_df(sound_json: Dict[str, Any]) -> pd.DataFrame:
    """
    Project the raw API payload onto RAW_COLUMNS as a one-row DataFrame.

    Fields missing from the payload (they vary with endpoint/permissions)
    come back as None via dict.get.

    Fix vs previous version: the `if "tags" not in row` fallback was dead
    code — the comprehension below always creates every RAW_COLUMNS key.
    """
    row = {k: sound_json.get(k) for k in RAW_COLUMNS}
    return pd.DataFrame([row])
174
 
 
 
 
 
 
 
 
 
 
175
 
176
+ # ----------------------------
177
+ # Build "after preprocessing" features
178
+ # ----------------------------
179
+
180
def build_after_features(raw_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """
    Derive the "after preprocessing" views from the raw metadata frame.

    Returns:
        after_readable_df: interpretable columns (cleaned + derived)
        after_vector_df:   vectorised features (scaled numerics + TF-IDF),
                           useful to "see" the model-side embedding.

    Fix vs previous version: TfidfVectorizer raises
    ``ValueError: empty vocabulary`` when the combined text contains no
    usable token (no tags, numeric-only name, empty description). That
    case now degrades to a zero-width text-feature block instead of
    crashing the whole analysis.
    """
    df = raw_df.copy()

    # ---- text cleanups ----
    df["tags_clean"] = df["tags"].apply(clean_tags)
    df["name_clean"] = df["name"].apply(clean_text)
    df["desc_clean"] = df["description"].apply(clean_text)

    # ---- readable derived features ----
    df["num_tags"] = df["tags"].apply(safe_len_list)
    df["name_len"] = df["name_clean"].apply(len)
    df["desc_len"] = df["desc_clean"].apply(len)
    df["text_all"] = (
        df["name_clean"].fillna("") + " " + df["desc_clean"].fillna("") + " " + df["tags_clean"].fillna("")
    ).str.strip()

    # Coerce every numeric column defensively (missing fields arrive as None).
    numeric_cols = [
        "duration", "samplerate", "bitrate", "bitdepth", "channels",
        "filesize", "num_downloads", "num_ratings", "avg_rating",
        "num_tags", "name_len", "desc_len",
    ]
    for c in numeric_cols:
        df[c] = df[c].apply(safe_num)

    # 1) human-readable view
    after_readable_cols = [
        "id", "type", "license", "created",
        "name_clean", "tags_clean",
        "duration", "samplerate", "channels", "filesize",
        "num_downloads", "num_ratings", "avg_rating",
        "num_tags", "name_len", "desc_len",
    ]
    after_readable_df = df[after_readable_cols].copy()

    # 2) text vectorisation (TF-IDF) — on a single sample you simply see
    #    the terms present. Guarded against an empty vocabulary.
    tfidf = TfidfVectorizer(max_features=60, ngram_range=(1, 2))
    try:
        X_text_dense = tfidf.fit_transform(df["text_all"].fillna("")).toarray()
        text_feature_names = [f"tfidf:{t}" for t in tfidf.get_feature_names_out()]
    except ValueError:
        # No token survived cleaning — emit no text features rather than crash.
        X_text_dense = np.empty((len(df), 0))
        text_feature_names = []

    # Numeric standardisation (on one sample this centers everything to 0 —
    # expected for a demo; with a batch it becomes a real z-score).
    scaler = StandardScaler()
    X_num = scaler.fit_transform(df[numeric_cols].to_numpy())
    num_feature_names = [f"num:{c}" for c in numeric_cols]

    # Assemble everything into one displayable frame.
    all_features = np.concatenate([X_num, X_text_dense], axis=1)
    after_vector_df = pd.DataFrame(all_features, columns=num_feature_names + text_feature_names)

    return after_readable_df, after_vector_df
236
 
237
 
238
+ # ----------------------------
239
+ # Main analysis function
240
+ # ----------------------------
 
 
 
241
 
242
def analyze(url: str, api_key: str) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """
    Full pipeline: FreeSound page URL -> (raw metadata df, readable
    feature df, top non-zero vectorised features df).

    The API key comes from the UI field or, as a fallback, from the
    FREESOUND_API_KEY environment variable.

    Raises:
        ValueError: empty URL or no API key available.
    """
    if not url or not url.strip():
        raise ValueError("Colle l’URL du son FreeSound.")

    key = (api_key or "").strip() or os.environ.get("FREESOUND_API_KEY", "").strip()
    if not key:
        raise ValueError("Il faut une clé FreeSound API. Mets-la dans le champ 'API key' ou dans FREESOUND_API_KEY.")

    api_url = api_url_from_sound_id(sound_id_from_freesound_page(url))
    sound_json = fetch_json_with_retry(api_url, headers={"Authorization": f"Token {key}"})

    before_df = extract_raw_df(sound_json)
    after_readable_df, after_vector_df = build_after_features(before_df)

    # For a single sample, showing only the non-zero features sorted by
    # magnitude is far clearer than the full sparse vector.
    sample = after_vector_df.loc[0]
    top = sample[sample != 0].sort_values(key=lambda s: np.abs(s), ascending=False).head(30)
    top_df = top.reset_index()
    top_df.columns = ["feature", "value"]

    return before_df, after_readable_df, top_df
269
+
270
+
271
+ # ----------------------------
272
+ # Gradio UI
273
+ # ----------------------------
274
+
275
# ---- Gradio UI: one URL + key in, three dataframes out ----
with gr.Blocks(title="FreeSound - Prétraitement Metadata") as demo:
    gr.Markdown("## 🎧 FreeSound – Prétraitement Metadata\n"
                "Objectif : **visualiser les features AVANT et APRÈS preprocessing**.\n\n"
                "- Entrée = **URL du son FreeSound** (page)\n"
                "- Sorties = **tableau avant**, **tableau après**, **top features (vectorisées)**")

    with gr.Row():
        url_in = gr.Textbox(
            label="URL du son FreeSound",
            placeholder="https://freesound.org/people/.../sounds/844708/",
            value="",
        )
        api_in = gr.Textbox(
            label="API key (Token) FreeSound (optionnel si FREESOUND_API_KEY est set)",
            placeholder="Colle ta clé ici (Token ...)",
            type="password",  # never echo the credential in the UI
            value="",
        )

    btn = gr.Button("Analyser")

    gr.Markdown("### Avant (raw metadata)")
    before_out = gr.Dataframe(interactive=False, wrap=True)

    gr.Markdown("### Après (nettoyé + features dérivées lisibles)")
    after_out = gr.Dataframe(interactive=False, wrap=True)

    gr.Markdown("### Top features après vectorisation (num + TF-IDF) — valeurs non nulles")
    top_out = gr.Dataframe(interactive=False, wrap=True)

    btn.click(
        fn=analyze,
        inputs=[url_in, api_in],
        outputs=[before_out, after_out, top_out],
    )

if __name__ == "__main__":
    # 0.0.0.0 so the app is reachable inside a container (HF Space).
    demo.launch(server_name="0.0.0.0", server_port=7860)