SonicaB commited on
Commit
a28a011
·
verified ·
1 Parent(s): aedd0b6

Upload folder using huggingface_hub

Browse files
Files changed (4) hide show
  1. README.md +25 -1
  2. README.space-api.md +25 -1
  3. fusion-app/app_local.py +267 -34
  4. requirements.txt +2 -0
README.md CHANGED
@@ -4,6 +4,30 @@ emoji: 🎬
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- app_file: fusion-app/app_api.py
8
  pinned: false
 
 
 
9
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
+ app_file: fusion-app/app_local.py
8
  pinned: false
9
+ hf_oauth: true
10
+ hf_oauth_scopes:
11
+ - inference-api
12
  ---
13
+
14
+ # 🎬 Scene Mood Classifier - Local & API Mode
15
+
16
+ Upload a video or image+audio pair to classify the mood/atmosphere of the scene.
17
+
18
+ ## This Space
19
+
20
+ This space demonstrates the **API-based** version using HuggingFace Inference API.
21
+
22
+ **Note**: You can also use **Local Mode** without signing in - just uncheck the "Use API Mode" checkbox.
23
+
24
+ ## How to Use
25
+
26
+ 1. **Sign in with Hugging Face** (click the button in the header)
27
+ 2. The "Use API Mode" checkbox is available - check it to use API mode
28
+ 3. Upload your video or image+audio pair
29
+ 4. Click "Analyze"
30
+
31
+ ## About
32
+
33
+ This app demonstrates multimodal fusion for scene mood classification using CLIP (vision) and Wav2Vec2 (audio) models via the HuggingFace Inference API.
README.space-api.md CHANGED
@@ -4,6 +4,30 @@ emoji: 🎬
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
- app_file: fusion-app/app_api.py
8
  pinned: false
 
 
 
9
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4
  colorFrom: indigo
5
  colorTo: purple
6
  sdk: gradio
7
+ app_file: fusion-app/app_local.py
8
  pinned: false
9
+ hf_oauth: true
10
+ hf_oauth_scopes:
11
+ - inference-api
12
  ---
13
+
14
+ # 🎬 Scene Mood Classifier - Local & API Mode
15
+
16
+ Upload a video or image+audio pair to classify the mood/atmosphere of the scene.
17
+
18
+ ## This Space
19
+
20
+ This space demonstrates the **API-based** version using HuggingFace Inference API.
21
+
22
+ **Note**: You can also use **Local Mode** without signing in - just uncheck the "Use API Mode" checkbox.
23
+
24
+ ## How to Use
25
+
26
+ 1. **Sign in with Hugging Face** (click the button in the header)
27
+ 2. The "Use API Mode" checkbox is available - check it to use API mode
28
+ 3. Upload your video or image+audio pair
29
+ 4. Click "Analyze"
30
+
31
+ ## About
32
+
33
+ This app demonstrates multimodal fusion for scene mood classification using CLIP (vision) and Wav2Vec2 (audio) models via the HuggingFace Inference API.
fusion-app/app_local.py CHANGED
@@ -1,16 +1,143 @@
1
  import gradio as gr
2
- import json
 
3
  from pathlib import Path
 
 
4
  from utils_media import video_to_frame_audio, load_audio_16k, log_inference
5
  from fusion import clip_image_probs, wav2vec2_embed_energy, wav2vec2_zero_shot_probs, audio_prior_from_rms, fuse_probs, top1_label_from_probs
 
6
 
7
  HERE = Path(__file__).parent
8
  lables_PATH = HERE / "labels.json"
9
  CSV_API = HERE / "runs_api.csv"
 
10
  lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
 
11
 
12
- # lables = [x ["name"] for x in json.load(Path("fusion-app/labels.json").read_text())["labels"]]
 
 
13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  def predict_vid(video, alpha=0.7):
15
  import time, numpy as np
16
  t0 = time.time()
@@ -45,10 +172,10 @@ def predict_vid(video, alpha=0.7):
45
  "duration_s": round(float(meta.get("duration_s") or 0.0), 2),
46
  }
47
  print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
48
- log_inference(engine="local", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path="runs_local.csv")
49
  return pred, probs, lat
50
 
51
- def predict_image_audio(image, audio_path, alpha=0.7):
52
  import time, numpy as np
53
  t0 = time.time()
54
  wave = load_audio_16k(audio_path)
@@ -78,46 +205,152 @@ def predict_image_audio(image, audio_path, alpha=0.7):
78
  "rms": round(float(rms), 4),
79
  }
80
  print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
81
- log_inference(engine="local", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path="runs_local.csv")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  return pred, probs, lat
83
 
 
 
 
84
 
85
- with gr.Blocks(title="Scene Mood Detection") as demo:
86
- gr.Markdown("# Scene Mood Classifier - Local \nUpload a short **video** or an **image + audio** pair.")
87
- with gr.Tab("Video"):
88
- v = gr.Video(sources=["upload"], height=240)
89
 
 
 
 
90
 
91
- # Chat GPT : Create Gradio slider for alpha value with label "Fusion weight α (image ↔ audio)" and info "α=1 trusts image only; α=0 trusts audio only."
92
- alpha_v = gr.Slider(
93
- minimum=0.0, maximum=1.0, value=0.7, step=0.05,
94
- label="Fusion weight α (image ↔ audio)",
95
- info="α=1 trusts image only; α=0 trusts audio only."
96
- )
97
-
98
 
99
- btn_v = gr.Button("Analyze")
100
- out_v1 = gr.Label(label="Prediction")
101
- out_v2 = gr.JSON(label="Probabilities")
102
- out_v3 = gr.JSON(label="Latency (ms)")
103
- btn_v.click(predict_vid, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])
104
 
105
- with gr.Tab("Image + Audio"):
106
- img = gr.Image(type="pil", height=240)
107
- aud = gr.Audio(sources=["upload"], type="filepath")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
 
109
- # Chat GPT : Create Gradio slider for alpha value with label "Fusion weight α (image audio)" and info "α=1 trusts image only; α=0 trusts audio only."
110
- alpha_ia = gr.Slider(
111
- minimum=0.0, maximum=1.0, value=0.7, step=0.05,
112
- label="Fusion weight α (image audio)",
113
- info="α=1 trusts image only; α=0 trusts audio only."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
114
  )
115
 
116
- btn_ia = gr.Button("Analyze")
117
- out_i1 = gr.Label(label="Prediction")
118
- out_i2 = gr.JSON(label="Probabilities")
119
- out_i3 = gr.JSON(label="Latency (ms)")
120
- btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
  if __name__ == "__main__":
123
  demo.launch()
 
1
  import gradio as gr
2
+ import json, os, time, requests, io
3
+ import numpy as np
4
  from pathlib import Path
5
+ from PIL import Image
6
+ from pydub import AudioSegment
7
  from utils_media import video_to_frame_audio, load_audio_16k, log_inference
8
  from fusion import clip_image_probs, wav2vec2_embed_energy, wav2vec2_zero_shot_probs, audio_prior_from_rms, fuse_probs, top1_label_from_probs
9
+ from fusion import _ensure_audio_prototypes, _proto_embs
10
 
11
  HERE = Path(__file__).parent
12
  lables_PATH = HERE / "labels.json"
13
  CSV_API = HERE / "runs_api.csv"
14
+ CSV_LOCAL = HERE / "runs_local.csv"
15
  lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
16
+ prompts = [x["prompt"] for x in json.loads(lables_PATH.read_text())["labels"]]
17
 
18
+ # API Models
19
+ CLIP_MODEL = "openai/clip-vit-base-patch32"
20
+ W2V2_MODEL = "facebook/wav2vec2-base"
21
 
22
+ # Global HF Token - will be set by user login
23
+ USER_HF_TOKEN = None
24
+
25
+ # ============= API Helper Functions =============
26
def _img_to_jpeg_bytes(pil: Image.Image) -> bytes:
    """Serialize a PIL image to JPEG (quality 90) and return the raw bytes."""
    sink = io.BytesIO()
    rgb = pil.convert("RGB")  # JPEG has no alpha; force 3-channel
    rgb.save(sink, format="JPEG", quality=90)
    return sink.getvalue()
30
+
31
def clip_api_probs(pil: Image.Image, prompts_list=None):
    """Zero-shot classify an image against text prompts via the HF Inference API.

    Args:
        pil: input PIL image.
        prompts_list: candidate label prompts; defaults to the module-level
            `prompts` loaded from labels.json.

    Returns:
        float32 probability vector aligned with `prompts_list`; falls back to
        a uniform distribution on any API failure.

    Raises:
        RuntimeError: when no HF token has been set (API mode requires sign-in).
    """
    if prompts_list is None:
        prompts_list = prompts
    if USER_HF_TOKEN is None:
        raise RuntimeError("Please sign in with your HuggingFace token to use API mode.")

    try:
        jpeg = _img_to_jpeg_bytes(pil)
        endpoint = f"https://api-inference.huggingface.co/models/{CLIP_MODEL}"
        auth = {"Authorization": f"Bearer {USER_HF_TOKEN}"}
        params = {
            "candidate_labels": prompts_list,
            "hypothesis_template": "{}"
        }

        resp = requests.post(
            endpoint,
            headers=auth,
            files={"file": ("image.jpg", jpeg, "image/jpeg")},
            data={"inputs": "", "parameters": json.dumps(params)},
            timeout=60,
        )
        resp.raise_for_status()
        body = resp.json()

        # Expected answer shape is a list of {label, score}; anything else
        # degrades to a uniform score map.
        if isinstance(body, list) and len(body) > 0:
            score_map = {entry["label"]: entry["score"] for entry in body}
        else:
            score_map = {p: 1.0 / len(prompts_list) for p in prompts_list}

        vec = np.array([score_map.get(p, 0.0) for p in prompts_list], dtype=np.float32)
        total = vec.sum()
        if total > 0:
            return vec / total
        return np.ones_like(vec) / len(vec)
    except Exception as e:
        # Best-effort fallback: log and return a uniform distribution so the
        # fusion pipeline keeps working even when the API call fails.
        print(f"CLIP API error: {e}")
        return np.ones(len(prompts_list), dtype=np.float32) / len(prompts_list)
66
+
67
+ def _wave_float32_to_wav_bytes(wave_16k: np.ndarray, sr=16000) -> bytes:
68
+ samples = (np.clip(wave_16k, -1, 1) * 32767.0).astype(np.int16)
69
+ seg = AudioSegment(samples.tobytes(), frame_rate=sr, sample_width=2, channels=1)
70
+ out = io.BytesIO()
71
+ seg.export(out, format="wav")
72
+ return out.getvalue()
73
+
74
def w2v2_api_embed(wave_16k: np.ndarray) -> np.ndarray:
    """Embed an audio clip through the Wav2Vec2 Inference API.

    Posts the clip as WAV bytes, mean-pools the returned feature sequence
    over time, and L2-normalizes the result.

    Raises:
        RuntimeError: when no HF token has been set (API mode requires sign-in).
    """
    if USER_HF_TOKEN is None:
        raise RuntimeError("Please sign in with your HuggingFace token to use API mode.")

    payload = _wave_float32_to_wav_bytes(wave_16k)
    endpoint = f"https://api-inference.huggingface.co/models/{W2V2_MODEL}"
    resp = requests.post(
        endpoint,
        headers={"Authorization": f"Bearer {USER_HF_TOKEN}"},
        data=payload,
        timeout=60,
    )
    resp.raise_for_status()

    feats = np.asarray(resp.json(), dtype=np.float32)
    if feats.ndim == 3:
        # Drop a leading batch axis if the API returned one.
        feats = feats[0]
    pooled = feats.mean(axis=0)
    return (pooled / (np.linalg.norm(pooled) + 1e-8)).astype(np.float32)
89
+
90
+ _PROTO_EMBS_API = None
91
+
92
+ def _synthesize_audio_prototypes_api(sr=16000, dur=2.0):
93
+ def _sine(sr, freq, dur, amp=0.2):
94
+ t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
95
+ return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
96
+
97
+ def _burst_noise(sr, dur, amp=0.2):
98
+ x = np.random.randn(int(sr*dur)).astype(np.float32)
99
+ n = x.size
100
+ env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
101
+ env = np.pad(env, (0, n-env.size), constant_values=1.0)
102
+ env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
103
+ return (amp * x * env).astype(np.float32)
104
+
105
+ def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
106
+ third = 3/2 if minor else 4/3
107
+ w = (_sine(sr, base, dur, amp)
108
+ + _sine(sr, base*third, dur, amp*0.7)
109
+ + _sine(sr, base*2, dur, amp*0.5))
110
+ return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
111
+
112
+ return {
113
+ "calm": _sine(sr, 220, dur, amp=0.08),
114
+ "energetic": _burst_noise(sr, dur, amp=0.35),
115
+ "suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),
116
+ "joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),
117
+ "sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),
118
+ }
119
+
120
def _ensure_proto_embs_api():
    """Lazily build and cache API-side embeddings of the audio prototypes.

    Populates the module-level _PROTO_EMBS_API on first use; subsequent calls
    are no-ops. The cache is assigned only after all embeddings succeed, so a
    failed API call leaves it unset for a later retry.
    """
    global _PROTO_EMBS_API
    if _PROTO_EMBS_API is None:
        _PROTO_EMBS_API = {
            lbl: w2v2_api_embed(wav)
            for lbl, wav in _synthesize_audio_prototypes_api().items()
        }
130
+
131
def w2v2_api_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot audio mood probabilities via prototype similarity.

    Embeds `wave_16k` through the API, takes dot products against the cached
    prototype embeddings (one per label in `lables`), and softmaxes the
    similarities at the given temperature.
    """
    _ensure_proto_embs_api()
    query = w2v2_api_embed(wave_16k)
    scores = np.array(
        [float(np.dot(query, _PROTO_EMBS_API[lbl])) for lbl in lables],
        dtype=np.float32,
    )
    logits = scores / max(1e-6, float(temperature))
    logits = logits - logits.max()  # numerical stability before exp
    weights = np.exp(logits)
    weights /= (weights.sum() + 1e-8)
    return weights.astype(np.float32)
139
+
140
+ # ============= Local Prediction Functions =============
141
  def predict_vid(video, alpha=0.7):
142
  import time, numpy as np
143
  t0 = time.time()
 
172
  "duration_s": round(float(meta.get("duration_s") or 0.0), 2),
173
  }
174
  print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
175
+ log_inference(engine="local", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path=CSV_LOCAL)
176
  return pred, probs, lat
177
 
178
+ def predict_image_audio_local(image, audio_path, alpha=0.7):
179
  import time, numpy as np
180
  t0 = time.time()
181
  wave = load_audio_16k(audio_path)
 
205
  "rms": round(float(rms), 4),
206
  }
207
  print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
208
+ log_inference(engine="local", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path=CSV_LOCAL)
209
+ return pred, probs, lat
210
+
211
+ # ============= API Prediction Functions =============
212
def predict_vid_api(video, alpha=0.7):
    """API-mode video pipeline.

    Extracts frames + audio, scores frames with CLIP and audio with Wav2Vec2
    (both via the Inference API), fuses the two distributions with weight
    `alpha`, and logs the run to the API CSV.

    Returns:
        (prediction label, {label: prob} dict, latency/metadata dict); or an
        error triple when no HF token is available.
    """
    if USER_HF_TOKEN is None:
        return "Error: Please sign in first", {"error": "HuggingFace token required"}, {"error": "No token"}

    started = time.time()
    frames, wave, meta = video_to_frame_audio(video, target_frames=24, fps_cap=2.0)

    # Vision: average CLIP probabilities across sampled frames.
    img_start = time.time()
    frame_probs = [clip_api_probs(pil) for pil in frames]
    p_img = np.mean(np.stack(frame_probs, axis=0), axis=0)
    img_elapsed = time.time() - img_start

    # Audio: zero-shot probabilities from prototype similarity.
    aud_start = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    aud_elapsed = time.time() - aud_start

    # Fusion of the two modality distributions.
    fuse_start = time.time()
    fused = fuse_probs(p_img, p_aud, alpha=float(alpha))
    fuse_elapsed = time.time() - fuse_start

    pred = top1_label_from_probs(fused)
    probs = {k: round(float(v), 4) for k, v in zip(lables, fused)}
    lat = {
        "t_image_ms": int(img_elapsed*1000),
        "t_audio_ms": int(aud_elapsed*1000),
        "t_fuse_ms": int(fuse_elapsed*1000),
        "t_total_ms": int((time.time()-started)*1000),
        "n_frames": meta.get("n_frames"),
        "fps_used": meta.get("fps_used"),
        "duration_s": meta.get("duration_s"),
    }
    log_inference(engine="api", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path=CSV_API)
    return pred, probs, lat
245
 
246
def predict_image_audio_api(image, audio_path, alpha=0.7):
    """API-mode image+audio pipeline.

    Scores the image with CLIP and the audio file with Wav2Vec2 (both via the
    Inference API), fuses the two distributions with weight `alpha`, and logs
    the run to the API CSV.

    Returns:
        (prediction label, {label: prob} dict, latency dict); or an error
        triple when no HF token is available.
    """
    if USER_HF_TOKEN is None:
        return "Error: Please sign in first", {"error": "HuggingFace token required"}, {"error": "No token"}

    started = time.time()
    wave = load_audio_16k(audio_path)

    img_start = time.time()
    p_img = clip_api_probs(image)
    img_elapsed = time.time() - img_start

    aud_start = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    aud_elapsed = time.time() - aud_start

    fuse_start = time.time()
    fused = fuse_probs(p_img, p_aud, alpha=float(alpha))
    fuse_elapsed = time.time() - fuse_start

    pred = top1_label_from_probs(fused)
    probs = {k: round(float(v), 4) for k, v in zip(lables, fused)}
    lat = {
        "t_image_ms": int(img_elapsed*1000),
        "t_audio_ms": int(aud_elapsed*1000),
        "t_fuse_ms": int(fuse_elapsed*1000),
        "t_total_ms": int((time.time()-started)*1000),
    }
    log_inference(engine="api", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs, csv_path=CSV_API)
    return pred, probs, lat
275
+
276
+ # ============= Wrapper Functions with Mode Selection =============
277
def predict_video_wrapper(video, alpha, use_api, oauth_token: gr.OAuthToken | None):
    """Route a video request to the API or local pipeline.

    Args:
        video: uploaded video (Gradio Video value).
        alpha: fusion weight forwarded to the pipeline.
        use_api: checkbox state; True selects the HF Inference API path.
        oauth_token: injected by Gradio when the user is signed in, else None.
    """
    global USER_HF_TOKEN
    if not use_api:
        return predict_vid(video, alpha)
    # Refresh the shared token on every API call — and clear it when this
    # caller is NOT signed in. The global is shared by all concurrent users
    # of the Space, so a token left behind by a previous request must never
    # be reused for a different (unauthenticated) caller.
    USER_HF_TOKEN = oauth_token.token if oauth_token is not None else None
    return predict_vid_api(video, alpha)
286
 
287
def predict_image_audio_wrapper(image, audio_path, alpha, use_api, oauth_token: gr.OAuthToken | None):
    """Route an image+audio request to the API or local pipeline.

    Args:
        image: uploaded PIL image.
        audio_path: filepath of the uploaded audio clip.
        alpha: fusion weight forwarded to the pipeline.
        use_api: checkbox state; True selects the HF Inference API path.
        oauth_token: injected by Gradio when the user is signed in, else None.
    """
    global USER_HF_TOKEN
    if not use_api:
        return predict_image_audio_local(image, audio_path, alpha)
    # Refresh the shared token on every API call — and clear it when this
    # caller is NOT signed in. The global is shared by all concurrent users
    # of the Space, so a token left behind by a previous request must never
    # be reused for a different (unauthenticated) caller.
    USER_HF_TOKEN = oauth_token.token if oauth_token is not None else None
    return predict_image_audio_api(image, audio_path, alpha)
296
+
297
+ # ============= Backward Compatibility Aliases for Tests =============
298
def predict_image_audio(image, audio_path, alpha=0.7):
    """Legacy name kept for the test suite; runs the local image+audio pipeline."""
    return predict_image_audio_local(image, audio_path, alpha)
301
+
302
def predict_video(video, alpha=0.7):
    """Legacy name kept for the test suite; runs the local video pipeline."""
    return predict_vid(video, alpha)
305
+
306
# ============= Gradio Interface =============
# Only create demo if not being imported for testing
# Check for pytest in sys.modules to detect test environment
import sys
_is_testing = 'pytest' in sys.modules

if not _is_testing:
    # Build the UI only outside pytest, so importing this module in tests has
    # no Gradio side effects (and `demo` is simply undefined there).
    with gr.Blocks(title="Scene Mood Detection") as demo:
        with gr.Row():
            gr.Markdown("# 🎬 Scene Mood Classifier\nUpload a short **video** or an **image + audio** pair.")
            gr.LoginButton()  # HF OAuth sign-in; required only for API mode

        gr.Markdown("💡 **Tip:** Sign in with HuggingFace to use API mode, or use Local mode without signing in.")
        gr.Markdown("---")

        # Mode Selection
        # One shared toggle decides local-vs-API for both tabs below.
        use_api_mode = gr.Checkbox(
            label="Use API Mode (requires sign-in)",
            value=False,
            info="Unchecked = Local models, Checked = API models"
        )

        with gr.Tab("Video"):
            v = gr.Video(sources=["upload"], height=240)
            # α blends image vs audio probabilities in the fusion step.
            alpha_v = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.7, step=0.05,
                label="Fusion weight α (image audio)",
                info="α=1 trusts image only; α=0 trusts audio only."
            )
            btn_v = gr.Button("Analyze")
            out_v1 = gr.Label(label="Prediction")
            out_v2 = gr.JSON(label="Probabilities")
            out_v3 = gr.JSON(label="Latency (ms)")
            # NOTE(review): the wrapper declares a 4th gr.OAuthToken parameter
            # that is not wired here — presumably Gradio injects it from the
            # type annotation; confirm against the Gradio OAuth docs.
            btn_v.click(predict_video_wrapper, inputs=[v, alpha_v, use_api_mode], outputs=[out_v1, out_v2, out_v3])

        with gr.Tab("Image + Audio"):
            img = gr.Image(type="pil", height=240)
            aud = gr.Audio(sources=["upload"], type="filepath")
            # Same fusion weight control as the Video tab.
            alpha_ia = gr.Slider(
                minimum=0.0, maximum=1.0, value=0.7, step=0.05,
                label="Fusion weight α (image ↔ audio)",
                info="α=1 trusts image only; α=0 trusts audio only."
            )
            btn_ia = gr.Button("Analyze")
            out_i1 = gr.Label(label="Prediction")
            out_i2 = gr.JSON(label="Probabilities")
            out_i3 = gr.JSON(label="Latency (ms)")
            btn_ia.click(predict_image_audio_wrapper, inputs=[img, aud, alpha_ia, use_api_mode], outputs=[out_i1, out_i2, out_i3])

if __name__ == "__main__":
    demo.launch()
requirements.txt CHANGED
@@ -9,3 +9,5 @@ numpy
9
  pytest
10
  huggingface_hub
11
  datasets
 
 
 
9
  pytest
10
  huggingface_hub
11
  datasets
12
+ requests
13
+ gradio[oauth]