SonicaB commited on
Commit
65f8788
·
verified ·
1 Parent(s): cb62d77

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+ DSCS553_CS1_Assignment.pdf
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shreya Boyane
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,9 @@
1
  ---
2
- title: Scene Mood Classifier API
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.45.0
8
- app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Scene Mood Classifier
3
+ emoji: 🎬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ app_file: fusion-app/app_api.py
 
8
  pinned: false
 
9
  ---
 
 
fusion-app/app_api.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations
import io, os, time, json
from pathlib import Path
from typing import List, Dict
import numpy as np
from PIL import Image
import gradio as gr
import requests
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from utils_media import video_to_frame_audio, load_audio_16k, log_inference

# Labels and their CLIP text prompts come from labels.json next to this file;
# list order fixes the ordering of every probability vector below.
HERE = Path(__file__).parent
LABEL_ITEMS = json.loads((HERE / "labels.json").read_text())["labels"]
LABELS = [x["name"] for x in LABEL_ITEMS]
PROMPTS = [x["prompt"] for x in LABEL_ITEMS]

# Hosted models used via the HF Inference API (no local weights are loaded).
CLIP_MODEL = "openai/clip-vit-base-patch32"
W2V2_MODEL = "facebook/wav2vec2-base"

# Fail fast at import time: every API call below needs the token.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Missing HF_TOKEN in environment.")

client = InferenceClient(token=HF_TOKEN)
29
def _img_to_jpeg_bytes(pil: Image.Image) -> bytes:
    """Encode a PIL image as JPEG bytes (forced RGB, quality 90)."""
    out = io.BytesIO()
    rgb = pil.convert("RGB")
    rgb.save(out, format="JPEG", quality=90)
    return out.getvalue()
33
+
34
def clip_api_probs(pil: Image.Image, prompts: List[str] = PROMPTS) -> np.ndarray:
    """Zero-shot CLIP label distribution for one image via the HF Inference API.

    Returns a normalized float32 vector aligned with *prompts* (and therefore
    with LABELS). Network-bound; API/HTTP errors propagate to the caller.
    """
    result = client.zero_shot_image_classification(
        image=pil, candidate_labels=prompts,
        hypothesis_template="{}",
        model=CLIP_MODEL,
    )
    # The API returns [{"label": ..., "score": ...}]; re-order to prompt order.
    scores = {d["label"]: float(d["score"]) for d in result}
    arr = np.array([scores.get(p, 0.0) for p in prompts], dtype=np.float32)
    # Renormalize; fall back to a uniform distribution if all scores are zero.
    s = arr.sum(); arr = arr / s if s > 0 else np.ones_like(arr)/len(arr)
    return arr
47
+
48
+
49
+
50
def _wave_float32_to_wav_bytes(wave_16k: np.ndarray, sr=16000) -> bytes:
    """Encode a mono float32 waveform in [-1, 1] as 16-bit PCM WAV bytes."""
    # Clip to the valid range, then scale to the int16 full scale.
    samples = (np.clip(wave_16k, -1, 1) * 32767.0).astype(np.int16)
    seg = AudioSegment(
        samples.tobytes(), frame_rate=sr, sample_width=2, channels=1
    )
    out = io.BytesIO()
    seg.export(out, format="wav")
    return out.getvalue()
59
+
60
def w2v2_api_embed(wave_16k: np.ndarray) -> np.ndarray:
    """Mean-pooled, L2-normalized wav2vec2 embedding via the raw Inference API.

    Posts WAV bytes to the model endpoint and parses the JSON response as
    hidden states. Raises requests.HTTPError on API failure.
    NOTE(review): assumes the endpoint returns raw features shaped [T, 768]
    or [batch, T, 768] rather than a task-specific payload — confirm against
    the pipeline actually deployed for facebook/wav2vec2-base.
    """
    wav_bytes = _wave_float32_to_wav_bytes(wave_16k)

    url = f"https://api-inference.huggingface.co/models/{W2V2_MODEL}"
    hdrs = {"Authorization": f"Bearer {HF_TOKEN}"}
    r = requests.post(url, headers=hdrs, data=wav_bytes, timeout=60)
    r.raise_for_status()
    arr = np.asarray(r.json(), dtype=np.float32)  # shape [T, 768]
    if arr.ndim == 3:  # [batch, T, D]
        arr = arr[0]
    vec = arr.mean(axis=0)  # [768] time-average pooling
    # L2 normalize so downstream dot products are cosine similarities.
    n = np.linalg.norm(vec) + 1e-8
    return (vec / n).astype(np.float32)
74
+
75
+
76
+
77
+ _PROTO_EMBS: Dict[str, np.ndarray] | None = None
78
+
79
+ def _sine(sr, freq, dur, amp=0.2):
80
+ t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
81
+ return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
82
+
83
+ def _burst_noise(sr, dur, amp=0.2):
84
+ x = np.random.randn(int(sr*dur)).astype(np.float32)
85
+ n = x.size
86
+ env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
87
+ env = np.pad(env, (0, n-env.size), constant_values=1.0)
88
+ env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
89
+ return (amp * x * env).astype(np.float32)
90
+
91
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
    """Peak-normalized three-note chord: root, third, and octave.

    Bug fix: the previous ratios (3/2 and 4/3) are a perfect fifth and a
    perfect fourth — neither is a third — and `minor=True` selected the
    *larger* interval. Use just-intonation thirds instead: minor 6/5,
    major 5/4, so the minor/major flag actually changes chord quality.
    """
    third = 6/5 if minor else 5/4
    w = (_sine(sr, base, dur, amp)
         + _sine(sr, base*third, dur, amp*0.7)
         + _sine(sr, base*2, dur, amp*0.5))
    # Normalize to unit peak so prototype loudness is comparable across labels.
    return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
97
+
98
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
    """Tiny synthetic clips meant to caricature each mood label.

    Their wav2vec2 embeddings act as zero-shot class anchors; keys must
    match the label names in labels.json.
    """
    return {
        "calm": _sine(sr, 220, dur, amp=0.08),        # quiet low sine
        "energetic": _burst_noise(sr, dur, amp=0.35),  # loud punchy noise
        "suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),  # beating low drones
        "joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),  # major-ish chord on C
        "sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),      # minor-ish chord on C
    }
106
+
107
def _ensure_proto_embs():
    """Lazily build and cache one embedding per label from the synthetic audio.

    Network-bound: embeds each prototype through the Inference API once,
    then memoizes the result in the module-level _PROTO_EMBS.
    """
    global _PROTO_EMBS
    if _PROTO_EMBS is not None:
        return
    waves = _synthesize_audio_prototypes()
    embs = {}
    for lbl, wav in waves.items():
        e = w2v2_api_embed(wav)  # API embed, already L2-normalized
        embs[lbl] = e
    _PROTO_EMBS = embs
117
+
118
def w2v2_api_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot audio label distribution via prototype cosine similarity.

    Embeds the clip, dots it against each cached prototype (both vectors are
    L2-normalized, so the dot product is cosine similarity), then applies a
    temperature softmax — lower temperature gives a sharper distribution.
    """
    _ensure_proto_embs()
    emb = w2v2_api_embed(wave_16k)  # [768], normalized
    sims = np.array([float(np.dot(emb, _PROTO_EMBS[lbl])) for lbl in LABELS], dtype=np.float32)
    z = sims / max(1e-6, float(temperature))
    z = z - z.max()  # shift for numerical stability before exp
    p = np.exp(z); p /= (p.sum() + 1e-8)
    return p.astype(np.float32)
126
+
127
+
128
def fuse_probs(p_img: np.ndarray, p_aud: np.ndarray, alpha: float) -> np.ndarray:
    """Convex mix of image and audio probability vectors.

    alpha=1 trusts the image distribution only; alpha=0 the audio one.
    Both inputs are renormalized defensively; the result sums to 1.
    """
    img_norm = p_img / (p_img.sum() + 1e-8)
    aud_norm = p_aud / (p_aud.sum() + 1e-8)
    mixed = alpha * img_norm + (1 - alpha) * aud_norm
    return mixed / (mixed.sum() + 1e-8)
133
+
134
def top1_label(p: np.ndarray) -> str:
    """Name of the highest-probability label in `p` (order matches LABELS)."""
    best = int(np.argmax(p))
    return LABELS[best]
136
+
137
def predict_video(video, alpha=0.7):
    """Classify a video's mood from sampled frames (CLIP) + audio (wav2vec2).

    Returns (top-1 label, {label: rounded prob}, latency/meta dict).
    All model calls go through the HF Inference API; the latency fields are
    wall-clock milliseconds per stage.
    """
    t0 = time.time()

    # FULL video analysis: up to 24 frames at <= 2 fps, plus 16 kHz mono audio.
    frames, wave, meta = video_to_frame_audio(video, target_frames=24, fps_cap=2.0)

    # IMAGE: per-frame CLIP probabilities averaged over all sampled frames.
    t_img0 = time.time()
    per_frame = [clip_api_probs(pil) for pil in frames]
    p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
    t_img = time.time() - t_img0

    # AUDIO: zero-shot via prototype similarity.
    t_aud0 = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    t_aud = time.time() - t_aud0

    # FUSION: convex mix weighted by alpha (1 = image only, 0 = audio only).
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label(p)
    probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
        "n_frames": meta.get("n_frames"),
        "fps_used": meta.get("fps_used"),
        "duration_s": meta.get("duration_s"),
    }
    log_inference(engine="api", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
172
+
173
def predict_image_audio(image: Image.Image, audio_path: str, alpha=0.7):
    """Classify mood from a still image plus a separate audio clip.

    Returns (top-1 label, {label: rounded prob}, latency dict); model calls
    go through the HF Inference API.
    """
    t0 = time.time()
    wave = load_audio_16k(audio_path)

    # IMAGE: single CLIP zero-shot call.
    t_img0 = time.time()
    p_img = clip_api_probs(image)
    t_img = time.time() - t_img0

    # AUDIO: prototype-similarity zero-shot.
    t_aud0 = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    t_aud = time.time() - t_aud0

    # FUSION: alpha=1 image only, alpha=0 audio only.
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label(p)
    probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
    }
    log_inference(engine="api", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
202
+
203
'''
Chat GPT : Create Gradio interface for the above API functions same as local app.
'''
# UI: two tabs mirroring app_local.py, backed by the API-based predictors.
with gr.Blocks(title="Scene Mood (API)") as demo:
    gr.Markdown("# Scene Mood Classifier - API Version. Upload a short **video** or an **image + audio** pair.")
    with gr.Tab("Video"):
        v = gr.Video(sources=["upload"], height=240)
        # Slider controlling the image/audio fusion weight passed to fuse_probs.
        alpha_v = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
                            label="Fusion weight α (image ↔ audio)",
                            info="α=1 trusts image only; α=0 trusts audio only.")
        btn_v = gr.Button("Analyze")
        # Outputs: prediction label, per-label probabilities, latency/meta JSON.
        out_v1, out_v2, out_v3 = gr.Label(), gr.JSON(), gr.JSON()
        btn_v.click(predict_video, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])

    with gr.Tab("Image + Audio"):
        img = gr.Image(type="pil", height=240, label="Image")
        aud = gr.Audio(sources=["upload"], type="filepath", label="Audio")
        # Same fusion-weight slider for the image+audio path.
        alpha_ia = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
                             label="Fusion weight α (image ↔ audio)",
                             info="α=1 trusts image only; α=0 trusts audio only.")
        btn_ia = gr.Button("Analyze")
        out_i1, out_i2, out_i3 = gr.Label(), gr.JSON(), gr.JSON()
        btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])

if __name__ == "__main__":
    demo.launch()
fusion-app/app_local.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import json
from pathlib import Path
from utils_media import video_to_frame_audio, load_audio_16k, log_inference
from fusion import clip_image_probs, wav2vec2_embed_energy, wav2vec2_zero_shot_probs, audio_prior_from_rms, fuse_probs, top1_label_from_probs

# Label names from labels.json; list order defines the probability-vector order.
# NOTE(review): "lables" is a typo for "labels"; kept as-is because the name is
# referenced throughout this module.
HERE = Path(__file__).parent
lables_PATH = HERE / "labels.json"

lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
13
+
14
def predict_vid(video, alpha=0.7):
    """Local-model video mood prediction: CLIP over frames + RMS audio prior.

    Returns (top-1 label, {label: rounded prob}, latency/meta dict).
    NOTE(review): the audio branch here uses only the loudness prior
    (audio_prior_from_rms) while the image+audio path also uses
    wav2vec2_zero_shot_probs — confirm this asymmetry is intentional.
    """
    import time, numpy as np
    t0 = time.time()
    frames, wave, meta = video_to_frame_audio(video, target_frames=64, fps_cap=3.0)

    # IMAGE: average per-frame CLIP probabilities.
    t_img0 = time.time()
    per_frame = []
    for pil in frames:
        per_frame.append(clip_image_probs(pil))  # np[K]
    p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
    t_img = time.time() - t_img0

    # AUDIO: the embedding is computed only for its RMS side-channel here.
    t_aud0 = time.time()
    _, rms = wav2vec2_embed_energy(wave)  # embedding computed; report rms
    p_aud = audio_prior_from_rms(rms)  # np[K]
    t_aud = time.time() - t_aud0

    # FUSION: convex mix weighted by alpha (1 = image only, 0 = audio only).
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label_from_probs(p)
    probs = {k: round(float(v), 4) for k, v in zip(lables, p)}
    lat = {
        "t_image_ms": int(t_img * 1000),
        "t_audio_ms": int(t_aud * 1000),
        "t_fuse_ms": int(t_fus * 1000),
        "t_total_ms": int((time.time() - t0) * 1000),
        "rms": round(float(rms), 4),
        "n_frames": meta.get("n_frames"),
        "fps_used": round(float(meta.get("fps_used") or 0.0), 3),
        "duration_s": round(float(meta.get("duration_s") or 0.0), 2),
    }
    print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
    log_inference(engine="local", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
50
+
51
def predict_image_audio(image, audio_path, alpha=0.7):
    """Local-model mood prediction for an image + audio pair.

    The audio distribution blends wav2vec2 zero-shot probabilities (0.8)
    with the RMS loudness prior (0.2). Returns (label, {label: prob}, latency).
    """
    import time, numpy as np
    t0 = time.time()
    wave = load_audio_16k(audio_path)

    # IMAGE: single CLIP zero-shot pass.
    t_img0 = time.time()
    p_img = clip_image_probs(image)
    t_img = time.time() - t_img0

    # AUDIO: zero-shot probs softened with a loudness-based prior.
    t_aud0 = time.time()
    p_aud = wav2vec2_zero_shot_probs(wave, temperature=1.0)
    _, rms = wav2vec2_embed_energy(wave)
    p_rms = audio_prior_from_rms(rms)
    p_aud = 0.8 * p_aud + 0.2 * p_rms
    t_aud = time.time() - t_aud0

    # FUSION
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label_from_probs(p)
    probs = {k: float(v) for k, v in zip(lables, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
        "rms": round(float(rms), 4),
    }
    print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
    log_inference(engine="local", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
83
+
84
+
85
# UI: two tabs (full video, or image + audio pair) over the local predictors.
with gr.Blocks(title="Scene Mood Detection") as demo:
    gr.Markdown("# Scene Mood Classifier - Local \nUpload a short **video** or an **image + audio** pair.")
    with gr.Tab("Video"):
        v = gr.Video(sources=["upload"], height=240)

        # Slider controlling the image/audio fusion weight.
        alpha_v = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.7, step=0.05,
            label="Fusion weight α (image ↔ audio)",
            info="α=1 trusts image only; α=0 trusts audio only."
        )

        btn_v = gr.Button("Analyze")
        out_v1 = gr.Label(label="Prediction")
        out_v2 = gr.JSON(label="Probabilities")
        out_v3 = gr.JSON(label="Latency (ms)")
        btn_v.click(predict_vid, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])

    with gr.Tab("Image + Audio"):
        img = gr.Image(type="pil", height=240)
        aud = gr.Audio(sources=["upload"], type="filepath")

        # Same fusion-weight slider for the image+audio tab.
        alpha_ia = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.7, step=0.05,
            label="Fusion weight α (image ↔ audio)",
            info="α=1 trusts image only; α=0 trusts audio only."
        )

        btn_ia = gr.Button("Analyze")
        out_i1 = gr.Label(label="Prediction")
        out_i2 = gr.JSON(label="Probabilities")
        out_i3 = gr.JSON(label="Latency (ms)")
        btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])

if __name__ == "__main__":
    demo.launch()
fusion-app/fusion.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
import json
import numpy as np
import torch
import math
from transformers import CLIPProcessor, CLIPModel, Wav2Vec2Processor, Wav2Vec2Model

# NOTE(review): `math` appears unused in this module — candidate for removal.

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label names and CLIP prompts from labels.json; list order fixes vector order.
_here = Path(__file__).parent
_labels = json.loads((_here / "labels.json").read_text())["labels"]
LABELS = [x["name"] for x in _labels]
PROMPTS = [x["prompt"] for x in _labels]

# Lazily-initialized singletons, populated on first use (see _lazy_load_models
# and _ensure_audio_prototypes) so importing this module stays cheap.
_clip_model = None
_clip_proc = None
_wav_model = None
_wav_proc = None
_proto_embs = None
21
+
22
def _lazy_load_models():
    """Load CLIP and wav2vec2 weights on first call and cache them globally.

    Models are moved to DEVICE and put in eval mode; repeated calls are no-ops.
    """
    global _clip_model, _clip_proc, _wav_model, _wav_proc
    if _clip_model is None:
        _clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
        _clip_model.eval()
        _clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    if _wav_model is None:
        _wav_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE)
        _wav_model.eval()
        _wav_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
32
+
33
+
34
+ def _sine(sr, freq, dur, amp=0.2):
35
+ t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
36
+ return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
37
+
38
+ def _burst_noise(sr, dur, amp=0.2):
39
+ x = np.random.randn(int(sr*dur)).astype(np.float32)
40
+ # fast attack / fast decay envelope
41
+ n = x.size
42
+ env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
43
+ env = np.pad(env, (0, n-env.size), constant_values=1.0)
44
+ env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
45
+ return (amp * x * env).astype(np.float32)
46
+
47
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
    """Peak-normalized three-note chord: root, third, and octave.

    Bug fix: the previous ratios (3/2 and 4/3) are a perfect fifth and a
    perfect fourth — neither is a third — and `minor=True` selected the
    *larger* interval. Use just-intonation thirds: minor 6/5, major 5/4,
    so the minor/major flag actually changes chord quality.
    """
    third = 6/5 if minor else 5/4
    f1, f2, f3 = base, base*third, base*2
    w = (_sine(sr, f1, dur, amp) + _sine(sr, f2, dur, amp*0.7) + _sine(sr, f3, dur, amp*0.5))
    # Normalize to unit peak so prototype loudness is comparable across labels.
    return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
52
+
53
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
    """Tiny synthetic clips that caricature each mood label.

    Their wav2vec2 embeddings act as zero-shot class anchors; keys must
    match the label names in labels.json.
    """
    return {
        "calm": _sine(sr, 220, dur, amp=0.08),  # quiet low sine
        "energetic": _burst_noise(sr, dur, amp=0.35),  # noisy, punchy
        "suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),  # low drones
        "joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),  # C major-ish
        "sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),  # C minor-ish
    }
62
+
63
def _ensure_audio_prototypes():
    """Build and cache one L2-normalized wav2vec2 embedding per label.

    Embeds the synthetic prototype clips once through the local model;
    results are memoized in the module-level _proto_embs.
    """
    global _proto_embs
    if _proto_embs is not None:
        return
    _lazy_load_models()
    waves = _synthesize_audio_prototypes()
    embs = {}
    for lbl, wav in waves.items():
        emb, _ = wav2vec2_embed_energy(wav)  # normalized 768-d embedding
        embs[lbl] = emb / (np.linalg.norm(emb) + 1e-8)  # re-normalize defensively
    _proto_embs = embs  # cache
74
+
75
+ # image branch (CLIP)
76
@torch.no_grad()
def clip_image_probs(pil_image, prompts=PROMPTS):
    """Zero-shot CLIP distribution over `prompts` for one PIL image.

    Softmax over cosine similarities between the image embedding and each
    prompt's text embedding; returns np.float32[K] aligned with LABELS.
    NOTE(review): text features are recomputed on every call and could be
    cached, since `prompts` rarely changes.
    """
    _lazy_load_models()
    # text features
    text_inputs = _clip_proc(text=prompts, return_tensors="pt", padding=True).to(DEVICE)
    text_feats = _clip_model.get_text_features(**text_inputs)  # [K, d]
    text_feats = torch.nn.functional.normalize(text_feats, dim=-1)

    # image features
    img_inputs = _clip_proc(images=pil_image, return_tensors="pt").to(DEVICE)
    img_feats = _clip_model.get_image_features(**img_inputs)  # [1, d]
    img_feats = torch.nn.functional.normalize(img_feats, dim=-1)

    # similarity to softmax (cosine, since both sides are unit-normalized)
    sims = (img_feats @ text_feats.T).squeeze(0)  # [K]
    probs = torch.softmax(sims, dim=-1)  # [K]
    return probs.detach().cpu().numpy()  # np.float32[K]
94
+
95
+ # audio branch (Wav2Vec2 + energy prior)
96
@torch.no_grad()
def wav2vec2_embed_energy(wave_16k: np.ndarray):
    """Return (L2-normalized mean-pooled wav2vec2 embedding, RMS loudness)."""
    _lazy_load_models()
    # wave_16k must be float32 mono in [-1, 1]
    inp = _wav_proc(wave_16k, sampling_rate=16000, return_tensors="pt").to(DEVICE)
    out = _wav_model(**inp).last_hidden_state  # [1, T, 768]
    emb = out.mean(dim=1).squeeze(0)  # [768] time-average pooling
    emb = torch.nn.functional.normalize(emb, dim=-1)
    emb_np = emb.detach().cpu().numpy()

    # simple loudness proxy (RMS); roughly 0..~1 for in-range input
    rms = float(np.sqrt(np.mean(np.square(wave_16k))))  # 0..~1
    return emb_np, rms
109
+
110
def audio_prior_from_rms(rms: float) -> np.ndarray:
    """Heuristic mood prior driven only by loudness (RMS clamped to [0, 1]).

    Returns a normalized np.float32[5] in labels.json order:
    calm, energetic, suspense, joyful, sad.
    """
    r = min(1.0, max(0.0, rms))
    weights = np.array(
        [
            max(0.0, 1.0 - 2.0 * r),            # calm: high when quiet
            r ** 0.8,                            # energetic: grows with loudness
            0.6 * (1.0 - abs(r - 0.5) * 2),      # suspense: peaks at mid loudness
            (r ** 0.9) * 0.9 + 0.1 * (1 - r),    # joyful: loud-ish, small quiet bias
            max(0.0, 1.2 - 2.2 * r),             # sad: high when quiet
        ],
        dtype=np.float32,
    )
    # Floor at a small epsilon so the distribution never contains exact zeros.
    weights = np.clip(weights, 1e-4, None)
    return weights / weights.sum()
124
+
125
@torch.no_grad()
def wav2vec2_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot audio label distribution via cosine similarity to prototypes.

    Lower temperature sharpens the softmax; returns np.float32[K] over LABELS.
    """
    _ensure_audio_prototypes()
    emb, _ = wav2vec2_embed_energy(wave_16k)  # normalized already
    emb = emb / (np.linalg.norm(emb) + 1e-8)  # defensive re-normalization
    sims = np.array([float(np.dot(emb, _proto_embs[lbl])) for lbl in LABELS], dtype=np.float32)  # [K]
    # temperature softmax for tunable sharpness
    z = sims / max(1e-6, float(temperature))
    z = z - z.max()  # numerical stability
    p = np.exp(z); p /= (p.sum() + 1e-8)
    return p.astype(np.float32)
136
+
137
+ # fusion
138
def fuse_probs(image_probs: np.ndarray, audio_prior: np.ndarray, alpha: float = 0.7) -> np.ndarray:
    """Late fusion of image and audio distributions.

    alpha close to 1 favors the image branch; close to 0 favors audio.
    Both inputs are renormalized first; the result sums to 1.
    """
    img_norm = image_probs / (image_probs.sum() + 1e-8)
    aud_norm = audio_prior / (audio_prior.sum() + 1e-8)
    blended = alpha * img_norm + (1.0 - alpha) * aud_norm
    return blended / (blended.sum() + 1e-8)
145
+
146
def top1_label_from_probs(p: np.ndarray) -> str:
    """Label name for the arg-max entry of the probability vector."""
    winner = int(np.argmax(p))
    return LABELS[winner]
fusion-app/labels.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels": [
3
+ {"name": "calm", "prompt": "a tranquil, peaceful scene", "def": "low motion, soft colors, quiet audio"},
4
+ {"name": "energetic", "prompt": "a high-energy lively scene", "def": "fast motion, bright colors, loud/fast audio"},
5
+ {"name": "suspense", "prompt": "a tense, foreboding scene", "def": "dim colors, slow build, ominous drones"},
6
+ {"name": "joyful", "prompt": "a happy, upbeat, celebratory scene", "def": "warm colors, smiles, upbeat music"},
7
+ {"name": "sad", "prompt": "a somber, gloomy scene", "def": "cool/dark tones, slow pace, quiet audio"}
8
+ ]
9
+ }
10
+
11
+
fusion-app/tests/test_shapes.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import numpy as np

def test_concat_dim():
    """Fused feature width: CLIP image (512) + wav2vec2 audio (768) = 1280."""
    img_feat = np.random.randn(512)
    aud_feat = np.random.randn(768)
    assert img_feat.size + aud_feat.size == 1280
fusion-app/tests/test_smoke.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def test_imports():
    """Smoke test: the app's core third-party dependencies are importable."""
    import gradio, numpy  # noqa
    assert True
fusion-app/utils_media.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ from pathlib import Path
4
+ import time
5
+ from typing import Any, Dict, Tuple, Union
6
+ import io
7
+ import numpy as np
8
+ from PIL import Image
9
+ import ffmpeg
10
+ import tempfile
11
+ from pydub import AudioSegment
12
+
13
+ # helpers
14
def probe_duration_sec(video_path: str) -> float:
    """Duration of *video_path* in seconds via ffprobe; 0.0 when unknown.

    Any failure (missing file, broken container, ffmpeg unavailable) is
    treated as an unknown duration rather than an error.
    """
    try:
        info = ffmpeg.probe(video_path)
        raw = info.get("format", {}).get("duration", 0.0)
        return float(raw) or 0.0
    except Exception:
        return 0.0
20
+
21
+ def _to_path(p: Union[str, dict, Path]) -> str:
22
+ if isinstance(p, dict):
23
+ return p.get("name") or p.get("path") or p.get("data") or ""
24
+ return str(p)
25
+
26
def _audiosegment_float32(seg: AudioSegment) -> np.ndarray:
    """Resample to 16 kHz mono 16-bit, then scale samples to float32 in [-1, 1)."""
    seg = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16-bit
    samples = np.array(seg.get_array_of_samples(), dtype=np.int16)
    return (samples.astype(np.float32) / 32768.0)
30
+
31
+ # public API
32
def video_to_frame_audio(
    video_in,
    target_frames: int = 64,  # aim for this many frames total
    fps_cap: float = 3.0  # never sample faster than this
) -> Tuple[list, np.ndarray, dict]:
    """Decode a video into (PIL frames, 16 kHz mono float32 audio, meta dict).

    Frames are extracted with ffmpeg at a rate chosen so roughly
    `target_frames` are produced over the clip, capped at `fps_cap`; the
    audio track is returned in full. Meta carries duration_s, fps_used and
    n_frames. Raises ValueError on an empty path; ffmpeg errors propagate.
    """
    video_path = _to_path(video_in)
    if not video_path:
        raise ValueError("Empty video path")

    dur = probe_duration_sec(video_path)
    # Unknown duration: fall back to 1 fps. Otherwise spread target_frames
    # over the clip, at least one frame total, at most fps_cap per second.
    if dur <= 0:
        fps = 1.0
    else:
        fps = min(fps_cap, max(1.0 / dur, target_frames / dur))

    frames = []
    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        out_pattern = str(td / "frame_%06d.jpg")
        # Dump JPEG frames at the chosen rate; qscale=2 keeps quality high.
        (
            ffmpeg
            .input(video_path)
            .output(out_pattern, vf=f"fps={fps}", vsync="vfr", qscale=2)
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        for p in sorted(td.glob("frame_*.jpg")):
            frames.append(Image.open(p).convert("RGB"))

    # Audio: pydub/ffmpeg pulls the track straight out of the container.
    seg = AudioSegment.from_file(video_path)
    audio16k = _audiosegment_float32(seg)

    meta = {"duration_s": float(dur), "fps_used": float(fps), "n_frames": int(len(frames))}
    return frames, audio16k, meta
70
+
71
def load_audio_16k(audio_path_like) -> np.ndarray:
    """Load any ffmpeg-readable audio file as 16 kHz mono float32 in [-1, 1)."""
    path = _to_path(audio_path_like)
    seg = AudioSegment.from_file(path)
    return _audiosegment_float32(seg)
75
+
76
+
77
+ # Logging
78
# Default CSV sink for inference logs, kept next to this module.
DEFAULT_CSV = Path(__file__).parent / "runs_local.csv"

def now_iso() -> str:
    """Current UTC wall time as an ISO-8601-style string (second resolution).

    Bug fix: the previous implementation formatted *local* time although its
    comment claimed "UTC-ish"; pass time.gmtime() so timestamps really are UTC
    and sort consistently across machines/timezones.
    """
    return time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())

def append_csv(csv_path: Union[str, Path] = DEFAULT_CSV, row: Dict[str, Any] = None) -> None:
    """Append one dict row to *csv_path*, writing a header for a new file.

    Lists/dicts are JSON-encoded so the CSV stays flat. No-op when row is None.
    NOTE: the header is derived from the first row ever written; later rows
    must use the same keys or csv.DictWriter raises ValueError.
    """
    if row is None:
        return
    path = Path(csv_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not path.exists()
    flat = {k: (json.dumps(v) if isinstance(v, (list, dict)) else v) for k, v in row.items()}
    with path.open("a", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(flat.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(flat)
96
+
97
def log_inference(
    *,
    engine: str,  # "local" or "api"
    mode: str,  # "video" or "image_audio"
    alpha: float,
    lat: Dict[str, Any],  # expects t_image_ms, t_audio_ms, t_fuse_ms, t_total_ms, rms
    pred: str,
    probs: Dict[str, float],
    csv_path: Union[str, Path] = DEFAULT_CSV
) -> None:
    """Append one inference record (timestamp, config, latencies, result) to CSV.

    Missing latency keys are logged as None; key order fixes the CSV header.
    """
    row = {
        "ts": now_iso(),
        "engine": engine,
        "mode": mode,
        "alpha": float(alpha),
    }
    for key in ("rms", "t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
        row[key] = lat.get(key)
    row["pred"] = pred
    row["probs"] = probs
    append_csv(csv_path, row)
122
+
123
+
124
+ # Summarizer
125
+
126
def summarize_csv(
    csv_path: Union[str, Path] = DEFAULT_CSV,
    cols = ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms")
) -> Dict[str, Dict[str, float]]:
    """Per-column p50/p95 latency summary of a runs CSV.

    Returns {} when the file does not exist; columns with no parseable
    values get NaN percentiles and n == 0.
    """
    path = Path(csv_path)
    if not path.exists():
        return {}

    with path.open("r", encoding="utf-8") as fh:
        records = list(csv.DictReader(fh))

    def _numeric_column(name):
        # Collect parseable float values; silently skip blanks and junk.
        vals = []
        for rec in records:
            raw = rec.get(name)
            if raw is None or raw == "":
                continue
            try:
                vals.append(float(raw))
            except Exception:
                continue
        return np.array(vals, dtype=float)

    summary: Dict[str, Dict[str, float]] = {}
    for name in cols:
        data = _numeric_column(name)
        if data.size:
            summary[name] = {
                "p50": float(np.percentile(data, 50)),
                "p95": float(np.percentile(data, 95)),
                "n": int(data.size),
            }
        else:
            summary[name] = {"p50": float("nan"), "p95": float("nan"), "n": 0}
    return summary
164
+
165
if __name__ == "__main__":
    # CLI usage: python fusion-app/utils_media.py [csv_path]
    # Prints p50/p95 latency percentiles from the runs CSV (or a note if empty).
    import sys
    path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CSV
    s = summarize_csv(path)
    print(f"File: {path}")
    if not s:
        print("No rows found.")
    else:
        for k in ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
            if k in s:
                print(f"{k:>11}: p50={s[k]['p50']:.1f} ms p95={s[k]['p95']:.1f} ms n={s[k]['n']}")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ torchaudio
5
+ torchvision
6
+ pydub
7
+ ffmpeg-python
8
+ numpy
9
+ pytest
10
+ huggingface_hub
11
+ datasets