SonicaB commited on
Commit
65f8788
·
verified ·
1 Parent(s): cb62d77

Upload folder using huggingface_hub

Browse files
.gitignore ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[codz]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py.cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # UV
98
+ # Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ #uv.lock
102
+
103
+ # poetry
104
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
105
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
106
+ # commonly ignored for libraries.
107
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
108
+ #poetry.lock
109
+ #poetry.toml
110
+
111
+ # pdm
112
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
113
+ # pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
114
+ # https://pdm-project.org/en/latest/usage/project/#working-with-version-control
115
+ #pdm.lock
116
+ #pdm.toml
117
+ .pdm-python
118
+ .pdm-build/
119
+
120
+ # pixi
121
+ # Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
122
+ #pixi.lock
123
+ # Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
124
+ # in the .venv directory. It is recommended not to include this directory in version control.
125
+ .pixi
126
+
127
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
128
+ __pypackages__/
129
+
130
+ # Celery stuff
131
+ celerybeat-schedule
132
+ celerybeat.pid
133
+
134
+ # SageMath parsed files
135
+ *.sage.py
136
+
137
+ # Environments
138
+ .env
139
+ .envrc
140
+ .venv
141
+ env/
142
+ venv/
143
+ ENV/
144
+ env.bak/
145
+ venv.bak/
146
+
147
+ # Spyder project settings
148
+ .spyderproject
149
+ .spyproject
150
+
151
+ # Rope project settings
152
+ .ropeproject
153
+
154
+ # mkdocs documentation
155
+ /site
156
+
157
+ # mypy
158
+ .mypy_cache/
159
+ .dmypy.json
160
+ dmypy.json
161
+
162
+ # Pyre type checker
163
+ .pyre/
164
+
165
+ # pytype static type analyzer
166
+ .pytype/
167
+
168
+ # Cython debug symbols
169
+ cython_debug/
170
+
171
+ # PyCharm
172
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
173
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
174
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
175
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
176
+ #.idea/
177
+
178
+ # Abstra
179
+ # Abstra is an AI-powered process automation framework.
180
+ # Ignore directories containing user credentials, local state, and settings.
181
+ # Learn more at https://abstra.io/docs
182
+ .abstra/
183
+
184
+ # Visual Studio Code
185
+ # Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
186
+ # that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
187
+ # and can be added to the global gitignore or merged into this file. However, if you prefer,
188
+ # you could uncomment the following to ignore the entire vscode folder
189
+ # .vscode/
190
+
191
+ # Ruff stuff:
192
+ .ruff_cache/
193
+
194
+ # PyPI configuration file
195
+ .pypirc
196
+
197
+ # Cursor
198
+ # Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
199
+ # exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
200
+ # refer to https://docs.cursor.com/context/ignore-files
201
+ .cursorignore
202
+ .cursorindexingignore
203
+
204
+ # Marimo
205
+ marimo/_static/
206
+ marimo/_lsp/
207
+ __marimo__/
208
+ DSCS553_CS1_Assignment.pdf
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Shreya Boyane
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,9 @@
1
  ---
2
- title: Scene Mood Classifier API
3
- emoji: 🏆
4
- colorFrom: gray
5
- colorTo: blue
6
- sdk: gradio
7
- sdk_version: 5.45.0
8
- app_file: app.py
9
  pinned: false
10
- license: mit
11
  ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Scene Mood Classifier
3
+ emoji: 🎬
4
+ colorFrom: indigo
5
+ colorTo: purple
6
+ sdk: gradio
7
+ app_file: fusion-app/app_api.py
 
8
  pinned: false
 
9
  ---
 
 
fusion-app/app_api.py ADDED
@@ -0,0 +1,228 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from __future__ import annotations
import io, os, time, json
from pathlib import Path
from typing import List, Dict
import numpy as np
from PIL import Image
import gradio as gr
import requests
from huggingface_hub import InferenceClient
from pydub import AudioSegment
from utils_media import video_to_frame_audio, load_audio_16k, log_inference

# Labels and their CLIP text prompts come from labels.json next to this file;
# list order fixes the ordering of every probability vector below.
HERE = Path(__file__).parent
LABEL_ITEMS = json.loads((HERE / "labels.json").read_text())["labels"]
LABELS = [x["name"] for x in LABEL_ITEMS]
PROMPTS = [x["prompt"] for x in LABEL_ITEMS]

# Hosted models used via the HF Inference API (no local weights are loaded).
CLIP_MODEL = "openai/clip-vit-base-patch32"
W2V2_MODEL = "facebook/wav2vec2-base"

# Fail fast at import time: every API call below needs the token.
HF_TOKEN = os.getenv("HF_TOKEN")
if not HF_TOKEN:
    raise RuntimeError("Missing HF_TOKEN in environment.")

client = InferenceClient(token=HF_TOKEN)
29
def _img_to_jpeg_bytes(pil: Image.Image) -> bytes:
    """Encode a PIL image as JPEG bytes (forced RGB, quality 90)."""
    out = io.BytesIO()
    rgb = pil.convert("RGB")
    rgb.save(out, format="JPEG", quality=90)
    return out.getvalue()
33
+
34
def clip_api_probs(pil: Image.Image, prompts: List[str] = PROMPTS) -> np.ndarray:
    """Zero-shot CLIP label distribution for one image via the HF Inference API.

    Returns a normalized float32 vector aligned with *prompts* (and therefore
    with LABELS). Network-bound; API/HTTP errors propagate to the caller.
    """
    result = client.zero_shot_image_classification(
        image=pil, candidate_labels=prompts,
        hypothesis_template="{}",
        model=CLIP_MODEL,
    )
    # The API returns [{"label": ..., "score": ...}]; re-order to prompt order.
    scores = {d["label"]: float(d["score"]) for d in result}
    arr = np.array([scores.get(p, 0.0) for p in prompts], dtype=np.float32)
    # Renormalize; fall back to a uniform distribution if all scores are zero.
    s = arr.sum(); arr = arr / s if s > 0 else np.ones_like(arr)/len(arr)
    return arr
47
+
48
+
49
+
50
def _wave_float32_to_wav_bytes(wave_16k: np.ndarray, sr=16000) -> bytes:
    """Encode a mono float32 waveform in [-1, 1] as 16-bit PCM WAV bytes."""
    # Clip to the valid range, then scale to the int16 full scale.
    samples = (np.clip(wave_16k, -1, 1) * 32767.0).astype(np.int16)
    seg = AudioSegment(
        samples.tobytes(), frame_rate=sr, sample_width=2, channels=1
    )
    out = io.BytesIO()
    seg.export(out, format="wav")
    return out.getvalue()
59
+
60
def w2v2_api_embed(wave_16k: np.ndarray) -> np.ndarray:
    """Mean-pooled, L2-normalized wav2vec2 embedding via the raw Inference API.

    Posts WAV bytes to the model endpoint and parses the JSON response as
    hidden states. Raises requests.HTTPError on API failure.
    NOTE(review): assumes the endpoint returns raw features shaped [T, 768]
    or [batch, T, 768] rather than a task-specific payload — confirm against
    the pipeline actually deployed for facebook/wav2vec2-base.
    """
    wav_bytes = _wave_float32_to_wav_bytes(wave_16k)

    url = f"https://api-inference.huggingface.co/models/{W2V2_MODEL}"
    hdrs = {"Authorization": f"Bearer {HF_TOKEN}"}
    r = requests.post(url, headers=hdrs, data=wav_bytes, timeout=60)
    r.raise_for_status()
    arr = np.asarray(r.json(), dtype=np.float32)  # shape [T, 768]
    if arr.ndim == 3:  # [batch, T, D]
        arr = arr[0]
    vec = arr.mean(axis=0)  # [768] time-average pooling
    # L2 normalize so downstream dot products are cosine similarities.
    n = np.linalg.norm(vec) + 1e-8
    return (vec / n).astype(np.float32)
74
+
75
+
76
+
77
+ _PROTO_EMBS: Dict[str, np.ndarray] | None = None
78
+
79
+ def _sine(sr, freq, dur, amp=0.2):
80
+ t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
81
+ return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
82
+
83
+ def _burst_noise(sr, dur, amp=0.2):
84
+ x = np.random.randn(int(sr*dur)).astype(np.float32)
85
+ n = x.size
86
+ env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
87
+ env = np.pad(env, (0, n-env.size), constant_values=1.0)
88
+ env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
89
+ return (amp * x * env).astype(np.float32)
90
+
91
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
    """Peak-normalized three-note chord: root, third, and octave.

    Bug fix: the previous ratios (3/2 and 4/3) are a perfect fifth and a
    perfect fourth — neither is a third — and `minor=True` selected the
    *larger* interval. Use just-intonation thirds instead: minor 6/5,
    major 5/4, so the minor/major flag actually changes chord quality.
    """
    third = 6/5 if minor else 5/4
    w = (_sine(sr, base, dur, amp)
         + _sine(sr, base*third, dur, amp*0.7)
         + _sine(sr, base*2, dur, amp*0.5))
    # Normalize to unit peak so prototype loudness is comparable across labels.
    return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
97
+
98
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
    """Tiny synthetic clips meant to caricature each mood label.

    Their wav2vec2 embeddings act as zero-shot class anchors; keys must
    match the label names in labels.json.
    """
    return {
        "calm": _sine(sr, 220, dur, amp=0.08),        # quiet low sine
        "energetic": _burst_noise(sr, dur, amp=0.35),  # loud punchy noise
        "suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),  # beating low drones
        "joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),  # major-ish chord on C
        "sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),      # minor-ish chord on C
    }
106
+
107
def _ensure_proto_embs():
    """Lazily build and cache one embedding per label from the synthetic audio.

    Network-bound: embeds each prototype through the Inference API once,
    then memoizes the result in the module-level _PROTO_EMBS.
    """
    global _PROTO_EMBS
    if _PROTO_EMBS is not None:
        return
    waves = _synthesize_audio_prototypes()
    embs = {}
    for lbl, wav in waves.items():
        e = w2v2_api_embed(wav)  # API embed, already L2-normalized
        embs[lbl] = e
    _PROTO_EMBS = embs
117
+
118
def w2v2_api_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot audio label distribution via prototype cosine similarity.

    Embeds the clip, dots it against each cached prototype (both vectors are
    L2-normalized, so the dot product is cosine similarity), then applies a
    temperature softmax — lower temperature gives a sharper distribution.
    """
    _ensure_proto_embs()
    emb = w2v2_api_embed(wave_16k)  # [768], normalized
    sims = np.array([float(np.dot(emb, _PROTO_EMBS[lbl])) for lbl in LABELS], dtype=np.float32)
    z = sims / max(1e-6, float(temperature))
    z = z - z.max()  # shift for numerical stability before exp
    p = np.exp(z); p /= (p.sum() + 1e-8)
    return p.astype(np.float32)
126
+
127
+
128
def fuse_probs(p_img: np.ndarray, p_aud: np.ndarray, alpha: float) -> np.ndarray:
    """Convex mix of image and audio probability vectors.

    alpha=1 trusts the image distribution only; alpha=0 the audio one.
    Both inputs are renormalized defensively; the result sums to 1.
    """
    img_norm = p_img / (p_img.sum() + 1e-8)
    aud_norm = p_aud / (p_aud.sum() + 1e-8)
    mixed = alpha * img_norm + (1 - alpha) * aud_norm
    return mixed / (mixed.sum() + 1e-8)
133
+
134
def top1_label(p: np.ndarray) -> str:
    """Name of the highest-probability label in `p` (order matches LABELS)."""
    best = int(np.argmax(p))
    return LABELS[best]
136
+
137
def predict_video(video, alpha=0.7):
    """Classify a video's mood from sampled frames (CLIP) + audio (wav2vec2).

    Returns (top-1 label, {label: rounded prob}, latency/meta dict).
    All model calls go through the HF Inference API; the latency fields are
    wall-clock milliseconds per stage.
    """
    t0 = time.time()

    # FULL video analysis: up to 24 frames at <= 2 fps, plus 16 kHz mono audio.
    frames, wave, meta = video_to_frame_audio(video, target_frames=24, fps_cap=2.0)

    # IMAGE: per-frame CLIP probabilities averaged over all sampled frames.
    t_img0 = time.time()
    per_frame = [clip_api_probs(pil) for pil in frames]
    p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
    t_img = time.time() - t_img0

    # AUDIO: zero-shot via prototype similarity.
    t_aud0 = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    t_aud = time.time() - t_aud0

    # FUSION: convex mix weighted by alpha (1 = image only, 0 = audio only).
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label(p)
    probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
        "n_frames": meta.get("n_frames"),
        "fps_used": meta.get("fps_used"),
        "duration_s": meta.get("duration_s"),
    }
    log_inference(engine="api", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
172
+
173
def predict_image_audio(image: Image.Image, audio_path: str, alpha=0.7):
    """Classify mood from a still image plus a separate audio clip.

    Returns (top-1 label, {label: rounded prob}, latency dict); model calls
    go through the HF Inference API.
    """
    t0 = time.time()
    wave = load_audio_16k(audio_path)

    # IMAGE: single CLIP zero-shot call.
    t_img0 = time.time()
    p_img = clip_api_probs(image)
    t_img = time.time() - t_img0

    # AUDIO: prototype-similarity zero-shot.
    t_aud0 = time.time()
    p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
    t_aud = time.time() - t_aud0

    # FUSION: alpha=1 image only, alpha=0 audio only.
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label(p)
    probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
    }
    log_inference(engine="api", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
202
+
203
'''
Chat GPT : Create Gradio interface for the above API functions same as local app.
'''
# UI: two tabs mirroring app_local.py, backed by the API-based predictors.
with gr.Blocks(title="Scene Mood (API)") as demo:
    gr.Markdown("# Scene Mood Classifier - API Version. Upload a short **video** or an **image + audio** pair.")
    with gr.Tab("Video"):
        v = gr.Video(sources=["upload"], height=240)
        # Slider controlling the image/audio fusion weight passed to fuse_probs.
        alpha_v = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
                            label="Fusion weight α (image ↔ audio)",
                            info="α=1 trusts image only; α=0 trusts audio only.")
        btn_v = gr.Button("Analyze")
        # Outputs: prediction label, per-label probabilities, latency/meta JSON.
        out_v1, out_v2, out_v3 = gr.Label(), gr.JSON(), gr.JSON()
        btn_v.click(predict_video, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])

    with gr.Tab("Image + Audio"):
        img = gr.Image(type="pil", height=240, label="Image")
        aud = gr.Audio(sources=["upload"], type="filepath", label="Audio")
        # Same fusion-weight slider for the image+audio path.
        alpha_ia = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
                             label="Fusion weight α (image ↔ audio)",
                             info="α=1 trusts image only; α=0 trusts audio only.")
        btn_ia = gr.Button("Analyze")
        out_i1, out_i2, out_i3 = gr.Label(), gr.JSON(), gr.JSON()
        btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])

if __name__ == "__main__":
    demo.launch()
fusion-app/app_local.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr
import json
from pathlib import Path
from utils_media import video_to_frame_audio, load_audio_16k, log_inference
from fusion import clip_image_probs, wav2vec2_embed_energy, wav2vec2_zero_shot_probs, audio_prior_from_rms, fuse_probs, top1_label_from_probs

# Label names from labels.json; list order defines the probability-vector order.
# NOTE(review): "lables" is a typo for "labels"; kept as-is because the name is
# referenced throughout this module.
HERE = Path(__file__).parent
lables_PATH = HERE / "labels.json"

lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
13
+
14
def predict_vid(video, alpha=0.7):
    """Local-model video mood prediction: CLIP over frames + RMS audio prior.

    Returns (top-1 label, {label: rounded prob}, latency/meta dict).
    NOTE(review): the audio branch here uses only the loudness prior
    (audio_prior_from_rms) while the image+audio path also uses
    wav2vec2_zero_shot_probs — confirm this asymmetry is intentional.
    """
    import time, numpy as np
    t0 = time.time()
    frames, wave, meta = video_to_frame_audio(video, target_frames=64, fps_cap=3.0)

    # IMAGE: average per-frame CLIP probabilities.
    t_img0 = time.time()
    per_frame = []
    for pil in frames:
        per_frame.append(clip_image_probs(pil))  # np[K]
    p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
    t_img = time.time() - t_img0

    # AUDIO: the embedding is computed only for its RMS side-channel here.
    t_aud0 = time.time()
    _, rms = wav2vec2_embed_energy(wave)  # embedding computed; report rms
    p_aud = audio_prior_from_rms(rms)  # np[K]
    t_aud = time.time() - t_aud0

    # FUSION: convex mix weighted by alpha (1 = image only, 0 = audio only).
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label_from_probs(p)
    probs = {k: round(float(v), 4) for k, v in zip(lables, p)}
    lat = {
        "t_image_ms": int(t_img * 1000),
        "t_audio_ms": int(t_aud * 1000),
        "t_fuse_ms": int(t_fus * 1000),
        "t_total_ms": int((time.time() - t0) * 1000),
        "rms": round(float(rms), 4),
        "n_frames": meta.get("n_frames"),
        "fps_used": round(float(meta.get("fps_used") or 0.0), 3),
        "duration_s": round(float(meta.get("duration_s") or 0.0), 2),
    }
    print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
    log_inference(engine="local", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
50
+
51
def predict_image_audio(image, audio_path, alpha=0.7):
    """Local-model mood prediction for an image + audio pair.

    The audio distribution blends wav2vec2 zero-shot probabilities (0.8)
    with the RMS loudness prior (0.2). Returns (label, {label: prob}, latency).
    """
    import time, numpy as np
    t0 = time.time()
    wave = load_audio_16k(audio_path)

    # IMAGE: single CLIP zero-shot pass.
    t_img0 = time.time()
    p_img = clip_image_probs(image)
    t_img = time.time() - t_img0

    # AUDIO: zero-shot probs softened with a loudness-based prior.
    t_aud0 = time.time()
    p_aud = wav2vec2_zero_shot_probs(wave, temperature=1.0)
    _, rms = wav2vec2_embed_energy(wave)
    p_rms = audio_prior_from_rms(rms)
    p_aud = 0.8 * p_aud + 0.2 * p_rms
    t_aud = time.time() - t_aud0

    # FUSION
    t_fus0 = time.time()
    p = fuse_probs(p_img, p_aud, alpha=float(alpha))
    t_fus = time.time() - t_fus0

    pred = top1_label_from_probs(p)
    probs = {k: float(v) for k, v in zip(lables, p)}
    lat = {
        "t_image_ms": int(t_img*1000),
        "t_audio_ms": int(t_aud*1000),
        "t_fuse_ms": int(t_fus*1000),
        "t_total_ms": int((time.time()-t0)*1000),
        "rms": round(float(rms), 4),
    }
    print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
    log_inference(engine="local", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
    return pred, probs, lat
83
+
84
+
85
# UI: two tabs (full video, or image + audio pair) over the local predictors.
with gr.Blocks(title="Scene Mood Detection") as demo:
    gr.Markdown("# Scene Mood Classifier - Local \nUpload a short **video** or an **image + audio** pair.")
    with gr.Tab("Video"):
        v = gr.Video(sources=["upload"], height=240)

        # Slider controlling the image/audio fusion weight.
        alpha_v = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.7, step=0.05,
            label="Fusion weight α (image ↔ audio)",
            info="α=1 trusts image only; α=0 trusts audio only."
        )

        btn_v = gr.Button("Analyze")
        out_v1 = gr.Label(label="Prediction")
        out_v2 = gr.JSON(label="Probabilities")
        out_v3 = gr.JSON(label="Latency (ms)")
        btn_v.click(predict_vid, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])

    with gr.Tab("Image + Audio"):
        img = gr.Image(type="pil", height=240)
        aud = gr.Audio(sources=["upload"], type="filepath")

        # Same fusion-weight slider for the image+audio tab.
        alpha_ia = gr.Slider(
            minimum=0.0, maximum=1.0, value=0.7, step=0.05,
            label="Fusion weight α (image ↔ audio)",
            info="α=1 trusts image only; α=0 trusts audio only."
        )

        btn_ia = gr.Button("Analyze")
        out_i1 = gr.Label(label="Prediction")
        out_i2 = gr.JSON(label="Probabilities")
        out_i3 = gr.JSON(label="Latency (ms)")
        btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])

if __name__ == "__main__":
    demo.launch()
fusion-app/fusion.py ADDED
@@ -0,0 +1,147 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
import json
import numpy as np
import torch
import math
from transformers import CLIPProcessor, CLIPModel, Wav2Vec2Processor, Wav2Vec2Model

# NOTE(review): `math` appears unused in this module — candidate for removal.

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Label names and CLIP prompts from labels.json; list order fixes vector order.
_here = Path(__file__).parent
_labels = json.loads((_here / "labels.json").read_text())["labels"]
LABELS = [x["name"] for x in _labels]
PROMPTS = [x["prompt"] for x in _labels]

# Lazily-initialized singletons, populated on first use (see _lazy_load_models
# and _ensure_audio_prototypes) so importing this module stays cheap.
_clip_model = None
_clip_proc = None
_wav_model = None
_wav_proc = None
_proto_embs = None
21
+
22
def _lazy_load_models():
    """Load CLIP and wav2vec2 weights on first call and cache them globally.

    Models are moved to DEVICE and put in eval mode; repeated calls are no-ops.
    """
    global _clip_model, _clip_proc, _wav_model, _wav_proc
    if _clip_model is None:
        _clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
        _clip_model.eval()
        _clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    if _wav_model is None:
        _wav_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE)
        _wav_model.eval()
        _wav_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
32
+
33
+
34
+ def _sine(sr, freq, dur, amp=0.2):
35
+ t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
36
+ return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
37
+
38
+ def _burst_noise(sr, dur, amp=0.2):
39
+ x = np.random.randn(int(sr*dur)).astype(np.float32)
40
+ # fast attack / fast decay envelope
41
+ n = x.size
42
+ env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
43
+ env = np.pad(env, (0, n-env.size), constant_values=1.0)
44
+ env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
45
+ return (amp * x * env).astype(np.float32)
46
+
47
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
    """Peak-normalized three-note chord: root, third, and octave.

    Bug fix: the previous ratios (3/2 and 4/3) are a perfect fifth and a
    perfect fourth — neither is a third — and `minor=True` selected the
    *larger* interval. Use just-intonation thirds: minor 6/5, major 5/4,
    so the minor/major flag actually changes chord quality.
    """
    third = 6/5 if minor else 5/4
    f1, f2, f3 = base, base*third, base*2
    w = (_sine(sr, f1, dur, amp) + _sine(sr, f2, dur, amp*0.7) + _sine(sr, f3, dur, amp*0.5))
    # Normalize to unit peak so prototype loudness is comparable across labels.
    return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
52
+
53
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
    """Tiny synthetic clips that caricature each mood label.

    Their wav2vec2 embeddings act as zero-shot class anchors; keys must
    match the label names in labels.json.
    """
    return {
        "calm": _sine(sr, 220, dur, amp=0.08),  # quiet low sine
        "energetic": _burst_noise(sr, dur, amp=0.35),  # noisy, punchy
        "suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),  # low drones
        "joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),  # C major-ish
        "sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),  # C minor-ish
    }
62
+
63
def _ensure_audio_prototypes():
    """Build and cache one L2-normalized wav2vec2 embedding per label.

    Embeds the synthetic prototype clips once through the local model;
    results are memoized in the module-level _proto_embs.
    """
    global _proto_embs
    if _proto_embs is not None:
        return
    _lazy_load_models()
    waves = _synthesize_audio_prototypes()
    embs = {}
    for lbl, wav in waves.items():
        emb, _ = wav2vec2_embed_energy(wav)  # normalized 768-d embedding
        embs[lbl] = emb / (np.linalg.norm(emb) + 1e-8)  # re-normalize defensively
    _proto_embs = embs  # cache
74
+
75
+ # image branch (CLIP)
76
@torch.no_grad()
def clip_image_probs(pil_image, prompts=PROMPTS):
    """Zero-shot CLIP distribution over `prompts` for one PIL image.

    Softmax over cosine similarities between the image embedding and each
    prompt's text embedding; returns np.float32[K] aligned with LABELS.
    NOTE(review): text features are recomputed on every call and could be
    cached, since `prompts` rarely changes.
    """
    _lazy_load_models()
    # text features
    text_inputs = _clip_proc(text=prompts, return_tensors="pt", padding=True).to(DEVICE)
    text_feats = _clip_model.get_text_features(**text_inputs)  # [K, d]
    text_feats = torch.nn.functional.normalize(text_feats, dim=-1)

    # image features
    img_inputs = _clip_proc(images=pil_image, return_tensors="pt").to(DEVICE)
    img_feats = _clip_model.get_image_features(**img_inputs)  # [1, d]
    img_feats = torch.nn.functional.normalize(img_feats, dim=-1)

    # similarity to softmax (cosine, since both sides are unit-normalized)
    sims = (img_feats @ text_feats.T).squeeze(0)  # [K]
    probs = torch.softmax(sims, dim=-1)  # [K]
    return probs.detach().cpu().numpy()  # np.float32[K]
94
+
95
+ # audio branch (Wav2Vec2 + energy prior)
96
@torch.no_grad()
def wav2vec2_embed_energy(wave_16k: np.ndarray):
    """Return (L2-normalized mean-pooled wav2vec2 embedding, RMS loudness)."""
    _lazy_load_models()
    # wave_16k must be float32 mono in [-1, 1]
    inp = _wav_proc(wave_16k, sampling_rate=16000, return_tensors="pt").to(DEVICE)
    out = _wav_model(**inp).last_hidden_state  # [1, T, 768]
    emb = out.mean(dim=1).squeeze(0)  # [768] time-average pooling
    emb = torch.nn.functional.normalize(emb, dim=-1)
    emb_np = emb.detach().cpu().numpy()

    # simple loudness proxy (RMS); roughly 0..~1 for in-range input
    rms = float(np.sqrt(np.mean(np.square(wave_16k))))  # 0..~1
    return emb_np, rms
109
+
110
def audio_prior_from_rms(rms: float) -> np.ndarray:
    """Heuristic mood prior driven only by loudness (RMS clamped to [0, 1]).

    Returns a normalized np.float32[5] in labels.json order:
    calm, energetic, suspense, joyful, sad.
    """
    r = min(1.0, max(0.0, rms))
    weights = np.array(
        [
            max(0.0, 1.0 - 2.0 * r),            # calm: high when quiet
            r ** 0.8,                            # energetic: grows with loudness
            0.6 * (1.0 - abs(r - 0.5) * 2),      # suspense: peaks at mid loudness
            (r ** 0.9) * 0.9 + 0.1 * (1 - r),    # joyful: loud-ish, small quiet bias
            max(0.0, 1.2 - 2.2 * r),             # sad: high when quiet
        ],
        dtype=np.float32,
    )
    # Floor at a small epsilon so the distribution never contains exact zeros.
    weights = np.clip(weights, 1e-4, None)
    return weights / weights.sum()
124
+
125
@torch.no_grad()
def wav2vec2_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot audio label distribution via cosine similarity to prototypes.

    Lower temperature sharpens the softmax; returns np.float32[K] over LABELS.
    """
    _ensure_audio_prototypes()
    emb, _ = wav2vec2_embed_energy(wave_16k)  # normalized already
    emb = emb / (np.linalg.norm(emb) + 1e-8)  # defensive re-normalization
    sims = np.array([float(np.dot(emb, _proto_embs[lbl])) for lbl in LABELS], dtype=np.float32)  # [K]
    # temperature softmax for tunable sharpness
    z = sims / max(1e-6, float(temperature))
    z = z - z.max()  # numerical stability
    p = np.exp(z); p /= (p.sum() + 1e-8)
    return p.astype(np.float32)
136
+
137
+ # fusion
138
def fuse_probs(image_probs: np.ndarray, audio_prior: np.ndarray, alpha: float = 0.7) -> np.ndarray:
    """Late fusion of image and audio distributions.

    alpha close to 1 favors the image branch; close to 0 favors audio.
    Both inputs are renormalized first; the result sums to 1.
    """
    img_norm = image_probs / (image_probs.sum() + 1e-8)
    aud_norm = audio_prior / (audio_prior.sum() + 1e-8)
    blended = alpha * img_norm + (1.0 - alpha) * aud_norm
    return blended / (blended.sum() + 1e-8)
145
+
146
def top1_label_from_probs(p: np.ndarray) -> str:
    """Label name for the arg-max entry of the probability vector."""
    winner = int(np.argmax(p))
    return LABELS[winner]
fusion-app/labels.json ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "labels": [
3
+ {"name": "calm", "prompt": "a tranquil, peaceful scene", "def": "low motion, soft colors, quiet audio"},
4
+ {"name": "energetic", "prompt": "a high-energy lively scene", "def": "fast motion, bright colors, loud/fast audio"},
5
+ {"name": "suspense", "prompt": "a tense, foreboding scene", "def": "dim colors, slow build, ominous drones"},
6
+ {"name": "joyful", "prompt": "a happy, upbeat, celebratory scene", "def": "warm colors, smiles, upbeat music"},
7
+ {"name": "sad", "prompt": "a somber, gloomy scene", "def": "cool/dark tones, slow pace, quiet audio"}
8
+ ]
9
+ }
10
+
11
+
fusion-app/tests/test_shapes.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
import numpy as np

def test_concat_dim():
    """Fused feature width: CLIP image (512) + wav2vec2 audio (768) = 1280."""
    img_feat = np.random.randn(512)
    aud_feat = np.random.randn(768)
    assert img_feat.size + aud_feat.size == 1280
fusion-app/tests/test_smoke.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
def test_imports():
    """Smoke test: the app's core third-party dependencies are importable."""
    import gradio, numpy  # noqa
    assert True
fusion-app/utils_media.py ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import csv
2
+ import json
3
+ from pathlib import Path
4
+ import time
5
+ from typing import Any, Dict, Tuple, Union
6
+ import io
7
+ import numpy as np
8
+ from PIL import Image
9
+ import ffmpeg
10
+ import tempfile
11
+ from pydub import AudioSegment
12
+
13
+ # helpers
14
def probe_duration_sec(video_path: str) -> float:
    """Duration of *video_path* in seconds via ffprobe; 0.0 when unknown.

    Any failure (missing file, broken container, ffmpeg unavailable) is
    treated as an unknown duration rather than an error.
    """
    try:
        info = ffmpeg.probe(video_path)
        raw = info.get("format", {}).get("duration", 0.0)
        return float(raw) or 0.0
    except Exception:
        return 0.0
20
+
21
+ def _to_path(p: Union[str, dict, Path]) -> str:
22
+ if isinstance(p, dict):
23
+ return p.get("name") or p.get("path") or p.get("data") or ""
24
+ return str(p)
25
+
26
def _audiosegment_float32(seg: AudioSegment) -> np.ndarray:
    """Resample to 16 kHz mono 16-bit, then scale samples to float32 in [-1, 1)."""
    seg = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16-bit
    samples = np.array(seg.get_array_of_samples(), dtype=np.int16)
    return (samples.astype(np.float32) / 32768.0)
30
+
31
+ # public API
32
def video_to_frame_audio(
    video_in,
    target_frames: int = 64,  # aim for this many frames total
    fps_cap: float = 3.0  # never sample faster than this
) -> Tuple[list, np.ndarray, dict]:
    """Decode a video into (PIL frames, 16 kHz mono float32 audio, meta dict).

    Frames are extracted with ffmpeg at a rate chosen so roughly
    `target_frames` are produced over the clip, capped at `fps_cap`; the
    audio track is returned in full. Meta carries duration_s, fps_used and
    n_frames. Raises ValueError on an empty path; ffmpeg errors propagate.
    """
    video_path = _to_path(video_in)
    if not video_path:
        raise ValueError("Empty video path")

    dur = probe_duration_sec(video_path)
    # Unknown duration: fall back to 1 fps. Otherwise spread target_frames
    # over the clip, at least one frame total, at most fps_cap per second.
    if dur <= 0:
        fps = 1.0
    else:
        fps = min(fps_cap, max(1.0 / dur, target_frames / dur))

    frames = []
    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        out_pattern = str(td / "frame_%06d.jpg")
        # Dump JPEG frames at the chosen rate; qscale=2 keeps quality high.
        (
            ffmpeg
            .input(video_path)
            .output(out_pattern, vf=f"fps={fps}", vsync="vfr", qscale=2)
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        for p in sorted(td.glob("frame_*.jpg")):
            frames.append(Image.open(p).convert("RGB"))

    # Audio: pydub/ffmpeg pulls the track straight out of the container.
    seg = AudioSegment.from_file(video_path)
    audio16k = _audiosegment_float32(seg)

    meta = {"duration_s": float(dur), "fps_used": float(fps), "n_frames": int(len(frames))}
    return frames, audio16k, meta
70
+
71
def load_audio_16k(audio_path_like) -> np.ndarray:
    """Load any ffmpeg-readable audio file as 16 kHz mono float32 in [-1, 1)."""
    path = _to_path(audio_path_like)
    seg = AudioSegment.from_file(path)
    return _audiosegment_float32(seg)
75
+
76
+
77
+ # Logging
78
# Default CSV sink for inference logs, kept next to this module.
DEFAULT_CSV = Path(__file__).parent / "runs_local.csv"

def now_iso() -> str:
    """Current UTC wall time as an ISO-8601-style string (second resolution).

    Bug fix: the previous implementation formatted *local* time although its
    comment claimed "UTC-ish"; pass time.gmtime() so timestamps really are UTC
    and sort consistently across machines/timezones.
    """
    return time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime())

def append_csv(csv_path: Union[str, Path] = DEFAULT_CSV, row: Dict[str, Any] = None) -> None:
    """Append one dict row to *csv_path*, writing a header for a new file.

    Lists/dicts are JSON-encoded so the CSV stays flat. No-op when row is None.
    NOTE: the header is derived from the first row ever written; later rows
    must use the same keys or csv.DictWriter raises ValueError.
    """
    if row is None:
        return
    path = Path(csv_path)
    path.parent.mkdir(parents=True, exist_ok=True)
    write_header = not path.exists()
    flat = {k: (json.dumps(v) if isinstance(v, (list, dict)) else v) for k, v in row.items()}
    with path.open("a", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(flat.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(flat)
96
+
97
def log_inference(
    *,
    engine: str,  # "local" or "api"
    mode: str,  # "video" or "image_audio"
    alpha: float,
    lat: Dict[str, Any],  # expects t_image_ms, t_audio_ms, t_fuse_ms, t_total_ms, rms
    pred: str,
    probs: Dict[str, float],
    csv_path: Union[str, Path] = DEFAULT_CSV
) -> None:
    """Append one inference record (timestamp, config, latencies, result) to CSV.

    Missing latency keys are logged as None; key order fixes the CSV header.
    """
    row = {
        "ts": now_iso(),
        "engine": engine,
        "mode": mode,
        "alpha": float(alpha),
    }
    for key in ("rms", "t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
        row[key] = lat.get(key)
    row["pred"] = pred
    row["probs"] = probs
    append_csv(csv_path, row)
122
+
123
+
124
+ # Summarizer
125
+
126
def summarize_csv(
    csv_path: Union[str, Path] = DEFAULT_CSV,
    cols = ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms")
) -> Dict[str, Dict[str, float]]:
    """Per-column p50/p95 latency summary of a runs CSV.

    Returns {} when the file does not exist; columns with no parseable
    values get NaN percentiles and n == 0.
    """
    path = Path(csv_path)
    if not path.exists():
        return {}

    with path.open("r", encoding="utf-8") as fh:
        records = list(csv.DictReader(fh))

    def _numeric_column(name):
        # Collect parseable float values; silently skip blanks and junk.
        vals = []
        for rec in records:
            raw = rec.get(name)
            if raw is None or raw == "":
                continue
            try:
                vals.append(float(raw))
            except Exception:
                continue
        return np.array(vals, dtype=float)

    summary: Dict[str, Dict[str, float]] = {}
    for name in cols:
        data = _numeric_column(name)
        if data.size:
            summary[name] = {
                "p50": float(np.percentile(data, 50)),
                "p95": float(np.percentile(data, 95)),
                "n": int(data.size),
            }
        else:
            summary[name] = {"p50": float("nan"), "p95": float("nan"), "n": 0}
    return summary
164
+
165
if __name__ == "__main__":
    # CLI usage: python fusion-app/utils_media.py [csv_path]
    # Prints p50/p95 latency percentiles from the runs CSV (or a note if empty).
    import sys
    path = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CSV
    s = summarize_csv(path)
    print(f"File: {path}")
    if not s:
        print("No rows found.")
    else:
        for k in ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
            if k in s:
                print(f"{k:>11}: p50={s[k]['p50']:.1f} ms p95={s[k]['p95']:.1f} ms n={s[k]['n']}")
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg
requirements.txt ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+ gradio
2
+ transformers
3
+ torch
4
+ torchaudio
5
+ torchvision
6
+ pydub
7
+ ffmpeg-python
8
+ numpy
9
+ pytest
10
+ huggingface_hub
11
+ datasets