Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files- .gitignore +208 -0
- LICENSE +21 -0
- README.md +6 -10
- fusion-app/app_api.py +228 -0
- fusion-app/app_local.py +123 -0
- fusion-app/fusion.py +147 -0
- fusion-app/labels.json +11 -0
- fusion-app/tests/test_shapes.py +4 -0
- fusion-app/tests/test_smoke.py +3 -0
- fusion-app/utils_media.py +176 -0
- packages.txt +1 -0
- requirements.txt +11 -0
.gitignore
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Byte-compiled / optimized / DLL files
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[codz]
|
| 4 |
+
*$py.class
|
| 5 |
+
|
| 6 |
+
# C extensions
|
| 7 |
+
*.so
|
| 8 |
+
|
| 9 |
+
# Distribution / packaging
|
| 10 |
+
.Python
|
| 11 |
+
build/
|
| 12 |
+
develop-eggs/
|
| 13 |
+
dist/
|
| 14 |
+
downloads/
|
| 15 |
+
eggs/
|
| 16 |
+
.eggs/
|
| 17 |
+
lib/
|
| 18 |
+
lib64/
|
| 19 |
+
parts/
|
| 20 |
+
sdist/
|
| 21 |
+
var/
|
| 22 |
+
wheels/
|
| 23 |
+
share/python-wheels/
|
| 24 |
+
*.egg-info/
|
| 25 |
+
.installed.cfg
|
| 26 |
+
*.egg
|
| 27 |
+
MANIFEST
|
| 28 |
+
|
| 29 |
+
# PyInstaller
|
| 30 |
+
# Usually these files are written by a python script from a template
|
| 31 |
+
# before PyInstaller builds the exe, so as to inject date/other infos into it.
|
| 32 |
+
*.manifest
|
| 33 |
+
*.spec
|
| 34 |
+
|
| 35 |
+
# Installer logs
|
| 36 |
+
pip-log.txt
|
| 37 |
+
pip-delete-this-directory.txt
|
| 38 |
+
|
| 39 |
+
# Unit test / coverage reports
|
| 40 |
+
htmlcov/
|
| 41 |
+
.tox/
|
| 42 |
+
.nox/
|
| 43 |
+
.coverage
|
| 44 |
+
.coverage.*
|
| 45 |
+
.cache
|
| 46 |
+
nosetests.xml
|
| 47 |
+
coverage.xml
|
| 48 |
+
*.cover
|
| 49 |
+
*.py.cover
|
| 50 |
+
.hypothesis/
|
| 51 |
+
.pytest_cache/
|
| 52 |
+
cover/
|
| 53 |
+
|
| 54 |
+
# Translations
|
| 55 |
+
*.mo
|
| 56 |
+
*.pot
|
| 57 |
+
|
| 58 |
+
# Django stuff:
|
| 59 |
+
*.log
|
| 60 |
+
local_settings.py
|
| 61 |
+
db.sqlite3
|
| 62 |
+
db.sqlite3-journal
|
| 63 |
+
|
| 64 |
+
# Flask stuff:
|
| 65 |
+
instance/
|
| 66 |
+
.webassets-cache
|
| 67 |
+
|
| 68 |
+
# Scrapy stuff:
|
| 69 |
+
.scrapy
|
| 70 |
+
|
| 71 |
+
# Sphinx documentation
|
| 72 |
+
docs/_build/
|
| 73 |
+
|
| 74 |
+
# PyBuilder
|
| 75 |
+
.pybuilder/
|
| 76 |
+
target/
|
| 77 |
+
|
| 78 |
+
# Jupyter Notebook
|
| 79 |
+
.ipynb_checkpoints
|
| 80 |
+
|
| 81 |
+
# IPython
|
| 82 |
+
profile_default/
|
| 83 |
+
ipython_config.py
|
| 84 |
+
|
| 85 |
+
# pyenv
|
| 86 |
+
# For a library or package, you might want to ignore these files since the code is
|
| 87 |
+
# intended to run in multiple environments; otherwise, check them in:
|
| 88 |
+
# .python-version
|
| 89 |
+
|
| 90 |
+
# pipenv
|
| 91 |
+
# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
|
| 92 |
+
# However, in case of collaboration, if having platform-specific dependencies or dependencies
|
| 93 |
+
# having no cross-platform support, pipenv may install dependencies that don't work, or not
|
| 94 |
+
# install all needed dependencies.
|
| 95 |
+
#Pipfile.lock
|
| 96 |
+
|
| 97 |
+
# UV
|
| 98 |
+
# Similar to Pipfile.lock, it is generally recommended to include uv.lock in version control.
|
| 99 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 100 |
+
# commonly ignored for libraries.
|
| 101 |
+
#uv.lock
|
| 102 |
+
|
| 103 |
+
# poetry
|
| 104 |
+
# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
|
| 105 |
+
# This is especially recommended for binary packages to ensure reproducibility, and is more
|
| 106 |
+
# commonly ignored for libraries.
|
| 107 |
+
# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
|
| 108 |
+
#poetry.lock
|
| 109 |
+
#poetry.toml
|
| 110 |
+
|
| 111 |
+
# pdm
|
| 112 |
+
# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
|
| 113 |
+
# pdm recommends including project-wide configuration in pdm.toml, but excluding .pdm-python.
|
| 114 |
+
# https://pdm-project.org/en/latest/usage/project/#working-with-version-control
|
| 115 |
+
#pdm.lock
|
| 116 |
+
#pdm.toml
|
| 117 |
+
.pdm-python
|
| 118 |
+
.pdm-build/
|
| 119 |
+
|
| 120 |
+
# pixi
|
| 121 |
+
# Similar to Pipfile.lock, it is generally recommended to include pixi.lock in version control.
|
| 122 |
+
#pixi.lock
|
| 123 |
+
# Pixi creates a virtual environment in the .pixi directory, just like venv module creates one
|
| 124 |
+
# in the .venv directory. It is recommended not to include this directory in version control.
|
| 125 |
+
.pixi
|
| 126 |
+
|
| 127 |
+
# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
|
| 128 |
+
__pypackages__/
|
| 129 |
+
|
| 130 |
+
# Celery stuff
|
| 131 |
+
celerybeat-schedule
|
| 132 |
+
celerybeat.pid
|
| 133 |
+
|
| 134 |
+
# SageMath parsed files
|
| 135 |
+
*.sage.py
|
| 136 |
+
|
| 137 |
+
# Environments
|
| 138 |
+
.env
|
| 139 |
+
.envrc
|
| 140 |
+
.venv
|
| 141 |
+
env/
|
| 142 |
+
venv/
|
| 143 |
+
ENV/
|
| 144 |
+
env.bak/
|
| 145 |
+
venv.bak/
|
| 146 |
+
|
| 147 |
+
# Spyder project settings
|
| 148 |
+
.spyderproject
|
| 149 |
+
.spyproject
|
| 150 |
+
|
| 151 |
+
# Rope project settings
|
| 152 |
+
.ropeproject
|
| 153 |
+
|
| 154 |
+
# mkdocs documentation
|
| 155 |
+
/site
|
| 156 |
+
|
| 157 |
+
# mypy
|
| 158 |
+
.mypy_cache/
|
| 159 |
+
.dmypy.json
|
| 160 |
+
dmypy.json
|
| 161 |
+
|
| 162 |
+
# Pyre type checker
|
| 163 |
+
.pyre/
|
| 164 |
+
|
| 165 |
+
# pytype static type analyzer
|
| 166 |
+
.pytype/
|
| 167 |
+
|
| 168 |
+
# Cython debug symbols
|
| 169 |
+
cython_debug/
|
| 170 |
+
|
| 171 |
+
# PyCharm
|
| 172 |
+
# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
|
| 173 |
+
# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
|
| 174 |
+
# and can be added to the global gitignore or merged into this file. For a more nuclear
|
| 175 |
+
# option (not recommended) you can uncomment the following to ignore the entire idea folder.
|
| 176 |
+
#.idea/
|
| 177 |
+
|
| 178 |
+
# Abstra
|
| 179 |
+
# Abstra is an AI-powered process automation framework.
|
| 180 |
+
# Ignore directories containing user credentials, local state, and settings.
|
| 181 |
+
# Learn more at https://abstra.io/docs
|
| 182 |
+
.abstra/
|
| 183 |
+
|
| 184 |
+
# Visual Studio Code
|
| 185 |
+
# Visual Studio Code specific template is maintained in a separate VisualStudioCode.gitignore
|
| 186 |
+
# that can be found at https://github.com/github/gitignore/blob/main/Global/VisualStudioCode.gitignore
|
| 187 |
+
# and can be added to the global gitignore or merged into this file. However, if you prefer,
|
| 188 |
+
# you could uncomment the following to ignore the entire vscode folder
|
| 189 |
+
# .vscode/
|
| 190 |
+
|
| 191 |
+
# Ruff stuff:
|
| 192 |
+
.ruff_cache/
|
| 193 |
+
|
| 194 |
+
# PyPI configuration file
|
| 195 |
+
.pypirc
|
| 196 |
+
|
| 197 |
+
# Cursor
|
| 198 |
+
# Cursor is an AI-powered code editor. `.cursorignore` specifies files/directories to
|
| 199 |
+
# exclude from AI features like autocomplete and code analysis. Recommended for sensitive data
|
| 200 |
+
# refer to https://docs.cursor.com/context/ignore-files
|
| 201 |
+
.cursorignore
|
| 202 |
+
.cursorindexingignore
|
| 203 |
+
|
| 204 |
+
# Marimo
|
| 205 |
+
marimo/_static/
|
| 206 |
+
marimo/_lsp/
|
| 207 |
+
__marimo__/
|
| 208 |
+
DSCS553_CS1_Assignment.pdf
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Shreya Boyane
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
CHANGED
|
@@ -1,13 +1,9 @@
|
|
| 1 |
---
|
| 2 |
-
title: Scene Mood Classifier
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk: gradio
|
| 7 |
-
|
| 8 |
-
app_file: app.py
|
| 9 |
pinned: false
|
| 10 |
-
license: mit
|
| 11 |
---
|
| 12 |
-
|
| 13 |
-
Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
|
|
|
|
| 1 |
---
|
| 2 |
+
title: Scene Mood Classifier
|
| 3 |
+
emoji: 🎬
|
| 4 |
+
colorFrom: indigo
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: gradio
|
| 7 |
+
app_file: fusion-app/app_api.py
|
|
|
|
| 8 |
pinned: false
|
|
|
|
| 9 |
---
|
|
|
|
|
|
fusion-app/app_api.py
ADDED
|
@@ -0,0 +1,228 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from __future__ import annotations
|
| 2 |
+
import io, os, time, json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from typing import List, Dict
|
| 5 |
+
import numpy as np
|
| 6 |
+
from PIL import Image
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import requests
|
| 9 |
+
from huggingface_hub import InferenceClient
|
| 10 |
+
from pydub import AudioSegment
|
| 11 |
+
from utils_media import video_to_frame_audio, load_audio_16k, log_inference
|
| 12 |
+
|
| 13 |
+
HERE = Path(__file__).parent
|
| 14 |
+
LABEL_ITEMS = json.loads((HERE / "labels.json").read_text())["labels"]
|
| 15 |
+
LABELS = [x["name"] for x in LABEL_ITEMS]
|
| 16 |
+
PROMPTS = [x["prompt"] for x in LABEL_ITEMS]
|
| 17 |
+
|
| 18 |
+
CLIP_MODEL = "openai/clip-vit-base-patch32"
|
| 19 |
+
W2V2_MODEL = "facebook/wav2vec2-base"
|
| 20 |
+
|
| 21 |
+
HF_TOKEN = os.getenv("HF_TOKEN")
|
| 22 |
+
if not HF_TOKEN:
|
| 23 |
+
raise RuntimeError("Missing HF_TOKEN in environment.")
|
| 24 |
+
|
| 25 |
+
client = InferenceClient(token=HF_TOKEN)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
def _img_to_jpeg_bytes(pil: Image.Image) -> bytes:
|
| 30 |
+
buf = io.BytesIO()
|
| 31 |
+
pil.convert("RGB").save(buf, format="JPEG", quality=90)
|
| 32 |
+
return buf.getvalue()
|
| 33 |
+
|
| 34 |
+
def clip_api_probs(pil: Image.Image, prompts: List[str] = PROMPTS) -> np.ndarray:
|
| 35 |
+
|
| 36 |
+
result = client.zero_shot_image_classification(
|
| 37 |
+
image=pil, candidate_labels=prompts,
|
| 38 |
+
hypothesis_template="{}",
|
| 39 |
+
model=CLIP_MODEL,
|
| 40 |
+
)
|
| 41 |
+
|
| 42 |
+
scores = {d["label"]: float(d["score"]) for d in result}
|
| 43 |
+
arr = np.array([scores.get(p, 0.0) for p in prompts], dtype=np.float32)
|
| 44 |
+
|
| 45 |
+
s = arr.sum(); arr = arr / s if s > 0 else np.ones_like(arr)/len(arr)
|
| 46 |
+
return arr
|
| 47 |
+
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
def _wave_float32_to_wav_bytes(wave_16k: np.ndarray, sr=16000) -> bytes:
|
| 51 |
+
|
| 52 |
+
samples = (np.clip(wave_16k, -1, 1) * 32767.0).astype(np.int16)
|
| 53 |
+
seg = AudioSegment(
|
| 54 |
+
samples.tobytes(), frame_rate=sr, sample_width=2, channels=1
|
| 55 |
+
)
|
| 56 |
+
out = io.BytesIO()
|
| 57 |
+
seg.export(out, format="wav")
|
| 58 |
+
return out.getvalue()
|
| 59 |
+
|
| 60 |
+
def w2v2_api_embed(wave_16k: np.ndarray) -> np.ndarray:
|
| 61 |
+
wav_bytes = _wave_float32_to_wav_bytes(wave_16k)
|
| 62 |
+
|
| 63 |
+
url = f"https://api-inference.huggingface.co/models/{W2V2_MODEL}"
|
| 64 |
+
hdrs = {"Authorization": f"Bearer {HF_TOKEN}"}
|
| 65 |
+
r = requests.post(url, headers=hdrs, data=wav_bytes, timeout=60)
|
| 66 |
+
r.raise_for_status()
|
| 67 |
+
arr = np.asarray(r.json(), dtype=np.float32) # shape [T, 768]
|
| 68 |
+
if arr.ndim == 3: # [batch, T, D]
|
| 69 |
+
arr = arr[0]
|
| 70 |
+
vec = arr.mean(axis=0) # [768]
|
| 71 |
+
# L2 normalize
|
| 72 |
+
n = np.linalg.norm(vec) + 1e-8
|
| 73 |
+
return (vec / n).astype(np.float32)
|
| 74 |
+
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
_PROTO_EMBS: Dict[str, np.ndarray] | None = None
|
| 78 |
+
|
| 79 |
+
def _sine(sr, freq, dur, amp=0.2):
|
| 80 |
+
t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
|
| 81 |
+
return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
|
| 82 |
+
|
| 83 |
+
def _burst_noise(sr, dur, amp=0.2):
|
| 84 |
+
x = np.random.randn(int(sr*dur)).astype(np.float32)
|
| 85 |
+
n = x.size
|
| 86 |
+
env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
|
| 87 |
+
env = np.pad(env, (0, n-env.size), constant_values=1.0)
|
| 88 |
+
env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
|
| 89 |
+
return (amp * x * env).astype(np.float32)
|
| 90 |
+
|
| 91 |
+
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
|
| 92 |
+
third = 3/2 if minor else 4/3
|
| 93 |
+
w = (_sine(sr, base, dur, amp)
|
| 94 |
+
+ _sine(sr, base*third, dur, amp*0.7)
|
| 95 |
+
+ _sine(sr, base*2, dur, amp*0.5))
|
| 96 |
+
return (w / (np.max(np.abs(w)) + 1e-6)).astype(np.float32)
|
| 97 |
+
|
| 98 |
+
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
|
| 99 |
+
return {
|
| 100 |
+
"calm": _sine(sr, 220, dur, amp=0.08),
|
| 101 |
+
"energetic": _burst_noise(sr, dur, amp=0.35),
|
| 102 |
+
"suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12),
|
| 103 |
+
"joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22),
|
| 104 |
+
"sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20),
|
| 105 |
+
}
|
| 106 |
+
|
| 107 |
+
def _ensure_proto_embs():
|
| 108 |
+
global _PROTO_EMBS
|
| 109 |
+
if _PROTO_EMBS is not None:
|
| 110 |
+
return
|
| 111 |
+
waves = _synthesize_audio_prototypes()
|
| 112 |
+
embs = {}
|
| 113 |
+
for lbl, wav in waves.items():
|
| 114 |
+
e = w2v2_api_embed(wav) # API embed L2-normalized
|
| 115 |
+
embs[lbl] = e
|
| 116 |
+
_PROTO_EMBS = embs
|
| 117 |
+
|
| 118 |
+
def w2v2_api_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
|
| 119 |
+
_ensure_proto_embs()
|
| 120 |
+
emb = w2v2_api_embed(wave_16k) # [768], normalized
|
| 121 |
+
sims = np.array([float(np.dot(emb, _PROTO_EMBS[lbl])) for lbl in LABELS], dtype=np.float32)
|
| 122 |
+
z = sims / max(1e-6, float(temperature))
|
| 123 |
+
z = z - z.max()
|
| 124 |
+
p = np.exp(z); p /= (p.sum() + 1e-8)
|
| 125 |
+
return p.astype(np.float32)
|
| 126 |
+
|
| 127 |
+
|
| 128 |
+
def fuse_probs(p_img: np.ndarray, p_aud: np.ndarray, alpha: float) -> np.ndarray:
|
| 129 |
+
p_img = p_img / (p_img.sum() + 1e-8)
|
| 130 |
+
p_aud = p_aud / (p_aud.sum() + 1e-8)
|
| 131 |
+
p = alpha * p_img + (1 - alpha) * p_aud
|
| 132 |
+
return p / (p.sum() + 1e-8)
|
| 133 |
+
|
| 134 |
+
def top1_label(p: np.ndarray) -> str:
|
| 135 |
+
return LABELS[int(np.argmax(p))]
|
| 136 |
+
|
| 137 |
+
def predict_video(video, alpha=0.7):
|
| 138 |
+
t0 = time.time()
|
| 139 |
+
|
| 140 |
+
# FULL video analysis
|
| 141 |
+
frames, wave, meta = video_to_frame_audio(video, target_frames=24, fps_cap=2.0)
|
| 142 |
+
|
| 143 |
+
# IMAGE
|
| 144 |
+
t_img0 = time.time()
|
| 145 |
+
per_frame = [clip_api_probs(pil) for pil in frames]
|
| 146 |
+
p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
|
| 147 |
+
t_img = time.time() - t_img0
|
| 148 |
+
|
| 149 |
+
# AUDIO
|
| 150 |
+
t_aud0 = time.time()
|
| 151 |
+
p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
|
| 152 |
+
t_aud = time.time() - t_aud0
|
| 153 |
+
|
| 154 |
+
# FUSION
|
| 155 |
+
t_fus0 = time.time()
|
| 156 |
+
p = fuse_probs(p_img, p_aud, alpha=float(alpha))
|
| 157 |
+
t_fus = time.time() - t_fus0
|
| 158 |
+
|
| 159 |
+
pred = top1_label(p)
|
| 160 |
+
probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
|
| 161 |
+
lat = {
|
| 162 |
+
"t_image_ms": int(t_img*1000),
|
| 163 |
+
"t_audio_ms": int(t_aud*1000),
|
| 164 |
+
"t_fuse_ms": int(t_fus*1000),
|
| 165 |
+
"t_total_ms": int((time.time()-t0)*1000),
|
| 166 |
+
"n_frames": meta.get("n_frames"),
|
| 167 |
+
"fps_used": meta.get("fps_used"),
|
| 168 |
+
"duration_s": meta.get("duration_s"),
|
| 169 |
+
}
|
| 170 |
+
log_inference(engine="api", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
|
| 171 |
+
return pred, probs, lat
|
| 172 |
+
|
| 173 |
+
def predict_image_audio(image: Image.Image, audio_path: str, alpha=0.7):
|
| 174 |
+
t0 = time.time()
|
| 175 |
+
wave = load_audio_16k(audio_path)
|
| 176 |
+
|
| 177 |
+
# IMAGE
|
| 178 |
+
t_img0 = time.time()
|
| 179 |
+
p_img = clip_api_probs(image)
|
| 180 |
+
t_img = time.time() - t_img0
|
| 181 |
+
|
| 182 |
+
# AUDIO
|
| 183 |
+
t_aud0 = time.time()
|
| 184 |
+
p_aud = w2v2_api_zero_shot_probs(wave, temperature=1.0)
|
| 185 |
+
t_aud = time.time() - t_aud0
|
| 186 |
+
|
| 187 |
+
# FUSION
|
| 188 |
+
t_fus0 = time.time()
|
| 189 |
+
p = fuse_probs(p_img, p_aud, alpha=float(alpha))
|
| 190 |
+
t_fus = time.time() - t_fus0
|
| 191 |
+
|
| 192 |
+
pred = top1_label(p)
|
| 193 |
+
probs = {k: round(float(v), 4) for k, v in zip(LABELS, p)}
|
| 194 |
+
lat = {
|
| 195 |
+
"t_image_ms": int(t_img*1000),
|
| 196 |
+
"t_audio_ms": int(t_aud*1000),
|
| 197 |
+
"t_fuse_ms": int(t_fus*1000),
|
| 198 |
+
"t_total_ms": int((time.time()-t0)*1000),
|
| 199 |
+
}
|
| 200 |
+
log_inference(engine="api", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
|
| 201 |
+
return pred, probs, lat
|
| 202 |
+
|
| 203 |
+
'''
|
| 204 |
+
Chat GPT : Create Gradio interface for the above API functions same as local app.
|
| 205 |
+
'''
|
| 206 |
+
with gr.Blocks(title="Scene Mood (API)") as demo:
|
| 207 |
+
gr.Markdown("# Scene Mood Classifier - API Version. Upload a short **video** or an **image + audio** pair.")
|
| 208 |
+
with gr.Tab("Video"):
|
| 209 |
+
v = gr.Video(sources=["upload"], height=240)
|
| 210 |
+
alpha_v = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
|
| 211 |
+
label="Fusion weight α (image ↔ audio)",
|
| 212 |
+
info="α=1 trusts image only; α=0 trusts audio only.")
|
| 213 |
+
btn_v = gr.Button("Analyze")
|
| 214 |
+
out_v1, out_v2, out_v3 = gr.Label(), gr.JSON(), gr.JSON()
|
| 215 |
+
btn_v.click(predict_video, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])
|
| 216 |
+
|
| 217 |
+
with gr.Tab("Image + Audio"):
|
| 218 |
+
img = gr.Image(type="pil", height=240, label="Image")
|
| 219 |
+
aud = gr.Audio(sources=["upload"], type="filepath", label="Audio")
|
| 220 |
+
alpha_ia = gr.Slider(0.0, 1.0, value=0.7, step=0.05,
|
| 221 |
+
label="Fusion weight α (image ↔ audio)",
|
| 222 |
+
info="α=1 trusts image only; α=0 trusts audio only.")
|
| 223 |
+
btn_ia = gr.Button("Analyze")
|
| 224 |
+
out_i1, out_i2, out_i3 = gr.Label(), gr.JSON(), gr.JSON()
|
| 225 |
+
btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])
|
| 226 |
+
|
| 227 |
+
if __name__ == "__main__":
|
| 228 |
+
demo.launch()
|
fusion-app/app_local.py
ADDED
|
@@ -0,0 +1,123 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
from utils_media import video_to_frame_audio, load_audio_16k, log_inference
|
| 5 |
+
from fusion import clip_image_probs, wav2vec2_embed_energy, wav2vec2_zero_shot_probs, audio_prior_from_rms, fuse_probs, top1_label_from_probs
|
| 6 |
+
|
| 7 |
+
HERE = Path(__file__).parent
|
| 8 |
+
lables_PATH = HERE / "labels.json"
|
| 9 |
+
|
| 10 |
+
lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
|
| 11 |
+
|
| 12 |
+
# lables = [x ["name"] for x in json.load(Path("fusion-app/labels.json").read_text())["labels"]]
|
| 13 |
+
|
| 14 |
+
def predict_vid(video, alpha=0.7):
|
| 15 |
+
import time, numpy as np
|
| 16 |
+
t0 = time.time()
|
| 17 |
+
frames, wave, meta = video_to_frame_audio(video, target_frames=64, fps_cap=3.0)
|
| 18 |
+
|
| 19 |
+
t_img0 = time.time()
|
| 20 |
+
per_frame = []
|
| 21 |
+
for pil in frames:
|
| 22 |
+
per_frame.append(clip_image_probs(pil)) # np[K]
|
| 23 |
+
p_img = np.mean(np.stack(per_frame, axis=0), axis=0)
|
| 24 |
+
t_img = time.time() - t_img0
|
| 25 |
+
|
| 26 |
+
t_aud0 = time.time()
|
| 27 |
+
_, rms = wav2vec2_embed_energy(wave) # embedding computed; report rms
|
| 28 |
+
p_aud = audio_prior_from_rms(rms) # np[K]
|
| 29 |
+
t_aud = time.time() - t_aud0
|
| 30 |
+
|
| 31 |
+
t_fus0 = time.time()
|
| 32 |
+
p = fuse_probs(p_img, p_aud, alpha=float(alpha))
|
| 33 |
+
t_fus = time.time() - t_fus0
|
| 34 |
+
|
| 35 |
+
pred = top1_label_from_probs(p)
|
| 36 |
+
probs = {k: round(float(v), 4) for k, v in zip(lables, p)}
|
| 37 |
+
lat = {
|
| 38 |
+
"t_image_ms": int(t_img * 1000),
|
| 39 |
+
"t_audio_ms": int(t_aud * 1000),
|
| 40 |
+
"t_fuse_ms": int(t_fus * 1000),
|
| 41 |
+
"t_total_ms": int((time.time() - t0) * 1000),
|
| 42 |
+
"rms": round(float(rms), 4),
|
| 43 |
+
"n_frames": meta.get("n_frames"),
|
| 44 |
+
"fps_used": round(float(meta.get("fps_used") or 0.0), 3),
|
| 45 |
+
"duration_s": round(float(meta.get("duration_s") or 0.0), 2),
|
| 46 |
+
}
|
| 47 |
+
print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
|
| 48 |
+
log_inference(engine="local", mode="video", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
|
| 49 |
+
return pred, probs, lat
|
| 50 |
+
|
| 51 |
+
def predict_image_audio(image, audio_path, alpha=0.7):
|
| 52 |
+
import time, numpy as np
|
| 53 |
+
t0 = time.time()
|
| 54 |
+
wave = load_audio_16k(audio_path)
|
| 55 |
+
|
| 56 |
+
t_img0 = time.time()
|
| 57 |
+
p_img = clip_image_probs(image)
|
| 58 |
+
t_img = time.time() - t_img0
|
| 59 |
+
|
| 60 |
+
t_aud0 = time.time()
|
| 61 |
+
p_aud = wav2vec2_zero_shot_probs(wave, temperature=1.0)
|
| 62 |
+
_, rms = wav2vec2_embed_energy(wave)
|
| 63 |
+
p_rms = audio_prior_from_rms(rms)
|
| 64 |
+
p_aud = 0.8 * p_aud + 0.2 * p_rms
|
| 65 |
+
t_aud = time.time() - t_aud0
|
| 66 |
+
|
| 67 |
+
t_fus0 = time.time()
|
| 68 |
+
p = fuse_probs(p_img, p_aud, alpha=float(alpha))
|
| 69 |
+
t_fus = time.time() - t_fus0
|
| 70 |
+
|
| 71 |
+
pred = top1_label_from_probs(p)
|
| 72 |
+
probs = {k: float(v) for k, v in zip(lables, p)}
|
| 73 |
+
lat = {
|
| 74 |
+
"t_image_ms": int(t_img*1000),
|
| 75 |
+
"t_audio_ms": int(t_aud*1000),
|
| 76 |
+
"t_fuse_ms": int(t_fus*1000),
|
| 77 |
+
"t_total_ms": int((time.time()-t0)*1000),
|
| 78 |
+
"rms": round(float(rms), 4),
|
| 79 |
+
}
|
| 80 |
+
print("[DEBUG] p_img:", p_img, "p_aud:", p_aud, "fused:", p, "rms:", rms, flush=True)
|
| 81 |
+
log_inference(engine="local", mode="image_audio", alpha=float(alpha), lat=lat, pred=pred, probs=probs)
|
| 82 |
+
return pred, probs, lat
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
with gr.Blocks(title="Scene Mood Detection") as demo:
|
| 86 |
+
gr.Markdown("# Scene Mood Classifier - Local \nUpload a short **video** or an **image + audio** pair.")
|
| 87 |
+
with gr.Tab("Video"):
|
| 88 |
+
v = gr.Video(sources=["upload"], height=240)
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
# Chat GPT : Create Gradio slider for alpha value with label "Fusion weight α (image ↔ audio)" and info "α=1 trusts image only; α=0 trusts audio only."
|
| 92 |
+
alpha_v = gr.Slider(
|
| 93 |
+
minimum=0.0, maximum=1.0, value=0.7, step=0.05,
|
| 94 |
+
label="Fusion weight α (image ↔ audio)",
|
| 95 |
+
info="α=1 trusts image only; α=0 trusts audio only."
|
| 96 |
+
)
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
btn_v = gr.Button("Analyze")
|
| 100 |
+
out_v1 = gr.Label(label="Prediction")
|
| 101 |
+
out_v2 = gr.JSON(label="Probabilities")
|
| 102 |
+
out_v3 = gr.JSON(label="Latency (ms)")
|
| 103 |
+
btn_v.click(predict_vid, inputs=[v, alpha_v], outputs=[out_v1, out_v2, out_v3])
|
| 104 |
+
|
| 105 |
+
with gr.Tab("Image + Audio"):
|
| 106 |
+
img = gr.Image(type="pil", height=240)
|
| 107 |
+
aud = gr.Audio(sources=["upload"], type="filepath")
|
| 108 |
+
|
| 109 |
+
# Chat GPT : Create Gradio slider for alpha value with label "Fusion weight α (image ↔ audio)" and info "α=1 trusts image only; α=0 trusts audio only."
|
| 110 |
+
alpha_ia = gr.Slider(
|
| 111 |
+
minimum=0.0, maximum=1.0, value=0.7, step=0.05,
|
| 112 |
+
label="Fusion weight α (image ↔ audio)",
|
| 113 |
+
info="α=1 trusts image only; α=0 trusts audio only."
|
| 114 |
+
)
|
| 115 |
+
|
| 116 |
+
btn_ia = gr.Button("Analyze")
|
| 117 |
+
out_i1 = gr.Label(label="Prediction")
|
| 118 |
+
out_i2 = gr.JSON(label="Probabilities")
|
| 119 |
+
out_i3 = gr.JSON(label="Latency (ms)")
|
| 120 |
+
btn_ia.click(predict_image_audio, inputs=[img, aud, alpha_ia], outputs=[out_i1, out_i2, out_i3])
|
| 121 |
+
|
| 122 |
+
if __name__ == "__main__":
|
| 123 |
+
demo.launch()
|
fusion-app/fusion.py
ADDED
|
@@ -0,0 +1,147 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
import json
|
| 3 |
+
import numpy as np
|
| 4 |
+
import torch
|
| 5 |
+
import math
|
| 6 |
+
from transformers import CLIPProcessor, CLIPModel, Wav2Vec2Processor, Wav2Vec2Model
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 10 |
+
|
| 11 |
+
_here = Path(__file__).parent
|
| 12 |
+
_labels = json.loads((_here / "labels.json").read_text())["labels"]
|
| 13 |
+
LABELS = [x["name"] for x in _labels]
|
| 14 |
+
PROMPTS = [x["prompt"] for x in _labels]
|
| 15 |
+
|
| 16 |
+
_clip_model = None
|
| 17 |
+
_clip_proc = None
|
| 18 |
+
_wav_model = None
|
| 19 |
+
_wav_proc = None
|
| 20 |
+
_proto_embs = None
|
| 21 |
+
|
| 22 |
+
def _lazy_load_models():
|
| 23 |
+
global _clip_model, _clip_proc, _wav_model, _wav_proc
|
| 24 |
+
if _clip_model is None:
|
| 25 |
+
_clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(DEVICE)
|
| 26 |
+
_clip_model.eval()
|
| 27 |
+
_clip_proc = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
|
| 28 |
+
if _wav_model is None:
|
| 29 |
+
_wav_model = Wav2Vec2Model.from_pretrained("facebook/wav2vec2-base").to(DEVICE)
|
| 30 |
+
_wav_model.eval()
|
| 31 |
+
_wav_proc = Wav2Vec2Processor.from_pretrained("facebook/wav2vec2-base")
|
| 32 |
+
|
| 33 |
+
|
| 34 |
+
def _sine(sr, freq, dur, amp=0.2):
|
| 35 |
+
t = np.linspace(0, dur, int(sr*dur), endpoint=False, dtype=np.float32)
|
| 36 |
+
return (amp * np.sin(2*np.pi*freq*t)).astype(np.float32)
|
| 37 |
+
|
| 38 |
+
def _burst_noise(sr, dur, amp=0.2):
|
| 39 |
+
x = np.random.randn(int(sr*dur)).astype(np.float32)
|
| 40 |
+
# fast attack / fast decay envelope
|
| 41 |
+
n = x.size
|
| 42 |
+
env = np.linspace(0, 1, int(0.05*n), dtype=np.float32)
|
| 43 |
+
env = np.pad(env, (0, n-env.size), constant_values=1.0)
|
| 44 |
+
env[-int(0.15*n):] = np.linspace(1, 0, int(0.15*n), dtype=np.float32)
|
| 45 |
+
return (amp * x * env).astype(np.float32)
|
| 46 |
+
|
| 47 |
+
def _triad(sr, base, minor=False, dur=2.0, amp=0.18):
|
| 48 |
+
third = 3/2 if minor else 4/3 # (approx)
|
| 49 |
+
f1, f2, f3 = base, base*third, base*2
|
| 50 |
+
w = (_sine(sr,f1,dur,amp) + _sine(sr,f2,dur,amp*0.7) + _sine(sr,f3,dur,amp*0.5))
|
| 51 |
+
return (w / (np.max(np.abs(w))+1e-6)).astype(np.float32)
|
| 52 |
+
|
| 53 |
+
def _synthesize_audio_prototypes(sr=16000, dur=2.0):
|
| 54 |
+
|
| 55 |
+
return {
|
| 56 |
+
"calm": _sine(sr, 220, dur, amp=0.08), # quiet low sine
|
| 57 |
+
"energetic": _burst_noise(sr, dur, amp=0.35), # noisy, punchy
|
| 58 |
+
"suspense": _sine(sr, 70, dur, amp=0.18) + _sine(sr, 80, dur, amp=0.12), # low drones
|
| 59 |
+
"joyful": _triad(sr, 262, minor=False, dur=dur, amp=0.22), # C major-ish
|
| 60 |
+
"sad": _triad(sr, 262, minor=True, dur=dur, amp=0.20), # C minor-ish
|
| 61 |
+
}
|
| 62 |
+
|
| 63 |
+
def _ensure_audio_prototypes():
    """Embed the synthetic prototype waveforms once and cache them in `_proto_embs`."""
    global _proto_embs
    if _proto_embs is not None:
        return  # already computed for this process
    _lazy_load_models()
    cache = {}
    for label, wave in _synthesize_audio_prototypes().items():
        vec, _ = wav2vec2_embed_energy(wave)  # normalized 768-d embedding
        # re-normalize defensively before cosine comparisons
        cache[label] = vec / (np.linalg.norm(vec) + 1e-8)
    _proto_embs = cache
|
| 74 |
+
|
| 75 |
+
# image branch (CLIP)
@torch.no_grad()
def clip_image_probs(pil_image, prompts=PROMPTS):
    """Zero-shot CLIP classification: softmax over image-vs-prompt cosine similarities.

    Returns np.float32[K] probabilities aligned with `prompts`.
    """
    _lazy_load_models()

    # text features -> [K, d], unit-normalized
    txt_in = _clip_proc(text=prompts, return_tensors="pt", padding=True).to(DEVICE)
    txt_feats = torch.nn.functional.normalize(
        _clip_model.get_text_features(**txt_in), dim=-1
    )

    # image features -> [1, d], unit-normalized
    img_in = _clip_proc(images=pil_image, return_tensors="pt").to(DEVICE)
    img_feats = torch.nn.functional.normalize(
        _clip_model.get_image_features(**img_in), dim=-1
    )

    # cosine similarity -> softmax over the K prompts
    sims = (img_feats @ txt_feats.T).squeeze(0)  # [K]
    return torch.softmax(sims, dim=-1).detach().cpu().numpy()
|
| 94 |
+
|
| 95 |
+
# audio branch (Wav2Vec2 + energy prior)
@torch.no_grad()
def wav2vec2_embed_energy(wave_16k: np.ndarray):
    """Mean-pooled, L2-normalized Wav2Vec2 embedding plus an RMS loudness proxy.

    `wave_16k` must be float32 mono in [-1, 1].
    Returns (np.float32[768] embedding, float rms).
    """
    _lazy_load_models()
    batch = _wav_proc(wave_16k, sampling_rate=16000, return_tensors="pt").to(DEVICE)
    hidden = _wav_model(**batch).last_hidden_state  # [1, T, 768]
    pooled = hidden.mean(dim=1).squeeze(0)  # [768]
    pooled = torch.nn.functional.normalize(pooled, dim=-1)

    # simple loudness proxy (RMS), roughly in 0..~1 for normalized audio
    rms = float(np.sqrt(np.mean(np.square(wave_16k))))
    return pooled.detach().cpu().numpy(), rms
|
| 109 |
+
|
| 110 |
+
def audio_prior_from_rms(rms: float) -> np.ndarray:
    """Map loudness (RMS) to a heuristic prior over [calm, energetic, suspense, joyful, sad].

    Returns an np.float32[5] distribution summing to 1. RMS is clamped to [0, 1].
    """
    r = min(1.0, max(0.0, rms))  # clamp
    # hand-tuned curves: quiet -> calm/sad, loud -> energetic/joyful, mid -> suspense
    weights = np.array(
        [
            max(0.0, 1.0 - 2.0 * r),           # calm: high when quiet
            r ** 0.8,                          # energetic: grows with loudness
            0.6 * (1.0 - abs(r - 0.5) * 2),    # suspense: middle loudness
            (r ** 0.9) * 0.9 + 0.1 * (1 - r),  # joyful: energetic with a small bias
            max(0.0, 1.2 - 2.2 * r),           # sad
        ],
        dtype=np.float32,
    )
    weights = np.clip(weights, 1e-4, None)  # keep strictly positive
    return weights / weights.sum()
|
| 124 |
+
|
| 125 |
+
@torch.no_grad()
def wav2vec2_zero_shot_probs(wave_16k: np.ndarray, temperature: float = 1.0) -> np.ndarray:
    """Zero-shot mood probabilities: cosine similarity to cached audio prototypes,
    then a temperature softmax (lower temperature -> sharper distribution).

    Returns np.float32[K] aligned with LABELS.
    """
    _ensure_audio_prototypes()
    emb, _ = wav2vec2_embed_energy(wave_16k)  # normalized already
    emb = emb / (np.linalg.norm(emb) + 1e-8)
    sims = np.array(
        [float(np.dot(emb, _proto_embs[lbl])) for lbl in LABELS], dtype=np.float32
    )  # [K]
    # temperature softmax for tunable sharpness
    logits = sims / max(1e-6, float(temperature))
    logits = logits - logits.max()  # numerical stability
    expd = np.exp(logits)
    probs = expd / (expd.sum() + 1e-8)
    return probs.astype(np.float32)
|
| 136 |
+
|
| 137 |
+
# fusion
def fuse_probs(image_probs: np.ndarray, audio_prior: np.ndarray, alpha: float = 0.7) -> np.ndarray:
    """Convex blend of the two modality distributions.

    alpha closer to 1 favors the image branch, closer to 0 favors audio.
    Both inputs are renormalized first; the result sums to 1.
    """
    p_img = image_probs / (image_probs.sum() + 1e-8)
    p_aud = audio_prior / (audio_prior.sum() + 1e-8)
    fused = alpha * p_img + (1.0 - alpha) * p_aud
    return fused / (fused.sum() + 1e-8)
|
| 145 |
+
|
| 146 |
+
def top1_label_from_probs(p: np.ndarray) -> str:
    """Name of the highest-probability label (ties resolve to the first index, like argmax)."""
    best = int(p.argmax())
    return LABELS[best]
|
fusion-app/labels.json
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"labels": [
|
| 3 |
+
{"name": "calm", "prompt": "a tranquil, peaceful scene", "def": "low motion, soft colors, quiet audio"},
|
| 4 |
+
{"name": "energetic", "prompt": "a high-energy lively scene", "def": "fast motion, bright colors, loud/fast audio"},
|
| 5 |
+
{"name": "suspense", "prompt": "a tense, foreboding scene", "def": "dim colors, slow build, ominous drones"},
|
| 6 |
+
{"name": "joyful", "prompt": "a happy, upbeat, celebratory scene", "def": "warm colors, smiles, upbeat music"},
|
| 7 |
+
{"name": "sad", "prompt": "a somber, gloomy scene", "def": "cool/dark tones, slow pace, quiet audio"}
|
| 8 |
+
]
|
| 9 |
+
}
|
| 10 |
+
|
| 11 |
+
|
fusion-app/tests/test_shapes.py
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
def test_concat_dim():
    """Concatenating CLIP (512-d) and Wav2Vec2 (768-d) features yields 1280 dims."""
    img = np.random.randn(512)
    aud = np.random.randn(768)
    assert img.size + aud.size == 1280
|
fusion-app/tests/test_smoke.py
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
def test_imports():
    """Smoke test: the core third-party dependencies are importable."""
    import gradio  # noqa: F401
    import numpy  # noqa: F401
    assert True
|
fusion-app/utils_media.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import csv
|
| 2 |
+
import json
|
| 3 |
+
from pathlib import Path
|
| 4 |
+
import time
|
| 5 |
+
from typing import Any, Dict, Tuple, Union
|
| 6 |
+
import io
|
| 7 |
+
import numpy as np
|
| 8 |
+
from PIL import Image
|
| 9 |
+
import ffmpeg
|
| 10 |
+
import tempfile
|
| 11 |
+
from pydub import AudioSegment
|
| 12 |
+
|
| 13 |
+
# helpers
def probe_duration_sec(video_path: str) -> float:
    """Best-effort container duration in seconds via ffprobe; 0.0 when probing fails."""
    try:
        fmt = ffmpeg.probe(video_path).get("format", {})
        dur = float(fmt.get("duration", 0.0))
    except Exception:
        return 0.0
    return dur or 0.0
|
| 20 |
+
|
| 21 |
+
def _to_path(p: Union[str, dict, Path]) -> str:
|
| 22 |
+
if isinstance(p, dict):
|
| 23 |
+
return p.get("name") or p.get("path") or p.get("data") or ""
|
| 24 |
+
return str(p)
|
| 25 |
+
|
| 26 |
+
def _audiosegment_float32(seg: AudioSegment) -> np.ndarray:
    """Downmix to 16 kHz mono 16-bit PCM, then scale samples to float32 in [-1, 1)."""
    mono16k = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)  # 16-bit
    pcm = np.array(mono16k.get_array_of_samples(), dtype=np.int16)
    return pcm.astype(np.float32) / 32768.0
|
| 30 |
+
|
| 31 |
+
# public API
def video_to_frame_audio(
    video_in,
    target_frames: int = 64,  # aim for this many frames total
    fps_cap: float = 3.0      # never sample faster than this
) -> Tuple[list, np.ndarray, dict]:
    """Decode a video into sampled RGB frames plus its 16 kHz mono audio track.

    The sampling rate targets `target_frames` over the clip but is capped at
    `fps_cap` and floored at one frame for the whole clip; when the duration
    cannot be probed, it falls back to 1 fps.

    Returns (frames, audio16k, meta): frames is a list of PIL RGB images,
    audio16k is float32 samples in [-1, 1), and meta records duration/fps/count.

    Raises ValueError when the input resolves to an empty path.
    """
    video_path = _to_path(video_in)
    if not video_path:
        raise ValueError("Empty video path")

    dur = probe_duration_sec(video_path)

    if dur <= 0:
        fps = 1.0  # duration unknown: conservative fallback
    else:
        fps = min(fps_cap, max(1.0 / dur, target_frames / dur))

    frames = []
    with tempfile.TemporaryDirectory() as td:
        td = Path(td)
        out_pattern = str(td / "frame_%06d.jpg")

        (
            ffmpeg
            .input(video_path)
            .output(out_pattern, vf=f"fps={fps}", vsync="vfr", qscale=2)
            .overwrite_output()
            .run(capture_stdout=True, capture_stderr=True)
        )
        # Open each frame via a context manager so the file handle is closed
        # before the temp dir is removed (deletion fails on Windows with open
        # handles); convert("RGB") copies the pixel data out of the file.
        for p in sorted(td.glob("frame_*.jpg")):
            with Image.open(p) as im:
                frames.append(im.convert("RGB"))

    seg = AudioSegment.from_file(video_path)
    audio16k = _audiosegment_float32(seg)

    meta = {"duration_s": float(dur), "fps_used": float(fps), "n_frames": int(len(frames))}
    return frames, audio16k, meta
|
| 70 |
+
|
| 71 |
+
def load_audio_16k(audio_path_like) -> np.ndarray:
    """Decode any audio file into 16 kHz mono float32 samples in [-1, 1)."""
    seg = AudioSegment.from_file(_to_path(audio_path_like))
    return _audiosegment_float32(seg)
|
| 75 |
+
|
| 76 |
+
|
| 77 |
+
# Logging
|
| 78 |
+
DEFAULT_CSV = Path(__file__).parent / "runs_local.csv"
|
| 79 |
+
|
| 80 |
+
def now_iso() -> str:
    """Local wall-clock time as 'YYYY-MM-DDTHH:MM:SS' (sufficient for ordering/eyeballing)."""
    return time.strftime("%Y-%m-%dT%H:%M:%S", time.localtime())
|
| 83 |
+
|
| 84 |
+
def append_csv(csv_path: Union[str, Path] = DEFAULT_CSV, row: Dict[str, Any] = None) -> None:
    """Append one row to a CSV log, writing a header when the file is new.

    List/dict values are JSON-encoded so they survive the CSV round-trip.
    A None row is a no-op. The header is derived from this row's keys.
    """
    if row is None:
        return
    target = Path(csv_path)
    target.parent.mkdir(parents=True, exist_ok=True)
    write_header = not target.exists()
    encoded = {
        key: (json.dumps(value) if isinstance(value, (list, dict)) else value)
        for key, value in row.items()
    }
    with target.open("a", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=list(encoded.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(encoded)
|
| 96 |
+
|
| 97 |
+
def log_inference(
    *,
    engine: str,   # "local" or "api"
    mode: str,     # "video" or "image_audio"
    alpha: float,
    lat: Dict[str, Any],  # expects keys like t_image_ms, t_audio_ms, t_fuse_ms, t_total_ms, rms
    pred: str,
    probs: Dict[str, float],
    csv_path: Union[str, Path] = DEFAULT_CSV
) -> None:
    """Flatten one inference run (timings, prediction, probabilities) into a CSV row."""
    row: Dict[str, Any] = {
        "ts": now_iso(),
        "engine": engine,
        "mode": mode,
        "alpha": float(alpha),
    }
    # latency/loudness fields come straight from the caller-supplied dict
    for key in ("rms", "t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
        row[key] = lat.get(key)
    row["pred"] = pred
    row["probs"] = probs
    append_csv(csv_path, row)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
# Summarizer

def summarize_csv(
    csv_path: Union[str, Path] = DEFAULT_CSV,
    cols = ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms")
) -> Dict[str, Dict[str, float]]:
    """
    Compute p50/p95 for latency columns. Returns a dict so you can print or consume it.
    """
    path = Path(csv_path)
    if not path.exists():
        return {}

    with path.open("r", encoding="utf-8") as fh:
        records = list(csv.DictReader(fh))

    def _numeric(column):
        # collect parseable floats for one column, skipping blanks and junk
        values = []
        for rec in records:
            raw = rec.get(column)
            if raw is None or raw == "":
                continue
            try:
                values.append(float(raw))
            except Exception:
                pass
        return np.array(values, dtype=float)

    summary: Dict[str, Dict[str, float]] = {}
    for column in cols:
        data = _numeric(column)
        if data.size:
            summary[column] = {
                "p50": float(np.percentile(data, 50)),
                "p95": float(np.percentile(data, 95)),
                "n": int(data.size),
            }
        else:
            summary[column] = {"p50": float("nan"), "p95": float("nan"), "n": 0}
    return summary
|
| 164 |
+
|
| 165 |
+
if __name__ == "__main__":
    # CLI usage: python fusion-app/utils_media.py [csv_path]
    import sys

    target = sys.argv[1] if len(sys.argv) > 1 else DEFAULT_CSV
    stats = summarize_csv(target)
    print(f"File: {target}")
    if not stats:
        print("No rows found.")
    else:
        for key in ("t_image_ms", "t_audio_ms", "t_fuse_ms", "t_total_ms"):
            if key in stats:
                entry = stats[key]
                print(f"{key:>11}: p50={entry['p50']:.1f} ms p95={entry['p95']:.1f} ms n={entry['n']}")
|
packages.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
ffmpeg
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio
|
| 2 |
+
transformers
|
| 3 |
+
torch
|
| 4 |
+
torchaudio
|
| 5 |
+
torchvision
|
| 6 |
+
pydub
|
| 7 |
+
ffmpeg-python
|
| 8 |
+
numpy
|
| 9 |
+
pytest
|
| 10 |
+
huggingface_hub
|
| 11 |
+
datasets
|