SonicaB committed on
Commit
5ce022b
·
verified ·
1 Parent(s): 08926b6

Upload folder using huggingface_hub

Browse files
fusion-app/app_local.py CHANGED
@@ -3,6 +3,7 @@ import time
3
  import json
4
  import numpy as np
5
  from pathlib import Path
 
6
 
7
  HERE = Path(__file__).parent
8
  lables_PATH = HERE / "labels.json"
@@ -13,16 +14,19 @@ lables = [x["name"] for x in json.loads(lables_PATH.read_text())["labels"]]
13
 
14
  def predict_vid(video):
15
  t0= time.time()
 
16
  probs = np.ones(len(lables))/len(lables)
17
  pred = lables[int(np.argmax(probs))]
18
- lat = {"t_total_ms": int((time.time()-t0)*1000), "note": "dummy"}
19
  return pred, {k: float(v) for k,v in zip(lables, probs)}, lat
20
 
21
  def predict_aud_img(audio, image):
22
  t0 = time.time()
 
 
23
  probs = np.ones(len(lables)) / len(lables)
24
  pred = lables[int(np.argmax(probs))]
25
- lat = {"t_total_ms": int((time.time()-t0)*1000), "note": "dummy"}
26
  return pred, {k: float(v) for k,v in zip(lables, probs)}, lat
27
 
28
 
 
3
  import json
4
  import numpy as np
5
  from pathlib import Path
6
+ from utils_media import video_to_frame_audio, load_audio_16k
7
 
8
  HERE = Path(__file__).parent
9
  lables_PATH = HERE / "labels.json"
 
14
 
15
def predict_vid(video):
    """Dummy video classifier: decode the clip, then return uniform scores.

    Returns (top_label, {label: prob}, latency_dict); the decode step exists
    so the reported latency reflects real media handling.
    """
    start = time.time()
    frame, audio16k = video_to_frame_audio(video)  # decoded but unused by the dummy model
    n = len(lables)
    probs = np.full(n, 1.0 / n)  # uniform placeholder distribution
    pred = lables[int(np.argmax(probs))]
    lat = {"t_total_ms": int((time.time() - start) * 1000), "note": "decoded media"}
    return pred, {name: float(p) for name, p in zip(lables, probs)}, lat
22
 
23
def predict_aud_img(audio, image):
    """Dummy audio+image classifier: load inputs, then return uniform scores.

    Returns (top_label, {label: prob}, latency_dict); loading the audio keeps
    the reported latency realistic even though the model is a stub.
    """
    start = time.time()
    wave = load_audio_16k(audio)  # loaded but unused by the dummy model
    frame = image
    n = len(lables)
    probs = np.full(n, 1.0 / n)  # uniform placeholder distribution
    pred = lables[int(np.argmax(probs))]
    lat = {"t_total_ms": int((time.time() - start) * 1000), "note": "loaded media"}
    return pred, {name: float(p) for name, p in zip(lables, probs)}, lat
31
 
32
 
fusion-app/labels.json CHANGED
@@ -7,3 +7,5 @@
7
  {"name": "sad", "prompt": "a somber, gloomy scene", "def": "cool/dark tones, slow pace, quiet audio"}
8
  ]
9
  }
 
 
 
7
  {"name": "sad", "prompt": "a somber, gloomy scene", "def": "cool/dark tones, slow pace, quiet audio"}
8
  ]
9
  }
10
+
11
+
fusion-app/utils_media.py CHANGED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+ from typing import Tuple, Union
3
+ import io
4
+ import numpy as np
5
+ from PIL import Image
6
+ import ffmpeg
7
+ from pydub import AudioSegment
8
+
9
+ # helpers
10
+ def _to_path(p: Union[str, dict, Path]) -> str:
11
+ if isinstance(p, dict):
12
+ return p.get("name") or p.get("path") or p.get("data") or ""
13
+ return str(p)
14
+
15
def _audiosegment_float32(seg: AudioSegment) -> np.ndarray:
    """Convert a pydub segment to a mono 16 kHz float32 waveform in [-1, 1).

    The segment is forced to 16-bit PCM first so the int16 interpretation of
    the sample buffer is valid.
    """
    mono16k = seg.set_frame_rate(16000).set_channels(1).set_sample_width(2)
    pcm = np.array(mono16k.get_array_of_samples(), dtype=np.int16)
    # 32768 == 2**15: full-scale int16 maps to +/-1.0.
    return pcm.astype(np.float32) / 32768.0
19
+
20
+ # public API
21
def video_to_frame_audio(video_in) -> Tuple[Image.Image, np.ndarray]:
    """Extract the first frame and the audio track from a video file.

    Parameters
    ----------
    video_in : str | Path | dict
        Path-like input; Gradio-style file dicts are accepted via ``_to_path``.

    Returns
    -------
    tuple[PIL.Image.Image, np.ndarray]
        ``(frame, audio16k)`` — the first frame as an RGB PIL image and the
        soundtrack as a mono 16 kHz float32 waveform.

    Raises
    ------
    ValueError
        If no usable path was supplied.
    RuntimeError
        If ffmpeg fails to extract a frame.
    """
    video_path = _to_path(video_in)
    if not video_path:
        raise ValueError("Empty video path")

    try:
        # Grab exactly one frame as an in-memory JPEG (no temp files).
        out, _ = (
            ffmpeg
            .input(video_path)
            .output('pipe:', vframes=1, format='image2', vcodec='mjpeg')
            .run(capture_stdout=True, capture_stderr=True)
        )
        frame = Image.open(io.BytesIO(out)).convert("RGB")
    except ffmpeg.Error as e:
        # e.stderr can be None when stderr was not captured at the failing
        # call site, so guard before decoding; chain the cause for debugging.
        detail = e.stderr.decode(errors="replace")[:2000] if e.stderr else str(e)
        raise RuntimeError(f"ffmpeg frame extract failed: {detail}") from e

    # pydub also shells out to ffmpeg; it decodes the audio track here.
    seg = AudioSegment.from_file(video_path)
    audio16k = _audiosegment_float32(seg)
    return frame, audio16k
41
+
42
def load_audio_16k(audio_path_like) -> np.ndarray:
    """Load an audio file and return it as a mono 16 kHz float32 waveform.

    Accepts anything ``_to_path`` understands (str, Path, Gradio dict).
    """
    segment = AudioSegment.from_file(_to_path(audio_path_like))
    return _audiosegment_float32(segment)
packages.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ ffmpeg