Audio2Image / handler.py
HariLogicgo's picture
added model and weights
40cfce6
# handler.py
import base64
import os
import shutil
import subprocess
import sys
import tempfile
import uuid
from typing import Dict
class Handler:
    """Generate a video from an image and an audio clip via ``generate.py``.

    The handler shells out to the ``generate.py`` script located in
    ``<model_dir>/Wan2.2`` and points it at the checkpoint directory
    ``<model_dir>/Wan2.2-S2V-14B``.
    """

    def __init__(self, model_dir: str):
        """Record the code and checkpoint locations under *model_dir*."""
        self.model_dir = model_dir
        self.code_dir = os.path.join(model_dir, "Wan2.2")
        self.ckpt_dir = os.path.join(model_dir, "Wan2.2-S2V-14B")

    def __call__(self, inputs: Dict) -> Dict[str, str]:
        """Run one generation request.

        Args:
            inputs: Mapping with the keys ``"image_b64"`` and ``"audio_b64"``
                (base64-encoded file contents, both required) and an optional
                ``"prompt"`` (defaults to ``"a person is talking"``).

        Returns:
            ``{"video_b64": <base64-encoded bytes of the generated file>}``.

        Raises:
            ValueError: If the image or audio payload is missing or is not
                valid base64.
            subprocess.CalledProcessError: If ``generate.py`` exits non-zero.
            FileNotFoundError: If no output file can be located afterwards.
        """
        prompt = inputs.get("prompt", "a person is talking")
        image_b64 = inputs.get("image_b64")
        audio_b64 = inputs.get("audio_b64")
        # Fail fast: otherwise the generator would be handed paths to files
        # that were never written.
        if not image_b64 or not audio_b64:
            raise ValueError("both 'image_b64' and 'audio_b64' are required")

        tmpd = tempfile.mkdtemp()
        try:
            image_path = os.path.join(tmpd, "input.jpg")
            audio_path = os.path.join(tmpd, "input.wav")
            self._write_b64(image_path, image_b64)
            self._write_b64(audio_path, audio_b64)

            out_path = os.path.join(tmpd, f"out_{uuid.uuid4().hex}.mp4")
            cmd = [
                # Use the interpreter running this handler rather than
                # whatever "python" happens to resolve to on PATH.
                sys.executable, "generate.py",
                "--task", "s2v-14B",
                "--size", "1024*704",
                "--ckpt_dir", self.ckpt_dir,
                "--offload_model", "True",
                "--convert_model_dtype",
                "--prompt", prompt,
                "--image", image_path,
                "--audio", audio_path,
                "--num_clip", "1",
                # Ask the script to write exactly where we will read from
                # (``--save_file`` per the Wan2.2 generate.py CLI).
                "--save_file", out_path,
            ]
            subprocess.check_call(cmd, cwd=self.code_dir)

            video_path = self._locate_video(out_path)
            with open(video_path, "rb") as f:
                return {"video_b64": base64.b64encode(f.read()).decode("utf-8")}
        finally:
            shutil.rmtree(tmpd, ignore_errors=True)

    @staticmethod
    def _write_b64(path: str, payload: str) -> None:
        """Decode the base64 *payload* and write the bytes to *path*."""
        with open(path, "wb") as f:
            f.write(base64.b64decode(payload))

    def _locate_video(self, out_path: str) -> str:
        """Return the path of the generated video.

        Prefers the explicitly requested *out_path*. As a best-effort
        fallback, returns the most recently modified file in
        ``<code_dir>/outputs`` (the subprocess runs with ``cwd=code_dir``, so
        a relative ``outputs`` directory would be created there).
        """
        if os.path.isfile(out_path):
            return out_path

        outputs_dir = os.path.join(self.code_dir, "outputs")
        if os.path.isdir(outputs_dir):
            candidates = [
                os.path.join(outputs_dir, name)
                for name in os.listdir(outputs_dir)
            ]
            files = [p for p in candidates if os.path.isfile(p)]
            if files:
                return max(files, key=os.path.getmtime)

        raise FileNotFoundError(
            f"generate.py finished but produced no video at {out_path!r}"
        )