# anime-gen-api / test_single_cut.py
# Provenance (Hugging Face viewer header, kept as comments):
#   uploader: AswinMathew — "Upload folder using huggingface_hub"
#   commit: 7190fd0 (verified)
"""Test a single cut end-to-end: Image -> TTS (Pocket TTS) -> Video -> Mux.
Generates a frontal pose manhwa character with cloned Genshin voice.
"""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
OUTPUT_DIR = Path("test_single_cut_output")
async def main() -> None:
    """Run one cut of the pipeline: image -> TTS -> video -> audio/video mux.

    Side effects: creates OUTPUT_DIR and writes frame.png, voice.wav,
    video_silent.mp4 and final_cut.mp4 into it. Requires network access to the
    pollinations image/video backends and a local Pocket TTS voice state file.
    Returns nothing; progress and timings are printed to stdout.
    """
    import math  # hoisted from mid-function: used to round TTS duration up

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Imported lazily so the sys.path tweak at module top has already run.
    from app.services.pollinations import generate_image, generate_video, upload_media
    from app.services.ffmpeg import mux_audio

    # ── 1. Generate Image (frontal pose, manhwa style) ──
    print("=" * 60)
    print("[1/4] Generating image...")
    print("=" * 60)
    image_prompt = (
        "manhwa webtoon style, clean cel-shading, sharp lineart, vivid colors. "
        "medium close-up shot, a young man with sharp jaw, messy black hair, "
        "piercing golden eyes, wearing a dark high-collared coat with silver trim, "
        "standing in a grand marble hall, dramatic side lighting casting sharp shadows, "
        "confident smirk, arms crossed, facing the viewer directly, "
        "cinematic composition, 4k detail"
    )
    image_path = str(OUTPUT_DIR / "frame.png")
    t0 = time.time()
    await generate_image(
        prompt=image_prompt,
        output_path=image_path,
        model="klein-large",
        width=1024,
        height=768,
        seed=42,  # fixed seed keeps the generated frame reproducible
    )
    print(f" Image saved: {image_path} ({time.time() - t0:.1f}s)")

    # ── 2. Generate TTS (Pocket TTS with Childe's voice) ──
    print(f"\n{'=' * 60}")
    print("[2/4] Generating voice (Pocket TTS - Childe voice)...")
    print("=" * 60)
    from app.services.pocket_tts_service import PocketTTSService

    dialogue_text = "You think you can challenge me? How amusing. Let me show you what true power looks like."
    voice_state_path = "data/genshin_voices/childe/voice_state.safetensors"
    audio_path = str(OUTPUT_DIR / "voice.wav")
    t0 = time.time()
    tts_result = await PocketTTSService.generate(
        text=dialogue_text,
        voice_ref=voice_state_path,
        output_path=audio_path,
    )
    print(f" Audio saved: {audio_path}")
    print(f" Duration: {tts_result['duration_sec']:.1f}s ({time.time() - t0:.1f}s gen)")

    # ── 3. Generate Video (img2vid from the image) ──
    print(f"\n{'=' * 60}")
    print("[3/4] Generating video (grok-video img2vid)...")
    print("=" * 60)
    # The img2vid backend takes a URL, so the local frame is uploaded first.
    print(" Uploading image to media.pollinations.ai...")
    t0 = time.time()
    image_url = await upload_media(image_path)
    print(f" Image URL: {image_url} ({time.time() - t0:.1f}s)")
    video_prompt = "character smirks confidently, slight head tilt, coat fabric shifts, dramatic lighting flickers"
    silent_video_path = str(OUTPUT_DIR / "video_silent.mp4")
    # Match video length to the spoken line: TTS duration rounded up to whole
    # seconds, then clamped to the backend's supported 3..5 s range.
    video_duration = min(max(math.ceil(tts_result["duration_sec"]), 3), 5)
    t0 = time.time()
    print(f" Generating {video_duration}s video...")
    await generate_video(
        prompt=video_prompt,
        output_path=silent_video_path,
        duration=video_duration,
        image_url=image_url,
    )
    print(f" Silent video saved: {silent_video_path} ({time.time() - t0:.1f}s)")

    # ── 4. Mux audio + video ──
    print(f"\n{'=' * 60}")
    print("[4/4] Muxing audio + video...")
    print("=" * 60)
    final_path = str(OUTPUT_DIR / "final_cut.mp4")
    t0 = time.time()
    await mux_audio(
        video_path=silent_video_path,
        audio_path=audio_path,
        output_path=final_path,
        # Trim/pad to the exact (un-rounded) speech duration, not the clamped
        # video duration used above.
        duration_sec=tts_result["duration_sec"],
    )
    print(f" Final cut: {final_path} ({time.time() - t0:.1f}s)")

    # Summary
    print(f"\n{'=' * 60}")
    print("DONE!")
    print(f"{'=' * 60}")
    print(f" Image: {image_path}")
    print(f" Audio: {audio_path} ({tts_result['duration_sec']:.1f}s)")
    print(f" Video: {silent_video_path}")
    print(f" Final: {final_path}")
    print(f"\nOpen {final_path} to watch!")


if __name__ == "__main__":
    asyncio.run(main())