# anime-gen-api / test_single_cut.py
# Provenance (Hugging Face viewer header, kept as comments):
#   uploader: AswinMathew — "Upload folder using huggingface_hub"
#   commit: 7190fd0 (verified)
"""Test a single cut end-to-end: Image -> TTS (Pocket TTS) -> Video -> Mux.
Generates a frontal pose manhwa character with cloned Genshin voice.
"""
import asyncio
import sys
import time
from pathlib import Path
sys.path.insert(0, str(Path(__file__).resolve().parent))
OUTPUT_DIR = Path("test_single_cut_output")
async def main() -> None:
    """Run one cut of the pipeline: image -> TTS -> video -> audio/video mux.

    Side effects: creates OUTPUT_DIR and writes frame.png, voice.wav,
    video_silent.mp4 and final_cut.mp4 into it. Requires network access to the
    pollinations image/video backends and a local Pocket TTS voice state file.
    Returns nothing; progress and timings are printed to stdout.
    """
    import math  # hoisted from mid-function: used to round TTS duration up

    OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

    # Imported lazily so the sys.path tweak at module top has already run.
    from app.services.pollinations import generate_image, generate_video, upload_media
    from app.services.ffmpeg import mux_audio

    # ── 1. Generate Image (frontal pose, manhwa style) ──
    print("=" * 60)
    print("[1/4] Generating image...")
    print("=" * 60)
    image_prompt = (
        "manhwa webtoon style, clean cel-shading, sharp lineart, vivid colors. "
        "medium close-up shot, a young man with sharp jaw, messy black hair, "
        "piercing golden eyes, wearing a dark high-collared coat with silver trim, "
        "standing in a grand marble hall, dramatic side lighting casting sharp shadows, "
        "confident smirk, arms crossed, facing the viewer directly, "
        "cinematic composition, 4k detail"
    )
    image_path = str(OUTPUT_DIR / "frame.png")
    t0 = time.time()
    await generate_image(
        prompt=image_prompt,
        output_path=image_path,
        model="klein-large",
        width=1024,
        height=768,
        seed=42,  # fixed seed keeps the generated frame reproducible
    )
    print(f" Image saved: {image_path} ({time.time() - t0:.1f}s)")

    # ── 2. Generate TTS (Pocket TTS with Childe's voice) ──
    print(f"\n{'=' * 60}")
    print("[2/4] Generating voice (Pocket TTS - Childe voice)...")
    print("=" * 60)
    from app.services.pocket_tts_service import PocketTTSService

    dialogue_text = "You think you can challenge me? How amusing. Let me show you what true power looks like."
    voice_state_path = "data/genshin_voices/childe/voice_state.safetensors"
    audio_path = str(OUTPUT_DIR / "voice.wav")
    t0 = time.time()
    tts_result = await PocketTTSService.generate(
        text=dialogue_text,
        voice_ref=voice_state_path,
        output_path=audio_path,
    )
    print(f" Audio saved: {audio_path}")
    print(f" Duration: {tts_result['duration_sec']:.1f}s ({time.time() - t0:.1f}s gen)")

    # ── 3. Generate Video (img2vid from the image) ──
    print(f"\n{'=' * 60}")
    print("[3/4] Generating video (grok-video img2vid)...")
    print("=" * 60)
    # The img2vid backend takes a URL, so the local frame is uploaded first.
    print(" Uploading image to media.pollinations.ai...")
    t0 = time.time()
    image_url = await upload_media(image_path)
    print(f" Image URL: {image_url} ({time.time() - t0:.1f}s)")
    video_prompt = "character smirks confidently, slight head tilt, coat fabric shifts, dramatic lighting flickers"
    silent_video_path = str(OUTPUT_DIR / "video_silent.mp4")
    # Match video length to the spoken line: TTS duration rounded up to whole
    # seconds, then clamped to the backend's supported 3..5 s range.
    video_duration = min(max(math.ceil(tts_result["duration_sec"]), 3), 5)
    t0 = time.time()
    print(f" Generating {video_duration}s video...")
    await generate_video(
        prompt=video_prompt,
        output_path=silent_video_path,
        duration=video_duration,
        image_url=image_url,
    )
    print(f" Silent video saved: {silent_video_path} ({time.time() - t0:.1f}s)")

    # ── 4. Mux audio + video ──
    print(f"\n{'=' * 60}")
    print("[4/4] Muxing audio + video...")
    print("=" * 60)
    final_path = str(OUTPUT_DIR / "final_cut.mp4")
    t0 = time.time()
    await mux_audio(
        video_path=silent_video_path,
        audio_path=audio_path,
        output_path=final_path,
        # Trim/pad to the exact (un-rounded) speech duration, not the clamped
        # video duration used above.
        duration_sec=tts_result["duration_sec"],
    )
    print(f" Final cut: {final_path} ({time.time() - t0:.1f}s)")

    # Summary
    print(f"\n{'=' * 60}")
    print("DONE!")
    print(f"{'=' * 60}")
    print(f" Image: {image_path}")
    print(f" Audio: {audio_path} ({tts_result['duration_sec']:.1f}s)")
    print(f" Video: {silent_video_path}")
    print(f" Final: {final_path}")
    print(f"\nOpen {final_path} to watch!")


if __name__ == "__main__":
    asyncio.run(main())