# CineStoryAI — app.py (Hugging Face Space by adi-123, commit 751d179, verified)
"""
CineStory AI β€” Image β†’ Interactive Branching Story β†’ Cinematic Narrated Video
Architecture (all $0.00):
Vision: Groq free API (Llama 4 Scout) β€” rich scene understanding
Story: Together AI free tier (Llama 3.1 8B) β€” branching narratives
Images: Together AI Flux Schnell-Free β€” stylised chapter keyframes
TTS: Kokoro 82M on CPU β€” #1 ranked TTS, zero cost
Composer: ffmpeg Ken Burns β€” audio-synced storyboard video, CPU only
No video generation APIs. No GPU. Total cost per story: $0.00.
"""
import os
import json
import time
import tempfile
import logging
import socket
import gradio as gr
from vision import analyze_scene, scene_to_story_prompt
from story import (
generate_opening, continue_story, generate_linear_story, StoryState,
)
from tts import generate_speech, VOICE_MAP
from composer import (
create_cinematic_story_video, get_style_names, STYLE_PRESETS,
get_image_gen_errors,
)
# Root logging at INFO; every pipeline stage below logs through "cinestory".
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("cinestory")
def _load_local_env(env_file: str = ".env") -> None:
if not os.path.exists(env_file):
return
try:
with open(env_file, "r", encoding="utf-8") as f:
for raw_line in f:
line = raw_line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
key = key.strip()
value = value.strip().strip('"').strip("'")
if key and key not in os.environ:
os.environ[key] = value
except Exception as e:
logger.warning(f"Failed to load {env_file}: {e}")
# Populate os.environ from a local .env before any API client reads its keys.
_load_local_env()
# Per-process scratch directory for generated narration/video artifacts.
WORK_DIR = tempfile.mkdtemp(prefix="cinestory_")
def _find_free_port(default_port: int = 7860) -> int:
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.bind(("0.0.0.0", default_port))
return default_port
except OSError:
pass
try:
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock:
sock.bind(("0.0.0.0", 0))
return int(sock.getsockname()[1])
except OSError:
return default_port
# ── Pipeline Functions ────────────────────────────────────────────────────────
def process_image(image_path):
    """Run Groq vision analysis on the uploaded image.

    Returns a markdown summary for display plus the raw scene dict serialized
    as JSON (kept in gr.State for the later story/video stages). On failure,
    the first element carries the error text and the JSON payload is "{}".
    """
    if image_path is None:
        return "Please upload an image first.", "{}"
    try:
        scene = analyze_scene(image_path)
        sections = [
            f"**Scene:** {scene.get('scene_description', 'N/A')}",
            (
                f"**Mood:** {scene.get('mood', 'N/A')} | "
                f"**Atmosphere:** {scene.get('atmosphere', 'N/A')}"
            ),
            (
                f"**Setting:** {scene.get('setting', 'N/A')} "
                f"({scene.get('time_of_day', '')})"
            ),
            f"**Narrative Potential:** {scene.get('narrative_potential', 'N/A')}",
            f"**Sensory Details:** {scene.get('sensory_details', 'N/A')}",
        ]
        return "\n\n".join(sections), json.dumps(scene)
    except Exception as e:
        logger.error(f"Scene analysis failed: {e}")
        return f"Error analyzing image: {str(e)}", "{}"
def generate_story_opening(scene_json, genre, tone, theme, conflict, ending):
    """Create the story opening plus up to three branch choices.

    Returns (story_markdown, serialized_state_json, choice_group_visibility).
    If the scene JSON is malformed, the raw string is treated as a plain
    scene description rather than failing.
    """
    try:
        scene = json.loads(scene_json) if scene_json else {}
    except json.JSONDecodeError:
        scene = {"scene_description": scene_json}
    prompt = scene_to_story_prompt(scene, {
        "genre": genre, "tone": tone, "theme": theme,
        "conflict": conflict, "ending": ending,
    })
    try:
        state = generate_opening(prompt)
        if state.choices:
            options = "".join(
                f"\n**Option {i + 1}:** {c}" for i, c in enumerate(state.choices)
            )
            choices_text = "\n\n---\n**What happens next?**\n" + options
        else:
            choices_text = ""
        # Snapshot the mutable story state as JSON for gr.State storage.
        payload = json.dumps({
            "scene_context": state.scene_context,
            "chapters": state.chapters,
            "current_text": state.current_text,
            "choices": state.choices,
            "branch_depth": state.branch_depth,
            "max_branches": state.max_branches,
        })
        return (
            state.current_text + choices_text,
            payload,
            gr.update(visible=bool(state.choices)),
        )
    except Exception as e:
        logger.error(f"Story generation failed: {e}")
        return f"Error: {str(e)}", "{}", gr.update(visible=False)
def make_choice(choice_num, state_json):
    """Advance the story down the branch the user picked (0-based index).

    Rebuilds a StoryState from the serialized snapshot, continues one round,
    and returns (full_story_markdown, new_state_json, choice_visibility).
    On any failure the prior state_json is echoed back unchanged.
    """
    try:
        snapshot = json.loads(state_json)
        prior = StoryState(
            scene_context=snapshot["scene_context"],
            chapters=snapshot["chapters"],
            current_text=snapshot["current_text"],
            choices=snapshot["choices"],
            branch_depth=snapshot["branch_depth"],
            max_branches=snapshot.get("max_branches", 2),
        )
        advanced = continue_story(prior, choice_num)
        story_md = "\n\n---\n\n".join(advanced.chapters)
        if advanced.choices:
            tail = "\n\n---\n**What happens next?**\n" + "".join(
                f"\n**Option {i + 1}:** {c}" for i, c in enumerate(advanced.choices)
            )
        elif advanced.branch_depth >= advanced.max_branches:
            tail = "\n\n---\n*🎬 Story Complete! Generate your cinematic video below.*"
        else:
            tail = ""
        serialized = json.dumps({
            "scene_context": advanced.scene_context,
            "chapters": advanced.chapters,
            "current_text": advanced.current_text,
            "choices": advanced.choices,
            "branch_depth": advanced.branch_depth,
            "max_branches": advanced.max_branches,
        })
        return (
            story_md + tail,
            serialized,
            gr.update(visible=bool(advanced.choices)),
        )
    except Exception as e:
        logger.error(f"Story continuation failed: {e}")
        return f"Error: {str(e)}", state_json, gr.update(visible=False)
def generate_audio_only(state_json, voice_name, speed):
    """Render the full narration as a standalone WAV (quick voice preview).

    Returns (audio_path_or_None, status_markdown). Bails out early with no
    audio when the story state contains no chapter text yet.
    """
    try:
        snapshot = json.loads(state_json)
        narration = "\n\n".join(snapshot.get("chapters", []))
        if not narration.strip():
            return None, "No story text to narrate."
        out_file = os.path.join(WORK_DIR, "narration_preview.wav")
        started = time.time()
        generate_speech(
            narration,
            voice=VOICE_MAP.get(voice_name, "af_heart"),
            speed=speed,
            output_path=out_file,
        )
        took = time.time() - started
        return out_file, f"Audio generated in {took:.1f}s using Kokoro ({voice_name})"
    except Exception as e:
        logger.error(f"TTS failed: {e}")
        return None, f"Error: {str(e)}"
def generate_cinematic_video(
    image_path, state_json, scene_json, voice_name, speed, style_name,
    progress=gr.Progress(track_tqdm=False),
):
    """
    End-to-end: story chapters → stylised images → per-chapter audio →
    Ken Burns storyboard video synced to narration.
    Each image displays for exactly as long as its chapter is narrated.
    Total cost: $0.00.

    Returns (video_path, audio_path, status_markdown); the first two are
    None on failure, with the error described in the status string.
    """
    if image_path is None:
        return None, None, "Upload an image first."
    # Rehydrate story and scene state from the JSON strings held in gr.State.
    try:
        sd = json.loads(state_json)
        chapters = sd.get("chapters", [])
        if not chapters:
            return None, None, "Generate a story first."
        scene = json.loads(scene_json) if scene_json else {}
    except json.JSONDecodeError:
        return None, None, "Invalid story state."
    voice_id = VOICE_MAP.get(voice_name, "af_heart")
    output_path = os.path.join(WORK_DIR, "cinestory_final.mp4")
    try:
        progress(0.1, desc="Generating chapter images...")
        result = create_cinematic_story_video(
            chapters=chapters,
            scene_json=scene,
            original_image_path=image_path,
            style_name=style_name,
            voice=voice_id,
            speed=speed,
            output_path=output_path,
        )
        durations = ", ".join(
            f"Ch{d['chapter']}: {d['duration_s']}s" for d in result.chapter_durations
        )
        # Check if any image gen errors occurred (partial fallbacks)
        img_errors = get_image_gen_errors()
        error_note = ""
        if img_errors:
            error_note = (
                f"\n\n⚠️ **Image generation warnings** ({len(img_errors)}):\n"
                + "\n".join(f"- {e[:120]}" for e in img_errors[-5:])
            )
        status = (
            f"**Video created in {result.generation_time}s** | "
            f"Duration: {result.total_duration}s | "
            f"Chapters: {result.num_chapters}\n\n"
            f"Timing: {durations}\n\n"
            f"Cost: **${result.cost_usd:.2f}**"
            f"{error_note}"
        )
        # Also extract audio for the audio player.
        audio_path = os.path.join(WORK_DIR, "narration_combined.wav")
        # Collect per-chapter audio files composer left in its scratch dir.
        # NOTE(review): relies on composer naming its temp dirs
        # "cinestory_vid_*" and files "ch_audio_<n>.wav" — confirm if the
        # composer module changes its layout.
        import glob
        ch_audios = glob.glob(os.path.join(
            os.path.dirname(output_path), "..", "cinestory_vid_*", "ch_audio_*.wav"
        ))

        def _chapter_order(path):
            # FIX: plain sorted() compared filenames lexicographically, which
            # misorders "ch_audio_10.wav" before "ch_audio_2.wav" for stories
            # with 10+ chapters; sort on the numeric suffix instead. Files
            # without a numeric suffix keep a deterministic path-based order.
            stem = os.path.splitext(os.path.basename(path))[0]
            suffix = stem.rsplit("_", 1)[-1]
            return (0, int(suffix), "") if suffix.isdigit() else (1, 0, path)

        ch_audios.sort(key=_chapter_order)
        if not ch_audios:
            # Generate a single combined audio as fallback
            full_text = "\n\n".join(chapters)
            generate_speech(full_text, voice=voice_id, speed=speed, output_path=audio_path)
        else:
            from tts import concatenate_audio
            concatenate_audio(ch_audios, audio_path, pause_seconds=0.5)
        return result.video_path, audio_path, status
    except Exception as e:
        logger.error(f"Cinematic video failed: {e}", exc_info=True)
        img_errors = get_image_gen_errors()
        detail = ""
        if img_errors:
            detail = "\n\n**Image generation errors:**\n" + "\n".join(
                f"- {err[:150]}" for err in img_errors[-5:]
            )
        return None, None, f"Error: {str(e)}{detail}"
# ── Gradio UI ─────────────────────────────────────────────────────────────────
CUSTOM_CSS = """
@import url('https://fonts.googleapis.com/css2?family=Space+Grotesk:wght@400;500;600;700&family=Fraunces:opsz,wght,SOFT@9..144,500,50&display=swap');
/* ── Force light mode via CSS variables ─────────────────── */
:root, .dark {
--block-background-fill: white !important;
--panel-background-fill: white !important;
--body-background-fill: #f8faf6 !important;
--background-fill-primary: white !important;
--background-fill-secondary: #f8faf6 !important;
--border-color-primary: #d1d5db !important;
--block-border-color: #e5e7eb !important;
--input-background-fill: white !important;
--body-text-color: #1f2937 !important;
--block-label-text-color: #1e293b !important;
--block-title-text-color: #1e293b !important;
}
.gradio-container {
background:
radial-gradient(1200px 600px at 0% -10%, #d9efe9 0%, transparent 60%),
radial-gradient(1000px 500px at 100% 0%, #ffe7cc 0%, transparent 55%),
linear-gradient(140deg, #f5f7f2, #fefcf8) !important;
font-family: "Space Grotesk", ui-sans-serif, system-ui, sans-serif;
color: #1f2937;
}
/* ── All text dark ──────────────────────────────────────── */
.gradio-container .prose, .gradio-container .prose *,
.gradio-container .markdown-text, .gradio-container .markdown-text *,
.gradio-container label, .gradio-container label span,
.gradio-container p, .gradio-container h1,
.gradio-container h2, .gradio-container h3 {
color: #1f2937 !important;
}
/* ── Media player controls: leave untouched ─────────────── */
.gradio-container audio, .gradio-container audio *,
.gradio-container video, .gradio-container video *,
.gradio-container button svg, .gradio-container button path {
color: unset !important;
fill: unset !important;
}
/* ── Accordion header ───────────────────────────────────── */
.gradio-container .label-wrap {
background: linear-gradient(135deg, #eef7f4, #fdf5ec) !important;
color: #1e293b !important;
}
.gradio-container .label-wrap * { color: #1e293b !important; }
/* ── Dropdown labels: no colored background ─────────────── */
.gradio-container label > span { background: transparent !important; }
/* ── Inline code: light teal instead of dark block ──────── */
.gradio-container code,
.gradio-container .prose code,
.gradio-container .markdown-text code {
background: rgba(15, 118, 110, 0.08) !important;
color: #0f766e !important;
padding: 0.15em 0.4em;
border-radius: 4px;
}
/* ── Label badges (Genre, Tone etc): no colored bg ──────── */
.gradio-container .block label span,
.gradio-container span[data-testid],
.gradio-container .gr-input-label {
background: transparent !important;
background-color: transparent !important;
color: #1e293b !important;
}
/* ── Hero ────────────────────────────────────────────────── */
.hero {
border: 1px solid rgba(20,30,24,0.10);
background: linear-gradient(120deg, #ffffff, #f8fffc) !important;
border-radius: 18px; padding: 1.1rem 1.2rem;
box-shadow: 0 10px 28px rgba(16,24,40,0.06);
margin-bottom: 0.8rem;
}
.hero h1 {
margin: 0; color: #1e293b !important;
font-family: "Fraunces", Georgia, serif;
font-size: clamp(1.8rem, 3.2vw, 2.45rem);
}
.hero p { margin: 0.55rem 0 0; color: #5b6472 !important; font-size: 0.99rem; }
/* ── Step chips ──────────────────────────────────────────── */
.flow-guide {
display: grid;
grid-template-columns: repeat(4, minmax(130px, 1fr));
gap: 0.55rem; margin: 0.5rem 0 1rem;
}
.guide-chip {
border: 1px solid rgba(15,118,110,0.18);
background: linear-gradient(140deg, #ffffff, #f2fbf8) !important;
border-radius: 12px; padding: 0.6rem 0.72rem;
font-size: 0.88rem; color: #1e293b !important;
box-shadow: 0 4px 14px rgba(15,118,110,0.06);
}
.guide-chip * { color: #1e293b !important; }
.guide-chip b { color: #0f766e !important; font-weight: 700; }
/* ── Helpers ─────────────────────────────────────────────── */
.panel-title { color: #1e293b !important; font-weight: 700; font-size: 1rem; }
.helper-note { color: #5b6472 !important; font-size: 0.88rem; margin-bottom: 0.4rem; }
.cost-tag {
font-family: ui-monospace, monospace; font-size: 0.86em;
color: #047857 !important;
border: 1px solid rgba(4,120,87,0.2);
background: rgba(236,253,245,0.7) !important;
border-radius: 10px; padding: 0.6rem 0.7rem;
}
button.primary, button.primary * { color: white !important; }
@media (max-width: 960px) {
.flow-guide { grid-template-columns: repeat(2, minmax(120px, 1fr)); }
}
"""
CUSTOM_THEME = gr.themes.Soft(primary_hue="emerald", secondary_hue="orange")
def build_app():
    """Assemble the CineStory Gradio UI and wire all pipeline callbacks.

    Returns the un-launched ``gr.Blocks`` app. Scene analysis and story
    progress are held as JSON strings in ``gr.State`` so each click handler
    stays stateless.
    """
    # Gradio 6 moved theme/css from Blocks() to launch().
    # We try Blocks() first (works in Gradio 5), fall back to bare Blocks.
    try:
        app_context = gr.Blocks(
            title="CineStory AI",
            theme=CUSTOM_THEME,
            css=CUSTOM_CSS,
        )
    except TypeError:
        # Gradio 6: theme/css not accepted in constructor
        app_context = gr.Blocks(title="CineStory AI")
    with app_context as app:
        # Hero banner + four-step guide (classes styled by CUSTOM_CSS).
        gr.HTML(
            "<div class='hero'>"
            "<h1>CineStory AI</h1>"
            "<p>Turn one image into a short interactive story, then export a narrated cinematic video.</p>"
            "</div>"
            "<div class='flow-guide'>"
            "<div class='guide-chip'><b>Step 1</b><br>Upload and analyze your image</div>"
            "<div class='guide-chip'><b>Step 2</b><br>Generate story and choose branches</div>"
            "<div class='guide-chip'><b>Step 3</b><br>Preview narration voice</div>"
            "<div class='guide-chip'><b>Step 4</b><br>Create the final cinematic video</div>"
            "</div>"
        )
        # Hidden per-session state: serialized scene dict and story snapshot.
        scene_json = gr.State("{}")
        story_state = gr.State("{}")
        with gr.Row():
            # ── Left: inputs ──────────────────────────────────────────────
            with gr.Column(scale=2):
                gr.HTML(
                    "<div class='panel-title'>1) Image and story controls</div>"
                    "<div class='helper-note'>Start by uploading one image, then tune story direction and voice.</div>"
                )
                image_input = gr.Image(type="filepath", label="Choose an image")
                gr.Markdown("### Story Preferences")
                with gr.Row():
                    genre = gr.Dropdown(
                        ["Fantasy", "Science Fiction", "Mystery",
                         "Romance", "Horror", "Adventure"],
                        value="Fantasy", label="Genre",
                    )
                    tone = gr.Dropdown(
                        ["Serious", "Light-hearted", "Humorous",
                         "Dark", "Whimsical"],
                        value="Serious", label="Tone",
                    )
                with gr.Row():
                    theme = gr.Dropdown(
                        ["Self-discovery", "Redemption", "Love",
                         "Justice", "Survival", "Freedom"],
                        value="Self-discovery", label="Theme",
                    )
                    conflict = gr.Dropdown(
                        ["Internal struggle", "Person vs. Society",
                         "Person vs. Nature", "Person vs. Person"],
                        value="Internal struggle", label="Conflict",
                    )
                ending = gr.Dropdown(
                    ["Happy", "Bittersweet", "Open-ended", "Tragic", "Twist"],
                    value="Open-ended", label="Ending",
                )
                analyze_btn = gr.Button(
                    "Step 1: Analyze Image", variant="primary", size="lg",
                )
                gr.Markdown("### Visual Style and Voice")
                with gr.Row():
                    style_select = gr.Dropdown(
                        get_style_names(),
                        value="Watercolor Storybook",
                        label="Art Style",
                    )
                    voice_select = gr.Dropdown(
                        list(VOICE_MAP.keys()),
                        value="Narrator (Female, Warm)",
                        label="Narrator Voice",
                    )
                speed_slider = gr.Slider(
                    0.5, 1.5, value=1.0, step=0.1, label="Narration Speed",
                )
            # ── Right: outputs ────────────────────────────────────────────
            with gr.Column(scale=3):
                with gr.Accordion("Step 1 Output: Scene Analysis", open=True):
                    scene_display = gr.Markdown(
                        "*Click **Step 1: Analyze Image** after uploading your image.*"
                    )
                with gr.Accordion("Step 2: Story", open=True):
                    story_display = gr.Markdown(
                        "*After analysis, click **Step 2: Generate Story** and pick your branch options.*"
                    )
                    generate_story_btn = gr.Button(
                        "Step 2: Generate Story", variant="primary", size="lg",
                    )
                    # Branch buttons stay hidden until the story offers choices.
                    with gr.Group(visible=False) as choice_group:
                        gr.Markdown("**Choose what happens next (up to 2 rounds):**")
                        with gr.Row():
                            choice_1_btn = gr.Button("Option 1", variant="secondary")
                            choice_2_btn = gr.Button("Option 2", variant="secondary")
                            choice_3_btn = gr.Button("Option 3", variant="secondary")
                with gr.Accordion("Optional Step 3: Audio Preview", open=False):
                    audio_btn = gr.Button(
                        "Step 3: Preview Narration Audio", variant="secondary",
                    )
                    audio_output = gr.Audio(
                        label="Narration Preview", type="filepath",
                    )
                    audio_status = gr.Markdown("")
                with gr.Accordion("Step 4: Cinematic Video", open=True):
                    gr.Markdown(
                        "*Creates stylized chapter images, narration, and a stitched video. "
                        "Run this after you are happy with the story choices.*"
                    )
                    video_btn = gr.Button(
                        "Step 4: Create Cinematic Story Video",
                        variant="primary", size="lg",
                    )
                    video_output = gr.Video(label="Story Video")
                    video_audio = gr.Audio(
                        label="Full Narration", type="filepath", visible=True,
                    )
                    video_status = gr.Markdown("", elem_classes="cost-tag")
        # ── Wiring ────────────────────────────────────────────────────────
        analyze_btn.click(
            fn=process_image,
            inputs=[image_input],
            outputs=[scene_display, scene_json],
        )
        generate_story_btn.click(
            fn=generate_story_opening,
            inputs=[scene_json, genre, tone, theme, conflict, ending],
            outputs=[story_display, story_state, choice_group],
        )
        # Each branch button forwards a fixed 0-based choice index.
        choice_1_btn.click(
            fn=lambda s: make_choice(0, s),
            inputs=[story_state],
            outputs=[story_display, story_state, choice_group],
        )
        choice_2_btn.click(
            fn=lambda s: make_choice(1, s),
            inputs=[story_state],
            outputs=[story_display, story_state, choice_group],
        )
        choice_3_btn.click(
            fn=lambda s: make_choice(2, s),
            inputs=[story_state],
            outputs=[story_display, story_state, choice_group],
        )
        audio_btn.click(
            fn=generate_audio_only,
            inputs=[story_state, voice_select, speed_slider],
            outputs=[audio_output, audio_status],
        )
        video_btn.click(
            fn=generate_cinematic_video,
            inputs=[
                image_input, story_state, scene_json,
                voice_select, speed_slider, style_select,
            ],
            outputs=[video_output, video_audio, video_status],
        )
    return app
# ── Entry Point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    app = build_app()
    # Honor GRADIO_SERVER_PORT if set, but fall back to any free port so a
    # stale process on 7860 does not block startup.
    port = int(os.environ.get("GRADIO_SERVER_PORT", "7860"))
    server_port = _find_free_port(default_port=port)
    logger.info(f"Launching on port {server_port}")
    # Shared launch configuration (previously duplicated across both launch
    # branches, including the favicon existence check — computed once here).
    launch_kwargs = dict(
        pwa=True,
        favicon_path=(
            "./assets/favicon.png"
            if os.path.exists("./assets/favicon.png") else None
        ),
        share=False,
        server_port=server_port,
    )
    # Gradio 6 accepts theme/css in launch(); Gradio 5 already took them on
    # Blocks() and its launch() raises TypeError on the extra kwargs.
    try:
        app.launch(theme=CUSTOM_THEME, css=CUSTOM_CSS, **launch_kwargs)
    except TypeError:
        app.launch(**launch_kwargs)