garyuzair committed on
Commit
fe33295
·
verified ·
1 Parent(s): 946032c

Upload app_hf_space_optimized.py

Browse files
Files changed (1) hide show
  1. src/app_hf_space_optimized.py +582 -0
src/app_hf_space_optimized.py ADDED
@@ -0,0 +1,582 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import torch
3
+ import gc
4
+ import os
5
+ import json
6
+ import time
7
+ import soundfile as sf
8
+ from PIL import Image
9
+ import numpy as np
10
+ import ffmpeg # Use ffmpeg-python
11
+ from transformers import AutoModelForCausalLM, AutoTokenizer
12
+ from diffusers import StableDiffusionXLPipeline, CogVideoXPipeline
13
+ from diffusers.utils import export_to_video
14
+ from parler_tts import ParlerTTSForConditionalGeneration
15
+ import tempfile
16
+ import shutil
17
+ import traceback
18
+ import psutil # For memory stats
19
+
20
# Page setup is issued before any other Streamlit call in this script.
st.set_page_config(layout="wide", page_title="POV Video Gen (HF Space)")

# --- Configuration ---
# Hugging Face model identifiers, one per pipeline stage.
LLM_MODEL_ID = "Qwen/Qwen3-0.6B"
IMAGE_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
VIDEO_MODEL_ID = "THUDM/CogVideoX-2b"
TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"

# Still-image resolution in pixels (portrait).
IMAGE_WIDTH, IMAGE_HEIGHT = 768, 1344

# Per-scene clip length and frame rate; kept small for faster processing.
SCENE_DURATION_SECONDS = 4
VIDEO_FPS = 10

# Scene-count default and upper bound offered in the sidebar.
NUM_SCENES_DEFAULT = 3
MAX_SCENES = 4

# Directory name (resolved relative to the working directory) for generated files.
TEMP_SUBDIR = "pov_video_temp_hf"

# --- Device Setup & Memory Monitor ---
# Sidebar slot that display_memory_usage() overwrites on every call.
mem_info_placeholder = st.sidebar.empty()
38
+
39
def display_memory_usage():
    """Show this process's RSS and the CUDA memory counters in the sidebar slot.

    Any exception raised while collecting the figures is caught and reported
    as a warning in the same sidebar slot instead of propagating.
    """
    bytes_per_mb = 1024 * 1024
    try:
        rss_mb = psutil.Process(os.getpid()).memory_info().rss / bytes_per_mb
        if torch.cuda.is_available():
            # All three figures are read from CUDA device index 0.
            alloc_mb = torch.cuda.memory_allocated(0) / bytes_per_mb
            reserved_mb = torch.cuda.memory_reserved(0) / bytes_per_mb
            total_mb = torch.cuda.get_device_properties(0).total_memory / bytes_per_mb
            gpu_text = f"Alloc: {alloc_mb:.0f}MB | Reserv: {reserved_mb:.0f}MB | Total: {total_mb:.0f}MB"
        else:
            gpu_text = "N/A"
        mem_info_placeholder.info(f"🧠 CPU Mem: {rss_mb:.0f} MB\n⚑ GPU Mem: {gpu_text}")
    except Exception as e:
        mem_info_placeholder.warning(f"Could not get memory info: {e}")
53
+
54
# Pick the torch device once at script start; the model steps read `device`.
device = "cuda" if torch.cuda.is_available() else "cpu"

if device == "cpu":
    st.sidebar.error("⚠️ No GPU! App will be extremely slow & likely fail.")
else:
    try:
        # total_memory is in bytes; convert to GiB for display.
        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        st.sidebar.success(f"βœ… GPU Detected! VRAM: {vram_gb:.2f} GB")
        if vram_gb < 15:
            st.sidebar.warning("⚠️ Low VRAM (< 15GB). May struggle.")
    except Exception:
        # Keep going on the assumption that the GPU is usable.
        st.sidebar.warning("Could not read GPU VRAM.")
66
+
67
+ # --- Helper Functions ---
68
def cleanup_gpu_memory(*args):
    """Run a garbage-collection pass and empty PyTorch's CUDA cache.

    The positional arguments are only counted for the log line and then the
    local tuple is dropped. Objects that the caller still references are NOT
    released by this function; callers must clear their own references for
    the memory to become reclaimable.
    """
    pending = len(args)
    print(f"Attempting GPU mem cleanup. Vars to del: {pending}")
    display_memory_usage()  # snapshot before cleanup

    del args  # drops only this frame's tuple of references
    gc.collect()
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        torch.cuda.empty_cache()

    display_memory_usage()  # snapshot after cleanup
    print("GPU mem cleanup done.")
78
+
79
def get_temp_dir():
    """Ensure the app's temp directory exists and return its absolute path.

    The path is TEMP_SUBDIR resolved against the current working directory.
    It is also recorded in ``st.session_state.temp_dir_path`` whenever the
    stored value is missing or different.
    """
    resolved = os.path.abspath(TEMP_SUBDIR)
    os.makedirs(resolved, exist_ok=True)

    recorded = st.session_state.get('temp_dir_path', None)
    if recorded != resolved:
        print(f"Setting temp dir: {resolved}")
        st.session_state.temp_dir_path = resolved
    return resolved
89
+
90
def cleanup_temp_dir():
    """Delete the temp directory recorded in session state, if it is safe to do so.

    The directory is removed only when a path is recorded, it exists on disk,
    and it contains TEMP_SUBDIR (a guard against deleting an unrelated path).
    The outcome is reported in the sidebar; on success the recorded path is
    cleared.
    """
    target = st.session_state.get('temp_dir_path', None)
    removable = bool(target) and os.path.exists(target) and TEMP_SUBDIR in target
    if not removable:
        st.sidebar.info("Temp dir not found or already cleaned.")
        return

    try:
        shutil.rmtree(target)
        st.sidebar.success(f"Cleaned up: {target}")
        st.session_state.temp_dir_path = None
    except Exception as e:
        st.sidebar.error(f"Error cleaning temp dir {target}: {e}")
102
+
103
+
104
+ # --- Model Interaction Functions (Load -> Use -> Unload) ---
105
+
106
def run_llm_step(user_prompt, num_scenes):
    """Load the LLM, generate the scene script as JSON, then release the model.

    Args:
        user_prompt: Scenario text entered by the user.
        num_scenes: Number of scenes the model is asked to produce.

    Returns:
        The ``story_details`` dict parsed from the model's reply, or ``None``
        when any part of the step fails (the error is shown in the UI).
    """

    def _extract_json_object(text):
        """Return the outermost ``{...}`` span of *text*.

        Tolerates markdown code fences and stray text before or after the
        JSON object.
        """
        start, end = text.find("{"), text.rfind("}")
        if start == -1 or end <= start:
            raise ValueError("No JSON object found in LLM output.")
        return text[start:end + 1]

    st.info(f"πŸ”„ Loading LLM: {LLM_MODEL_ID}...")
    display_memory_usage()
    llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None
    output_ids = None
    story_data = None
    try:
        # Prefer bf16 on GPUs that support it, fp16 on other GPUs, fp32 on CPU.
        if device == "cuda":
            dtype = torch.bfloat16 if torch.cuda.is_bf16_supported() else torch.float16
        else:
            dtype = torch.float32
        llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
        llm_model = AutoModelForCausalLM.from_pretrained(
            LLM_MODEL_ID, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto"
        )
        display_memory_usage()
        st.info("🧠 Generating story structure...")

        # The JSON example below is itself valid JSON (no comments, no trailing
        # commas) so the model is not shown a malformed pattern to copy.
        system_prompt = f"""
You are an expert director creating POV TikTok video scripts.
Break down the user's scenario into exactly {num_scenes} scenes ({SCENE_DURATION_SECONDS}s each).
For EACH scene, generate:
1. `scene_description`: Max 1-2 concise sentences describing action/setting for TTS. Max 350 characters.
2. `image_prompt`: Detailed SDXL POV prompt (Start with "First-person perspective - pov shot of..."). Include setting, mood, style, time period, elements. Add "pov hands from the bottom corner..." if needed.
3. `video_direction_prompt`: Simple camera action/motion for CogVideoX (e.g., "Camera pans right", "Subtle zoom in", "Static shot", "Hand reaches out").
4. `audio_description`: Voice & ambience description for Parler-TTS (e.g., "Nervous male voice, faint market chatter.", "Calm female narrator, quiet library ambience.").

Respond ONLY with a valid JSON object:
{{
"story_details": {{
"title": "POV Title (Year)",
"full_story": "Brief summary...",
"scenes": [
{{
"scene_description": "...",
"image_prompt": "...",
"video_direction_prompt": "...",
"audio_description": "..."
}}
]
}}
}}
The "scenes" array must contain exactly {num_scenes} objects shaped like the one shown.
Strictly adhere to JSON format. No comments. No extra text.
""".strip()

        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": f"Create script: {user_prompt}"},
        ]
        text_input = llm_tokenizer.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True, enable_thinking=False
        )
        target_device = llm_model.device if hasattr(llm_model, 'device') else device
        model_inputs = llm_tokenizer([text_input], return_tensors="pt").to(target_device)

        # Sampling parameters for Qwen3 in non-thinking mode.
        generated_ids = llm_model.generate(
            **model_inputs, max_new_tokens=4096,
            temperature=0.7, top_p=0.8, top_k=20, do_sample=True,
            pad_token_id=llm_tokenizer.eos_token_id
        )
        # Keep only the newly generated tokens (drop the echoed prompt).
        output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
        response_text = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()

        st.write("LLM Raw Output:")
        st.code(response_text, language='text')
        parsed_data = json.loads(_extract_json_object(response_text))

        details = parsed_data.get("story_details") if isinstance(parsed_data, dict) else None
        if not (isinstance(details, dict) and isinstance(details.get("scenes"), list)):
            raise ValueError("Invalid JSON structure.")
        actual_num_scenes = len(details["scenes"])
        if actual_num_scenes != num_scenes:
            st.warning(f"LLM gave {actual_num_scenes} scenes, requested {num_scenes}.")

        story_data = details
        st.success("βœ… Story generation complete.")
    except Exception as e:
        st.error(f"❌ LLM Step Failed: {e}")
        st.error(traceback.format_exc())
        story_data = None
    finally:
        st.info("πŸ”„ Unloading LLM...")
        # Clear this frame's references BEFORE collecting: passing the objects
        # to cleanup_gpu_memory() would keep them alive through gc/empty_cache.
        llm_model = llm_tokenizer = model_inputs = generated_ids = output_ids = None
        cleanup_gpu_memory()
        st.info("βœ… LLM Unloaded.")
    return story_data
178
+
179
def run_image_step(scenes, temp_dir):
    """Load SDXL, render one still per scene into *temp_dir*, then release the pipeline.

    Args:
        scenes: List of scene dicts; ``image_prompt`` is read from each.
        temp_dir: Directory that receives ``scene_<n>_img.png`` files.

    Returns:
        A list of ``{"scene": index, "path": str | None, "status": ...}``
        dicts with status ``"succeeded"`` or ``"failed"``. If loading the
        pipeline (or anything outside the per-scene handler) fails, every
        scene is reported as failed.
    """
    st.info(f"πŸ”„ Loading Image Generator: {IMAGE_MODEL_ID}...")
    display_memory_usage()
    image_pipe = None
    image_results = []
    try:
        on_gpu = device == "cuda"
        image_pipe = StableDiffusionXLPipeline.from_pretrained(
            IMAGE_MODEL_ID,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            use_safetensors=True,
            variant="fp16" if on_gpu else None,
            low_cpu_mem_usage=True,
        )
        # On GPU, model-level CPU offload trades speed for lower VRAM usage.
        if on_gpu:
            image_pipe.enable_model_cpu_offload()
        else:
            image_pipe.to(device)
        display_memory_usage()
        st.info("🎨 Generating images sequentially...")

        for i, scene in enumerate(scenes):
            img_path = os.path.join(temp_dir, f"scene_{i+1}_img.png")
            st.write(f"Generating Image {i+1}/{len(scenes)}...")
            image = None
            try:
                with torch.no_grad():
                    image = image_pipe(
                        prompt=scene.get("image_prompt", "blank image"),
                        width=IMAGE_WIDTH, height=IMAGE_HEIGHT,
                        num_inference_steps=25,  # fewer steps for speed
                    ).images[0]
                image.save(img_path)
                image_results.append({"scene": i, "path": img_path, "status": "succeeded"})
                st.image(image, caption=f"Scene {i+1} OK", width=150)
            except Exception as e:
                st.error(f"❌ Image {i+1} Failed: {e}")
                st.error(traceback.format_exc())
                image_results.append({"scene": i, "path": None, "status": "failed"})
            finally:
                # Release our reference first so the collection pass can reclaim it.
                image = None
                cleanup_gpu_memory()

        st.success("βœ… Image generation step complete.")
    except Exception as e:
        st.error(f"❌ Image Gen Step Failed: {e}")
        st.error(traceback.format_exc())
        image_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
    finally:
        st.info("πŸ”„ Unloading Image Generator...")
        # Clear the local reference BEFORE collecting; handing the pipeline to
        # cleanup_gpu_memory() would keep it alive through gc/empty_cache.
        image_pipe = None
        cleanup_gpu_memory()
        st.info("βœ… Image Generator Unloaded.")
    return image_results
220
+
221
def run_video_step(image_results, scenes, temp_dir):
    """Load CogVideoX, render one clip per successfully generated image, then release it.

    Args:
        image_results: Output of the image step; only entries whose status is
            ``"succeeded"`` are processed.
        scenes: List of scene dicts; ``video_direction_prompt`` is read from
            the entry at each image's scene index.
        temp_dir: Directory that receives ``scene_<n>_vid.mp4`` files.

    Returns:
        ``[]`` when no image succeeded; otherwise a list of
        ``{"scene": index, "path": str | None, "status": ...}`` dicts.
    """
    successful_images = [item for item in image_results if item["status"] == "succeeded"]
    if not successful_images:
        return []
    st.info(f"πŸ”„ Loading Video Generator: {VIDEO_MODEL_ID}...")
    display_memory_usage()
    video_pipe = None
    video_results = []
    try:
        dtype = torch.float16 if device == "cuda" else torch.float32
        video_pipe = CogVideoXPipeline.from_pretrained(VIDEO_MODEL_ID, torch_dtype=dtype)
        if device == "cuda":
            # Use ONE offload strategy. Model-level and sequential offload are
            # alternatives, not complements; sequential is the lowest-VRAM one.
            video_pipe.enable_sequential_cpu_offload()
        else:
            video_pipe.to(device)
        video_pipe.vae.enable_slicing()
        video_pipe.vae.enable_tiling()
        display_memory_usage()
        st.info("🎬 Generating videos sequentially...")

        for item in successful_images:
            scene_index = item["scene"]
            vid_path = os.path.join(temp_dir, f"scene_{scene_index + 1}_vid.mp4")
            st.write(f"Generating Video for Scene {scene_index + 1}...")
            img, video_frames = None, None
            try:
                # Load pixels eagerly so the file handle is closed right away.
                with Image.open(item["path"]) as src:
                    img = src.convert("RGB")
                video_direction = scenes[scene_index].get("video_direction_prompt", "subtle motion")
                # Time-derived seed, offset per scene so clips differ.
                seed = int(time.time() * 1000 + scene_index) % 100000
                generator = torch.Generator(device=device).manual_seed(seed)

                # NOTE(review): CogVideoXPipeline is documented as a text-to-video
                # pipeline; confirm it accepts `image=`. If it does not, this call
                # raises and every scene is marked failed. Image conditioning
                # presumably needs an image-to-video pipeline and checkpoint.
                with torch.no_grad():
                    video_frames = video_pipe(
                        prompt=video_direction, image=img, num_inference_steps=40,
                        num_frames=int(SCENE_DURATION_SECONDS * VIDEO_FPS) + 1,
                        guidance_scale=6.0, generator=generator
                    ).frames[0]
                export_to_video(video_frames, vid_path, fps=VIDEO_FPS)
                video_results.append({"scene": scene_index, "path": vid_path, "status": "succeeded"})
                st.success(f"Video Scene {scene_index + 1} OK.")
            except Exception as e:
                st.error(f"❌ Video {scene_index + 1} Failed: {e}")
                st.error(traceback.format_exc())
                video_results.append({"scene": scene_index, "path": None, "status": "failed"})
            finally:
                # Release our references first so the collection pass can reclaim them.
                img = video_frames = None
                cleanup_gpu_memory()

        st.success("βœ… Video generation step complete.")
    except Exception as e:
        st.error(f"❌ Video Gen Step Failed: {e}")
        st.error(traceback.format_exc())
        video_results = [{"scene": item["scene"], "path": None, "status": "failed"} for item in successful_images]
    finally:
        st.info("πŸ”„ Unloading Video Generator...")
        # Clear the local reference BEFORE collecting; handing the pipeline to
        # cleanup_gpu_memory() would keep it alive through gc/empty_cache.
        video_pipe = None
        cleanup_gpu_memory()
        st.info("βœ… Video Generator Unloaded.")
    return video_results
275
+
276
def run_audio_step(scenes, temp_dir):
    """Load Parler-TTS, synthesize narration per scene into *temp_dir*, then release it.

    Args:
        scenes: List of scene dicts; ``scene_description`` (spoken text,
            truncated to 350 characters) and ``audio_description`` (voice
            description) are read from each.
        temp_dir: Directory that receives ``scene_<n>_audio.wav`` files.

    Returns:
        A list of ``{"scene": index, "path": str | None, "status": ...}``
        dicts with status ``"succeeded"``, ``"failed"`` or ``"skipped"``
        (no text to speak).
    """
    st.info(f"πŸ”„ Loading TTS Model: {TTS_MODEL_ID}...")
    display_memory_usage()
    tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
    audio_results = []
    try:
        tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID).to(device)
        # One tokenizer for the spoken text, one for the voice description.
        tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
        tts_desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)
        display_memory_usage()
        st.info("πŸ”Š Generating audio sequentially...")

        for i, scene in enumerate(scenes):
            audio_path = os.path.join(temp_dir, f"scene_{i+1}_audio.wav")
            st.write(f"Generating Audio {i+1}/{len(scenes)}...")
            desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None
            try:
                # `or ""` also covers a JSON null description, which would
                # otherwise raise on slicing instead of being skipped.
                text_to_speak = str(scene.get("scene_description") or "")[:350]
                voice_description = scene.get("audio_description", "A neutral speaker.")
                if not text_to_speak:
                    audio_results.append({"scene": i, "path": None, "status": "skipped"})
                    continue

                desc_input_ids = tts_desc_tokenizer(voice_description, return_tensors="pt").input_ids.to(device)
                prompt_input_ids = tts_tokenizer(text_to_speak, return_tensors="pt").input_ids.to(device)

                with torch.no_grad():
                    generation = tts_model.generate(
                        input_ids=desc_input_ids, prompt_input_ids=prompt_input_ids,
                        do_sample=True, temperature=0.7
                    ).to(torch.float32)

                audio_arr = generation.cpu().numpy().squeeze()
                sampling_rate = tts_model.config.sampling_rate
                sf.write(audio_path, audio_arr, sampling_rate)
                audio_results.append({"scene": i, "path": audio_path, "status": "succeeded"})
                st.audio(audio_path, format='audio/wav')
            except Exception as e:
                st.error(f"❌ Audio {i+1} Failed: {e}")
                st.error(traceback.format_exc())
                audio_results.append({"scene": i, "path": None, "status": "failed"})
            finally:
                # Release our references first so the collection pass can reclaim them.
                desc_input_ids = prompt_input_ids = generation = audio_arr = None
                cleanup_gpu_memory()

        st.success("βœ… Audio generation step complete.")
    except Exception as e:
        st.error(f"❌ Audio Gen Step Failed: {e}")
        st.error(traceback.format_exc())
        audio_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
    finally:
        st.info("πŸ”„ Unloading TTS Model...")
        # Clear the local references BEFORE collecting; handing the objects to
        # cleanup_gpu_memory() would keep them alive through gc/empty_cache.
        tts_model = tts_tokenizer = tts_desc_tokenizer = None
        cleanup_gpu_memory()
        st.info("βœ… TTS Model Unloaded.")
    return audio_results
326
+
327
def run_compose_step_ffmpeg(video_results, audio_results, temp_dir, title="final_pov_video"):
    """Join the per-scene clips and narration into one MP4 using ffmpeg-python.

    Only scenes that have BOTH a succeeded video and a succeeded audio entry
    are used, in ascending scene order. Audio and video are concatenated
    separately and then muxed together.

    Args:
        video_results: Result dicts from the video step.
        audio_results: Result dicts from the audio step.
        temp_dir: Working directory for the list files, intermediates and output.
        title: Base file name (without extension) of the final video.

    Returns:
        Path of the composed video, or ``None`` if no pairs matched or any
        stage failed. Intermediate files are removed in every case.
    """

    def _run_stage(stream, failure_heading):
        """Run one ffmpeg stage quietly; on ffmpeg.Error show stderr and re-raise."""
        try:
            stream.global_args('-hide_banner', '-loglevel', 'error').run(overwrite_output=True, cmd='ffmpeg')
        except ffmpeg.Error as e:
            st.error(failure_heading)
            st.code(e.stderr.decode() if e.stderr else str(e))
            raise

    st.info("🎞️ Composing final video using ffmpeg-python (CPU)...")
    display_memory_usage()

    def in_temp(name):
        return os.path.join(temp_dir, name)

    long_video_path = in_temp("long_video_temp.mp4")
    long_audio_path = in_temp("long_audio_temp.wav")
    final_output_path = in_temp(f"{title}.mp4")
    concat_video_list_path = in_temp("ffmpeg_video_list.txt")
    concat_audio_list_path = in_temp("ffmpeg_audio_list.txt")
    final_video_path = None

    try:
        ok_videos = [entry for entry in video_results if entry["status"] == "succeeded"]
        ok_videos.sort(key=lambda entry: entry["scene"])
        ok_audio = [entry for entry in audio_results if entry["status"] == "succeeded"]
        ok_audio.sort(key=lambda entry: entry["scene"])

        # Pair clips with narration by scene index; unpaired scenes are dropped.
        audio_by_scene = {entry['scene']: entry['path'] for entry in ok_audio}
        paths_to_compose = [
            {'scene': entry['scene'], 'video': entry['path'], 'audio': audio_by_scene[entry['scene']]}
            for entry in ok_videos
            if entry['scene'] in audio_by_scene
        ]

        if not paths_to_compose:
            st.error("❌ No matching video/audio pairs found.")
            return None

        st.write(f"Found {len(paths_to_compose)} matching scene(s) to compose.")

        # 1. Concat-demuxer list files, with paths relative to temp_dir.
        with open(concat_video_list_path, "w") as f_vid, open(concat_audio_list_path, "w") as f_aud:
            for pair in paths_to_compose:
                rel_video = os.path.relpath(pair['video'], temp_dir)
                rel_audio = os.path.relpath(pair['audio'], temp_dir)
                f_vid.write(f"file '{rel_video}'\n")
                f_aud.write(f"file '{rel_audio}'\n")

        # 2. Audio -> one intermediate WAV.
        st.write("Concatenating audio...")
        audio_stream = ffmpeg.input(
            concat_audio_list_path, format='concat', safe=0, fflags='+igndts'
        ).output(long_audio_path, acodec='pcm_s16le')
        _run_stage(audio_stream, "FFmpeg Audio Concat Error:")
        st.write("Audio concatenated.")

        # 3. Video -> one intermediate MP4 (stream copy, no re-encode).
        st.write("Concatenating videos...")
        video_stream = ffmpeg.input(
            concat_video_list_path, format='concat', safe=0, fflags='+igndts'
        ).output(long_video_path, c='copy')
        _run_stage(video_stream, "FFmpeg Video Concat Error:")
        st.write("Videos concatenated.")

        # 4. Mux: copy video, encode audio as AAC, stop at the shorter stream.
        st.write("Muxing final video...")
        in_video = ffmpeg.input(long_video_path)
        in_audio = ffmpeg.input(long_audio_path)
        mux_stream = ffmpeg.output(
            in_video, in_audio, final_output_path,
            vcodec='copy', acodec='aac', shortest=None, strict='experimental'
        )
        _run_stage(mux_stream, "FFmpeg Muxing Error:")
        final_video_path = final_output_path
        st.success("βœ… Final video composed!")

    except Exception as e:
        st.error(f"❌ Video Composition Step Failed: {e}")
        st.error(traceback.format_exc())
        final_video_path = None
    finally:
        st.write("Cleaning up intermediate composition files...")
        leftovers = (long_video_path, long_audio_path, concat_video_list_path, concat_audio_list_path)
        for f_path in leftovers:
            if not os.path.exists(f_path):
                continue
            try:
                os.remove(f_path)
            except Exception as e_clean:
                print(f"Error cleaning {f_path}: {e_clean}")
        display_memory_usage()
    return final_video_path
428
+
429
+
430
+ # --- Streamlit UI ---
431
+
432
st.title("🎬 POV Video Gen (HF Space Optimized)")
st.caption("Local Generation: Scenario -> Story -> Images -> Videos -> Audio -> Compose -> Download")


# Initialize Session State
def init_state(force=False):
    """Populate ``st.session_state`` with the workflow's default values.

    Args:
        force: When False (default), only missing keys are filled in, so
            values survive Streamlit reruns. When True, every workflow key is
            overwritten with its default, which is what "reset" needs. The
            one exception is ``temp_dir_path``: it is kept so that a following
            cleanup_temp_dir() call can still find the old directory.
    """
    # Built on every call so the list defaults are never shared between runs.
    defaults = {
        'generation_in_progress': False, 'current_step': "idle", 'story_data': None,
        'image_results': [], 'video_results': [], 'audio_results': [],
        'final_video_path': None, 'temp_dir_path': None,
        'num_scenes': NUM_SCENES_DEFAULT
    }
    for key, default_value in defaults.items():
        overwrite = force and key != 'temp_dir_path'
        if overwrite or key not in st.session_state:
            st.session_state[key] = default_value


def _rerun_app():
    """Restart the script run, preferring ``st.rerun`` where Streamlit provides it."""
    rerun = getattr(st, "rerun", None) or st.experimental_rerun
    rerun()


init_state()

# --- Sidebar ---
with st.sidebar:
    st.header("βš™οΈ Config & Control")
    user_prompt = st.text_area("1. Enter POV Scenario:", height=100, value="POV: You're Marco Polo negotiating trade routes in the Silk Road bazaar (1270)", key="user_prompt_input")
    num_scenes_req = st.number_input(f"2. Target Scenes (Max {MAX_SCENES}):", min_value=1, max_value=MAX_SCENES, value=st.session_state.num_scenes, key="num_scenes_req_input")

    # Generation is blocked while a run is active and when no GPU is present.
    start_disable = st.session_state.generation_in_progress or device == "cpu"
    start_button = st.button("πŸš€ Start Generation", type="primary", disabled=start_disable)

    if start_button:
        init_state(force=True)  # really discard results of any previous run
        st.session_state.generation_in_progress = True
        st.session_state.current_step = "story"
        st.session_state.num_scenes = num_scenes_req
        cleanup_temp_dir()  # remove old files
        get_temp_dir()  # recreate the directory for this run
        _rerun_app()

    st.header("⚠️ Actions")
    if st.button("πŸ” Reset Workflow", disabled=st.session_state.generation_in_progress):
        init_state(force=True)
        cleanup_temp_dir()
        _rerun_app()

    if st.button("🧹 Clean Temp Files Only", help=f"Removes files in {st.session_state.get('temp_dir_path', 'N/A')}", disabled=st.session_state.generation_in_progress):
        cleanup_temp_dir()
        _rerun_app()  # refresh the button help text
475
+
476
+ # --- Main Area Logic & Progress ---
477
# Runs exactly one workflow step per script execution, then reruns the script
# so the next execution picks up the following step from session state.
st.divider()
if device == "cpu":
    st.error("πŸ”΄ GPU (CUDA) is required. Cannot run on CPU.")
elif st.session_state.generation_in_progress:
    st.subheader(f"πŸš€ Running Step: {st.session_state.current_step.upper()}")
    progress_bar = st.progress(0)
    steps = ["story", "image", "video", "audio", "compose", "done"]
    try:
        current_index = steps.index(st.session_state.current_step)
        # st.progress takes an int in 0..100 (or a float in 0.0..1.0); the
        # percentage must therefore be an int, not a float such as 20.0.
        progress_bar.progress(int(current_index * 100 / (len(steps) - 1)))
    except ValueError:
        progress_bar.progress(0)  # unknown step name

    # Container that holds the status output of the step being run.
    status_placeholder = st.empty()

    try:
        temp_dir = get_temp_dir()
        current_step = st.session_state.current_step

        if current_step == "story":
            with status_placeholder.container():
                st.session_state.story_data = run_llm_step(user_prompt, st.session_state.num_scenes)
            next_step = "image" if st.session_state.story_data else "error"

        elif current_step == "image":
            scenes = st.session_state.story_data.get('scenes', [])
            with status_placeholder.container():
                st.session_state.image_results = run_image_step(scenes, temp_dir)
            next_step = "video" if any(r['status'] == 'succeeded' for r in st.session_state.image_results) else "error"

        elif current_step == "video":
            scenes = st.session_state.story_data.get('scenes', [])
            with status_placeholder.container():
                st.session_state.video_results = run_video_step(st.session_state.image_results, scenes, temp_dir)
            next_step = "audio" if any(r['status'] == 'succeeded' for r in st.session_state.video_results) else "error"

        elif current_step == "audio":
            scenes = st.session_state.story_data.get('scenes', [])
            with status_placeholder.container():
                st.session_state.audio_results = run_audio_step(scenes, temp_dir)
            next_step = "compose" if any(r['status'] == 'succeeded' for r in st.session_state.audio_results) else "error"

        elif current_step == "compose":
            # File name = alphanumeric characters of the story title. Fall back
            # to a fixed name when nothing usable remains (avoids ".mp4").
            if st.session_state.story_data:
                raw_title = str(st.session_state.story_data.get('title', 'pov'))
                title_base = "".join(filter(str.isalnum, raw_title)) or "pov_video"
            else:
                title_base = "pov_video"
            with status_placeholder.container():
                st.session_state.final_video_path = run_compose_step_ffmpeg(
                    st.session_state.video_results, st.session_state.audio_results, temp_dir, title=title_base)
            next_step = "done" if st.session_state.final_video_path else "error"

        else:
            next_step = "error"

        if next_step != "error":
            # Advance and rerun so the next step starts in a fresh script run.
            st.session_state.current_step = next_step
            if next_step == "done":
                st.session_state.generation_in_progress = False
                progress_bar.progress(100)
            # Prefer st.rerun; older Streamlit only has st.experimental_rerun.
            (getattr(st, "rerun", None) or st.experimental_rerun)()
        else:
            st.error(f"πŸ›‘ Workflow failed at step: {current_step}")
            st.session_state.current_step = "error"
            st.session_state.generation_in_progress = False

    except Exception as e:
        st.error(f"An unexpected error occurred during step {st.session_state.current_step}: {e}")
        st.error(traceback.format_exc())
        st.session_state.current_step = "error"
        st.session_state.generation_in_progress = False
543
+
544
+
545
+ # --- Display Final Output ---
546
# Final-result panel: shows the composed video with a download button, or a
# status line describing why there is nothing to show yet.
st.divider()
st.header("βœ… Final Video")

workflow_step = st.session_state.current_step
if workflow_step == "done" and st.session_state.final_video_path:
    final_video_path = st.session_state.final_video_path
    if not os.path.exists(final_video_path):
        st.error(f"Final video file not found: {final_video_path}. It might have been cleaned up.")
    else:
        st.video(final_video_path)
        try:
            with open(final_video_path, "rb") as fp:
                st.download_button(
                    label="⬇️ Download Final Video (.mp4)",
                    data=fp,
                    file_name=os.path.basename(final_video_path),
                    mime="video/mp4",
                    key="final_video_download_btn",
                )
        except Exception as e:
            st.error(f"Error reading final video for download: {e}")
elif workflow_step == "error":
    st.error("πŸ›‘ Workflow failed. Check logs above. Please Reset and try again.")
elif st.session_state.generation_in_progress:
    st.info(f"⏳ Workflow running... Current step: **{st.session_state.current_step.upper()}**")
else:
    st.info("πŸ‘‹ Ready to generate. Use the sidebar to start.")
571
+
572
+ # Optional: Display intermediate results in an expander
573
# Collapsed debug view of everything currently held in session state.
with st.expander("Show Intermediate File Details", expanded=False):
    json_sections = (
        ("**Story Data:**", st.session_state.story_data or {}),
        ("**Image Results:**", st.session_state.image_results or []),
        ("**Video Results:**", st.session_state.video_results or []),
        ("**Audio Results:**", st.session_state.audio_results or []),
    )
    for heading, payload in json_sections:
        st.write(heading)
        st.json(payload)
    st.write("**Final Path:**", st.session_state.final_video_path or "Not generated")
    st.write("**Temp Dir:**", st.session_state.get('temp_dir_path', "N/A"))

# Refresh the sidebar memory read-out at the end of every script run.
display_memory_usage()