garyuzair commited on
Commit
3f918d1
·
verified ·
1 Parent(s): 1b4bb8c

Update src/app_hf_space_optimized.py

Browse files
Files changed (1) hide show
  1. src/app_hf_space_optimized.py +720 -370
src/app_hf_space_optimized.py CHANGED
@@ -10,6 +10,8 @@ import numpy as np
10
  import ffmpeg # Use ffmpeg-python
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
  from diffusers import StableDiffusionXLPipeline
 
 
13
  from diffusers.pipelines.cogvideo import CogVideoXPipeline
14
  from diffusers.utils import export_to_video
15
  from parler_tts import ParlerTTSForConditionalGeneration
@@ -18,7 +20,8 @@ import shutil
18
  import traceback
19
  import psutil # For memory stats
20
 
21
- st.set_page_config(layout="wide", page_title="POV Video Gen (HF Space)")
 
22
 
23
  # --- Configuration ---
24
  LLM_MODEL_ID = "Qwen/Qwen3-0.6B"
@@ -26,13 +29,15 @@ IMAGE_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
26
  VIDEO_MODEL_ID = "THUDM/CogVideoX-2b"
27
  TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
28
 
 
29
  IMAGE_WIDTH = 768
30
  IMAGE_HEIGHT = 1344
 
31
  SCENE_DURATION_SECONDS = 4 # Reduced duration for faster processing
32
  VIDEO_FPS = 10
33
  NUM_SCENES_DEFAULT = 3 # Lowered default
34
- MAX_SCENES = 4 # Stricter limit for free tier
35
- TEMP_SUBDIR = "pov_video_temp_hf" # Unique name
36
 
37
  # --- Device Setup & Memory Monitor ---
38
  mem_info_placeholder = st.sidebar.empty()
@@ -44,45 +49,58 @@ def display_memory_usage():
44
  cpu_mem = process.memory_info().rss / (1024 * 1024) # MB
45
  gpu_mem_info = "N/A"
46
  if torch.cuda.is_available():
 
47
  allocated = torch.cuda.memory_allocated(0) / (1024 * 1024) # MB
48
- reserved = torch.cuda.memory_reserved(0) / (1024 * 1024) # MB
 
49
  total = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024) # MB
50
- gpu_mem_info = f"Alloc: {allocated:.0f}MB | Reserv: {reserved:.0f}MB | Total: {total:.0f}MB"
51
  mem_info_placeholder.info(f"🧠 CPU Mem: {cpu_mem:.0f} MB\n⚡ GPU Mem: {gpu_mem_info}")
52
  except Exception as e:
53
  mem_info_placeholder.warning(f"Could not get memory info: {e}")
54
 
 
55
  if torch.cuda.is_available():
56
  device = "cuda"
57
  try:
58
  vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
59
  st.sidebar.success(f"✅ GPU Detected! VRAM: {vram_gb:.2f} GB")
60
- if vram_gb < 15:
61
- st.sidebar.warning("⚠️ Low VRAM (< 15GB). May struggle.")
 
62
  except Exception:
63
- st.sidebar.warning("Could not read GPU VRAM.") # Continue assuming GPU exists
64
  else:
65
  device = "cpu"
66
- st.sidebar.error("⚠️ No GPU! App will be extremely slow & likely fail.")
67
 
68
  # --- Helper Functions ---
69
  def cleanup_gpu_memory(*args):
70
- """Attempts to free GPU memory."""
71
- print(f"Attempting GPU mem cleanup. Vars to del: {len(args)}")
72
- display_memory_usage() # Before cleanup
73
- del args # Remove reference to the tuple itself
 
 
 
74
  gc.collect()
75
  if torch.cuda.is_available():
76
  torch.cuda.empty_cache()
77
- display_memory_usage() # After cleanup
78
- print("GPU mem cleanup done.")
 
 
 
 
79
 
80
  def get_temp_dir():
81
  """Creates or returns the path to the temporary directory."""
82
  # Use a consistent path within the app's execution context for simplicity on Spaces
83
  # This might lead to leftover files if cleanup fails, but avoids potential permission issues with system temp
84
- app_temp_dir = os.path.abspath(TEMP_SUBDIR) # Use relative path from script
 
85
  os.makedirs(app_temp_dir, exist_ok=True)
 
86
  if 'temp_dir_path' not in st.session_state or st.session_state.temp_dir_path != app_temp_dir:
87
  print(f"Setting temp dir: {app_temp_dir}")
88
  st.session_state.temp_dir_path = app_temp_dir
@@ -91,11 +109,14 @@ def get_temp_dir():
91
  def cleanup_temp_dir():
92
  """Removes the application's temporary directory."""
93
  dir_path = st.session_state.get('temp_dir_path', None)
94
- if dir_path and os.path.exists(dir_path) and TEMP_SUBDIR in dir_path: # Safety check
 
 
95
  try:
 
96
  shutil.rmtree(dir_path)
97
  st.sidebar.success(f"Cleaned up: {dir_path}")
98
- st.session_state.temp_dir_path = None
99
  except Exception as e:
100
  st.sidebar.error(f"Error cleaning temp dir {dir_path}: {e}")
101
  else:
@@ -104,36 +125,44 @@ def cleanup_temp_dir():
104
 
105
  # --- Model Interaction Functions (Load -> Use -> Unload) ---
106
 
107
- def run_llm_step(user_prompt, num_scenes):
108
  """Loads LLM, generates story, unloads LLM."""
109
- st.info(f"🔄 Loading LLM: {LLM_MODEL_ID}...")
110
- display_memory_usage()
111
- llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None
112
- story_data = None
113
- try:
114
- dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_bf16_supported() else torch.float16 if device=="cuda" else torch.float32
115
- llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
116
- llm_model = AutoModelForCausalLM.from_pretrained(
117
- LLM_MODEL_ID, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto" # Try low_cpu_mem_usage
118
- )
119
  display_memory_usage()
120
- st.info("🧠 Generating story structure...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # --- System Prompt --- (Shortened descriptions max length)
123
- system_prompt = f"""
124
- You are an expert director creating POV TikTok video scripts.
125
- Break down the user's scenario into exactly {num_scenes} scenes ({SCENE_DURATION_SECONDS}s each).
126
  For EACH scene, generate:
127
- 1. `scene_description`: Max 1-2 concise sentences describing action/setting for TTS. Max 350 characters.
128
- 2. `image_prompt`: Detailed SDXL POV prompt (Start with "First-person perspective - pov shot of..."). Include setting, mood, style, time period, elements. Add "pov hands from the bottom corner..." if needed.
129
- 3. `video_direction_prompt`: Simple camera action/motion for CogVideoX (e.g., "Camera pans right", "Subtle zoom in", "Static shot", "Hand reaches out").
130
- 4. `audio_description`: Voice & ambience description for Parler-TTS (e.g., "Nervous male voice, faint market chatter.", "Calm female narrator, quiet library ambience.").
131
 
132
- Respond ONLY with a valid JSON object:
133
  {{
134
  "story_details": {{
135
- "title": "POV Title (Year)",
136
- "full_story": "Brief summary...",
137
  "scenes": [
138
  {{ // Scene 1
139
  "scene_description": "...", // Max 350 chars
@@ -145,439 +174,760 @@ Respond ONLY with a valid JSON object:
145
  ]
146
  }}
147
  }}
148
- Strictly adhere to JSON format. No extra text.
149
- """.strip()
150
-
151
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": f"Create script: {user_prompt}"}]
152
- text_input = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
153
- model_inputs = llm_tokenizer([text_input], return_tensors="pt").to(llm_model.device if hasattr(llm_model, 'device') else device)
154
-
155
- # Use recommended parameters for non-thinking Qwen3
156
- generated_ids = llm_model.generate(
157
- **model_inputs, max_new_tokens=4096, # Still allow space for generation
158
- temperature=0.7, top_p=0.8, top_k=20, do_sample=True,
159
- pad_token_id=llm_tokenizer.eos_token_id # Important for stopping
160
- )
161
- output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
162
- response_text = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
163
-
164
- st.write("LLM Raw Output:"); st.code(response_text, language='text')
165
- json_string = response_text.strip().removeprefix("```json").removesuffix("```").strip()
166
- parsed_data = json.loads(json_string)
167
-
168
- if not ("story_details" in parsed_data and "scenes" in parsed_data["story_details"]): raise ValueError("Invalid JSON structure.")
169
- actual_num_scenes = len(parsed_data["story_details"]["scenes"])
170
- if actual_num_scenes != num_scenes: st.warning(f"LLM gave {actual_num_scenes} scenes, requested {num_scenes}.")
171
-
172
- story_data = parsed_data["story_details"]
173
- st.success("✅ Story generation complete.")
174
- except Exception as e:
175
- st.error(f"❌ LLM Step Failed: {e}"); st.error(traceback.format_exc()); story_data = None
176
- finally:
177
- st.info("🔄 Unloading LLM..."); cleanup_gpu_memory(llm_model, llm_tokenizer, model_inputs, generated_ids); st.info("✅ LLM Unloaded.")
178
- return story_data
179
 
180
- def run_image_step(scenes, temp_dir):
181
- st.info(f"🔄 Loading Image Generator: {IMAGE_MODEL_ID}...")
182
- display_memory_usage()
183
- image_pipe = None; image_results = []
184
- try:
185
- dtype = torch.float16 if device == "cuda" else torch.float32
186
- image_pipe = StableDiffusionXLPipeline.from_pretrained(
187
- IMAGE_MODEL_ID, torch_dtype=dtype, use_safetensors=True, variant="fp16" if device == "cuda" else None,
188
- low_cpu_mem_usage=True # Crucial for loading on low RAM systems
189
- )
190
- # Use CPU offloading even if it's slower, necessary for T4 VRAM
191
- if device == "cuda": image_pipe.enable_model_cpu_offload()
192
- else: image_pipe.to(device) # Move to CPU if needed
193
- display_memory_usage()
194
- st.info("🎨 Generating images sequentially...")
195
 
196
- for i, scene in enumerate(scenes):
197
- img_path = os.path.join(temp_dir, f"scene_{i+1}_img.png")
198
- st.write(f"Generating Image {i+1}/{len(scenes)}...")
199
- image = None # Define before try block
200
  try:
201
- with torch.no_grad():
202
- image = image_pipe(
203
- prompt=scene.get("image_prompt", "blank image"),
204
- width=IMAGE_WIDTH, height=IMAGE_HEIGHT, num_inference_steps=25 # Fewer steps for speed
205
- ).images[0]
206
- image.save(img_path)
207
- image_results.append({"scene": i, "path": img_path, "status": "succeeded"})
208
- st.image(image, caption=f"Scene {i+1} OK", width=150)
209
- except Exception as e:
210
- st.error(f"❌ Image {i+1} Failed: {e}"); st.error(traceback.format_exc())
211
- image_results.append({"scene": i, "path": None, "status": "failed"})
212
- finally: cleanup_gpu_memory(image) # Clean intermediate var
213
-
214
- st.success(" Image generation step complete.")
215
- except Exception as e:
216
- st.error(f"❌ Image Gen Step Failed: {e}"); st.error(traceback.format_exc())
217
- image_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
218
- finally:
219
- st.info("🔄 Unloading Image Generator..."); cleanup_gpu_memory(image_pipe); st.info("✅ Image Generator Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  return image_results
221
 
222
- def run_video_step(image_results, scenes, temp_dir):
 
223
  successful_images = [item for item in image_results if item["status"] == "succeeded"]
224
- if not successful_images: return []
225
- st.info(f"🔄 Loading Video Generator: {VIDEO_MODEL_ID}...")
226
- display_memory_usage()
227
- video_pipe = None; video_results = []
228
- try:
229
- dtype = torch.float16 if device == "cuda" else torch.float32
230
- # Instantiate VAE and Transformer separately for potential offloading/quantization later if needed
231
- # For now, load pipeline directly, enabling optimizations
232
- video_pipe = CogVideoXPipeline.from_pretrained(VIDEO_MODEL_ID, torch_dtype=dtype)
233
- if device == "cuda":
234
- video_pipe.enable_model_cpu_offload()
235
- video_pipe.enable_sequential_cpu_offload() # Needed for low VRAM
236
- else: video_pipe.to(device)
237
- video_pipe.vae.enable_slicing(); video_pipe.vae.enable_tiling()
238
- display_memory_usage()
239
- st.info("🎬 Generating videos sequentially...")
240
- generator = torch.Generator(device=device)
241
 
242
- for item in successful_images:
243
- scene_index = item["scene"]; vid_path = os.path.join(temp_dir, f"scene_{scene_index + 1}_vid.mp4")
244
- st.write(f"Generating Video for Scene {scene_index + 1}...")
245
- img, video_frames = None, None # Define before try
246
- try:
247
- img = Image.open(item["path"])
248
- video_direction = scenes[scene_index].get("video_direction_prompt", "subtle motion")
249
- seed = int(time.time() * 1000 + scene_index) % 100000
250
- if device == "cuda": generator.manual_seed(seed)
251
- else: generator = torch.Generator(device='cpu').manual_seed(seed)
252
-
253
- with torch.no_grad():
254
- video_frames = video_pipe(
255
- prompt=video_direction, image=img, num_inference_steps=40, # Slightly fewer steps
256
- num_frames=int(SCENE_DURATION_SECONDS * VIDEO_FPS) + 1,
257
- guidance_scale=6.0, generator=generator
258
- ).frames[0]
259
- export_to_video(video_frames, vid_path, fps=VIDEO_FPS)
260
- video_results.append({"scene": scene_index, "path": vid_path, "status": "succeeded"})
261
- # Comment out preview to save resources on Spaces
262
- # st.video(vid_path)
263
- st.success(f"Video Scene {scene_index + 1} OK.")
264
- except Exception as e:
265
- st.error(f" Video {scene_index + 1} Failed: {e}"); st.error(traceback.format_exc())
266
- video_results.append({"scene": scene_index, "path": None, "status": "failed"})
267
- finally: cleanup_gpu_memory(img, video_frames)
268
-
269
- st.success("✅ Video generation step complete.")
270
- except Exception as e:
271
- st.error(f" Video Gen Step Failed: {e}"); st.error(traceback.format_exc())
272
- video_results = [{"scene": item["scene"], "path": None, "status": "failed"} for item in successful_images]
273
- finally:
274
- st.info("🔄 Unloading Video Generator..."); cleanup_gpu_memory(video_pipe); st.info("✅ Video Generator Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  return video_results
276
 
277
- def run_audio_step(scenes, temp_dir):
278
- st.info(f"🔄 Loading TTS Model: {TTS_MODEL_ID}...")
279
- display_memory_usage()
280
- tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
281
- audio_results = []
282
- try:
283
- # Load TTS model (Parler requires specific class)
284
- tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID).to(device)
285
- tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID) # For text prompt
286
- tts_desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path) # For description
287
- display_memory_usage()
288
- st.info("🔊 Generating audio sequentially...")
289
 
290
- for i, scene in enumerate(scenes):
291
- audio_path = os.path.join(temp_dir, f"scene_{i+1}_audio.wav")
292
- st.write(f"Generating Audio {i+1}/{len(scenes)}...")
293
- desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None # Define before try
294
- try:
295
- text_to_speak = scene.get("scene_description", "")[:350] # Enforce limit
296
- voice_description = scene.get("audio_description", "A neutral speaker.")
297
- if not text_to_speak:
298
- audio_results.append({"scene": i, "path": None, "status": "skipped"})
299
- continue
300
-
301
- desc_input_ids = tts_desc_tokenizer(voice_description, return_tensors="pt").input_ids.to(device)
302
- prompt_input_ids = tts_tokenizer(text_to_speak, return_tensors="pt").input_ids.to(device)
303
-
304
- with torch.no_grad():
305
- generation = tts_model.generate(
306
- input_ids=desc_input_ids, prompt_input_ids=prompt_input_ids,
307
- do_sample=True, temperature=0.7 # Slightly higher temp for variety
308
- ).to(torch.float32)
309
-
310
- audio_arr = generation.cpu().numpy().squeeze()
311
- sampling_rate = tts_model.config.sampling_rate
312
- sf.write(audio_path, audio_arr, sampling_rate)
313
- audio_results.append({"scene": i, "path": audio_path, "status": "succeeded"})
314
- st.audio(audio_path, format='audio/wav') # Preview audio
315
- except Exception as e:
316
- st.error(f"❌ Audio {i+1} Failed: {e}"); st.error(traceback.format_exc())
317
- audio_results.append({"scene": i, "path": None, "status": "failed"})
318
- finally: cleanup_gpu_memory(desc_input_ids, prompt_input_ids, generation, audio_arr)
319
-
320
- st.success("✅ Audio generation step complete.")
321
- except Exception as e:
322
- st.error(f"❌ Audio Gen Step Failed: {e}"); st.error(traceback.format_exc())
323
- audio_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
324
- finally:
325
- st.info("🔄 Unloading TTS Model..."); cleanup_gpu_memory(tts_model, tts_tokenizer, tts_desc_tokenizer); st.info("✅ TTS Model Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  return audio_results
327
 
328
- def run_compose_step_ffmpeg(video_results, audio_results, temp_dir, title="final_pov_video"):
329
  """Combines videos and audio using ffmpeg-python."""
330
- st.info("🎞️ Composing final video using ffmpeg-python (CPU)...")
331
- display_memory_usage()
332
- final_video_path = None
333
- long_video_path = os.path.join(temp_dir, "long_video_temp.mp4")
334
- long_audio_path = os.path.join(temp_dir, "long_audio_temp.wav")
335
- final_output_path = os.path.join(temp_dir, f"{title}.mp4")
336
- concat_video_list_path = os.path.join(temp_dir, "ffmpeg_video_list.txt")
337
- concat_audio_list_path = os.path.join(temp_dir, "ffmpeg_audio_list.txt")
338
 
339
- try:
340
- successful_videos = sorted([item for item in video_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
341
- successful_audio = sorted([item for item in audio_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
342
-
343
- # Align based on scene index for safety
344
- paths_to_compose = []
345
- audio_map = {item['scene']: item['path'] for item in successful_audio}
346
- for video_item in successful_videos:
347
- scene_idx = video_item['scene']
348
- if scene_idx in audio_map:
349
- paths_to_compose.append({'scene': scene_idx, 'video': video_item['path'], 'audio': audio_map[scene_idx]})
350
-
351
- if not paths_to_compose:
352
- st.error("❌ No matching video/audio pairs found.")
353
- return None
354
-
355
- st.write(f"Found {len(paths_to_compose)} matching scene(s) to compose.")
356
-
357
- # 1. Create file lists for ffmpeg concat demuxer
358
- with open(concat_video_list_path, "w") as f_vid, open(concat_audio_list_path, "w") as f_aud:
359
- for item in paths_to_compose:
360
- f_vid.write(f"file '{os.path.relpath(item['video'], temp_dir)}'\n") # Use relative paths within temp dir
361
- f_aud.write(f"file '{os.path.relpath(item['audio'], temp_dir)}'\n")
362
-
363
- # 2. Concatenate Audio Files
364
- st.write("Concatenating audio...")
365
- try:
366
- (
367
- ffmpeg
368
- .input(concat_audio_list_path, format='concat', safe=0, fflags='+igndts') # Add flags
369
- .output(long_audio_path, acodec='pcm_s16le') # Output intermediate WAV
370
- .global_args('-hide_banner', '-loglevel', 'error') # Suppress verbose output
371
- .run(overwrite_output=True, cmd='ffmpeg') # Specify cmd='ffmpeg' if needed
372
- )
373
- st.write("Audio concatenated.")
374
- except ffmpeg.Error as e:
375
- st.error("FFmpeg Audio Concat Error:")
376
- st.code(e.stderr.decode() if e.stderr else str(e))
377
- raise # Re-raise to stop the process
378
-
379
- # 3. Concatenate Video Files
380
- st.write("Concatenating videos...")
381
- try:
382
- (
383
- ffmpeg
384
- .input(concat_video_list_path, format='concat', safe=0, fflags='+igndts')
385
- .output(long_video_path, c='copy') # Use stream copy for speed
386
- .global_args('-hide_banner', '-loglevel', 'error')
387
- .run(overwrite_output=True, cmd='ffmpeg')
388
- )
389
- st.write("Videos concatenated.")
390
- except ffmpeg.Error as e:
391
- st.error("FFmpeg Video Concat Error:")
392
- st.code(e.stderr.decode() if e.stderr else str(e))
393
- raise
394
-
395
- # 4. Mux (Combine) Video and Audio
396
- st.write("Muxing final video...")
397
- try:
398
- in_video = ffmpeg.input(long_video_path)
399
- in_audio = ffmpeg.input(long_audio_path)
400
- (
401
- ffmpeg
402
- .output(in_video, in_audio, final_output_path, vcodec='copy', acodec='aac', shortest=None, strict='experimental') # Use aac audio codec
403
- .global_args('-hide_banner', '-loglevel', 'error')
404
- .run(overwrite_output=True, cmd='ffmpeg')
405
- )
406
- final_video_path = final_output_path # Set the final path on success
407
- st.success("✅ Final video composed!")
408
 
409
- except ffmpeg.Error as e:
410
- st.error("FFmpeg Muxing Error:")
411
- st.code(e.stderr.decode() if e.stderr else str(e))
412
- final_video_path = None # Ensure it's None on failure
413
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
 
416
- except Exception as e:
417
- st.error(f"❌ Video Composition Step Failed: {e}")
418
- st.error(traceback.format_exc())
419
- final_video_path = None
420
- finally:
421
- # Clean up intermediate files and lists
422
- st.write("Cleaning up intermediate composition files...")
423
- for f_path in [long_video_path, long_audio_path, concat_video_list_path, concat_audio_list_path]:
424
- if os.path.exists(f_path):
425
- try: os.remove(f_path)
426
- except Exception as e_clean: print(f"Error cleaning {f_path}: {e_clean}")
427
- display_memory_usage() # Final memory check for this step
 
 
 
 
428
  return final_video_path
429
 
430
 
431
  # --- Streamlit UI ---
432
 
433
- st.title("🎬 POV Video Gen (HF Space Optimized)")
434
- st.caption("Local Generation: Scenario -> Story -> Images -> Videos -> Audio -> Compose -> Download")
 
435
 
436
  # Initialize Session State
 
437
  def init_state():
438
  keys_to_init = {
439
- 'generation_in_progress': False, 'current_step': "idle", 'story_data': None,
440
- 'image_results': [], 'video_results': [], 'audio_results': [],
441
- 'final_video_path': None, 'temp_dir_path': None,
442
- 'num_scenes': NUM_SCENES_DEFAULT
 
 
 
 
 
443
  }
444
  for key, default_value in keys_to_init.items():
445
  if key not in st.session_state:
446
  st.session_state[key] = default_value
447
- init_state()
448
 
449
  # --- Sidebar ---
450
  with st.sidebar:
451
  st.header("⚙️ Config & Control")
 
452
  user_prompt = st.text_area("1. Enter POV Scenario:", height=100, value="POV: You're Marco Polo negotiating trade routes in the Silk Road bazaar (1270)", key="user_prompt_input")
453
- num_scenes_req = st.number_input(f"2. Target Scenes (Max {MAX_SCENES}):", min_value=1, max_value=MAX_SCENES, value=st.session_state.num_scenes, key="num_scenes_req_input")
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  start_disable = st.session_state.generation_in_progress or device == "cpu"
456
  start_button = st.button("🚀 Start Generation", type="primary", disabled=start_disable)
457
 
458
  if start_button:
459
- init_state() # Reset state variables first
 
460
  st.session_state.generation_in_progress = True
461
- st.session_state.current_step = "story"
462
- st.session_state.num_scenes = num_scenes_req # Use the requested number
463
- cleanup_temp_dir() # Clean old files
464
- get_temp_dir() # Ensure new temp dir exists for this run
465
- st.experimental_rerun()
466
 
467
  st.header("⚠️ Actions")
 
468
  if st.button("🔁 Reset Workflow", disabled=st.session_state.generation_in_progress):
469
- init_state()
470
  cleanup_temp_dir() # Also clean files on reset
471
- st.experimental_rerun()
472
 
473
- if st.button("🧹 Clean Temp Files Only", help=f"Removes files in {st.session_state.get('temp_dir_path', 'N/A')}", disabled=st.session_state.generation_in_progress):
 
 
474
  cleanup_temp_dir()
475
- st.experimental_rerun() # Rerun to update button help text etc.
476
 
477
- # --- Main Area Logic & Progress ---
 
478
  st.divider()
 
 
479
  if device == "cpu":
480
- st.error("🔴 GPU (CUDA) is required. Cannot run on CPU.")
 
481
  elif st.session_state.generation_in_progress:
482
- st.subheader(f"🚀 Running Step: {st.session_state.current_step.upper()}")
483
- progress_bar = st.progress(0)
 
 
484
  steps = ["story", "image", "video", "audio", "compose", "done"]
485
  try:
486
  current_index = steps.index(st.session_state.current_step)
487
- progress_bar.progress((current_index / (len(steps) - 1)) * 100)
 
 
 
 
 
 
 
488
  except ValueError:
489
- progress_bar.progress(0) # Should not happen
 
490
 
491
- # Use placeholders for status updates within each step function
492
  status_placeholder = st.empty()
493
 
494
- # Wrap the step execution in a try block to catch errors and stop
 
495
  try:
496
- temp_dir = get_temp_dir() # Ensure temp_dir is set
497
- current_step = st.session_state.current_step # Local copy
 
 
 
 
498
 
499
  if current_step == "story":
500
- with status_placeholder.container(): st.session_state.story_data = run_llm_step(user_prompt, st.session_state.num_scenes)
501
- next_step = "image" if st.session_state.story_data else "error"
 
 
 
502
 
503
  elif current_step == "image":
504
- scenes = st.session_state.story_data.get('scenes', [])
505
- with status_placeholder.container(): st.session_state.image_results = run_image_step(scenes, temp_dir)
506
- next_step = "video" if any(r['status'] == 'succeeded' for r in st.session_state.image_results) else "error"
 
 
 
 
 
 
507
 
508
  elif current_step == "video":
509
- scenes = st.session_state.story_data.get('scenes', [])
510
- with status_placeholder.container(): st.session_state.video_results = run_video_step(st.session_state.image_results, scenes, temp_dir)
511
- next_step = "audio" if any(r['status'] == 'succeeded' for r in st.session_state.video_results) else "error"
 
 
512
 
513
  elif current_step == "audio":
514
- scenes = st.session_state.story_data.get('scenes', [])
515
- with status_placeholder.container(): st.session_state.audio_results = run_audio_step(scenes, temp_dir)
516
- next_step = "compose" if any(r['status'] == 'succeeded' for r in st.session_state.audio_results) else "error"
 
 
517
 
518
  elif current_step == "compose":
519
- title_base = "".join(filter(str.isalnum, st.session_state.story_data.get('title', 'pov'))).replace(" ", "_") if st.session_state.story_data else "pov_video"
520
- with status_placeholder.container(): st.session_state.final_video_path = run_compose_step_ffmpeg(
521
- st.session_state.video_results, st.session_state.audio_results, temp_dir, title=title_base)
 
 
 
522
  next_step = "done" if st.session_state.final_video_path else "error"
 
 
 
523
 
524
- else: # Should not be reached if logic is right
525
- next_step = "error"
526
 
527
- # Update state and rerun ONLY if the step succeeded
528
- if next_step != "error":
529
- st.session_state.current_step = next_step
530
- if next_step == "done":
531
- st.session_state.generation_in_progress = False # Workflow finished successfully
532
- progress_bar.progress(100)
 
 
 
 
 
 
 
 
533
  st.experimental_rerun()
534
- else:
535
- st.error(f"🛑 Workflow failed at step: {current_step}")
536
- st.session_state.current_step = "error"
 
 
 
537
  st.session_state.generation_in_progress = False
538
 
 
539
  except Exception as e:
 
540
  st.error(f"An unexpected error occurred during step {st.session_state.current_step}: {e}")
541
- st.error(traceback.format_exc())
542
- st.session_state.current_step = "error"
543
- st.session_state.generation_in_progress = False
 
 
 
544
 
545
 
546
  # --- Display Final Output ---
547
  st.divider()
548
  st.header("✅ Final Video")
 
 
549
  if st.session_state.current_step == "done" and st.session_state.final_video_path:
550
  final_video_path = st.session_state.final_video_path
551
  if os.path.exists(final_video_path):
552
- st.video(final_video_path)
 
 
 
553
  try:
554
  with open(final_video_path, "rb") as fp:
555
  st.download_button(
556
  label="⬇️ Download Final Video (.mp4)",
557
  data=fp,
558
- file_name=os.path.basename(final_video_path),
559
  mime="video/mp4",
560
- key="final_video_download_btn"
561
  )
562
  except Exception as e:
563
  st.error(f"Error reading final video for download: {e}")
564
  else:
565
- st.error(f"Final video file not found: {final_video_path}. It might have been cleaned up.")
 
 
566
  elif st.session_state.current_step == "error":
567
- st.error("🛑 Workflow failed. Check logs above. Please Reset and try again.")
 
 
568
  elif st.session_state.generation_in_progress:
 
569
  st.info(f"⏳ Workflow running... Current step: **{st.session_state.current_step.upper()}**")
 
 
570
  else:
571
- st.info("👋 Ready to generate. Use the sidebar to start.")
572
 
573
- # Optional: Display intermediate results in an expander
574
- with st.expander("Show Intermediate File Details", expanded=False):
 
 
575
  st.write("**Story Data:**"); st.json(st.session_state.story_data or {})
576
  st.write("**Image Results:**"); st.json(st.session_state.image_results or [])
577
  st.write("**Video Results:**"); st.json(st.session_state.video_results or [])
578
  st.write("**Audio Results:**"); st.json(st.session_state.audio_results or [])
579
  st.write("**Final Path:**", st.session_state.final_video_path or "Not generated")
580
  st.write("**Temp Dir:**", st.session_state.get('temp_dir_path', "N/A"))
581
-
582
- # Final memory display
 
 
 
 
 
 
 
 
 
 
 
583
  display_memory_usage()
 
10
  import ffmpeg # Use ffmpeg-python
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
  from diffusers import StableDiffusionXLPipeline
13
+ # Corrected import path for CogVideoXPipeline
14
+ # This import path is typically found in the main branch of diffusers
15
  from diffusers.pipelines.cogvideo import CogVideoXPipeline
16
  from diffusers.utils import export_to_video
17
  from parler_tts import ParlerTTSForConditionalGeneration
 
20
  import traceback
21
  import psutil # For memory stats
22
 
23
+ # Use a more explicit title indicating vertical format
24
+ st.set_page_config(layout="wide", page_title="POV Vertical Video Gen (HF Space)")
25
 
26
  # --- Configuration ---
27
  LLM_MODEL_ID = "Qwen/Qwen3-0.6B"
 
29
  VIDEO_MODEL_ID = "THUDM/CogVideoX-2b"
30
  TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
31
 
32
+ # Target Portrait Resolution for TikTok/YouTube Shorts (9:16 aspect ratio)
33
  IMAGE_WIDTH = 768
34
  IMAGE_HEIGHT = 1344
35
+
36
  SCENE_DURATION_SECONDS = 4 # Reduced duration for faster processing
37
  VIDEO_FPS = 10
38
  NUM_SCENES_DEFAULT = 3 # Lowered default
39
+ MAX_SCENES = 4 # Stricter limit for free tier (T4 GPU)
40
+ TEMP_SUBDIR = "pov_video_temp_hf" # Unique name for temp directory
41
 
42
  # --- Device Setup & Memory Monitor ---
43
  mem_info_placeholder = st.sidebar.empty()
 
49
  cpu_mem = process.memory_info().rss / (1024 * 1024) # MB
50
  gpu_mem_info = "N/A"
51
  if torch.cuda.is_available():
52
+ # Get current and peak allocated memory
53
  allocated = torch.cuda.memory_allocated(0) / (1024 * 1024) # MB
54
+ # reserved = torch.cuda.memory_reserved(0) / (1024 * 1024) # MB # Reserved is less critical than allocated/peak
55
+ peak_allocated = torch.cuda.max_memory_allocated(0) / (1024 * 1024) # MB
56
  total = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024) # MB
57
+ gpu_mem_info = f"Alloc: {allocated:.0f}MB | Peak Alloc: {peak_allocated:.0f}MB | Total: {total:.0f}MB"
58
  mem_info_placeholder.info(f"🧠 CPU Mem: {cpu_mem:.0f} MB\n⚡ GPU Mem: {gpu_mem_info}")
59
  except Exception as e:
60
  mem_info_placeholder.warning(f"Could not get memory info: {e}")
61
 
62
# Determine compute device (GPU or CPU) and surface VRAM info in the sidebar.
if torch.cuda.is_available():
    device = "cuda"
    try:
        # Total VRAM of GPU 0 in GiB.
        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        st.sidebar.success(f"✅ GPU Detected! VRAM: {vram_gb:.2f} GB")
        # T4 has ~15GB, K80 has ~11GB (dual). Warn if significantly less than a T4,
        # since SDXL + CogVideoX need most of that even with CPU offload.
        if vram_gb < 14:
            st.sidebar.warning("⚠️ Low VRAM detected (< 14GB). Generation may fail due to memory constraints.")
    except Exception:
        # Property query failed — keep device="cuda" but warn the user.
        st.sidebar.warning("Could not read GPU VRAM.")
else:
    device = "cpu"
    # The pipelines below are far too slow/heavy for CPU; the UI treats this as disabled.
    st.sidebar.error("⚠️ No GPU! Model inference is not supported on CPU. Generation is disabled.")
 
77
  # --- Helper Functions ---
78
def cleanup_gpu_memory(*args):
    """Free GPU memory between pipeline stages.

    Logs memory usage before and after, runs Python garbage collection, and
    empties the CUDA allocator cache when a GPU is present.

    The ``*args`` parameter exists only so call sites read naturally, e.g.
    ``cleanup_gpu_memory(model, tokenizer)``. ``del`` on a local name cannot
    release the caller's bindings (the previous ``for arg in args: del arg``
    loop only unbound the loop variable while the ``args`` tuple still held
    every reference, so it was a no-op). Callers must drop their own
    references — e.g. rebind to ``None`` — for the objects to become
    collectable; existing call sites already do this.
    """
    # Memory snapshot before cleanup (sidebar).
    display_memory_usage()
    gc.collect()
    if torch.cuda.is_available():
        # Return cached allocator blocks to the driver so the next model load
        # sees the freed VRAM.
        torch.cuda.empty_cache()
        # Optionally reset peak stats here to monitor the next stage:
        # torch.cuda.reset_peak_memory_stats(0)
    # Memory snapshot after cleanup (sidebar).
    display_memory_usage()
+
95
 
96
  def get_temp_dir():
97
  """Creates or returns the path to the temporary directory."""
98
  # Use a consistent path within the app's execution context for simplicity on Spaces
99
  # This might lead to leftover files if cleanup fails, but avoids potential permission issues with system temp
100
+ # Using a subdirectory of the current working directory is safer on platforms like Spaces
101
+ app_temp_dir = os.path.join(os.getcwd(), TEMP_SUBDIR)
102
  os.makedirs(app_temp_dir, exist_ok=True)
103
+ # Store the path in session state to be able to clean it later
104
  if 'temp_dir_path' not in st.session_state or st.session_state.temp_dir_path != app_temp_dir:
105
  print(f"Setting temp dir: {app_temp_dir}")
106
  st.session_state.temp_dir_path = app_temp_dir
 
109
  def cleanup_temp_dir():
110
  """Removes the application's temporary directory."""
111
  dir_path = st.session_state.get('temp_dir_path', None)
112
+ # Safety check: Ensure the path exists and contains the unique subdir name before deleting
113
+ # This prevents accidentally deleting critical system directories.
114
+ if dir_path and os.path.exists(dir_path) and TEMP_SUBDIR in dir_path and os.path.basename(dir_path) == TEMP_SUBDIR:
115
  try:
116
+ st.sidebar.info(f"Attempting to clean up: {dir_path}")
117
  shutil.rmtree(dir_path)
118
  st.sidebar.success(f"Cleaned up: {dir_path}")
119
+ st.session_state.temp_dir_path = None # Clear the path from state after cleaning
120
  except Exception as e:
121
  st.sidebar.error(f"Error cleaning temp dir {dir_path}: {e}")
122
  else:
 
125
 
126
  # --- Model Interaction Functions (Load -> Use -> Unload) ---
127
 
128
+ def run_llm_step(user_prompt, num_scenes, status_placeholder):
129
  """Loads LLM, generates story, unloads LLM."""
130
+ with status_placeholder.container():
131
+ st.info(f"🔄 Loading LLM: {LLM_MODEL_ID}...")
132
+ if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0) # Reset peak stats before load
 
 
 
 
 
 
 
133
  display_memory_usage()
134
+ llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None
135
+ story_data = None
136
+ try:
137
+ # Use bfloat16 if available and CUDA is used, otherwise float16 for CUDA, float32 for CPU
138
+ dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_bf16_supported() else torch.float16 if device=="cuda" else torch.float32
139
+ llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
140
+ # Use device_map="auto" for automatic model distribution across devices (including CPU offload)
141
+ llm_model = AutoModelForCausalLM.from_pretrained(
142
+ LLM_MODEL_ID, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto"
143
+ )
144
+ if torch.cuda.is_available():
145
+ display_memory_usage() # Display after loading
146
+ st.info(f"📊 Peak GPU Memory (after LLM load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")
147
+
148
+ st.info("🧠 Generating story structure...")
149
+ if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0) # Reset peak stats before inference
150
 
151
+ # --- System Prompt --- (Updated to mention vertical format and specific dimensions)
152
+ system_prompt = f"""
153
+ You are an expert director creating POV vertical video scripts for platforms like TikTok and YouTube Shorts.
154
+ Break down the user's scenario into exactly {num_scenes} scenes, each intended for a clip approximately {SCENE_DURATION_SECONDS} seconds long with an aspect ratio of {IMAGE_WIDTH}x{IMAGE_HEIGHT} pixels (portrait).
155
  For EACH scene, generate:
156
+ 1. `scene_description`: Max 1-2 concise sentences describing action/setting for TTS. Max 350 characters. Keep in mind this will be spoken over a short video clip.
157
+ 2. `image_prompt`: Detailed SDXL POV prompt (Start with "First-person perspective - pov shot of..."). Include setting, mood, style, time period, elements. Emphasize visual elements suitable for a portrait {IMAGE_WIDTH}x{IMAGE_HEIGHT} frame. Add "pov hands from the bottom corner, phone in hand," etc., if relevant to the scenario.
158
+ 3. `video_direction_prompt`: Simple camera action/motion for CogVideoX (e.g., "Camera pans right", "Subtle zoom in", "Static shot", "Hand reaches out"). Focus on short, subtle motions suitable for a fixed POV and vertical format. Avoid complex actions that require significant scene changes.
159
+ 4. `audio_description`: Voice & ambience description for Parler-TTS (e.g., "Nervous male voice, faint market chatter.", "Calm female narrator, quiet library ambience."). This sets the tone for the narration/voiceover.
160
 
161
+ Respond ONLY with a valid JSON object. Ensure the JSON structure is exactly as follows, with a top-level "story_details" object containing a "scenes" array:
162
  {{
163
  "story_details": {{
164
+ "title": "POV Title (e.g.,POV First Date)",
165
+ "full_story": "Brief summary of the complete POV story.",
166
  "scenes": [
167
  {{ // Scene 1
168
  "scene_description": "...", // Max 350 chars
 
174
  ]
175
  }}
176
  }}
177
+ Strictly adhere to JSON format. No conversational text, markdown code blocks (\`\`\`json), or any other text before or after the JSON block.
178
+ """.strip()
179
+
180
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": f"Create script: {user_prompt}"}]
181
+ # Use add_generation_prompt=True for Qwen models to follow their chat format
182
+ text_input = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
183
+ # Move input tensors to the model's device
184
+ model_inputs = llm_tokenizer([text_input], return_tensors="pt").to(llm_model.device if hasattr(llm_model, 'device') else device)
185
+
186
+ # Generate the response
187
+ generated_ids = llm_model.generate(
188
+ **model_inputs,
189
+ max_new_tokens=4096, # Set a reasonable upper limit for the response length
190
+ temperature=0.7, # Control randomness
191
+ top_p=0.8, # Nucleus sampling
192
+ top_k=20, # Top-k sampling
193
+ do_sample=True, # Enable sampling
194
+ pad_token_id=llm_tokenizer.eos_token_id, # Ensure generation stops correctly
195
+ num_beams=1 # Use greedy or sampling search, not beam search for chat
196
+ )
197
+ # Decode the generated part of the output
198
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
199
+ response_text = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
200
 
201
+ if torch.cuda.is_available():
202
+ display_memory_usage() # Display after inference
203
+ st.info(f"📊 Peak GPU Memory (during LLM inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")
204
+
205
+ st.write("LLM Raw Output:"); st.code(response_text, language='text')
 
 
 
 
 
 
 
 
 
 
206
 
207
+ # Robust JSON parsing - try to find and parse the JSON object
208
+ json_string = response_text.strip()
 
 
209
  try:
210
+ # Attempt direct parse first
211
+ parsed_data = json.loads(json_string)
212
+ except json.JSONDecodeError:
213
+ # If direct parse fails, try to find the JSON within the text (handles ```json, etc.)
214
+ json_start = json_string.find('{')
215
+ json_end = json_string.rfind('}')
216
+ if json_start == -1 or json_end == -1:
217
+ raise ValueError("JSON object not found in LLM output.")
218
+ json_string = json_string[json_start : json_end + 1]
219
+ parsed_data = json.loads(json_string)
220
+
221
+
222
+ if not ("story_details" in parsed_data and "scenes" in parsed_data["story_details"]):
223
+ raise ValueError("Invalid JSON structure from LLM: missing 'story_details' or 'scenes'.")
224
+
225
+ # Check if the LLM generated the requested number of scenes (warning only, proceed with what was generated)
226
+ actual_num_scenes = len(parsed_data["story_details"]["scenes"])
227
+ if actual_num_scenes != num_scenes:
228
+ st.warning(f"LLM generated {actual_num_scenes} scenes, but {num_scenes} were requested. Using the generated scenes.")
229
+
230
+ story_data = parsed_data["story_details"]
231
+ st.success("✅ Story generation complete.")
232
+ except Exception as e:
233
+ st.error(f"❌ LLM Step Failed: {e}"); st.error(traceback.format_exc()); story_data = None
234
+ finally:
235
+ # Explicitly set references to None before cleanup
236
+ cleanup_gpu_memory(llm_model, llm_tokenizer, model_inputs, generated_ids)
237
+ llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None # Ensure they are truly dereferenced
238
+ st.info("✅ LLM Unloaded.")
239
+ return story_data
240
+
241
def run_image_step(scenes, temp_dir, status_placeholder):
    """Load SDXL, generate one portrait image per scene, then unload.

    Parameters:
        scenes: list of scene dicts from the LLM step (uses "image_prompt").
        temp_dir: directory where scene_{n}_img.png files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of per-scene dicts {"scene": i, "path": str|None,
        "status": "succeeded"|"failed"}, one entry per input scene on a
        per-image failure; may be all-"failed" if the pipeline itself
        fails to load.
    """
    if not scenes:
        with status_placeholder.container(): st.warning("Skipping image step: No scenes available from story data.")
        return []
    with status_placeholder.container():
        st.info(f"🔄 Loading Image Generator: {IMAGE_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        image_pipe = None; image_results = []
        try:
            dtype = torch.float16 if device == "cuda" else torch.float32
            # Load SDXL pipe with memory optimizations; fp16 variant weights
            # are only requested when running half precision on CUDA.
            image_pipe = StableDiffusionXLPipeline.from_pretrained(
                IMAGE_MODEL_ID,
                torch_dtype=dtype,
                use_safetensors=True,
                variant="fp16" if device == "cuda" and dtype == torch.float16 else None,
                low_cpu_mem_usage=True  # Helps load models on systems with limited RAM
            )
            # Model CPU offload moves submodules to GPU only while they run,
            # trading speed for a much lower VRAM peak.
            if device == "cuda": image_pipe.enable_model_cpu_offload()
            else: image_pipe.to(device)  # No offload path: place pipe explicitly

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after Image load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info(f"🎨 Generating images ({IMAGE_WIDTH}x{IMAGE_HEIGHT}) sequentially...")

            for i, scene in enumerate(scenes):
                # Output path for this scene's image inside the temp directory.
                img_path = os.path.join(temp_dir, f"scene_{i+1}_img.png")
                st.write(f"Generating Image {i+1}/{len(scenes)}...")
                image = None  # Defined before try so the finally can always clean it
                try:
                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-image peak tracking
                    with torch.no_grad():
                        # Generate the portrait-format image; fall back to a
                        # harmless default prompt if the LLM omitted one.
                        image = image_pipe(
                            prompt=scene.get("image_prompt", "blank image, abstract art"),
                            width=IMAGE_WIDTH,
                            height=IMAGE_HEIGHT,
                            num_inference_steps=25  # Balance speed and quality
                        ).images[0]

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Image inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    image.save(img_path)
                    image_results.append({"scene": i, "path": img_path, "status": "succeeded"})
                    # Small preview to keep the browser light.
                    st.image(image, caption=f"Scene {i+1} Image OK", width=150)
                except Exception as e:
                    # A single failed image is recorded but does not abort the loop.
                    st.error(f"❌ Image {i+1} Failed: {e}"); st.error(traceback.format_exc())
                    image_results.append({"scene": i, "path": None, "status": "failed"})
                finally:
                    # Drop the PIL image reference and flush caches between scenes.
                    cleanup_gpu_memory(image); image = None

            st.success("✅ Image generation step complete.")
        except Exception as e:
            st.error(f"❌ Image Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the pipeline load itself failed, mark every scene as failed.
            if not image_results:
                image_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
        finally:
            # Unload the pipeline regardless of outcome.
            cleanup_gpu_memory(image_pipe); image_pipe = None
            st.info("✅ Image Generator Unloaded.")
    return image_results
312
 
313
def run_video_step(image_results, scenes, temp_dir, status_placeholder):
    """Load CogVideoX, animate each successful scene image, then unload.

    Parameters:
        image_results: per-scene dicts from run_image_step; only entries with
            status "succeeded" are animated.
        scenes: scene dicts from the LLM step (uses "video_direction_prompt").
        temp_dir: directory where scene_{n}_vid.mp4 files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of {"scene": idx, "path": str|None, "status": ...} dicts keyed
        by the ORIGINAL scene index, covering only scenes that had a
        successful image.
    """
    # Only attempt video generation for images that succeeded.
    successful_images = [item for item in image_results if item["status"] == "succeeded"]
    if not successful_images:
        with status_placeholder.container(): st.warning("Skipping video step: No successful images were generated.")
        return []

    with status_placeholder.container():
        st.info(f"🔄 Loading Video Generator: {VIDEO_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        video_pipe = None; video_results = []
        try:
            dtype = torch.float16 if device == "cuda" else torch.float32
            # NOTE(review): CogVideoXPipeline is the text-to-video pipeline;
            # this code passes `image=` below, which the image-to-video variant
            # (CogVideoXImageToVideoPipeline) accepts. Confirm against the
            # installed diffusers version that this call signature is valid.
            video_pipe = CogVideoXPipeline.from_pretrained(VIDEO_MODEL_ID, torch_dtype=dtype)

            # Aggressive offloading: both model-level and sequential CPU
            # offload to fit CogVideoX on a free-tier GPU.
            if device == "cuda":
                video_pipe.enable_model_cpu_offload()
                video_pipe.enable_sequential_cpu_offload()
            else: video_pipe.to(device)  # CPU fallback placement

            # VAE slicing/tiling trims the VRAM peak of the decode stage.
            video_pipe.vae.enable_slicing(); video_pipe.vae.enable_tiling()

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after Video load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info("🎬 Generating videos sequentially from images...")
            # One reusable RNG; re-seeded per scene below so each clip differs.
            generator = torch.Generator(device=device)

            for item in successful_images:
                scene_index = item["scene"]  # Original scene index, not loop position
                vid_path = os.path.join(temp_dir, f"scene_{scene_index + 1}_vid.mp4")
                st.write(f"Generating Video for Scene {scene_index + 1} (Image {scene_index + 1})...")
                img, video_frames = None, None  # Defined before try for the finally cleanup
                try:
                    img = Image.open(item["path"])  # Load the generated still image

                    # Short camera-motion prompt drives the animation.
                    video_direction = scenes[scene_index].get("video_direction_prompt", "subtle motion")
                    # Time-based seed keeps clips unique across runs; modulo
                    # keeps it in a small readable range.
                    seed = int(time.time() * 1000 + scene_index) % 100000
                    generator.manual_seed(seed)

                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-clip peak tracking

                    with torch.no_grad():
                        # num_frames = duration * fps so clip length matches
                        # the narration budget for the scene.
                        video_frames = video_pipe(
                            prompt=video_direction,
                            image=img,
                            num_inference_steps=40,  # Balance speed/quality
                            num_frames=int(SCENE_DURATION_SECONDS * VIDEO_FPS),
                            guidance_scale=6.0,
                            generator=generator
                        ).frames[0]  # First (only) video in the batch

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Video inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    # Write the frame list to an MP4 via diffusers' helper
                    # (uses imageio-ffmpeg under the hood).
                    export_to_video(video_frames, vid_path, fps=VIDEO_FPS)
                    video_results.append({"scene": scene_index, "path": vid_path, "status": "succeeded"})
                    # Preview disabled to save resources on Spaces:
                    # st.video(vid_path, format='video/mp4', start_time=0)
                    st.success(f"Video Scene {scene_index + 1} OK.")
                except Exception as e:
                    # A single failed clip is recorded but does not abort the loop.
                    st.error(f"❌ Video {scene_index + 1} Failed: {e}"); st.error(traceback.format_exc())
                    video_results.append({"scene": scene_index, "path": None, "status": "failed"})
                finally:
                    # Drop per-clip references and flush caches between scenes.
                    cleanup_gpu_memory(img, video_frames); img, video_frames = None, None

            st.success("✅ Video generation step complete.")
        except Exception as e:
            st.error(f"❌ Video Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the pipeline load itself failed, mark all candidate clips failed.
            if not video_results:
                video_results = [{"scene": item["scene"], "path": None, "status": "failed"} for item in successful_images]
        finally:
            # Unload the pipeline regardless of outcome.
            cleanup_gpu_memory(video_pipe); video_pipe = None
            st.info("✅ Video Generator Unloaded.")
    return video_results
408
 
409
def run_audio_step(scenes, temp_dir, status_placeholder):
    """Load Parler-TTS, narrate each scene description, then unload.

    Audio is generated for ALL scenes from the story data, regardless of
    image/video success, so narration exists even for segments that end up
    without video at composition time.

    Parameters:
        scenes: scene dicts from the LLM step (uses "scene_description" as the
            spoken text and "audio_description" as the voice/style prompt).
        temp_dir: directory where scene_{n}_audio.wav files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of {"scene": i, "path": str|None,
        "status": "succeeded"|"skipped"|"failed"} dicts, one per scene.
    """
    if not scenes:
        with status_placeholder.container(): st.warning("Skipping audio step: No scenes available from story data.")
        return []

    with status_placeholder.container():
        st.info(f"🔄 Loading TTS Model: {TTS_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
        audio_results = []
        try:
            # Parler-TTS requires its dedicated model class; device_map="auto"
            # lets accelerate place (and offload) the weights.
            tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID, device_map="auto")

            # Tokenizer for the spoken text prompt.
            tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
            # The voice *description* uses the text encoder's own tokenizer,
            # resolved from the model config (may differ from the prompt tokenizer).
            tts_desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after TTS load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info("🔊 Generating audio sequentially...")

            for i, scene in enumerate(scenes):
                audio_path = os.path.join(temp_dir, f"scene_{i+1}_audio.wav")
                st.write(f"Generating Audio {i+1}/{len(scenes)}...")
                desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None  # For the finally cleanup
                try:
                    text_to_speak = scene.get("scene_description", "").strip()
                    voice_description = scene.get("audio_description", "A neutral speaker.")

                    # Skip scenes with no narration text or text over the
                    # 350-char limit promised to the LLM in its system prompt.
                    if not text_to_speak or len(text_to_speak) > 350:
                        if len(text_to_speak) > 350:
                            st.warning(f"Audio {i+1} description too long ({len(text_to_speak)} chars). Skipping audio generation for this scene: {text_to_speak[:100]}...")
                        else:
                            st.info(f"Audio {i+1}: No text description provided. Skipping audio generation for this scene.")
                        # Record a placeholder result so indices stay aligned.
                        audio_results.append({"scene": i, "path": None, "status": "skipped"})
                        continue

                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-clip peak tracking

                    # With device_map="auto" the main layers may land on CPU;
                    # route inputs to the model's primary non-CPU device when
                    # one exists, otherwise to the global device.
                    model_device = tts_model.device if hasattr(tts_model, 'device') and tts_model.device.type != 'cpu' else device
                    desc_input_ids = tts_desc_tokenizer(voice_description, return_tensors="pt").input_ids.to(model_device)
                    prompt_input_ids = tts_tokenizer(text_to_speak, return_tensors="pt").input_ids.to(model_device)

                    with torch.no_grad():
                        # Parler-TTS generates the waveform directly
                        # (shape [batch, num_samples]); cast to float32 so
                        # soundfile can write it.
                        generation = tts_model.generate(
                            input_ids=desc_input_ids,
                            prompt_input_ids=prompt_input_ids,
                            do_sample=True,
                            temperature=0.7  # Control voice variation
                        ).to(torch.float32)

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Audio inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    # Tensor -> 1-D numpy waveform; sample rate from model config.
                    audio_arr = generation.cpu().numpy().squeeze()
                    sampling_rate = tts_model.config.sampling_rate

                    sf.write(audio_path, audio_arr, sampling_rate)
                    audio_results.append({"scene": i, "path": audio_path, "status": "succeeded"})
                    st.audio(audio_path, format='audio/wav')  # Inline preview
                except Exception as e:
                    # A single failed clip is recorded but does not abort the loop.
                    st.error(f"❌ Audio {i+1} Failed: {e}"); st.error(traceback.format_exc())
                    audio_results.append({"scene": i, "path": None, "status": "failed"})
                finally:
                    # Drop per-clip tensors and flush caches between scenes.
                    cleanup_gpu_memory(desc_input_ids, prompt_input_ids, generation, audio_arr)
                    desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None

            st.success("✅ Audio generation step complete.")
        except Exception as e:
            st.error(f"❌ Audio Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the TTS model load itself failed, mark every scene as failed.
            if not audio_results:
                audio_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
        finally:
            # Unload the model and tokenizers regardless of outcome.
            cleanup_gpu_memory(tts_model, tts_tokenizer, tts_desc_tokenizer)
            tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
            st.info("✅ TTS Model Unloaded.")
    return audio_results
511
 
512
def run_compose_step_ffmpeg(video_results, audio_results, temp_dir, title="final_pov_video", status_placeholder=None):
    """Concatenate the per-scene clips and narration into one MP4 via ffmpeg.

    CPU-only step. Videos are joined with the concat demuxer using stream
    copy; audio clips are joined to an intermediate WAV; the two are then
    muxed (video copied, audio re-encoded to AAC). Scenes are included only
    if their video succeeded; their audio is added when also available.

    Parameters:
        video_results / audio_results: per-scene result dicts from the
            previous steps ({"scene", "path", "status"}).
        temp_dir: working directory for intermediates and the final file.
        title: story title; sanitized into the output filename.
        status_placeholder: optional Streamlit placeholder for progress.

    Returns:
        The final MP4 path on success, or None on failure / nothing to compose.
    """
    if status_placeholder is None:
        # Fallback container; in normal app flow a placeholder is always passed.
        status_placeholder = st.empty()

    with status_placeholder.container():
        st.info("🎞️ Composing final video using ffmpeg-python (CPU)...")
        # Composition is CPU-bound; show current memory anyway.
        display_memory_usage()
        final_video_path = None
        long_video_path = os.path.join(temp_dir, "long_video_temp.mp4")
        long_audio_path = os.path.join(temp_dir, "long_audio_temp.wav")
        # Sanitize title for the filename: keep alphanumerics, spaces, underscores,
        # then turn spaces into underscores.
        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '_')).rstrip().replace(' ', '_')
        # Fall back to a default name if sanitization leaves nothing.
        final_output_filename = f"{safe_title or 'pov_video'}.mp4"
        final_output_path = os.path.join(temp_dir, final_output_filename)

        # File lists consumed by ffmpeg's concat demuxer.
        concat_video_list_path = os.path.join(temp_dir, "ffmpeg_video_list.txt")
        concat_audio_list_path = os.path.join(temp_dir, "ffmpeg_audio_list.txt")

        try:
            # Keep only successful results, ordered by original scene index.
            successful_videos = sorted([item for item in video_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
            successful_audio = sorted([item for item in audio_results if item["status"] == "succeeded"], key=lambda x: x["scene"])

            # Align video and audio by scene index. Video is the driver: a
            # scene is composed only if its video exists; audio is attached
            # when present for that same index.
            composed_scenes_data = []
            audio_map = {item['scene']: item['path'] for item in successful_audio}

            for video_item in successful_videos:
                scene_idx = video_item['scene']
                composed_scenes_data.append({
                    'scene': scene_idx,
                    'video_path': video_item['path'],
                    'audio_path': audio_map.get(scene_idx)  # None if no audio for this scene
                })

            if not composed_scenes_data:
                st.error("❌ No successful video clips generated to compose.")
                return None

            st.write(f"Found {len(composed_scenes_data)} scene(s) with successful video clips to compose.")

            # --- FFmpeg concat inputs ---
            # All videos to be joined, and audio only for scenes that have both.
            videos_for_concat = [item['video_path'] for item in composed_scenes_data]
            audio_for_concat = [item['audio_path'] for item in composed_scenes_data if item['audio_path']]

            if not videos_for_concat:
                st.error("❌ No video files found for composition after filtering.")
                return None

            # Write the video list file (relative paths are resolved against
            # the list file's own directory by the concat demuxer).
            with open(concat_video_list_path, "w") as f_vid:
                for v_path in videos_for_concat:
                    f_vid.write(f"file '{os.path.relpath(v_path, temp_dir)}'\n")

            # Write the audio list file only when there is audio to join.
            if audio_for_concat:
                with open(concat_audio_list_path, "w") as f_aud:
                    for a_path in audio_for_concat:
                        f_aud.write(f"file '{os.path.relpath(a_path, temp_dir)}'\n")

            # --- Step 1: concatenate video clips ---
            st.write("Concatenating videos...")
            try:
                # -f concat -safe 0 joins the listed files; fflags=+igndts
                # tolerates the odd timestamps generated clips often have.
                (
                    ffmpeg
                    .input(concat_video_list_path, format='concat', safe=0, fflags='+igndts')
                    .output(long_video_path, c='copy')  # Stream copy: assumes all clips share codec/params
                    .global_args('-hide_banner', '-loglevel', 'error')  # Errors only
                    .run(overwrite_output=True, cmd='ffmpeg')
                )
                st.write("Videos concatenated successfully.")
            except ffmpeg.Error as e:
                st.error("FFmpeg Video Concat Error:")
                st.code(e.stderr.decode() if e.stderr else str(e))
                raise  # Without video there is nothing to compose — abort

            # --- Step 2: concatenate audio clips (best effort) ---
            long_audio_concatenated = False  # True only if the audio join succeeds
            if audio_for_concat:
                st.write("Concatenating audio...")
                try:
                    # Intermediate WAV as pcm_s16le: safe, widely compatible.
                    (
                        ffmpeg
                        .input(concat_audio_list_path, format='concat', safe=0, fflags='+igndts')
                        .output(long_audio_path, acodec='pcm_s16le')
                        .global_args('-hide_banner', '-loglevel', 'error')
                        .run(overwrite_output=True, cmd='ffmpeg')
                    )
                    st.write("Audio concatenated successfully.")
                    long_audio_concatenated = True
                except ffmpeg.Error as e:
                    # Audio failure is non-fatal: fall through to a silent video.
                    st.warning("FFmpeg Audio Concat Error - proceeding without audio:")
                    st.code(e.stderr.decode() if e.stderr else str(e))

            # --- Step 3: mux video and audio into the final MP4 ---
            st.write("Muxing final video and audio...")
            try:
                in_video = ffmpeg.input(long_video_path)

                if long_audio_concatenated and os.path.exists(long_audio_path):
                    in_audio = ffmpeg.input(long_audio_path)
                    # shortest=None emits the bare `-shortest` flag: output
                    # STOPS at the end of the shortest input stream (it does
                    # not pad the shorter one).
                    stream = ffmpeg.output(in_video, in_audio, final_output_path,
                                           vcodec='copy',       # Copy video stream (preserves portrait format)
                                           acodec='aac',        # Re-encode audio to AAC (standard for MP4)
                                           shortest=None,       # -shortest: end at the shorter stream
                                           strict='experimental')  # Allows ffmpeg's native aac on older builds
                else:  # No usable audio: emit a video-only file.
                    st.warning("Muxing video without audio.")
                    stream = ffmpeg.output(in_video, final_output_path,
                                           vcodec='copy',
                                           an=None)  # -an strips any audio stream

                stream.global_args('-hide_banner', '-loglevel', 'error').run(overwrite_output=True, cmd='ffmpeg')

                final_video_path = final_output_path
                st.success("✅ Final video composed!")

            except ffmpeg.Error as e:
                st.error("FFmpeg Muxing Error:")
                st.code(e.stderr.decode() if e.stderr else str(e))
                final_video_path = None  # Explicitly mark failure
                raise  # Muxing failure means composition failed

        except Exception as e:
            # Catches re-raised ffmpeg errors plus any file-handling errors above.
            st.error(f"❌ Video Composition Step Failed: {e}")
            st.error(traceback.format_exc())
            final_video_path = None
        finally:
            # Always remove intermediates and list files; the final MP4 is kept.
            st.write("Cleaning up intermediate composition files...")
            intermediate_files = [long_video_path, long_audio_path, concat_video_list_path, concat_audio_list_path]
            for f_path in intermediate_files:
                if os.path.exists(f_path):
                    try:
                        os.remove(f_path)
                    except Exception as e_clean: print(f"Error cleaning {f_path}: {e_clean}")
            display_memory_usage()  # Final memory check for this step
    return final_video_path
677
 
678
 
679
  # --- Streamlit UI ---
680
 
681
# Page title and workflow summary shown at the top of the app.
st.title("🎬 POV Vertical Video Gen (HF Space Optimized)")
st.caption(f"Workflow: Scenario → Story → Images ({IMAGE_WIDTH}x{IMAGE_HEIGHT}) → Videos → Audio → Compose → Download. Optimized for vertical formats (e.g., TikTok/YouTube Shorts) on the Free Tier.")
684
 
685
# Initialize Session State
# This function ensures required keys exist in st.session_state on first load.
def init_state(force=False):
    """Populate ``st.session_state`` with the workflow's default values.

    Args:
        force: When ``False`` (the default, and the historical behavior),
            only keys that are *missing* are set, so state survives
            Streamlit's script reruns. When ``True``, every key is
            overwritten with its default — a true reset for callers that
            want a fresh run.
    """
    # The dict is rebuilt on every call, so each call hands out fresh
    # list/None defaults (no shared mutable state between runs).
    keys_to_init = {
        'generation_in_progress': False,  # True while a generation workflow is running
        'current_step': "idle",           # Workflow step: "idle", "story", "image", ...
        'story_data': None,               # Output of the LLM step
        'image_results': [],              # Results from the image generation step
        'video_results': [],              # Results from the video generation step
        'audio_results': [],              # Results from the audio generation step
        'final_video_path': None,         # Path to the final composed video file
        'temp_dir_path': None,            # Temporary directory for this run
        'num_scenes': NUM_SCENES_DEFAULT  # Number of scenes requested
    }
    for key, default_value in keys_to_init.items():
        if force or key not in st.session_state:
            st.session_state[key] = default_value
init_state() # Call init_state on each app load to set defaults if not already present
703
 
704
# --- Sidebar ---

def _reset_session_state():
    """Hard-reset the workflow state.

    ``init_state()`` only fills in *missing* keys, so calling it alone does
    not clear results left over from a previous run. Drop the workflow keys
    first, then let ``init_state()`` re-apply pristine defaults.
    """
    for key in ('generation_in_progress', 'current_step', 'story_data',
                'image_results', 'video_results', 'audio_results',
                'final_video_path', 'temp_dir_path', 'num_scenes'):
        st.session_state.pop(key, None)
    init_state()

with st.sidebar:
    st.header("⚙️ Config & Control")
    # Text area for the user's POV scenario prompt.
    user_prompt = st.text_area("1. Enter POV Scenario:", height=100, value="POV: You're Marco Polo negotiating trade routes in the Silk Road bazaar (1270)", key="user_prompt_input")

    # Number input for the desired number of scenes; min/max enforced by the
    # widget, and the initial value is clamped to MAX_SCENES as well.
    num_scenes_req = st.number_input(f"2. Target Scenes (Max {MAX_SCENES}):", min_value=1, max_value=MAX_SCENES,
                                     value=min(st.session_state.num_scenes, MAX_SCENES),
                                     step=1,
                                     key="num_scenes_req_input")

    # Surface the target dimensions and timing so the user knows what to expect.
    st.info(f"Target video resolution: {IMAGE_WIDTH}x{IMAGE_HEIGHT} (Portrait)")
    st.info(f"Approx. scene duration: {SCENE_DURATION_SECONDS}s, FPS: {VIDEO_FPS}")

    # Start button — disabled while a run is in progress or when no GPU is
    # available (the models require CUDA).
    start_disable = st.session_state.generation_in_progress or device == "cpu"
    start_button = st.button("🚀 Start Generation", type="primary", disabled=start_disable)

    if start_button:
        # Fully reset state, then kick off the workflow from the "story" step.
        _reset_session_state()
        st.session_state.generation_in_progress = True
        st.session_state.current_step = "story"           # First workflow step
        st.session_state.num_scenes = num_scenes_req      # User-requested scene count
        cleanup_temp_dir()                                # Clean old files before a new run
        get_temp_dir()                                    # Ensure a fresh temp dir path is set
        # NOTE(review): st.experimental_rerun was removed in Streamlit >= 1.27;
        # switch to st.rerun() when upgrading the Streamlit dependency.
        st.experimental_rerun()                           # Rerun to enter the generation loop

    st.header("⚠️ Actions")
    # Reset workflow button — disabled while generation is in progress.
    if st.button("🔁 Reset Workflow", disabled=st.session_state.generation_in_progress):
        _reset_session_state()
        cleanup_temp_dir() # Also clean files on reset
        st.experimental_rerun() # Rerun to update UI state and exit generation loop

    # Clean temp files button — disabled while generation is in progress.
    cleanup_button_help = f"Removes files in: {st.session_state.get('temp_dir_path', 'N/A')}"
    if st.button("🧹 Clean Temp Files Only", help=cleanup_button_help, disabled=st.session_state.generation_in_progress):
        cleanup_temp_dir()
        # No rerun needed here unless you want to force UI update based on temp_dir_path existence
749
 
750
+
751
# --- Main Area Logic & Progress Display ---
st.divider()

# Hard requirement: the models need CUDA; refuse to run on CPU.
if device == "cpu":
    st.error("🔴 GPU (CUDA) is required for model inference. This application will not run on CPU.")
# While generation is in progress, show the current step and a progress bar,
# then execute exactly ONE step per script run (rerun-driven state machine).
elif st.session_state.generation_in_progress:
    st.subheader(f"🚀 Running Step: **{st.session_state.current_step.upper()}**")
    progress_bar = st.progress(0)

    # Ordered sequence of workflow states used to compute progress.
    steps = ["story", "image", "video", "audio", "compose", "done"]
    try:
        current_index = steps.index(st.session_state.current_step)
        # Map step index to a percentage; cap at 99% until the 'done' state so
        # the bar never shows complete while work remains.
        progress_value = (current_index / (len(steps) - 1)) * 100
        if st.session_state.current_step != "done":
            progress_bar.progress(int(min(progress_value, 99)))
        else:
            progress_bar.progress(100)
    except ValueError:
        # current_step not in the list (e.g. "error") — show an empty bar.
        progress_bar.progress(0)

    # Single placeholder reused for step-specific status messages.
    status_placeholder = st.empty()

    # --- Workflow Execution Logic ---
    # Executes the step named by st.session_state.current_step, decides the
    # next state, and triggers a rerun to continue the machine.
    try:
        # Ensure the temp dir exists before any file-producing step.
        temp_dir = get_temp_dir()
        current_step = st.session_state.current_step
        # Default: stay in the current state (only changes on success/failure).
        next_step = current_step

        if current_step == "story":
            st.session_state.story_data = run_llm_step(user_prompt, st.session_state.num_scenes, status_placeholder)
            # Proceed only if the LLM returned at least one scene.
            next_step = "image" if st.session_state.story_data and st.session_state.story_data.get('scenes') else "error"
            if next_step == "error": status_placeholder.error("Story generation failed or returned no scenes.")

        elif current_step == "image":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            if not scenes:
                status_placeholder.warning("Skipping image step: No scenes available from story data.")
                st.session_state.image_results = []  # Keep downstream steps working on an empty list
            else:
                st.session_state.image_results = run_image_step(scenes, temp_dir, status_placeholder)
            next_step = "video"  # Video step tolerates empty image results

        elif current_step == "video":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            st.session_state.video_results = run_video_step(st.session_state.image_results, scenes, temp_dir, status_placeholder)
            next_step = "audio"

        elif current_step == "audio":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            st.session_state.audio_results = run_audio_step(scenes, temp_dir, status_placeholder)
            next_step = "compose"

        elif current_step == "compose":
            # Use the story title (if any) as the base of the output filename.
            title_base = st.session_state.story_data.get('title', 'pov_video') if st.session_state.story_data else 'pov_video'
            st.session_state.final_video_path = run_compose_step_ffmpeg(
                st.session_state.video_results, st.session_state.audio_results, temp_dir, title=title_base, status_placeholder=status_placeholder)
            next_step = "done" if st.session_state.final_video_path else "error"
            # BUGFIX: the previous code read
            # `status_placeholder.container._provided_by_user`, which accesses a
            # private attribute on a *bound method* and raises AttributeError,
            # masking the real composition failure. Just report the failure.
            if next_step == "error":
                status_placeholder.error("Composition step failed.")

        else:  # Unknown state — should not happen if the machine is correct.
            next_step = "error"
            status_placeholder.error(f"Internal error: Unknown state '{current_step}'")

        # --- State Transition ---
        # Advance the machine and rerun only when the state actually changes.
        if next_step != current_step:
            st.session_state.current_step = next_step
            if next_step == "done" or next_step == "error":
                st.session_state.generation_in_progress = False  # Workflow finished
            # Rerun so the script resumes from the new current_step.
            st.experimental_rerun()
        # Defensive: if we are already in "error" but the flag was left set,
        # clear it so the UI exits the generation loop.
        elif st.session_state.current_step == "error" and st.session_state.generation_in_progress:
            st.session_state.generation_in_progress = False

    except Exception as e:
        # Catch-all for errors not handled inside the step functions.
        st.error(f"An unexpected error occurred during step {st.session_state.current_step}: {e}")
        st.error(traceback.format_exc())  # Full traceback for debugging
        status_placeholder.error(f"An unexpected error stopped the workflow at step: **{st.session_state.current_step.upper()}**")
        st.session_state.current_step = "error"
        st.session_state.generation_in_progress = False
        progress_bar.progress(0)
        st.experimental_rerun()  # Rerun to show the error-state UI
869
 
870
 
871
# --- Display Final Output ---
st.divider()
st.header("✅ Final Video")

if st.session_state.current_step == "done" and st.session_state.final_video_path:
    # Workflow finished successfully — show the player and a download button.
    final_video_path = st.session_state.final_video_path
    if not os.path.exists(final_video_path):
        st.error(f"Final video file not found: {final_video_path}. It might have been cleaned up prematurely or composition failed unexpectedly.")
    else:
        st.video(final_video_path, format='video/mp4')  # Inline player

        try:
            # Read the file once and hand the bytes to the download widget.
            with open(final_video_path, "rb") as fp:
                video_bytes = fp.read()
            st.download_button(
                label="⬇️ Download Final Video (.mp4)",
                data=video_bytes,
                file_name=os.path.basename(final_video_path),
                mime="video/mp4",
                key="final_video_download_btn",
            )
        except Exception as e:
            st.error(f"Error reading final video for download: {e}")

elif st.session_state.current_step == "error":
    # Workflow ended in an error state.
    st.error("🛑 Workflow failed. Check logs above and in the app output/Spaces logs tab. Please use 'Reset Workflow' and try again.")

elif st.session_state.generation_in_progress:
    # Still running — the detailed progress UI is rendered above.
    st.info(f"⏳ Workflow running... Current step: **{st.session_state.current_step.upper()}**")

else:
    # Idle: nothing generated yet.
    st.info("👋 Ready to generate. Use the sidebar to enter your scenario and configuration, then click 'Start Generation'.")
909
 
910
+
911
# Optional: Expander to show detailed intermediate results for debugging
with st.expander("Show Intermediate File Details and State", expanded=False):
    # Dump the full session state plus each intermediate artifact list.
    st.write("**Session State:**", st.session_state)
    st.write("**Story Data:**")
    st.json(st.session_state.story_data or {})
    st.write("**Image Results:**")
    st.json(st.session_state.image_results or [])
    st.write("**Video Results:**")
    st.json(st.session_state.video_results or [])
    st.write("**Audio Results:**")
    st.json(st.session_state.audio_results or [])
    st.write("**Final Path:**", st.session_state.final_video_path or "Not generated")
    st.write("**Temp Dir:**", st.session_state.get('temp_dir_path', "N/A"))

    # Best-effort listing of the temp directory contents, sorted for readability.
    listing = []
    temp_path = st.session_state.get('temp_dir_path')
    if temp_path and os.path.exists(temp_path):
        try:
            listing = sorted(os.listdir(temp_path))
        except Exception as e:
            listing = [f"Error listing directory contents: {e}"]
    st.write("**Temp Dir Contents:**", listing if listing else "Directory empty or not created/found.")

# Final memory display (rendered in the sidebar via the module-level placeholder).
display_memory_usage()