garyuzair commited on
Commit
3f918d1
·
verified ·
1 Parent(s): 1b4bb8c

Update src/app_hf_space_optimized.py

Browse files
Files changed (1) hide show
  1. src/app_hf_space_optimized.py +720 -370
src/app_hf_space_optimized.py CHANGED
@@ -10,6 +10,8 @@ import numpy as np
10
  import ffmpeg # Use ffmpeg-python
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
  from diffusers import StableDiffusionXLPipeline
 
 
13
  from diffusers.pipelines.cogvideo import CogVideoXPipeline
14
  from diffusers.utils import export_to_video
15
  from parler_tts import ParlerTTSForConditionalGeneration
@@ -18,7 +20,8 @@ import shutil
18
  import traceback
19
  import psutil # For memory stats
20
 
21
- st.set_page_config(layout="wide", page_title="POV Video Gen (HF Space)")
 
22
 
23
  # --- Configuration ---
24
  LLM_MODEL_ID = "Qwen/Qwen3-0.6B"
@@ -26,13 +29,15 @@ IMAGE_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
26
  VIDEO_MODEL_ID = "THUDM/CogVideoX-2b"
27
  TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
28
 
 
29
  IMAGE_WIDTH = 768
30
  IMAGE_HEIGHT = 1344
 
31
  SCENE_DURATION_SECONDS = 4 # Reduced duration for faster processing
32
  VIDEO_FPS = 10
33
  NUM_SCENES_DEFAULT = 3 # Lowered default
34
- MAX_SCENES = 4 # Stricter limit for free tier
35
- TEMP_SUBDIR = "pov_video_temp_hf" # Unique name
36
 
37
  # --- Device Setup & Memory Monitor ---
38
  mem_info_placeholder = st.sidebar.empty()
@@ -44,45 +49,58 @@ def display_memory_usage():
44
  cpu_mem = process.memory_info().rss / (1024 * 1024) # MB
45
  gpu_mem_info = "N/A"
46
  if torch.cuda.is_available():
 
47
  allocated = torch.cuda.memory_allocated(0) / (1024 * 1024) # MB
48
- reserved = torch.cuda.memory_reserved(0) / (1024 * 1024) # MB
 
49
  total = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024) # MB
50
- gpu_mem_info = f"Alloc: {allocated:.0f}MB | Reserv: {reserved:.0f}MB | Total: {total:.0f}MB"
51
  mem_info_placeholder.info(f"🧠 CPU Mem: {cpu_mem:.0f} MB\n⚡ GPU Mem: {gpu_mem_info}")
52
  except Exception as e:
53
  mem_info_placeholder.warning(f"Could not get memory info: {e}")
54
 
 
55
  if torch.cuda.is_available():
56
  device = "cuda"
57
  try:
58
  vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
59
  st.sidebar.success(f"✅ GPU Detected! VRAM: {vram_gb:.2f} GB")
60
- if vram_gb < 15:
61
- st.sidebar.warning("⚠️ Low VRAM (< 15GB). May struggle.")
 
62
  except Exception:
63
- st.sidebar.warning("Could not read GPU VRAM.") # Continue assuming GPU exists
64
  else:
65
  device = "cpu"
66
- st.sidebar.error("⚠️ No GPU! App will be extremely slow & likely fail.")
67
 
68
  # --- Helper Functions ---
69
  def cleanup_gpu_memory(*args):
70
- """Attempts to free GPU memory."""
71
- print(f"Attempting GPU mem cleanup. Vars to del: {len(args)}")
72
- display_memory_usage() # Before cleanup
73
- del args # Remove reference to the tuple itself
 
 
 
74
  gc.collect()
75
  if torch.cuda.is_available():
76
  torch.cuda.empty_cache()
77
- display_memory_usage() # After cleanup
78
- print("GPU mem cleanup done.")
 
 
 
 
79
 
80
  def get_temp_dir():
81
  """Creates or returns the path to the temporary directory."""
82
  # Use a consistent path within the app's execution context for simplicity on Spaces
83
  # This might lead to leftover files if cleanup fails, but avoids potential permission issues with system temp
84
- app_temp_dir = os.path.abspath(TEMP_SUBDIR) # Use relative path from script
 
85
  os.makedirs(app_temp_dir, exist_ok=True)
 
86
  if 'temp_dir_path' not in st.session_state or st.session_state.temp_dir_path != app_temp_dir:
87
  print(f"Setting temp dir: {app_temp_dir}")
88
  st.session_state.temp_dir_path = app_temp_dir
@@ -91,11 +109,14 @@ def get_temp_dir():
91
  def cleanup_temp_dir():
92
  """Removes the application's temporary directory."""
93
  dir_path = st.session_state.get('temp_dir_path', None)
94
- if dir_path and os.path.exists(dir_path) and TEMP_SUBDIR in dir_path: # Safety check
 
 
95
  try:
 
96
  shutil.rmtree(dir_path)
97
  st.sidebar.success(f"Cleaned up: {dir_path}")
98
- st.session_state.temp_dir_path = None
99
  except Exception as e:
100
  st.sidebar.error(f"Error cleaning temp dir {dir_path}: {e}")
101
  else:
@@ -104,36 +125,44 @@ def cleanup_temp_dir():
104
 
105
  # --- Model Interaction Functions (Load -> Use -> Unload) ---
106
 
107
- def run_llm_step(user_prompt, num_scenes):
108
  """Loads LLM, generates story, unloads LLM."""
109
- st.info(f"🔄 Loading LLM: {LLM_MODEL_ID}...")
110
- display_memory_usage()
111
- llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None
112
- story_data = None
113
- try:
114
- dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_bf16_supported() else torch.float16 if device=="cuda" else torch.float32
115
- llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
116
- llm_model = AutoModelForCausalLM.from_pretrained(
117
- LLM_MODEL_ID, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto" # Try low_cpu_mem_usage
118
- )
119
  display_memory_usage()
120
- st.info("🧠 Generating story structure...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
121
 
122
- # --- System Prompt --- (Shortened descriptions max length)
123
- system_prompt = f"""
124
- You are an expert director creating POV TikTok video scripts.
125
- Break down the user's scenario into exactly {num_scenes} scenes ({SCENE_DURATION_SECONDS}s each).
126
  For EACH scene, generate:
127
- 1. `scene_description`: Max 1-2 concise sentences describing action/setting for TTS. Max 350 characters.
128
- 2. `image_prompt`: Detailed SDXL POV prompt (Start with "First-person perspective - pov shot of..."). Include setting, mood, style, time period, elements. Add "pov hands from the bottom corner..." if needed.
129
- 3. `video_direction_prompt`: Simple camera action/motion for CogVideoX (e.g., "Camera pans right", "Subtle zoom in", "Static shot", "Hand reaches out").
130
- 4. `audio_description`: Voice & ambience description for Parler-TTS (e.g., "Nervous male voice, faint market chatter.", "Calm female narrator, quiet library ambience.").
131
 
132
- Respond ONLY with a valid JSON object:
133
  {{
134
  "story_details": {{
135
- "title": "POV Title (Year)",
136
- "full_story": "Brief summary...",
137
  "scenes": [
138
  {{ // Scene 1
139
  "scene_description": "...", // Max 350 chars
@@ -145,439 +174,760 @@ Respond ONLY with a valid JSON object:
145
  ]
146
  }}
147
  }}
148
- Strictly adhere to JSON format. No extra text.
149
- """.strip()
150
-
151
- messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": f"Create script: {user_prompt}"}]
152
- text_input = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
153
- model_inputs = llm_tokenizer([text_input], return_tensors="pt").to(llm_model.device if hasattr(llm_model, 'device') else device)
154
-
155
- # Use recommended parameters for non-thinking Qwen3
156
- generated_ids = llm_model.generate(
157
- **model_inputs, max_new_tokens=4096, # Still allow space for generation
158
- temperature=0.7, top_p=0.8, top_k=20, do_sample=True,
159
- pad_token_id=llm_tokenizer.eos_token_id # Important for stopping
160
- )
161
- output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
162
- response_text = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
163
-
164
- st.write("LLM Raw Output:"); st.code(response_text, language='text')
165
- json_string = response_text.strip().removeprefix("```json").removesuffix("```").strip()
166
- parsed_data = json.loads(json_string)
167
-
168
- if not ("story_details" in parsed_data and "scenes" in parsed_data["story_details"]): raise ValueError("Invalid JSON structure.")
169
- actual_num_scenes = len(parsed_data["story_details"]["scenes"])
170
- if actual_num_scenes != num_scenes: st.warning(f"LLM gave {actual_num_scenes} scenes, requested {num_scenes}.")
171
-
172
- story_data = parsed_data["story_details"]
173
- st.success("✅ Story generation complete.")
174
- except Exception as e:
175
- st.error(f"❌ LLM Step Failed: {e}"); st.error(traceback.format_exc()); story_data = None
176
- finally:
177
- st.info("🔄 Unloading LLM..."); cleanup_gpu_memory(llm_model, llm_tokenizer, model_inputs, generated_ids); st.info("✅ LLM Unloaded.")
178
- return story_data
179
 
180
- def run_image_step(scenes, temp_dir):
181
- st.info(f"🔄 Loading Image Generator: {IMAGE_MODEL_ID}...")
182
- display_memory_usage()
183
- image_pipe = None; image_results = []
184
- try:
185
- dtype = torch.float16 if device == "cuda" else torch.float32
186
- image_pipe = StableDiffusionXLPipeline.from_pretrained(
187
- IMAGE_MODEL_ID, torch_dtype=dtype, use_safetensors=True, variant="fp16" if device == "cuda" else None,
188
- low_cpu_mem_usage=True # Crucial for loading on low RAM systems
189
- )
190
- # Use CPU offloading even if it's slower, necessary for T4 VRAM
191
- if device == "cuda": image_pipe.enable_model_cpu_offload()
192
- else: image_pipe.to(device) # Move to CPU if needed
193
- display_memory_usage()
194
- st.info("🎨 Generating images sequentially...")
195
 
196
- for i, scene in enumerate(scenes):
197
- img_path = os.path.join(temp_dir, f"scene_{i+1}_img.png")
198
- st.write(f"Generating Image {i+1}/{len(scenes)}...")
199
- image = None # Define before try block
200
  try:
201
- with torch.no_grad():
202
- image = image_pipe(
203
- prompt=scene.get("image_prompt", "blank image"),
204
- width=IMAGE_WIDTH, height=IMAGE_HEIGHT, num_inference_steps=25 # Fewer steps for speed
205
- ).images[0]
206
- image.save(img_path)
207
- image_results.append({"scene": i, "path": img_path, "status": "succeeded"})
208
- st.image(image, caption=f"Scene {i+1} OK", width=150)
209
- except Exception as e:
210
- st.error(f"❌ Image {i+1} Failed: {e}"); st.error(traceback.format_exc())
211
- image_results.append({"scene": i, "path": None, "status": "failed"})
212
- finally: cleanup_gpu_memory(image) # Clean intermediate var
213
-
214
- st.success(" Image generation step complete.")
215
- except Exception as e:
216
- st.error(f"❌ Image Gen Step Failed: {e}"); st.error(traceback.format_exc())
217
- image_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
218
- finally:
219
- st.info("🔄 Unloading Image Generator..."); cleanup_gpu_memory(image_pipe); st.info("✅ Image Generator Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
  return image_results
221
 
222
- def run_video_step(image_results, scenes, temp_dir):
 
223
  successful_images = [item for item in image_results if item["status"] == "succeeded"]
224
- if not successful_images: return []
225
- st.info(f"🔄 Loading Video Generator: {VIDEO_MODEL_ID}...")
226
- display_memory_usage()
227
- video_pipe = None; video_results = []
228
- try:
229
- dtype = torch.float16 if device == "cuda" else torch.float32
230
- # Instantiate VAE and Transformer separately for potential offloading/quantization later if needed
231
- # For now, load pipeline directly, enabling optimizations
232
- video_pipe = CogVideoXPipeline.from_pretrained(VIDEO_MODEL_ID, torch_dtype=dtype)
233
- if device == "cuda":
234
- video_pipe.enable_model_cpu_offload()
235
- video_pipe.enable_sequential_cpu_offload() # Needed for low VRAM
236
- else: video_pipe.to(device)
237
- video_pipe.vae.enable_slicing(); video_pipe.vae.enable_tiling()
238
- display_memory_usage()
239
- st.info("🎬 Generating videos sequentially...")
240
- generator = torch.Generator(device=device)
241
 
242
- for item in successful_images:
243
- scene_index = item["scene"]; vid_path = os.path.join(temp_dir, f"scene_{scene_index + 1}_vid.mp4")
244
- st.write(f"Generating Video for Scene {scene_index + 1}...")
245
- img, video_frames = None, None # Define before try
246
- try:
247
- img = Image.open(item["path"])
248
- video_direction = scenes[scene_index].get("video_direction_prompt", "subtle motion")
249
- seed = int(time.time() * 1000 + scene_index) % 100000
250
- if device == "cuda": generator.manual_seed(seed)
251
- else: generator = torch.Generator(device='cpu').manual_seed(seed)
252
-
253
- with torch.no_grad():
254
- video_frames = video_pipe(
255
- prompt=video_direction, image=img, num_inference_steps=40, # Slightly fewer steps
256
- num_frames=int(SCENE_DURATION_SECONDS * VIDEO_FPS) + 1,
257
- guidance_scale=6.0, generator=generator
258
- ).frames[0]
259
- export_to_video(video_frames, vid_path, fps=VIDEO_FPS)
260
- video_results.append({"scene": scene_index, "path": vid_path, "status": "succeeded"})
261
- # Comment out preview to save resources on Spaces
262
- # st.video(vid_path)
263
- st.success(f"Video Scene {scene_index + 1} OK.")
264
- except Exception as e:
265
- st.error(f" Video {scene_index + 1} Failed: {e}"); st.error(traceback.format_exc())
266
- video_results.append({"scene": scene_index, "path": None, "status": "failed"})
267
- finally: cleanup_gpu_memory(img, video_frames)
268
-
269
- st.success("✅ Video generation step complete.")
270
- except Exception as e:
271
- st.error(f" Video Gen Step Failed: {e}"); st.error(traceback.format_exc())
272
- video_results = [{"scene": item["scene"], "path": None, "status": "failed"} for item in successful_images]
273
- finally:
274
- st.info("🔄 Unloading Video Generator..."); cleanup_gpu_memory(video_pipe); st.info("✅ Video Generator Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
275
  return video_results
276
 
277
- def run_audio_step(scenes, temp_dir):
278
- st.info(f"🔄 Loading TTS Model: {TTS_MODEL_ID}...")
279
- display_memory_usage()
280
- tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
281
- audio_results = []
282
- try:
283
- # Load TTS model (Parler requires specific class)
284
- tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID).to(device)
285
- tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID) # For text prompt
286
- tts_desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path) # For description
287
- display_memory_usage()
288
- st.info("🔊 Generating audio sequentially...")
289
 
290
- for i, scene in enumerate(scenes):
291
- audio_path = os.path.join(temp_dir, f"scene_{i+1}_audio.wav")
292
- st.write(f"Generating Audio {i+1}/{len(scenes)}...")
293
- desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None # Define before try
294
- try:
295
- text_to_speak = scene.get("scene_description", "")[:350] # Enforce limit
296
- voice_description = scene.get("audio_description", "A neutral speaker.")
297
- if not text_to_speak:
298
- audio_results.append({"scene": i, "path": None, "status": "skipped"})
299
- continue
300
-
301
- desc_input_ids = tts_desc_tokenizer(voice_description, return_tensors="pt").input_ids.to(device)
302
- prompt_input_ids = tts_tokenizer(text_to_speak, return_tensors="pt").input_ids.to(device)
303
-
304
- with torch.no_grad():
305
- generation = tts_model.generate(
306
- input_ids=desc_input_ids, prompt_input_ids=prompt_input_ids,
307
- do_sample=True, temperature=0.7 # Slightly higher temp for variety
308
- ).to(torch.float32)
309
-
310
- audio_arr = generation.cpu().numpy().squeeze()
311
- sampling_rate = tts_model.config.sampling_rate
312
- sf.write(audio_path, audio_arr, sampling_rate)
313
- audio_results.append({"scene": i, "path": audio_path, "status": "succeeded"})
314
- st.audio(audio_path, format='audio/wav') # Preview audio
315
- except Exception as e:
316
- st.error(f"❌ Audio {i+1} Failed: {e}"); st.error(traceback.format_exc())
317
- audio_results.append({"scene": i, "path": None, "status": "failed"})
318
- finally: cleanup_gpu_memory(desc_input_ids, prompt_input_ids, generation, audio_arr)
319
-
320
- st.success("✅ Audio generation step complete.")
321
- except Exception as e:
322
- st.error(f"❌ Audio Gen Step Failed: {e}"); st.error(traceback.format_exc())
323
- audio_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
324
- finally:
325
- st.info("🔄 Unloading TTS Model..."); cleanup_gpu_memory(tts_model, tts_tokenizer, tts_desc_tokenizer); st.info("✅ TTS Model Unloaded.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
326
  return audio_results
327
 
328
- def run_compose_step_ffmpeg(video_results, audio_results, temp_dir, title="final_pov_video"):
329
  """Combines videos and audio using ffmpeg-python."""
330
- st.info("🎞️ Composing final video using ffmpeg-python (CPU)...")
331
- display_memory_usage()
332
- final_video_path = None
333
- long_video_path = os.path.join(temp_dir, "long_video_temp.mp4")
334
- long_audio_path = os.path.join(temp_dir, "long_audio_temp.wav")
335
- final_output_path = os.path.join(temp_dir, f"{title}.mp4")
336
- concat_video_list_path = os.path.join(temp_dir, "ffmpeg_video_list.txt")
337
- concat_audio_list_path = os.path.join(temp_dir, "ffmpeg_audio_list.txt")
338
 
339
- try:
340
- successful_videos = sorted([item for item in video_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
341
- successful_audio = sorted([item for item in audio_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
342
-
343
- # Align based on scene index for safety
344
- paths_to_compose = []
345
- audio_map = {item['scene']: item['path'] for item in successful_audio}
346
- for video_item in successful_videos:
347
- scene_idx = video_item['scene']
348
- if scene_idx in audio_map:
349
- paths_to_compose.append({'scene': scene_idx, 'video': video_item['path'], 'audio': audio_map[scene_idx]})
350
-
351
- if not paths_to_compose:
352
- st.error("❌ No matching video/audio pairs found.")
353
- return None
354
-
355
- st.write(f"Found {len(paths_to_compose)} matching scene(s) to compose.")
356
-
357
- # 1. Create file lists for ffmpeg concat demuxer
358
- with open(concat_video_list_path, "w") as f_vid, open(concat_audio_list_path, "w") as f_aud:
359
- for item in paths_to_compose:
360
- f_vid.write(f"file '{os.path.relpath(item['video'], temp_dir)}'\n") # Use relative paths within temp dir
361
- f_aud.write(f"file '{os.path.relpath(item['audio'], temp_dir)}'\n")
362
-
363
- # 2. Concatenate Audio Files
364
- st.write("Concatenating audio...")
365
- try:
366
- (
367
- ffmpeg
368
- .input(concat_audio_list_path, format='concat', safe=0, fflags='+igndts') # Add flags
369
- .output(long_audio_path, acodec='pcm_s16le') # Output intermediate WAV
370
- .global_args('-hide_banner', '-loglevel', 'error') # Suppress verbose output
371
- .run(overwrite_output=True, cmd='ffmpeg') # Specify cmd='ffmpeg' if needed
372
- )
373
- st.write("Audio concatenated.")
374
- except ffmpeg.Error as e:
375
- st.error("FFmpeg Audio Concat Error:")
376
- st.code(e.stderr.decode() if e.stderr else str(e))
377
- raise # Re-raise to stop the process
378
-
379
- # 3. Concatenate Video Files
380
- st.write("Concatenating videos...")
381
- try:
382
- (
383
- ffmpeg
384
- .input(concat_video_list_path, format='concat', safe=0, fflags='+igndts')
385
- .output(long_video_path, c='copy') # Use stream copy for speed
386
- .global_args('-hide_banner', '-loglevel', 'error')
387
- .run(overwrite_output=True, cmd='ffmpeg')
388
- )
389
- st.write("Videos concatenated.")
390
- except ffmpeg.Error as e:
391
- st.error("FFmpeg Video Concat Error:")
392
- st.code(e.stderr.decode() if e.stderr else str(e))
393
- raise
394
-
395
- # 4. Mux (Combine) Video and Audio
396
- st.write("Muxing final video...")
397
- try:
398
- in_video = ffmpeg.input(long_video_path)
399
- in_audio = ffmpeg.input(long_audio_path)
400
- (
401
- ffmpeg
402
- .output(in_video, in_audio, final_output_path, vcodec='copy', acodec='aac', shortest=None, strict='experimental') # Use aac audio codec
403
- .global_args('-hide_banner', '-loglevel', 'error')
404
- .run(overwrite_output=True, cmd='ffmpeg')
405
- )
406
- final_video_path = final_output_path # Set the final path on success
407
- st.success("✅ Final video composed!")
408
 
409
- except ffmpeg.Error as e:
410
- st.error("FFmpeg Muxing Error:")
411
- st.code(e.stderr.decode() if e.stderr else str(e))
412
- final_video_path = None # Ensure it's None on failure
413
- raise
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
414
 
415
 
416
- except Exception as e:
417
- st.error(f"❌ Video Composition Step Failed: {e}")
418
- st.error(traceback.format_exc())
419
- final_video_path = None
420
- finally:
421
- # Clean up intermediate files and lists
422
- st.write("Cleaning up intermediate composition files...")
423
- for f_path in [long_video_path, long_audio_path, concat_video_list_path, concat_audio_list_path]:
424
- if os.path.exists(f_path):
425
- try: os.remove(f_path)
426
- except Exception as e_clean: print(f"Error cleaning {f_path}: {e_clean}")
427
- display_memory_usage() # Final memory check for this step
 
 
 
 
428
  return final_video_path
429
 
430
 
431
  # --- Streamlit UI ---
432
 
433
- st.title("🎬 POV Video Gen (HF Space Optimized)")
434
- st.caption("Local Generation: Scenario -> Story -> Images -> Videos -> Audio -> Compose -> Download")
 
435
 
436
  # Initialize Session State
 
437
  def init_state():
438
  keys_to_init = {
439
- 'generation_in_progress': False, 'current_step': "idle", 'story_data': None,
440
- 'image_results': [], 'video_results': [], 'audio_results': [],
441
- 'final_video_path': None, 'temp_dir_path': None,
442
- 'num_scenes': NUM_SCENES_DEFAULT
 
 
 
 
 
443
  }
444
  for key, default_value in keys_to_init.items():
445
  if key not in st.session_state:
446
  st.session_state[key] = default_value
447
- init_state()
448
 
449
  # --- Sidebar ---
450
  with st.sidebar:
451
  st.header("⚙️ Config & Control")
 
452
  user_prompt = st.text_area("1. Enter POV Scenario:", height=100, value="POV: You're Marco Polo negotiating trade routes in the Silk Road bazaar (1270)", key="user_prompt_input")
453
- num_scenes_req = st.number_input(f"2. Target Scenes (Max {MAX_SCENES}):", min_value=1, max_value=MAX_SCENES, value=st.session_state.num_scenes, key="num_scenes_req_input")
454
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
  start_disable = st.session_state.generation_in_progress or device == "cpu"
456
  start_button = st.button("🚀 Start Generation", type="primary", disabled=start_disable)
457
 
458
  if start_button:
459
- init_state() # Reset state variables first
 
460
  st.session_state.generation_in_progress = True
461
- st.session_state.current_step = "story"
462
- st.session_state.num_scenes = num_scenes_req # Use the requested number
463
- cleanup_temp_dir() # Clean old files
464
- get_temp_dir() # Ensure new temp dir exists for this run
465
- st.experimental_rerun()
466
 
467
  st.header("⚠️ Actions")
 
468
  if st.button("🔁 Reset Workflow", disabled=st.session_state.generation_in_progress):
469
- init_state()
470
  cleanup_temp_dir() # Also clean files on reset
471
- st.experimental_rerun()
472
 
473
- if st.button("🧹 Clean Temp Files Only", help=f"Removes files in {st.session_state.get('temp_dir_path', 'N/A')}", disabled=st.session_state.generation_in_progress):
 
 
474
  cleanup_temp_dir()
475
- st.experimental_rerun() # Rerun to update button help text etc.
476
 
477
- # --- Main Area Logic & Progress ---
 
478
  st.divider()
 
 
479
  if device == "cpu":
480
- st.error("🔴 GPU (CUDA) is required. Cannot run on CPU.")
 
481
  elif st.session_state.generation_in_progress:
482
- st.subheader(f"🚀 Running Step: {st.session_state.current_step.upper()}")
483
- progress_bar = st.progress(0)
 
 
484
  steps = ["story", "image", "video", "audio", "compose", "done"]
485
  try:
486
  current_index = steps.index(st.session_state.current_step)
487
- progress_bar.progress((current_index / (len(steps) - 1)) * 100)
 
 
 
 
 
 
 
488
  except ValueError:
489
- progress_bar.progress(0) # Should not happen
 
490
 
491
- # Use placeholders for status updates within each step function
492
  status_placeholder = st.empty()
493
 
494
- # Wrap the step execution in a try block to catch errors and stop
 
495
  try:
496
- temp_dir = get_temp_dir() # Ensure temp_dir is set
497
- current_step = st.session_state.current_step # Local copy
 
 
 
 
498
 
499
  if current_step == "story":
500
- with status_placeholder.container(): st.session_state.story_data = run_llm_step(user_prompt, st.session_state.num_scenes)
501
- next_step = "image" if st.session_state.story_data else "error"
 
 
 
502
 
503
  elif current_step == "image":
504
- scenes = st.session_state.story_data.get('scenes', [])
505
- with status_placeholder.container(): st.session_state.image_results = run_image_step(scenes, temp_dir)
506
- next_step = "video" if any(r['status'] == 'succeeded' for r in st.session_state.image_results) else "error"
 
 
 
 
 
 
507
 
508
  elif current_step == "video":
509
- scenes = st.session_state.story_data.get('scenes', [])
510
- with status_placeholder.container(): st.session_state.video_results = run_video_step(st.session_state.image_results, scenes, temp_dir)
511
- next_step = "audio" if any(r['status'] == 'succeeded' for r in st.session_state.video_results) else "error"
 
 
512
 
513
  elif current_step == "audio":
514
- scenes = st.session_state.story_data.get('scenes', [])
515
- with status_placeholder.container(): st.session_state.audio_results = run_audio_step(scenes, temp_dir)
516
- next_step = "compose" if any(r['status'] == 'succeeded' for r in st.session_state.audio_results) else "error"
 
 
517
 
518
  elif current_step == "compose":
519
- title_base = "".join(filter(str.isalnum, st.session_state.story_data.get('title', 'pov'))).replace(" ", "_") if st.session_state.story_data else "pov_video"
520
- with status_placeholder.container(): st.session_state.final_video_path = run_compose_step_ffmpeg(
521
- st.session_state.video_results, st.session_state.audio_results, temp_dir, title=title_base)
 
 
 
522
  next_step = "done" if st.session_state.final_video_path else "error"
 
 
 
523
 
524
- else: # Should not be reached if logic is right
525
- next_step = "error"
526
 
527
- # Update state and rerun ONLY if the step succeeded
528
- if next_step != "error":
529
- st.session_state.current_step = next_step
530
- if next_step == "done":
531
- st.session_state.generation_in_progress = False # Workflow finished successfully
532
- progress_bar.progress(100)
 
 
 
 
 
 
 
 
533
  st.experimental_rerun()
534
- else:
535
- st.error(f"🛑 Workflow failed at step: {current_step}")
536
- st.session_state.current_step = "error"
 
 
 
537
  st.session_state.generation_in_progress = False
538
 
 
539
  except Exception as e:
 
540
  st.error(f"An unexpected error occurred during step {st.session_state.current_step}: {e}")
541
- st.error(traceback.format_exc())
542
- st.session_state.current_step = "error"
543
- st.session_state.generation_in_progress = False
 
 
 
544
 
545
 
546
  # --- Display Final Output ---
547
  st.divider()
548
  st.header("✅ Final Video")
 
 
549
  if st.session_state.current_step == "done" and st.session_state.final_video_path:
550
  final_video_path = st.session_state.final_video_path
551
  if os.path.exists(final_video_path):
552
- st.video(final_video_path)
 
 
 
553
  try:
554
  with open(final_video_path, "rb") as fp:
555
  st.download_button(
556
  label="⬇️ Download Final Video (.mp4)",
557
  data=fp,
558
- file_name=os.path.basename(final_video_path),
559
  mime="video/mp4",
560
- key="final_video_download_btn"
561
  )
562
  except Exception as e:
563
  st.error(f"Error reading final video for download: {e}")
564
  else:
565
- st.error(f"Final video file not found: {final_video_path}. It might have been cleaned up.")
 
 
566
  elif st.session_state.current_step == "error":
567
- st.error("🛑 Workflow failed. Check logs above. Please Reset and try again.")
 
 
568
  elif st.session_state.generation_in_progress:
 
569
  st.info(f"⏳ Workflow running... Current step: **{st.session_state.current_step.upper()}**")
 
 
570
  else:
571
- st.info("👋 Ready to generate. Use the sidebar to start.")
572
 
573
- # Optional: Display intermediate results in an expander
574
- with st.expander("Show Intermediate File Details", expanded=False):
 
 
575
  st.write("**Story Data:**"); st.json(st.session_state.story_data or {})
576
  st.write("**Image Results:**"); st.json(st.session_state.image_results or [])
577
  st.write("**Video Results:**"); st.json(st.session_state.video_results or [])
578
  st.write("**Audio Results:**"); st.json(st.session_state.audio_results or [])
579
  st.write("**Final Path:**", st.session_state.final_video_path or "Not generated")
580
  st.write("**Temp Dir:**", st.session_state.get('temp_dir_path', "N/A"))
581
-
582
- # Final memory display
 
 
 
 
 
 
 
 
 
 
 
583
  display_memory_usage()
 
10
  import ffmpeg # Use ffmpeg-python
11
  from transformers import AutoModelForCausalLM, AutoTokenizer
12
  from diffusers import StableDiffusionXLPipeline
13
+ # Corrected import path for CogVideoXPipeline
14
+ # This import path is typically found in the main branch of diffusers
15
  from diffusers.pipelines.cogvideo import CogVideoXPipeline
16
  from diffusers.utils import export_to_video
17
  from parler_tts import ParlerTTSForConditionalGeneration
 
20
  import traceback
21
  import psutil # For memory stats
22
 
23
+ # Use a more explicit title indicating vertical format
24
+ st.set_page_config(layout="wide", page_title="POV Vertical Video Gen (HF Space)")
25
 
26
  # --- Configuration ---
27
  LLM_MODEL_ID = "Qwen/Qwen3-0.6B"
 
29
  VIDEO_MODEL_ID = "THUDM/CogVideoX-2b"
30
  TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
31
 
32
+ # Target Portrait Resolution for TikTok/YouTube Shorts (9:16 aspect ratio)
33
  IMAGE_WIDTH = 768
34
  IMAGE_HEIGHT = 1344
35
+
36
  SCENE_DURATION_SECONDS = 4 # Reduced duration for faster processing
37
  VIDEO_FPS = 10
38
  NUM_SCENES_DEFAULT = 3 # Lowered default
39
+ MAX_SCENES = 4 # Stricter limit for free tier (T4 GPU)
40
+ TEMP_SUBDIR = "pov_video_temp_hf" # Unique name for temp directory
41
 
42
  # --- Device Setup & Memory Monitor ---
43
  mem_info_placeholder = st.sidebar.empty()
 
49
  cpu_mem = process.memory_info().rss / (1024 * 1024) # MB
50
  gpu_mem_info = "N/A"
51
  if torch.cuda.is_available():
52
+ # Get current and peak allocated memory
53
  allocated = torch.cuda.memory_allocated(0) / (1024 * 1024) # MB
54
+ # reserved = torch.cuda.memory_reserved(0) / (1024 * 1024) # MB # Reserved is less critical than allocated/peak
55
+ peak_allocated = torch.cuda.max_memory_allocated(0) / (1024 * 1024) # MB
56
  total = torch.cuda.get_device_properties(0).total_memory / (1024 * 1024) # MB
57
+ gpu_mem_info = f"Alloc: {allocated:.0f}MB | Peak Alloc: {peak_allocated:.0f}MB | Total: {total:.0f}MB"
58
  mem_info_placeholder.info(f"🧠 CPU Mem: {cpu_mem:.0f} MB\n⚡ GPU Mem: {gpu_mem_info}")
59
  except Exception as e:
60
  mem_info_placeholder.warning(f"Could not get memory info: {e}")
61
 
62
# Determine compute device (GPU or CPU) and surface VRAM info in the sidebar.
if torch.cuda.is_available():
    device = "cuda"
    try:
        # Total VRAM of GPU 0 in GiB.
        vram_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
        st.sidebar.success(f"✅ GPU Detected! VRAM: {vram_gb:.2f} GB")
        # T4 has ~15GB, K80 has ~11GB (dual). Warn if significantly less than a T4,
        # since SDXL + CogVideoX need most of that even with CPU offload.
        if vram_gb < 14:
            st.sidebar.warning("⚠️ Low VRAM detected (< 14GB). Generation may fail due to memory constraints.")
    except Exception:
        # Property query failed — keep device="cuda" but warn the user.
        st.sidebar.warning("Could not read GPU VRAM.")
else:
    device = "cpu"
    # The pipelines below are far too slow/heavy for CPU; the UI treats this as disabled.
    st.sidebar.error("⚠️ No GPU! Model inference is not supported on CPU. Generation is disabled.")
 
77
  # --- Helper Functions ---
78
def cleanup_gpu_memory(*args):
    """Free GPU memory between pipeline stages.

    Logs memory usage before and after, runs Python garbage collection, and
    empties the CUDA allocator cache when a GPU is present.

    The ``*args`` parameter exists only so call sites read naturally, e.g.
    ``cleanup_gpu_memory(model, tokenizer)``. ``del`` on a local name cannot
    release the caller's bindings (the previous ``for arg in args: del arg``
    loop only unbound the loop variable while the ``args`` tuple still held
    every reference, so it was a no-op). Callers must drop their own
    references — e.g. rebind to ``None`` — for the objects to become
    collectable; existing call sites already do this.
    """
    # Memory snapshot before cleanup (sidebar).
    display_memory_usage()
    gc.collect()
    if torch.cuda.is_available():
        # Return cached allocator blocks to the driver so the next model load
        # sees the freed VRAM.
        torch.cuda.empty_cache()
        # Optionally reset peak stats here to monitor the next stage:
        # torch.cuda.reset_peak_memory_stats(0)
    # Memory snapshot after cleanup (sidebar).
    display_memory_usage()
+
95
 
96
  def get_temp_dir():
97
  """Creates or returns the path to the temporary directory."""
98
  # Use a consistent path within the app's execution context for simplicity on Spaces
99
  # This might lead to leftover files if cleanup fails, but avoids potential permission issues with system temp
100
+ # Using a subdirectory of the current working directory is safer on platforms like Spaces
101
+ app_temp_dir = os.path.join(os.getcwd(), TEMP_SUBDIR)
102
  os.makedirs(app_temp_dir, exist_ok=True)
103
+ # Store the path in session state to be able to clean it later
104
  if 'temp_dir_path' not in st.session_state or st.session_state.temp_dir_path != app_temp_dir:
105
  print(f"Setting temp dir: {app_temp_dir}")
106
  st.session_state.temp_dir_path = app_temp_dir
 
109
  def cleanup_temp_dir():
110
  """Removes the application's temporary directory."""
111
  dir_path = st.session_state.get('temp_dir_path', None)
112
+ # Safety check: Ensure the path exists and contains the unique subdir name before deleting
113
+ # This prevents accidentally deleting critical system directories.
114
+ if dir_path and os.path.exists(dir_path) and TEMP_SUBDIR in dir_path and os.path.basename(dir_path) == TEMP_SUBDIR:
115
  try:
116
+ st.sidebar.info(f"Attempting to clean up: {dir_path}")
117
  shutil.rmtree(dir_path)
118
  st.sidebar.success(f"Cleaned up: {dir_path}")
119
+ st.session_state.temp_dir_path = None # Clear the path from state after cleaning
120
  except Exception as e:
121
  st.sidebar.error(f"Error cleaning temp dir {dir_path}: {e}")
122
  else:
 
125
 
126
  # --- Model Interaction Functions (Load -> Use -> Unload) ---
127
 
128
+ def run_llm_step(user_prompt, num_scenes, status_placeholder):
129
  """Loads LLM, generates story, unloads LLM."""
130
+ with status_placeholder.container():
131
+ st.info(f"🔄 Loading LLM: {LLM_MODEL_ID}...")
132
+ if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0) # Reset peak stats before load
 
 
 
 
 
 
 
133
  display_memory_usage()
134
+ llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None
135
+ story_data = None
136
+ try:
137
+ # Use bfloat16 if available and CUDA is used, otherwise float16 for CUDA, float32 for CPU
138
+ dtype = torch.bfloat16 if device=="cuda" and torch.cuda.is_bf16_supported() else torch.float16 if device=="cuda" else torch.float32
139
+ llm_tokenizer = AutoTokenizer.from_pretrained(LLM_MODEL_ID)
140
+ # Use device_map="auto" for automatic model distribution across devices (including CPU offload)
141
+ llm_model = AutoModelForCausalLM.from_pretrained(
142
+ LLM_MODEL_ID, torch_dtype=dtype, low_cpu_mem_usage=True, device_map="auto"
143
+ )
144
+ if torch.cuda.is_available():
145
+ display_memory_usage() # Display after loading
146
+ st.info(f"📊 Peak GPU Memory (after LLM load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")
147
+
148
+ st.info("🧠 Generating story structure...")
149
+ if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0) # Reset peak stats before inference
150
 
151
+ # --- System Prompt --- (Updated to mention vertical format and specific dimensions)
152
+ system_prompt = f"""
153
+ You are an expert director creating POV vertical video scripts for platforms like TikTok and YouTube Shorts.
154
+ Break down the user's scenario into exactly {num_scenes} scenes, each intended for a clip approximately {SCENE_DURATION_SECONDS} seconds long with an aspect ratio of {IMAGE_WIDTH}x{IMAGE_HEIGHT} pixels (portrait).
155
  For EACH scene, generate:
156
+ 1. `scene_description`: Max 1-2 concise sentences describing action/setting for TTS. Max 350 characters. Keep in mind this will be spoken over a short video clip.
157
+ 2. `image_prompt`: Detailed SDXL POV prompt (Start with "First-person perspective - pov shot of..."). Include setting, mood, style, time period, elements. Emphasize visual elements suitable for a portrait {IMAGE_WIDTH}x{IMAGE_HEIGHT} frame. Add "pov hands from the bottom corner, phone in hand," etc., if relevant to the scenario.
158
+ 3. `video_direction_prompt`: Simple camera action/motion for CogVideoX (e.g., "Camera pans right", "Subtle zoom in", "Static shot", "Hand reaches out"). Focus on short, subtle motions suitable for a fixed POV and vertical format. Avoid complex actions that require significant scene changes.
159
+ 4. `audio_description`: Voice & ambience description for Parler-TTS (e.g., "Nervous male voice, faint market chatter.", "Calm female narrator, quiet library ambience."). This sets the tone for the narration/voiceover.
160
 
161
+ Respond ONLY with a valid JSON object. Ensure the JSON structure is exactly as follows, with a top-level "story_details" object containing a "scenes" array:
162
  {{
163
  "story_details": {{
164
+ "title": "POV Title (e.g.,POV First Date)",
165
+ "full_story": "Brief summary of the complete POV story.",
166
  "scenes": [
167
  {{ // Scene 1
168
  "scene_description": "...", // Max 350 chars
 
174
  ]
175
  }}
176
  }}
177
+ Strictly adhere to JSON format. No conversational text, markdown code blocks (\`\`\`json), or any other text before or after the JSON block.
178
+ """.strip()
179
+
180
+ messages = [{"role": "system", "content": system_prompt}, {"role": "user", "content": f"Create script: {user_prompt}"}]
181
+ # Use add_generation_prompt=True for Qwen models to follow their chat format
182
+ text_input = llm_tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True, enable_thinking=False)
183
+ # Move input tensors to the model's device
184
+ model_inputs = llm_tokenizer([text_input], return_tensors="pt").to(llm_model.device if hasattr(llm_model, 'device') else device)
185
+
186
+ # Generate the response
187
+ generated_ids = llm_model.generate(
188
+ **model_inputs,
189
+ max_new_tokens=4096, # Set a reasonable upper limit for the response length
190
+ temperature=0.7, # Control randomness
191
+ top_p=0.8, # Nucleus sampling
192
+ top_k=20, # Top-k sampling
193
+ do_sample=True, # Enable sampling
194
+ pad_token_id=llm_tokenizer.eos_token_id, # Ensure generation stops correctly
195
+ num_beams=1 # Use greedy or sampling search, not beam search for chat
196
+ )
197
+ # Decode the generated part of the output
198
+ output_ids = generated_ids[0][len(model_inputs.input_ids[0]):]
199
+ response_text = llm_tokenizer.decode(output_ids, skip_special_tokens=True).strip()
 
 
 
 
 
 
 
 
200
 
201
+ if torch.cuda.is_available():
202
+ display_memory_usage() # Display after inference
203
+ st.info(f"📊 Peak GPU Memory (during LLM inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")
204
+
205
+ st.write("LLM Raw Output:"); st.code(response_text, language='text')
 
 
 
 
 
 
 
 
 
 
206
 
207
+ # Robust JSON parsing - try to find and parse the JSON object
208
+ json_string = response_text.strip()
 
 
209
  try:
210
+ # Attempt direct parse first
211
+ parsed_data = json.loads(json_string)
212
+ except json.JSONDecodeError:
213
+ # If direct parse fails, try to find the JSON within the text (handles ```json, etc.)
214
+ json_start = json_string.find('{')
215
+ json_end = json_string.rfind('}')
216
+ if json_start == -1 or json_end == -1:
217
+ raise ValueError("JSON object not found in LLM output.")
218
+ json_string = json_string[json_start : json_end + 1]
219
+ parsed_data = json.loads(json_string)
220
+
221
+
222
+ if not ("story_details" in parsed_data and "scenes" in parsed_data["story_details"]):
223
+ raise ValueError("Invalid JSON structure from LLM: missing 'story_details' or 'scenes'.")
224
+
225
+ # Check if the LLM generated the requested number of scenes (warning only, proceed with what was generated)
226
+ actual_num_scenes = len(parsed_data["story_details"]["scenes"])
227
+ if actual_num_scenes != num_scenes:
228
+ st.warning(f"LLM generated {actual_num_scenes} scenes, but {num_scenes} were requested. Using the generated scenes.")
229
+
230
+ story_data = parsed_data["story_details"]
231
+ st.success("✅ Story generation complete.")
232
+ except Exception as e:
233
+ st.error(f"❌ LLM Step Failed: {e}"); st.error(traceback.format_exc()); story_data = None
234
+ finally:
235
+ # Explicitly set references to None before cleanup
236
+ cleanup_gpu_memory(llm_model, llm_tokenizer, model_inputs, generated_ids)
237
+ llm_model, llm_tokenizer, model_inputs, generated_ids = None, None, None, None # Ensure they are truly dereferenced
238
+ st.info("✅ LLM Unloaded.")
239
+ return story_data
240
+
241
def run_image_step(scenes, temp_dir, status_placeholder):
    """Load SDXL, generate one portrait image per scene, then unload.

    Parameters:
        scenes: list of scene dicts from the LLM step (uses "image_prompt").
        temp_dir: directory where scene_{n}_img.png files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of per-scene dicts {"scene": i, "path": str|None,
        "status": "succeeded"|"failed"}, one entry per input scene on a
        per-image failure; may be all-"failed" if the pipeline itself
        fails to load.
    """
    if not scenes:
        with status_placeholder.container(): st.warning("Skipping image step: No scenes available from story data.")
        return []
    with status_placeholder.container():
        st.info(f"🔄 Loading Image Generator: {IMAGE_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        image_pipe = None; image_results = []
        try:
            dtype = torch.float16 if device == "cuda" else torch.float32
            # Load SDXL pipe with memory optimizations; fp16 variant weights
            # are only requested when running half precision on CUDA.
            image_pipe = StableDiffusionXLPipeline.from_pretrained(
                IMAGE_MODEL_ID,
                torch_dtype=dtype,
                use_safetensors=True,
                variant="fp16" if device == "cuda" and dtype == torch.float16 else None,
                low_cpu_mem_usage=True  # Helps load models on systems with limited RAM
            )
            # Model CPU offload moves submodules to GPU only while they run,
            # trading speed for a much lower VRAM peak.
            if device == "cuda": image_pipe.enable_model_cpu_offload()
            else: image_pipe.to(device)  # No offload path: place pipe explicitly

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after Image load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info(f"🎨 Generating images ({IMAGE_WIDTH}x{IMAGE_HEIGHT}) sequentially...")

            for i, scene in enumerate(scenes):
                # Output path for this scene's image inside the temp directory.
                img_path = os.path.join(temp_dir, f"scene_{i+1}_img.png")
                st.write(f"Generating Image {i+1}/{len(scenes)}...")
                image = None  # Defined before try so the finally can always clean it
                try:
                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-image peak tracking
                    with torch.no_grad():
                        # Generate the portrait-format image; fall back to a
                        # harmless default prompt if the LLM omitted one.
                        image = image_pipe(
                            prompt=scene.get("image_prompt", "blank image, abstract art"),
                            width=IMAGE_WIDTH,
                            height=IMAGE_HEIGHT,
                            num_inference_steps=25  # Balance speed and quality
                        ).images[0]

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Image inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    image.save(img_path)
                    image_results.append({"scene": i, "path": img_path, "status": "succeeded"})
                    # Small preview to keep the browser light.
                    st.image(image, caption=f"Scene {i+1} Image OK", width=150)
                except Exception as e:
                    # A single failed image is recorded but does not abort the loop.
                    st.error(f"❌ Image {i+1} Failed: {e}"); st.error(traceback.format_exc())
                    image_results.append({"scene": i, "path": None, "status": "failed"})
                finally:
                    # Drop the PIL image reference and flush caches between scenes.
                    cleanup_gpu_memory(image); image = None

            st.success("✅ Image generation step complete.")
        except Exception as e:
            st.error(f"❌ Image Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the pipeline load itself failed, mark every scene as failed.
            if not image_results:
                image_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
        finally:
            # Unload the pipeline regardless of outcome.
            cleanup_gpu_memory(image_pipe); image_pipe = None
            st.info("✅ Image Generator Unloaded.")
    return image_results
312
 
313
def run_video_step(image_results, scenes, temp_dir, status_placeholder):
    """Load CogVideoX, animate each successful scene image, then unload.

    Parameters:
        image_results: per-scene dicts from run_image_step; only entries with
            status "succeeded" are animated.
        scenes: scene dicts from the LLM step (uses "video_direction_prompt").
        temp_dir: directory where scene_{n}_vid.mp4 files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of {"scene": idx, "path": str|None, "status": ...} dicts keyed
        by the ORIGINAL scene index, covering only scenes that had a
        successful image.
    """
    # Only attempt video generation for images that succeeded.
    successful_images = [item for item in image_results if item["status"] == "succeeded"]
    if not successful_images:
        with status_placeholder.container(): st.warning("Skipping video step: No successful images were generated.")
        return []

    with status_placeholder.container():
        st.info(f"🔄 Loading Video Generator: {VIDEO_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        video_pipe = None; video_results = []
        try:
            dtype = torch.float16 if device == "cuda" else torch.float32
            # NOTE(review): CogVideoXPipeline is the text-to-video pipeline;
            # this code passes `image=` below, which the image-to-video variant
            # (CogVideoXImageToVideoPipeline) accepts. Confirm against the
            # installed diffusers version that this call signature is valid.
            video_pipe = CogVideoXPipeline.from_pretrained(VIDEO_MODEL_ID, torch_dtype=dtype)

            # Aggressive offloading: both model-level and sequential CPU
            # offload to fit CogVideoX on a free-tier GPU.
            if device == "cuda":
                video_pipe.enable_model_cpu_offload()
                video_pipe.enable_sequential_cpu_offload()
            else: video_pipe.to(device)  # CPU fallback placement

            # VAE slicing/tiling trims the VRAM peak of the decode stage.
            video_pipe.vae.enable_slicing(); video_pipe.vae.enable_tiling()

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after Video load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info("🎬 Generating videos sequentially from images...")
            # One reusable RNG; re-seeded per scene below so each clip differs.
            generator = torch.Generator(device=device)

            for item in successful_images:
                scene_index = item["scene"]  # Original scene index, not loop position
                vid_path = os.path.join(temp_dir, f"scene_{scene_index + 1}_vid.mp4")
                st.write(f"Generating Video for Scene {scene_index + 1} (Image {scene_index + 1})...")
                img, video_frames = None, None  # Defined before try for the finally cleanup
                try:
                    img = Image.open(item["path"])  # Load the generated still image

                    # Short camera-motion prompt drives the animation.
                    video_direction = scenes[scene_index].get("video_direction_prompt", "subtle motion")
                    # Time-based seed keeps clips unique across runs; modulo
                    # keeps it in a small readable range.
                    seed = int(time.time() * 1000 + scene_index) % 100000
                    generator.manual_seed(seed)

                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-clip peak tracking

                    with torch.no_grad():
                        # num_frames = duration * fps so clip length matches
                        # the narration budget for the scene.
                        video_frames = video_pipe(
                            prompt=video_direction,
                            image=img,
                            num_inference_steps=40,  # Balance speed/quality
                            num_frames=int(SCENE_DURATION_SECONDS * VIDEO_FPS),
                            guidance_scale=6.0,
                            generator=generator
                        ).frames[0]  # First (only) video in the batch

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Video inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    # Write the frame list to an MP4 via diffusers' helper
                    # (uses imageio-ffmpeg under the hood).
                    export_to_video(video_frames, vid_path, fps=VIDEO_FPS)
                    video_results.append({"scene": scene_index, "path": vid_path, "status": "succeeded"})
                    # Preview disabled to save resources on Spaces:
                    # st.video(vid_path, format='video/mp4', start_time=0)
                    st.success(f"Video Scene {scene_index + 1} OK.")
                except Exception as e:
                    # A single failed clip is recorded but does not abort the loop.
                    st.error(f"❌ Video {scene_index + 1} Failed: {e}"); st.error(traceback.format_exc())
                    video_results.append({"scene": scene_index, "path": None, "status": "failed"})
                finally:
                    # Drop per-clip references and flush caches between scenes.
                    cleanup_gpu_memory(img, video_frames); img, video_frames = None, None

            st.success("✅ Video generation step complete.")
        except Exception as e:
            st.error(f"❌ Video Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the pipeline load itself failed, mark all candidate clips failed.
            if not video_results:
                video_results = [{"scene": item["scene"], "path": None, "status": "failed"} for item in successful_images]
        finally:
            # Unload the pipeline regardless of outcome.
            cleanup_gpu_memory(video_pipe); video_pipe = None
            st.info("✅ Video Generator Unloaded.")
    return video_results
408
 
409
def run_audio_step(scenes, temp_dir, status_placeholder):
    """Load Parler-TTS, narrate each scene description, then unload.

    Audio is generated for ALL scenes from the story data, regardless of
    image/video success, so narration exists even for segments that end up
    without video at composition time.

    Parameters:
        scenes: scene dicts from the LLM step (uses "scene_description" as the
            spoken text and "audio_description" as the voice/style prompt).
        temp_dir: directory where scene_{n}_audio.wav files are written.
        status_placeholder: Streamlit placeholder for progress output.

    Returns:
        A list of {"scene": i, "path": str|None,
        "status": "succeeded"|"skipped"|"failed"} dicts, one per scene.
    """
    if not scenes:
        with status_placeholder.container(): st.warning("Skipping audio step: No scenes available from story data.")
        return []

    with status_placeholder.container():
        st.info(f"🔄 Loading TTS Model: {TTS_MODEL_ID}...")
        if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Reset peak stats before load
        display_memory_usage()
        tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
        audio_results = []
        try:
            # Parler-TTS requires its dedicated model class; device_map="auto"
            # lets accelerate place (and offload) the weights.
            tts_model = ParlerTTSForConditionalGeneration.from_pretrained(TTS_MODEL_ID, device_map="auto")

            # Tokenizer for the spoken text prompt.
            tts_tokenizer = AutoTokenizer.from_pretrained(TTS_MODEL_ID)
            # The voice *description* uses the text encoder's own tokenizer,
            # resolved from the model config (may differ from the prompt tokenizer).
            tts_desc_tokenizer = AutoTokenizer.from_pretrained(tts_model.config.text_encoder._name_or_path)

            if torch.cuda.is_available():
                display_memory_usage()  # Display after loading
                st.info(f"📊 Peak GPU Memory (after TTS load): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

            st.info("🔊 Generating audio sequentially...")

            for i, scene in enumerate(scenes):
                audio_path = os.path.join(temp_dir, f"scene_{i+1}_audio.wav")
                st.write(f"Generating Audio {i+1}/{len(scenes)}...")
                desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None  # For the finally cleanup
                try:
                    text_to_speak = scene.get("scene_description", "").strip()
                    voice_description = scene.get("audio_description", "A neutral speaker.")

                    # Skip scenes with no narration text or text over the
                    # 350-char limit promised to the LLM in its system prompt.
                    if not text_to_speak or len(text_to_speak) > 350:
                        if len(text_to_speak) > 350:
                            st.warning(f"Audio {i+1} description too long ({len(text_to_speak)} chars). Skipping audio generation for this scene: {text_to_speak[:100]}...")
                        else:
                            st.info(f"Audio {i+1}: No text description provided. Skipping audio generation for this scene.")
                        # Record a placeholder result so indices stay aligned.
                        audio_results.append({"scene": i, "path": None, "status": "skipped"})
                        continue

                    if torch.cuda.is_available(): torch.cuda.reset_peak_memory_stats(0)  # Per-clip peak tracking

                    # With device_map="auto" the main layers may land on CPU;
                    # route inputs to the model's primary non-CPU device when
                    # one exists, otherwise to the global device.
                    model_device = tts_model.device if hasattr(tts_model, 'device') and tts_model.device.type != 'cpu' else device
                    desc_input_ids = tts_desc_tokenizer(voice_description, return_tensors="pt").input_ids.to(model_device)
                    prompt_input_ids = tts_tokenizer(text_to_speak, return_tensors="pt").input_ids.to(model_device)

                    with torch.no_grad():
                        # Parler-TTS generates the waveform directly
                        # (shape [batch, num_samples]); cast to float32 so
                        # soundfile can write it.
                        generation = tts_model.generate(
                            input_ids=desc_input_ids,
                            prompt_input_ids=prompt_input_ids,
                            do_sample=True,
                            temperature=0.7  # Control voice variation
                        ).to(torch.float32)

                    if torch.cuda.is_available():
                        display_memory_usage()  # Display after inference
                        st.info(f"📊 Peak GPU Memory (during Audio inference): {torch.cuda.max_memory_allocated(0)/(1024*1024):.0f} MB")

                    # Tensor -> 1-D numpy waveform; sample rate from model config.
                    audio_arr = generation.cpu().numpy().squeeze()
                    sampling_rate = tts_model.config.sampling_rate

                    sf.write(audio_path, audio_arr, sampling_rate)
                    audio_results.append({"scene": i, "path": audio_path, "status": "succeeded"})
                    st.audio(audio_path, format='audio/wav')  # Inline preview
                except Exception as e:
                    # A single failed clip is recorded but does not abort the loop.
                    st.error(f"❌ Audio {i+1} Failed: {e}"); st.error(traceback.format_exc())
                    audio_results.append({"scene": i, "path": None, "status": "failed"})
                finally:
                    # Drop per-clip tensors and flush caches between scenes.
                    cleanup_gpu_memory(desc_input_ids, prompt_input_ids, generation, audio_arr)
                    desc_input_ids, prompt_input_ids, generation, audio_arr = None, None, None, None

            st.success("✅ Audio generation step complete.")
        except Exception as e:
            st.error(f"❌ Audio Gen Step Failed: {e}"); st.error(traceback.format_exc())
            # If the TTS model load itself failed, mark every scene as failed.
            if not audio_results:
                audio_results = [{"scene": i, "path": None, "status": "failed"} for i in range(len(scenes))]
        finally:
            # Unload the model and tokenizers regardless of outcome.
            cleanup_gpu_memory(tts_model, tts_tokenizer, tts_desc_tokenizer)
            tts_model, tts_tokenizer, tts_desc_tokenizer = None, None, None
            st.info("✅ TTS Model Unloaded.")
    return audio_results
511
 
512
def run_compose_step_ffmpeg(video_results, audio_results, temp_dir, title="final_pov_video", status_placeholder=None):
    """Concatenate the per-scene clips and narration into one MP4 via ffmpeg.

    CPU-only step. Videos are joined with the concat demuxer using stream
    copy; audio clips are joined to an intermediate WAV; the two are then
    muxed (video copied, audio re-encoded to AAC). Scenes are included only
    if their video succeeded; their audio is added when also available.

    Parameters:
        video_results / audio_results: per-scene result dicts from the
            previous steps ({"scene", "path", "status"}).
        temp_dir: working directory for intermediates and the final file.
        title: story title; sanitized into the output filename.
        status_placeholder: optional Streamlit placeholder for progress.

    Returns:
        The final MP4 path on success, or None on failure / nothing to compose.
    """
    if status_placeholder is None:
        # Fallback container; in normal app flow a placeholder is always passed.
        status_placeholder = st.empty()

    with status_placeholder.container():
        st.info("🎞️ Composing final video using ffmpeg-python (CPU)...")
        # Composition is CPU-bound; show current memory anyway.
        display_memory_usage()
        final_video_path = None
        long_video_path = os.path.join(temp_dir, "long_video_temp.mp4")
        long_audio_path = os.path.join(temp_dir, "long_audio_temp.wav")
        # Sanitize title for the filename: keep alphanumerics, spaces, underscores,
        # then turn spaces into underscores.
        safe_title = "".join(c for c in title if c.isalnum() or c in (' ', '_')).rstrip().replace(' ', '_')
        # Fall back to a default name if sanitization leaves nothing.
        final_output_filename = f"{safe_title or 'pov_video'}.mp4"
        final_output_path = os.path.join(temp_dir, final_output_filename)

        # File lists consumed by ffmpeg's concat demuxer.
        concat_video_list_path = os.path.join(temp_dir, "ffmpeg_video_list.txt")
        concat_audio_list_path = os.path.join(temp_dir, "ffmpeg_audio_list.txt")

        try:
            # Keep only successful results, ordered by original scene index.
            successful_videos = sorted([item for item in video_results if item["status"] == "succeeded"], key=lambda x: x["scene"])
            successful_audio = sorted([item for item in audio_results if item["status"] == "succeeded"], key=lambda x: x["scene"])

            # Align video and audio by scene index. Video is the driver: a
            # scene is composed only if its video exists; audio is attached
            # when present for that same index.
            composed_scenes_data = []
            audio_map = {item['scene']: item['path'] for item in successful_audio}

            for video_item in successful_videos:
                scene_idx = video_item['scene']
                composed_scenes_data.append({
                    'scene': scene_idx,
                    'video_path': video_item['path'],
                    'audio_path': audio_map.get(scene_idx)  # None if no audio for this scene
                })

            if not composed_scenes_data:
                st.error("❌ No successful video clips generated to compose.")
                return None

            st.write(f"Found {len(composed_scenes_data)} scene(s) with successful video clips to compose.")

            # --- FFmpeg concat inputs ---
            # All videos to be joined, and audio only for scenes that have both.
            videos_for_concat = [item['video_path'] for item in composed_scenes_data]
            audio_for_concat = [item['audio_path'] for item in composed_scenes_data if item['audio_path']]

            if not videos_for_concat:
                st.error("❌ No video files found for composition after filtering.")
                return None

            # Write the video list file (relative paths are resolved against
            # the list file's own directory by the concat demuxer).
            with open(concat_video_list_path, "w") as f_vid:
                for v_path in videos_for_concat:
                    f_vid.write(f"file '{os.path.relpath(v_path, temp_dir)}'\n")

            # Write the audio list file only when there is audio to join.
            if audio_for_concat:
                with open(concat_audio_list_path, "w") as f_aud:
                    for a_path in audio_for_concat:
                        f_aud.write(f"file '{os.path.relpath(a_path, temp_dir)}'\n")

            # --- Step 1: concatenate video clips ---
            st.write("Concatenating videos...")
            try:
                # -f concat -safe 0 joins the listed files; fflags=+igndts
                # tolerates the odd timestamps generated clips often have.
                (
                    ffmpeg
                    .input(concat_video_list_path, format='concat', safe=0, fflags='+igndts')
                    .output(long_video_path, c='copy')  # Stream copy: assumes all clips share codec/params
                    .global_args('-hide_banner', '-loglevel', 'error')  # Errors only
                    .run(overwrite_output=True, cmd='ffmpeg')
                )
                st.write("Videos concatenated successfully.")
            except ffmpeg.Error as e:
                st.error("FFmpeg Video Concat Error:")
                st.code(e.stderr.decode() if e.stderr else str(e))
                raise  # Without video there is nothing to compose — abort

            # --- Step 2: concatenate audio clips (best effort) ---
            long_audio_concatenated = False  # True only if the audio join succeeds
            if audio_for_concat:
                st.write("Concatenating audio...")
                try:
                    # Intermediate WAV as pcm_s16le: safe, widely compatible.
                    (
                        ffmpeg
                        .input(concat_audio_list_path, format='concat', safe=0, fflags='+igndts')
                        .output(long_audio_path, acodec='pcm_s16le')
                        .global_args('-hide_banner', '-loglevel', 'error')
                        .run(overwrite_output=True, cmd='ffmpeg')
                    )
                    st.write("Audio concatenated successfully.")
                    long_audio_concatenated = True
                except ffmpeg.Error as e:
                    # Audio failure is non-fatal: fall through to a silent video.
                    st.warning("FFmpeg Audio Concat Error - proceeding without audio:")
                    st.code(e.stderr.decode() if e.stderr else str(e))

            # --- Step 3: mux video and audio into the final MP4 ---
            st.write("Muxing final video and audio...")
            try:
                in_video = ffmpeg.input(long_video_path)

                if long_audio_concatenated and os.path.exists(long_audio_path):
                    in_audio = ffmpeg.input(long_audio_path)
                    # shortest=None emits the bare `-shortest` flag: output
                    # STOPS at the end of the shortest input stream (it does
                    # not pad the shorter one).
                    stream = ffmpeg.output(in_video, in_audio, final_output_path,
                                           vcodec='copy',       # Copy video stream (preserves portrait format)
                                           acodec='aac',        # Re-encode audio to AAC (standard for MP4)
                                           shortest=None,       # -shortest: end at the shorter stream
                                           strict='experimental')  # Allows ffmpeg's native aac on older builds
                else:  # No usable audio: emit a video-only file.
                    st.warning("Muxing video without audio.")
                    stream = ffmpeg.output(in_video, final_output_path,
                                           vcodec='copy',
                                           an=None)  # -an strips any audio stream

                stream.global_args('-hide_banner', '-loglevel', 'error').run(overwrite_output=True, cmd='ffmpeg')

                final_video_path = final_output_path
                st.success("✅ Final video composed!")

            except ffmpeg.Error as e:
                st.error("FFmpeg Muxing Error:")
                st.code(e.stderr.decode() if e.stderr else str(e))
                final_video_path = None  # Explicitly mark failure
                raise  # Muxing failure means composition failed

        except Exception as e:
            # Catches re-raised ffmpeg errors plus any file-handling errors above.
            st.error(f"❌ Video Composition Step Failed: {e}")
            st.error(traceback.format_exc())
            final_video_path = None
        finally:
            # Always remove intermediates and list files; the final MP4 is kept.
            st.write("Cleaning up intermediate composition files...")
            intermediate_files = [long_video_path, long_audio_path, concat_video_list_path, concat_audio_list_path]
            for f_path in intermediate_files:
                if os.path.exists(f_path):
                    try:
                        os.remove(f_path)
                    except Exception as e_clean: print(f"Error cleaning {f_path}: {e_clean}")
            display_memory_usage()  # Final memory check for this step
    return final_video_path
677
 
678
 
679
  # --- Streamlit UI ---
680
 
681
# Page title and workflow summary shown at the top of the app.
st.title("🎬 POV Vertical Video Gen (HF Space Optimized)")
st.caption(f"Workflow: Scenario → Story → Images ({IMAGE_WIDTH}x{IMAGE_HEIGHT}) → Videos → Audio → Compose → Download. Optimized for vertical formats (e.g., TikTok/YouTube Shorts) on the Free Tier.")
684
 
685
# Initialize Session State
# This function ensures required keys exist in st.session_state on first load.
def init_state(force=False):
    """Populate ``st.session_state`` with the workflow's default values.

    Args:
        force: When ``False`` (the default, and the historical behavior),
            only keys that are *missing* are set, so state survives
            Streamlit's script reruns. When ``True``, every key is
            overwritten with its default — a true reset for callers that
            want a fresh run.
    """
    # The dict is rebuilt on every call, so each call hands out fresh
    # list/None defaults (no shared mutable state between runs).
    keys_to_init = {
        'generation_in_progress': False,  # True while a generation workflow is running
        'current_step': "idle",           # Workflow step: "idle", "story", "image", ...
        'story_data': None,               # Output of the LLM step
        'image_results': [],              # Results from the image generation step
        'video_results': [],              # Results from the video generation step
        'audio_results': [],              # Results from the audio generation step
        'final_video_path': None,         # Path to the final composed video file
        'temp_dir_path': None,            # Temporary directory for this run
        'num_scenes': NUM_SCENES_DEFAULT  # Number of scenes requested
    }
    for key, default_value in keys_to_init.items():
        if force or key not in st.session_state:
            st.session_state[key] = default_value
init_state() # Call init_state on each app load to set defaults if not already present
703
 
704
# --- Sidebar ---

def _reset_session_state():
    """Hard-reset the workflow state.

    ``init_state()`` only fills in *missing* keys, so calling it alone does
    not clear results left over from a previous run. Drop the workflow keys
    first, then let ``init_state()`` re-apply pristine defaults.
    """
    for key in ('generation_in_progress', 'current_step', 'story_data',
                'image_results', 'video_results', 'audio_results',
                'final_video_path', 'temp_dir_path', 'num_scenes'):
        st.session_state.pop(key, None)
    init_state()

with st.sidebar:
    st.header("⚙️ Config & Control")
    # Text area for the user's POV scenario prompt.
    user_prompt = st.text_area("1. Enter POV Scenario:", height=100, value="POV: You're Marco Polo negotiating trade routes in the Silk Road bazaar (1270)", key="user_prompt_input")

    # Number input for the desired number of scenes; min/max enforced by the
    # widget, and the initial value is clamped to MAX_SCENES as well.
    num_scenes_req = st.number_input(f"2. Target Scenes (Max {MAX_SCENES}):", min_value=1, max_value=MAX_SCENES,
                                     value=min(st.session_state.num_scenes, MAX_SCENES),
                                     step=1,
                                     key="num_scenes_req_input")

    # Surface the target dimensions and timing so the user knows what to expect.
    st.info(f"Target video resolution: {IMAGE_WIDTH}x{IMAGE_HEIGHT} (Portrait)")
    st.info(f"Approx. scene duration: {SCENE_DURATION_SECONDS}s, FPS: {VIDEO_FPS}")

    # Start button — disabled while a run is in progress or when no GPU is
    # available (the models require CUDA).
    start_disable = st.session_state.generation_in_progress or device == "cpu"
    start_button = st.button("🚀 Start Generation", type="primary", disabled=start_disable)

    if start_button:
        # Fully reset state, then kick off the workflow from the "story" step.
        _reset_session_state()
        st.session_state.generation_in_progress = True
        st.session_state.current_step = "story"           # First workflow step
        st.session_state.num_scenes = num_scenes_req      # User-requested scene count
        cleanup_temp_dir()                                # Clean old files before a new run
        get_temp_dir()                                    # Ensure a fresh temp dir path is set
        # NOTE(review): st.experimental_rerun was removed in Streamlit >= 1.27;
        # switch to st.rerun() when upgrading the Streamlit dependency.
        st.experimental_rerun()                           # Rerun to enter the generation loop

    st.header("⚠️ Actions")
    # Reset workflow button — disabled while generation is in progress.
    if st.button("🔁 Reset Workflow", disabled=st.session_state.generation_in_progress):
        _reset_session_state()
        cleanup_temp_dir() # Also clean files on reset
        st.experimental_rerun() # Rerun to update UI state and exit generation loop

    # Clean temp files button — disabled while generation is in progress.
    cleanup_button_help = f"Removes files in: {st.session_state.get('temp_dir_path', 'N/A')}"
    if st.button("🧹 Clean Temp Files Only", help=cleanup_button_help, disabled=st.session_state.generation_in_progress):
        cleanup_temp_dir()
        # No rerun needed here unless you want to force UI update based on temp_dir_path existence
749
 
750
+
751
# --- Main Area Logic & Progress Display ---
st.divider()

# Hard requirement: the models need CUDA; refuse to run on CPU.
if device == "cpu":
    st.error("🔴 GPU (CUDA) is required for model inference. This application will not run on CPU.")
# While generation is in progress, show the current step and a progress bar,
# then execute exactly ONE step per script run (rerun-driven state machine).
elif st.session_state.generation_in_progress:
    st.subheader(f"🚀 Running Step: **{st.session_state.current_step.upper()}**")
    progress_bar = st.progress(0)

    # Ordered sequence of workflow states used to compute progress.
    steps = ["story", "image", "video", "audio", "compose", "done"]
    try:
        current_index = steps.index(st.session_state.current_step)
        # Map step index to a percentage; cap at 99% until the 'done' state so
        # the bar never shows complete while work remains.
        progress_value = (current_index / (len(steps) - 1)) * 100
        if st.session_state.current_step != "done":
            progress_bar.progress(int(min(progress_value, 99)))
        else:
            progress_bar.progress(100)
    except ValueError:
        # current_step not in the list (e.g. "error") — show an empty bar.
        progress_bar.progress(0)

    # Single placeholder reused for step-specific status messages.
    status_placeholder = st.empty()

    # --- Workflow Execution Logic ---
    # Executes the step named by st.session_state.current_step, decides the
    # next state, and triggers a rerun to continue the machine.
    try:
        # Ensure the temp dir exists before any file-producing step.
        temp_dir = get_temp_dir()
        current_step = st.session_state.current_step
        # Default: stay in the current state (only changes on success/failure).
        next_step = current_step

        if current_step == "story":
            st.session_state.story_data = run_llm_step(user_prompt, st.session_state.num_scenes, status_placeholder)
            # Proceed only if the LLM returned at least one scene.
            next_step = "image" if st.session_state.story_data and st.session_state.story_data.get('scenes') else "error"
            if next_step == "error": status_placeholder.error("Story generation failed or returned no scenes.")

        elif current_step == "image":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            if not scenes:
                status_placeholder.warning("Skipping image step: No scenes available from story data.")
                st.session_state.image_results = []  # Keep downstream steps working on an empty list
            else:
                st.session_state.image_results = run_image_step(scenes, temp_dir, status_placeholder)
            next_step = "video"  # Video step tolerates empty image results

        elif current_step == "video":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            st.session_state.video_results = run_video_step(st.session_state.image_results, scenes, temp_dir, status_placeholder)
            next_step = "audio"

        elif current_step == "audio":
            scenes = st.session_state.story_data.get('scenes', []) if st.session_state.story_data else []
            st.session_state.audio_results = run_audio_step(scenes, temp_dir, status_placeholder)
            next_step = "compose"

        elif current_step == "compose":
            # Use the story title (if any) as the base of the output filename.
            title_base = st.session_state.story_data.get('title', 'pov_video') if st.session_state.story_data else 'pov_video'
            st.session_state.final_video_path = run_compose_step_ffmpeg(
                st.session_state.video_results, st.session_state.audio_results, temp_dir, title=title_base, status_placeholder=status_placeholder)
            next_step = "done" if st.session_state.final_video_path else "error"
            # BUGFIX: the previous code read
            # `status_placeholder.container._provided_by_user`, which accesses a
            # private attribute on a *bound method* and raises AttributeError,
            # masking the real composition failure. Just report the failure.
            if next_step == "error":
                status_placeholder.error("Composition step failed.")

        else:  # Unknown state — should not happen if the machine is correct.
            next_step = "error"
            status_placeholder.error(f"Internal error: Unknown state '{current_step}'")

        # --- State Transition ---
        # Advance the machine and rerun only when the state actually changes.
        if next_step != current_step:
            st.session_state.current_step = next_step
            if next_step == "done" or next_step == "error":
                st.session_state.generation_in_progress = False  # Workflow finished
            # Rerun so the script resumes from the new current_step.
            st.experimental_rerun()
        # Defensive: if we are already in "error" but the flag was left set,
        # clear it so the UI exits the generation loop.
        elif st.session_state.current_step == "error" and st.session_state.generation_in_progress:
            st.session_state.generation_in_progress = False

    except Exception as e:
        # Catch-all for errors not handled inside the step functions.
        st.error(f"An unexpected error occurred during step {st.session_state.current_step}: {e}")
        st.error(traceback.format_exc())  # Full traceback for debugging
        status_placeholder.error(f"An unexpected error stopped the workflow at step: **{st.session_state.current_step.upper()}**")
        st.session_state.current_step = "error"
        st.session_state.generation_in_progress = False
        progress_bar.progress(0)
        st.experimental_rerun()  # Rerun to show the error-state UI
869
 
870
 
871
# --- Display Final Output ---
st.divider()
st.header("✅ Final Video")

if st.session_state.current_step == "done" and st.session_state.final_video_path:
    # Workflow finished successfully — show the player and a download button.
    final_video_path = st.session_state.final_video_path
    if not os.path.exists(final_video_path):
        st.error(f"Final video file not found: {final_video_path}. It might have been cleaned up prematurely or composition failed unexpectedly.")
    else:
        st.video(final_video_path, format='video/mp4')  # Inline player

        try:
            # Read the file once and hand the bytes to the download widget.
            with open(final_video_path, "rb") as fp:
                video_bytes = fp.read()
            st.download_button(
                label="⬇️ Download Final Video (.mp4)",
                data=video_bytes,
                file_name=os.path.basename(final_video_path),
                mime="video/mp4",
                key="final_video_download_btn",
            )
        except Exception as e:
            st.error(f"Error reading final video for download: {e}")

elif st.session_state.current_step == "error":
    # Workflow ended in an error state.
    st.error("🛑 Workflow failed. Check logs above and in the app output/Spaces logs tab. Please use 'Reset Workflow' and try again.")

elif st.session_state.generation_in_progress:
    # Still running — the detailed progress UI is rendered above.
    st.info(f"⏳ Workflow running... Current step: **{st.session_state.current_step.upper()}**")

else:
    # Idle: nothing generated yet.
    st.info("👋 Ready to generate. Use the sidebar to enter your scenario and configuration, then click 'Start Generation'.")
909
 
910
+
911
# Optional: Expander to show detailed intermediate results for debugging
with st.expander("Show Intermediate File Details and State", expanded=False):
    # Dump the full session state plus each intermediate artifact list.
    st.write("**Session State:**", st.session_state)
    st.write("**Story Data:**")
    st.json(st.session_state.story_data or {})
    st.write("**Image Results:**")
    st.json(st.session_state.image_results or [])
    st.write("**Video Results:**")
    st.json(st.session_state.video_results or [])
    st.write("**Audio Results:**")
    st.json(st.session_state.audio_results or [])
    st.write("**Final Path:**", st.session_state.final_video_path or "Not generated")
    st.write("**Temp Dir:**", st.session_state.get('temp_dir_path', "N/A"))

    # Best-effort listing of the temp directory contents, sorted for readability.
    listing = []
    temp_path = st.session_state.get('temp_dir_path')
    if temp_path and os.path.exists(temp_path):
        try:
            listing = sorted(os.listdir(temp_path))
        except Exception as e:
            listing = [f"Error listing directory contents: {e}"]
    st.write("**Temp Dir Contents:**", listing if listing else "Directory empty or not created/found.")

# Final memory display (rendered in the sidebar via the module-level placeholder).
display_memory_usage()