Spaces:

garyuzair
/

Personal

Runtime error

App Files Files Community

garyuzair committed on May 9, 2025

Commit

87d325d

verified ·

1 Parent(s): 682bf09

Create app.py

Browse files

Files changed (1) hide show

app.py +548 -0

app.py ADDED Viewed

	@@ -0,0 +1,548 @@

+import os
+import gc
+import torch
+import streamlit as st
+import tempfile
+import json
+import subprocess
+import shutil
+from datetime import datetime
+from io import BytesIO
+import random
+from PIL import Image
+# --- Hugging Face Model Libraries (Local Models) ---
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from parler_tts import ParlerTTSForConditionalGeneration
+# --- Google Generative AI (Gemini API) ---
+try:
+    from google import generativeai as genai
+    from google.generativeai import types as genai_types # For GenerateContentConfig
+    google_gemini_sdk_available = True
+except ImportError:
+    google_gemini_sdk_available = False
+    # Error will be handled in UI
+# --- Application Configuration ---
+st.set_page_config(layout="wide", page_title="🌟 AI POV Story Weaver v2")
+# --- Model IDs ---
+SCRIPT_LLM_MODEL_ID = "openai-community/gpt2-medium" # Stand-in for "Tinglama"
+TTS_MODEL_ID = "parler-tts/parler-tts-mini-v1.1"
+# --- Gemini API Configuration (from Streamlit Secrets) ---
+GEMINI_API_KEY = st.secrets.get("GEMINI_API_KEY")
+GEMINI_IMAGE_MODEL_ID = st.secrets.get("GEMINI_IMAGE_MODEL_ID") # User's specified model
+# --- Hugging Face Cache ---
+CACHE_DIR = os.path.join(tempfile.gettempdir(), "hf_cache_story_weaver_v2")
+os.makedirs(CACHE_DIR, exist_ok=True)
+# (Setting environment variables for HF cache)
+os.environ['HUGGINGFACE_HUB_CACHE'] = CACHE_DIR
+os.environ['HF_HOME'] = CACHE_DIR
+os.environ['TRANSFORMERS_CACHE'] = CACHE_DIR
+# --- Session State Initialization ---
+if 'run_id' not in st.session_state:
+    st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")
+if 'generated_data' not in st.session_state:
+    st.session_state.generated_data = None
+if 'temp_base_dir' not in st.session_state:
+    st.session_state.temp_base_dir = None
+# --- Utility Functions (largely same as before) ---
+def get_session_temp_dir():
+    if st.session_state.temp_base_dir and os.path.exists(st.session_state.temp_base_dir):
+        return st.session_state.temp_base_dir
+    base_dir = os.path.join(tempfile.gettempdir(), f"story_weaver_v2_run_{st.session_state.run_id}")
+    os.makedirs(base_dir, exist_ok=True)
+    st.session_state.temp_base_dir = base_dir
+    return base_dir
+def cleanup_temp_files():
+    path_to_clean = st.session_state.get("temp_base_dir")
+    if path_to_clean and os.path.exists(path_to_clean):
+        try:
+            shutil.rmtree(path_to_clean)
+            st.session_state.temp_base_dir = None
+        except Exception as e:
+            st.warning(f"Warning: Could not clean up temp dir {path_to_clean}: {e}")
+def clear_gpu_cache():
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+# --- Model Loading Functions (Cached) ---
+@st.cache_resource
+def load_script_llm_resources(model_id):
+    st.write(f"Loading LLM for script generation: {model_id}...")
+    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
+    model = AutoModelForCausalLM.from_pretrained(
+        model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto", cache_dir=CACHE_DIR
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token
+        model.config.pad_token_id = model.config.eos_token_id
+    st.write("LLM for script generation loaded.")
+    return model, tokenizer
+@st.cache_resource
+def load_tts_resources(model_id):
+    st.write(f"Loading TTS model: {model_id}...")
+    model = ParlerTTSForConditionalGeneration.from_pretrained(
+        model_id, torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+        device_map="auto", cache_dir=CACHE_DIR
+    )
+    tokenizer = AutoTokenizer.from_pretrained(model_id, cache_dir=CACHE_DIR)
+    st.write("TTS model loaded.")
+    return model, tokenizer
+@st.cache_resource
+def get_gemini_sdk_client(_api_key): # Argument to help Streamlit caching
+    """
+    Returns a Gemini SDK client instance if SDK is available and API key is provided.
+    This uses the client pattern from the user's snippet.
+    """
+    if not google_gemini_sdk_available:
+        st.error("Google Generative AI SDK (`google-generativeai`) is not installed.")
+        return None
+    if not _api_key:
+        st.error("GEMINI_API_KEY not found in secrets.toml.")
+        return None
+    try:
+        # According to user's snippet, they instantiate client directly with API key
+        client = genai.Client(api_key=_api_key)
+        st.write("Gemini SDK Client initialized.")
+        return client
+    except Exception as e:
+        st.error(f"Error initializing Gemini SDK Client: {e}")
+        return None
+# --- Core Generation Functions ---
+def generate_story_and_prompts(main_pov_prompt: str, num_scenes: int):
+    st.info(f"Generating story and image prompts for '{main_pov_prompt}'...")
+    model, tokenizer = load_script_llm_resources(SCRIPT_LLM_MODEL_ID)
+    # --- Enhanced Prompt for Script LLM ---
+    structured_prompt = f"""
+You are an expert visual storyteller and AI prompt engineer. Your task is to generate a multi-scene story based on a user's Point-of-View (POV) prompt.
+The story must be divided into exactly {num_scenes} distinct scenes.
+For each scene, you must provide:
+1.  "scene_number": An integer representing the scene order (e.g., 1, 2, ...).
+2.  "scene_narration": A short paragraph (2-4 sentences, ~30-60 words). This narration should be from the first-person POV, be engaging, and suitable for text-to-speech. Convey emotion or atmosphere where appropriate.
+3.  "image_generation_prompt": A highly descriptive and creative prompt (1-3 sentences, ~40-75 words) tailored for an advanced AI image generator like Gemini Flash. This prompt should generate a single, compelling image for the scene. Include:
+    *   **Subject & Action:** Clearly define the main subject(s) and what they are doing from the POV.
+    *   **Setting & Environment:** Describe the location, time of day, and key environmental details.
+    *   **Visual Style & Medium:** Suggest an artistic style (e.g., "photorealistic," "cinematic with dramatic lighting," "fantasy digital art," "impressionistic oil painting," "cyberpunk anime concept art," "vintage photograph").
+    *   **Camera View & Composition:** Specify camera angle if important (e.g., "first-person POV looking through a visor," "low-angle shot emphasizing scale," "close-up on a mysterious object," "wide establishing shot").
+    *   **Lighting & Color:** Describe the lighting conditions (e.g., "soft morning light," "neon glow," "moonlit night," "dramatic chiaroscuro") and dominant colors or color palette.
+    *   **Mood & Atmosphere:** Indicate the desired feeling (e.g., "mysterious and eerie," "hopeful and adventurous," "tense and suspenseful," "serene and peaceful").
+    *   **Key Details:** Mention any specific objects, textures, or elements crucial to the scene.
+User's main POV prompt: "{main_pov_prompt}"
+Output the result STRICTLY as a single JSON object. The JSON object should have a key "title" (a concise title derived from the main POV prompt) and a key "scenes" which is a list of scene objects. Each scene object must contain the keys "scene_number", "scene_narration", and "image_generation_prompt".
+Example of a single scene object within the "scenes" list:
+{{
+    "scene_number": 1,
+    "scene_narration": "My metallic fingers traced the glowing hieroglyphs on the alien console. A low hum resonated through the derelict starship, promising either discovery or doom.",
+    "image_generation_prompt": "First-person POV of a sleek, silver robotic hand touching intricate, glowing blue hieroglyphs on a dark, alien control panel. The background shows the dimly lit, derelict interior of a starship, with faint starlight filtering through a cracked viewport. Style: Cinematic sci-fi, photorealistic textures on the robot hand and console, mysterious and suspenseful atmosphere. Focus on the interaction between hand and console."
+}}
+Begin JSON output now:
+```json
+"""
+    input_ids = tokenizer.encode(structured_prompt, return_tensors="pt").to(model.device)
+    estimated_output_tokens = num_scenes * 180 + 150 # Increased estimate for richer prompts
+    max_new_tokens = min(estimated_output_tokens, 1200) # Slightly increased cap
+    try:
+        output = model.generate(
+            input_ids, max_new_tokens=max_new_tokens, do_sample=True,
+            temperature=0.7, top_k=60, pad_token_id=tokenizer.eos_token_id,
+            eos_token_id=tokenizer.eos_token_id
+        )
+        result_text = tokenizer.decode(output, skip_special_tokens=True)
+        json_str_content = ""
+        # Try to extract JSON block, more robustly
+        if "```json" in result_text:
+            json_start_index = result_text.find("```json") + len("```json")
+            json_end_index = result_text.rfind("```")
+            if json_start_index != -1 and json_end_index != -1 and json_end_index > json_start_index:
+                json_str_content = result_text[json_start_index:json_end_index].strip()
+            else: # Fallback if ending ``` is missing or malformed
+                json_str_content = result_text[json_start_index:].strip()
+        else: # If no ```json marker, assume the relevant part starts with {
+            json_start_index = result_text.find("{")
+            if json_start_index != -1:
+                # Try to find matching braces, simple approach
+                # This is not a perfect JSON parser but a heuristic
+                open_braces = 0
+                potential_json_end = -1
+                for i, char in enumerate(result_text[json_start_index:]):
+                    if char == '{':
+                        open_braces += 1
+                    elif char == '}':
+                        open_braces -= 1
+                        if open_braces == 0:
+                            potential_json_end = json_start_index + i + 1
+                            break
+                if potential_json_end != -1:
+                    json_str_content = result_text[json_start_index:potential_json_end]
+                else: # Could not find balanced braces, take a guess
+                    json_str_content = result_text[json_start_index:]
+        if not json_str_content:
+            st.error("LLM did not produce detectable JSON content.")
+            st.text_area("LLM Full Raw Output:", result_text, height=300)
+            return None
+        try:
+            parsed_json = json.loads(json_str_content)
+        except json.JSONDecodeError as e:
+            st.error(f"LLM output JSON parsing error: {e}")
+            st.text_area("Attempted JSON content:", json_str_content, height=200)
+            st.text_area("LLM Full Raw Output (for debugging):", result_text, height=300)
+            return None
+        if not isinstance(parsed_json, dict) or "scenes" not in parsed_json or not isinstance(parsed_json["scenes"], list):
+            st.error("LLM output JSON structure is not as expected (missing 'scenes' list or not a dict).")
+            st.json(parsed_json)
+            return None
+        if len(parsed_json["scenes"]) != num_scenes:
+            st.warning(f"LLM generated {len(parsed_json['scenes'])} scenes, but {num_scenes} were requested. Adjusting...")
+            parsed_json["scenes"] = parsed_json["scenes"][:num_scenes]
+            while len(parsed_json["scenes"]) < num_scenes: # Pad if too few (basic)
+                parsed_json["scenes"].append({
+                    "scene_number": len(parsed_json["scenes"]) + 1,
+                    "scene_narration": "Error: Scene data missing from LLM.",
+                    "image_generation_prompt": "Error: Image prompt missing from LLM."
+                })
+        st.success("Story and image prompts generated successfully!")
+        return parsed_json
+    except Exception as e:
+        st.error(f"Error during LLM story/prompt generation: {e}")
+        st.text_area("LLM Full Raw Output (on exception):", result_text if 'result_text' in locals() else "N/A", height=300)
+        return None
+    finally:
+        del model; del tokenizer; clear_gpu_cache()
+def generate_images_via_gemini(story_data):
+    st.info("Generating images with Gemini API...")
+    sdk_client = get_gemini_sdk_client(GEMINI_API_KEY) # Use the new client getter
+    if not sdk_client:
+        st.error("Gemini SDK Client not initialized. Cannot generate images.")
+        return None
+    if not GEMINI_IMAGE_MODEL_ID:
+        st.error("`GEMINI_IMAGE_MODEL_ID` is not set in secrets.toml. Cannot generate images.")
+        return None
+    st.markdown(f"**Using Gemini Model for Images:** `{GEMINI_IMAGE_MODEL_ID}`")
+    st.warning(f"""
+        **Note on Image Generation with `{GEMINI_IMAGE_MODEL_ID}`:**
+        - This uses your specified model and API call structure.
+        - Image characteristics (size, style nuances) are determined by this model.
+        - The 'Seed' input from the UI is not directly used in this specific Gemini API call structure.
+    """)
+    images_pil = []
+    for i, scene_obj in enumerate(story_data["scenes"]):
+        image_prompt_text = scene_obj.get("image_generation_prompt", "A beautiful, abstract scene.") # Fallback
+        scene_num = scene_obj.get("scene_number", i + 1)
+        st.write(f"Requesting image for Scene {scene_num} with prompt: \"{image_prompt_text[:150]}...\"")
+        try:
+            # --- Using user's specified Gemini calling convention ---
+            response = sdk_client.models.generate_content(
+                model=GEMINI_IMAGE_MODEL_ID, # Model name passed here
+                contents=[image_prompt_text], # The prompt for the image
+                # As per user's snippet, config might be needed by their specific model endpoint
+                generation_config=genai_types.GenerateContentConfig(
+                    # response_modalities=["TEXT", "IMAGE"] # This was in user's example for GenerateContentConfig
+                    # However, GenerateContentConfig does not have response_modalities.
+                    # If the user's model requires this, it might be an older/internal SDK version or custom handling.
+                    # For safety with public SDK, I will omit it unless specified it's for GenerateContentConfig.
+                    # If it's for the top-level call, it would be different.
+                    # The user's snippet has it under 'config=', implying it's for GenerateContentConfig.
+                    # Let's try to include it if the types allow, otherwise this might error with public SDK.
+                    # Upon checking google.generativeai.types.GenerationConfig, it does not have `response_modalities`.
+                    # The user's example had `config=types.GenerateContentConfig(response_modalities=["TEXT", "IMAGE"])`
+                    # This structure means `response_modalities` is an argument to `GenerateContentConfig`.
+                    # If their preview SDK `types.GenerateContentConfig` accepts it, this will work.
+                    # Otherwise, this line will be an error with the public SDK.
+                    # For now, I will try to pass it as they specified, assuming their SDK version is different.
+                    # **Update:** Based on their snippet, `response_modalities` seems to be part of `GenerateContentConfig`.
+                    # However, standard `google.generativeai.types.GenerateContentConfig` doesn't list it.
+                    # The `generate_content` method itself in `Model` class can take `request_options` which includes `response_mime_types`.
+                    # The most robust way if `response_modalities` is not a standard config param,
+                    # would be to rely on the model type to produce an image, or use `response_mime_type` if the model supports it.
+                    # Given the user's code snippet, I'll include it as they had it, assuming their types.py is different.
+                    # THIS IS A POTENTIAL POINT OF FAILURE IF USING STANDARD PUBLIC SDK.
+                    **({"response_modalities": ["TEXT", "IMAGE"]} if hasattr(genai_types.GenerateContentConfig(), 'response_modalities') else {})
+                    # The above line is a Poka-yoke to attempt to add it only if the attribute exists.
+                    # A simpler approach is to just try what they gave:
+                    # response_modalities = ["TEXT", "IMAGE"] # This would go into GenerateContentConfig
+                    # This is very specific to their stated "working code"
+                ),
+                # The `config` argument in `client.models.generate_content` maps to `generation_config` for the Model service.
+                # The API may also have `tool_config` and `safety_settings`.
+            )
+            generated_image = None
+            response_text_parts = []
+            if hasattr(response, 'parts') and response.parts:
+                for part in response.parts:
+                    if hasattr(part, 'text') and part.text:
+                        response_text_parts.append(part.text)
+                    if hasattr(part, 'mime_type') and part.mime_type and part.mime_type.startswith("image/"):
+                        if hasattr(part, 'inline_data') and hasattr(part.inline_data, 'data'):
+                            image_bytes = part.inline_data.data
+                            generated_image = Image.open(BytesIO(image_bytes))
+                            st.success(f"Image for Scene {scene_num} received from Gemini.")
+                            break # Found an image
+            elif hasattr(response, 'text') and not generated_image: # If no parts but has text (error or text-only response)
+                 response_text_parts.append(response.text)
+            if generated_image:
+                images_pil.append(generated_image)
+                if response_text_parts:
+                    st.caption(f"Accompanying text from Gemini for Scene {scene_num}: {' '.join(response_text_parts)}")
+            else:
+                st.warning(f"No image data explicitly found from Gemini for Scene {scene_num}.")
+                if response_text_parts:
+                    st.text_area(f"Gemini Text Response (Scene {scene_num}):", value=' '.join(response_text_parts), height=100)
+                else:
+                    st.text(f"Raw Gemini Response (Scene {scene_num}): {response}") # Log raw response if no image
+                if hasattr(response, 'prompt_feedback') and response.prompt_feedback:
+                     st.warning(f"Prompt Feedback for scene {scene_num}: {response.prompt_feedback}")
+                images_pil.append(None)
+        except Exception as e:
+            st.error(f"Error generating image for Scene {scene_num} with Gemini: {e}")
+            st.error(f"Model used: {GEMINI_IMAGE_MODEL_ID}. Prompt: '{image_prompt_text[:100]}...'")
+            if "API key not valid" in str(e) or "PERMISSION_DENIED" in str(e):
+                st.error("Gemini API Key error. Check your key and its permissions for this model.")
+            elif "Could not find model" in str(e) or "MODEL_NAME_INVALID" in str(e):
+                st.error(f"Gemini Model '{GEMINI_IMAGE_MODEL_ID}' not found or invalid. Verify the model name.")
+            elif "response_modalities" in str(e):
+                 st.error("The `response_modalities` config might not be supported by your version of `google.generativeai.types.GenerateContentConfig` or the model endpoint. This part of the code is based on your provided 'working snippet'.")
+            images_pil.append(None)
+    if not any(images_pil):
+        st.error("No images were successfully generated by Gemini.")
+        return None
+    st.success("Image generation step completed.")
+    return images_pil
+def generate_audio_narrations(story_data):
+    st.info("Generating audio narrations with ParlerTTS...")
+    tts_model, tts_tokenizer = load_tts_resources(TTS_MODEL_ID)
+    audio_dir = os.path.join(get_session_temp_dir(), "audio_files")
+    os.makedirs(audio_dir, exist_ok=True)
+    audio_file_paths = []
+    description = "A clear and engaging narrator tells a story with enthusiasm."
+    for i, scene_obj in enumerate(story_data["scenes"]):
+        narration_text = scene_obj.get("scene_narration", "Narration unavailable.")
+        scene_num = scene_obj.get("scene_number", i + 1)
+        st.write(f"Generating audio for Scene {scene_num}...")
+        try:
+            input_ids = tts_tokenizer(description, return_tensors="pt").input_ids.to(tts_model.device)
+            prompt_input_ids = tts_tokenizer(narration_text, return_tensors="pt").input_ids.to(tts_model.device)
+            generation = tts_model.generate(
+                input_ids=input_ids, prompt_input_ids=prompt_input_ids,
+                do_sample=True, temperature=0.7, # Slightly warmer for more expression
+                repetition_penalty=1.2, guidance_scale=3.0 # Experiment with ParlerTTS params
+            ).to(torch.float32)
+            audio_waveform = generation.cpu().numpy().squeeze()
+            file_path = os.path.join(audio_dir, f"s_{scene_num}_audio.wav")
+            sf.write(file_path, audio_waveform, tts_model.config.sampling_rate)
+            audio_file_paths.append(file_path)
+            st.success(f"Audio for Scene {scene_num} created.")
+        except Exception as e:
+            st.error(f"Audio error (Scene {scene_num}): {e}")
+            audio_file_paths.append(None)
+    del tts_model; del tts_tokenizer; clear_gpu_cache()
+    st.success("Audio narration step completed.")
+    return audio_file_paths
+def create_final_video(image_pil_objects, audio_paths):
+    # (This function remains largely the same as v1, ensure paths and checks are robust)
+    st.info("Creating final video...")
+    if not image_pil_objects or not audio_paths or len(image_pil_objects) != len(audio_paths):
+        st.error("Asset mismatch for video. Cannot create."); return None
+    try: subprocess.run(["ffmpeg", "-version"], capture_output=True, check=True)
+    except: st.error("FFMPEG not found."); return None
+    temp_image_dir = os.path.join(get_session_temp_dir(), "vid_frames")
+    os.makedirs(temp_image_dir, exist_ok=True)
+    img_paths_for_vid = []
+    for idx, img_pil in enumerate(image_pil_objects):
+        if img_pil:
+            p = os.path.join(temp_image_dir, f"f_{idx:03d}.png"); img_pil.save(p)
+            img_paths_for_vid.append(p)
+        else: img_paths_for_vid.append(None)
+    temp_clips_dir = os.path.join(get_session_temp_dir(), "temp_vid_clips")
+    os.makedirs(temp_clips_dir, exist_ok=True)
+    vid_clip_paths, valid_clips = [], 0
+    for i, (img_p, aud_p) in enumerate(zip(img_paths_for_vid, audio_paths)):
+        s_num = i + 1
+        if not (img_p and aud_p): st.warning(f"Skipping Scene {s_num} in video (missing asset)."); continue
+        try:
+            aud_info = sf.info(aud_p); aud_dur = aud_info.duration
+            if aud_dur < 0.5: aud_dur = 0.5 # Min clip duration
+            clip_out_p = os.path.join(temp_clips_dir, f"c_{s_num:03d}.mp4")
+            cmd = [
+                "ffmpeg", "-y", "-loop", "1", "-i", img_p, "-i", aud_p,
+                "-c:v", "libx264", "-preset", "fast", "-tune", "stillimage", # Faster preset
+                "-vf", "scale=1280:720:force_original_aspect_ratio=decrease,pad=1280:720:(ow-iw)/2:(oh-ih)/2,setsar=1", # Scale and pad to 720p
+                "-c:a", "aac", "-b:a", "192k", "-pix_fmt", "yuv420p",
+                "-t", str(aud_dur), "-shortest", clip_out_p
+            ]
+            res = subprocess.run(cmd, capture_output=True, text=True)
+            if res.returncode != 0: st.error(f"FFMPEG clip error (S{s_num}):\n{res.stderr}"); continue
+            vid_clip_paths.append(clip_out_p); valid_clips +=1
+            st.write(f"Video clip for Scene {s_num} processed.")
+        except Exception as e: st.error(f"Video processing error (S{s_num}): {e}")
+    if not vid_clip_paths or valid_clips == 0: st.error("No valid video clips. Cannot create final video."); return None
+    concat_list_f = os.path.join(temp_clips_dir, "concat_list.txt")
+    with open(concat_list_f, "w") as f:
+        for clip_p in vid_clip_paths: f.write(f"file '{os.path.basename(clip_p)}'\n")
+    final_vid_out_p = os.path.join(get_session_temp_dir(), "final_story_video_720p.mp4")
+    concat_cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", concat_list_f, "-c", "copy", final_vid_out_p]
+    st.write("Concatenating video clips...")
+    res = subprocess.run(concat_cmd, capture_output=True, text=True, cwd=temp_clips_dir)
+    if res.returncode != 0: st.error(f"FFMPEG concat error:\n{res.stderr}"); return None
+    st.success("Final video created successfully!"); return final_vid_out_p
+# --- Streamlit UI: page header, then the sidebar with story inputs and the generate / reset buttons ---
+st.title("🌟 AI POV Story Weaver v2 ✨")
+st.markdown("Craft unique POV stories with refined AI-generated scripts, Gemini images, voiceovers, and a final video!")
+st.markdown("---")
+with st.sidebar:
+    st.header("🛠️ Story Configuration")
+    user_main_prompt = st.text_area(
+        "Enter your main POV story idea:",
+        st.session_state.get("user_main_prompt_val", "POV: I'm a lone astronaut discovering an ancient, bioluminescent forest on a new planet."),  # last submitted prompt, else a sample idea
+        height=120, key="main_prompt_input_v2"
+    )
+    num_scenes_input = st.slider(
+        "Number of Scenes:", 1, 4, # Max 4 for resource management
+        st.session_state.get("num_scenes_input_val", 2), key="num_scenes_slider_v2"
+    )
+    st.caption(f"Script LLM: `{SCRIPT_LLM_MODEL_ID}`")
+    st.caption(f"TTS Model: `{TTS_MODEL_ID}`")
+    st.caption(f"Image Model (Gemini): `{GEMINI_IMAGE_MODEL_ID}` (from secrets)")
+    if not google_gemini_sdk_available: st.error("Google SDK missing (`pip install google-generativeai`)")
+    if not GEMINI_API_KEY: st.error("`GEMINI_API_KEY` not set in secrets.")
+    if not GEMINI_IMAGE_MODEL_ID or "your-gemini" in GEMINI_IMAGE_MODEL_ID or "flash-preview" not in GEMINI_IMAGE_MODEL_ID :  # unset, placeholder-looking, or lacking "flash-preview"
+         st.warning(f"Model ID '{GEMINI_IMAGE_MODEL_ID}' might be placeholder or not your specific preview model. Ensure it's correct in secrets for image generation.")
+    st.markdown("---")
+    can_generate = google_gemini_sdk_available and GEMINI_API_KEY and GEMINI_IMAGE_MODEL_ID  # truthy only when SDK, key and model ID are all present
+    if st.button("🎬 Weave My Story! (v2)", type="primary", use_container_width=True, disabled=not can_generate):
+        st.session_state.run_id = datetime.now().strftime("%Y%m%d_%H%M%S")  # fresh id used to name the next temp dir
+        cleanup_temp_files()  # removes the previous run's directory (tracked in temp_base_dir)
+        st.session_state.generated_data = {}
+        st.session_state.user_main_prompt_val = user_main_prompt
+        st.session_state.num_scenes_input_val = num_scenes_input
+        st.session_state.trigger_generation_v2 = True  # picked up by the main-area block below in this same run
+    st.markdown("---")
+    if st.button("🧹 Clear All & Reset", use_container_width=True):
+        cleanup_temp_files()
+        keys_to_clear = ['generated_data', 'trigger_generation_v2', 'user_main_prompt_val', 'num_scenes_input_val']
+        for key in keys_to_clear:
+            if key in st.session_state: del st.session_state[key]
+        st.cache_resource.clear()  # drops every @st.cache_resource entry, including the loaded models
+        st.success("Cleared temp files, state, and model cache. Reload on next run."); st.rerun()  # NOTE(review): the rerun follows immediately, so this message may never be visible — confirm
+# --- Main area: runs the script -> images -> audio -> video pipeline when the sidebar button set the trigger ---
+if st.session_state.get("trigger_generation_v2"):
+    with st.spinner("📜 Phase 1: Generating enhanced story script and image prompts..."):
+        story_json_data = generate_story_and_prompts(
+            st.session_state.user_main_prompt_val, st.session_state.num_scenes_input_val
+        )
+        st.session_state.generated_data['story_json'] = story_json_data  # stored even when None (generation failed)
+    if story_json_data:
+        # Each phase below only starts when the previous one produced output; the matching else branches report the failure.
+        st.header("📜 Generated Story & Image Prompts")
+        st.json(story_json_data)
+        # TODO: download button for story_json (placeholder, not implemented here)
+        st.markdown("---")
+        with st.spinner(f"🎨 Phase 2: Generating images with Gemini ({GEMINI_IMAGE_MODEL_ID})..."):
+            pil_images = generate_images_via_gemini(story_json_data)
+            st.session_state.generated_data['pil_images'] = pil_images
+        if pil_images and any(pil_images): # Check if list is not empty AND contains at least one image
+            st.header("🖼️ Generated Images")
+            # TODO: download buttons for pil_images (placeholder, not implemented here)
+            cols = st.columns(min(len(pil_images), 3))  # at most three columns
+            for i, img in enumerate(pil_images):
+                if img:
+                    with cols[i % len(cols)]:  # wrap around the available columns
+                        st.image(img, caption=f"Scene {story_json_data['scenes'][i].get('scene_number', i+1)}")
+                        # TODO: per-image download button (placeholder, not implemented here)
+            st.markdown("---")
+            with st.spinner("🔊 Phase 3: Generating audio narrations..."):
+                audio_paths = generate_audio_narrations(story_json_data)
+                st.session_state.generated_data['audio_paths'] = audio_paths
+            if audio_paths and any(audio_paths):
+                st.header("🎤 Generated Audio Narrations")
+                # TODO: audio players / download buttons for audio_paths (placeholder, not implemented here)
+                st.markdown("---")
+                if st.session_state.generated_data.get('pil_images') and st.session_state.generated_data.get('audio_paths'):
+                    with st.spinner("🎬 Final Phase: Weaving the video masterpiece..."):
+                        final_video_path = create_final_video(
+                            st.session_state.generated_data['pil_images'],
+                            st.session_state.generated_data['audio_paths']
+                        )
+                        st.session_state.generated_data['final_video_path'] = final_video_path
+                    if final_video_path:
+                        st.header("🎉 Your Story Video is Ready! 🎉")
+                        st.video(final_video_path)
+                        # TODO: download button for final_video_path (placeholder, not implemented here)
+                        st.balloons()
+                    else: st.error("Video creation failed. Check FFMPEG logs if any were shown.")
+                else: st.warning("Skipping video: not all images or audio were generated.")
+            else: st.error("Audio generation failed. Cannot proceed to video.")
+        else: st.error("Image generation failed (no images returned). Cannot proceed.")
+    else: st.error("Story script generation failed. Cannot proceed.")
+    st.session_state.trigger_generation_v2 = False # Reset trigger so later reruns do not regenerate
+elif not st.session_state.get("user_main_prompt_val"):
+    st.info("👋 Welcome to the AI Story Weaver v2! Configure your story in the sidebar and click 'Weave My Story!'")