rajux75 committed
Commit 45852d0 · verified · 1 Parent(s): 5f7ce0f

Update services/generation.py

Files changed (1)
  1. services/generation.py +153 -87
services/generation.py CHANGED
@@ -2,17 +2,17 @@
  import torch
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
  from PIL import Image
  import config
  from utils.helpers import decode_base64_image, encode_image_base64, encode_video_base64
  import logging
  import gc # Garbage collector
- from typing import List
- from diffusers import StableDiffusionPipeline, DiffusionPipeline, DPMSolverMultistepScheduler, LCMScheduler # Import LCMScheduler
- from peft import PeftConfig # Import PeftConfig (if needed, usually handled by load_lora_weights)

  logger = logging.getLogger(__name__)

-
  # --- Global Model Cache ---
  # Use a dictionary to hold loaded models and tokenizers
  # This allows loading them only once when the app starts.
@@ -21,53 +21,72 @@ model_cache = {}
  def load_models():
      """Loads all models into the cache. Called at application startup."""
      logger.info("Loading models...")
-     try:
-         # Text Generation Model
          logger.info(f"Loading text model: {config.TEXT_MODEL_NAME}")
          model_cache["text_tokenizer"] = AutoTokenizer.from_pretrained(config.TEXT_MODEL_NAME)
          model_cache["text_model"] = AutoModelForSeq2SeqLM.from_pretrained(config.TEXT_MODEL_NAME).to(config.DEVICE)
          logger.info("Text model loaded.")

-         # --- Image Generation Model ---
-         logger.info(f"Loading image model: {config.IMAGE_MODEL_NAME}")
-         image_pipeline = StableDiffusionPipeline.from_pretrained(
-             config.IMAGE_MODEL_NAME,
-             torch_dtype=config.DTYPE
-         )
-
-         # --- Load LCM LoRA ---
-         try:
-             logger.info(f"Loading LCM LoRA: {config.IMAGE_LCM_LORA_NAME}")
-             # Load LoRA weights directly into the pipeline
-             image_pipeline.load_lora_weights(config.IMAGE_LCM_LORA_NAME)
-             # Fuse LoRA for potential speedup (optional, test impact)
-             # image_pipeline.fuse_lora()
-             logger.info("LCM LoRA loaded successfully.")
-
-             # --- IMPORTANT: Set LCM Scheduler ---
-             image_pipeline.scheduler = LCMScheduler.from_config(image_pipeline.scheduler.config)
-             logger.info("Switched scheduler to LCMScheduler.")
-
-         except Exception as e:
-             logger.warning(f"Could not load or apply LCM LoRA '{config.IMAGE_LCM_LORA_NAME}'. Falling back to base model scheduler. Error: {e}", exc_info=True)
-             # Fallback to a standard fast scheduler if LCM fails
              image_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(image_pipeline.scheduler.config)

-         image_pipeline = image_pipeline.to(config.DEVICE)
-         if config.DEVICE == "cuda":
-             try:
-                 # image_pipeline.enable_xformers_memory_efficient_attention()
-                 pass
-             except ImportError:
-                 logger.warning("xformers not installed...")
-             # image_pipeline.enable_attention_slicing()
-
-         model_cache["image_pipeline"] = image_pipeline
-         logger.info("Image model setup complete.")
-
-         # Video Generation Model
          logger.info(f"Loading video model: {config.VIDEO_MODEL_NAME}")
          video_pipeline = DiffusionPipeline.from_pretrained(
              config.VIDEO_MODEL_NAME,
@@ -75,19 +94,36 @@ def load_models():
              variant="fp16" if config.DTYPE == torch.float16 else None # Zeroscope often has fp16 variants
          )
          video_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(video_pipeline.scheduler.config)
-         video_pipeline.enable_model_cpu_offload() # Crucial for low VRAM environments like Spaces CPU/T4
-         # video_pipeline = video_pipeline.to(config.DEVICE) # CPU offload handles device placement
-
          model_cache["video_pipeline"] = video_pipeline
-         logger.info("Video model loaded.")

-     except Exception as e:
-         logger.error(f"Error loading models: {e}", exc_info=True)
-         # Depending on policy, you might want to raise the exception
-         # or allow the app to start with missing models (endpoints will fail)
-         raise # Reraise to prevent app start if essential models fail

-     logger.info("All models loaded successfully.")

  def generate_ideas_sync(prompt: str, max_length: int, num_ideas: int) -> List[str]:
@@ -95,33 +131,41 @@ def generate_ideas_sync(prompt: str, max_length: int, num_ideas: int) -> List[st
      tokenizer = model_cache.get("text_tokenizer")
      model = model_cache.get("text_model")
      if not tokenizer or not model:
-         raise RuntimeError("Text model not loaded.")

-     # Adjust prompt slightly for better instruction following if needed (e.g., for Flan-T5)
-     # input_text = f"Generate {num_ideas} content ideas about: {prompt}"
      input_text = prompt # Keep original prompt based on request model

-     inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(config.DEVICE) # Max input length for model
-
-     # Generation parameters
-     outputs = model.generate(
-         **inputs,
-         max_length=max_length,
-         num_return_sequences=num_ideas,
-         do_sample=True, # Use sampling for more diverse ideas
-         temperature=0.8,
-         top_k=50,
-         top_p=0.95,
-         no_repeat_ngram_size=2 # Avoid repetitive phrases
-     )
-
-     ideas = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
-     # Clean up GPU memory if applicable
-     del inputs
-     del outputs
-     if config.DEVICE == "cuda":
-         torch.cuda.empty_cache()
-     gc.collect()
      return ideas

@@ -129,7 +173,13 @@ def generate_image_sync(prompt: str, negative_prompt: str | None, height: int, w
      """Synchronous function for image generation (run in thread pool)."""
      pipeline = model_cache.get("image_pipeline")
      if not pipeline:
-         raise RuntimeError("Image pipeline not loaded.")

      try:
          with torch.no_grad(): # Conserve memory during inference
@@ -143,15 +193,19 @@ def generate_image_sync(prompt: str, negative_prompt: str | None, height: int, w
                  # generator=torch.Generator(device=config.DEVICE).manual_seed(seed) # Optional: for reproducibility
              )
              image: Image.Image = result.images[0]

              # Encode image to base64
              image_base64 = encode_image_base64(image, format="PNG")

      finally:
          # Clean up GPU memory if applicable
          if config.DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()

      return image_base64

@@ -165,18 +219,26 @@ def generate_video_sync(
      fps: int,
      num_inference_steps: int,
      guidance_scale: float
- ) -> tuple[str, str]:
      """Synchronous function for video generation (run in thread pool)."""
      pipeline = model_cache.get("video_pipeline")
      if not pipeline:
-         raise RuntimeError("Video pipeline not loaded.")

-     input_image = decode_base64_image(image_base64)

      try:
          with torch.no_grad():
-             # CPU offload handles device placement, no need for explicit .to(config.DEVICE)
-             video_frames = pipeline(
                  input_image,
                  prompt=prompt, # Zeroscope uses prompt less directly, more for style maybe
                  num_inference_steps=num_inference_steps,
@@ -187,21 +249,25 @@ def generate_video_sync(
                  motion_bucket_id=motion_bucket_id,
                  noise_aug_strength=noise_aug_strength
              ).frames[0] # Output is often nested [[frame1, frame2...]]

          # video_frames is usually List[PIL.Image], convert to numpy for encoding
-         video_frames_np = [np.array(frame) for frame in video_frames]

          # Encode video to base64
          video_base64, actual_format = encode_video_base64(video_frames_np, fps=fps, format="MP4") # Request MP4, helper handles fallback

      finally:
          # Clean up GPU/CPU memory
          # Offloading handles VRAM well, but ensure general RAM is freed
          del input_image
-         del video_frames
-         del video_frames_np
          if config.DEVICE == "cuda":
              torch.cuda.empty_cache() # Still good practice
          gc.collect()

      return video_base64, actual_format
 
services/generation.py (updated file after this commit):

  import torch
  from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
  from PIL import Image
+ import numpy as np # Added import for numpy array conversion later
  import config
  from utils.helpers import decode_base64_image, encode_image_base64, encode_video_base64
  import logging
  import gc # Garbage collector
+ from typing import List, Tuple # Added Tuple for generate_video_sync return type hint
+ from diffusers import StableDiffusionPipeline, DiffusionPipeline, DPMSolverMultistepScheduler, LCMScheduler
+ # from peft import PeftConfig # Usually not needed directly if using load_lora_weights

  logger = logging.getLogger(__name__)

  # --- Global Model Cache ---
  # Use a dictionary to hold loaded models and tokenizers
  # This allows loading them only once when the app starts.
 
  def load_models():
      """Loads all models into the cache. Called at application startup."""
      logger.info("Loading models...")
+     try: # <<<--- Start of the MAIN try block for all models ---<<<
+
+         # --- Text Generation Model ---
          logger.info(f"Loading text model: {config.TEXT_MODEL_NAME}")
          model_cache["text_tokenizer"] = AutoTokenizer.from_pretrained(config.TEXT_MODEL_NAME)
          model_cache["text_model"] = AutoModelForSeq2SeqLM.from_pretrained(config.TEXT_MODEL_NAME).to(config.DEVICE)
          logger.info("Text model loaded.")

+         # --- Image Generation Model (Base) ---
+         logger.info(f"Loading image model: {config.IMAGE_MODEL_NAME}")
+         image_pipeline = StableDiffusionPipeline.from_pretrained(
+             config.IMAGE_MODEL_NAME,
+             torch_dtype=config.DTYPE
+         )
+         # Default scheduler (will be potentially overridden by LCM)
          image_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(image_pipeline.scheduler.config)
+         logger.info("Image base pipeline loaded. Default scheduler: DPMSolverMultistepScheduler.")
+
+         # --- Attempt to Load LCM LoRA (Optional Speedup) ---
+         # Check if IMAGE_LCM_LORA_NAME is defined and not empty in config
+         lcm_lora_name = getattr(config, 'IMAGE_LCM_LORA_NAME', None) # Safely get LoRA name
+         if lcm_lora_name:
+             try:
+                 logger.info(f"Attempting to load LCM LoRA: {lcm_lora_name}")
+                 # Load LoRA weights directly into the pipeline
+                 image_pipeline.load_lora_weights(lcm_lora_name)
+                 # Fuse LoRA for potential speedup (optional, test impact)
+                 # image_pipeline.fuse_lora()
+                 logger.info("LCM LoRA loaded successfully.")
+
+                 # IMPORTANT: Set LCM Scheduler *only if* LoRA loaded successfully
+                 image_pipeline.scheduler = LCMScheduler.from_config(image_pipeline.scheduler.config)
+                 logger.info("Switched scheduler to LCMScheduler.")
+
+             except Exception as e:
+                 logger.warning(f"Could not load or apply LCM LoRA '{lcm_lora_name}'. Using default scheduler. Error: {e}", exc_info=True)
+                 # Scheduler already set to DPMSolverMultistepScheduler above, so no action needed here
+         else:
+             logger.info("No IMAGE_LCM_LORA_NAME configured in environment/config. Using default scheduler.")
+
+         # --- Image Pipeline Device Placement and Optimizations ---
+         image_pipeline = image_pipeline.to(config.DEVICE)
+         logger.info(f"Image pipeline moved to device: {config.DEVICE}")

+         if config.DEVICE == "cuda":
+             # Optional: Enable memory efficient attention mechanisms if GPU available and libs installed
+             try:
+                 # Requires: pip install xformers
+                 # image_pipeline.enable_xformers_memory_efficient_attention()
+                 # logger.info("Enabled xformers memory efficient attention.")
+                 pass # Keep commented out if xformers not installed/intended
+             except ImportError:
+                 logger.warning("xformers not installed or enabled. Consider installing for potential memory savings on GPU.")
+                 # Fallback option if xformers is not available
+                 # try:
+                 #     image_pipeline.enable_attention_slicing()
+                 #     logger.info("Enabled attention slicing.")
+                 # except Exception as attn_slice_e:
+                 #     logger.warning(f"Could not enable attention slicing: {attn_slice_e}")
+
+         # --- Store Image Pipeline in Cache ---
+         model_cache["image_pipeline"] = image_pipeline
+         logger.info("Image model setup complete and cached.")
+
+         # --- Video Generation Model ---
          logger.info(f"Loading video model: {config.VIDEO_MODEL_NAME}")
          video_pipeline = DiffusionPipeline.from_pretrained(
              config.VIDEO_MODEL_NAME,

              variant="fp16" if config.DTYPE == torch.float16 else None # Zeroscope often has fp16 variants
          )
          video_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(video_pipeline.scheduler.config)
+         logger.info("Video pipeline loaded. Scheduler: DPMSolverMultistepScheduler.")
+
+         # Enable CPU offloading *before* potentially moving parts to GPU if not offloading everything
+         # This is crucial for fitting larger models in limited VRAM/RAM.
+         try:
+             video_pipeline.enable_model_cpu_offload()
+             logger.info("Enabled model CPU offload for video pipeline.")
+         except AttributeError:
+             logger.warning("Video pipeline class may not support enable_model_cpu_offload(). Attempting to move entire model to device.")
+             # Fallback if offload method isn't available on this specific pipeline class
+             try:
+                 video_pipeline = video_pipeline.to(config.DEVICE)
+                 logger.info(f"Video pipeline moved to device: {config.DEVICE}")
+             except Exception as move_err:
+                 logger.error(f"Failed to move video pipeline to device {config.DEVICE}: {move_err}", exc_info=True)
+                 # Decide if you want to raise here or let it fail later
+                 # raise
+
+         # Store video pipeline in cache
          model_cache["video_pipeline"] = video_pipeline
+         logger.info("Video model setup complete and cached.")

+         # --- Success Message ---
+         logger.info("All configured models loaded successfully.") # Runs only if all steps above succeed

+     except Exception as e: # <<<--- Catches errors from ANY model loading step ---<<<
+         logger.error(f"FATAL: Error loading one or more models during startup: {e}", exc_info=True)
+         # Re-raise the exception to prevent the application from starting
+         # in a state where essential models are missing.
+         raise

  def generate_ideas_sync(prompt: str, max_length: int, num_ideas: int) -> List[str]:
 
      tokenizer = model_cache.get("text_tokenizer")
      model = model_cache.get("text_model")
      if not tokenizer or not model:
+         # This should ideally not happen if load_models raises on failure
+         logger.error("Attempted to generate ideas but text model/tokenizer not found in cache.")
+         raise RuntimeError("Text model not loaded or available.")

+     logger.debug(f"Generating ideas for prompt: '{prompt}'")

      input_text = prompt # Keep original prompt based on request model

+     try:
+         inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(config.DEVICE) # Max input length for model
+
+         # Generation parameters
+         with torch.no_grad(): # Ensure no gradients are computed
+             outputs = model.generate(
+                 **inputs,
+                 max_length=max_length,
+                 num_return_sequences=num_ideas,
+                 do_sample=True, # Use sampling for more diverse ideas
+                 temperature=0.8,
+                 top_k=50,
+                 top_p=0.95,
+                 no_repeat_ngram_size=2 # Avoid repetitive phrases
+             )
+
+         ideas = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
+         logger.debug(f"Generated {len(ideas)} ideas.")
+
+     finally:
+         # Clean up GPU memory if applicable
+         del inputs
+         del outputs
+         if config.DEVICE == "cuda":
+             torch.cuda.empty_cache()
+         gc.collect()
+         logger.debug("Cleaned up resources after idea generation.")
+
      return ideas


      """Synchronous function for image generation (run in thread pool)."""
      pipeline = model_cache.get("image_pipeline")
      if not pipeline:
+         logger.error("Attempted to generate image but image pipeline not found in cache.")
+         raise RuntimeError("Image pipeline not loaded or available.")
+
+     logger.debug(f"Generating image for prompt: '{prompt}'")
+     # Note: If using LCM, optimal steps are much lower (e.g., 4-8) and guidance might be 0 or 1.
+     # Consider adding logic here or in the API route to adjust params if LCM is active.
+     # For now, it uses the user-provided parameters.

      try:
          with torch.no_grad(): # Conserve memory during inference

                  # generator=torch.Generator(device=config.DEVICE).manual_seed(seed) # Optional: for reproducibility
              )
              image: Image.Image = result.images[0]
+             logger.debug("Image generation complete.")

              # Encode image to base64
              image_base64 = encode_image_base64(image, format="PNG")
+             logger.debug("Image encoded to base64.")

      finally:
          # Clean up GPU memory if applicable
+         # pipeline object itself is persistent in cache, don't delete it
          if config.DEVICE == "cuda":
              torch.cuda.empty_cache()
          gc.collect()
+         logger.debug("Cleaned up resources after image generation.")

      return image_base64

 
      fps: int,
      num_inference_steps: int,
      guidance_scale: float
+ ) -> Tuple[str, str]: # Corrected return type hint
      """Synchronous function for video generation (run in thread pool)."""
      pipeline = model_cache.get("video_pipeline")
      if not pipeline:
+         logger.error("Attempted to generate video but video pipeline not found in cache.")
+         raise RuntimeError("Video pipeline not loaded or available.")
+
+     logger.debug("Decoding base64 input image for video generation.")
+     try:
+         input_image = decode_base64_image(image_base64)
+     except Exception as decode_err:
+         logger.error(f"Failed to decode base64 image: {decode_err}", exc_info=True)
+         raise ValueError("Invalid base64 input image.") from decode_err

+     logger.debug(f"Generating video from image, frames={num_frames}, fps={fps}")

      try:
          with torch.no_grad():
+             # CPU offload handles device placement if enabled during load_models
+             video_frames_pil = pipeline(
                  input_image,
                  prompt=prompt, # Zeroscope uses prompt less directly, more for style maybe
                  num_inference_steps=num_inference_steps,

                  motion_bucket_id=motion_bucket_id,
                  noise_aug_strength=noise_aug_strength
              ).frames[0] # Output is often nested [[frame1, frame2...]]
+             logger.debug("Video frame generation complete.")

          # video_frames is usually List[PIL.Image], convert to numpy for encoding
+         video_frames_np = [np.array(frame) for frame in video_frames_pil]
+         logger.debug("Converted video frames to NumPy arrays.")

          # Encode video to base64
          video_base64, actual_format = encode_video_base64(video_frames_np, fps=fps, format="MP4") # Request MP4, helper handles fallback
+         logger.debug(f"Video encoded to base64 with format: {actual_format}")

      finally:
          # Clean up GPU/CPU memory
          # Offloading handles VRAM well, but ensure general RAM is freed
          del input_image
+         if 'video_frames_pil' in locals(): del video_frames_pil
+         if 'video_frames_np' in locals(): del video_frames_np
          if config.DEVICE == "cuda":
              torch.cuda.empty_cache() # Still good practice
          gc.collect()
+         logger.debug("Cleaned up resources after video generation.")

      return video_base64, actual_format
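
Review note: load_models() reads TEXT_MODEL_NAME, IMAGE_MODEL_NAME, VIDEO_MODEL_NAME, DEVICE, DTYPE and, via getattr(), the optional IMAGE_LCM_LORA_NAME from the config module, which this commit does not show. A minimal sketch of what that module might contain; the attribute names match what generation.py reads, but every concrete model ID below is an assumption, not part of this commit:

    # config.py -- hypothetical sketch; only the attribute names are taken from the diff.
    import torch

    TEXT_MODEL_NAME = "google/flan-t5-base"                     # assumed seq2seq model (diff comments mention Flan-T5)
    IMAGE_MODEL_NAME = "runwayml/stable-diffusion-v1-5"         # assumed SD base model
    IMAGE_LCM_LORA_NAME = "latent-consistency/lcm-lora-sdv1-5"  # optional; getattr() tolerates absence
    VIDEO_MODEL_NAME = "cerspense/zeroscope_v2_576w"            # assumed; diff comments mention Zeroscope

    DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
    DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32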
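
The docstrings say the *_sync functions are meant to run in a thread pool and that load_models() is "called at application startup". A hypothetical wiring under the assumption of a FastAPI app; the framework, route shape, and defaults are not named anywhere in the commit:

    # Hypothetical startup/endpoint wiring -- a sketch, not part of the commit.
    import asyncio
    from fastapi import FastAPI
    from services import generation

    app = FastAPI()

    @app.on_event("startup")
    def load() -> None:
        # load_models() re-raises on failure, so startup aborts rather than
        # leaving the app running with missing models.
        generation.load_models()

    @app.get("/ideas")
    async def ideas(prompt: str, max_length: int = 64, num_ideas: int = 3):
        # Run the blocking, GPU-bound call off the event loop in a worker thread.
        results = await asyncio.to_thread(
            generation.generate_ideas_sync, prompt, max_length, num_ideas
        )
        return {"ideas": results}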
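
The new comment in generate_image_sync notes that with the LCM scheduler active, optimal step counts drop sharply (e.g. 4-8) and guidance should be near 0-1, and suggests adjusting parameters "here or in the API route". One possible sketch of that adjustment; the helper name and the clamp values are illustrative assumptions, not part of the commit:

    # Hypothetical parameter clamp for LCM mode -- illustrative only.
    from diffusers import LCMScheduler

    def adjust_for_lcm(pipeline, num_inference_steps: int, guidance_scale: float):
        """Clamp steps/guidance to LCM-friendly values when the LCM scheduler is active."""
        if isinstance(pipeline.scheduler, LCMScheduler):
            # LCM-distilled weights converge in very few steps and need little/no CFG.
            return min(num_inference_steps, 8), min(guidance_scale, 1.0)
        return num_inference_steps, guidance_scale

Calling this at the top of generate_image_sync (e.g. steps, cfg = adjust_for_lcm(pipeline, num_inference_steps, guidance_scale)) would make user-supplied parameters safe in both scheduler modes.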