Create services/generation.py
services/generation.py  (new file, +188 lines)
# services/generation.py
import torch
import numpy as np  # Needed for the frame conversion in generate_video_sync
from typing import List  # Needed for the List[str] annotation below
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from diffusers import StableDiffusionPipeline, DiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image
import config
from utils.helpers import decode_base64_image, encode_image_base64, encode_video_base64
import logging
import gc  # Garbage collector, used to release memory after each generation
logger = logging.getLogger(__name__)

# --- Global Model Cache ---
# A dictionary holds the loaded models and tokenizers so that they are
# loaded only once, when the app starts.
model_cache = {}

def load_models():
    """Loads all models into the cache. Called at application startup."""
    logger.info("Loading models...")
    try:
        # Text Generation Model
        logger.info(f"Loading text model: {config.TEXT_MODEL_NAME}")
        model_cache["text_tokenizer"] = AutoTokenizer.from_pretrained(config.TEXT_MODEL_NAME)
        model_cache["text_model"] = AutoModelForSeq2SeqLM.from_pretrained(config.TEXT_MODEL_NAME).to(config.DEVICE)
        logger.info("Text model loaded.")

        # Image Generation Model
        logger.info(f"Loading image model: {config.IMAGE_MODEL_NAME}")
        image_pipeline = StableDiffusionPipeline.from_pretrained(
            config.IMAGE_MODEL_NAME,
            torch_dtype=config.DTYPE
        )
        # Optimization: use a faster scheduler
        image_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(image_pipeline.scheduler.config)
        image_pipeline = image_pipeline.to(config.DEVICE)
        # Optional: enable memory-efficient attention on GPU (requires `pip install xformers`)
        if config.DEVICE == "cuda":
            try:
                # Uncomment if xformers is installed:
                # image_pipeline.enable_xformers_memory_efficient_attention()
                pass  # Use the default attention implementation otherwise
            except ImportError:
                logger.warning("xformers not installed. Memory-efficient attention not enabled.")
                # image_pipeline.enable_attention_slicing()  # Alternative if xformers is not available

        model_cache["image_pipeline"] = image_pipeline
        logger.info("Image model loaded.")

        # Video Generation Model
        logger.info(f"Loading video model: {config.VIDEO_MODEL_NAME}")
        video_pipeline = DiffusionPipeline.from_pretrained(
            config.VIDEO_MODEL_NAME,
            torch_dtype=config.DTYPE,
            variant="fp16" if config.DTYPE == torch.float16 else None  # Zeroscope often ships fp16 variants
        )
        video_pipeline.scheduler = DPMSolverMultistepScheduler.from_config(video_pipeline.scheduler.config)
        video_pipeline.enable_model_cpu_offload()  # Crucial for low-VRAM environments such as Spaces CPU/T4
        # No explicit video_pipeline.to(config.DEVICE): CPU offload handles device placement

        model_cache["video_pipeline"] = video_pipeline
        logger.info("Video model loaded.")

    except Exception as e:
        logger.error(f"Error loading models: {e}", exc_info=True)
        # Depending on policy, you might want to raise the exception
        # or allow the app to start with missing models (their endpoints will fail).
        raise  # Re-raise to prevent the app from starting if essential models fail

    logger.info("All models loaded successfully.")
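load_models() leans on a config module that is not part of this commit. A minimal sketch of what it is assumed to expose, with illustrative placeholder values (the Space's actual model names and settings may differ):

# config.py (hypothetical sketch)
import torch

TEXT_MODEL_NAME = "google/flan-t5-base"            # any seq2seq model works here
IMAGE_MODEL_NAME = "runwayml/stable-diffusion-v1-5"
VIDEO_MODEL_NAME = "cerspense/zeroscope_v2_576w"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32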
def generate_ideas_sync(prompt: str, max_length: int, num_ideas: int) -> List[str]:
    """Synchronous function for text generation (run in a thread pool)."""
    tokenizer = model_cache.get("text_tokenizer")
    model = model_cache.get("text_model")
    if not tokenizer or not model:
        raise RuntimeError("Text model not loaded.")

    # For instruction-tuned models such as Flan-T5, the prompt could be wrapped, e.g.:
    # input_text = f"Generate {num_ideas} content ideas about: {prompt}"
    input_text = prompt  # Pass the request prompt through unchanged

    inputs = tokenizer(input_text, return_tensors="pt", truncation=True, max_length=512).to(config.DEVICE)  # Cap input length for the model

    # Generation parameters
    outputs = model.generate(
        **inputs,
        max_length=max_length,
        num_return_sequences=num_ideas,
        do_sample=True,  # Sampling yields more diverse ideas
        temperature=0.8,
        top_k=50,
        top_p=0.95,
        no_repeat_ngram_size=2  # Avoid repetitive phrases
    )

    ideas = [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
    # Clean up GPU memory if applicable
    del inputs
    del outputs
    if config.DEVICE == "cuda":
        torch.cuda.empty_cache()
    gc.collect()
    return ideas
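Each *_sync function is meant to be called from an async endpoint via a thread pool so the event loop is not blocked while the model runs. A hypothetical FastAPI wiring, assuming services/ is a package; the route path and request shape are illustrative, not part of this commit:

# app.py (hypothetical sketch)
from fastapi import FastAPI
from fastapi.concurrency import run_in_threadpool
from services import generation

app = FastAPI()

@app.on_event("startup")
def startup():
    generation.load_models()  # Load once, before serving traffic

@app.post("/ideas")
async def ideas(prompt: str, max_length: int = 64, num_ideas: int = 3):
    # Offload the blocking model call to the default thread pool
    results = await run_in_threadpool(generation.generate_ideas_sync, prompt, max_length, num_ideas)
    return {"ideas": results}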
def generate_image_sync(prompt: str, negative_prompt: str | None, height: int, width: int, num_inference_steps: int, guidance_scale: float) -> str:
    """Synchronous function for image generation (run in a thread pool)."""
    pipeline = model_cache.get("image_pipeline")
    if not pipeline:
        raise RuntimeError("Image pipeline not loaded.")

    try:
        with torch.no_grad():  # Conserve memory during inference
            result = pipeline(
                prompt=prompt,
                negative_prompt=negative_prompt,
                height=height,
                width=width,
                num_inference_steps=num_inference_steps,
                guidance_scale=guidance_scale,
                # generator=torch.Generator(device=config.DEVICE).manual_seed(seed)  # Optional: for reproducibility
            )
        image: Image.Image = result.images[0]

        # Encode image to base64
        image_base64 = encode_image_base64(image, format="PNG")

    finally:
        # Clean up GPU memory if applicable
        if config.DEVICE == "cuda":
            torch.cuda.empty_cache()
        gc.collect()

    return image_base64
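The base64 helpers live in utils/helpers.py, which is also not in this commit. A plausible minimal implementation of the two image-side helpers, with names and signatures inferred from the call sites above (the real module may differ):

# utils/helpers.py (hypothetical sketch, image helpers only)
import base64
import io
from PIL import Image

def decode_base64_image(image_base64: str) -> Image.Image:
    # Decode a base64 string back into a PIL image
    return Image.open(io.BytesIO(base64.b64decode(image_base64))).convert("RGB")

def encode_image_base64(image: Image.Image, format: str = "PNG") -> str:
    # Serialize the PIL image into an in-memory buffer, then base64-encode it
    buffer = io.BytesIO()
    image.save(buffer, format=format)
    return base64.b64encode(buffer.getvalue()).decode("utf-8")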
def generate_video_sync(
    image_base64: str,
    prompt: str | None,
    motion_bucket_id: int,
    noise_aug_strength: float,
    num_frames: int,
    fps: int,
    num_inference_steps: int,
    guidance_scale: float
) -> tuple[str, str]:
    """Synchronous function for video generation (run in a thread pool)."""
    pipeline = model_cache.get("video_pipeline")
    if not pipeline:
        raise RuntimeError("Video pipeline not loaded.")

    input_image = decode_base64_image(image_base64)
    video_frames = None
    video_frames_np = None  # Initialized up front so the finally block cannot hit a NameError

    try:
        with torch.no_grad():
            # CPU offload handles device placement; no explicit .to(config.DEVICE) needed
            video_frames = pipeline(
                input_image,
                prompt=prompt,  # Some video models use the prompt only loosely, e.g. for style
                num_inference_steps=num_inference_steps,
                num_frames=num_frames,
                height=input_image.height,  # Usually matched to the input image size
                width=input_image.width,
                guidance_scale=guidance_scale,
                motion_bucket_id=motion_bucket_id,     # Image-to-video controls (e.g. Stable Video Diffusion);
                noise_aug_strength=noise_aug_strength  # text-to-video pipelines may not accept these kwargs
            ).frames[0]  # Output is often nested: [[frame1, frame2, ...]]

        # video_frames is usually List[PIL.Image]; convert to numpy arrays for encoding
        video_frames_np = [np.array(frame) for frame in video_frames]

        # Encode video to base64 (request MP4; the helper handles format fallback)
        video_base64, actual_format = encode_video_base64(video_frames_np, fps=fps, format="MP4")

    finally:
        # Clean up GPU/CPU memory. Offloading keeps VRAM low, but make sure general RAM is freed.
        del input_image
        del video_frames
        del video_frames_np
        if config.DEVICE == "cuda":
            torch.cuda.empty_cache()  # Still good practice
        gc.collect()

    return video_base64, actual_format
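encode_video_base64 returns both the payload and the container format actually used, which implies an internal fallback. One possible shape for that helper, using imageio (MP4 writing needs imageio-ffmpeg); this is a sketch under those assumptions, not the Space's actual implementation:

# utils/helpers.py (hypothetical sketch, video helper only)
import base64
import os
import tempfile
import imageio

def encode_video_base64(frames, fps=8, format="MP4"):
    """Write frames to a temp file and return (base64_string, actual_format).
    Tries MP4 first; falls back to GIF if the MP4 backend is unavailable."""
    for suffix, fmt in ((".mp4", "MP4"), (".gif", "GIF")):
        tmp = tempfile.NamedTemporaryFile(suffix=suffix, delete=False)
        tmp.close()
        try:
            imageio.mimwrite(tmp.name, frames, fps=fps)
            with open(tmp.name, "rb") as f:
                return base64.b64encode(f.read()).decode("utf-8"), fmt
        except Exception:
            continue  # This container failed; try the next one
        finally:
            os.remove(tmp.name)
    raise RuntimeError("Could not encode video in any supported format.")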