import spaces  # Import spaces FIRST, before any CUDA-related packages
import torch
from diffusers import Flux2Pipeline
from huggingface_hub import get_token
import requests
import io
import gradio as gr
from PIL import Image
import os

# Configuration
repo_id = "diffusers/FLUX.2-dev-bnb-4bit"
torch_dtype = torch.bfloat16
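# Note: the checkpoint above is a bitsandbytes 4-bit quantization of FLUX.2-dev,
# which keeps the transformer's weight footprint small enough for a ZeroGPU slot;
# bfloat16 covers the remaining non-quantized modules.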

print("Starting Flux2 Image Generator...")

# Load the pipeline at startup
print("Loading Flux2 pipeline...")
pipe = None

def load_pipeline_startup():
    """Load pipeline at startup without CUDA."""
    global pipe
    try:
        print("Loading pipeline components...")
        pipe = Flux2Pipeline.from_pretrained(
            repo_id,
            text_encoder=None,
            torch_dtype=torch_dtype,
        )
        # Keep on CPU initially - will move to CUDA when needed
        print("Pipeline loaded successfully on CPU!")
    except Exception as e:
        print(f"Warning: Could not load pipeline at startup: {e}")
        print("Pipeline will be loaded on first use.")

# Try to load at startup
load_pipeline_startup()

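# The pipeline above is built with text_encoder=None, so prompt embeddings must
# come from elsewhere: the helper below posts the prompt to a hosted
# text-encoder endpoint and deserializes the returned torch tensor, keeping the
# large text encoder out of this process entirely.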
def remote_text_encoder(prompts):
    """Encode prompts using remote text encoder API."""
    try:
        # Try multiple methods to get the token
        token = None
        
        # Method 1: From huggingface_hub
        try:
            from huggingface_hub import HfFolder
            token = HfFolder.get_token()
        except Exception:
            pass
        
        # Method 2: get_token from huggingface_hub
        if not token:
            try:
                token = get_token()
            except Exception:
                pass
        
        # Method 3: From environment variable (Spaces sets this automatically)
        if not token:
            token = os.environ.get("HF_TOKEN") or os.environ.get("HUGGING_FACE_HUB_TOKEN")
        
        # Method 4: From Spaces secrets
        if not token:
            token = os.environ.get("SPACE_TOKEN") or os.environ.get("SPACES_TOKEN")
        
        if not token:
            raise ValueError(
                "โŒ HuggingFace token not found!\n\n"
                "๐Ÿ“ To fix this:\n"
                "1. Go to https://huggingface.co/settings/tokens\n"
                "2. Create a token with 'read' access\n"
                "3. In your Space settings, add a secret named 'HF_TOKEN' with your token value\n"
                "4. Restart your Space\n\n"
                "If running locally, use: huggingface-cli login"
            )
        
        print(f"Token found: {token[:10]}... (length: {len(token)})")
        
        response = requests.post(
            "https://remote-text-encoder-flux-2.huggingface.co/predict",
            json={"prompt": prompts},
            headers={
                "Authorization": f"Bearer {token}",
                "Content-Type": "application/json"
            },
            timeout=60
        )
        response.raise_for_status()
        prompt_embeds = torch.load(io.BytesIO(response.content))
        
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return prompt_embeds.to(device)
    except requests.HTTPError as e:
        if e.response.status_code == 401:
            raise Exception(
                "โŒ Authentication failed (401).\n\n"
                "Your HuggingFace token may not have access to this model.\n"
                "Please ensure your token has permission to access FLUX.2 models."
            )
        elif e.response.status_code == 403:
            raise Exception(
                "โŒ Access forbidden (403).\n\n"
                "You may need to accept the model's license agreement on HuggingFace:\n"
                "Visit: https://huggingface.co/black-forest-labs/FLUX.1-dev"
            )
        else:
            raise Exception(f"HTTP error {e.response.status_code}: {str(e)}")
    except Exception as e:
        if "token" in str(e).lower():
            raise  # Re-raise token errors as-is
        raise Exception(f"Failed to encode prompt: {str(e)}")

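# @spaces.GPU accepts a callable for `duration`: ZeroGPU invokes it with the
# same arguments as the decorated function and reserves that many seconds of
# GPU time, so longer runs (more steps, image-to-image) request a bigger slot.
# Example: 28 steps, no input image -> int(30 + 28 * 1.3 + 15) = 81 seconds.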
def get_duration(prompt: str, input_image: Image.Image = None, num_inference_steps: int = 28, guidance_scale: float = 4.0, seed: int = 42, progress=None):
    """Calculate dynamic GPU duration based on inference steps and input image."""
    num_images = 0 if input_image is None else 1
    step_duration = 1.3 + 0.7 * num_images  # ~1.3 s/step text-to-image, ~2.0 s/step with an input image
    # Add extra time for model transfer to GPU + generation
    base_time = 30  # Time for moving model to GPU
    generation_time = num_inference_steps * step_duration
    return int(base_time + generation_time + 15)  # Extra 15s buffer

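# On ZeroGPU Spaces, CUDA is only available inside functions decorated with
# @spaces.GPU (initializing CUDA in the main process is an error), which is why
# the pipeline stays on CPU at startup and is moved to "cuda" only here.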
@spaces.GPU(duration=get_duration)  # Dynamic GPU allocation
def generate_image(
    prompt: str,
    input_image: Image.Image = None,
    num_inference_steps: int = 28,
    guidance_scale: float = 4.0,
    seed: int = 42,
    progress=gr.Progress()
):
    """
    Generate an image using Flux2 based on text prompt and optional input image.
    
    Args:
        prompt: Text description of the desired image
        input_image: Optional input image for image-to-image generation
        num_inference_steps: Number of denoising steps (higher = better quality but slower)
        guidance_scale: How closely to follow the prompt (higher = more strict)
        seed: Random seed for reproducibility (-1 for random)
    """
    global pipe
    
    print(f"=== Starting generation ===")
    print(f"Prompt: {prompt[:100]}...")
    print(f"CUDA available: {torch.cuda.is_available()}")
    
    if not prompt or prompt.strip() == "":
        raise gr.Error("Please enter a prompt!")
    
    progress(0, desc="Moving model to GPU...")
    
    try:
        # Load or get pipeline
        if pipe is None:
            print("Pipeline not loaded at startup, loading now...")
            load_pipeline_startup()
            if pipe is None:
                raise gr.Error("Failed to load pipeline. Please try again or contact support.")
        
        print("Moving pipeline to CUDA...")
        pipeline = pipe.to("cuda")
        torch.cuda.empty_cache()  # Clear cache before generation
        
        progress(0.1, desc="Encoding prompt...")
        print("Encoding prompt...")
        
        # Get prompt embeddings from remote encoder
        try:
            prompt_embeds = remote_text_encoder(prompt)
            print(f"Prompt embeds shape: {prompt_embeds.shape}")
        except Exception as e:
            print(f"Error encoding prompt: {str(e)}")
            raise gr.Error(f"Failed to encode prompt. Please check your HuggingFace token. Error: {str(e)}")
        
        progress(0.2, desc="Generating image...")
        
        # Set up generator
        generator_device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Generator device: {generator_device}")
        
        if seed == -1:
            import random
            seed = random.randint(0, 2**32 - 1)
        
        print(f"Using seed: {seed}")
        generator = torch.Generator(device=generator_device).manual_seed(int(seed))
        
        # Prepare pipeline arguments
        pipe_kwargs = {
            "prompt_embeds": prompt_embeds,
            "generator": generator,
            "num_inference_steps": int(num_inference_steps),
            "guidance_scale": float(guidance_scale),
        }
        
        # Add input image if provided
        if input_image is not None:
            pipe_kwargs["image"] = input_image
            progress(0.25, desc="Processing input image...")
            print("Processing with input image")
        
        print(f"Starting generation with {num_inference_steps} steps...")
        
        # Progress callback: diffusers calls callback_on_step_end once per
        # denoising step with (pipe, step, timestep, callback_kwargs) and
        # expects the kwargs dict back; here it only drives the progress bar,
        # mapping steps onto the 0.2-0.95 range.
        def progress_callback(pipe, step, timestep, callback_kwargs):
            progress(
                0.2 + (step / num_inference_steps) * 0.75,
                desc=f"Generating... Step {step}/{num_inference_steps}",
            )
            return callback_kwargs
        
        # Generate image
        with torch.inference_mode():
            result = pipeline(
                **pipe_kwargs,
                callback_on_step_end=progress_callback,
            )
            image = result.images[0]
        
        print("Generation complete!")
        progress(1.0, desc="Done!")
        
        # Move pipeline back to CPU to free GPU memory
        print("Moving pipeline back to CPU...")
        pipe.to("cpu")
        torch.cuda.empty_cache()
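        # ZeroGPU releases the device when this function returns; moving the
        # weights off-GPU first keeps the next allocation clean if this worker
        # process is reused between requests.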
        
        return image
    
    except gr.Error:
        # Re-raise Gradio errors as-is
        raise
    except Exception as e:
        import traceback
        error_msg = f"Error generating image: {str(e)}\n{traceback.format_exc()}"
        print(error_msg)
        
        # Provide more helpful error messages
        if "CUDA" in str(e) or "out of memory" in str(e).lower():
            raise gr.Error(f"GPU Error: {str(e)}. Try reducing inference steps.")
        elif "token" in str(e).lower() or "401" in str(e):
            raise gr.Error("Authentication failed. Please ensure your HuggingFace token is set correctly.")
        elif "timeout" in str(e).lower():
            raise gr.Error("Request timed out. Please try again.")
        else:
            raise gr.Error(f"Error: {str(e)}")


# Create Gradio interface
with gr.Blocks(
    title="Flux2 Image Generator",
) as demo:
    gr.Markdown(
        """
        # 🎨 Flux2 Image Generator
        Generate stunning images using **FLUX.2-dev** with 4-bit quantization for efficient inference.
        
        Supports both **text-to-image** and **image-to-image** generation.
        
        ⚡ **Powered by Hugging Face ZeroGPU** - Automatic GPU allocation on demand!
        """
    )
    
    with gr.Row():
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ“ Input")
            
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Describe the image you want to generate...",
                lines=4,
                value="A cozy coffee shop scene on a rainy afternoon, warm lighting streaming through large windows with raindrops, a steaming cup of coffee on a wooden table with latte art, blurred background showing bookshelves and soft bokeh lights, photorealistic, cinematic composition, shallow depth of field"
            )
            
            image_input = gr.Image(
                label="Input Image (Optional)",
                type="pil",
                sources=["upload", "clipboard"],
                height=300
            )
            
            gr.Markdown("### โš™๏ธ Parameters")
            
            with gr.Row():
                num_steps = gr.Slider(
                    minimum=1,
                    maximum=100,
                    value=28,
                    step=1,
                    label="Inference Steps",
                    info="More steps = better quality but slower"
                )
                
                guidance = gr.Slider(
                    minimum=1.0,
                    maximum=15.0,
                    value=4.0,
                    step=0.5,
                    label="Guidance Scale",
                    info="How closely to follow the prompt"
                )
            
            seed_input = gr.Number(
                label="Seed",
                value=42,
                precision=0,
                info="Use -1 for random seed"
            )
            
            generate_btn = gr.Button(
                "๐Ÿš€ Generate Image",
                variant="primary",
                size="lg",
            )
            
            gr.Markdown(
                """
                ### 💡 Tips
                - **Text-to-Image**: Just enter a prompt and click generate
                - **Image-to-Image**: Upload an image and describe the changes
                - Start with 28 steps for a good balance of quality and speed
                - Higher guidance scale follows your prompt more strictly
                - Use the same seed to reproduce results
                - First generation may take longer as the model loads
                """
            )
        
        with gr.Column(scale=1):
            gr.Markdown("### ๐Ÿ–ผ๏ธ Output")
            
            output_image = gr.Image(
                label="Generated Image",
                type="pil",
                height=600
            )
            
            gr.Markdown(
                """
                ### 📊 Examples
                Try these prompts for inspiration!
                """
            )
    
    # Examples
    gr.Examples(
        examples=[
            [
                "A serene landscape with mountains at sunset, vibrant orange and pink sky, reflected in a calm lake, photorealistic",
                None,
                28,
                4.0,
                42
            ],
            [
                "A futuristic cityscape at night, neon lights, flying cars, cyberpunk style, highly detailed",
                None,
                28,
                4.0,
                123
            ],
            [
                "A cute robot reading a book in a cozy library, warm lighting, digital art style",
                None,
                28,
                4.0,
                456
            ],
            [
                "Macro photography of a dew drop on a leaf, morning light, sharp focus, bokeh background",
                None,
                28,
                4.0,
                789
            ],
        ],
        inputs=[prompt_input, image_input, num_steps, guidance, seed_input],
        outputs=output_image,
        cache_examples=False,
    )
    
    # Connect the generate button
    generate_btn.click(
        fn=generate_image,
        inputs=[prompt_input, image_input, num_steps, guidance, seed_input],
        outputs=output_image,
    )

if __name__ == "__main__":
    print("Launching Gradio interface...")
    demo.queue(max_size=20).launch()