kazuhina committed on
Commit
24a9fe7
·
1 Parent(s): b1f0d72

Update to proper JoyCaption implementation with ChatInterface and streaming

Browse files
Files changed (1) hide show
  1. joycaption_app.py +220 -226
joycaption_app.py CHANGED
@@ -5,106 +5,108 @@ Uses fancyfeast/llama-joycaption-alpha-two-hf-llava model for high-quality image
5
  Free, open, and uncensored model for training Diffusion models
6
  """
7
 
 
8
  import gradio as gr
 
9
  import torch
10
- import spaces
11
- from transformers import AutoProcessor, LlavaForConditionalGeneration
12
  from PIL import Image
 
 
 
13
  import tempfile
14
  import os
15
  from pathlib import Path
16
 
 
 
 
17
  # Initialize the JoyCaption model
18
  print("Loading JoyCaption model...")
19
- model_name = "fancyfeast/llama-joycaption-alpha-two-hf-llava"
20
-
21
- # Global variables for model and processor
22
- processor = None
23
- llava_model = None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
 
25
- def load_model():
26
- """Load JoyCaption model with maximum memory efficiency"""
27
- global processor, llava_model
 
 
 
 
 
 
 
28
 
 
29
  try:
30
- print("Loading processor...")
31
- # Load processor first
32
- processor = AutoProcessor.from_pretrained(model_name)
33
- print("Processor loaded successfully!")
34
-
35
- print("Loading model with maximum memory efficiency...")
36
- # Load model with maximum memory efficiency settings
37
- llava_model = LlavaForConditionalGeneration.from_pretrained(
38
- model_name,
39
- torch_dtype=torch.float16,
40
- device_map="cpu", # Force CPU
41
- load_in_8bit=True, # Enable 8-bit quantization
42
- load_in_4bit=False, # Disable 4-bit for now
43
- low_cpu_mem_usage=True,
44
- trust_remote_code=True,
45
- max_memory={0: "4GB"}, # Limit memory usage
46
- offload_folder="./offload", # Offload to disk if needed
47
- offload_state_dict=True
48
- )
49
- llava_model.eval()
50
-
51
- print("JoyCaption model loaded successfully with 8-bit quantization!")
52
- return True
53
-
54
- except Exception as e:
55
- print(f"Error loading model: {e}")
56
- print("Model loading failed - will use fallback mode")
57
- return False
58
 
59
- # Try to load model at startup
60
- model_loaded = load_model()
 
61
 
62
- @spaces.GPU
63
- def generate_image_caption(image_file, prompt_type="formal_detailed", custom_prompt=""):
64
- """
65
- Generate high-quality image captions using JoyCaption model
66
 
67
- Args:
68
- image_file: Path to the image file or uploaded file
69
- prompt_type: Type of captioning (formal_detailed, creative, simple, custom)
70
- custom_prompt: Custom prompt for specialized captioning
71
-
72
- Returns:
73
- str: Generated image caption
74
- """
75
- global processor, llava_model
76
 
77
- # Lazy load model if not already loaded
78
- if llava_model is None or processor is None:
79
- print("Lazy loading JoyCaption model...")
80
- if not load_model():
81
- return "Error: JoyCaption model could not be loaded. This may be due to memory constraints or network issues. Please try again later."
82
 
83
  try:
84
- if not image_file:
85
- return "Please upload an image file."
 
 
 
86
 
87
- # Handle different types of image inputs
88
- if hasattr(image_file, 'name'):
89
- # Gradio file object
90
- image_path = image_file.name
91
- elif isinstance(image_file, str):
92
- # File path string
93
- image_path = image_file
94
  else:
95
- return "Invalid image file format."
 
96
 
97
- # Check if file exists
98
- if not os.path.exists(image_path):
99
- return "Image file not found."
100
 
101
- print(f"Processing image: {image_path}")
102
-
103
- # Load and preprocess image
104
- try:
105
- image = Image.open(image_path).convert('RGB')
106
- except Exception as e:
107
- return f"Error loading image: {str(e)}"
108
 
109
  # Define prompt templates based on type
110
  prompt_templates = {
@@ -112,11 +114,11 @@ def generate_image_caption(image_file, prompt_type="formal_detailed", custom_pro
112
  "creative": "Write a creative and artistic caption for this image, capturing its essence and mood.",
113
  "simple": "Write a simple, concise caption describing what you see in this image.",
114
  "technical": "Provide a detailed technical description of this image including composition, lighting, and visual elements.",
115
- "custom": custom_prompt if custom_prompt else "Write a descriptive caption for this image."
116
  }
117
 
118
  # Select appropriate prompt
119
- prompt = prompt_templates.get(prompt_type, prompt_templates["formal_detailed"])
120
 
121
  # Build conversation following JoyCaption's recommended format
122
  convo = [
@@ -126,176 +128,168 @@ def generate_image_caption(image_file, prompt_type="formal_detailed", custom_pro
126
  },
127
  {
128
  "role": "user",
129
- "content": prompt,
130
  },
131
  ]
132
 
133
- # Format the conversation using JoyCaption's specific method
134
- # WARNING: HF's handling of chat's on Llava models is very fragile
135
- convo_string = processor.apply_chat_template(
136
- convo,
137
- tokenize=False,
138
  add_generation_prompt=True
139
  )
140
  assert isinstance(convo_string, str)
141
 
142
- # Process the inputs with proper tensor handling
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
144
- inputs = processor(
145
- text=[convo_string],
146
- images=[image],
147
- return_tensors="pt"
148
- ).to(device)
149
 
150
- # Ensure pixel_values are in float16 for compatibility
151
- if 'pixel_values' in inputs:
152
- inputs['pixel_values'] = inputs['pixel_values'].to(torch.float16)
 
153
 
154
- # Generate captions with optimized parameters
155
- with torch.no_grad():
156
- generate_ids = llava_model.generate(
157
- **inputs,
158
- max_new_tokens=150, # Reduced for faster processing
159
- do_sample=True,
160
- suppress_tokens=None,
161
- use_cache=True,
162
- temperature=0.6,
163
- top_k=None,
164
- top_p=0.9,
165
- repetition_penalty=1.1,
166
- pad_token_id=processor.tokenizer.eos_token_id
167
- )[0]
168
-
169
- # Trim off the prompt
170
- generate_ids = generate_ids[inputs['input_ids'].shape[1]:]
171
-
172
- # Decode the caption
173
- caption = processor.tokenizer.decode(
174
- generate_ids,
175
- skip_special_tokens=True,
176
- clean_up_tokenization_spaces=False
177
- )
178
- caption = caption.strip()
179
 
180
- print(f"Caption generated successfully: {caption[:100]}...")
181
- return caption
182
 
 
 
 
 
 
 
 
 
 
 
183
  except Exception as e:
184
  error_msg = f"Error during caption generation: {str(e)}"
185
  print(error_msg)
186
  # Return a demo response when model fails
187
- return generate_demo_caption(image, prompt_type, custom_prompt)
188
 
189
- def generate_demo_caption(image, prompt_type, custom_prompt):
190
  """Generate a demo caption when the model is not available"""
191
- # Create a realistic demo response based on the prompt type
192
  demo_responses = {
193
- "formal_detailed": f"This image appears to contain visual elements including colors, shapes, and composition. The image shows various patterns and visual textures that could be described in detail. The overall scene demonstrates typical characteristics of digital imagery with identifiable visual components.",
194
- "creative": f"A captivating visual composition that captures the essence of artistic expression through color, form, and visual storytelling. The image presents an interesting arrangement of elements that invite creative interpretation and artistic appreciation.",
195
- "simple": f"An image containing visual elements and patterns. The composition shows various colors and shapes arranged in a structured manner.",
196
- "technical": f"Technical analysis: This image demonstrates standard digital image characteristics with RGB color space representation. The resolution and pixel arrangement follow conventional digital imaging protocols with typical compression and formatting.",
197
- "custom": f"Based on the custom prompt provided, this image shows visual elements that could be interpreted according to the specific requirements mentioned: '{custom_prompt}'."
198
  }
199
 
200
  return demo_responses.get(prompt_type, demo_responses["formal_detailed"]) + "\n\n[Note: This is a demo response. The full JoyCaption model is optimized for production use and may be temporarily unavailable in this demo environment.]"
201
 
202
- def create_demo_image():
203
- """Create a demo image for testing"""
204
- try:
205
- # Create a simple colored rectangle as demo
206
- from PIL import Image, ImageDraw
207
-
208
- # Create a 512x512 image with gradient
209
- width, height = 512, 512
210
- image = Image.new('RGB', (width, height), color='white')
211
- draw = ImageDraw.Draw(image)
212
-
213
- # Draw a simple pattern
214
- for i in range(0, width, 50):
215
- for j in range(0, height, 50):
216
- color = (i % 255, j % 255, (i + j) % 255)
217
- draw.rectangle([i, j, i+25, j+25], fill=color)
218
-
219
- # Save demo image
220
- demo_file = "demo_image.png"
221
- image.save(demo_file)
222
- return demo_file
223
-
224
- except Exception as e:
225
- print(f"Error creating demo image: {e}")
226
- return None
227
-
228
  # Create Gradio interface
229
- demo = gr.Interface(
230
- fn=generate_image_caption,
231
- inputs=[
232
- gr.Image(
233
- label="Upload Image for Captioning",
234
- type="filepath",
235
- format="png"
236
- ),
237
- gr.Dropdown(
238
- choices=["formal_detailed", "creative", "simple", "technical", "custom"],
239
- value="formal_detailed",
240
- label="Caption Style",
241
- info="Choose the style of caption generation"
242
- ),
243
- gr.Textbox(
244
- label="Custom Prompt (Optional)",
245
- placeholder="Enter custom prompt for specialized captioning...",
246
- lines=3,
247
- visible=False
248
- )
249
- ],
250
- outputs=[
251
- gr.Textbox(
252
- label="Generated Caption",
253
- lines=8,
254
- placeholder="The generated caption will appear here..."
255
- )
256
- ],
257
- title="🎨 JoyCaption - Advanced Image Captioning",
258
- description="""
259
- This application uses the **JoyCaption** model to generate high-quality, detailed captions for images.
260
-
261
- **Key Features:**
262
- - πŸ†“ **Free & Open**: No restrictions, open weights, training scripts included
263
- - πŸ”“ **Uncensored**: Equal coverage of SFW and NSFW concepts
264
- - 🌈 **Diversity**: Supports digital art, photoreal, anime, furry, and all styles
265
- - 🎯 **High Performance**: Near GPT4o-level captioning quality
266
- - πŸ”§ **Minimal Filtering**: Trained on diverse images for broad understanding
267
-
268
- **Supported image formats:** PNG, JPG, JPEG, WEBP
269
-
270
- **Caption Styles:**
271
- - **Formal Detailed**: Long descriptive captions in formal tone
272
- - **Creative**: Artistic and expressive descriptions
273
- - **Simple**: Concise, straightforward descriptions
274
- - **Technical**: Detailed technical analysis of composition and elements
275
- - **Custom**: User-defined prompts for specialized captioning
276
-
277
- **Model**: fancyfeast/llama-joycaption-alpha-two-hf-llava
278
- **Architecture**: LLaVA with Llama 3.1 base
279
- """,
280
- examples=[
281
- ["Upload an image for formal detailed captioning"],
282
- ["Upload an image for creative captioning"],
283
- ["Upload an image with custom prompt"],
284
- ],
285
- theme=gr.themes.Soft(
286
- primary_hue="purple",
287
- secondary_hue="slate",
288
- neutral_hue="slate"
289
- ),
290
- css="""
291
- .gradio-container {max-width: 900px !important; margin: auto !important;}
292
- .title {text-align: center; color: #7c3aed;}
293
- .description {text-align: center; font-size: 1.1em;}
294
- """,
295
- flagging_mode="never",
296
- submit_btn="🎨 Generate Caption",
297
- stop_btn="⏹️ Stop"
298
- )
 
 
 
 
 
299
 
300
  if __name__ == "__main__":
301
  print("πŸš€ Starting JoyCaption App...")
 
5
  Free, open, and uncensored model for training Diffusion models
6
  """
7
 
8
+ import spaces
9
  import gradio as gr
10
+ from transformers import AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast, LlavaForConditionalGeneration, TextIteratorStreamer
11
  import torch
12
+ import torch.amp.autocast_mode
 
13
  from PIL import Image
14
+ import torchvision.transforms.functional as TVF
15
+ from threading import Thread
16
+ from typing import Generator
17
  import tempfile
18
  import os
19
  from pathlib import Path
20
 
21
+ # Model configuration
22
+ MODEL_PATH = "fancyfeast/llama-joycaption-alpha-two-hf-llava"
23
+
24
  # Initialize the JoyCaption model
25
  print("Loading JoyCaption model...")
26
+ try:
27
+ # Load tokenizer
28
+ tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
29
+ assert isinstance(tokenizer, PreTrainedTokenizer) or isinstance(tokenizer, PreTrainedTokenizerFast), f"Expected PreTrainedTokenizer, got {type(tokenizer)}"
30
+
31
+ # Load model with memory-efficient configuration
32
+ model = LlavaForConditionalGeneration.from_pretrained(
33
+ MODEL_PATH,
34
+ torch_dtype="bfloat16",
35
+ device_map="auto" if torch.cuda.is_available() else None,
36
+ load_in_8bit=True, # Enable 8-bit quantization for memory efficiency
37
+ low_cpu_mem_usage=True,
38
+ trust_remote_code=True
39
+ )
40
+ assert isinstance(model, LlavaForConditionalGeneration), f"Expected LlavaForConditionalGeneration, got {type(model)}"
41
+
42
+ print("JoyCaption model loaded successfully!")
43
+
44
+ except Exception as e:
45
+ print(f"Error loading model: {e}")
46
+ # Create fallback objects when model loading fails
47
+ tokenizer = None
48
+ model = None
49
+ print("Using fallback mode - model not available")
50
 
51
+ def trim_off_prompt(input_ids: list[int], eoh_id: int, eot_id: int) -> list[int]:
52
+ """Trim off the prompt from generated tokens"""
53
+ # Trim off the prompt
54
+ while True:
55
+ try:
56
+ i = input_ids.index(eoh_id)
57
+ except ValueError:
58
+ break
59
+
60
+ input_ids = input_ids[i + 1:]
61
 
62
+ # Trim off the end
63
  try:
64
+ i = input_ids.index(eot_id)
65
+ except ValueError:
66
+ return input_ids
67
+
68
+ return input_ids[:i]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
+ # Get token IDs for special tokens
71
+ end_of_header_id = tokenizer.convert_tokens_to_ids("<|end_header_id|>") if tokenizer else None
72
+ end_of_turn_id = tokenizer.convert_tokens_to_ids("<|eot_id|>") if tokenizer else None
73
 
74
+ @spaces.GPU()
75
+ @torch.no_grad()
76
+ def generate_image_caption(message: dict, history, temperature: float = 0.6, top_p: float = 0.9, max_new_tokens: int = 300, log_prompt: bool = False) -> Generator[str, None, None]:
77
+ """Generate image captions using JoyCaption model"""
78
 
79
+ # Check if model is available
80
+ if model is None or tokenizer is None:
81
+ yield "Error: JoyCaption model not loaded. Please check the model availability and try again."
82
+ return
 
 
 
 
 
83
 
84
+ torch.cuda.empty_cache()
 
 
 
 
85
 
86
  try:
87
+ # Extract prompt from message
88
+ if isinstance(message, dict):
89
+ prompt = message.get('text', '').strip()
90
+ else:
91
+ prompt = str(message).strip()
92
 
93
+ # Load image
94
+ if isinstance(message, dict) and "files" in message and len(message["files"]) >= 1:
95
+ image = Image.open(message["files"][0])
 
 
 
 
96
  else:
97
+ yield "ERROR: This model requires exactly one image as input."
98
+ return
99
 
100
+ # Log the prompt if requested
101
+ if log_prompt:
102
+ print(f"Prompt: {prompt}")
103
 
104
+ # Preprocess image
105
+ # Resize to 384x384 for optimal performance
106
+ if image.size != (384, 384):
107
+ image = image.resize((384, 384), Image.LANCZOS)
108
+ image = image.convert("RGB")
109
+ pixel_values = TVF.pil_to_tensor(image)
 
110
 
111
  # Define prompt templates based on type
112
  prompt_templates = {
 
114
  "creative": "Write a creative and artistic caption for this image, capturing its essence and mood.",
115
  "simple": "Write a simple, concise caption describing what you see in this image.",
116
  "technical": "Provide a detailed technical description of this image including composition, lighting, and visual elements.",
117
+ "custom": prompt if prompt else "Write a descriptive caption for this image."
118
  }
119
 
120
  # Select appropriate prompt
121
+ final_prompt = prompt_templates.get(prompt, prompt_templates["formal_detailed"])
122
 
123
  # Build conversation following JoyCaption's recommended format
124
  convo = [
 
128
  },
129
  {
130
  "role": "user",
131
+ "content": final_prompt,
132
  },
133
  ]
134
 
135
+ # Format the conversation
136
+ convo_string = tokenizer.apply_chat_template(
137
+ convo,
138
+ tokenize=False,
 
139
  add_generation_prompt=True
140
  )
141
  assert isinstance(convo_string, str)
142
 
143
+ # Tokenize the conversation
144
+ convo_tokens = tokenizer.encode(convo_string, add_special_tokens=False, truncation=False)
145
+
146
+ # Repeat the image tokens
147
+ input_tokens = []
148
+ for token in convo_tokens:
149
+ if token == model.config.image_token_index:
150
+ input_tokens.extend([model.config.image_token_index] * model.config.image_seq_length)
151
+ else:
152
+ input_tokens.append(token)
153
+
154
+ input_ids = torch.tensor(input_tokens, dtype=torch.long)
155
+ attention_mask = torch.ones_like(input_ids)
156
+
157
+ # Move to GPU
158
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
159
+ input_ids = input_ids.unsqueeze(0).to(device)
160
+ attention_mask = attention_mask.unsqueeze(0).to(device)
161
+ pixel_values = pixel_values.unsqueeze(0).to(device)
 
 
162
 
163
+ # Normalize the image
164
+ pixel_values = pixel_values / 255.0
165
+ pixel_values = TVF.normalize(pixel_values, [0.5], [0.5])
166
+ pixel_values = pixel_values.to(torch.bfloat16)
167
 
168
+ # Set up streaming
169
+ streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
170
+
171
+ # Generate parameters
172
+ generate_kwargs = dict(
173
+ input_ids=input_ids,
174
+ pixel_values=pixel_values,
175
+ attention_mask=attention_mask,
176
+ max_new_tokens=max_new_tokens,
177
+ do_sample=True,
178
+ suppress_tokens=None,
179
+ use_cache=True,
180
+ temperature=temperature,
181
+ top_k=None,
182
+ top_p=top_p,
183
+ streamer=streamer,
184
+ )
 
 
 
 
 
 
 
 
185
 
186
+ if temperature == 0:
187
+ generate_kwargs["do_sample"] = False
188
 
189
+ # Start generation in a separate thread
190
+ t = Thread(target=model.generate, kwargs=generate_kwargs)
191
+ t.start()
192
+
193
+ # Stream the output
194
+ outputs = []
195
+ for text in streamer:
196
+ outputs.append(text)
197
+ yield "".join(outputs)
198
+
199
  except Exception as e:
200
  error_msg = f"Error during caption generation: {str(e)}"
201
  print(error_msg)
202
  # Return a demo response when model fails
203
+ yield generate_demo_caption(prompt)
204
 
205
+ def generate_demo_caption(prompt_type):
206
  """Generate a demo caption when the model is not available"""
 
207
  demo_responses = {
208
+ "formal_detailed": "This image appears to contain visual elements including colors, shapes, and composition. The image shows various patterns and visual textures that could be described in detail. The overall scene demonstrates typical characteristics of digital imagery with identifiable visual components.",
209
+ "creative": "A captivating visual composition that captures the essence of artistic expression through color, form, and visual storytelling. The image presents an interesting arrangement of elements that invite creative interpretation and artistic appreciation.",
210
+ "simple": "An image containing visual elements and patterns. The composition shows various colors and shapes arranged in a structured manner.",
211
+ "technical": "Technical analysis: This image demonstrates standard digital image characteristics with RGB color space representation. The resolution and pixel arrangement follow conventional digital imaging protocols with typical compression and formatting.",
212
+ "custom": "Based on the custom prompt provided, this image shows visual elements that could be interpreted according to the specific requirements mentioned."
213
  }
214
 
215
  return demo_responses.get(prompt_type, demo_responses["formal_detailed"]) + "\n\n[Note: This is a demo response. The full JoyCaption model is optimized for production use and may be temporarily unavailable in this demo environment.]"
216
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
  # Create Gradio interface
218
+ TITLE = "<h1><center>🎨 JoyCaption - Advanced Image Captioning</center></h1>"
219
+ DESCRIPTION = """
220
+ <div>
221
+ <p>πŸ§ͺ This application uses the <strong>JoyCaption</strong> model to generate high-quality, detailed captions for images.</p>
222
+ <p><strong>Key Features:</strong></p>
223
+ <ul>
224
+ <li>πŸ†“ <strong>Free & Open</strong>: No restrictions, open weights, training scripts included</li>
225
+ <li>πŸ”“ <strong>Uncensored</strong>: Equal coverage of SFW and NSFW concepts</li>
226
+ <li>🌈 <strong>Diversity</strong>: Supports digital art, photoreal, anime, furry, and all styles</li>
227
+ <li>🎯 <strong>High Performance</strong>: Near GPT4o-level captioning quality</li>
228
+ <li>πŸ”§ <strong>Minimal Filtering</strong>: Trained on diverse images for broad understanding</li>
229
+ </ul>
230
+ <p><strong>Supported image formats:</strong> PNG, JPG, JPEG, WEBP</p>
231
+ <p><strong>Caption Styles:</strong></p>
232
+ <ul>
233
+ <li><strong>Formal Detailed</strong>: Long descriptive captions in formal tone</li>
234
+ <li><strong>Creative</strong>: Artistic and expressive descriptions</li>
235
+ <li><strong>Simple</strong>: Concise, straightforward descriptions</li>
236
+ <li><strong>Technical</strong>: Detailed technical analysis of composition and elements</li>
237
+ <li><strong>Custom</strong>: User-defined prompts for specialized captioning</li>
238
+ </ul>
239
+ <p><strong>Model:</strong> fancyfeast/llama-joycaption-alpha-two-hf-llava</p>
240
+ <p><strong>Architecture:</strong> LLaVA with Llama 3.1 base</p>
241
+ </div>
242
+ """
243
+
244
+ PLACEHOLDER = "Upload an image and describe what kind of caption you'd like..."
245
+
246
+ # Create chatbot interface
247
+ chatbot = gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='JoyCaption ChatInterface', type="messages")
248
+ textbox = gr.MultimodalTextbox(file_types=["image"], file_count="single")
249
+
250
+ with gr.Blocks() as demo:
251
+ gr.HTML(TITLE)
252
+ chat_interface = gr.ChatInterface(
253
+ fn=generate_image_caption,
254
+ chatbot=chatbot,
255
+ type="messages",
256
+ fill_height=True,
257
+ multimodal=True,
258
+ textbox=textbox,
259
+ additional_inputs_accordion=gr.Accordion(label="βš™οΈ Parameters", open=True, render=False),
260
+ additional_inputs=[
261
+ gr.Slider(
262
+ minimum=0,
263
+ maximum=1,
264
+ step=0.1,
265
+ value=0.6,
266
+ label="Temperature",
267
+ render=False
268
+ ),
269
+ gr.Slider(
270
+ minimum=0,
271
+ maximum=1,
272
+ step=0.05,
273
+ value=0.9,
274
+ label="Top p",
275
+ render=False
276
+ ),
277
+ gr.Slider(
278
+ minimum=8,
279
+ maximum=4096,
280
+ step=1,
281
+ value=300,
282
+ label="Max new tokens",
283
+ render=False
284
+ ),
285
+ gr.Checkbox(
286
+ label="Help improve JoyCaption by logging your text query",
287
+ value=False,
288
+ render=False
289
+ ),
290
+ ],
291
+ )
292
+ gr.Markdown(DESCRIPTION)
293
 
294
  if __name__ == "__main__":
295
  print("πŸš€ Starting JoyCaption App...")