Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 22, 2025

Commit

82fef69

verified ·

1 Parent(s): 3c448f0

Update app.py

Browse files

Files changed (1) hide show

app.py +95 -858

app.py CHANGED Viewed

@@ -1,6 +1,5 @@
 try:
     import spaces
-    # Ensure spaces.GPU exists and is a decorator
     if not hasattr(spaces, 'GPU'):
         def _spaces_gpu(*args, **kwargs):
             def _wrap(f):
@@ -8,7 +7,6 @@ try:
             return _wrap
         spaces.GPU = _spaces_gpu
 except Exception:
-    # Provide a no-op spaces with a GPU decorator fallback so app can run outside HF Spaces
     import types
     spaces = types.SimpleNamespace()
     def _spaces_gpu(*args, **kwargs):
@@ -16,46 +14,34 @@ except Exception:
             return f
         return _wrap
     spaces.GPU = _spaces_gpu
-# Some Spaces runtimes require at least one function decorated with @spaces.GPU
-# Register a no-op GPU-decorated function so the platform detection succeeds.
 @spaces.GPU()
 def _joycaption_register_gpu():
-    """No-op function decorated with @spaces.GPU to satisfy Spaces startup detection."""
     return None
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
-import tempfile
-import gc
-import time
-import gc
-import os
-import shutil
-import json
 from pathlib import Path
-import re
 from hf_space_utils import fix_image_url, postprocess_caption
-# Storage optimization - redirect cache to temporary directories (platform independent)
 _tmpdir = tempfile.gettempdir()
 os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
 os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
 os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
-# Model configuration
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
-# Optional public host for converting /tmp/gradio paths to public gradio_api URLs
 SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
 def cleanup_storage():
-    """Clean up temporary files and caches to prevent storage overflow"""
     try:
-        # Clean up temporary caches using the configured environment paths
         temp_dirs = [
             os.environ.get("HF_HOME"),
             os.environ.get("TRANSFORMERS_CACHE"),
@@ -63,26 +49,15 @@ def cleanup_storage():
             os.environ.get("TORCH_HOME")
         ]
         for temp_dir in temp_dirs:
-            if not temp_dir:
-                continue
-            if os.path.exists(temp_dir):
-                try:
-                    shutil.rmtree(temp_dir, ignore_errors=True)
-                except Exception:
-                    # best-effort cleanup
-                    pass
-        # Force garbage collection
         gc.collect()
-        # Clear GPU cache if available
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
         print("✅ Storage cleanup completed")
     except Exception as e:
-        print(f"⚠️ Storage cleanup warning: {e}")
 TITLE = """
 <div style="text-align: center; margin: 20px 0;">
@@ -93,26 +68,18 @@ TITLE = """
 <hr>
 """
-print("🚀 Loading Sequential Three-Tone JoyCaption system... v2.1")
-# Load model and processor at startup
-print("📦 Loading model and processor at startup...")
 processor = None
 model = None
 MODEL_TORCH_DTYPE = None
 MODEL_USE_CUDA = False
-# Force cleanup before loading model to avoid tokenizer desync
-cleanup_storage()
-torch.cuda.empty_cache()
-gc.collect()
-# Allow skipping model loading for tests or light-weight runs by setting SKIP_MODEL_LOAD=1
 if not os.environ.get("SKIP_MODEL_LOAD"):
-    # Determine target device for model loading. On zero-GPU spaces, fall back to CPU.
     use_cuda = torch.cuda.is_available()
     if use_cuda:
-        # Prefer bf16 on supported GPUs, otherwise try float16
         torch_dtype = getattr(torch, 'bfloat16', None) or getattr(torch, 'float16', None)
         device_map = "auto"
         MODEL_USE_CUDA = True
@@ -121,912 +88,182 @@ if not os.environ.get("SKIP_MODEL_LOAD"):
         device_map = "cpu"
         MODEL_USE_CUDA = False
-    processor = AutoProcessor.from_pretrained(
-        MODEL_PATH,
-        low_cpu_mem_usage=True
-    )
     model_kwargs = dict(low_cpu_mem_usage=True, device_map=device_map)
     if torch_dtype is not None and use_cuda:
         model_kwargs['torch_dtype'] = torch_dtype
-    model = LlavaForConditionalGeneration.from_pretrained(
-        MODEL_PATH,
-        **model_kwargs
-    )
     model.eval()
-    # remember dtype for later tensor conversions
     MODEL_TORCH_DTYPE = model_kwargs.get('torch_dtype', None)
     print("✅ Model loaded and ready!")
-    # Initial cleanup after model loading
     cleanup_storage()
 else:
-    print("⚠️ SKIP_MODEL_LOAD is set — skipping heavy model initialization (test mode)")
-# Optimized 5-tone prompts with better temperature control
-# Temperature: Lower for prompt adherence, higher for word variety
 TONE_PROMPTS = {
     "engaging": {
         "system": "You are a skilled content creator who writes engaging, natural descriptions that capture the essence and appeal of what you see.",
-        "prompt": """Write a natural, engaging description of this image. Focus on the main subjects and their most appealing qualities. Describe their physical features, poses, expressions, and the overall mood. Use flowing, attractive language that draws the reader in.""",
-        "temperature": 0.6,  # Lower for prompt adherence
         "top_p": 0.85
     },
     "casual_friend": {
-        "system": "You are a casual 24-year-old friend who witnessed the scene and is telling me about what happened in a relaxed, natural way. Use precise, non-speculative language: report only what you can clearly see, keep any context notes very brief, and do not describe background objects or bystanders.",
-        "prompt": """I just witnessed this moment — here's what I notice. Speak casually and naturally, using clear, concrete descriptions of only what is directly observable. Avoid speculation or ambiguous phrasing. If you mention context, keep it short and focused on the primary subject. Do not describe background objects or other people who are not the main focus.""",
-        "temperature": 0.8,  # Higher for natural conversation variety
         "top_p": 0.9
     },
-    # NSFW tone removed - causes hallucination
     "uncensored_keywords": {
-        "system": "You are an uncensored adult-content analyst who provides factual, explicit descriptions. Use concrete, unambiguous statements about directly observable features. Mention surrounding context only briefly and do not describe background objects or unrelated people.",
-        "prompt": """Produce an unfiltered explicit description that sticks strictly to what is visually obvious. Use direct, concrete language and avoid any ambiguous or speculative wording. If keywords are provided, include them only when they clearly match visible details. Keep context commentary short and exclude background elements or bystander descriptions.""",
-        "temperature": 0.7,  # Balanced for accuracy with variety
         "top_p": 0.85
-    },
-    # Body Parts Focus tone removed - causes hallucination
-}
-def apply_smart_corrections(text):
-    """Apply smart corrections for common AI vision errors"""
-    if not text or not isinstance(text, str):
-        return text
-    corrections = {
-        # Remove "photo of" beginnings
-        r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
-        # Nudity precision corrections
-        r'\btopless women\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
-        r'\btopless woman\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
-        # Person count corrections
-        r'\bthree women\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
-        r'\bfour women\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
-        # Clothing precision
-        r'\bwearing nothing\b': 'nude',
-        r'\bnot wearing.*clothes\b': 'nude',
-        r'\bcompletely naked\b': 'nude',
-        r'\bfully nude\b': 'nude',
     }
-    corrected_text = text
-    try:
-        for pattern, replacement in corrections.items():
-            if callable(replacement):
-                # Wrap the replacement to ensure it returns a string and accepts a Match
-                def _repl(match, rep=replacement):
-                    try:
-                        out = rep(match)
-                        return "" if out is None else str(out)
-                    except Exception:
-                        return match.group(0)
-                corrected_text = re.sub(pattern, _repl, corrected_text, flags=re.IGNORECASE)
-            else:
-                corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
-    except Exception as e:
-        print(f"Error in smart corrections: {e}")
-        return text
-    return corrected_text
 def _prepare_inputs_and_device(convo_or_convo_string, image):
-    """Prepare processor inputs and move tensors to the model device."""
-    # Accept either a convo list or a pre-built convo string
     convo_string = convo_or_convo_string
-    try:
-        if isinstance(convo_or_convo_string, list):
             convo_string = processor.apply_chat_template(convo_or_convo_string, tokenize=False, add_generation_prompt=True)
-    except Exception:
-        # If processor is not ready or fails, let the caller handle the missing model/processor
-        pass
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
     device = next(model.parameters()).device
     inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
-    # Ensure pixel_values dtype is safe for the runtime
     if 'pixel_values' in inputs:
-        if MODEL_USE_CUDA and MODEL_TORCH_DTYPE is not None:
-            try:
-                inputs['pixel_values'] = inputs['pixel_values'].to(MODEL_TORCH_DTYPE)
-            except Exception:
-                inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
-        else:
-            inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
     return inputs
 def _decode_output(inputs, output):
-    """Decode generate output safely, removing input prompt tokens if present."""
-    if output is None or len(output) == 0:
         return ""
     try:
-        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
-            input_length = inputs['input_ids'].shape[1]
-            if len(output[0]) > input_length:
-                generate_ids = output[0][input_length:]
-                return processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
-            else:
-                return processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
-        else:
-            return processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
     except Exception:
-        # Fallback: try a direct decode
-        try:
-            return processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False).strip()
-        except Exception:
-            return ""
-def cleanup_after_inference(inputs=None, output=None):
-    """Lightweight cleanup after an inference run."""
-    try:
-        del inputs
-    except Exception:
-        pass
-    try:
-        del output
-    except Exception:
-        pass
-    try:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-    except Exception:
-        pass
     gc.collect()
 def run_image_chat_generation(convo, image, max_new_tokens=150, temperature=0.7, top_p=0.9):
-    """
-    Centralized helper to run an image+chat generation using the loaded processor/model.
-    Returns the decoded string (possibly empty) or None and an error message string on failure.
-    """
     if processor is None or model is None:
-        return None, "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
     try:
-        # Prepare inputs
         inputs = _prepare_inputs_and_device(convo, image)
-        # Run generation
         with torch.no_grad():
             output = model.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
-                do_sample=True,
                 temperature=temperature,
                 top_p=top_p,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 eos_token_id=processor.tokenizer.eos_token_id
             )
         decoded = _decode_output(inputs, output)
-        # Cleanup
-        cleanup_after_inference(inputs, output)
         return decoded, None
     except Exception as e:
-        try:
-            cleanup_after_inference(None, None)
-        except:
-            pass
-        return None, f"❌ Error during generation: {str(e)[:200]}"
 def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", custom_instruction=""):
-    """Generate caption directly with keywords and custom instructions support"""
     try:
-        if image is None:
-            return f"❌ No image provided for {tone} caption"
-        # Get tone configuration
-        tone_config = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
-        # Modify prompt based on tone and provided keywords/instructions
-        base_prompt = tone_config["prompt"]
-        # Add keywords instruction for uncensored_keywords tone
-        if tone == "uncensored_keywords" and keywords_text and keywords_text.strip():
-            base_prompt += f"\n\nKeywords to mention IF applicable: {keywords_text.strip()}"
-        # Add custom instruction to any tone if provided
-        if custom_instruction and custom_instruction.strip():
-            base_prompt += f"\n\nMake sure to mention: {custom_instruction.strip()}\nInclude this detail naturally in your description."
-        # Create conversation
         convo = [
-            {"role": "system", "content": tone_config["system"]},
             {"role": "user", "content": base_prompt}
         ]
-        # Ensure model and processor are loaded
-        if processor is None or model is None:
-            return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
-        # Use centralized generation helper
-        decoded, err = run_image_chat_generation(convo, image, max_new_tokens=150, temperature=tone_config.get("temperature", 0.7), top_p=tone_config.get("top_p", 0.9))
         if err:
             return err
-        result = (decoded or "").strip()
-        # Post-process caption (sanitize + smart corrections + truncation)
-        result = postprocess_caption(result, max_chars=max_chars)
-        # Aggressive cleanup to prevent storage overflow
-        del inputs, output
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-        gc.collect()
-        return result if result else f"❌ Empty result for {tone}"
     except Exception as e:
-        try:
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-            gc.collect()
-        except:
-            pass
-        return f"❌ Error: {str(e)[:50]}..."
 @torch.no_grad()
 def generate_engaging_only(image, custom_instruction=""):
-    """Generate only engaging caption"""
     return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def generate_casual_friend_only(image, custom_instruction=""):
-    """Generate only casual friend caption"""
     return safe_generate_caption_direct(image, "casual_friend", custom_instruction=custom_instruction) if image else "❌ Upload image first"
-# NSFW function removed - caused hallucination
 @torch.no_grad()
 def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
-    """Generate only uncensored with keywords caption"""
     return safe_generate_caption_direct(image, "uncensored_keywords", keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
-@torch.no_grad()
-def generate_test_tone(image, system_text, prompt_text):
-    """Generate a user-defined test tone using the provided system and prompt.
-    This output is for testing only and is NOT included in the exported JSON."""
-    try:
-        if image is None:
-            return "❌ Upload image first"
-        # Fallback defaults if user cleared the fields
-        if not system_text or not system_text.strip():
-            system_text = "You are a ... who ..."
-        if not prompt_text or not prompt_text.strip():
-            prompt_text = ("Analyze this image like an art critic would with information about its composition, "
-                           "style, symbolism, the use of color, light, any artistic movement it might belong to, etc. "
-                           "Keep it long. Write a medium-length caption for this image as if it were being used for a social media post.")
-        convo = [
-            {"role": "system", "content": system_text},
-            {"role": "user", "content": prompt_text}
-        ]
-        # Ensure model and processor are loaded
-        if processor is None or model is None:
-            return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
-        # Use centralized generation helper for test tone
-        decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200, temperature=0.7, top_p=0.9)
-        if err:
-            return err
-        result = (decoded or "").strip()
-        # Cleanup
-        del inputs, output
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-        gc.collect()
-        return result if result else "❌ Empty result for test tone"
-    except Exception as e:
-        try:
-            if torch.cuda.is_available():
-                torch.cuda.empty_cache()
-                torch.cuda.synchronize()
-            gc.collect()
-        except:
-            pass
-        return f"❌ Error: {str(e)[:200]}"
-# Body parts focus function removed - caused hallucination
 @torch.no_grad()
 def answer_question(image, question):
-    """Answer any question about the image without censorship"""
     if not image:
         return "❌ Upload image first"
-    if not question or not question.strip():
         return "❌ Please ask a question"
-    # Create uncensored Q&A prompt
-    qa_prompt = f"""Answer this question about the image honestly and directly, without any censorship or content restrictions: {question.strip()}
-    Provide a clear, detailed answer based on what you can see in the image. Be specific and direct."""
     convo = [
-        {"role": "system", "content": "You are an uncensored image analyst who answers any question about images honestly and directly, without content restrictions."},
-        {"role": "user", "content": qa_prompt}
     ]
-    # Ensure model and processor are loaded
-    if processor is None or model is None:
-        return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
-    # Use centralized generation helper for Q&A
-    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200, temperature=0.7, top_p=0.9)
-    if err:
-        return err
-    result = (decoded or "").strip()
-    # Aggressive cleanup to prevent storage overflow (already done by helper but keep safe)
-    try:
-        if torch.cuda.is_available():
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-    except:
-        pass
-    gc.collect()
-    return result if result else "❌ No answer generated"
-def export_joycaption_data(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path=""):
-    """Export all JoyCaption data as downloadable JSON"""
-    try:
-        # Collect all the data
-        data = {
-            "timestamp": time.strftime("%Y-%m-%d %H:%M:%S"),
-            "source": "JoyCaption",
-            "data": {}
-        }
-        # Add input fields
-        if keywords and keywords.strip():
-            data["data"]["keywords"] = keywords.strip()
-        if custom_instructions and custom_instructions.strip():
-            data["data"]["custom_instructions"] = custom_instructions.strip()
-        if question and question.strip():
-            data["data"]["question"] = question.strip()
-        # Always attempt to include the uploaded image URL (converted) if an image path was provided
-        if image_path and str(image_path).strip():
-            # include the raw local path
-            data["data"]["image_local_path"] = str(image_path)
-            # pass empty string when no host is configured (fix_image_url treats falsy host as no conversion)
-            image_url_converted = fix_image_url(image_path, host=(SPACE_HOST or ""))
-            if image_url_converted and str(image_url_converted).strip():
-                data["data"]["image_url"] = str(image_url_converted).strip()
-        # Add generated captions
-        if engaging_caption and engaging_caption.strip():
-            data["data"]["caption_engaging"] = engaging_caption.strip()
-        if casual_caption and casual_caption.strip():
-            data["data"]["caption_casual_friend"] = casual_caption.strip()
-        if keywords_caption and keywords_caption.strip():
-            data["data"]["caption_keywords"] = keywords_caption.strip()
-        if qa_answer and qa_answer.strip():
-            data["data"]["qa_answer"] = qa_answer.strip()
-        # Check if we have any data to export
-        if not data["data"]:
-            return "❌ No data  export. Generate some captions first!", None
-        # Create JSON string
-        json_string = json.dumps(data, indent=2, ensure_ascii=False)
-        # Create filename with timestamp
-        filename = f"joycaption_data_{time.strftime('%Y%m%d_%H%M%S')}.json"
-        # Return success message and file data
-        fields_count = len(data["data"])
-        return f"✅ Exported {fields_count} fields: {', '.join(data['data'].keys())}", (json_string, filename)
-    except Exception as e:
-        return f"❌ Export failed: {str(e)}", None
-# JavaScript for export functionality
-EXPORT_JS = """
-<script>
-// JoyCaption Export System
-(function() {
-    console.log('🚀 Initializing JoyCaption Export System...');
-    // Extract data from page fields
-    window.getJoyCaptionData = function() {
-        console.log('📊 Extracting JoyCaption data...');
-        const data = {};
-        // Get all textareas and inputs from the page
-        const allInputs = document.querySelectorAll('textarea, input[type="text"]');
-        allInputs.forEach((field, index) => {
-            const placeholder = (field.placeholder || '').toLowerCase();
-            const value = field.value ? field.value.trim() : '';
-            // Skip empty fields
-            if (!value) return;
-            // Map based on placeholder text and content length
-            if (placeholder.includes('engaging') || (value.length > 50 && placeholder.includes('generate engaging'))) {
-                data.caption_engaging = value;
-            } else if (placeholder.includes('casual') || placeholder.includes('friend') || (value.length > 50 && placeholder.includes('generate casual'))) {
-                data.caption_casual_friend = value;
-            } else if (placeholder.includes('keyword') && value.length > 50) {
-                data.caption_keywords = value;
-            } else if (placeholder.includes('keyword') && value.length <= 50) {
-                data.keywords = value;
-            } else if (placeholder.includes('custom') || placeholder.includes('make sure') || placeholder.includes('mention')) {
-                data.custom_instructions = value;
-            } else if (placeholder.includes('question')) {
-                data.question = value;
-            } else if (value.length > 50) {
-                // Long text likely a caption
-                if (!data.caption_engaging) data.caption_engaging = value;
-                else if (!data.caption_casual_friend) data.caption_casual_friend = value;
-                else if (!data.caption_keywords) data.caption_keywords = value;
-            }
-        });
-        // Add image URLs if present
-        const images = document.querySelectorAll('img');
-        const imageUrls = [];
-        images.forEach(img => {
-            if (img.src && !img.src.includes('data:') && !img.src.includes('blob:')) {
-                imageUrls.push(img.src);
-            }
-        });
-        if (imageUrls.length > 0) {
-            data.image_urls = imageUrls;
-        }
-        console.log('📦 Extracted data:', data);
-        return data;
-    };
-    // Listen for extension requests
-    window.addEventListener('message', function(event) {
-        if (event.data && event.data.action === 'getJoyCaptionData') {
-            const data = window.getJoyCaptionData();
-            event.source.postMessage({
-                action: 'joyCaptionData',
-                data: data,
-                success: Object.keys(data).length > 0
-            }, event.origin);
-        }
-    });
-    // Export functionality
-    window.downloadJoyCaptionData = function() {
-        try {
-            const rawData = window.getJoyCaptionData();
-            if (Object.keys(rawData).length === 0) {
-                alert('❌ No data found to export. Make sure you have generated captions first.');
-                return;
-            }
-            // Package data for export
-            const exportData = {
-                timestamp: new Date().toISOString(),
-                source: 'JoyCaption',
-                data: rawData
-            };
-            // Create and download JSON file
-            const jsonString = JSON.stringify(exportData, null, 2);
-            const blob = new Blob([jsonString], { type: 'application/json' });
-            const url = URL.createObjectURL(blob);
-            const a = document.createElement('a');
-            a.href = url;
-            a.download = `joycaption_data_${new Date().toISOString().slice(0, 16).replace(/:/g, '-')}.json`;
-            document.body.appendChild(a);
-            a.click();
-            document.body.removeChild(a);
-            URL.revokeObjectURL(url);
-            alert(`✅ Downloaded JoyCaption data with ${Object.keys(rawData).length} fields!`);
-            console.log('📥 Downloaded data:', exportData);
-        } catch (error) {
-            console.error('❌ Export error:', error);
-            alert('❌ Export failed: ' + error.message);
-        }
-    };
-    // Create export button
-    function createExportButton() {
-        // Remove any existing button first
-        const existingBtn = document.getElementById('joyCaption-export-btn');
-        if (existingBtn) existingBtn.remove();
-        // Create a floating export button
-        const exportBtn = document.createElement('button');
-        exportBtn.id = 'joyCaption-export-btn';
-        exportBtn.innerHTML = '📥 Export JoyCaption Data';
-        exportBtn.style.cssText = `
-            position: fixed;
-            top: 20px;
-            right: 20px;
-            z-index: 9999;
-            background: linear-gradient(135deg, #ff6b35, #f7931e);
-            color: white;
-            border: none;
-            padding: 12px 20px;
-            border-radius: 25px;
-            font-weight: 600;
-            cursor: pointer;
-            box-shadow: 0 4px 12px rgba(255, 107, 53, 0.3);
-            transition: all 0.3s ease;
-        `;
-        exportBtn.addEventListener('mouseover', () => {
-            exportBtn.style.transform = 'translateY(-2px)';
-            exportBtn.style.boxShadow = '0 6px 16px rgba(255, 107, 53, 0.4)';
-        });
-        exportBtn.addEventListener('mouseout', () => {
-            exportBtn.style.transform = 'translateY(0)';
-            exportBtn.style.boxShadow = '0 4px 12px rgba(255, 107, 53, 0.3)';
-        });
-        exportBtn.addEventListener('click', window.downloadJoyCaptionData);
-        document.body.appendChild(exportBtn);
-        console.log('✅ Export button created and attached to body');
-    }
-    // Multiple attempts to create button after Gradio loads
-    setTimeout(createExportButton, 1000);
-    setTimeout(createExportButton, 3000);
-    setTimeout(createExportButton, 5000);
-    // Also try when DOM changes (Gradio dynamic loading)
-    const observer = new MutationObserver(() => {
-        if (!document.getElementById('joyCaption-export-btn')) {
-            createExportButton();
-        }
-    });
-    observer.observe(document.body, { childList: true, subtree: true });
-})();
-</script>
-"""
-# Gradio Interface
 with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft()) as demo:
     gr.HTML(TITLE)
     with gr.Row():
-        # Left column - Image and controls
-        with gr.Column(scale=1):
-            image_input = gr.Image(type="filepath",
-                label="📸 Upload Image",
-                height=400
-            )
-            keywords_input = gr.Textbox(
-                placeholder="e.g., sensual, curves, intimate, alluring...",
-                label="🏷️ Keywords",
-                lines=2,
-                info="Add keywords that will be mentioned by the 'Keywords' tone ONLY if they apply to what's visible in the image"
-            )
-            # image_reference_input removed by request — we will export the actual image URL instead
-            custom_instruction_input = gr.Textbox(
-                placeholder="e.g., 'from instagram', 'the left girl has red hair', 'two girls kissing', 'beach setting'...",
-                label="🎯 Make sure that you mention:",
-                lines=2,
-                info="Any specific detail you want mentioned - context, scene details, features, etc. (Works with all tones)"
-            )
-            question_input = gr.Textbox(
-                placeholder="e.g., 'What are they doing?', 'Describe her pose', 'What's the setting?'...",
-                label="❓ Ask a Question",
-                lines=2,
-                info="Ask any question about the image - uncensored answers"
-            )
-            with gr.Row():
-                with gr.Column(scale=4):
-                    ask_question_btn = gr.Button(
-                        "❓ Ask Question",
-                        variant="secondary",
-                        size="sm"
-                    )
-                with gr.Column(scale=1, min_width=50):
-                    clear_qa_btn = gr.Button("🗑️", size="sm", variant="secondary")
-            qa_output = gr.Textbox(
-                label="",
-                lines=5,
-                max_lines=8,
-                show_copy_button=True,
-                interactive=True,
-                placeholder="Ask a question above to get uncensored answers..."
-            )
-        # Right column - Three caption outputs
         with gr.Column(scale=1):
-            # Engaging caption
-            with gr.Row():
-                with gr.Column(scale=4):
-                    generate_engaging_btn = gr.Button(
-                        "✨ Engaging",
-                        variant="primary",
-                        size="sm"
-                    )
-                with gr.Column(scale=1, min_width=50):
-                    reload_engaging = gr.Button("🔄", size="sm", variant="secondary")
-            with gr.Row():
-                with gr.Column(scale=1, min_width=50):
-                    clear_engaging_btn = gr.Button("🗑️", size="sm", variant="secondary")
-            engaging_output = gr.Textbox(
-                label="",
-                lines=5,
-                max_lines=8,
-                show_copy_button=True,
-                interactive=True,
-                placeholder="Click the button above to generate engaging caption..."
-            )
-            # Casual Friend caption
-            with gr.Row():
-                with gr.Column(scale=4):
-                    generate_friend_btn = gr.Button(
-                        "😎 Casual Friend",
-                        variant="primary",
-                        size="sm"
-                    )
-                with gr.Column(scale=1, min_width=50):
-                    reload_friend = gr.Button("🔄", size="sm", variant="secondary")
-            with gr.Row():
-                with gr.Column(scale=1, min_width=50):
-                    clear_friend_btn = gr.Button("🗑️", size="sm", variant="secondary")
-            friend_output = gr.Textbox(
-                label="",
-                lines=5,
-                max_lines=8,
-                show_copy_button=True,
-                interactive=True,
-                placeholder="Click the button above to generate casual friend caption..."
-            )
-            # NSFW section removed - caused hallucination
-            # Keywords caption
-            with gr.Row():
-                with gr.Column(scale=4):
-                    generate_uncensored_btn = gr.Button(
-                        "🔴 Keywords",
-                        variant="secondary",
-                        size="sm"
-                    )
-                with gr.Column(scale=1, min_width=50):
-                    reload_uncensored = gr.Button("🔄", size="sm", variant="secondary")
-            with gr.Row():
-                with gr.Column(scale=1, min_width=50):
-                    clear_uncensored_btn = gr.Button("🗑️", size="sm", variant="secondary")
-            uncensored_output = gr.Textbox(
-                label="",
-                lines=5,
-                max_lines=8,
-                show_copy_button=True,
-                interactive=True,
-                placeholder="Click the button above to generate keywords caption..."
-            )
-            # Body Parts Focus section removed - caused hallucination
-            # Descriptive text removed for cleaner interface
-            # Export functionality
-            with gr.Row():
-                export_btn = gr.Button(
-                    "📥 Export All Data (JSON)",
-                    variant="primary",
-                    size="lg"
-                )
-            export_output = gr.Textbox(
-                label="Export Status",
-                lines=2,
-                interactive=False,
-                visible=False
-            )
-            export_file = gr.File(
-                label="Download JSON",
-                visible=False
-            )
-            # --- Test Tone UI (for local testing only; not included in exported JSON) ---
-            system_test = gr.Textbox(
-                label="Test Tone System",
-                value="You are a ... who ...",
-                lines=1,
-                info="Editable system message for the test tone"
-            )
-            prompt_test = gr.Textbox(
-                label="Test Tone Prompt",
-                value="""Analyze this image like an art critic would with information about its composition, style, symbolism, the use of color, light, any artistic movement it might belong to, etc. Keep it long. Write a medium-length caption for this image as if it were being used for a social media post.""",
-                lines=6,
-                info="Editable user prompt for the test tone"
-            )
-            test_btn = gr.Button(
-                "🔬 Test Tone",
-                variant="secondary",
-                size="sm"
-            )
-            test_output = gr.Textbox(
-                label="Test Tone Output",
-                lines=5,
-                max_lines=8,
-                show_copy_button=True,
-                interactive=True,
-                placeholder="Click the button above to run the test tone..."
-            )
-    # Individual generate button handlers
-    generate_engaging_btn.click(
-        generate_engaging_only,
-        inputs=[image_input, custom_instruction_input],
-        outputs=engaging_output,
-        show_progress=True
-    )
-    generate_friend_btn.click(
-        generate_casual_friend_only,
-        inputs=[image_input, custom_instruction_input],
-        outputs=friend_output,
-        show_progress=True
-    )
-    # NSFW button handler removed
-    generate_uncensored_btn.click(
-        generate_uncensored_keywords_only,
-        inputs=[image_input, keywords_input, custom_instruction_input],
-        outputs=uncensored_output,
-        show_progress=True
-    )
-    test_btn.click(
-        generate_test_tone,
-        inputs=[image_input, system_test, prompt_test],
-        outputs=test_output,
-        show_progress=True
-    )
-    # Body Parts Focus button handler removed
-    # Individual reload buttons - using direct generation for consistency
-    def reload_engaging_fn(image, custom_instruction):
-        return generate_engaging_only(image, custom_instruction) if image else "❌ Upload image first"
-    def reload_friend_fn(image, custom_instruction):
-        return generate_casual_friend_only(image, custom_instruction) if image else "❌ Upload image first"
-    # NSFW reload function removed
-    def reload_uncensored_fn(image, keywords, custom_instruction):
-        return generate_uncensored_keywords_only(image, keywords, custom_instruction) if image else "❌ Upload image first"
-    # Body Parts Focus reload function removed
-    reload_engaging.click(
-        reload_engaging_fn,
-        inputs=[image_input, custom_instruction_input],
-        outputs=engaging_output,
-        show_progress=True
-    )
-    reload_friend.click(
-        reload_friend_fn,
-        inputs=[image_input, custom_instruction_input],
-        outputs=friend_output,
-        show_progress=True
-    )
-    # NSFW reload click handler removed
-    reload_uncensored.click(
-        reload_uncensored_fn,
-        inputs=[image_input, keywords_input, custom_instruction_input],
-        outputs=uncensored_output,
-        show_progress=True
-    )
-    # Body Parts Focus reload click handler removed
-    # Q&A functionality
-    ask_question_btn.click(
-        answer_question,
-        inputs=[image_input, question_input],
-        outputs=qa_output,
-        show_progress=True
-    )
-    # Clear button functions
-    def clear_text():
-        return ""
-    clear_qa_btn.click(
-        clear_text,
-        outputs=qa_output
-    )
-    clear_engaging_btn.click(
-        clear_text,
-        outputs=engaging_output
-    )
-    clear_friend_btn.click(
-        clear_text,
-        outputs=friend_output
-    )
-    # NSFW clear button handler removed
-    clear_uncensored_btn.click(
-        clear_text,
-        outputs=uncensored_output
-    )
-    # Export functionality
-    def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path):
-        """Handle export and return proper file download (cross-platform, uses tempdir)"""
-        message, file_data = export_joycaption_data(
-            keywords, custom_instructions, question,
-            engaging_caption, casual_caption, keywords_caption, qa_answer, image_path
-        )
-        if file_data:
-            json_string, filename = file_data
-            # Use the OS temp directory so this works on Windows, macOS, Linux and in Spaces
-            base_dir = tempfile.gettempdir()
-            temp_file = os.path.join(base_dir, filename)
-            with open(temp_file, 'w', encoding='utf-8') as f:
-                f.write(json_string)
-            return gr.update(value=message, visible=True), gr.update(value=temp_file, visible=True)
-        else:
-            return gr.update(value=message, visible=True), gr.update(visible=False)
-    export_btn.click(
-        handle_export,
-        inputs=[
-            keywords_input,
-            custom_instruction_input,
-            question_input,
-            engaging_output,
-            friend_output,
-            uncensored_output,
-            qa_output,
-            image_input
-        ],
-        outputs=[export_output, export_file]
-    )
-    # Body Parts Focus clear button handler removed
 if __name__ == "__main__":
     demo.launch()

 try:
     import spaces
     if not hasattr(spaces, 'GPU'):
         def _spaces_gpu(*args, **kwargs):
             def _wrap(f):
             return _wrap
         spaces.GPU = _spaces_gpu
 except Exception:
     import types
     spaces = types.SimpleNamespace()
     def _spaces_gpu(*args, **kwargs):
             return f
         return _wrap
     spaces.GPU = _spaces_gpu
 @spaces.GPU()
 def _joycaption_register_gpu():
+    """No-op function decorated with @spaces.GPU to satisfy Spaces startup detection.
+
+    It does no work and always returns None; it exists only so that the module
+    contains at least one GPU-decorated callable.
+    """
     return None
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
+import tempfile, gc, time, os, shutil, json, re
 from pathlib import Path
 from hf_space_utils import fix_image_url, postprocess_caption
+# --- Cache dirs redirected to temp ---
 _tmpdir = tempfile.gettempdir()
 os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
 os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
 os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
 os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
+# --- Model path ---
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
+# --- Cleanup helper ---
 def cleanup_storage():
+    """Best-effort removal of the redirected cache directories plus memory cleanup.
+
+    Deletes the directories named by the HF_HOME, TRANSFORMERS_CACHE and
+    TORCH_HOME environment variables (when set and present), runs the garbage
+    collector, and empties the CUDA cache when CUDA is available. Any exception
+    is printed as a warning and swallowed; nothing is returned.
+    """
     try:
+        # HF_DATASETS_CACHE is not part of this list, so it is left untouched.
         temp_dirs = [
             os.environ.get("HF_HOME"),
             os.environ.get("TRANSFORMERS_CACHE"),
             os.environ.get("TORCH_HOME")
         ]
         for temp_dir in temp_dirs:
+            if temp_dir and os.path.exists(temp_dir):
+                shutil.rmtree(temp_dir, ignore_errors=True)
         gc.collect()
         if torch.cuda.is_available():
             torch.cuda.empty_cache()
             torch.cuda.synchronize()
         print("✅ Storage cleanup completed")
     except Exception as e:
+        # Cleanup is optional: report the problem but never abort the caller.
+        print(f"⚠️ Cleanup warning: {e}")
 TITLE = """
 <div style="text-align: center; margin: 20px 0;">
 <hr>
 """
+print("🚀 Loading JoyCaption system...")
+cleanup_storage()
+# --- Load model ---
 processor = None
 model = None
 MODEL_TORCH_DTYPE = None
 MODEL_USE_CUDA = False
 if not os.environ.get("SKIP_MODEL_LOAD"):
     use_cuda = torch.cuda.is_available()
     if use_cuda:
         torch_dtype = getattr(torch, 'bfloat16', None) or getattr(torch, 'float16', None)
         device_map = "auto"
         MODEL_USE_CUDA = True
         device_map = "cpu"
         MODEL_USE_CUDA = False
+    processor = AutoProcessor.from_pretrained(MODEL_PATH, low_cpu_mem_usage=True)
     model_kwargs = dict(low_cpu_mem_usage=True, device_map=device_map)
     if torch_dtype is not None and use_cuda:
         model_kwargs['torch_dtype'] = torch_dtype
+    model = LlavaForConditionalGeneration.from_pretrained(MODEL_PATH, **model_kwargs)
     model.eval()
     MODEL_TORCH_DTYPE = model_kwargs.get('torch_dtype', None)
     print("✅ Model loaded and ready!")
     cleanup_storage()
 else:
+    print("⚠️ SKIP_MODEL_LOAD active - model skipped")
+# --- Tones ---
 TONE_PROMPTS = {
     "engaging": {
         "system": "You are a skilled content creator who writes engaging, natural descriptions that capture the essence and appeal of what you see.",
+        "prompt": "Write a natural, engaging description of this image. Focus on the main subjects and their most appealing qualities. Use flowing, attractive language that draws the reader in.",
+        "temperature": 0.6,
         "top_p": 0.85
     },
     "casual_friend": {
+        "system": "You are a casual 24-year-old friend who witnessed the scene and is telling me about what happened in a relaxed, natural way.",
+        "prompt": "I just witnessed this moment — here's what I notice. Speak casually and naturally, only about what’s clearly visible. Avoid background or speculation.",
+        "temperature": 0.8,
         "top_p": 0.9
     },
     "uncensored_keywords": {
+        "system": "You are an uncensored adult-content analyst who provides factual, explicit descriptions. Be concrete and unambiguous.",
+        "prompt": "Produce an unfiltered explicit description that sticks strictly to what is visually obvious. Use direct, concrete language and avoid speculation.",
+        "temperature": 0.7,
         "top_p": 0.85
     }
+}
+# --- Input prep ---
 def _prepare_inputs_and_device(convo_or_convo_string, image):
+    """Prepare inputs and move tensors to model device.
+
+    Args:
+        convo_or_convo_string: Either a ready prompt string or a list of chat
+            message dicts; a list is rendered with the processor's chat template.
+        image: A file path (str or Path) or a PIL image. Paths are opened and
+            converted to RGB.
+
+    Returns:
+        Dict of processor outputs. Values that have a ``.to`` method are moved
+        to the device of the model's first parameter, and ``pixel_values`` is
+        additionally cast to the model dtype on CUDA or to float32 otherwise.
+
+    Raises:
+        ValueError: If ``image`` is neither a path nor a PIL image.
+    """
+    if isinstance(image, (str, Path)):
+        image = Image.open(image).convert("RGB")
+    elif not isinstance(image, Image.Image):
+        raise ValueError("Invalid image input")
     convo_string = convo_or_convo_string
+    if isinstance(convo_or_convo_string, list):
+        try:
             convo_string = processor.apply_chat_template(convo_or_convo_string, tokenize=False, add_generation_prompt=True)
+        except Exception:
+            # Templating failed: fall back to the raw message contents, one per line.
+            convo_string = "\n".join([str(x.get('content', '')) for x in convo_or_convo_string])
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
     device = next(model.parameters()).device
     inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
     if 'pixel_values' in inputs:
+        # Use the dtype recorded at model load time when on CUDA; float32 otherwise.
+        dtype = MODEL_TORCH_DTYPE if MODEL_USE_CUDA and MODEL_TORCH_DTYPE else torch.float32
+        inputs['pixel_values'] = inputs['pixel_values'].to(dtype)
     return inputs
+# --- Output decode ---
 def _decode_output(inputs, output):
+    """Decode only the newly generated tokens of the first output sequence.
+
+    Args:
+        inputs: Processor outputs used for generation; the length of
+            ``inputs['input_ids']`` (when present) is the number of prompt
+            tokens skipped at the start of the output.
+        output: Token ids returned by ``model.generate`` (tensor or sequence),
+            or None.
+
+    Returns:
+        The decoded text with surrounding whitespace stripped, or "" when there
+        is no output or decoding fails.
+    """
+    # Do not use ``not output`` here: the truth value of a multi-element tensor
+    # is ambiguous and raises RuntimeError, which would fail every generation.
+    if output is None or len(output) == 0:
         return ""
     try:
+        input_len = inputs['input_ids'].shape[1] if 'input_ids' in inputs else 0
+        decoded = processor.tokenizer.decode(output[0][input_len:], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        return decoded.strip()
     except Exception:
+        return ""
+def cleanup_after_inference():
+    """Empty the CUDA cache (when CUDA is available) and run the garbage collector."""
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
+        torch.cuda.synchronize()
     gc.collect()
+# --- Generation ---
 def run_image_chat_generation(convo, image, max_new_tokens=150, temperature=0.7, top_p=0.9):
+    """Run one greedy generation pass for ``convo`` about ``image``.
+
+    Args:
+        convo: Chat messages (list of role/content dicts) or a ready prompt string.
+        image: Image path or PIL image, as accepted by ``_prepare_inputs_and_device``.
+        max_new_tokens: Upper bound on the number of generated tokens.
+        temperature: Kept for backward compatibility; unused because decoding
+            is greedy (``do_sample=False``).
+        top_p: Kept for backward compatibility; unused for the same reason.
+
+    Returns:
+        ``(text, None)`` on success, or ``(None, error_message)`` when the model
+        is not loaded or generation fails.
+    """
     if processor is None or model is None:
+        return None, "❌ Model not initialized."
     try:
         inputs = _prepare_inputs_and_device(convo, image)
         with torch.no_grad():
             output = model.generate(
                 **inputs,
                 max_new_tokens=max_new_tokens,
+                # Greedy decoding keeps captions deterministic. Sampling knobs
+                # (temperature/top_p) have no effect in this mode and only make
+                # transformers warn, so they are not forwarded.
+                do_sample=False,
+                repetition_penalty=1.05,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
                 eos_token_id=processor.tokenizer.eos_token_id
             )
+        return _decode_output(inputs, output), None
     except Exception as e:
+        return None, f"❌ Generation error: {str(e)[:200]}"
+    finally:
+        # Release cached memory on both the success and the error path.
+        cleanup_after_inference()
+# --- Caption generation ---
 def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", custom_instruction=""):
+    """Generate a caption for ``image`` in the requested tone.
+
+    Args:
+        image: Image path or PIL image forwarded to ``run_image_chat_generation``.
+        tone: Key into ``TONE_PROMPTS``; unknown keys fall back to "engaging".
+        max_chars: Forwarded to ``postprocess_caption`` as ``max_chars``.
+        keywords_text: Extra keywords, appended to the prompt only for the
+            "uncensored_keywords" tone and only when non-blank.
+        custom_instruction: Optional detail appended to the prompt for any tone.
+
+    Returns:
+        The post-processed caption, or a "❌ ..." message string when generation
+        fails, the result is empty, or an exception is raised.
+    """
     try:
+        tone_conf = TONE_PROMPTS.get(tone, TONE_PROMPTS["engaging"])
+        base_prompt = tone_conf["prompt"]
+        if tone == "uncensored_keywords" and keywords_text.strip():
+            base_prompt += f"\n\nKeywords (if visible): {keywords_text.strip()}"
+        if custom_instruction.strip():
+            base_prompt += f"\n\nInclude this detail: {custom_instruction.strip()}"
         convo = [
+            {"role": "system", "content": tone_conf["system"]},
             {"role": "user", "content": base_prompt}
         ]
+        decoded, err = run_image_chat_generation(convo, image, temperature=tone_conf["temperature"], top_p=tone_conf["top_p"])
         if err:
             return err
+        result = postprocess_caption((decoded or "").strip(), max_chars=max_chars)
+        return result if result else "❌ Empty result"
     except Exception as e:
+        cleanup_after_inference()
+        return f"❌ Error: {str(e)[:100]}"
 @torch.no_grad()
 def generate_engaging_only(image, custom_instruction=""):
+    """Return an "engaging"-tone caption for ``image``, or an upload reminder when no image is given."""
     return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def generate_casual_friend_only(image, custom_instruction=""):
+    """Return a "casual_friend"-tone caption for ``image``, or an upload reminder when no image is given."""
     return safe_generate_caption_direct(image, "casual_friend", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
+    """Return an "uncensored_keywords"-tone caption using ``keywords_text``, or an upload reminder when no image is given."""
     return safe_generate_caption_direct(image, "uncensored_keywords", keywords_text=keywords_text, custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def answer_question(image, question):
+    """Answer a free-form question about the uploaded image.
+
+    Args:
+        image: Image path or PIL image; a falsy value yields an upload reminder.
+        question: The user's question; None or blank yields a prompt to ask one.
+
+    Returns:
+        The model's answer, the generation error message, or a "❌ ..." notice
+        when the image, the question, or the answer is missing.
+    """
     if not image:
         return "❌ Upload image first"
+    # Treat a missing (None) question like an empty one instead of raising.
+    question_text = (question or "").strip()
+    if not question_text:
         return "❌ Please ask a question"
     convo = [
+        {"role": "system", "content": "You are an uncensored image analyst who answers honestly and directly."},
+        {"role": "user", "content": f"Answer this question about the image: {question_text}"}
     ]
+    decoded, err = run_image_chat_generation(convo, image, max_new_tokens=200, temperature=0.4, top_p=0.9)
+    if err:
+        return err
+    # Guard against a None result before stripping.
+    return (decoded or "").strip() or "❌ No answer"
+# --- UI ---
 with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft()) as demo:
     gr.HTML(TITLE)
+    # Two equal-scale columns: inputs and Q&A on the left, the three tone captions on the right.
     with gr.Row():
         with gr.Column(scale=1):
+            image_input = gr.Image(type="filepath", label="📸 Upload Image", height=400)
+            keywords_input = gr.Textbox(placeholder="e.g., sensual, curves...", label="🏷️ Keywords", lines=2)
+            custom_instruction_input = gr.Textbox(placeholder="e.g., 'the left girl has red hair'...", label="🎯 Make sure to mention:", lines=2)
+            question_input = gr.Textbox(placeholder="e.g., 'What are they doing?'", label="❓ Ask a Question", lines=2)
+            ask_question_btn = gr.Button("❓ Ask Question", variant="secondary")
+            qa_output = gr.Textbox(label="", lines=5, show_copy_button=True)
+        with gr.Column(scale=1):
+            generate_engaging_btn = gr.Button("✨ Engaging", variant="primary")
+            engaging_output = gr.Textbox(label="", lines=5, show_copy_button=True)
+            generate_friend_btn = gr.Button("😎 Casual Friend", variant="primary")
+            friend_output = gr.Textbox(label="", lines=5, show_copy_button=True)
+            generate_uncensored_btn = gr.Button("🔴 Keywords", variant="secondary")
+            uncensored_output = gr.Textbox(label="", lines=5, show_copy_button=True)
+    # Each button feeds the image (plus its text inputs) to one generator and writes to its own textbox.
+    generate_engaging_btn.click(generate_engaging_only, [image_input, custom_instruction_input], engaging_output)
+    generate_friend_btn.click(generate_casual_friend_only, [image_input, custom_instruction_input], friend_output)
+    generate_uncensored_btn.click(generate_uncensored_keywords_only, [image_input, keywords_input, custom_instruction_input], uncensored_output)
+    ask_question_btn.click(answer_question, [image_input, question_input], qa_output)
 if __name__ == "__main__":
     demo.launch()