Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Community

nickdigger committed on Oct 10, 2025

Commit

cfdf4b3

verified ·

1 Parent(s): 6e12f0b

Update app.py

Browse files

Files changed (1) hide show

app.py +160 -81

app.py CHANGED Viewed

@@ -1,8 +1,32 @@
-import spaces
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
 import gc
 import time
 import gc
@@ -10,26 +34,42 @@ import os
 import shutil
 import json
 from pathlib import Path
 from hf_space_utils import fix_image_url
-# Storage optimization - redirect cache to temporary directories
-os.environ["HF_HOME"] = "/tmp/hf_cache"
-os.environ["TRANSFORMERS_CACHE"] = "/tmp/transformers_cache"
-os.environ["HF_DATASETS_CACHE"] = "/tmp/datasets_cache"
-os.environ["TORCH_HOME"] = "/tmp/torch_cache"
 # Model configuration
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
 def cleanup_storage():
     """Clean up temporary files and caches to prevent storage overflow"""
     try:
-        # Clean up temporary caches
-        temp_dirs = ["/tmp/hf_cache", "/tmp/transformers_cache", "/tmp/datasets_cache", "/tmp/torch_cache"]
         for temp_dir in temp_dirs:
             if os.path.exists(temp_dir):
-                shutil.rmtree(temp_dir, ignore_errors=True)
         # Force garbage collection
         gc.collect()
@@ -56,22 +96,47 @@ print("🚀 Loading Sequential Three-Tone JoyCaption system... v2.1")
 # Load model and processor at startup
 print("📦 Loading model and processor at startup...")
-processor = AutoProcessor.from_pretrained(
-    MODEL_PATH,
-    low_cpu_mem_usage=True
-)
-model = LlavaForConditionalGeneration.from_pretrained(
-    MODEL_PATH,
-    torch_dtype=torch.bfloat16,
-    device_map="auto",
-    low_cpu_mem_usage=True
-)
-model.eval()
-print("✅ Model loaded and ready!")
-# Initial cleanup after model loading
-cleanup_storage()
 # Optimized 5-tone prompts with better temperature control
 # Temperature: Lower for prompt adherence, higher for word variety
@@ -112,25 +177,32 @@ def apply_smart_corrections(text):
         r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
         # Nudity precision corrections
-        r'\btopless women\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
-        r'\btopless woman\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
         # Person count corrections
-        r'\bthree women\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
-        r'\bfour women\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
         # Clothing precision
-        r'\bwearing nothing\b': 'nude',
-        r'\bnot wearing.*clothes\b': 'nude',
-        r'\bcompletely naked\b': 'nude',
-        r'\bfully nude\b': 'nude',
     }
     corrected_text = text
     try:
         for pattern, replacement in corrections.items():
             if callable(replacement):
-                corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
             else:
                 corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
     except Exception as e:
@@ -165,14 +237,27 @@ def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", c
             {"role": "user", "content": base_prompt}
         ]
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
         device = next(model.parameters()).device
         inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
         if 'pixel_values' in inputs:
-            inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
         # Get tone-specific generation parameters
         temperature = tone_config.get("temperature", 0.7)
@@ -240,14 +325,11 @@ def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", c
             pass
         return f"❌ Error: {str(e)[:50]}..."
-# Individual GPU-decorated functions for all 3 tones
-@spaces.GPU(duration=45)
 @torch.no_grad()
 def generate_engaging_only(image, custom_instruction=""):
     """Generate only engaging caption"""
     return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
-@spaces.GPU(duration=45)
 @torch.no_grad()
 def generate_casual_friend_only(image, custom_instruction=""):
     """Generate only casual friend caption"""
@@ -255,7 +337,6 @@ def generate_casual_friend_only(image, custom_instruction=""):
 # NSFW function removed - caused hallucination
-@spaces.GPU(duration=45)
 @torch.no_grad()
 def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
     """Generate only uncensored with keywords caption"""
@@ -263,7 +344,6 @@ def generate_uncensored_keywords_only(image, keywords_text, custom_instruction="
 # Body parts focus function removed - caused hallucination
-@spaces.GPU(duration=45)
 @torch.no_grad()
 def answer_question(image, question):
     """Answer any question about the image without censorship"""
@@ -282,14 +362,25 @@ def answer_question(image, question):
         {"role": "user", "content": qa_prompt}
     ]
     convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
     device = next(model.parameters()).device
     inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
     if 'pixel_values' in inputs:
-        inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
     with torch.no_grad():
         output = model.generate(
@@ -345,7 +436,13 @@ def export_joycaption_data(keywords, custom_instructions, question, engaging_cap
         if question and question.strip():
             data["data"]["question"] = question.strip()
-            image_url_converted = fix_image_url(image_path)
             if image_url_converted and str(image_url_converted).strip():
                 data["data"]["image_url"] = str(image_url_converted).strip()
         # Add generated captions
@@ -392,7 +489,7 @@ EXPORT_JS = """
         // Get all textareas and inputs from the page
         const allInputs = document.querySelectorAll('textarea, input[type="text"]');
         allInputs.forEach((field, index) => {
             const placeholder = (field.placeholder || '').toLowerCase();
             const value = field.value ? field.value.trim() : '';
@@ -565,13 +662,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
                 lines=2,
                 info="Add keywords that will be mentioned by the 'Keywords' tone ONLY if they apply to what's visible in the image"
             )
-                placeholder="e.g., blonde_girl_001.jpg, Instagram photo, OnlyFans pic...",
-                label="🖼️ Image Reference",
-                lines=1,
-                info="Image filename or description for your reference (will be exported)"
-            )
             custom_instruction_input = gr.Textbox(
                 placeholder="e.g., 'from instagram', 'the left girl has red hair', 'two girls kissing', 'beach setting'...",
@@ -630,7 +721,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
                 interactive=True,
                 placeholder="Click the button above to generate engaging caption..."
             )
             # Casual Friend caption
             with gr.Row():
                 with gr.Column(scale=4):
@@ -652,9 +743,9 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
                 interactive=True,
                 placeholder="Click the button above to generate casual friend caption..."
             )
             # NSFW section removed - caused hallucination
             # Keywords caption
             with gr.Row():
                 with gr.Column(scale=4):
@@ -676,11 +767,11 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
                 interactive=True,
                 placeholder="Click the button above to generate keywords caption..."
             )
             # Body Parts Focus section removed - caused hallucination
             # Descriptive text removed for cleaner interface
             # Export functionality
             with gr.Row():
                 export_btn = gr.Button(
@@ -717,7 +808,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
     )
     # NSFW button handler removed
     generate_uncensored_btn.click(
         generate_uncensored_keywords_only,
         inputs=[image_input, keywords_input, custom_instruction_input],
@@ -726,7 +817,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
     )
     # Body Parts Focus button handler removed
     # Individual reload buttons - using direct generation for consistency
     def reload_engaging_fn(image, custom_instruction):
         return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
@@ -801,31 +892,18 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
     )
     # Export functionality
-    def handle_export():
-        """Handle the export button click"""
-        # Get current values from all fields
-        return export_joycaption_data(
-    keywords_input.value or "",
-    custom_instruction_input.value or "",
-    question_input.value or "",
-    engaging_output.value or "",
-    friend_output.value or "",
-    uncensored_output.value or "",
-    qa_output.value or "",
-    image_input.value or ""
-)
     def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path):
-        """Handle export and return proper file download"""
         message, file_data = export_joycaption_data(
             keywords, custom_instructions, question,
             engaging_caption, casual_caption, keywords_caption, qa_answer, image_path
         )
         if file_data:
             json_string, filename = file_data
-            # Create temporary file for download
-            temp_file = f"C:\\Users\\Andrei\\{filename}"
             with open(temp_file, 'w', encoding='utf-8') as f:
                 f.write(json_string)
             return gr.update(value=message, visible=True), gr.update(value=temp_file, visible=True)
@@ -841,7 +919,8 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
             engaging_output,
             friend_output,
             uncensored_output,
-            qa_output
         ],
         outputs=[export_output, export_file]
     )

+"""
+Copy of the full `app.py` into the deploy folder for direct upload.
+This file is a snapshot of the application's main entrypoint and should be
+identical to the root `app.py` when uploading to Hugging Face Spaces.
+"""
+try:
+    import spaces
+    # Ensure spaces.GPU exists and is a decorator
+    if not hasattr(spaces, 'GPU'):
+        def _spaces_gpu(*args, **kwargs):
+            def _wrap(f):
+                return f
+            return _wrap
+        spaces.GPU = _spaces_gpu
+except Exception:
+    # Provide a no-op spaces with a GPU decorator fallback so app can run outside HF Spaces
+    import types
+    spaces = types.SimpleNamespace()
+    def _spaces_gpu(*args, **kwargs):
+        def _wrap(f):
+            return f
+        return _wrap
+    spaces.GPU = _spaces_gpu
 import gradio as gr
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 from PIL import Image
+import tempfile
 import gc
 import time
 import gc
 import shutil
 import json
 from pathlib import Path
+import re
 from hf_space_utils import fix_image_url
+# Storage optimization - redirect cache to temporary directories (platform independent)
+_tmpdir = tempfile.gettempdir()
+os.environ["HF_HOME"] = os.path.join(_tmpdir, "hf_cache")
+os.environ["TRANSFORMERS_CACHE"] = os.path.join(_tmpdir, "transformers_cache")
+os.environ["HF_DATASETS_CACHE"] = os.path.join(_tmpdir, "datasets_cache")
+os.environ["TORCH_HOME"] = os.path.join(_tmpdir, "torch_cache")
 # Model configuration
 MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
+# Optional public host for converting /tmp/gradio paths to public gradio_api URLs
+SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
 def cleanup_storage():
     """Clean up temporary files and caches to prevent storage overflow"""
     try:
+        # Clean up temporary caches using the configured environment paths
+        temp_dirs = [
+            os.environ.get("HF_HOME"),
+            os.environ.get("TRANSFORMERS_CACHE"),
+            os.environ.get("HF_DATASETS_CACHE"),
+            os.environ.get("TORCH_HOME")
+        ]
         for temp_dir in temp_dirs:
+            if not temp_dir:
+                continue
             if os.path.exists(temp_dir):
+                try:
+                    shutil.rmtree(temp_dir, ignore_errors=True)
+                except Exception:
+                    # best-effort cleanup
+                    pass
         # Force garbage collection
         gc.collect()
 # Load model and processor at startup
 print("📦 Loading model and processor at startup...")
+processor = None
+model = None
+MODEL_TORCH_DTYPE = None
+MODEL_USE_CUDA = False
+# Allow skipping model loading for tests or light-weight runs by setting SKIP_MODEL_LOAD=1
+if not os.environ.get("SKIP_MODEL_LOAD"):
+    # Determine target device for model loading. On zero-GPU spaces, fall back to CPU.
+    use_cuda = torch.cuda.is_available()
+    if use_cuda:
+        # Prefer bf16 on supported GPUs, otherwise try float16
+        torch_dtype = getattr(torch, 'bfloat16', None) or getattr(torch, 'float16', None)
+        device_map = "auto"
+        MODEL_USE_CUDA = True
+    else:
+        torch_dtype = None
+        device_map = "cpu"
+        MODEL_USE_CUDA = False
+    processor = AutoProcessor.from_pretrained(
+        MODEL_PATH,
+        low_cpu_mem_usage=True
+    )
+    model_kwargs = dict(low_cpu_mem_usage=True, device_map=device_map)
+    if torch_dtype is not None and use_cuda:
+        model_kwargs['torch_dtype'] = torch_dtype
+    model = LlavaForConditionalGeneration.from_pretrained(
+        MODEL_PATH,
+        **model_kwargs
+    )
+    model.eval()
+    # remember dtype for later tensor conversions
+    MODEL_TORCH_DTYPE = model_kwargs.get('torch_dtype', None)
+    print("✅ Model loaded and ready!")
+    # Initial cleanup after model loading
+    cleanup_storage()
+else:
+    print("⚠️ SKIP_MODEL_LOAD is set — skipping heavy model initialization (test mode)")
 # Optimized 5-tone prompts with better temperature control
 # Temperature: Lower for prompt adherence, higher for word variety
         r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
         # Nudity precision corrections
+        r'\\btopless women\\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
+        r'\\btopless woman\\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
         # Person count corrections
+        r'\\bthree women\\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
+        r'\\bfour women\\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
         # Clothing precision
+        r'\\bwearing nothing\\b': 'nude',
+        r'\\bnot wearing.*clothes\\b': 'nude',
+        r'\\bcompletely naked\\b': 'nude',
+        r'\\bfully nude\\b': 'nude',
     }
     corrected_text = text
     try:
         for pattern, replacement in corrections.items():
             if callable(replacement):
+                # Wrap the replacement to ensure it returns a string and accepts a Match
+                def _repl(match, rep=replacement):
+                    try:
+                        out = rep(match)
+                        return "" if out is None else str(out)
+                    except Exception:
+                        return match.group(0)
+                corrected_text = re.sub(pattern, _repl, corrected_text, flags=re.IGNORECASE)
             else:
                 corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
     except Exception as e:
             {"role": "user", "content": base_prompt}
         ]
+        # Ensure model and processor are loaded
+        if processor is None or model is None:
+            return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
         device = next(model.parameters()).device
         inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
+        # Safely convert pixel tensor dtype depending on runtime capabilities
         if 'pixel_values' in inputs:
+            if MODEL_USE_CUDA and MODEL_TORCH_DTYPE is not None:
+                try:
+                    inputs['pixel_values'] = inputs['pixel_values'].to(MODEL_TORCH_DTYPE)
+                except Exception:
+                    # fallback to float32
+                    inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
+            else:
+                # CPU fallback
+                inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
         # Get tone-specific generation parameters
         temperature = tone_config.get("temperature", 0.7)
             pass
         return f"❌ Error: {str(e)[:50]}..."
 @torch.no_grad()
 def generate_engaging_only(image, custom_instruction=""):
     """Generate only engaging caption"""
     return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
 @torch.no_grad()
 def generate_casual_friend_only(image, custom_instruction=""):
     """Generate only casual friend caption"""
 # NSFW function removed - caused hallucination
 @torch.no_grad()
 def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
     """Generate only uncensored with keywords caption"""
 # Body parts focus function removed - caused hallucination
 @torch.no_grad()
 def answer_question(image, question):
     """Answer any question about the image without censorship"""
         {"role": "user", "content": qa_prompt}
     ]
+    # Ensure model and processor are loaded
+    if processor is None or model is None:
+        return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
     convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
     inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
     device = next(model.parameters()).device
     inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
+    # Safely convert pixel_values dtype depending on runtime
     if 'pixel_values' in inputs:
+        if MODEL_USE_CUDA and MODEL_TORCH_DTYPE is not None:
+            try:
+                inputs['pixel_values'] = inputs['pixel_values'].to(MODEL_TORCH_DTYPE)
+            except Exception:
+                inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
+        else:
+            inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
     with torch.no_grad():
         output = model.generate(
         if question and question.strip():
             data["data"]["question"] = question.strip()
+        # Always attempt to include the uploaded image URL (converted) if an image path was provided
+        if image_path and str(image_path).strip():
+            # include the raw local path
+            data["data"]["image_local_path"] = str(image_path)
+            # pass empty string when no host is configured (fix_image_url treats falsy host as no conversion)
+            image_url_converted = fix_image_url(image_path, host=(SPACE_HOST or ""))
             if image_url_converted and str(image_url_converted).strip():
                 data["data"]["image_url"] = str(image_url_converted).strip()
         # Add generated captions
         // Get all textareas and inputs from the page
         const allInputs = document.querySelectorAll('textarea, input[type="text"]');
         allInputs.forEach((field, index) => {
             const placeholder = (field.placeholder || '').toLowerCase();
             const value = field.value ? field.value.trim() : '';
                 lines=2,
                 info="Add keywords that will be mentioned by the 'Keywords' tone ONLY if they apply to what's visible in the image"
             )
+            # image_reference_input removed by request — we will export the actual image URL instead
             custom_instruction_input = gr.Textbox(
                 placeholder="e.g., 'from instagram', 'the left girl has red hair', 'two girls kissing', 'beach setting'...",
                 interactive=True,
                 placeholder="Click the button above to generate engaging caption..."
             )
             # Casual Friend caption
             with gr.Row():
                 with gr.Column(scale=4):
                 interactive=True,
                 placeholder="Click the button above to generate casual friend caption..."
             )
             # NSFW section removed - caused hallucination
             # Keywords caption
             with gr.Row():
                 with gr.Column(scale=4):
                 interactive=True,
                 placeholder="Click the button above to generate keywords caption..."
             )
             # Body Parts Focus section removed - caused hallucination
             # Descriptive text removed for cleaner interface
             # Export functionality
             with gr.Row():
                 export_btn = gr.Button(
     )
     # NSFW button handler removed
     generate_uncensored_btn.click(
         generate_uncensored_keywords_only,
         inputs=[image_input, keywords_input, custom_instruction_input],
     )
     # Body Parts Focus button handler removed
     # Individual reload buttons - using direct generation for consistency
     def reload_engaging_fn(image, custom_instruction):
         return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
     )
     # Export functionality
     def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path):
+        """Handle export and return proper file download (cross-platform, uses tempdir)"""
         message, file_data = export_joycaption_data(
             keywords, custom_instructions, question,
             engaging_caption, casual_caption, keywords_caption, qa_answer, image_path
         )
         if file_data:
             json_string, filename = file_data
+            # Use the OS temp directory so this works on Windows, macOS, Linux and in Spaces
+            base_dir = tempfile.gettempdir()
+            temp_file = os.path.join(base_dir, filename)
             with open(temp_file, 'w', encoding='utf-8') as f:
                 f.write(json_string)
             return gr.update(value=message, visible=True), gr.update(value=temp_file, visible=True)
             engaging_output,
             friend_output,
             uncensored_output,
+            qa_output,
+            image_input
         ],
         outputs=[export_output, export_file]
     )