Spaces:
Running on Zero
Running on Zero
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,8 +1,32 @@
|
|
| 1 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
import torch
|
| 4 |
from transformers import LlavaForConditionalGeneration, AutoProcessor
|
| 5 |
from PIL import Image
|
|
|
|
| 6 |
import gc
|
| 7 |
import time
|
| 8 |
import gc
|
|
@@ -10,26 +34,42 @@ import os
|
|
| 10 |
import shutil
|
| 11 |
import json
|
| 12 |
from pathlib import Path
|
|
|
|
| 13 |
|
| 14 |
from hf_space_utils import fix_image_url
|
| 15 |
|
| 16 |
-
# Storage optimization - redirect cache to temporary directories
|
| 17 |
-
|
| 18 |
-
os.environ["
|
| 19 |
-
os.environ["
|
| 20 |
-
os.environ["
|
|
|
|
| 21 |
|
| 22 |
# Model configuration
|
| 23 |
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
|
| 24 |
|
|
|
|
|
|
|
|
|
|
| 25 |
def cleanup_storage():
|
| 26 |
"""Clean up temporary files and caches to prevent storage overflow"""
|
| 27 |
try:
|
| 28 |
-
# Clean up temporary caches
|
| 29 |
-
temp_dirs = [
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
for temp_dir in temp_dirs:
|
|
|
|
|
|
|
| 31 |
if os.path.exists(temp_dir):
|
| 32 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
# Force garbage collection
|
| 35 |
gc.collect()
|
|
@@ -56,22 +96,47 @@ print("🚀 Loading Sequential Three-Tone JoyCaption system... v2.1")
|
|
| 56 |
|
| 57 |
# Load model and processor at startup
|
| 58 |
print("📦 Loading model and processor at startup...")
|
| 59 |
-
processor =
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
model
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
# Optimized 5-tone prompts with better temperature control
|
| 77 |
# Temperature: Lower for prompt adherence, higher for word variety
|
|
@@ -112,25 +177,32 @@ def apply_smart_corrections(text):
|
|
| 112 |
r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
|
| 113 |
|
| 114 |
# Nudity precision corrections
|
| 115 |
-
r'\btopless women\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
|
| 116 |
-
r'\btopless woman\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
|
| 117 |
|
| 118 |
# Person count corrections
|
| 119 |
-
r'\bthree women\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
|
| 120 |
-
r'\bfour women\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
|
| 121 |
|
| 122 |
# Clothing precision
|
| 123 |
-
r'\bwearing nothing\b': 'nude',
|
| 124 |
-
r'\bnot wearing.*clothes\b': 'nude',
|
| 125 |
-
r'\bcompletely naked\b': 'nude',
|
| 126 |
-
r'\bfully nude\b': 'nude',
|
| 127 |
}
|
| 128 |
|
| 129 |
corrected_text = text
|
| 130 |
try:
|
| 131 |
for pattern, replacement in corrections.items():
|
| 132 |
if callable(replacement):
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
else:
|
| 135 |
corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
|
| 136 |
except Exception as e:
|
|
@@ -165,14 +237,27 @@ def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", c
|
|
| 165 |
{"role": "user", "content": base_prompt}
|
| 166 |
]
|
| 167 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 169 |
inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
|
| 170 |
-
|
| 171 |
device = next(model.parameters()).device
|
| 172 |
inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
| 173 |
-
|
|
|
|
| 174 |
if 'pixel_values' in inputs:
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
# Get tone-specific generation parameters
|
| 178 |
temperature = tone_config.get("temperature", 0.7)
|
|
@@ -240,14 +325,11 @@ def safe_generate_caption_direct(image, tone, max_chars=600, keywords_text="", c
|
|
| 240 |
pass
|
| 241 |
return f"❌ Error: {str(e)[:50]}..."
|
| 242 |
|
| 243 |
-
# Individual GPU-decorated functions for all 3 tones
|
| 244 |
-
@spaces.GPU(duration=45)
|
| 245 |
@torch.no_grad()
|
| 246 |
def generate_engaging_only(image, custom_instruction=""):
|
| 247 |
"""Generate only engaging caption"""
|
| 248 |
return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
|
| 249 |
|
| 250 |
-
@spaces.GPU(duration=45)
|
| 251 |
@torch.no_grad()
|
| 252 |
def generate_casual_friend_only(image, custom_instruction=""):
|
| 253 |
"""Generate only casual friend caption"""
|
|
@@ -255,7 +337,6 @@ def generate_casual_friend_only(image, custom_instruction=""):
|
|
| 255 |
|
| 256 |
# NSFW function removed - caused hallucination
|
| 257 |
|
| 258 |
-
@spaces.GPU(duration=45)
|
| 259 |
@torch.no_grad()
|
| 260 |
def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
|
| 261 |
"""Generate only uncensored with keywords caption"""
|
|
@@ -263,7 +344,6 @@ def generate_uncensored_keywords_only(image, keywords_text, custom_instruction="
|
|
| 263 |
|
| 264 |
# Body parts focus function removed - caused hallucination
|
| 265 |
|
| 266 |
-
@spaces.GPU(duration=45)
|
| 267 |
@torch.no_grad()
|
| 268 |
def answer_question(image, question):
|
| 269 |
"""Answer any question about the image without censorship"""
|
|
@@ -282,14 +362,25 @@ def answer_question(image, question):
|
|
| 282 |
{"role": "user", "content": qa_prompt}
|
| 283 |
]
|
| 284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 286 |
inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
|
| 287 |
-
|
| 288 |
device = next(model.parameters()).device
|
| 289 |
inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
| 290 |
-
|
|
|
|
| 291 |
if 'pixel_values' in inputs:
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
|
| 294 |
with torch.no_grad():
|
| 295 |
output = model.generate(
|
|
@@ -345,7 +436,13 @@ def export_joycaption_data(keywords, custom_instructions, question, engaging_cap
|
|
| 345 |
|
| 346 |
if question and question.strip():
|
| 347 |
data["data"]["question"] = question.strip()
|
| 348 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 349 |
if image_url_converted and str(image_url_converted).strip():
|
| 350 |
data["data"]["image_url"] = str(image_url_converted).strip()
|
| 351 |
# Add generated captions
|
|
@@ -392,7 +489,7 @@ EXPORT_JS = """
|
|
| 392 |
|
| 393 |
// Get all textareas and inputs from the page
|
| 394 |
const allInputs = document.querySelectorAll('textarea, input[type="text"]');
|
| 395 |
-
|
| 396 |
allInputs.forEach((field, index) => {
|
| 397 |
const placeholder = (field.placeholder || '').toLowerCase();
|
| 398 |
const value = field.value ? field.value.trim() : '';
|
|
@@ -565,13 +662,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 565 |
lines=2,
|
| 566 |
info="Add keywords that will be mentioned by the 'Keywords' tone ONLY if they apply to what's visible in the image"
|
| 567 |
)
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
placeholder="e.g., blonde_girl_001.jpg, Instagram photo, OnlyFans pic...",
|
| 571 |
-
label="🖼️ Image Reference",
|
| 572 |
-
lines=1,
|
| 573 |
-
info="Image filename or description for your reference (will be exported)"
|
| 574 |
-
)
|
| 575 |
|
| 576 |
custom_instruction_input = gr.Textbox(
|
| 577 |
placeholder="e.g., 'from instagram', 'the left girl has red hair', 'two girls kissing', 'beach setting'...",
|
|
@@ -630,7 +721,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 630 |
interactive=True,
|
| 631 |
placeholder="Click the button above to generate engaging caption..."
|
| 632 |
)
|
| 633 |
-
|
| 634 |
# Casual Friend caption
|
| 635 |
with gr.Row():
|
| 636 |
with gr.Column(scale=4):
|
|
@@ -652,9 +743,9 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 652 |
interactive=True,
|
| 653 |
placeholder="Click the button above to generate casual friend caption..."
|
| 654 |
)
|
| 655 |
-
|
| 656 |
# NSFW section removed - caused hallucination
|
| 657 |
-
|
| 658 |
# Keywords caption
|
| 659 |
with gr.Row():
|
| 660 |
with gr.Column(scale=4):
|
|
@@ -676,11 +767,11 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 676 |
interactive=True,
|
| 677 |
placeholder="Click the button above to generate keywords caption..."
|
| 678 |
)
|
| 679 |
-
|
| 680 |
# Body Parts Focus section removed - caused hallucination
|
| 681 |
-
|
| 682 |
# Descriptive text removed for cleaner interface
|
| 683 |
-
|
| 684 |
# Export functionality
|
| 685 |
with gr.Row():
|
| 686 |
export_btn = gr.Button(
|
|
@@ -717,7 +808,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 717 |
)
|
| 718 |
|
| 719 |
# NSFW button handler removed
|
| 720 |
-
|
| 721 |
generate_uncensored_btn.click(
|
| 722 |
generate_uncensored_keywords_only,
|
| 723 |
inputs=[image_input, keywords_input, custom_instruction_input],
|
|
@@ -726,7 +817,7 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 726 |
)
|
| 727 |
|
| 728 |
# Body Parts Focus button handler removed
|
| 729 |
-
|
| 730 |
# Individual reload buttons - using direct generation for consistency
|
| 731 |
def reload_engaging_fn(image, custom_instruction):
|
| 732 |
return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction) if image else "❌ Upload image first"
|
|
@@ -801,31 +892,18 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 801 |
)
|
| 802 |
|
| 803 |
# Export functionality
|
| 804 |
-
def handle_export():
|
| 805 |
-
"""Handle the export button click"""
|
| 806 |
-
# Get current values from all fields
|
| 807 |
-
return export_joycaption_data(
|
| 808 |
-
keywords_input.value or "",
|
| 809 |
-
custom_instruction_input.value or "",
|
| 810 |
-
question_input.value or "",
|
| 811 |
-
engaging_output.value or "",
|
| 812 |
-
friend_output.value or "",
|
| 813 |
-
uncensored_output.value or "",
|
| 814 |
-
qa_output.value or "",
|
| 815 |
-
image_input.value or ""
|
| 816 |
-
)
|
| 817 |
-
|
| 818 |
def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path):
|
| 819 |
-
"""Handle export and return proper file download"""
|
| 820 |
message, file_data = export_joycaption_data(
|
| 821 |
keywords, custom_instructions, question,
|
| 822 |
engaging_caption, casual_caption, keywords_caption, qa_answer, image_path
|
| 823 |
)
|
| 824 |
-
|
| 825 |
if file_data:
|
| 826 |
json_string, filename = file_data
|
| 827 |
-
#
|
| 828 |
-
|
|
|
|
| 829 |
with open(temp_file, 'w', encoding='utf-8') as f:
|
| 830 |
f.write(json_string)
|
| 831 |
return gr.update(value=message, visible=True), gr.update(value=temp_file, visible=True)
|
|
@@ -841,7 +919,8 @@ with gr.Blocks(title="Sequential Three-Tone JoyCaption", theme=gr.themes.Soft())
|
|
| 841 |
engaging_output,
|
| 842 |
friend_output,
|
| 843 |
uncensored_output,
|
| 844 |
-
qa_output
|
|
|
|
| 845 |
],
|
| 846 |
outputs=[export_output, export_file]
|
| 847 |
)
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Copy of the full `app.py` into the deploy folder for direct upload.
|
| 3 |
+
This file is a snapshot of the application's main entrypoint and should be
|
| 4 |
+
identical to the root `app.py` when uploading to Hugging Face Spaces.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
def _spaces_gpu(*args, **kwargs):
    """No-op stand-in for the ``spaces.GPU`` decorator.

    Supports both decorator spellings:

    * ``@spaces.GPU`` (bare) -- the decorated function is passed as the only
      positional argument and is returned unchanged.
    * ``@spaces.GPU(duration=45)`` (configured) -- a pass-through decorator
      is returned; all configuration arguments are ignored.
    """
    # Bare usage: the single positional argument is the function itself.
    if len(args) == 1 and not kwargs and callable(args[0]):
        return args[0]

    def _wrap(f):
        return f

    return _wrap


try:
    import spaces

    # Ensure spaces.GPU exists and is a decorator
    if not hasattr(spaces, 'GPU'):
        spaces.GPU = _spaces_gpu
except Exception:
    # Provide a no-op spaces with a GPU decorator fallback so the app can run
    # outside HF Spaces. The catch is deliberately broad: any failure while
    # importing or patching the package falls back to the stand-in.
    import types

    spaces = types.SimpleNamespace(GPU=_spaces_gpu)
|
| 25 |
import gradio as gr
|
| 26 |
import torch
|
| 27 |
from transformers import LlavaForConditionalGeneration, AutoProcessor
|
| 28 |
from PIL import Image
|
| 29 |
+
import tempfile
|
| 30 |
import gc
|
| 31 |
import time
|
| 32 |
import gc
|
|
|
|
| 34 |
import shutil
|
| 35 |
import json
|
| 36 |
from pathlib import Path
|
| 37 |
+
import re
|
| 38 |
|
| 39 |
from hf_space_utils import fix_image_url
|
| 40 |
|
| 41 |
+
# Storage optimization - redirect cache to temporary directories (platform independent)
# NOTE(review): these variables are assigned after `transformers` and `torch`
# have already been imported above; confirm the libraries still honour values
# set at this point rather than reading them once at import time.
_tmpdir = tempfile.gettempdir()
os.environ.update({
    "HF_HOME": os.path.join(_tmpdir, "hf_cache"),
    "TRANSFORMERS_CACHE": os.path.join(_tmpdir, "transformers_cache"),
    "HF_DATASETS_CACHE": os.path.join(_tmpdir, "datasets_cache"),
    "TORCH_HOME": os.path.join(_tmpdir, "torch_cache"),
})
|
| 47 |
|
| 48 |
# Model configuration
|
| 49 |
MODEL_PATH = "fancyfeast/llama-joycaption-beta-one-hf-llava"
|
| 50 |
|
| 51 |
+
# Optional public host for converting /tmp/gradio paths to public gradio_api URLs
|
| 52 |
+
SPACE_HOST = os.environ.get("SPACE_HOST") or os.environ.get("HF_SPACE_HOST") or None
|
| 53 |
+
|
| 54 |
def cleanup_storage():
|
| 55 |
"""Clean up temporary files and caches to prevent storage overflow"""
|
| 56 |
try:
|
| 57 |
+
# Clean up temporary caches using the configured environment paths
|
| 58 |
+
temp_dirs = [
|
| 59 |
+
os.environ.get("HF_HOME"),
|
| 60 |
+
os.environ.get("TRANSFORMERS_CACHE"),
|
| 61 |
+
os.environ.get("HF_DATASETS_CACHE"),
|
| 62 |
+
os.environ.get("TORCH_HOME")
|
| 63 |
+
]
|
| 64 |
for temp_dir in temp_dirs:
|
| 65 |
+
if not temp_dir:
|
| 66 |
+
continue
|
| 67 |
if os.path.exists(temp_dir):
|
| 68 |
+
try:
|
| 69 |
+
shutil.rmtree(temp_dir, ignore_errors=True)
|
| 70 |
+
except Exception:
|
| 71 |
+
# best-effort cleanup
|
| 72 |
+
pass
|
| 73 |
|
| 74 |
# Force garbage collection
|
| 75 |
gc.collect()
|
|
|
|
| 96 |
|
| 97 |
# Load model and processor at startup
print("📦 Loading model and processor at startup...")

# Module-level handles. They keep these defaults when loading is skipped, which
# is what the `processor is None or model is None` guards in the generation
# functions test for.
processor = None
model = None
MODEL_TORCH_DTYPE = None
MODEL_USE_CUDA = False

if os.environ.get("SKIP_MODEL_LOAD"):
    # Any non-empty value of SKIP_MODEL_LOAD skips the heavy initialisation
    # (intended for tests or light-weight runs).
    print("⚠️ SKIP_MODEL_LOAD is set — skipping heavy model initialization (test mode)")
else:
    # Pick the load target from what torch reports; without CUDA the model is
    # loaded on CPU with the default dtype.
    use_cuda = torch.cuda.is_available()
    MODEL_USE_CUDA = bool(use_cuda)
    # On CUDA this selects torch.bfloat16 when the attribute exists, otherwise
    # torch.float16. No hardware capability check is made here.
    torch_dtype = (
        (getattr(torch, 'bfloat16', None) or getattr(torch, 'float16', None))
        if use_cuda
        else None
    )
    device_map = "auto" if use_cuda else "cpu"

    processor = AutoProcessor.from_pretrained(
        MODEL_PATH,
        low_cpu_mem_usage=True
    )

    model_kwargs = {"low_cpu_mem_usage": True, "device_map": device_map}
    # torch_dtype is only ever non-None on the CUDA path.
    if use_cuda and torch_dtype is not None:
        model_kwargs['torch_dtype'] = torch_dtype

    model = LlavaForConditionalGeneration.from_pretrained(
        MODEL_PATH,
        **model_kwargs
    )
    model.eval()
    # remember dtype for later tensor conversions
    MODEL_TORCH_DTYPE = model_kwargs.get('torch_dtype', None)
    print("✅ Model loaded and ready!")

    # Initial cleanup after model loading
    cleanup_storage()
|
| 140 |
|
| 141 |
# Optimized 5-tone prompts with better temperature control
|
| 142 |
# Temperature: Lower for prompt adherence, higher for word variety
|
|
|
|
| 177 |
r'^(a photo of|an image of|a picture of|this is a photo of|this shows)\s*': '',
|
| 178 |
|
| 179 |
# Nudity precision corrections
|
| 180 |
+
r'\\btopless women\\b': lambda m: 'nude women' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless women',
|
| 181 |
+
r'\\btopless woman\\b': lambda m: 'nude woman' if 'naked' in text.lower() or 'nude' in text.lower() else 'topless woman',
|
| 182 |
|
| 183 |
# Person count corrections
|
| 184 |
+
r'\\bthree women\\b': lambda m: 'two women' if text.count('woman') + text.count('female') <= 2 else 'three women',
|
| 185 |
+
r'\\bfour women\\b': lambda m: 'three women' if text.count('woman') + text.count('female') <= 3 else 'four women',
|
| 186 |
|
| 187 |
# Clothing precision
|
| 188 |
+
r'\\bwearing nothing\\b': 'nude',
|
| 189 |
+
r'\\bnot wearing.*clothes\\b': 'nude',
|
| 190 |
+
r'\\bcompletely naked\\b': 'nude',
|
| 191 |
+
r'\\bfully nude\\b': 'nude',
|
| 192 |
}
|
| 193 |
|
| 194 |
corrected_text = text
|
| 195 |
try:
|
| 196 |
for pattern, replacement in corrections.items():
|
| 197 |
if callable(replacement):
|
| 198 |
+
# Wrap the replacement to ensure it returns a string and accepts a Match
|
| 199 |
+
def _repl(match, rep=replacement):
|
| 200 |
+
try:
|
| 201 |
+
out = rep(match)
|
| 202 |
+
return "" if out is None else str(out)
|
| 203 |
+
except Exception:
|
| 204 |
+
return match.group(0)
|
| 205 |
+
corrected_text = re.sub(pattern, _repl, corrected_text, flags=re.IGNORECASE)
|
| 206 |
else:
|
| 207 |
corrected_text = re.sub(pattern, replacement, corrected_text, flags=re.IGNORECASE)
|
| 208 |
except Exception as e:
|
|
|
|
| 237 |
{"role": "user", "content": base_prompt}
|
| 238 |
]
|
| 239 |
|
| 240 |
+
# Ensure model and processor are loaded
|
| 241 |
+
if processor is None or model is None:
|
| 242 |
+
return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
|
| 243 |
+
|
| 244 |
convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 245 |
inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
|
| 246 |
+
|
| 247 |
device = next(model.parameters()).device
|
| 248 |
inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
| 249 |
+
|
| 250 |
+
# Safely convert pixel tensor dtype depending on runtime capabilities
|
| 251 |
if 'pixel_values' in inputs:
|
| 252 |
+
if MODEL_USE_CUDA and MODEL_TORCH_DTYPE is not None:
|
| 253 |
+
try:
|
| 254 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(MODEL_TORCH_DTYPE)
|
| 255 |
+
except Exception:
|
| 256 |
+
# fallback to float32
|
| 257 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
|
| 258 |
+
else:
|
| 259 |
+
# CPU fallback
|
| 260 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
|
| 261 |
|
| 262 |
# Get tone-specific generation parameters
|
| 263 |
temperature = tone_config.get("temperature", 0.7)
|
|
|
|
| 325 |
pass
|
| 326 |
return f"❌ Error: {str(e)[:50]}..."
|
| 327 |
|
|
|
|
|
|
|
| 328 |
@torch.no_grad()
def generate_engaging_only(image, custom_instruction=""):
    """Produce the "engaging" tone caption for an image.

    Args:
        image: The uploaded image; any falsy value is treated as "no image".
        custom_instruction: Optional extra instruction forwarded to the
            caption generator.

    Returns:
        The caption text from ``safe_generate_caption_direct``, or an
        upload-reminder message when no image was supplied.
    """
    if not image:
        return "❌ Upload image first"
    return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction)
|
| 332 |
|
|
|
|
| 333 |
@torch.no_grad()
|
| 334 |
def generate_casual_friend_only(image, custom_instruction=""):
|
| 335 |
"""Generate only casual friend caption"""
|
|
|
|
| 337 |
|
| 338 |
# NSFW function removed - caused hallucination
|
| 339 |
|
|
|
|
| 340 |
@torch.no_grad()
|
| 341 |
def generate_uncensored_keywords_only(image, keywords_text, custom_instruction=""):
|
| 342 |
"""Generate only uncensored with keywords caption"""
|
|
|
|
| 344 |
|
| 345 |
# Body parts focus function removed - caused hallucination
|
| 346 |
|
|
|
|
| 347 |
@torch.no_grad()
|
| 348 |
def answer_question(image, question):
|
| 349 |
"""Answer any question about the image without censorship"""
|
|
|
|
| 362 |
{"role": "user", "content": qa_prompt}
|
| 363 |
]
|
| 364 |
|
| 365 |
+
# Ensure model and processor are loaded
|
| 366 |
+
if processor is None or model is None:
|
| 367 |
+
return "❌ Model or processor not initialized. Make sure model is loaded (unset SKIP_MODEL_LOAD) and dependencies are installed."
|
| 368 |
+
|
| 369 |
convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
|
| 370 |
inputs = processor(text=[convo_string], images=[image], return_tensors="pt")
|
| 371 |
+
|
| 372 |
device = next(model.parameters()).device
|
| 373 |
inputs = {k: v.to(device, non_blocking=True) if hasattr(v, 'to') else v for k, v in inputs.items()}
|
| 374 |
+
|
| 375 |
+
# Safely convert pixel_values dtype depending on runtime
|
| 376 |
if 'pixel_values' in inputs:
|
| 377 |
+
if MODEL_USE_CUDA and MODEL_TORCH_DTYPE is not None:
|
| 378 |
+
try:
|
| 379 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(MODEL_TORCH_DTYPE)
|
| 380 |
+
except Exception:
|
| 381 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
|
| 382 |
+
else:
|
| 383 |
+
inputs['pixel_values'] = inputs['pixel_values'].to(torch.float32)
|
| 384 |
|
| 385 |
with torch.no_grad():
|
| 386 |
output = model.generate(
|
|
|
|
| 436 |
|
| 437 |
if question and question.strip():
|
| 438 |
data["data"]["question"] = question.strip()
|
| 439 |
+
|
| 440 |
+
# Always attempt to include the uploaded image URL (converted) if an image path was provided
|
| 441 |
+
if image_path and str(image_path).strip():
|
| 442 |
+
# include the raw local path
|
| 443 |
+
data["data"]["image_local_path"] = str(image_path)
|
| 444 |
+
# pass empty string when no host is configured (fix_image_url treats falsy host as no conversion)
|
| 445 |
+
image_url_converted = fix_image_url(image_path, host=(SPACE_HOST or ""))
|
| 446 |
if image_url_converted and str(image_url_converted).strip():
|
| 447 |
data["data"]["image_url"] = str(image_url_converted).strip()
|
| 448 |
# Add generated captions
|
|
|
|
| 489 |
|
| 490 |
// Get all textareas and inputs from the page
|
| 491 |
const allInputs = document.querySelectorAll('textarea, input[type="text"]');
|
| 492 |
+
|
| 493 |
allInputs.forEach((field, index) => {
|
| 494 |
const placeholder = (field.placeholder || '').toLowerCase();
|
| 495 |
const value = field.value ? field.value.trim() : '';
|
|
|
|
| 662 |
lines=2,
|
| 663 |
info="Add keywords that will be mentioned by the 'Keywords' tone ONLY if they apply to what's visible in the image"
|
| 664 |
)
|
| 665 |
+
# image_reference_input removed by request — we will export the actual image URL instead
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 666 |
|
| 667 |
custom_instruction_input = gr.Textbox(
|
| 668 |
placeholder="e.g., 'from instagram', 'the left girl has red hair', 'two girls kissing', 'beach setting'...",
|
|
|
|
| 721 |
interactive=True,
|
| 722 |
placeholder="Click the button above to generate engaging caption..."
|
| 723 |
)
|
| 724 |
+
|
| 725 |
# Casual Friend caption
|
| 726 |
with gr.Row():
|
| 727 |
with gr.Column(scale=4):
|
|
|
|
| 743 |
interactive=True,
|
| 744 |
placeholder="Click the button above to generate casual friend caption..."
|
| 745 |
)
|
| 746 |
+
|
| 747 |
# NSFW section removed - caused hallucination
|
| 748 |
+
|
| 749 |
# Keywords caption
|
| 750 |
with gr.Row():
|
| 751 |
with gr.Column(scale=4):
|
|
|
|
| 767 |
interactive=True,
|
| 768 |
placeholder="Click the button above to generate keywords caption..."
|
| 769 |
)
|
| 770 |
+
|
| 771 |
# Body Parts Focus section removed - caused hallucination
|
| 772 |
+
|
| 773 |
# Descriptive text removed for cleaner interface
|
| 774 |
+
|
| 775 |
# Export functionality
|
| 776 |
with gr.Row():
|
| 777 |
export_btn = gr.Button(
|
|
|
|
| 808 |
)
|
| 809 |
|
| 810 |
# NSFW button handler removed
|
| 811 |
+
|
| 812 |
generate_uncensored_btn.click(
|
| 813 |
generate_uncensored_keywords_only,
|
| 814 |
inputs=[image_input, keywords_input, custom_instruction_input],
|
|
|
|
| 817 |
)
|
| 818 |
|
| 819 |
# Body Parts Focus button handler removed
|
| 820 |
+
|
| 821 |
# Individual reload buttons - using direct generation for consistency
|
| 822 |
def reload_engaging_fn(image, custom_instruction):
    """Regenerate the "engaging" caption via the direct generation path.

    Returns the upload-reminder message when ``image`` is falsy; otherwise
    the result of ``safe_generate_caption_direct`` for the "engaging" tone.
    """
    if not image:
        return "❌ Upload image first"
    return safe_generate_caption_direct(image, "engaging", custom_instruction=custom_instruction)
|
|
|
|
| 892 |
)
|
| 893 |
|
| 894 |
# Export functionality
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 895 |
def handle_export(keywords, custom_instructions, question, engaging_caption, casual_caption, keywords_caption, qa_answer, image_path):
|
| 896 |
+
"""Handle export and return proper file download (cross-platform, uses tempdir)"""
|
| 897 |
message, file_data = export_joycaption_data(
|
| 898 |
keywords, custom_instructions, question,
|
| 899 |
engaging_caption, casual_caption, keywords_caption, qa_answer, image_path
|
| 900 |
)
|
| 901 |
+
|
| 902 |
if file_data:
|
| 903 |
json_string, filename = file_data
|
| 904 |
+
# Use the OS temp directory so this works on Windows, macOS, Linux and in Spaces
|
| 905 |
+
base_dir = tempfile.gettempdir()
|
| 906 |
+
temp_file = os.path.join(base_dir, filename)
|
| 907 |
with open(temp_file, 'w', encoding='utf-8') as f:
|
| 908 |
f.write(json_string)
|
| 909 |
return gr.update(value=message, visible=True), gr.update(value=temp_file, visible=True)
|
|
|
|
| 919 |
engaging_output,
|
| 920 |
friend_output,
|
| 921 |
uncensored_output,
|
| 922 |
+
qa_output,
|
| 923 |
+
image_input
|
| 924 |
],
|
| 925 |
outputs=[export_output, export_file]
|
| 926 |
)
|