Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 25, 2025

Commit

b693e6d

verified ·

1 Parent(s): 951b327

Update app.py

Browse files

Files changed (1) hide show

app.py +73 -52

app.py CHANGED Viewed

@@ -45,7 +45,6 @@ def fix_image_url(raw_url_or_path: str, host: Optional[str] = None) -> str:
         return f"{host}/gradio_api/file=/{p}"
     return raw_url_or_path
 def postprocess_caption(caption: str, max_chars: int = 1200) -> str:
     if not caption or not isinstance(caption, str):
         return caption or ""
@@ -61,7 +60,6 @@ def postprocess_caption(caption: str, max_chars: int = 1200) -> str:
         result += "."
     return result
 def force_clear_all_caches():
     try:
         if torch.cuda.is_available():
@@ -71,7 +69,6 @@ def force_clear_all_caches():
     except Exception:
         pass
 force_clear_all_caches()
 # ===== SETUP =====
@@ -115,22 +112,27 @@ DEFAULT_PROMPTS = {
     }
 }
-# ===== CAPTION GENERATION =====
 def safe_generate_caption_direct(image, system_prompt, user_prompt, max_chars=1200):
     try:
         if image is None:
             return "❌ No image provided"
         if not system_prompt.strip() or not user_prompt.strip():
             return "❌ Both system and user prompts are required"
         torch.cuda.empty_cache()
         gc.collect()
         convo = [
             {"role": "system", "content": system_prompt.strip()},
             {"role": "user", "content": user_prompt.strip()}
         ]
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
-        inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
         with torch.no_grad():
             output = model.generate(
                 **inputs,
@@ -138,22 +140,39 @@ def safe_generate_caption_direct(image, system_prompt, user_prompt, max_chars=12
                 do_sample=True,
                 temperature=0.6,
                 top_p=0.9,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
-                eos_token_id=processor.tokenizer.eos_token_id,
             )
-        input_length = inputs["input_ids"].shape[1]
-        result = processor.tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
         del inputs, output
         torch.cuda.empty_cache()
         gc.collect()
-        return postprocess_caption(result, max_chars=max_chars) or "❌ Empty result"
     except Exception as e:
         torch.cuda.empty_cache()
         gc.collect()
         return f"❌ Error: {str(e)[:200]}"
 @spaces.GPU(duration=60)
 @torch.no_grad()
 def generate_caption(image, system, user):
@@ -161,7 +180,6 @@ def generate_caption(image, system, user):
         return "❌ Upload image first"
     return safe_generate_caption_direct(image, system, user)
 # ===== Q&A =====
 @spaces.GPU(duration=40)
 @torch.no_grad()
@@ -180,7 +198,8 @@ def answer_question(image, question):
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
         inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
-        output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.6, top_p=0.9)
         input_length = inputs["input_ids"].shape[1]
         result = processor.tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
         del inputs, output
@@ -192,7 +211,6 @@ def answer_question(image, question):
         gc.collect()
         return f"❌ Q&A Error: {str(e)[:200]}"
 # ===== TEMPLATE HELPERS =====
 def insert_template(current_text, template_text, field_content):
     if not field_content.strip():
@@ -202,7 +220,6 @@ def insert_template(current_text, template_text, field_content):
         return current_text
     return (current_text.rstrip() + " " + formatted).strip()
 def create_template_functions():
     def insert_key(s, u, c):
         t = "Pay attention to these keywords: {content}."
@@ -218,7 +235,6 @@ def create_template_functions():
         return insert_template(s, t, c), insert_template(u, t, c)
     return insert_key, insert_que, insert_use, insert_not
 # ===== EXPORT =====
 def export_joycaption_data(keywords, custom_instructions, avoid, question, c1, c2, c3, qa, img):
     try:
@@ -228,9 +244,9 @@ def export_joycaption_data(keywords, custom_instructions, avoid, question, c1, c
         if custom_instructions.strip(): add["custom_instructions"] = custom_instructions.strip()
         if avoid.strip(): add["avoid"] = avoid.strip()
         if question.strip(): add["question"] = question.strip()
-        if img.strip():
-            add["image_local_path"] = img
-            url = fix_image_url(img, host=(SPACE_HOST or ""))
             if url: add["image_url"] = url
         if c1.strip(): add["caption_casual"] = c1.strip()
         if c2.strip(): add["caption_friendly"] = c2.strip()
@@ -244,10 +260,9 @@ def export_joycaption_data(keywords, custom_instructions, avoid, question, c1, c
     except Exception as e:
         return f"❌ Export failed: {e}", None
 # ===== UI =====
 with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Soft()) as demo:
-    gr.HTML("<style>textarea{overflow-y:hidden!important;}</style>")
     gr.HTML("<h1 style='text-align:center;margin-top:10px;'>🎨 JoyCaption Advanced Prompting System (v6.0)</h1><hr>")
     insert_key, insert_que, insert_use, insert_not = create_template_functions()
@@ -273,25 +288,25 @@ with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Sof
             with gr.Tab("📝 Casual") as tab1:
                 gr.Markdown("**System Prompt**")
-                system1 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["casual"]["system"])
                 gr.Markdown("**User Prompt**")
-                user1 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["casual"]["user"])
                 gen1_btn = gr.Button("Generate Casual", variant="primary")
                 out1 = gr.Textbox(lines=5, show_copy_button=True)
             with gr.Tab("🤝 Friendly") as tab2:
                 gr.Markdown("**System Prompt**")
-                system2 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["friendly"]["system"])
                 gr.Markdown("**User Prompt**")
-                user2 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["friendly"]["user"])
                 gen2_btn = gr.Button("Generate Friendly", variant="primary")
                 out2 = gr.Textbox(lines=5, show_copy_button=True)
             with gr.Tab("🔥 Erotic") as tab3:
                 gr.Markdown("**System Prompt**")
-                system3 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["erotic"]["system"])
                 gr.Markdown("**User Prompt**")
-                user3 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["erotic"]["user"])
                 gen3_btn = gr.Button("Generate Erotic", variant="primary")
                 out3 = gr.Textbox(lines=5, show_copy_button=True)
@@ -300,37 +315,59 @@ with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Sof
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
     tab1.select(lambda: "casual", None, active_tab)
     tab2.select(lambda: "friendly", None, active_tab)
     tab3.select(lambda: "erotic", None, active_tab)
     gen1_btn.click(generate_caption, [image_input, system1, user1], out1)
     gen2_btn.click(generate_caption, [image_input, system2, user2], out2)
     gen3_btn.click(generate_caption, [image_input, system3, user3], out3)
     ask_btn.click(answer_question, [image_input, question_input], qa_output)
-    def handle_template(btn_type, tab, s1, u1, s2, u2, s3, u3, k, c, q, a):
         key_f, que_f, use_f, not_f = create_template_functions()
-        mapping = {"key": key_f, "que": que_f, "use": use_f, "not": not_f}
-        fn = mapping.get(btn_type)
         if not fn:
             return s1, u1, s2, u2, s3, u3
         if tab == "casual":
-            s1, u1 = fn(s1, u1, k or c or q or a)
         elif tab == "friendly":
-            s2, u2 = fn(s2, u2, k or c or q or a)
         elif tab == "erotic":
-            s3, u3 = fn(s3, u3, k or c or q or a)
         return s1, u1, s2, u2, s3, u3
-    for b, t in [(key_btn, "key"), (que_btn, "que"), (use_btn, "use"), (not_btn, "not")]:
-        b.click(
             handle_template,
-            [gr.State(t), active_tab, system1, user1, system2, user2, system3, user3,
              keywords_input, custom_instruction_input, question_input, avoid_input],
             [system1, user1, system2, user2, system3, user3],
         )
     def handle_export(k, c, a, q, c1, c2, c3, qa, img):
         msg, fd = export_joycaption_data(k, c, a, q, c1, c2, c3, qa, img)
         if fd:
@@ -348,21 +385,5 @@ with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Sof
         [export_out, export_file]
     )
-    # JS autoresize fix for all tabs
-    demo.load(js="""
-    () => {
-      function resizeAll() {
-        document.querySelectorAll('textarea').forEach(t=>{
-          t.style.height='auto';
-          t.style.height=(t.scrollHeight+5)+'px';
-        });
-      }
-      resizeAll();
-      document.querySelectorAll('[role="tab"]').forEach(tab=>{
-        tab.addEventListener('click', ()=>setTimeout(resizeAll,300));
-      });
-    }
-    """)
 if __name__ == "__main__":
-    demo.launch()

         return f"{host}/gradio_api/file=/{p}"
     return raw_url_or_path
 def postprocess_caption(caption: str, max_chars: int = 1200) -> str:
     if not caption or not isinstance(caption, str):
         return caption or ""
         result += "."
     return result
 def force_clear_all_caches():
     try:
         if torch.cuda.is_available():
     except Exception:
         pass
 force_clear_all_caches()
 # ===== SETUP =====
     }
 }
 def safe_generate_caption_direct(image, system_prompt, user_prompt, max_chars=1200):
+    """Generate caption using custom prompts"""
     try:
         if image is None:
             return "❌ No image provided"
         if not system_prompt.strip() or not user_prompt.strip():
             return "❌ Both system and user prompts are required"
         torch.cuda.empty_cache()
         gc.collect()
         convo = [
             {"role": "system", "content": system_prompt.strip()},
             {"role": "user", "content": user_prompt.strip()}
         ]
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
+        inputs['pixel_values'] = inputs['pixel_values'].to(torch.bfloat16)
         with torch.no_grad():
             output = model.generate(
                 **inputs,
                 do_sample=True,
                 temperature=0.6,
                 top_p=0.9,
+                top_k=None,
                 use_cache=True,
                 pad_token_id=processor.tokenizer.eos_token_id,
+                eos_token_id=processor.tokenizer.eos_token_id
             )
+        if output is None or len(output) == 0:
+            return "❌ No output generated"
+        if 'input_ids' in inputs and len(inputs['input_ids'].shape) >= 2:
+            input_length = inputs['input_ids'].shape[1]
+            if len(output[0]) > input_length:
+                generate_ids = output[0][input_length:]
+                result = processor.tokenizer.decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)
+            else:
+                result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        else:
+            result = processor.tokenizer.decode(output[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
+        result = result.strip()
         del inputs, output
         torch.cuda.empty_cache()
         gc.collect()
+        final_result = postprocess_caption(result, max_chars=max_chars)
+        return final_result if final_result else "❌ Empty result"
     except Exception as e:
         torch.cuda.empty_cache()
         gc.collect()
         return f"❌ Error: {str(e)[:200]}"
 @spaces.GPU(duration=60)
 @torch.no_grad()
 def generate_caption(image, system, user):
         return "❌ Upload image first"
     return safe_generate_caption_direct(image, system, user)
 # ===== Q&A =====
 @spaces.GPU(duration=40)
 @torch.no_grad()
         convo_string = processor.apply_chat_template(convo, tokenize=False, add_generation_prompt=True)
         inputs = processor(text=[convo_string], images=[image], return_tensors="pt").to("cuda")
         inputs["pixel_values"] = inputs["pixel_values"].to(torch.bfloat16)
+        with torch.no_grad():
+            output = model.generate(**inputs, max_new_tokens=300, do_sample=True, temperature=0.6, top_p=0.9)
         input_length = inputs["input_ids"].shape[1]
         result = processor.tokenizer.decode(output[0][input_length:], skip_special_tokens=True)
         del inputs, output
         gc.collect()
         return f"❌ Q&A Error: {str(e)[:200]}"
 # ===== TEMPLATE HELPERS =====
 def insert_template(current_text, template_text, field_content):
     if not field_content.strip():
         return current_text
     return (current_text.rstrip() + " " + formatted).strip()
 def create_template_functions():
     def insert_key(s, u, c):
         t = "Pay attention to these keywords: {content}."
         return insert_template(s, t, c), insert_template(u, t, c)
     return insert_key, insert_que, insert_use, insert_not
 # ===== EXPORT =====
 def export_joycaption_data(keywords, custom_instructions, avoid, question, c1, c2, c3, qa, img):
     try:
         if custom_instructions.strip(): add["custom_instructions"] = custom_instructions.strip()
         if avoid.strip(): add["avoid"] = avoid.strip()
         if question.strip(): add["question"] = question.strip()
+        if hasattr(img, '__str__') and str(img).strip():
+            add["image_local_path"] = str(img)
+            url = fix_image_url(str(img), host=(SPACE_HOST or ""))
             if url: add["image_url"] = url
         if c1.strip(): add["caption_casual"] = c1.strip()
         if c2.strip(): add["caption_friendly"] = c2.strip()
     except Exception as e:
         return f"❌ Export failed: {e}", None
 # ===== UI =====
 with gr.Blocks(title="JoyCaption Advanced Prompting System", theme=gr.themes.Soft()) as demo:
+    gr.HTML("<style>textarea{resize:none!important;}</style>")
     gr.HTML("<h1 style='text-align:center;margin-top:10px;'>🎨 JoyCaption Advanced Prompting System (v6.0)</h1><hr>")
     insert_key, insert_que, insert_use, insert_not = create_template_functions()
             with gr.Tab("📝 Casual") as tab1:
                 gr.Markdown("**System Prompt**")
+                system1 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["casual"]["system"], lines=3)
                 gr.Markdown("**User Prompt**")
+                user1 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["casual"]["user"], lines=3)
                 gen1_btn = gr.Button("Generate Casual", variant="primary")
                 out1 = gr.Textbox(lines=5, show_copy_button=True)
             with gr.Tab("🤝 Friendly") as tab2:
                 gr.Markdown("**System Prompt**")
+                system2 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["friendly"]["system"], lines=3)
                 gr.Markdown("**User Prompt**")
+                user2 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["friendly"]["user"], lines=3)
                 gen2_btn = gr.Button("Generate Friendly", variant="primary")
                 out2 = gr.Textbox(lines=5, show_copy_button=True)
             with gr.Tab("🔥 Erotic") as tab3:
                 gr.Markdown("**System Prompt**")
+                system3 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["erotic"]["system"], lines=3)
                 gr.Markdown("**User Prompt**")
+                user3 = gr.Textbox(show_label=False, value=DEFAULT_PROMPTS["erotic"]["user"], lines=3)
                 gen3_btn = gr.Button("Generate Erotic", variant="primary")
                 out3 = gr.Textbox(lines=5, show_copy_button=True)
             export_out = gr.Textbox(visible=False)
             export_file = gr.File(visible=False)
+    # Tab selection tracking
     tab1.select(lambda: "casual", None, active_tab)
     tab2.select(lambda: "friendly", None, active_tab)
     tab3.select(lambda: "erotic", None, active_tab)
+    # Caption generation
     gen1_btn.click(generate_caption, [image_input, system1, user1], out1)
     gen2_btn.click(generate_caption, [image_input, system2, user2], out2)
     gen3_btn.click(generate_caption, [image_input, system3, user3], out3)
     ask_btn.click(answer_question, [image_input, question_input], qa_output)
+    # Template insertion with proper field mapping
+    def handle_template(btn_type, tab, s1, u1, s2, u2, s3, u3, keywords, custom, question, avoid):
         key_f, que_f, use_f, not_f = create_template_functions()
+        # Map button type to field content
+        content_map = {
+            "key": keywords,
+            "que": question,
+            "use": custom,
+            "not": avoid
+        }
+        content = content_map.get(btn_type, "")
+        if not content.strip():
+            return s1, u1, s2, u2, s3, u3
+        # Map button type to function
+        fn_map = {"key": key_f, "que": que_f, "use": use_f, "not": not_f}
+        fn = fn_map.get(btn_type)
         if not fn:
             return s1, u1, s2, u2, s3, u3
+        # Apply to correct tab
         if tab == "casual":
+            s1, u1 = fn(s1, u1, content)
         elif tab == "friendly":
+            s2, u2 = fn(s2, u2, content)
         elif tab == "erotic":
+            s3, u3 = fn(s3, u3, content)
         return s1, u1, s2, u2, s3, u3
+    # Connect template buttons
+    for btn, btn_type in [(key_btn, "key"), (que_btn, "que"), (use_btn, "use"), (not_btn, "not")]:
+        btn.click(
             handle_template,
+            [gr.State(btn_type), active_tab, system1, user1, system2, user2, system3, user3,
              keywords_input, custom_instruction_input, question_input, avoid_input],
             [system1, user1, system2, user2, system3, user3],
         )
+    # Export functionality
     def handle_export(k, c, a, q, c1, c2, c3, qa, img):
         msg, fd = export_joycaption_data(k, c, a, q, c1, c2, c3, qa, img)
         if fd:
         [export_out, export_file]
     )
 if __name__ == "__main__":
+    demo.launch()