Spaces:

JS6969
/

ForgeCaptions

Runtime error

App Files Files Community

JS6969 committed on Sep 2, 2025

Commit

6231519

verified ·

1 Parent(s): 57c4398

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -36

app.py CHANGED Viewed

@@ -1,7 +1,9 @@
 import os, io, csv, time, json, hashlib, base64, zipfile, re
 from typing import List, Tuple, Dict, Any
-# Caches
 os.environ.setdefault("HF_HOME", "/home/user/.cache/huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
@@ -10,8 +12,16 @@ from PIL import Image
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
 # ────────────────────────────────────────────────────────
-# Paths & caches
 # ────────────────────────────────────────────────────────
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/session.json"
@@ -34,8 +44,8 @@ def _detect_gpu():
     return "cpu", 0, "CPU"
 BACKEND, VRAM_GB, GPU_NAME = _detect_gpu()
-DTYPE   = torch.bfloat16 if BACKEND == "cuda" else torch.float32
 DEVICE  = "cuda" if BACKEND == "cuda" else "cpu"
 MAX_SIDE_CAP = 1024 if BACKEND == "cuda" else 640
 processor = AutoProcessor.from_pretrained(MODEL_PATH)
@@ -66,8 +76,8 @@ STYLE_OPTIONS = [
 ]
 CAPTION_TYPE_MAP = {
-    "Descriptive (short)": "Write a short describition of the most important visible elements only. No speculation.",
-    "Descriptive (long)": "Write a long, highly detailed description for this image.",
     "Character training (short)": (
         "Output a concise, prompt-like caption for character LoRA/ID training. "
@@ -103,7 +113,7 @@ CAPTION_TYPE_MAP = {
 EXTRA_CHOICES = [
     "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
-    "IGNORE all watermarks.",
     "Do NOT use any ambiguous language.",
     "ONLY describe the most important elements of the image.",
     "Include information about the ages of any people/characters when applicable.",
@@ -124,7 +134,7 @@ EXTRA_CHOICES = [
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
 # ────────────────────────────────────────────────────────
-# Helpers (hashing, thumbs, resize)
 # ────────────────────────────────────────────────────────
 def ensure_thumb(path: str, max_side=256) -> str:
     try:
@@ -150,6 +160,17 @@ def resize_for_model(im: Image.Image, max_side: int) -> Image.Image:
     s = max_side / max(w, h)
     return im.resize((int(w*s), int(h*s)), Image.LANCZOS)
 # ────────────────────────────────────────────────────────
 # Instruction + caption helpers
 # ────────────────────────────────────────────────────────
@@ -163,17 +184,6 @@ def final_instruction(style_list: List[str], extra_opts: List[str], name_value:
         core = core.replace("{name}", (name_value or "{NAME}").strip())
     return core
-def apply_prefix_suffix(caption: str, trigger_word: str, begin_text: str, end_text: str) -> str:
-    parts = []
-    if trigger_word.strip():
-        parts.append(trigger_word.strip())
-    if begin_text.strip():
-        parts.append(begin_text.strip())
-    parts.append(caption.strip())
-    if end_text.strip():
-        parts.append(end_text.strip())
-    return " ".join([p for p in parts if p])
 @torch.no_grad()
 def caption_once(im: Image.Image, instr: str, temp: float, top_p: float, max_tokens: int) -> str:
     # Your requested role script:
@@ -190,6 +200,7 @@ def caption_once(im: Image.Image, instr: str, temp: float, top_p: float, max_tok
         do_sample=temp > 0,
         temperature=temp if temp > 0 else None,
         top_p=top_p if temp > 0 else None,
     )
     gen_ids = out[0, inputs["input_ids"].shape[1]:]
     return processor.tokenizer.decode(gen_ids, skip_special_tokens=True)
@@ -231,12 +242,11 @@ def load_settings() -> dict:
         "end": "",
         "shape_aliases_enabled": True,
         "shape_aliases": [],
-        "excel_thumb_px": 128,  # new default
     }
     for k, v in defaults.items():
         cfg.setdefault(k, v)
-    # migrate legacy names
     legacy_map = {
         "Descriptive": "Descriptive (short)",
         "LoRA (Flux_D Realism)": "LoRA (Flux_D Realism) (short)",
@@ -325,7 +335,7 @@ def save_shape_alias_rows(enabled, df_rows):
     )
 # ────────────────────────────────────────────────────────
-# Import / Export helpers
 # ────────────────────────────────────────────────────────
 def export_csv_from_table(table_value: Any) -> str:
     data = table_value or []
@@ -377,7 +387,7 @@ def export_excel_with_thumbs(table_value: Any, session_rows: List[dict], thumb_p
     ws.column_dimensions["B"].width = 42
     ws.column_dimensions["C"].width = 100
-    # Approx px→points (Excel row height is points; ~0.75 pt per px @ 96dpi)
     row_h = int(thumb_px * 0.75)
     r_i = 2
@@ -417,8 +427,9 @@ def _table_to_rows(table_value: Any, rows: List[dict]) -> List[dict]:
     return new
 # ────────────────────────────────────────────────────────
-# Batch captioning (returns rows, gallery, table)
 # ────────────────────────────────────────────────────────
 @torch.no_grad()
 def run_batch(
     files: List[Any],
@@ -475,6 +486,17 @@ def sync_table_to_session(table_value: Any, session_rows: List[dict]) -> Tuple[L
     ]
     return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
 # ────────────────────────────────────────────────────────
 # UI
 # ────────────────────────────────────────────────────────
@@ -501,7 +523,7 @@ BASE_CSS = """
 def logo_b64_img() -> str:
     candidates = [
         os.path.join(APP_DIR, "forgecaptions-logo.png"),
-        os.path.join(APP_DIR, "captionforge-logo.png"),  # fallback if you kept the old name
         "/home/user/app/forgecaptions-logo.png",
         "forgecaptions-logo.png",
         "captionforge-logo.png",
@@ -514,6 +536,9 @@ def logo_b64_img() -> str:
     return ""
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     settings = load_settings()
     settings["styles"] = [s for s in settings.get("styles", []) if s in STYLE_OPTIONS] or ["Character training (short)"]
@@ -556,9 +581,9 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
                 dataset_name = gr.Textbox(label="Dataset name (used for export file titles)", value=settings.get("dataset_name", "forgecaptions"))
                 max_side   = gr.Slider(256, MAX_SIDE_CAP, settings.get("max_side", min(896, MAX_SIDE_CAP)), step=32, label="Max side (resize)")
                 excel_thumb_px = gr.Slider(64, 256, value=settings.get("excel_thumb_px", 128), step=8, label="Excel thumbnail size (px)")
-                gr.Markdown("Generation (saved in settings): temperature 0.6 • top-p 0.9 • max_tokens 256")
-    # Auto-refresh instruction & persist key controls
     def _refresh_instruction(styles, extra, name_value, trigv, begv, endv, excel_px, ms):
         instr = final_instruction(styles or ["Character training (short)"], extra or [], name_value)
         cfg = load_settings()
@@ -580,11 +605,10 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             outputs=[instruction_preview]
         )
-    # Set initial instruction text on load
     demo.load(lambda s,e,n: final_instruction(s or ["Character training (short)"], e or [], n),
               inputs=[style_checks, extra_opts, name_input], outputs=[instruction_preview])
-    # ===== Shape Aliases
     with gr.Accordion("Shape Aliases", open=False):
         gr.Markdown(
             "### 🔷 Shape Aliases\n"
@@ -616,7 +640,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         clear_btn.click(_clear_rows, outputs=[alias_table])
         save_btn.click(save_shape_alias_rows, inputs=[enable_aliases, alias_table], outputs=[save_status, alias_table])
-    # ===== Tabs: Single & Batch (keeps gallery/table position below)
     with gr.Tabs():
         with gr.Tab("Single"):
             input_image_single = gr.Image(type="pil", label="Input Image", height=512, width=512)
@@ -647,7 +671,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
                 input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple", type="filepath")
             run_button = gr.Button("Caption batch", variant="primary")
-    # ===== Results/Table (position unchanged)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
@@ -764,14 +788,12 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         inputs=[table, rows_state, excel_thumb_px], outputs=[xlsx_file, xlsx_file]
     )
-# Launch
 if __name__ == "__main__":
     demo.queue(max_size=64).launch(
         server_name="0.0.0.0",
         server_port=int(os.getenv("PORT", "7860")),
-        ssr_mode=False,     # turn off experimental SSR
-        debug=True,         # log stack traces
-        show_error=True     # show UI error boxes
-        # share=True        # only for local dev; not needed on Spaces
     )

 import os, io, csv, time, json, hashlib, base64, zipfile, re
 from typing import List, Tuple, Dict, Any
+# ────────────────────────────────────────────────────────
+# Cache locations (kept simple / persistent)
+# ────────────────────────────────────────────────────────
 os.environ.setdefault("HF_HOME", "/home/user/.cache/huggingface")
 os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 import torch
 from transformers import LlavaForConditionalGeneration, AutoProcessor
+# Try to import spaces and define a GPU decorator that works on CPU too.
+# `gpu` is applied further down as `@gpu` on run_batch and _gpu_startup_warm.
+try:
+    import spaces
+    gpu = spaces.GPU()
+# Any exception raised above (not only a missing `spaces` package) selects
+# the fallback below.
+except Exception:
+    def gpu(f):  # no-op on CPU / local
+        # Identity decorator: hands the wrapped function back unchanged.
+        return f
 # ────────────────────────────────────────────────────────
+# Paths & files
 # ────────────────────────────────────────────────────────
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/session.json"
     return "cpu", 0, "CPU"
 BACKEND, VRAM_GB, GPU_NAME = _detect_gpu()
 DEVICE  = "cuda" if BACKEND == "cuda" else "cpu"
+DTYPE   = torch.bfloat16 if BACKEND == "cuda" else torch.float32
 MAX_SIDE_CAP = 1024 if BACKEND == "cuda" else 640
 processor = AutoProcessor.from_pretrained(MODEL_PATH)
 ]
 CAPTION_TYPE_MAP = {
+    "Descriptive (short)": "One sentence (≤25 words) describing the most important visible elements only. No speculation.",
+    "Descriptive (long)": "Write a detailed description for this image.",
     "Character training (short)": (
         "Output a concise, prompt-like caption for character LoRA/ID training. "
 EXTRA_CHOICES = [
     "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
+    "Do NOT include information about whether there is a watermark or not.",
     "Do NOT use any ambiguous language.",
     "ONLY describe the most important elements of the image.",
     "Include information about the ages of any people/characters when applicable.",
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
 # ────────────────────────────────────────────────────────
+# Helpers (thumbs, resize, prefix/suffix)
 # ────────────────────────────────────────────────────────
 def ensure_thumb(path: str, max_side=256) -> str:
     try:
     s = max_side / max(w, h)
     return im.resize((int(w*s), int(h*s)), Image.LANCZOS)
+# Build the final caption text as "<trigger_word> <begin_text> <caption> <end_text>".
+# Every piece is whitespace-stripped, and pieces that are empty after stripping
+# are left out, so the result never has doubled or leading/trailing spaces.
+# All four arguments must be str: .strip() is called on each unconditionally,
+# so passing None raises AttributeError.
+def apply_prefix_suffix(caption: str, trigger_word: str, begin_text: str, end_text: str) -> str:
+    parts = []
+    if trigger_word.strip():
+        parts.append(trigger_word.strip())
+    if begin_text.strip():
+        parts.append(begin_text.strip())
+    # The caption is appended even when blank; the filter in the return
+    # statement drops it in that case.
+    parts.append(caption.strip())
+    if end_text.strip():
+        parts.append(end_text.strip())
+    return " ".join([p for p in parts if p])
 # ────────────────────────────────────────────────────────
 # Instruction + caption helpers
 # ────────────────────────────────────────────────────────
         core = core.replace("{name}", (name_value or "{NAME}").strip())
     return core
 @torch.no_grad()
 def caption_once(im: Image.Image, instr: str, temp: float, top_p: float, max_tokens: int) -> str:
     # Your requested role script:
         do_sample=temp > 0,
         temperature=temp if temp > 0 else None,
         top_p=top_p if temp > 0 else None,
+        use_cache=True,
     )
     gen_ids = out[0, inputs["input_ids"].shape[1]:]
     return processor.tokenizer.decode(gen_ids, skip_special_tokens=True)
         "end": "",
         "shape_aliases_enabled": True,
         "shape_aliases": [],
+        "excel_thumb_px": 128,
     }
     for k, v in defaults.items():
         cfg.setdefault(k, v)
     legacy_map = {
         "Descriptive": "Descriptive (short)",
         "LoRA (Flux_D Realism)": "LoRA (Flux_D Realism) (short)",
     )
 # ────────────────────────────────────────────────────────
+# Exports
 # ────────────────────────────────────────────────────────
 def export_csv_from_table(table_value: Any) -> str:
     data = table_value or []
     ws.column_dimensions["B"].width = 42
     ws.column_dimensions["C"].width = 100
+    # px→points (~0.75 pt per screen px @ ~96dpi)
     row_h = int(thumb_px * 0.75)
     r_i = 2
     return new
 # ────────────────────────────────────────────────────────
+# Batch captioning (GPU) + sync
 # ────────────────────────────────────────────────────────
+@gpu
 @torch.no_grad()
 def run_batch(
     files: List[Any],
     ]
     return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
+# Tiny GPU warmup for HF Spaces detection
+@gpu
+@torch.no_grad()
+def _gpu_startup_warm():
+    # Runs one tiny captioning call and discards the result.
+    # Best-effort: any exception is caught and reported with print(), so a
+    # failed warmup never propagates to the caller. Returns None.
+    try:
+        # 64x64 solid mid-gray RGB placeholder image.
+        im = Image.new("RGB", (64, 64), (127, 127, 127))
+        # temp=0.0 makes caption_once pass do_sample=False; max_tokens=8 keeps it short.
+        _ = caption_once(im, "Warm up.", temp=0.0, top_p=1.0, max_tokens=8)
+        print("[ForgeCaptions] GPU warmup complete")
+    except Exception as e:
+        print("[ForgeCaptions] GPU warmup skipped:", e)
 # ────────────────────────────────────────────────────────
 # UI
 # ────────────────────────────────────────────────────────
 def logo_b64_img() -> str:
     candidates = [
         os.path.join(APP_DIR, "forgecaptions-logo.png"),
+        os.path.join(APP_DIR, "captionforge-logo.png"),
         "/home/user/app/forgecaptions-logo.png",
         "forgecaptions-logo.png",
         "captionforge-logo.png",
     return ""
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
+    # Trigger the GPU warmup when the page loads (demo.load fires on each browser page load, not only once)
+    demo.load(_gpu_startup_warm, inputs=None, outputs=None)
     settings = load_settings()
     settings["styles"] = [s for s in settings.get("styles", []) if s in STYLE_OPTIONS] or ["Character training (short)"]
                 dataset_name = gr.Textbox(label="Dataset name (used for export file titles)", value=settings.get("dataset_name", "forgecaptions"))
                 max_side   = gr.Slider(256, MAX_SIDE_CAP, settings.get("max_side", min(896, MAX_SIDE_CAP)), step=32, label="Max side (resize)")
                 excel_thumb_px = gr.Slider(64, 256, value=settings.get("excel_thumb_px", 128), step=8, label="Excel thumbnail size (px)")
+                gr.Markdown("Generation (settings): temperature 0.6 • top-p 0.9 • max_tokens 256")
+    # Persist options + live instruction
     def _refresh_instruction(styles, extra, name_value, trigv, begv, endv, excel_px, ms):
         instr = final_instruction(styles or ["Character training (short)"], extra or [], name_value)
         cfg = load_settings()
             outputs=[instruction_preview]
         )
     demo.load(lambda s,e,n: final_instruction(s or ["Character training (short)"], e or [], n),
               inputs=[style_checks, extra_opts, name_input], outputs=[instruction_preview])
+    # ===== Shape Aliases (improved UX: add row / clear / save)
     with gr.Accordion("Shape Aliases", open=False):
         gr.Markdown(
             "### 🔷 Shape Aliases\n"
         clear_btn.click(_clear_rows, outputs=[alias_table])
         save_btn.click(save_shape_alias_rows, inputs=[enable_aliases, alias_table], outputs=[save_status, alias_table])
+    # ===== Tabs (Single + Batch)
     with gr.Tabs():
         with gr.Tab("Single"):
             input_image_single = gr.Image(type="pil", label="Input Image", height=512, width=512)
                 input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple", type="filepath")
             run_button = gr.Button("Caption batch", variant="primary")
+    # ===== Results + Table (kept in the same place)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
         inputs=[table, rows_state, excel_thumb_px], outputs=[xlsx_file, xlsx_file]
     )
+# Launch (disable experimental SSR to reduce churn)
 if __name__ == "__main__":
     demo.queue(max_size=64).launch(
         server_name="0.0.0.0",
         server_port=int(os.getenv("PORT", "7860")),
+        ssr_mode=False,
+        debug=True,
+        show_error=True,
     )