Spaces:

JS6969
/

ForgeCaptions

Sleeping

App Files Files Community

JS6969 committed on Sep 6, 2025

Commit

9f5ffa7

verified ·

1 Parent(s): 1ceac14

Update app.py

Browse files

Files changed (1) hide show

app.py +103 -109

app.py CHANGED Viewed

@@ -5,7 +5,9 @@
 # ------------------------------
 # 0) Imports & environment
 # ------------------------------
-import os, io, csv, time, json, base64, re, zipfile
 from typing import List, Tuple, Dict, Any
 # Persist model caches between restarts
@@ -15,7 +17,7 @@ os.makedirs(os.environ["HF_HOME"], exist_ok=True)
 import gradio as gr
 from PIL import Image
 import torch
-from transformers import LlavaForConditionalGeneration, AutoProcessor
 # Optional deps for import/export (we handle gracefully if missing)
 try:
@@ -23,6 +25,13 @@ try:
 except Exception:
     pd = None
 # Hugging Face Spaces GPU decorator (no-op locally)
 try:
     import spaces
@@ -37,12 +46,8 @@ except Exception:
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/forge_session.json"
 # --- Branding
-LOGO_AUTOFIT   = False      # match logo height to title+subtitles stack
-LOGO_HEIGHT_PX = 60       # used only if LOGO_AUTOFIT is False
-LOGO_SCALE     = 0.7      # multiplier when auto-fitting
-LOGO_AUTOFIT   = os.getenv("FORGE_LOGO_AUTOFIT", "1") == "1" if "FORGE_LOGO_AUTOFIT" in os.environ else LOGO_AUTOFIT
-LOGO_HEIGHT_PX = int(os.getenv("FORGE_LOGO_PX", LOGO_HEIGHT_PX))
-LOGO_SCALE     = float(os.getenv("FORGE_LOGO_SCALE", LOGO_SCALE))
 # Settings live in a user cache dir (persists better than /tmp)
@@ -98,6 +103,16 @@ def get_model():
                 low_cpu_mem_usage=True,
                 device_map=0,
             )
         else:
             _DEVICE = "cpu"
             _DTYPE = torch.float32
@@ -133,8 +148,8 @@ CAPTION_TYPE_MAP: Dict[str, str] = {
     "Descriptive": "Write a detailed description for this image.",
     "Character training": (
         "Write a thorough, training-ready caption for a character dataset. "
-        "Describe only what is visible: physique, face/hair, clothing, accessories, actions, pose, "
-        "camera angle/focal cues, lighting, background context."
     ),
     "Flux.1-Dev": "Write a Flux.1-Dev style prompt that would reproduce this image faithfully.",
     "Stable Diffusion": "Write a Stable Diffusion style prompt that would reproduce this image faithfully.",
@@ -167,10 +182,11 @@ def _length_hint(choice: str) -> str:
 EXTRA_CHOICES = [
-    "Do NOT include information about people/characters that cannot be changed (like ethnicity, gender, etc), but do still include changeable attributes (like hair style).",
-    "Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock, dick, cunt, etc.)",   # You do NOT have permission to remove
-    "Be sexually graphic and highly detailed.",
-    "IGNORE all information about watermarks.",
     "Do NOT use any ambiguous language.",
     "ONLY describe the most important elements of the image.",
     "Include information about the ages of any people/characters when applicable.",
@@ -179,8 +195,6 @@ EXTRA_CHOICES = [
     "Focus on setting, scenery, and context; ignore subject details.",
     "ONLY describe the subject’s pose, movement, or action. Do NOT mention appearance, clothing, or setting.",
     "Include synonyms/alternate phrasing to diversify training set.",
-    "ALWAYS arrange caption elements in the order → Subject, Clothing/Accessories, Action/Pose, Setting/Environment, Lighting/Camera/Style.",
-    "Do NOT mention the image's resolution.",
     "Include information about depth, lighting, and camera angle.",
     "Include information on composition (rule of thirds, symmetry, leading lines, etc).",
     "Specify the depth of field and whether the background is in focus or blurred.",
@@ -189,6 +203,19 @@ EXTRA_CHOICES = [
 ]
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
 # ------------------------------
 # 4) Persistence helpers (settings/session/journal)
@@ -224,7 +251,6 @@ def load_settings() -> dict:
         "max_tokens": 256,
         "max_side": 896,
         "styles": ["Character training"],
-        "extras": [],
         "name": "",
         "trigger": "",
         "begin": "",
@@ -232,10 +258,9 @@ def load_settings() -> dict:
         "shape_aliases_enabled": True,
         "shape_aliases": [],
         "excel_thumb_px": 128,
-        "logo_auto": True,
         "logo_px": 60,
-        "logo_scale": 0.7,
         "shape_aliases_persist": True,
     }
     for k, v in defaults.items():
@@ -246,6 +271,7 @@ def load_settings() -> dict:
     if not isinstance(styles, list):
         styles = [styles]
     cfg["styles"] = [s for s in styles if s in STYLE_OPTIONS] or ["Character training"]
     return cfg
@@ -555,14 +581,6 @@ def run_batch(
 @gpu
 @torch.no_grad()
-def _gpu_startup_warm():
-    try:
-        im = Image.new("RGB", (64, 64), (127,127,127))
-        _ = caption_once(im, "Warm up.", temp=0.0, top_p=1.0, max_tokens=8)
-        print("[ForgeCaptions] GPU warmup complete")
-    except Exception as e:
-        print("[ForgeCaptions] GPU warmup skipped:", e)
 # ------------------------------
 # 9) Export/Import helpers (CSV/XLSX/TXT ZIP)
@@ -762,87 +780,33 @@ def import_captions_file(file_path: str, session_rows: List[dict]) -> Tuple[List
 # ------------------------------
-# 10) UI header helper (logo auto-fit to match title/subtitle block)
 # ------------------------------
-def _render_header_html(auto: bool, px: int, scale: float) -> str:
-    auto_js = "true" if auto else "false"
     return f"""
 <div class="cf-hero">
   {logo_b64_img()}
   <div class="cf-text">
     <h1 class="cf-title">ForgeCaptions</h1>
-    <div class="cf-sub">JoyCaption Image Captioning </div>
     <div class="cf-sub">Import CSV/XLSX • Export CSV/XLSX/TXT</div>
-    <div class="cf-sub">Batch 10-20 per Zero GPU run • Larger batches with GPU</div>
   </div>
 </div>
 <hr>
 <style>
-  .cf-logo {{ height: auto; width: auto; object-fit: contain; display:block; }}
-</style>
-<script>
-(function() {{
-  const AUTO = {auto_js};
-  const PX = {int(px)};
-  const SCALE = {float(scale)};
-  const MIN = 60, MAX = 100;  // hard clamps
-  function outerH(el) {{
-    if (!el) return 0;
-    const r = el.getBoundingClientRect();
-    const cs = getComputedStyle(el);
-    return r.height + parseFloat(cs.marginTop) + parseFloat(cs.marginBottom);
   }}
-  function stackHeight(root) {{
-    // Sum title + every subtitle's full box height (including margins)
-    const title = root.querySelector('.cf-title');
-    const subs  = root.querySelectorAll('.cf-sub');
-    let h = outerH(title);
-    subs.forEach(s => h += outerH(s));
-    // tiny buffer so the two columns don't look mismatched if rounding occurs
-    return Math.round(h + 2);
   }}
-  function fit() {{
-    const logo = document.querySelector('.cf-logo');
-    const text = document.querySelector('.cf-text');
-    if (!logo || !text) return;
-    if (AUTO) {{
-      const total = stackHeight(text);
-      const target = Math.max(MIN, Math.min(MAX, Math.round(total * SCALE)));
-      logo.style.height = target + 'px';
-    }} else {{
-      logo.style.height = Math.max(MIN, Math.min(MAX, PX)) + 'px';
-    }}
-  }}
-  // Re-fit at the right times
-  const textNode = document.querySelector('.cf-text');
-  // 1) Once fonts are ready (prevents under-measuring before webfonts load)
-  if (document.fonts && document.fonts.ready) {{
-    document.fonts.ready.then(() => requestAnimationFrame(fit));
-  }}
-  // 2) On resize
-  window.addEventListener('resize', () => requestAnimationFrame(fit), {{ passive: true }});
-  // 3) Whenever the text block changes size (line wrapping, content edits)
-  if (window.ResizeObserver && textNode) {{
-    const ro = new ResizeObserver(() => requestAnimationFrame(fit));
-    ro.observe(textNode);
-  }}
-  // 4) As a fallback, run a couple times after first paint
-  requestAnimationFrame(fit);
-  setTimeout(fit, 100);
-  setTimeout(fit, 400);
-}})();
-</script>
 """
 # ------------------------------
 # 11) UI (Blocks)
 # ------------------------------
@@ -867,11 +831,12 @@ BASE_CSS = """
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     # Ensure Spaces sees a GPU function (without touching CUDA in main)
-    demo.load(_gpu_startup_warm, inputs=None, outputs=None)
     # ---- Header
     settings = load_settings()
-    header_html = gr.HTML(_render_header_html(LOGO_AUTOFIT, LOGO_HEIGHT_PX, LOGO_SCALE))
     # ---- Controls group
@@ -923,10 +888,10 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
                                            step=8, label="Excel thumbnail size (px)")
                 # Chunking
                 chunk_mode = gr.Radio(
-                    choices=["Auto", "Manual (all at once)", "Manual (step)"],
                     value="Manual (step)", label="Batch mode"
                 )
-                chunk_size = gr.Slider(1, 50, value=10, step=1, label="Chunk size")
                 gpu_budget = gr.Slider(20, 110, value=55, step=5, label="Max seconds per GPU call")
                 no_time_limit = gr.Checkbox(value=False, label="No time limit (ignore above)")
@@ -1033,15 +998,29 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
                 outputs=[single_caption_out]
             )
         with gr.Tab("Batch"):
             with gr.Accordion("Uploaded images", open=True):
-                input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple", type="filepath")
-            run_button = gr.Button("Caption batch", variant="primary")
-            with gr.Accordion("Import captions from CSV/XLSX (merge by filename)", open=False):
-                import_file = gr.File(label="Choose .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath")
-                import_btn = gr.Button("Import into current session")
     # ---- Results area (gallery left / table right)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
@@ -1049,9 +1028,9 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     remaining_state = gr.State([])
     with gr.Row():
-        with gr.Column(scale=1):
             gallery = gr.Gallery(
-                label="Results (image + caption)",
                 show_label=True,
                 columns=3,
                 elem_id="cfGal",
@@ -1059,7 +1038,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             )
         with gr.Column(scale=1, elem_id="cfTableWrap", elem_classes=["cf-scroll"]):
             table = gr.Dataframe(
-                label="Editable captions (whole session)",
                 value=_rows_to_table(load_session()),
                 headers=["filename", "caption"],
                 interactive=True,
@@ -1156,7 +1135,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             prog = f"Batch progress: {done}/{total} processed in this step • Remaining overall: {len(remaining)}"
             return new_rows, gal, tbl, stamp, remaining, panel_vis, gr.update(value=msg), gr.update(value=prog)
-        # Auto / all-at-once
         new_rows, gal, tbl, stamp, leftover, done, total = run_batch(
             files, rows or [], instr, t, p, m, int(ms), budget
         )
@@ -1168,9 +1147,21 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     run_button.click(
         _run_click,
         inputs=[input_files, rows_state, instruction_preview, max_side, chunk_mode, chunk_size, gpu_budget, no_time_limit],
-        outputs=[rows_state, gallery, table, autosave_md, remaining_state, step_panel, step_msg, progress_md]
     )
     def _step_next(remain, rows, instr, ms, csize, budget_s, no_limit):
         t, p, m = _tpms()
         remain = remain or []
@@ -1180,7 +1171,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
             return (
                 rows,
                 gr.update(value="No files remaining."),
-                gr.update(visible=False),
                 [],
                 [],
                 [],
@@ -1218,6 +1209,9 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
         return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
     table.change(sync_table_to_session, inputs=[table, rows_state], outputs=[rows_state, gallery, autosave_md])
     # ---- Import hook
     def _do_import(fpath, rows):
         new_rows, gal, tbl, stamp = import_captions_file(fpath, rows or [])

 # ------------------------------
 # 0) Imports & environment
 # ------------------------------
+import os
+# Set before the Hugging Face libraries below are imported; setdefault keeps any value already present in the environment.
+os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
+import io, csv, time, json, base64, re, zipfile
 from typing import List, Tuple, Dict, Any
 # Persist model caches between restarts
 import gradio as gr
 from PIL import Image
 import torch
+from transformers import LlavaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
 # Optional deps for import/export (we handle gracefully if missing)
 try:
 except Exception:
     pd = None
+# Liger is optional; skip if missing
+try:
+    from liger_kernel.transformers import apply_liger_kernel_to_llama
+except Exception:
+    def apply_liger_kernel_to_llama(*args, **kwargs):
+        pass
 # Hugging Face Spaces GPU decorator (no-op locally)
 try:
     import spaces
 APP_DIR = os.getcwd()
 SESSION_FILE = "/tmp/forge_session.json"
 # --- Branding
+LOGO_HEIGHT_PX = int(os.getenv("FORGE_LOGO_PX", 60))
 # Settings live in a user cache dir (persists better than /tmp)
                 low_cpu_mem_usage=True,
                 device_map=0,
             )
+            # Optional speed-up: patch the language-model submodule with Liger kernels.
+            # Any failure (package missing, unexpected model layout) is only logged; the model stays as loaded.
+            try:
+                from liger_kernel.transformers import apply_liger_kernel_to_llama
+                lm = getattr(_MODEL, "language_model", None) or getattr(_MODEL, "model", None)
+                if lm is not None:
+                    # The first positional parameter is the `rope` flag, so the model instance must be passed by keyword.
+                    apply_liger_kernel_to_llama(model=lm)
+                    # The call returns None; reaching this line without an exception is the success signal.
+                    print("[liger] enabled: True")
+                else:
+                    print("[liger] not enabled: LLM submodule not found")
+            except Exception as e:
+                print(f"[liger] not enabled: {e}")
         else:
             _DEVICE = "cpu"
             _DTYPE = torch.float32
     "Descriptive": "Write a detailed description for this image.",
     "Character training": (
         "Write a thorough, training-ready caption for a character dataset. "
+        # Adjacent literals are concatenated as-is, so each sentence carries its own trailing space.
+        "Describe subject appearance (physique, face/hair), clothing and accessories, actions/pose/gesture, camera angle/focal cues. "
+        "If multiple subjects are present, describe each briefly (most prominent first) and distinguish them by visible traits."
     ),
     "Flux.1-Dev": "Write a Flux.1-Dev style prompt that would reproduce this image faithfully.",
     "Stable Diffusion": "Write a Stable Diffusion style prompt that would reproduce this image faithfully.",
 EXTRA_CHOICES = [
+    "Only include a character's modifiable, style-level attributes (hair style/color, makeup, clothing/accessories, pose, expression). Do NOT mention identity traits (skin tone, age, body type).",
+    "Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock, etc.).",   # You do NOT have permission to remove
+    "Be sexually graphic and describe sexual position when visible.",
+    "Exclude brand names, watermarks, artist names, file/meta info, quality terms, hashtags, and resolution notes.",
+    "Arrange information in the order: Subject; Clothing/Accessories; Action/Pose; Setting/Environment; Lighting/Camera.",
     "Do NOT use any ambiguous language.",
     "ONLY describe the most important elements of the image.",
     "Include information about the ages of any people/characters when applicable.",
     "Focus on setting, scenery, and context; ignore subject details.",
     "ONLY describe the subject’s pose, movement, or action. Do NOT mention appearance, clothing, or setting.",
     "Include synonyms/alternate phrasing to diversify training set.",
     "Include information about depth, lighting, and camera angle.",
     "Include information on composition (rule of thirds, symmetry, leading lines, etc).",
     "Specify the depth of field and whether the background is in focus or blurred.",
 ]
 NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
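+# Default value for the "extras" setting; each entry must match a string in EXTRA_CHOICES exactly, or _valid_extras() drops it.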
+DEFAULT_EXTRA_CHOICES = [
+    "Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock, etc.).",
+    "Be sexually graphic and describe sexual position when visible.",
+    "Exclude brand names, watermarks, artist names, file/meta info, quality terms, hashtags, and resolution notes.",
+    "Arrange information in the order: Subject; Clothing/Accessories; Action/Pose; Setting/Environment; Lighting/Camera.",
+]
+ALL_EXTRA_CHOICES = [NAME_OPTION] + EXTRA_CHOICES
+def _valid_extras(selected: list[str] | None) -> list[str]:
+    """Keep only the entries of *selected* that appear in ALL_EXTRA_CHOICES, preserving their order."""
+    if not selected:
+        return []
+    known = frozenset(ALL_EXTRA_CHOICES)
+    return [choice for choice in selected if choice in known]
 # ------------------------------
 # 4) Persistence helpers (settings/session/journal)
         "max_tokens": 256,
         "max_side": 896,
         "styles": ["Character training"],
         "name": "",
         "trigger": "",
         "begin": "",
         "shape_aliases_enabled": True,
         "shape_aliases": [],
         "excel_thumb_px": 128,
         "logo_px": 60,
         "shape_aliases_persist": True,
+        "extras": DEFAULT_EXTRA_CHOICES,
     }
     for k, v in defaults.items():
     if not isinstance(styles, list):
         styles = [styles]
     cfg["styles"] = [s for s in styles if s in STYLE_OPTIONS] or ["Character training"]
+    cfg["extras"] = _valid_extras(cfg.get("extras"))
     return cfg
 @gpu
 @torch.no_grad()
 # ------------------------------
 # 9) Export/Import helpers (CSV/XLSX/TXT ZIP)
 # ------------------------------
+# 10) UI header helper (fixed logo size)
 # ------------------------------
+def _render_header_html(px: int) -> str:
+    """Build the page header: logo, title, three subtitle lines, and CSS that pins the logo height.
+
+    Args:
+        px: Logo height in CSS pixels; coerced with int() before being written into the stylesheet.
+
+    Returns:
+        An HTML fragment (markup plus a <style> block). The logo element itself comes from
+        logo_b64_img(), which is defined elsewhere in this file.
+
+    NOTE(review): the max-width 640px rule uses max(60, px - 12), so for px below 60 the
+    small-screen height (60px) is larger than the base height.
+    """
     return f"""
 <div class="cf-hero">
   {logo_b64_img()}
   <div class="cf-text">
     <h1 class="cf-title">ForgeCaptions</h1>
+    <div class="cf-sub">JoyCaption Image Captioning</div>
     <div class="cf-sub">Import CSV/XLSX • Export CSV/XLSX/TXT</div>
+    <div class="cf-sub">Batch 10–20 per Zero GPU run • Larger batches with dedicated GPU</div>
   </div>
 </div>
 <hr>
 <style>
+  .cf-logo {{
+    height: {int(px)}px;   /* fixed height */
+    width: auto;
+    object-fit: contain;
+    display: block;
   }}
+  @media (max-width: 640px) {{
+    .cf-logo {{ height: {max(60, int(px) - 12)}px; }} /* optional small-screen tweak */
   }}
+</style>
 """
 # ------------------------------
 # 11) UI (Blocks)
 # ------------------------------
 with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
     # Ensure Spaces sees a GPU function (without touching CUDA in main)
+    demo.load(inputs=None, outputs=None)
     # ---- Header
     settings = load_settings()
+    header_html = gr.HTML(_render_header_html(LOGO_HEIGHT_PX))
     # ---- Controls group
                                            step=8, label="Excel thumbnail size (px)")
                 # Chunking
                 chunk_mode = gr.Radio(
+                    choices=["Auto", "Manual (step)"],
                     value="Manual (step)", label="Batch mode"
                 )
+                chunk_size = gr.Slider(1, 200, value=15, step=1, label="Chunk size")
                 gpu_budget = gr.Slider(20, 110, value=55, step=5, label="Max seconds per GPU call")
                 no_time_limit = gr.Checkbox(value=False, label="No time limit (ignore above)")
                 outputs=[single_caption_out]
             )
+#        with gr.Tab("Batch"):
+#            with gr.Accordion("Uploaded images", open=True):
+#                input_files = gr.File(label="Drop images (or click to select)", file_types=["image"], file_count="multiple", type="filepath")
+#            run_button = gr.Button("Caption batch", variant="primary")
+#            with gr.Accordion("Import captions from CSV/XLSX (merge by filename)", open=False):
+#                import_file = gr.File(label="Choose .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath")
+#                import_btn = gr.Button("Import into current session")
         with gr.Tab("Batch"):
             with gr.Accordion("Uploaded images", open=True):
+                input_files = gr.File(label="Drop images (or click to select)", file_types=["image"], file_count="multiple",)
+            run_button = gr.Button("Caption batch", variant="primary")
+            # Preview gallery filled by on_files_changed (not visible in this view — presumably returns the uploaded images; verify).
+            preview_gallery = gr.Gallery(
+                label="Preview (un-captioned)",
+                show_label=True,
+                columns=5,
+                height=220,
+            )
+            # Must sit at the same indent as the assignment above; the extra indent was an "unexpected indent" error.
+            input_files.change(on_files_changed, inputs=[input_files], outputs=[preview_gallery])
     # ---- Results area (gallery left / table right)
     rows_state  = gr.State(load_session())
     autosave_md = gr.Markdown("Ready.")
     remaining_state = gr.State([])
     with gr.Row():
+        with gr.Column(scale=2):
             gallery = gr.Gallery(
+                label="Results",
                 show_label=True,
                 columns=3,
                 elem_id="cfGal",
             )
         with gr.Column(scale=1, elem_id="cfTableWrap", elem_classes=["cf-scroll"]):
             table = gr.Dataframe(
+                label="Editable captions",
                 value=_rows_to_table(load_session()),
                 headers=["filename", "caption"],
                 interactive=True,
             prog = f"Batch progress: {done}/{total} processed in this step • Remaining overall: {len(remaining)}"
             return new_rows, gal, tbl, stamp, remaining, panel_vis, gr.update(value=msg), gr.update(value=prog)
+        # Auto
         new_rows, gal, tbl, stamp, leftover, done, total = run_batch(
             files, rows or [], instr, t, p, m, int(ms), budget
         )
     run_button.click(
         _run_click,
         inputs=[input_files, rows_state, instruction_preview, max_side, chunk_mode, chunk_size, gpu_budget, no_time_limit],
+        outputs=[rows_state, gallery, table, autosave_md, remaining_state, step_panel, step_msg, progress_md],
+    ).then(
+        lambda rows: [(Image.open(r["path"]).convert("RGB"), r["caption"]) for r in rows],
+        inputs=[rows_state],
+        outputs=[gallery],
+    )
+    # NOTE(review): this looks like a second registration of the table.change listener that is
+    # also wired further down with outputs=[rows_state, gallery, autosave_md]. Neither
+    # `captions_text` nor a definition of `sync_table_to_session` above this point is visible
+    # in this view — confirm both exist before this statement runs, and that two outputs match
+    # what sync_table_to_session returns.
+    table.change(
+        sync_table_to_session,
+        inputs=[table, rows_state],
+        outputs=[rows_state, captions_text],
+    ).then(
+        # Rebuild the gallery as (RGB image, caption) pairs from the rows held in rows_state.
+        lambda rows: [(Image.open(r["path"]).convert("RGB"), r["caption"]) for r in rows],
+        inputs=[rows_state],
+        outputs=[gallery],
     )
     def _step_next(remain, rows, instr, ms, csize, budget_s, no_limit):
         t, p, m = _tpms()
         remain = remain or []
             return (
                 rows,
                 gr.update(value="No files remaining."),
+                gr.update(visible=True),
                 [],
                 [],
                 [],
         return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
     table.change(sync_table_to_session, inputs=[table, rows_state], outputs=[rows_state, gallery, autosave_md])
+    def new_session() -> Tuple[List[dict], list, list, str]:
+        """Return empty values for a fresh session.
+
+        Returns:
+            An empty row list, an empty list, the table built by _rows_to_table([]) and an
+            empty string — presumably the (rows, gallery, table, status) outputs used by the
+            other handlers; verify when wiring this to a component.
+        """
+        return [], [], _rows_to_table([]), ""
     # ---- Import hook
     def _do_import(fpath, rows):
         new_rows, gal, tbl, stamp = import_captions_file(fpath, rows or [])