Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,9 @@
|
|
| 5 |
# ------------------------------
|
| 6 |
# 0) Imports & environment
|
| 7 |
# ------------------------------
|
| 8 |
-
import os,
|
|
|
|
|
|
|
| 9 |
from typing import List, Tuple, Dict, Any
|
| 10 |
|
| 11 |
# Persist model caches between restarts
|
|
@@ -15,7 +17,7 @@ os.makedirs(os.environ["HF_HOME"], exist_ok=True)
|
|
| 15 |
import gradio as gr
|
| 16 |
from PIL import Image
|
| 17 |
import torch
|
| 18 |
-
from transformers import LlavaForConditionalGeneration, AutoProcessor
|
| 19 |
|
| 20 |
# Optional deps for import/export (we handle gracefully if missing)
|
| 21 |
try:
|
|
@@ -23,6 +25,13 @@ try:
|
|
| 23 |
except Exception:
|
| 24 |
pd = None
|
| 25 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
# Hugging Face Spaces GPU decorator (no-op locally)
|
| 27 |
try:
|
| 28 |
import spaces
|
|
@@ -37,12 +46,8 @@ except Exception:
|
|
| 37 |
APP_DIR = os.getcwd()
|
| 38 |
SESSION_FILE = "/tmp/forge_session.json"
|
| 39 |
# --- Branding
|
| 40 |
-
|
| 41 |
-
LOGO_HEIGHT_PX = 60
|
| 42 |
-
LOGO_SCALE = 0.7 # multiplier when auto-fitting
|
| 43 |
-
LOGO_AUTOFIT = os.getenv("FORGE_LOGO_AUTOFIT", "1") == "1" if "FORGE_LOGO_AUTOFIT" in os.environ else LOGO_AUTOFIT
|
| 44 |
-
LOGO_HEIGHT_PX = int(os.getenv("FORGE_LOGO_PX", LOGO_HEIGHT_PX))
|
| 45 |
-
LOGO_SCALE = float(os.getenv("FORGE_LOGO_SCALE", LOGO_SCALE))
|
| 46 |
|
| 47 |
|
| 48 |
# Settings live in a user cache dir (persists better than /tmp)
|
|
@@ -98,6 +103,16 @@ def get_model():
|
|
| 98 |
low_cpu_mem_usage=True,
|
| 99 |
device_map=0,
|
| 100 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
else:
|
| 102 |
_DEVICE = "cpu"
|
| 103 |
_DTYPE = torch.float32
|
|
@@ -133,8 +148,8 @@ CAPTION_TYPE_MAP: Dict[str, str] = {
|
|
| 133 |
"Descriptive": "Write a detailed description for this image.",
|
| 134 |
"Character training": (
|
| 135 |
"Write a thorough, training-ready caption for a character dataset. "
|
| 136 |
-
"Describe
|
| 137 |
-
"
|
| 138 |
),
|
| 139 |
"Flux.1-Dev": "Write a Flux.1-Dev style prompt that would reproduce this image faithfully.",
|
| 140 |
"Stable Diffusion": "Write a Stable Diffusion style prompt that would reproduce this image faithfully.",
|
|
@@ -167,10 +182,11 @@ def _length_hint(choice: str) -> str:
|
|
| 167 |
|
| 168 |
|
| 169 |
EXTRA_CHOICES = [
|
| 170 |
-
"
|
| 171 |
-
"Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock,
|
| 172 |
-
"Be sexually graphic and
|
| 173 |
-
"
|
|
|
|
| 174 |
"Do NOT use any ambiguous language.",
|
| 175 |
"ONLY describe the most important elements of the image.",
|
| 176 |
"Include information about the ages of any people/characters when applicable.",
|
|
@@ -179,8 +195,6 @@ EXTRA_CHOICES = [
|
|
| 179 |
"Focus on setting, scenery, and context; ignore subject details.",
|
| 180 |
"ONLY describe the subject’s pose, movement, or action. Do NOT mention appearance, clothing, or setting.",
|
| 181 |
"Include synonyms/alternate phrasing to diversify training set.",
|
| 182 |
-
"ALWAYS arrange caption elements in the order → Subject, Clothing/Accessories, Action/Pose, Setting/Environment, Lighting/Camera/Style.",
|
| 183 |
-
"Do NOT mention the image's resolution.",
|
| 184 |
"Include information about depth, lighting, and camera angle.",
|
| 185 |
"Include information on composition (rule of thirds, symmetry, leading lines, etc).",
|
| 186 |
"Specify the depth of field and whether the background is in focus or blurred.",
|
|
@@ -189,6 +203,19 @@ EXTRA_CHOICES = [
|
|
| 189 |
]
|
| 190 |
NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
|
| 191 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
|
| 193 |
# ------------------------------
|
| 194 |
# 4) Persistence helpers (settings/session/journal)
|
|
@@ -224,7 +251,6 @@ def load_settings() -> dict:
|
|
| 224 |
"max_tokens": 256,
|
| 225 |
"max_side": 896,
|
| 226 |
"styles": ["Character training"],
|
| 227 |
-
"extras": [],
|
| 228 |
"name": "",
|
| 229 |
"trigger": "",
|
| 230 |
"begin": "",
|
|
@@ -232,10 +258,9 @@ def load_settings() -> dict:
|
|
| 232 |
"shape_aliases_enabled": True,
|
| 233 |
"shape_aliases": [],
|
| 234 |
"excel_thumb_px": 128,
|
| 235 |
-
"logo_auto": True,
|
| 236 |
"logo_px": 60,
|
| 237 |
-
"logo_scale": 0.7,
|
| 238 |
"shape_aliases_persist": True,
|
|
|
|
| 239 |
}
|
| 240 |
|
| 241 |
for k, v in defaults.items():
|
|
@@ -246,6 +271,7 @@ def load_settings() -> dict:
|
|
| 246 |
if not isinstance(styles, list):
|
| 247 |
styles = [styles]
|
| 248 |
cfg["styles"] = [s for s in styles if s in STYLE_OPTIONS] or ["Character training"]
|
|
|
|
| 249 |
|
| 250 |
return cfg
|
| 251 |
|
|
@@ -555,14 +581,6 @@ def run_batch(
|
|
| 555 |
|
| 556 |
@gpu
|
| 557 |
@torch.no_grad()
|
| 558 |
-
def _gpu_startup_warm():
|
| 559 |
-
try:
|
| 560 |
-
im = Image.new("RGB", (64, 64), (127,127,127))
|
| 561 |
-
_ = caption_once(im, "Warm up.", temp=0.0, top_p=1.0, max_tokens=8)
|
| 562 |
-
print("[ForgeCaptions] GPU warmup complete")
|
| 563 |
-
except Exception as e:
|
| 564 |
-
print("[ForgeCaptions] GPU warmup skipped:", e)
|
| 565 |
-
|
| 566 |
|
| 567 |
# ------------------------------
|
| 568 |
# 9) Export/Import helpers (CSV/XLSX/TXT ZIP)
|
|
@@ -762,87 +780,33 @@ def import_captions_file(file_path: str, session_rows: List[dict]) -> Tuple[List
|
|
| 762 |
|
| 763 |
|
| 764 |
# ------------------------------
|
| 765 |
-
# 10) UI header helper (logo
|
| 766 |
# ------------------------------
|
| 767 |
-
def _render_header_html(
|
| 768 |
-
auto_js = "true" if auto else "false"
|
| 769 |
return f"""
|
| 770 |
<div class="cf-hero">
|
| 771 |
{logo_b64_img()}
|
| 772 |
<div class="cf-text">
|
| 773 |
<h1 class="cf-title">ForgeCaptions</h1>
|
| 774 |
-
<div class="cf-sub">JoyCaption Image Captioning
|
| 775 |
<div class="cf-sub">Import CSV/XLSX • Export CSV/XLSX/TXT</div>
|
| 776 |
-
<div class="cf-sub">Batch 10
|
| 777 |
</div>
|
| 778 |
</div>
|
| 779 |
<hr>
|
| 780 |
<style>
|
| 781 |
-
.cf-logo {{
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
const PX = {int(px)};
|
| 787 |
-
const SCALE = {float(scale)};
|
| 788 |
-
const MIN = 60, MAX = 100; // hard clamps
|
| 789 |
-
|
| 790 |
-
function outerH(el) {{
|
| 791 |
-
if (!el) return 0;
|
| 792 |
-
const r = el.getBoundingClientRect();
|
| 793 |
-
const cs = getComputedStyle(el);
|
| 794 |
-
return r.height + parseFloat(cs.marginTop) + parseFloat(cs.marginBottom);
|
| 795 |
}}
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
// Sum title + every subtitle's full box height (including margins)
|
| 799 |
-
const title = root.querySelector('.cf-title');
|
| 800 |
-
const subs = root.querySelectorAll('.cf-sub');
|
| 801 |
-
let h = outerH(title);
|
| 802 |
-
subs.forEach(s => h += outerH(s));
|
| 803 |
-
// tiny buffer so the two columns don't look mismatched if rounding occurs
|
| 804 |
-
return Math.round(h + 2);
|
| 805 |
}}
|
| 806 |
-
|
| 807 |
-
function fit() {{
|
| 808 |
-
const logo = document.querySelector('.cf-logo');
|
| 809 |
-
const text = document.querySelector('.cf-text');
|
| 810 |
-
if (!logo || !text) return;
|
| 811 |
-
if (AUTO) {{
|
| 812 |
-
const total = stackHeight(text);
|
| 813 |
-
const target = Math.max(MIN, Math.min(MAX, Math.round(total * SCALE)));
|
| 814 |
-
logo.style.height = target + 'px';
|
| 815 |
-
}} else {{
|
| 816 |
-
logo.style.height = Math.max(MIN, Math.min(MAX, PX)) + 'px';
|
| 817 |
-
}}
|
| 818 |
-
}}
|
| 819 |
-
|
| 820 |
-
// Re-fit at the right times
|
| 821 |
-
const textNode = document.querySelector('.cf-text');
|
| 822 |
-
|
| 823 |
-
// 1) Once fonts are ready (prevents under-measuring before webfonts load)
|
| 824 |
-
if (document.fonts && document.fonts.ready) {{
|
| 825 |
-
document.fonts.ready.then(() => requestAnimationFrame(fit));
|
| 826 |
-
}}
|
| 827 |
-
|
| 828 |
-
// 2) On resize
|
| 829 |
-
window.addEventListener('resize', () => requestAnimationFrame(fit), {{ passive: true }});
|
| 830 |
-
|
| 831 |
-
// 3) Whenever the text block changes size (line wrapping, content edits)
|
| 832 |
-
if (window.ResizeObserver && textNode) {{
|
| 833 |
-
const ro = new ResizeObserver(() => requestAnimationFrame(fit));
|
| 834 |
-
ro.observe(textNode);
|
| 835 |
-
}}
|
| 836 |
-
|
| 837 |
-
// 4) As a fallback, run a couple times after first paint
|
| 838 |
-
requestAnimationFrame(fit);
|
| 839 |
-
setTimeout(fit, 100);
|
| 840 |
-
setTimeout(fit, 400);
|
| 841 |
-
}})();
|
| 842 |
-
</script>
|
| 843 |
"""
|
| 844 |
|
| 845 |
-
|
| 846 |
# ------------------------------
|
| 847 |
# 11) UI (Blocks)
|
| 848 |
# ------------------------------
|
|
@@ -867,11 +831,12 @@ BASE_CSS = """
|
|
| 867 |
|
| 868 |
with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
| 869 |
# Ensure Spaces sees a GPU function (without touching CUDA in main)
|
| 870 |
-
demo.load(
|
| 871 |
|
| 872 |
# ---- Header
|
| 873 |
settings = load_settings()
|
| 874 |
-
header_html = gr.HTML(_render_header_html(
|
|
|
|
| 875 |
|
| 876 |
|
| 877 |
# ---- Controls group
|
|
@@ -923,10 +888,10 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 923 |
step=8, label="Excel thumbnail size (px)")
|
| 924 |
# Chunking
|
| 925 |
chunk_mode = gr.Radio(
|
| 926 |
-
choices=["Auto", "Manual (
|
| 927 |
value="Manual (step)", label="Batch mode"
|
| 928 |
)
|
| 929 |
-
chunk_size = gr.Slider(1,
|
| 930 |
gpu_budget = gr.Slider(20, 110, value=55, step=5, label="Max seconds per GPU call")
|
| 931 |
no_time_limit = gr.Checkbox(value=False, label="No time limit (ignore above)")
|
| 932 |
|
|
@@ -1033,15 +998,29 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1033 |
outputs=[single_caption_out]
|
| 1034 |
)
|
| 1035 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1036 |
with gr.Tab("Batch"):
|
| 1037 |
with gr.Accordion("Uploaded images", open=True):
|
| 1038 |
-
input_files = gr.File(label="Drop images", file_types=["image"], file_count="multiple",
|
| 1039 |
-
run_button = gr.Button("Caption batch", variant="primary")
|
| 1040 |
|
| 1041 |
-
|
| 1042 |
-
|
| 1043 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1044 |
|
|
|
|
| 1045 |
# ---- Results area (gallery left / table right)
|
| 1046 |
rows_state = gr.State(load_session())
|
| 1047 |
autosave_md = gr.Markdown("Ready.")
|
|
@@ -1049,9 +1028,9 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1049 |
remaining_state = gr.State([])
|
| 1050 |
|
| 1051 |
with gr.Row():
|
| 1052 |
-
with gr.Column(scale=
|
| 1053 |
gallery = gr.Gallery(
|
| 1054 |
-
label="Results
|
| 1055 |
show_label=True,
|
| 1056 |
columns=3,
|
| 1057 |
elem_id="cfGal",
|
|
@@ -1059,7 +1038,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1059 |
)
|
| 1060 |
with gr.Column(scale=1, elem_id="cfTableWrap", elem_classes=["cf-scroll"]):
|
| 1061 |
table = gr.Dataframe(
|
| 1062 |
-
label="Editable captions
|
| 1063 |
value=_rows_to_table(load_session()),
|
| 1064 |
headers=["filename", "caption"],
|
| 1065 |
interactive=True,
|
|
@@ -1156,7 +1135,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1156 |
prog = f"Batch progress: {done}/{total} processed in this step • Remaining overall: {len(remaining)}"
|
| 1157 |
return new_rows, gal, tbl, stamp, remaining, panel_vis, gr.update(value=msg), gr.update(value=prog)
|
| 1158 |
|
| 1159 |
-
# Auto
|
| 1160 |
new_rows, gal, tbl, stamp, leftover, done, total = run_batch(
|
| 1161 |
files, rows or [], instr, t, p, m, int(ms), budget
|
| 1162 |
)
|
|
@@ -1168,9 +1147,21 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1168 |
run_button.click(
|
| 1169 |
_run_click,
|
| 1170 |
inputs=[input_files, rows_state, instruction_preview, max_side, chunk_mode, chunk_size, gpu_budget, no_time_limit],
|
| 1171 |
-
outputs=[rows_state, gallery, table, autosave_md, remaining_state, step_panel, step_msg, progress_md]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1172 |
)
|
| 1173 |
-
|
| 1174 |
def _step_next(remain, rows, instr, ms, csize, budget_s, no_limit):
|
| 1175 |
t, p, m = _tpms()
|
| 1176 |
remain = remain or []
|
|
@@ -1180,7 +1171,7 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1180 |
return (
|
| 1181 |
rows,
|
| 1182 |
gr.update(value="No files remaining."),
|
| 1183 |
-
gr.update(visible=
|
| 1184 |
[],
|
| 1185 |
[],
|
| 1186 |
[],
|
|
@@ -1218,6 +1209,9 @@ with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
|
| 1218 |
return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
|
| 1219 |
table.change(sync_table_to_session, inputs=[table, rows_state], outputs=[rows_state, gallery, autosave_md])
|
| 1220 |
|
|
|
|
|
|
|
|
|
|
| 1221 |
# ---- Import hook
|
| 1222 |
def _do_import(fpath, rows):
|
| 1223 |
new_rows, gal, tbl, stamp = import_captions_file(fpath, rows or [])
|
|
|
|
| 5 |
# ------------------------------
|
| 6 |
# 0) Imports & environment
|
| 7 |
# ------------------------------
|
| 8 |
+
import os,
|
| 9 |
+
os.environ.setdefault("HF_HUB_ENABLE_HF_TRANSFER", "1")
|
| 10 |
+
import io, csv, time, json, base64, re, zipfile
|
| 11 |
from typing import List, Tuple, Dict, Any
|
| 12 |
|
| 13 |
# Persist model caches between restarts
|
|
|
|
| 17 |
import gradio as gr
|
| 18 |
from PIL import Image
|
| 19 |
import torch
|
| 20 |
+
from transformers import LlavaForConditionalGeneration, AutoProcessor, TextIteratorStreamer
|
| 21 |
|
| 22 |
# Optional deps for import/export (we handle gracefully if missing)
|
| 23 |
try:
|
|
|
|
| 25 |
except Exception:
|
| 26 |
pd = None
|
| 27 |
|
| 28 |
+
# Liger is optional; skip if missing
|
| 29 |
+
try:
    from liger_kernel.transformers import apply_liger_kernel_to_llama
except Exception:
    # liger_kernel is an optional accelerator: when it cannot be imported,
    # expose a stand-in with the same name that accepts anything and does nothing.
    def apply_liger_kernel_to_llama(*args, **kwargs):
        """No-op replacement used when liger_kernel is unavailable."""
        return None
|
| 34 |
+
|
| 35 |
# Hugging Face Spaces GPU decorator (no-op locally)
|
| 36 |
try:
|
| 37 |
import spaces
|
|
|
|
| 46 |
APP_DIR = os.getcwd()
|
| 47 |
SESSION_FILE = "/tmp/forge_session.json"
|
| 48 |
# --- Branding
|
| 49 |
+
|
| 50 |
+
LOGO_HEIGHT_PX = int(os.getenv("FORGE_LOGO_PX", 60))
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
|
| 53 |
# Settings live in a user cache dir (persists better than /tmp)
|
|
|
|
| 103 |
low_cpu_mem_usage=True,
|
| 104 |
device_map=0,
|
| 105 |
)
|
| 106 |
+
try:
|
| 107 |
+
from liger_kernel.transformers import apply_liger_kernel_to_llama
|
| 108 |
+
lm = getattr(_MODEL, "language_model", None) or getattr(_MODEL, "model", None)
|
| 109 |
+
if lm is not None:
|
| 110 |
+
ok = apply_liger_kernel_to_llama(lm)
|
| 111 |
+
print(f"[liger] enabled: {bool(ok)}")
|
| 112 |
+
else:
|
| 113 |
+
print("[liger] not enabled: LLM submodule not found")
|
| 114 |
+
except Exception as e:
|
| 115 |
+
print(f"[liger] not enabled: {e}")
|
| 116 |
else:
|
| 117 |
_DEVICE = "cpu"
|
| 118 |
_DTYPE = torch.float32
|
|
|
|
| 148 |
"Descriptive": "Write a detailed description for this image.",
|
| 149 |
"Character training": (
|
| 150 |
"Write a thorough, training-ready caption for a character dataset. "
|
| 151 |
+
"Describe subject appearance (physique, face/hair), clothing and accessories, actions/pose/gesture, camera angle/focal cues."
|
| 152 |
+
"If multiple subjects are present, describe each briefly (most prominent first) and distinguish them by visible traits."
|
| 153 |
),
|
| 154 |
"Flux.1-Dev": "Write a Flux.1-Dev style prompt that would reproduce this image faithfully.",
|
| 155 |
"Stable Diffusion": "Write a Stable Diffusion style prompt that would reproduce this image faithfully.",
|
|
|
|
| 182 |
|
| 183 |
|
| 184 |
EXTRA_CHOICES = [
|
| 185 |
+
"Only include a character's modifiable, style-level attributes (hair style/color, makeup, clothing/accessories, pose, expression). Do NOT mention identity traits (skin tone, age, body type).",
|
| 186 |
+
"Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock, etc.).", # You do NOT have permission to remove
|
| 187 |
+
"Be sexually graphic and describe sexual position when visible.",
|
| 188 |
+
"Exclude brand names, watermarks, artist names, file/meta info, quality terms, hashtags, and resolution notes.",
|
| 189 |
+
"Arrange information in the order: Subject; Clothing/Accessories; Action/Pose; Setting/Environment; Lighting/Camera.",
|
| 190 |
"Do NOT use any ambiguous language.",
|
| 191 |
"ONLY describe the most important elements of the image.",
|
| 192 |
"Include information about the ages of any people/characters when applicable.",
|
|
|
|
| 195 |
"Focus on setting, scenery, and context; ignore subject details.",
|
| 196 |
"ONLY describe the subject’s pose, movement, or action. Do NOT mention appearance, clothing, or setting.",
|
| 197 |
"Include synonyms/alternate phrasing to diversify training set.",
|
|
|
|
|
|
|
| 198 |
"Include information about depth, lighting, and camera angle.",
|
| 199 |
"Include information on composition (rule of thirds, symmetry, leading lines, etc).",
|
| 200 |
"Specify the depth of field and whether the background is in focus or blurred.",
|
|
|
|
| 203 |
]
|
| 204 |
NAME_OPTION = "If there is a person/character in the image you must refer to them as {name}."
|
| 205 |
|
| 206 |
+
# Default selection for the "extras" setting (used as the fallback in load_settings()).
|
| 207 |
+
DEFAULT_EXTRA_CHOICES = [
|
| 208 |
+
"Use profanity when describing sexual acts or genitalia (pussy, fucking, cum, cock, etc.).",
|
| 209 |
+
"Be sexually graphic and describe sexual position when visible.",
|
| 210 |
+
"Exclude brand names, watermarks, artist names, file/meta info, quality terms, hashtags, and resolution notes.",
|
| 211 |
+
"Arrange information in the order: Subject; Clothing/Accessories; Action/Pose; Setting/Environment; Lighting/Camera.",
|
| 212 |
+
]
|
| 213 |
+
ALL_EXTRA_CHOICES = [NAME_OPTION] + EXTRA_CHOICES
|
| 214 |
+
|
| 215 |
+
def _valid_extras(selected: list[str] | None) -> list[str]:
    """Filter a saved extras selection down to the options that still exist.

    Args:
        selected: Previously chosen extra-instruction strings, or None.

    Returns:
        The entries of ``selected`` that appear in ALL_EXTRA_CHOICES, in their
        original order. A None or empty selection yields an empty list.
    """
    known = frozenset(ALL_EXTRA_CHOICES)
    kept = []
    for choice in selected or ():
        if choice in known:
            kept.append(choice)
    return kept
|
| 218 |
+
|
| 219 |
|
| 220 |
# ------------------------------
|
| 221 |
# 4) Persistence helpers (settings/session/journal)
|
|
|
|
| 251 |
"max_tokens": 256,
|
| 252 |
"max_side": 896,
|
| 253 |
"styles": ["Character training"],
|
|
|
|
| 254 |
"name": "",
|
| 255 |
"trigger": "",
|
| 256 |
"begin": "",
|
|
|
|
| 258 |
"shape_aliases_enabled": True,
|
| 259 |
"shape_aliases": [],
|
| 260 |
"excel_thumb_px": 128,
|
|
|
|
| 261 |
"logo_px": 60,
|
|
|
|
| 262 |
"shape_aliases_persist": True,
|
| 263 |
+
"extras": DEFAULT_EXTRA_CHOICES,
|
| 264 |
}
|
| 265 |
|
| 266 |
for k, v in defaults.items():
|
|
|
|
| 271 |
if not isinstance(styles, list):
|
| 272 |
styles = [styles]
|
| 273 |
cfg["styles"] = [s for s in styles if s in STYLE_OPTIONS] or ["Character training"]
|
| 274 |
+
cfg["extras"] = _valid_extras(cfg.get("extras"))
|
| 275 |
|
| 276 |
return cfg
|
| 277 |
|
|
|
|
| 581 |
|
| 582 |
@gpu
|
| 583 |
@torch.no_grad()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 584 |
|
| 585 |
# ------------------------------
|
| 586 |
# 9) Export/Import helpers (CSV/XLSX/TXT ZIP)
|
|
|
|
| 780 |
|
| 781 |
|
| 782 |
# ------------------------------
|
| 783 |
+
# 10) UI header helper (fixed logo size)
|
| 784 |
# ------------------------------
|
| 785 |
+
def _render_header_html(px: int) -> str:
|
|
|
|
| 786 |
return f"""
|
| 787 |
<div class="cf-hero">
|
| 788 |
{logo_b64_img()}
|
| 789 |
<div class="cf-text">
|
| 790 |
<h1 class="cf-title">ForgeCaptions</h1>
|
| 791 |
+
<div class="cf-sub">JoyCaption Image Captioning</div>
|
| 792 |
<div class="cf-sub">Import CSV/XLSX • Export CSV/XLSX/TXT</div>
|
| 793 |
+
<div class="cf-sub">Batch 10–20 per Zero GPU run • Larger batches with dedicated GPU</div>
|
| 794 |
</div>
|
| 795 |
</div>
|
| 796 |
<hr>
|
| 797 |
<style>
|
| 798 |
+
.cf-logo {{
|
| 799 |
+
height: {int(px)}px; /* fixed height */
|
| 800 |
+
width: auto;
|
| 801 |
+
object-fit: contain;
|
| 802 |
+
display: block;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 803 |
}}
|
| 804 |
+
@media (max-width: 640px) {{
|
| 805 |
+
.cf-logo {{ height: {max(60, int(px) - 12)}px; }} /* optional small-screen tweak */
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 806 |
}}
|
| 807 |
+
</style>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 808 |
"""
|
| 809 |
|
|
|
|
| 810 |
# ------------------------------
|
| 811 |
# 11) UI (Blocks)
|
| 812 |
# ------------------------------
|
|
|
|
| 831 |
|
| 832 |
with gr.Blocks(css=BASE_CSS, title="ForgeCaptions") as demo:
|
| 833 |
# Ensure Spaces sees a GPU function (without touching CUDA in main)
|
| 834 |
+
demo.load(inputs=None, outputs=None)
|
| 835 |
|
| 836 |
# ---- Header
|
| 837 |
settings = load_settings()
|
| 838 |
+
header_html = gr.HTML(_render_header_html(LOGO_HEIGHT_PX))
|
| 839 |
+
|
| 840 |
|
| 841 |
|
| 842 |
# ---- Controls group
|
|
|
|
| 888 |
step=8, label="Excel thumbnail size (px)")
|
| 889 |
# Chunking
|
| 890 |
chunk_mode = gr.Radio(
|
| 891 |
+
choices=["Auto", "Manual (step)"],
|
| 892 |
value="Manual (step)", label="Batch mode"
|
| 893 |
)
|
| 894 |
+
chunk_size = gr.Slider(1, 200, value=15, step=1, label="Chunk size")
|
| 895 |
gpu_budget = gr.Slider(20, 110, value=55, step=5, label="Max seconds per GPU call")
|
| 896 |
no_time_limit = gr.Checkbox(value=False, label="No time limit (ignore above)")
|
| 897 |
|
|
|
|
| 998 |
outputs=[single_caption_out]
|
| 999 |
)
|
| 1000 |
|
| 1001 |
+
# with gr.Tab("Batch"):
|
| 1002 |
+
# with gr.Accordion("Uploaded images", open=True):
|
| 1003 |
+
# input_files = gr.File(label="Drop images (or click to select)", file_types=["image"], file_count="multiple", type="filepath")
|
| 1004 |
+
# run_button = gr.Button("Caption batch", variant="primary")
|
| 1005 |
+
|
| 1006 |
+
# with gr.Accordion("Import captions from CSV/XLSX (merge by filename)", open=False):
|
| 1007 |
+
# import_file = gr.File(label="Choose .csv or .xlsx", file_types=[".csv", ".xlsx"], type="filepath")
|
| 1008 |
+
# import_btn = gr.Button("Import into current session")
|
| 1009 |
+
|
| 1010 |
with gr.Tab("Batch"):
|
| 1011 |
with gr.Accordion("Uploaded images", open=True):
|
| 1012 |
+
input_files = gr.File(label="Drop images (or click to select)", file_types=["image"], file_count="multiple",)
|
|
|
|
| 1013 |
|
| 1014 |
+
run_button = gr.Button("Caption batch", variant="primary")
|
| 1015 |
+
preview_gallery = gr.Gallery(
|
| 1016 |
+
label="Preview (un-captioned)",
|
| 1017 |
+
show_label=True,
|
| 1018 |
+
columns=5,
|
| 1019 |
+
height=220,
|
| 1020 |
+
)
|
| 1021 |
+
input_files.change(on_files_changed, inputs=[input_files], outputs=[preview_gallery])
|
| 1022 |
|
| 1023 |
+
|
| 1024 |
# ---- Results area (gallery left / table right)
|
| 1025 |
rows_state = gr.State(load_session())
|
| 1026 |
autosave_md = gr.Markdown("Ready.")
|
|
|
|
| 1028 |
remaining_state = gr.State([])
|
| 1029 |
|
| 1030 |
with gr.Row():
|
| 1031 |
+
with gr.Column(scale=2):
|
| 1032 |
gallery = gr.Gallery(
|
| 1033 |
+
label="Results",
|
| 1034 |
show_label=True,
|
| 1035 |
columns=3,
|
| 1036 |
elem_id="cfGal",
|
|
|
|
| 1038 |
)
|
| 1039 |
with gr.Column(scale=1, elem_id="cfTableWrap", elem_classes=["cf-scroll"]):
|
| 1040 |
table = gr.Dataframe(
|
| 1041 |
+
label="Editable captions",
|
| 1042 |
value=_rows_to_table(load_session()),
|
| 1043 |
headers=["filename", "caption"],
|
| 1044 |
interactive=True,
|
|
|
|
| 1135 |
prog = f"Batch progress: {done}/{total} processed in this step • Remaining overall: {len(remaining)}"
|
| 1136 |
return new_rows, gal, tbl, stamp, remaining, panel_vis, gr.update(value=msg), gr.update(value=prog)
|
| 1137 |
|
| 1138 |
+
# Auto
|
| 1139 |
new_rows, gal, tbl, stamp, leftover, done, total = run_batch(
|
| 1140 |
files, rows or [], instr, t, p, m, int(ms), budget
|
| 1141 |
)
|
|
|
|
| 1147 |
# Run a batch step, then rebuild the results gallery from the updated session rows.
run_button.click(
    _run_click,
    inputs=[input_files, rows_state, instruction_preview, max_side, chunk_mode, chunk_size, gpu_budget, no_time_limit],
    outputs=[rows_state, gallery, table, autosave_md, remaining_state, step_panel, step_msg, progress_md],
).then(
    # `rows or []` keeps the refresh from failing when the state holds None.
    lambda rows: [(Image.open(r["path"]).convert("RGB"), r["caption"]) for r in (rows or [])],
    inputs=[rows_state],
    outputs=[gallery],
)
# NOTE(review): a second table.change(sync_table_to_session, ...) used to be wired
# here with outputs=[rows_state, captions_text]. sync_table_to_session returns three
# values (rows, gallery pairs, status text), so that output list did not match, and
# the same handler is already registered further down with
# outputs=[rows_state, gallery, autosave_md], which also refreshes the gallery.
# The duplicate was removed; confirm nothing else relies on `captions_text`.
|
|
|
|
| 1165 |
def _step_next(remain, rows, instr, ms, csize, budget_s, no_limit):
|
| 1166 |
t, p, m = _tpms()
|
| 1167 |
remain = remain or []
|
|
|
|
| 1171 |
return (
|
| 1172 |
rows,
|
| 1173 |
gr.update(value="No files remaining."),
|
| 1174 |
+
gr.update(visible=True),
|
| 1175 |
[],
|
| 1176 |
[],
|
| 1177 |
[],
|
|
|
|
| 1209 |
return session_rows, gallery_pairs, f"Saved • {time.strftime('%H:%M:%S')}"
|
| 1210 |
table.change(sync_table_to_session, inputs=[table, rows_state], outputs=[rows_state, gallery, autosave_md])
|
| 1211 |
|
| 1212 |
+
def new_session() -> Tuple[List[dict], list, list, str]:
    """Return the blank values that represent a fresh session.

    Returns:
        A 4-tuple of: an empty row list, an empty list, the table produced by
        ``_rows_to_table`` for no rows, and an empty string.
        NOTE(review): presumably these feed the rows state, gallery, table and
        status outputs in that order — confirm against the event wiring.
    """
    blank_table = _rows_to_table([])
    return [], [], blank_table, ""
|
| 1214 |
+
|
| 1215 |
# ---- Import hook
|
| 1216 |
def _do_import(fpath, rows):
|
| 1217 |
new_rows, gal, tbl, stamp = import_captions_file(fpath, rows or [])
|