Spaces:

nickdigger
/

joy-caption-enhanced

Running on Zero

App Files Files Community

nickdigger committed on Oct 10, 2025

Commit

6e12f0b

verified ·

1 Parent(s): 25c3e29

Update hf_space_utils.py

Browse files

Files changed (1) hide show

hf_space_utils.py +88 -171

hf_space_utils.py CHANGED Viewed

@@ -1,194 +1,111 @@
 """
-Utilities for Hugging Face Spaces post-processing and filename generation.
-This module provides:
-- fix_image_url: convert a gradio /file=... tmp path to a public /gradio_api/file=... URL or normalize existing URLs.
-- sanitize_caption: post-process caption text to remove photographic-technical sentences and background/people descriptions, and keep a brief context.
-- sanitize_all_captions: apply sanitize_caption to a dict with caption fields.
-- build_output_filename: create a filename with model name and current timestamp.
-Usage:
-- Import these functions from your Space app before exporting the JSON and use them to post-process the JSON payload.
-- A companion CLI script `process_json.py` (included) can be run to process an existing exported JSON file.
 """
 from datetime import datetime
 import re
 from urllib.parse import urlparse, urlunparse
 FORBIDDEN_KEYWORDS = [
-    "camera", "angle", "lighting", "lens", "exposure", "shutter", "aperture", "iso",
-    "f-stop", "hdr", "photograph", "photographed", "photography", "photo"
 ]
 BACKGROUND_KEYWORDS = ["background", "people", "person", "objects", "object", "crowd", "bystanders"]
 SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def build_output_filename(model_name: str) -> str:
-    """
-    Return filename like '{modelname}_YYYYMMDD_HHMMSS.json'
-    """
-    ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
-    safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', model_name.lower())
-    return f"{safe_name}_{ts}.json"
-def fix_image_url(raw_url_or_path: str, host: str = None) -> str:
-    """
-    Convert a gradio internal file URL/path to the public gradio_api URL.
-    Examples:
-    - Input: 'https://nickdigger-joy-caption-enhanced.hf.space/file=/tmp/gradio/.../img.png'
-      Output: 'https://nickdigger-joy-caption-enhanced.hf.space/gradio_api/file=/tmp/gradio/.../img.png'
-    - Input: '/tmp/gradio/.../img.png' with host='nickdigger-joy-caption-enhanced.hf.space'
-      Output: 'https://nickdigger-joy-caption-enhanced.hf.space/gradio_api/file=/tmp/gradio/.../img.png'
-    - If host is provided and input is already a path, builds an absolute URL.
-    """
-    if not raw_url_or_path:
-        return raw_url_or_path
-    # If it's a full URL, replace '/file=' with '/gradio_api/file='
-    try:
-        parsed = urlparse(raw_url_or_path)
-    except Exception:
-        parsed = None
-    if parsed and parsed.scheme and parsed.netloc:
-        # full URL
-        path = parsed.path
-        query = parsed.query
-        full = raw_url_or_path
-        # Some spaces embed file= in the path or in the query, replace both occurrences
-        if "/file=" in full and "/gradio_api/file=" not in full:
-            full = full.replace("/file=", "/gradio_api/file=")
-        if "file=" in full and "/gradio_api/file=" not in full and "/gradio_api" not in full:
-            # try to move 'file=' into /gradio_api/file= form
-            full = full.replace("file=", "gradio_api/file=")
-        return full
-    # If it's just a local path (starting with /tmp/gradio), build a gradio_api URL if host provided
-    if raw_url_or_path.startswith("/tmp/") or raw_url_or_path.startswith("tmp/"):
-        if not host:
-            return raw_url_or_path
-        host = host.rstrip("/")
-        if not (host.startswith("http://") or host.startswith("https://")):
-            host = "https://" + host
-        # strip leading slash from path for consistent formatting in query-style path
-        p = raw_url_or_path
-        if p.startswith("/"):
-            p = p[1:]
-        # assemble url: https://{host}/gradio_api/file=/{tmp...}
-        return f"{host}/gradio_api/file=/{p}"
-    return raw_url_or_path
 def _contains_forbidden(sentence: str) -> bool:
-    s = sentence.lower()
-    for kw in FORBIDDEN_KEYWORDS:
-        if kw in s:
-            return True
-    return False
 def _contains_background(sentence: str) -> bool:
-    s = sentence.lower()
-    for kw in BACKGROUND_KEYWORDS:
-        if kw in s:
-            return True
-    return False
 def sanitize_caption(caption: str, max_sentences: int = 2) -> str:
-    """
-    Return a sanitized, brief caption:
-    - Remove sentences that mention camera/photographic technical details.
-    - Remove sentences that describe background people/objects.
-    - Keep up to max_sentences of the remaining text (to make it brief).
-    - If nothing remains, return a short fallback one-line description.
-    """
-    if not caption:
-        return ""
-    # Normalize whitespace
-    caption = caption.strip()
-    sentences = SENTENCE_SPLIT_RE.split(caption)
-    kept = []
-    for s in sentences:
-        s_stripped = s.strip()
-        if not s_stripped:
-            continue
-        if _contains_forbidden(s_stripped):
-            continue
-        if _contains_background(s_stripped):
-            continue
-        kept.append(s_stripped)
-        if len(kept) >= max_sentences:
-            break
-    if not kept:
-        # Fallback: try to extract a short phrase from the original caption without forbidden words
-        tokens = []
-        for w in re.split(r'\s+', caption):
-            if any(kw in w.lower() for kw in FORBIDDEN_KEYWORDS + BACKGROUND_KEYWORDS):
-                continue
-            tokens.append(w)
-            if len(tokens) >= 12:
-                break
-        if tokens:
-            return " ".join(tokens).rstrip(",.") + "."
-        # final fallback
-        return caption.split('.')[0].strip() + "."
-    # Join kept sentences into a short paragraph
-    result = " ".join(kept)
-    # Ensure it ends with a period
-    if not result.endswith(('.', '!', '?')):
-        result = result + "."
-    return result
 def sanitize_all_captions(data: dict, caption_keys=None) -> dict:
-    """
-    Given a data dict with caption fields, returns a new dict with sanitized captions.
-    - caption_keys: list of keys to sanitize (defaults to common keys)
-    """
-    if caption_keys is None:
-        caption_keys = ["caption_engaging", "caption_casual_friend", "caption_keywords", "caption", "caption_short"]
-    out = dict(data)  # shallow copy
-    for key in caption_keys:
-        if key in out and isinstance(out[key], str):
-            out[key] = sanitize_caption(out[key], max_sentences=2)
-    return out
 if __name__ == "__main__":
-    import argparse
-    import json
-    parser = argparse.ArgumentParser(description="Post-process exported JoyCaption JSON")
-    parser.add_argument("input", help="Path to input JSON file")
-    parser.add_argument("output", nargs="?", help="Output path (optional). If omitted, will overwrite input.")
-    parser.add_argument("--host", help="Public host (e.g. nickdigger-joy-caption-enhanced.hf.space) used to build gradio_api URLs from tmp paths")
-    parser.add_argument("--model", default="joycaption", help="Model tag to include in output filename")
-    args = parser.parse_args()
-    with open(args.input, "r", encoding="utf-8") as f:
-        j = json.load(f)
-    # fix image_data.url if present
-    img = j.get("image_data", {})
-    if isinstance(img, dict):
-        raw = img.get("url") or img.get("image_url") or img.get("image")
-        if raw:
-            fixed = fix_image_url(raw, host=args.host)
-            j.setdefault("image_data", {})["url"] = fixed
-    # sanitize captions in data
-    data = j.get("data", {})
-    if isinstance(data, dict):
-        j["data"] = sanitize_all_captions(data)
-    # write out
-    outpath = args.output or args.input
-    with open(outpath, "w", encoding="utf-8") as f:
-        json.dump(j, f, ensure_ascii=False, indent=2)
-    # if output filename should include model & timestamp and output was not explicitly provided, print a suggested filename
-    suggested = build_output_filename(args.model)
-    print(f"Processed JSON written to: {outpath}")
-    print(f"Suggested export filename with model tag: {suggested}")

 """
+Copy of hf_space_utils.py for deployment package. This is the same helper module used by the app.
 """
 from datetime import datetime
+from typing import Optional
 import re
 from urllib.parse import urlparse, urlunparse
# Lower-case terms for camera / photographic-technique vocabulary. A caption
# sentence mentioning any of them is dropped by sanitize_caption (via
# _contains_forbidden), and matching tokens are skipped in its fallback path.
FORBIDDEN_KEYWORDS = [
	"camera", "angle", "lighting", "lens", "exposure", "shutter", "aperture", "iso",
	"f-stop", "hdr", "photograph", "photographed", "photography", "photo"
]
# Lower-case terms for background / bystander descriptions; used the same way
# as FORBIDDEN_KEYWORDS (via _contains_background).
BACKGROUND_KEYWORDS = ["background", "people", "person", "objects", "object", "crowd", "bystanders"]
# Splits text on the whitespace that follows '.', '!' or '?'; the punctuation
# itself stays attached to the preceding sentence because it is only looked
# behind, not consumed.
SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
 def build_output_filename(model_name: str) -> str:
+	ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
+	safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', model_name.lower())
+	return f"{safe_name}_{ts}.json"
def fix_image_url(raw_url_or_path: str, host: Optional[str] = None) -> str:
    """Rewrite a Gradio ``file=`` URL or temp path to the ``/gradio_api/file=`` form.

    Behaviour, in order:

    * Empty input (``""`` or ``None``) is returned unchanged.
    * Absolute URL (scheme and host both present): every ``/file=`` becomes
      ``/gradio_api/file=`` unless the URL already contains
      ``/gradio_api/file=``. Otherwise, if the URL does not contain
      ``/gradio_api`` at all, a query parameter named ``file`` (``?file=`` or
      ``&file=``) is prefixed with ``gradio_api/``.
    * A path starting with ``/tmp/`` or ``tmp/`` is turned into
      ``{host}/gradio_api/file=/tmp/...`` when ``host`` is given (``https://``
      is assumed if ``host`` has no scheme); without ``host`` it is returned
      unchanged.
    * Anything else is returned unchanged.

    Args:
        raw_url_or_path: Absolute URL or local temp path of the image.
        host: Public host of the Space, with or without scheme.

    Returns:
        The rewritten URL, or the input unchanged when no rule applies.
    """
    if not raw_url_or_path:
        return raw_url_or_path

    try:
        parsed = urlparse(raw_url_or_path)
    except ValueError:
        # urlparse rejects malformed authorities (e.g. an unbalanced IPv6
        # bracket); treat such input as "not an absolute URL".
        parsed = None

    if parsed and parsed.scheme and parsed.netloc:
        full = raw_url_or_path
        if "/file=" in full and "/gradio_api/file=" not in full:
            full = full.replace("/file=", "/gradio_api/file=")
        if "/gradio_api" not in full:
            # Only touch a query parameter literally named "file". A bare
            # str.replace("file=", ...) also rewrote unrelated parameters
            # such as "profile=", corrupting the URL.
            full = re.sub(r"(?<=[?&])file=", "gradio_api/file=", full)
        return full

    if raw_url_or_path.startswith(("/tmp/", "tmp/")):
        if not host:
            return raw_url_or_path
        base = host.rstrip("/")
        if not base.startswith(("http://", "https://")):
            base = "https://" + base
        # Drop the leading slash so the result always reads ".../file=/tmp/...".
        relative = raw_url_or_path.lstrip("/")
        return f"{base}/gradio_api/file=/{relative}"

    return raw_url_or_path
 def _contains_forbidden(sentence: str) -> bool:
+	s = sentence.lower()
+	for kw in FORBIDDEN_KEYWORDS:
+		if kw in s:
+			return True
+	return False
 def _contains_background(sentence: str) -> bool:
+	s = sentence.lower()
+	for kw in BACKGROUND_KEYWORDS:
+		if kw in s:
+			return True
+	return False
 def sanitize_caption(caption: str, max_sentences: int = 2) -> str:
+	if not caption:
+		return ""
+	caption = caption.strip()
+	sentences = SENTENCE_SPLIT_RE.split(caption)
+	kept = []
+	for s in sentences:
+		s_stripped = s.strip()
+		if not s_stripped:
+			continue
+		if _contains_forbidden(s_stripped):
+			continue
+		if _contains_background(s_stripped):
+			continue
+		kept.append(s_stripped)
+		if len(kept) >= max_sentences:
+			break
+	if not kept:
+		tokens = []
+		for w in re.split(r'\s+', caption):
+			if any(kw in w.lower() for kw in FORBIDDEN_KEYWORDS + BACKGROUND_KEYWORDS):
+				continue
+			tokens.append(w)
+			if len(tokens) >= 12:
+				break
+		if tokens:
+			return " ".join(tokens).rstrip(",.") + "."
+		return caption.split('.')[0].strip() + "."
+	result = " ".join(kept)
+	if not result.endswith(('.', '!', '?')):
+		result = result + "."
+	return result
 def sanitize_all_captions(data: dict, caption_keys=None) -> dict:
+	if caption_keys is None:
+		caption_keys = ["caption_engaging", "caption_casual_friend", "caption_keywords", "caption", "caption_short"]
+	out = dict(data)
+	for key in caption_keys:
+		if key in out and isinstance(out[key], str):
+			out[key] = sanitize_caption(out[key], max_sentences=2)
+	return out
# Script entry point: when the module is executed directly it only prints a
# fixed confirmation message; no arguments are parsed and no files are read.
if __name__ == "__main__":
	print("hf_space_utils deployed")