nickdigger committed on
Commit
6e12f0b
·
verified ·
1 Parent(s): 25c3e29

Update hf_space_utils.py

Browse files
Files changed (1) hide show
  1. hf_space_utils.py +88 -171
hf_space_utils.py CHANGED
@@ -1,194 +1,111 @@
1
  """
2
- Utilities for Hugging Face Spaces post-processing and filename generation.
3
-
4
- This module provides:
5
- - fix_image_url: convert a gradio /file=... tmp path to a public /gradio_api/file=... URL or normalize existing URLs.
6
- - sanitize_caption: post-process caption text to remove photographic-technical sentences and background/people descriptions, and keep a brief context.
7
- - sanitize_all_captions: apply sanitize_caption to a dict with caption fields.
8
- - build_output_filename: create a filename with model name and current timestamp.
9
-
10
- Usage:
11
- - Import these functions from your Space app before exporting the JSON and use them to post-process the JSON payload.
12
- - A companion CLI script `process_json.py` (included) can be run to process an existing exported JSON file.
13
  """
14
-
15
  from datetime import datetime
 
16
  import re
17
  from urllib.parse import urlparse, urlunparse
18
 
19
  FORBIDDEN_KEYWORDS = [
20
- "camera", "angle", "lighting", "lens", "exposure", "shutter", "aperture", "iso",
21
- "f-stop", "hdr", "photograph", "photographed", "photography", "photo"
22
  ]
23
  BACKGROUND_KEYWORDS = ["background", "people", "person", "objects", "object", "crowd", "bystanders"]
24
 
25
  SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
26
 
27
  def build_output_filename(model_name: str) -> str:
28
- """
29
- Return filename like '{modelname}_YYYYMMDD_HHMMSS.json'
30
- """
31
- ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
32
- safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', model_name.lower())
33
- return f"{safe_name}_{ts}.json"
34
-
35
- def fix_image_url(raw_url_or_path: str, host: str = None) -> str:
36
- """
37
- Convert a gradio internal file URL/path to the public gradio_api URL.
38
- Examples:
39
- - Input: 'https://nickdigger-joy-caption-enhanced.hf.space/file=/tmp/gradio/.../img.png'
40
- Output: 'https://nickdigger-joy-caption-enhanced.hf.space/gradio_api/file=/tmp/gradio/.../img.png'
41
- - Input: '/tmp/gradio/.../img.png' with host='nickdigger-joy-caption-enhanced.hf.space'
42
- Output: 'https://nickdigger-joy-caption-enhanced.hf.space/gradio_api/file=/tmp/gradio/.../img.png'
43
- - If host is provided and input is already a path, builds an absolute URL.
44
- """
45
- if not raw_url_or_path:
46
- return raw_url_or_path
47
-
48
- # If it's a full URL, replace '/file=' with '/gradio_api/file='
49
- try:
50
- parsed = urlparse(raw_url_or_path)
51
- except Exception:
52
- parsed = None
53
-
54
- if parsed and parsed.scheme and parsed.netloc:
55
- # full URL
56
- path = parsed.path
57
- query = parsed.query
58
- full = raw_url_or_path
59
- # Some spaces embed file= in the path or in the query, replace both occurrences
60
- if "/file=" in full and "/gradio_api/file=" not in full:
61
- full = full.replace("/file=", "/gradio_api/file=")
62
- if "file=" in full and "/gradio_api/file=" not in full and "/gradio_api" not in full:
63
- # try to move 'file=' into /gradio_api/file= form
64
- full = full.replace("file=", "gradio_api/file=")
65
- return full
66
-
67
- # If it's just a local path (starting with /tmp/gradio), build a gradio_api URL if host provided
68
- if raw_url_or_path.startswith("/tmp/") or raw_url_or_path.startswith("tmp/"):
69
- if not host:
70
- return raw_url_or_path
71
- host = host.rstrip("/")
72
- if not (host.startswith("http://") or host.startswith("https://")):
73
- host = "https://" + host
74
- # strip leading slash from path for consistent formatting in query-style path
75
- p = raw_url_or_path
76
- if p.startswith("/"):
77
- p = p[1:]
78
- # assemble url: https://{host}/gradio_api/file=/{tmp...}
79
- return f"{host}/gradio_api/file=/{p}"
80
-
81
- return raw_url_or_path
82
 
83
  def _contains_forbidden(sentence: str) -> bool:
84
- s = sentence.lower()
85
- for kw in FORBIDDEN_KEYWORDS:
86
- if kw in s:
87
- return True
88
- return False
89
 
90
  def _contains_background(sentence: str) -> bool:
91
- s = sentence.lower()
92
- for kw in BACKGROUND_KEYWORDS:
93
- if kw in s:
94
- return True
95
- return False
96
 
97
  def sanitize_caption(caption: str, max_sentences: int = 2) -> str:
98
- """
99
- Return a sanitized, brief caption:
100
- - Remove sentences that mention camera/photographic technical details.
101
- - Remove sentences that describe background people/objects.
102
- - Keep up to max_sentences of the remaining text (to make it brief).
103
- - If nothing remains, return a short fallback one-line description.
104
- """
105
- if not caption:
106
- return ""
107
-
108
- # Normalize whitespace
109
- caption = caption.strip()
110
- sentences = SENTENCE_SPLIT_RE.split(caption)
111
- kept = []
112
- for s in sentences:
113
- s_stripped = s.strip()
114
- if not s_stripped:
115
- continue
116
- if _contains_forbidden(s_stripped):
117
- continue
118
- if _contains_background(s_stripped):
119
- continue
120
- kept.append(s_stripped)
121
- if len(kept) >= max_sentences:
122
- break
123
-
124
- if not kept:
125
- # Fallback: try to extract a short phrase from the original caption without forbidden words
126
- tokens = []
127
- for w in re.split(r'\s+', caption):
128
- if any(kw in w.lower() for kw in FORBIDDEN_KEYWORDS + BACKGROUND_KEYWORDS):
129
- continue
130
- tokens.append(w)
131
- if len(tokens) >= 12:
132
- break
133
- if tokens:
134
- return " ".join(tokens).rstrip(",.") + "."
135
-
136
- # final fallback
137
- return caption.split('.')[0].strip() + "."
138
-
139
- # Join kept sentences into a short paragraph
140
- result = " ".join(kept)
141
- # Ensure it ends with a period
142
- if not result.endswith(('.', '!', '?')):
143
- result = result + "."
144
- return result
145
 
146
  def sanitize_all_captions(data: dict, caption_keys=None) -> dict:
147
- """
148
- Given a data dict with caption fields, returns a new dict with sanitized captions.
149
- - caption_keys: list of keys to sanitize (defaults to common keys)
150
- """
151
- if caption_keys is None:
152
- caption_keys = ["caption_engaging", "caption_casual_friend", "caption_keywords", "caption", "caption_short"]
153
-
154
- out = dict(data) # shallow copy
155
- for key in caption_keys:
156
- if key in out and isinstance(out[key], str):
157
- out[key] = sanitize_caption(out[key], max_sentences=2)
158
- return out
159
 
160
  if __name__ == "__main__":
161
- import argparse
162
- import json
163
- parser = argparse.ArgumentParser(description="Post-process exported JoyCaption JSON")
164
- parser.add_argument("input", help="Path to input JSON file")
165
- parser.add_argument("output", nargs="?", help="Output path (optional). If omitted, will overwrite input.")
166
- parser.add_argument("--host", help="Public host (e.g. nickdigger-joy-caption-enhanced.hf.space) used to build gradio_api URLs from tmp paths")
167
- parser.add_argument("--model", default="joycaption", help="Model tag to include in output filename")
168
- args = parser.parse_args()
169
-
170
- with open(args.input, "r", encoding="utf-8") as f:
171
- j = json.load(f)
172
-
173
- # fix image_data.url if present
174
- img = j.get("image_data", {})
175
- if isinstance(img, dict):
176
- raw = img.get("url") or img.get("image_url") or img.get("image")
177
- if raw:
178
- fixed = fix_image_url(raw, host=args.host)
179
- j.setdefault("image_data", {})["url"] = fixed
180
-
181
- # sanitize captions in data
182
- data = j.get("data", {})
183
- if isinstance(data, dict):
184
- j["data"] = sanitize_all_captions(data)
185
-
186
- # write out
187
- outpath = args.output or args.input
188
- with open(outpath, "w", encoding="utf-8") as f:
189
- json.dump(j, f, ensure_ascii=False, indent=2)
190
-
191
- # if output filename should include model & timestamp and output was not explicitly provided, print a suggested filename
192
- suggested = build_output_filename(args.model)
193
- print(f"Processed JSON written to: {outpath}")
194
- print(f"Suggested export filename with model tag: {suggested}")
 
1
  """
2
+ Copy of hf_space_utils.py for deployment package. This is the same helper module used by the app.
 
 
 
 
 
 
 
 
 
 
3
  """
 
4
  from datetime import datetime
5
+ from typing import Optional
6
  import re
7
  from urllib.parse import urlparse, urlunparse
8
 
9
  FORBIDDEN_KEYWORDS = [
10
+ "camera", "angle", "lighting", "lens", "exposure", "shutter", "aperture", "iso",
11
+ "f-stop", "hdr", "photograph", "photographed", "photography", "photo"
12
  ]
13
  BACKGROUND_KEYWORDS = ["background", "people", "person", "objects", "object", "crowd", "bystanders"]
14
 
15
  SENTENCE_SPLIT_RE = re.compile(r'(?<=[.!?])\s+')
16
 
17
  def build_output_filename(model_name: str) -> str:
18
+ ts = datetime.utcnow().strftime("%Y%m%d_%H%M%S")
19
+ safe_name = re.sub(r'[^a-zA-Z0-9_-]', '', model_name.lower())
20
+ return f"{safe_name}_{ts}.json"
21
+
22
+ def fix_image_url(raw_url_or_path: str, host: Optional[str] = None) -> str:
23
+ if not raw_url_or_path:
24
+ return raw_url_or_path
25
+
26
+ try:
27
+ parsed = urlparse(raw_url_or_path)
28
+ except Exception:
29
+ parsed = None
30
+
31
+ if parsed and parsed.scheme and parsed.netloc:
32
+ full = raw_url_or_path
33
+ if "/file=" in full and "/gradio_api/file=" not in full:
34
+ full = full.replace("/file=", "/gradio_api/file=")
35
+ if "file=" in full and "/gradio_api/file=" not in full and "/gradio_api" not in full:
36
+ full = full.replace("file=", "gradio_api/file=")
37
+ return full
38
+
39
+ if raw_url_or_path.startswith("/tmp/") or raw_url_or_path.startswith("tmp/"):
40
+ if not host:
41
+ return raw_url_or_path
42
+ host = host.rstrip("/")
43
+ if not (host.startswith("http://") or host.startswith("https://")):
44
+ host = "https://" + host
45
+ p = raw_url_or_path
46
+ if p.startswith("/"):
47
+ p = p[1:]
48
+ return f"{host}/gradio_api/file=/{p}"
49
+
50
+ return raw_url_or_path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
  def _contains_forbidden(sentence: str) -> bool:
53
+ s = sentence.lower()
54
+ for kw in FORBIDDEN_KEYWORDS:
55
+ if kw in s:
56
+ return True
57
+ return False
58
 
59
  def _contains_background(sentence: str) -> bool:
60
+ s = sentence.lower()
61
+ for kw in BACKGROUND_KEYWORDS:
62
+ if kw in s:
63
+ return True
64
+ return False
65
 
66
  def sanitize_caption(caption: str, max_sentences: int = 2) -> str:
67
+ if not caption:
68
+ return ""
69
+ caption = caption.strip()
70
+ sentences = SENTENCE_SPLIT_RE.split(caption)
71
+ kept = []
72
+ for s in sentences:
73
+ s_stripped = s.strip()
74
+ if not s_stripped:
75
+ continue
76
+ if _contains_forbidden(s_stripped):
77
+ continue
78
+ if _contains_background(s_stripped):
79
+ continue
80
+ kept.append(s_stripped)
81
+ if len(kept) >= max_sentences:
82
+ break
83
+
84
+ if not kept:
85
+ tokens = []
86
+ for w in re.split(r'\s+', caption):
87
+ if any(kw in w.lower() for kw in FORBIDDEN_KEYWORDS + BACKGROUND_KEYWORDS):
88
+ continue
89
+ tokens.append(w)
90
+ if len(tokens) >= 12:
91
+ break
92
+ if tokens:
93
+ return " ".join(tokens).rstrip(",.") + "."
94
+ return caption.split('.')[0].strip() + "."
95
+
96
+ result = " ".join(kept)
97
+ if not result.endswith(('.', '!', '?')):
98
+ result = result + "."
99
+ return result
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  def sanitize_all_captions(data: dict, caption_keys=None) -> dict:
102
+ if caption_keys is None:
103
+ caption_keys = ["caption_engaging", "caption_casual_friend", "caption_keywords", "caption", "caption_short"]
104
+ out = dict(data)
105
+ for key in caption_keys:
106
+ if key in out and isinstance(out[key], str):
107
+ out[key] = sanitize_caption(out[key], max_sentences=2)
108
+ return out
 
 
 
 
 
109
 
110
  if __name__ == "__main__":
111
+ print("hf_space_utils deployed")