saliacoel committed on
Commit
2a6a218
·
verified ·
1 Parent(s): 078fcdb

Upload TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip.py

Browse files
TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip.py ADDED
@@ -0,0 +1,500 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import shutil
4
+ import tempfile
5
+ import time
6
+ import unicodedata
7
+ import zipfile
8
+ from collections import defaultdict
9
+ from urllib.parse import quote, unquote
10
+ from urllib.request import Request, urlopen
11
+
12
+ try:
13
+ from huggingface_hub import HfApi
14
+ except Exception:
15
+ HfApi = None
16
+
17
+
18
# Archive with the unsorted character folders; downloaded once per run.
SOURCE_ZIP_URL = "https://huggingface.co/saliacoel/tmp/resolve/main/chars_unsorted.zip"
# Hugging Face repository the generated archive is uploaded to.
TMP_REPO_ID = "saliacoel/tmp"
# File name of the generated archive inside TMP_REPO_ID.
OUTPUT_ZIP_NAME = "chars_numbered_desc.zip"

# ID marker that opens a BAM entry, for example:
#   1. Adali, tags...
#   0001. Adali, tags...
#   2000. Name, tags...
#
# The digits must start the text or follow whitespace, and are followed by a
# period. Markers are searched for anywhere in the text, so entries do not
# have to sit on separate lines.
# NOTE(review): a decimal number in tag text that follows whitespace
# (e.g. " 1.75") also matches this pattern - confirm bam.txt has none.
ID_MARKER_RE = re.compile(r"(?<!\S)(0*\d{1,8})\.\s*")
29
+
30
+
31
def _resolve_hf_token(hf_token: str) -> str:
    """
    Return the Hugging Face token to use.

    The explicitly passed token wins when it is non-blank. Otherwise the
    environment variables HF_TOKEN, HUGGINGFACE_TOKEN and
    HUGGING_FACE_HUB_TOKEN are tried in that order.

    Raises:
        ValueError: if no non-blank token is found anywhere.
    """
    explicit = (hf_token or "").strip()
    if explicit:
        return explicit

    env_names = ("HF_TOKEN", "HUGGINGFACE_TOKEN", "HUGGING_FACE_HUB_TOKEN")
    # Lazy generator: variables are read in order and only until one is set.
    from_env = (os.getenv(env_name, "").strip() for env_name in env_names)
    found = next((value for value in from_env if value), None)
    if found is None:
        raise ValueError("No Hugging Face token provided.")
    return found
42
+
43
+
44
def _download_file(url: str, suffix: str = ".zip") -> str:
    """
    Download ``url`` once into a temporary file and return the file's path.

    Fetching the whole archive in one request avoids 2000 separate
    Hugging Face requests. The temporary file is not deleted automatically;
    the caller is responsible for removing it.

    Args:
        url: Address to download.
        suffix: File-name suffix for the temporary file.

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        Whatever ``urlopen`` or the file write raises. The partially written
        temporary file is removed before the error propagates.
    """
    req = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "*/*",
        },
    )

    # Reserve a named path first; it is reopened for writing below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp.name
    tmp.close()

    try:
        with urlopen(req, timeout=300) as resp, open(tmp_path, "wb") as f:
            shutil.copyfileobj(resp, f, length=1024 * 1024)
    except BaseException:
        # BaseException (not just Exception) so that an interrupted download
        # (KeyboardInterrupt / SystemExit) does not leave the partial file
        # behind either.
        try:
            os.remove(tmp_path)
        except OSError:
            # Best-effort cleanup; the original error is what matters.
            pass
        raise

    return tmp_path
72
+
73
+
74
def _clean_zip_path(path: str) -> str:
    """
    Normalize a zip member path for comparison.

    Backslashes become forward slashes, and every leading "/" and "./" is
    removed. Stripping repeats until the path stops changing, so mixed
    prefixes such as ".//a" or "/.//./a" are fully cleaned instead of
    keeping a leading slash.

    Args:
        path: Member name as stored in the zip.

    Returns:
        The path with forward slashes and no leading "/" or "./".
    """
    path = path.replace("\\", "/")
    while True:
        stripped = path.lstrip("/")
        if stripped.startswith("./"):
            stripped = stripped[2:]
        if stripped == path:
            return path
        path = stripped
80
+
81
+
82
def _split_zip_path(path: str):
    """Return the components of a zip member path, without empty or "." parts."""
    segments = _clean_zip_path(path).split("/")
    return list(filter(lambda segment: segment not in ("", "."), segments))
85
+
86
+
87
def _name_key(name: str) -> str:
    """
    Build the comparison key used to match folder names against BAM names.

    The name is percent-decoded, NFC-normalized, stripped, has inner
    whitespace runs collapsed to one space, and is case-folded. So these
    pairs produce the same key:
        "Adali"        and " Adali "
        "Adali%20Name" and "Adali Name"
    """
    decoded = unquote(name)
    composed = unicodedata.normalize("NFC", decoded)
    # split() without arguments drops leading/trailing whitespace as well.
    collapsed = " ".join(composed.split())
    return collapsed.casefold()
100
+
101
+
102
def _extract_name_from_bam_segment(segment: str) -> str:
    """
    Return the character name at the start of a BAM entry segment.

    The name is the text before the first comma (or the whole segment when
    there is no comma), NFC-normalized with whitespace runs collapsed.
    Returns "" when the segment holds no name.
    """
    before_comma, _, _ = segment.strip().partition(",")
    composed = unicodedata.normalize("NFC", before_comma)
    return " ".join(composed.split())
116
+
117
+
118
def _parse_bam_name_to_id(bam_text: str):
    """
    Parse the BAM text into name/ID lookup tables.

    Returns a tuple ``(name_to_id, id_to_name, duplicate_names)``:
        name_to_id: normalized name key -> integer ID
        id_to_name: integer ID -> name as parsed from the text
        duplicate_names: names whose key had already been seen

    Raises:
        ValueError: if the text contains no ID marker at all.
    """
    markers = list(ID_MARKER_RE.finditer(bam_text))
    if not markers:
        raise ValueError("No ID markers found in bam.txt.")

    # An entry's text runs from the end of its marker up to the start of the
    # next marker; the last entry runs to the end of the text.
    segment_ends = [marker.start() for marker in markers[1:]]
    segment_ends.append(len(bam_text))

    name_to_id = {}
    id_to_name = {}
    duplicate_names = []

    for marker, segment_end in zip(markers, segment_ends):
        character_id = int(marker.group(1))
        name = _extract_name_from_bam_segment(bam_text[marker.end():segment_end])
        if not name:
            continue

        key = _name_key(name)
        if key in name_to_id:
            # The first mapping is kept; a repeated name is ambiguous.
            duplicate_names.append(name)
            continue

        name_to_id[key] = character_id
        id_to_name[character_id] = name

    return name_to_id, id_to_name, duplicate_names
155
+
156
+
157
def _count_immediate_folders_under_prefix(zip_infos, prefix_parts):
    """
    Count the distinct folders directly below ``prefix_parts`` that hold files.

    The count is used to tell a wrapped layout:
        chars_unsorted/bam.txt
        chars_unsorted/Adali/character_description.txt
    from an unwrapped one:
        bam.txt
        Adali/character_description.txt
    """
    depth = len(prefix_parts)
    child_folders = set()

    for entry in zip_infos:
        if entry.is_dir():
            continue

        parts = _split_zip_path(entry.filename)
        is_under_prefix = parts[:depth] == prefix_parts
        # At least "<folder>/<file>" must remain below the prefix.
        if is_under_prefix and len(parts) - depth >= 2:
            child_folders.add(parts[depth])

    return len(child_folders)
185
+
186
+
187
def _find_bam_member_and_root_prefix(zf: zipfile.ZipFile):
    """
    Locate the main bam.txt in the zip and the folder it lives in.

    Both layouts are supported:
        bam.txt
        Adali/...
    and:
        chars_unsorted/bam.txt
        chars_unsorted/Adali/...

    Returns:
        ``(bam_info, root_prefix)`` - the ZipInfo of the chosen bam.txt and
        the list of path components leading to it (empty for the zip root).

    Raises:
        ValueError: if there is no bam.txt, or the best bam.txt has no
            folders with files next to it.
    """
    entries = zf.infolist()
    ranked = []

    for entry in entries:
        if entry.is_dir():
            continue

        parts = _split_zip_path(entry.filename)
        if not parts or parts[-1].casefold() != "bam.txt":
            continue

        prefix = parts[:-1]
        folder_count = _count_immediate_folders_under_prefix(entries, prefix)
        ranked.append((folder_count, len(prefix), entry, prefix))

    if not ranked:
        raise ValueError("Could not find bam.txt inside the zip.")

    # Most sibling folders wins; on a tie the shallower bam.txt wins.
    # min() keeps the first of equal candidates, like a stable sort would.
    best_count, _, bam_info, root_prefix = min(
        ranked, key=lambda candidate: (-candidate[0], candidate[1])
    )

    if best_count == 0:
        raise ValueError(
            "Found bam.txt, but could not find character folders next to it."
        )

    return bam_info, root_prefix
228
+
229
+
230
def _collect_character_folders(zf: zipfile.ZipFile, root_prefix):
    """
    Group the files below ``root_prefix`` by their top-level folder.

    Returns a mapping such as:
        {
            "Adali": [(ZipInfo, ["character_description.txt"]), ...],
            ...
        }
    where each list item pairs a member with its path parts inside the
    folder. Files lying directly in the root (bam.txt included) belong to no
    folder and are left out.
    """
    depth = len(root_prefix)
    grouped = defaultdict(list)

    for entry in zf.infolist():
        if entry.is_dir():
            continue

        parts = _split_zip_path(entry.filename)
        if len(parts) <= depth or parts[:depth] != root_prefix:
            continue

        folder_name, *inside_folder = parts[depth:]
        # An empty remainder means the file sits in the root itself.
        if inside_folder:
            grouped[folder_name].append((entry, inside_folder))

    return grouped
264
+
265
+
266
def _description_candidate_score(relative_parts):
    """
    Rank a file as a description candidate; a lower score is better.

    Base score by file name (case-insensitive):
        0    well-known names such as "Character Description.txt",
             "character_description.txt" or "description.txt"
        1    any other .txt whose name contains "description"
        2    any other .txt whose name contains "desc"
        3    any other .txt except bam.txt
        999  everything else (not a candidate)

    Files nested in a subfolder of the character folder get 10 added.
    """
    well_known = frozenset(
        (
            "character description.txt",
            "character_description.txt",
            "character-description.txt",
            "characterdescription.txt",
            "character desc.txt",
            "character_desc.txt",
            "description.txt",
        )
    )

    lowered = relative_parts[-1].casefold()

    if lowered in well_known:
        base_score = 0
    elif not lowered.endswith(".txt") or lowered == "bam.txt":
        base_score = 999
    elif "description" in lowered:
        base_score = 1
    elif "desc" in lowered:
        base_score = 2
    else:
        base_score = 3

    is_nested = len(relative_parts) != 1
    return base_score + (10 if is_nested else 0)
306
+
307
+
308
def _select_description_file(files_for_folder):
    """
    Pick the best description file of one character folder.

    ``files_for_folder`` is a list of ``(ZipInfo, relative_parts)`` pairs.
    Candidates are ordered by score, then by path depth, then by path text.
    Returns the ZipInfo of the best one, or None when no file qualifies.
    """
    ranked = []

    for info, relative_parts in files_for_folder:
        if not relative_parts:
            continue

        score = _description_candidate_score(relative_parts)
        if score >= 999:
            # Not a usable text file.
            continue

        ranked.append((score, len(relative_parts), "/".join(relative_parts), info))

    if not ranked:
        return None

    # min() returns the first of equal candidates, like a stable sort would.
    best = min(ranked, key=lambda candidate: candidate[:3])
    return best[3]
326
+
327
+
328
def _safe_status_list(items, limit=20):
    """
    Format ``items`` as a bracketed, comma-separated string for status text.

    Only the first ``limit`` items are shown; the number of hidden items is
    appended as ", ... +N more". An empty or None input yields "[]".
    """
    if not items:
        return "[]"

    shown_text = ", ".join(map(str, items[:limit]))
    hidden_count = len(items) - limit
    overflow_note = f", ... +{hidden_count} more" if hidden_count > 0 else ""

    return f"[{shown_text}{overflow_note}]"
339
+
340
+
341
class TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip:
    """
    ComfyUI action node that builds the numbered description archive.

    It downloads SOURCE_ZIP_URL, maps every character folder to its ID from
    bam.txt, copies one description file per matched folder into a new zip as
    "<ID>_desc.txt" and uploads that zip to TMP_REPO_ID.
    """

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("status",)
    FUNCTION = "run"
    CATEGORY = "Salia"

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the single required input: a one-line ``hf_token`` string."""
        token_input = ("STRING", {"default": "", "multiline": False})
        return {
            "required": {
                "hf_token": token_input,
            }
        }

    @classmethod
    def IS_CHANGED(cls, hf_token):
        """Return a fresh timestamp so ComfyUI runs this action node again when queued."""
        return time.time()

    def run(self, hf_token):
        """
        Build chars_numbered_desc.zip from chars_unsorted.zip and upload it.

        Args:
            hf_token: Hugging Face token; when blank, the environment
                variables checked by ``_resolve_hf_token`` are used.

        Returns:
            A one-element tuple holding a " | "-separated status string
            (upload URL, counts, warnings, failures and successes).

        Raises:
            ValueError: if huggingface_hub is missing, no token is available,
                the zip lacks bam.txt or character folders, or no folder
                could be matched to a BAM ID.
        """
        if HfApi is None:
            raise ValueError(
                "huggingface_hub is not installed. Install it with:\n"
                "pip install huggingface_hub"
            )

        token = _resolve_hf_token(hf_token)

        # Temporary files removed in ``finally``; None until they exist.
        source_zip_path = None
        output_zip_path = None

        successes = []
        failures = []
        warnings = []

        try:
            # 1. Fetch chars_unsorted.zip with a single request.
            source_zip_path = _download_file(SOURCE_ZIP_URL, suffix=".zip")

            with zipfile.ZipFile(source_zip_path, "r") as zf:
                # 2. Locate the root bam.txt and parse its name -> ID table.
                bam_info, root_prefix = _find_bam_member_and_root_prefix(zf)
                bam_text = zf.read(bam_info).decode("utf-8", errors="replace")
                name_to_id, id_to_name, duplicate_names = _parse_bam_name_to_id(bam_text)

                if duplicate_names:
                    duplicates_text = _safe_status_list(duplicate_names, limit=10)
                    warnings.append(f"duplicate_names_in_bam={duplicates_text}")

                folder_files = _collect_character_folders(zf, root_prefix)
                if not folder_files:
                    raise ValueError("No character folders found inside zip.")

                # 3. Match folders to BAM IDs; the first folder per ID wins.
                assigned = {}
                for folder_name, files_for_folder in folder_files.items():
                    character_id = name_to_id.get(_name_key(folder_name))

                    if character_id is None:
                        failures.append(f"folder_not_found_in_bam:{folder_name}")
                    elif character_id in assigned:
                        failures.append(
                            f"duplicate_id_assignment:{character_id:04d}:{folder_name}"
                        )
                    else:
                        desc_info = _select_description_file(files_for_folder)
                        if desc_info is None:
                            failures.append(f"no_description_file:{folder_name}")
                        else:
                            assigned[character_id] = (folder_name, desc_info)

                if not assigned:
                    raise ValueError(
                        "No folders could be assigned to BAM IDs. "
                        f"Failures: {_safe_status_list(failures, limit=20)}"
                    )

                # 4. Write the descriptions, ordered by ID, into a new zip.
                output_tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".zip")
                output_zip_path = output_tmp.name
                output_tmp.close()

                with zipfile.ZipFile(
                    output_zip_path,
                    "w",
                    compression=zipfile.ZIP_DEFLATED,
                    compresslevel=6,
                ) as out_zf:
                    for character_id in sorted(assigned):
                        folder_name, desc_info = assigned[character_id]
                        out_zf.writestr(
                            f"{character_id:04d}_desc.txt", zf.read(desc_info)
                        )
                        successes.append(f"{character_id:04d}:{folder_name}")

            # 5. Upload the finished zip to saliacoel/tmp.
            api = HfApi(token=token)
            api.upload_file(
                path_or_fileobj=output_zip_path,
                path_in_repo=OUTPUT_ZIP_NAME,
                repo_id=TMP_REPO_ID,
                repo_type="model",
                revision="main",
                commit_message=f"Upload {OUTPUT_ZIP_NAME} from chars_unsorted.zip",
            )

            uploaded_url = (
                f"https://huggingface.co/{TMP_REPO_ID}/resolve/main/"
                f"{quote(OUTPUT_ZIP_NAME, safe='')}"
            )
            root_label = "/".join(root_prefix) if root_prefix else "<zip_root>"

            status_parts = [
                f"uploaded={uploaded_url}",
                f"output_zip={OUTPUT_ZIP_NAME}",
                f"descriptions_written={len(successes)}",
                f"folders_failed={len(failures)}",
                f"bam_entries={len(id_to_name)}",
                f"root_prefix={root_label}",
            ]

            if warnings:
                status_parts.append("warnings=" + " ; ".join(warnings))

            if failures:
                status_parts.append("failures=" + _safe_status_list(failures, limit=20))

            status_parts.append("ok=" + _safe_status_list(successes, limit=20))

            return (" | ".join(status_parts),)

        finally:
            # Best-effort removal of both temporary files, on success or error.
            for temp_path in (source_zip_path, output_zip_path):
                if temp_path and os.path.exists(temp_path):
                    try:
                        os.remove(temp_path)
                    except Exception:
                        pass
492
+
493
+
494
# Node registration tables: node identifier -> implementing class.
NODE_CLASS_MAPPINGS = {
    "TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip": (
        TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip
    ),
}

# Node identifier -> display name (here the same text as the identifier).
NODE_DISPLAY_NAME_MAPPINGS = {
    "TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip": (
        "TMP_Salia_Unsorted_Chars_Zip_To_Numbered_Desc_Zip"
    ),
}