# x / BAM_GetDuplicates.py
# saliacoel's picture
# Upload BAM_GetDuplicates.py
# 2e40f58 verified
import re
from collections import Counter
from urllib.request import Request, urlopen
# URL of the BAM.txt file that is downloaded and scanned for duplicate names.
BAM_URL = "https://huggingface.co/saliacoel/chars/resolve/main/BAM.txt"

# Matches ID markers like:
#   1.
#   0001.
# and also works when the next entry starts after a space instead of a line break.
#
# Pattern breakdown:
#   (?<!\S)  the marker must sit at the very start of the text or directly
#            after whitespace (it may not be glued to a preceding character)
#   0*\d+    one or more digits; leading zeros are allowed (the `0*` is
#            already covered by `\d+` and is kept only for readability)
#   \.       a literal period terminating the ID
#   \s*      trailing whitespace is consumed, so an entry body begins at its
#            first non-space character
#
# NOTE(review): any whitespace-preceded number followed by a period inside an
# entry body (e.g. "... 5. ...") also matches and would split that entry in
# two -- confirm that BAM.txt never contains such text.
ID_MARKER_RE = re.compile(r"(?<!\S)0*\d+\.\s*")
def _download_text(url: str) -> str:
req = Request(
url,
headers={
"User-Agent": "Mozilla/5.0",
"Accept": "*/*",
},
)
with urlopen(req, timeout=60) as resp:
return resp.read().decode("utf-8", errors="replace")
def _iter_entry_segments(text: str):
    """
    Generate the body of every BAM entry, i.e. the text that follows one ID
    marker up to (but excluding) the next marker or the end of the text.

    Text located before the first marker is never yielded, and nothing is
    yielded at all when ``text`` contains no marker.

    Example:
        '0001. Adali, tags... 0002. Petra, tags...'
    yields:
        'Adali, tags... '
        'Petra, tags...'
    """
    # Offset where the body of the entry currently being read begins
    # (the end of the most recent marker); None until a marker is seen.
    body_start = None

    for marker in ID_MARKER_RE.finditer(text):
        if body_start is not None:
            # A new marker closes the entry opened by the previous one.
            yield text[body_start:marker.start()]
        body_start = marker.end()

    # The last entry runs to the end of the text.
    if body_start is not None:
        yield text[body_start:]
def _extract_name_from_segment(segment: str) -> str:
"""
Name = first string in the entry, until the first comma.
"""
segment = segment.strip()
if not segment:
return ""
comma_index = segment.find(",")
if comma_index == -1:
name = segment.strip()
else:
name = segment[:comma_index].strip()
# Normalize whitespace inside the name
name = " ".join(name.split())
return name
def _find_duplicate_names(text: str) -> str:
    """
    Collect every entry name that occurs more than once in ``text``.

    Names are compared exactly as returned by ``_extract_name_from_segment``;
    entries without a name are skipped. The result is a single string with
    the duplicated names joined by ", ", listed in the order in which each
    name first appeared. It is empty when nothing is duplicated.
    """
    # Counter is a dict subclass and therefore remembers the order in which
    # keys were first inserted, which gives first-appearance ordering.
    occurrences = Counter()

    for body in _iter_entry_segments(text):
        entry_name = _extract_name_from_segment(body)
        if entry_name:
            occurrences[entry_name] += 1

    return ", ".join(
        entry_name
        for entry_name, seen in occurrences.items()
        if seen > 1
    )
class Salia_BAM_Get_Duplicate_Names:
    """
    Input-less node that downloads BAM.txt from ``BAM_URL`` and reports the
    entry names that occur more than once.

    NOTE(review): the class-level attributes follow what looks like the
    ComfyUI custom-node convention -- confirm against the host application.
    """

    # Menu/category label under which the node is filed.
    CATEGORY = "Salia"
    # Name of the method that performs the node's work.
    FUNCTION = "get_duplicate_names"
    # One output: a string, labelled "duplicate_names".
    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("duplicate_names",)

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs: there are no required inputs."""
        return {
            "required": {}
        }

    def get_duplicate_names(self):
        """
        Download BAM.txt and return the duplicated names.

        Returns:
            A one-element tuple holding the ", "-joined duplicate names
            (an empty string when there are none).

        Raises:
            ValueError: if the download fails for any reason; the original
                exception is chained as the cause.
        """
        try:
            downloaded = _download_text(BAM_URL)
        except Exception as e:
            # Re-raise with the URL included so the failure is actionable.
            raise ValueError(f"Failed to download BAM.txt:\n{BAM_URL}\n\n{e}") from e

        return (_find_duplicate_names(downloaded),)
# Registry mapping a node identifier to the class that implements it.
# NOTE(review): presumably read by the host application when it loads this
# module (looks like the ComfyUI custom-node convention) -- confirm.
NODE_CLASS_MAPPINGS = {
"Salia_BAM_Get_Duplicate_Names": Salia_BAM_Get_Duplicate_Names,
}
# Human-readable label for each node identifier; here the label is the same
# string as the identifier itself.
NODE_DISPLAY_NAME_MAPPINGS = {
"Salia_BAM_Get_Duplicate_Names": "Salia_BAM_Get_Duplicate_Names",
}