saliacoel
/

x

Model card Files Files and versions

xet

Community

saliacoel committed on Apr 18

Commit

2e40f58

verified ·

1 Parent(s): 8d56289

Upload BAM_GetDuplicates.py

Browse files

Files changed (1) hide show

BAM_GetDuplicates.py +118 -0

BAM_GetDuplicates.py ADDED Viewed

	@@ -0,0 +1,118 @@

+import re
+from collections import Counter
+from urllib.request import Request, urlopen
# Raw-file URL of the BAM list that is scanned for duplicate names.
BAM_URL = "https://huggingface.co/saliacoel/chars/resolve/main/BAM.txt"

# Matches the numeric ID marker that opens each entry, e.g. "1." or "0001.",
# together with any whitespace that follows it.
#   (?<!\S)  the marker must sit at the very start of the text or directly
#            after whitespace, so it is found after a space as well as after
#            a line break
#   \d+\.    the digits (leading zeros included) and the terminating dot
#   \s*      whitespace separating the marker from the entry body
# NOTE(review): any whitespace-preceded "<digits>." inside an entry body
# (e.g. the "4." of "4.5") matches too and would split that entry -- confirm
# BAM.txt never contains such text.
ID_MARKER_RE = re.compile(r"(?<!\S)\d+\.\s*")
def _download_text(url: str) -> str:
    """
    Fetch *url* and return the response body as text.

    The request carries a browser-like User-Agent header and is limited by a
    60-second timeout. The body is decoded as UTF-8; bytes that cannot be
    decoded become U+FFFD instead of raising.

    A leading UTF-8 byte-order mark is dropped. Left in place it would sit
    directly in front of the first ID marker, and because U+FEFF is not
    whitespace the marker pattern would not match there, silently losing the
    first entry.

    Raises whatever ``urlopen`` raises (for example ``urllib.error.URLError``).
    """
    req = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "*/*",
        },
    )
    with urlopen(req, timeout=60) as resp:
        # "utf-8-sig" decodes exactly like "utf-8" but strips a leading BOM.
        return resp.read().decode("utf-8-sig", errors="replace")
def _iter_entry_segments(text: str):
    """
    Generate the body of every BAM entry, in document order.

    An entry body is the text that follows one ID marker, up to the next
    marker or the end of the text. Anything in front of the first marker is
    not part of an entry and is dropped; text without a marker yields nothing.

    Example:
    '0001. Adali, tags... 0002. Petra, tags...'
    yields:
      'Adali, tags... '
      'Petra, tags...'
    """
    # Splitting on the marker pattern returns the text in front of the first
    # marker as element 0, then one element per entry body.
    pieces = ID_MARKER_RE.split(text)
    yield from pieces[1:]
def _extract_name_from_segment(segment: str) -> str:
    """
    Return the name of an entry: the text in front of its first comma.

    When the entry has no comma the whole entry is taken as the name.
    Surrounding whitespace is removed and every inner run of whitespace is
    collapsed to a single space. A blank entry gives "".
    """
    head, _comma, _rest = segment.partition(",")
    # split() without arguments discards leading/trailing whitespace and
    # breaks on any whitespace run, so re-joining trims and normalises at once.
    return " ".join(head.split())
def _find_duplicate_names(text: str) -> str:
    """
    Build a ", "-joined string of every name that occurs in more than one entry.

    Names are listed in the order in which they first appear in *text*.
    Entries whose name is empty are ignored. The result is "" when no name
    repeats.
    """
    names = (
        _extract_name_from_segment(body)
        for body in _iter_entry_segments(text)
    )
    # Counter keeps its keys in first-insertion order, which is exactly the
    # order of first appearance needed for the output.
    tally = Counter(name for name in names if name)
    return ", ".join(name for name, hits in tally.items() if hits > 1)
class Salia_BAM_Get_Duplicate_Names:
    """
    Node that downloads BAM.txt and reports the names listed more than once.

    The class attributes follow the ComfyUI custom-node layout
    (INPUT_TYPES / RETURN_TYPES / RETURN_NAMES / FUNCTION / CATEGORY).
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node inputs: there are none."""
        return {
            "required": {}
        }

    # A single STRING output, exposed under the name "duplicate_names".
    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("duplicate_names",)
    # Name of the method that runs the node.
    FUNCTION = "get_duplicate_names"
    CATEGORY = "Salia"

    @classmethod
    def IS_CHANGED(cls, **kwargs):
        """
        Report the node as changed on every evaluation.

        The output depends on a remote file rather than on any input, so an
        input-based cache would keep returning the first download after
        BAM.txt has been edited. NaN never compares equal to the previously
        returned value, which forces a fresh run each time.
        """
        return float("nan")

    def get_duplicate_names(self):
        """
        Download BAM.txt and return its duplicated names.

        Returns a 1-tuple holding the ", "-joined duplicate names ("" when
        there are none), matching RETURN_TYPES.

        Raises ValueError, chained to the original exception, when the
        download fails for any reason.
        """
        try:
            bam_text = _download_text(BAM_URL)
        except Exception as e:
            # Node boundary: surface every download failure as one error that
            # names the URL, keeping the cause attached.
            raise ValueError(f"Failed to download BAM.txt:\n{BAM_URL}\n\n{e}") from e
        result = _find_duplicate_names(bam_text)
        return (result,)
# Node identifier -> implementing class.
# NOTE(review): presumably read by the node host (ComfyUI-style loader) when
# this module is imported -- confirm against the loader.
NODE_CLASS_MAPPINGS = {"Salia_BAM_Get_Duplicate_Names": Salia_BAM_Get_Duplicate_Names}

# Node identifier -> label; here the label is the identifier itself.
NODE_DISPLAY_NAME_MAPPINGS = {"Salia_BAM_Get_Duplicate_Names": "Salia_BAM_Get_Duplicate_Names"}