saliacoel committed on
Commit
2e40f58
·
verified ·
1 Parent(s): 8d56289

Upload BAM_GetDuplicates.py

Browse files
Files changed (1) hide show
  1. BAM_GetDuplicates.py +118 -0
BAM_GetDuplicates.py ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from collections import Counter
3
+ from urllib.request import Request, urlopen
4
+
5
+
6
# Location of the BAM list that is scanned for duplicate names.
BAM_URL = "https://huggingface.co/saliacoel/chars/resolve/main/BAM.txt"

# Matches the numeric ID marker that opens every entry, e.g. "1. " or "0001. ".
#
#   (?<!\S)  the marker must sit at the start of the text or right after
#            whitespace, so an entry may begin after a space as well as
#            after a line break, while "abc1." is never mistaken for a marker
#   \d+      the ID digits; leading zeros are digits too, so no separate
#            "0*" prefix is needed (an overlapping "0*\d+" only gives the
#            regex engine more ways to backtrack on long runs of zeros)
#   \.       the dot that terminates the ID
#   \s*      whitespace between the marker and the entry body
ID_MARKER_RE = re.compile(r"(?<!\S)\d+\.\s*")
13
+
14
+
15
+ def _download_text(url: str) -> str:
16
+ req = Request(
17
+ url,
18
+ headers={
19
+ "User-Agent": "Mozilla/5.0",
20
+ "Accept": "*/*",
21
+ },
22
+ )
23
+ with urlopen(req, timeout=60) as resp:
24
+ return resp.read().decode("utf-8", errors="replace")
25
+
26
+
27
def _iter_entry_segments(text: str):
    """Yield the body of every entry in *text*, in document order.

    An entry body is the text that follows one ID marker (as matched by
    ``ID_MARKER_RE``) up to the start of the next marker; the last body runs
    to the end of *text*. Anything before the first marker is skipped, and
    nothing is yielded when *text* contains no marker at all. Bodies are
    returned unstripped.

    Example:
        '0001. Adali, tags... 0002. Petra, tags...'
    yields:
        'Adali, tags... '
        'Petra, tags...'
    """
    markers = list(ID_MARKER_RE.finditer(text))

    # Each body lies between the end of one marker and the start of the next.
    for current, following in zip(markers, markers[1:]):
        yield text[current.end():following.start()]

    # The final marker has no successor: its body is the remainder of the text.
    if markers:
        yield text[markers[-1].end():]
46
+
47
+
48
+ def _extract_name_from_segment(segment: str) -> str:
49
+ """
50
+ Name = first string in the entry, until the first comma.
51
+ """
52
+ segment = segment.strip()
53
+ if not segment:
54
+ return ""
55
+
56
+ comma_index = segment.find(",")
57
+ if comma_index == -1:
58
+ name = segment.strip()
59
+ else:
60
+ name = segment[:comma_index].strip()
61
+
62
+ # Normalize whitespace inside the name
63
+ name = " ".join(name.split())
64
+ return name
65
+
66
+
67
def _find_duplicate_names(text: str) -> str:
    """Return the names that occur in more than one entry of *text*.

    Each duplicated name is listed once, in the order in which it first
    appears, and the names are joined with ``", "``. Entries whose name is
    empty are ignored. The result is ``""`` when no name repeats.
    """
    extracted = (
        _extract_name_from_segment(body) for body in _iter_entry_segments(text)
    )
    # Counter is a dict subclass, so its keys stay in first-insertion order;
    # that order is exactly the "first appearance" order of the names.
    tally = Counter(name for name in extracted if name)
    return ", ".join(name for name, hits in tally.items() if hits > 1)
88
+
89
+
90
class Salia_BAM_Get_Duplicate_Names:
    """Node that reports which names occur more than once in BAM.txt.

    The node takes no inputs. Each execution downloads ``BAM_URL`` and
    outputs one string holding the duplicated names, comma-separated.
    """

    @classmethod
    def INPUT_TYPES(cls):
        """Declare the node's inputs: there are none."""
        return {
            "required": {}
        }

    @classmethod
    def IS_CHANGED(cls, **kwargs):
        """Mark the node as changed on every run.

        The result depends on a remote file rather than on any input, so a
        cache keyed on inputs would keep serving the first download. ComfyUI
        compares this return value with the one from the previous run; NaN
        never compares equal to itself, so the node is always re-executed.
        """
        return float("nan")

    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("duplicate_names",)
    FUNCTION = "get_duplicate_names"
    CATEGORY = "Salia"

    def get_duplicate_names(self):
        """Download BAM.txt and return its duplicated names.

        Returns:
            A 1-tuple holding the duplicated names joined with ``", "``
            (an empty string when no name repeats).

        Raises:
            ValueError: If the download fails for any reason; the original
                exception is attached as ``__cause__``.
        """
        try:
            bam_text = _download_text(BAM_URL)
        except Exception as e:
            # Re-raise with the URL in the message so the failure is
            # self-explanatory; "from e" keeps the underlying cause.
            raise ValueError(f"Failed to download BAM.txt:\n{BAM_URL}\n\n{e}") from e

        result = _find_duplicate_names(bam_text)
        return (result,)
110
+
111
+
112
# Registration tables exported by this module. Both are keyed by the same
# node identifier: the first maps it to the implementing class, the second
# to the label shown for the node (here identical to the identifier).
_NODE_KEY = "Salia_BAM_Get_Duplicate_Names"

NODE_CLASS_MAPPINGS = {_NODE_KEY: Salia_BAM_Get_Duplicate_Names}

NODE_DISPLAY_NAME_MAPPINGS = {_NODE_KEY: _NODE_KEY}