saliacoel
/

x

Model card Files Files and versions

x / BAM_GetDuplicates.py

saliacoel's picture

Upload BAM_GetDuplicates.py

2e40f58 verified 25 days ago

history blame contribute delete

3.04 kB

	import re
	from collections import Counter
	from urllib.request import Request, urlopen


	# URL of the BAM.txt file that this module fetches and scans.
	BAM_URL = "https://huggingface.co/saliacoel/chars/resolve/main/BAM.txt"

	# Matches zero-padded numeric ID markers such as:
	#   01.
	#   0001.
	# i.e. a literal "0", one or more further digits, a period, and then one
	# whitespace character. The (?<!\S) lookbehind only lets a marker match at
	# the very start of the text or directly after whitespace, so it also works
	# when the next entry starts after a space instead of a line break.
	# NOTE(review): IDs without a leading zero (e.g. "1." or "1000.") are NOT
	# matched by this pattern -- confirm that every ID in BAM.txt is zero-padded.
	ID_MARKER_RE = re.compile(r"(?<!\S)0\d+\.\s")


	def _download_text(url: str) -> str:
	    """
	    Fetch *url* and return the response body as text.

	    The body is decoded as UTF-8. A leading byte-order mark is dropped and
	    undecodable bytes are replaced with U+FFFD instead of raising.

	    Errors raised by ``urlopen`` (for example ``URLError`` / ``HTTPError``)
	    are not caught here; the request is issued with a 60 second timeout.
	    """
	    req = Request(
	        url,
	        headers={
	            "User-Agent": "Mozilla/5.0",
	            # "*/*" is the wildcard media range; a bare "/" is not a valid
	            # Accept value.
	            "Accept": "*/*",
	        },
	    )
	    with urlopen(req, timeout=60) as resp:
	        # "utf-8-sig" is UTF-8 plus BOM stripping. A surviving U+FEFF would
	        # be a non-whitespace character directly in front of the first ID
	        # marker, which a whitespace-anchored marker regex then fails to match.
	        return resp.read().decode("utf-8-sig", errors="replace")


	def _iter_entry_segments(text: str):
	    """
	    Generate the body text that follows each ID marker in *text*.

	    A body runs from the end of one marker up to the start of the next one;
	    the final body runs to the end of *text*. Anything ahead of the first
	    marker is skipped, and nothing is produced when no marker is found.
	    Bodies are yielded as-is (surrounding whitespace is not trimmed).

	    Example:
	        '0001. Adali, tags... 0002. Petra, tags...'
	    yields:
	        'Adali, tags... '
	        'Petra, tags...'
	    """
	    markers = list(ID_MARKER_RE.finditer(text))

	    # Every body ends where the following marker begins; the last body has no
	    # following marker, so it ends at the end of the text.
	    body_ends = [following.start() for following in markers[1:]]
	    body_ends.append(len(text))

	    for marker, body_end in zip(markers, body_ends):
	        yield text[marker.end() : body_end]


	def _extract_name_from_segment(segment: str) -> str:
	    """
	    Return the entry's name: the text ahead of the first comma.

	    When the segment has no comma, the whole segment is the name. Leading and
	    trailing whitespace is removed and every inner run of whitespace collapses
	    to a single space. A blank segment gives "".
	    """
	    # partition() hands back the whole string as the head when no comma exists.
	    head, _, _ = segment.partition(",")

	    # split() without arguments drops edge whitespace and splits on whitespace
	    # runs, so re-joining trims and normalizes in one step.
	    return " ".join(head.split())


	def _find_duplicate_names(text: str) -> str:
	    """
	    Build a ", "-joined string of every name found in more than one entry.

	    Names are listed in the order in which they first show up in *text*.
	    Entries whose extracted name is empty are ignored. The result is ""
	    when no name repeats.
	    """
	    extracted = (
	        _extract_name_from_segment(segment)
	        for segment in _iter_entry_segments(text)
	    )

	    # Counter is a dict subclass and keeps keys in first-insertion order, so
	    # iterating it walks the names in order of first appearance.
	    tally = Counter(name for name in extracted if name)

	    return ", ".join(name for name, hits in tally.items() if hits > 1)


	class Salia_BAM_Get_Duplicate_Names:
	    """
	    Node class that downloads BAM.txt and reports the names occurring more
	    than once.

	    NOTE(review): INPUT_TYPES / RETURN_TYPES / RETURN_NAMES / FUNCTION /
	    CATEGORY look like the declaration attributes read by a node-based host
	    (presumably ComfyUI) -- confirm against the host's documentation.
	    """

	    CATEGORY = "Salia"
	    FUNCTION = "get_duplicate_names"
	    RETURN_TYPES = ("STRING",)
	    RETURN_NAMES = ("duplicate_names",)

	    @classmethod
	    def INPUT_TYPES(cls):
	        """Declare the node's inputs; it has no required inputs."""
	        return {"required": {}}

	    def get_duplicate_names(self):
	        """
	        Download BAM.txt and return a 1-tuple holding the duplicate names.

	        The single element is the comma-separated string produced by
	        ``_find_duplicate_names``.

	        Raises:
	            ValueError: if downloading ``BAM_URL`` fails for any reason; the
	                original exception is chained as the cause.
	        """
	        try:
	            downloaded = _download_text(BAM_URL)
	        except Exception as e:
	            raise ValueError(f"Failed to download BAM.txt:\n{BAM_URL}\n\n{e}") from e
	        else:
	            return (_find_duplicate_names(downloaded),)


	# Maps the node's registration key to the class that implements it.
	# NOTE(review): the NODE_* names look like the lookup tables a node-loading
	# host (presumably ComfyUI) reads when importing this module -- confirm.
	NODE_CLASS_MAPPINGS = {
	"Salia_BAM_Get_Duplicate_Names": Salia_BAM_Get_Duplicate_Names,
	}

	# Maps the same registration key to the label shown for the node; here the
	# label is identical to the key.
	NODE_DISPLAY_NAME_MAPPINGS = {
	"Salia_BAM_Get_Duplicate_Names": "Salia_BAM_Get_Duplicate_Names",
	}