import re
from collections import Counter
from urllib.request import Request, urlopen


# Remote location of BAM.txt; it is downloaded anew on every
# `get_duplicate_names` call (nothing in this module caches it).
BAM_URL = "https://huggingface.co/saliacoel/chars/resolve/main/BAM.txt"

# Matches entry ID markers such as:
#   1.
#   0001.
# A marker must sit at the start of the text or directly after whitespace, so
# it is found whether the next entry begins on a new line or after a space.
# The dot must not be followed by another digit: otherwise a decimal such as
# "1.5" inside an entry would be mistaken for the marker "1." and split the
# entry in two. Trailing whitespace is consumed so that an entry body starts
# at its first real character.
ID_MARKER_RE = re.compile(r"(?<!\S)\d+\.(?!\d)\s*")


def _download_text(url: str) -> str:
    req = Request(
        url,
        headers={
            "User-Agent": "Mozilla/5.0",
            "Accept": "*/*",
        },
    )
    with urlopen(req, timeout=60) as resp:
        return resp.read().decode("utf-8", errors="replace")


def _iter_entry_segments(text: str):
    """
    Lazily yield the body of every BAM entry found in ``text``.

    A body is the text between the end of one ID marker and the start of the
    next one; the final body runs to the end of ``text``. Anything before the
    first marker is ignored, and nothing is yielded when no marker is found.

    Example:
    '0001. Adali, tags... 0002. Petra, tags...'
    yields:
      'Adali, tags...'
      'Petra, tags...'
    """
    markers = ID_MARKER_RE.finditer(text)
    current = next(markers, None)

    while current is not None:
        # Look one marker ahead to learn where the current body stops.
        following = next(markers, None)
        stop = len(text) if following is None else following.start()
        yield text[current.end() : stop]
        current = following


def _extract_name_from_segment(segment: str) -> str:
    """
    Name = first string in the entry, until the first comma.
    """
    segment = segment.strip()
    if not segment:
        return ""

    comma_index = segment.find(",")
    if comma_index == -1:
        name = segment.strip()
    else:
        name = segment[:comma_index].strip()

    # Normalize whitespace inside the name
    name = " ".join(name.split())
    return name


def _find_duplicate_names(text: str) -> str:
    """
    Return every name that occurs in more than one entry of ``text``.

    The names are joined with ", " and listed in the order in which each one
    first appears. Entries whose name is empty are skipped. The result is ""
    when no name repeats.
    """
    names = (
        _extract_name_from_segment(body) for body in _iter_entry_segments(text)
    )

    # Counter keeps keys in first-insertion order, so iterating it walks the
    # names in order of first appearance.
    tally = Counter(name for name in names if name)

    return ", ".join(name for name, hits in tally.items() if hits > 1)


class Salia_BAM_Get_Duplicate_Names:
    """
    Node that downloads BAM.txt and reports the names used more than once.

    It takes no inputs and produces one string output.
    """

    # NOTE(review): these attributes look like ComfyUI's custom-node
    # metadata convention -- confirm against the host application.
    RETURN_TYPES = ("STRING",)
    RETURN_NAMES = ("duplicate_names",)
    FUNCTION = "get_duplicate_names"
    CATEGORY = "Salia"

    @classmethod
    def INPUT_TYPES(cls):
        """Return the input specification; this node declares no inputs."""
        return {"required": {}}

    def get_duplicate_names(self):
        """
        Download BAM.txt and return its duplicated names.

        Returns a 1-tuple holding the comma-separated duplicate names.
        Raises ValueError (chained to the original error) when the download
        fails.
        """
        try:
            bam_text = _download_text(BAM_URL)
        except Exception as exc:
            details = f"Failed to download BAM.txt:\n{BAM_URL}\n\n{exc}"
            raise ValueError(details) from exc

        return (_find_duplicate_names(bam_text),)


# Maps the node identifier string to the class that implements it.
# NOTE(review): the two names below follow what looks like ComfyUI's
# custom-node registration convention -- confirm the host loader reads them.
NODE_CLASS_MAPPINGS = {
    "Salia_BAM_Get_Duplicate_Names": Salia_BAM_Get_Duplicate_Names,
}

# Maps the same identifier to its display name (currently identical to the
# identifier itself).
NODE_DISPLAY_NAME_MAPPINGS = {
    "Salia_BAM_Get_Duplicate_Names": "Salia_BAM_Get_Duplicate_Names",
}