#!/usr/bin/env python3
"""Build a LoRA catalog for a Hugging Face user.

Usage:
  python scripts/update_loras_catalog.py --author artificialguybr --output loras.json
"""

from __future__ import annotations

import argparse
import json
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any

import requests

# Hugging Face Hub model-listing endpoint; used as the first page URL when
# enumerating an author's models.
HF_API_MODELS = "https://huggingface.co/api/models"
# Lower-case file suffixes treated as images when choosing a cover picture
# (kept as a tuple so it can be passed straight to ``str.endswith``).
IMAGE_EXTENSIONS = (".png", ".jpg", ".jpeg", ".webp")


@dataclass
class LoraEntry:
    """One row of the generated LoRA catalog."""

    title: str  # human-readable name shown for the entry
    repo: str  # Hub repository id the entry was built from
    trigger_word: str  # prompt trigger text, or "" when none was found
    family: str  # coarse model-family label used for grouping/sorting
    base_model: str  # base model id taken from the repo tags, or ""
    image: str  # cover image URL, or ""
    weight_name: str  # chosen weights filename inside the repo, or ""

    def as_dict(self) -> dict[str, Any]:
        """Return the entry as a plain dict, keys in declaration order."""
        keys = (
            "title",
            "repo",
            "trigger_word",
            "family",
            "base_model",
            "image",
            "weight_name",
        )
        return {key: getattr(self, key) for key in keys}


def parse_args(argv: list[str] | None = None) -> argparse.Namespace:
    """Parse the command-line options for the catalog builder.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what a plain ``parse_args()``
            call did before; passing a list allows programmatic use.

    Returns:
        Namespace with ``author`` (required) and ``output`` (defaults to
        ``"loras.json"``).
    """
    parser = argparse.ArgumentParser(
        description="Build a LoRA catalog for a Hugging Face user."
    )
    parser.add_argument("--author", required=True, help="HF username/org")
    parser.add_argument("--output", default="loras.json", help="Output JSON path")
    return parser.parse_args(argv)


def load_existing_triggers(path: Path) -> dict[str, str]:
    """Read trigger words from a previously written catalog file.

    Loading is best-effort: a missing or unreadable file, invalid JSON, or an
    unexpected JSON shape all yield an empty mapping rather than an error.

    Args:
        path: Location of an existing catalog JSON file (may not exist).

    Returns:
        Mapping of repo id to its non-empty, whitespace-trimmed trigger word.
    """
    try:
        content = json.loads(path.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: file missing/unreadable. ValueError covers both
        # json.JSONDecodeError and UnicodeDecodeError.
        return {}

    if not isinstance(content, list):
        # The catalog is expected to be a JSON array of objects.
        return {}

    triggers: dict[str, str] = {}
    for item in content:
        if not isinstance(item, dict):
            continue
        # ``or ""`` keeps JSON null from being stringified to "None".
        repo = str(item.get("repo") or "").strip()
        trigger = str(item.get("trigger_word") or "").strip()
        if repo and trigger:
            triggers[repo] = trigger
    return triggers


def paginated_models(author: str) -> list[dict[str, Any]]:
    """Fetch every model record for ``author``, following pagination links.

    Args:
        author: Hub username or organisation to list models for.

    Returns:
        All model records returned by the API, in response order.

    Raises:
        requests.HTTPError: If any page responds with an error status.
        requests.RequestException: On connection problems or timeouts.
    """
    models: list[dict[str, Any]] = []
    url: str | None = HF_API_MODELS
    params: dict[str, Any] | None = {"author": author, "full": "true", "limit": 100}

    # One session so consecutive page requests can reuse the connection.
    with requests.Session() as session:
        while url:
            response = session.get(url, params=params, timeout=60)
            response.raise_for_status()
            models.extend(response.json())

            # ``response.links`` is the parsed Link header keyed by ``rel``,
            # so the "next" URL is found even when other relations are
            # present or listed first.
            url = response.links.get("next", {}).get("url")
            # Only the first request sends explicit params; the next-page
            # URL is used as-is (presumably it already carries the query
            # and cursor - confirm against the Hub API).
            params = None

    return models


def extract_base_model(tags: list[str]) -> str:
    """Return the base model id encoded in the repo tags, or "" if absent.

    The more specific ``base_model:adapter:`` prefix is searched across all
    tags first; only if no tag carries it is the generic ``base_model:``
    prefix tried. The returned value is the tag with that prefix removed.
    """
    for prefix in ("base_model:adapter:", "base_model:"):
        hit = next((tag for tag in tags if tag.startswith(prefix)), None)
        if hit is not None:
            return hit[len(prefix):]
    return ""


def detect_family(base_model: str, repo_id: str, tags: list[str]) -> str:
    """Classify a model into a family label by substring matching.

    The lower-cased ``base_model`` is checked first. If it matches nothing,
    the lower-cased repo id and tags (joined with spaces) are checked against
    the same rules. Rules are tried in a fixed order and the first hit wins;
    ``"other"`` is returned when nothing matches.
    """
    # (family label, substrings that identify it) in priority order.
    rules = (
        ("sdxl", ("stable-diffusion-xl", "sdxl")),
        ("sd15", ("stable-diffusion-v1-5", "sd 1.5", "sd1.5", "sd-1-5")),
        ("qwen-image", ("qwen-image", "qwen image")),
        ("z-image", ("z-image", "zimage")),
        ("flux", ("flux",)),
    )

    def classify(haystack: str) -> str | None:
        for label, needles in rules:
            if any(needle in haystack for needle in needles):
                return label
        return None

    family = classify(base_model.lower())
    if family is None:
        # Fall back to repo id + tags only when the base model was inconclusive.
        combined = " ".join([repo_id.lower(), *[t.lower() for t in tags]])
        family = classify(combined)
    return "other" if family is None else family


def is_t2i_lora(model: dict[str, Any]) -> bool:
    """Tell whether a model record looks like a text-to-image LoRA.

    The record must have ``pipeline_tag == "text-to-image"`` and at least one
    tag that (case-insensitively) contains ``lora`` or ``base_model:adapter:``.
    """
    if model.get("pipeline_tag") != "text-to-image":
        return False
    lowered = [str(tag).lower() for tag in model.get("tags", [])]
    markers = ("lora", "base_model:adapter:")
    return any(marker in tag for tag in lowered for marker in markers)


def infer_title(repo_id: str) -> str:
    """Derive a display title from a repo id.

    Everything up to and including the first ``/`` is dropped, underscores
    and hyphens become spaces, and each resulting word is capitalised
    (first letter upper-case, the rest lower-case).
    """
    _, slash, remainder = repo_id.partition("/")
    name = remainder if slash else repo_id
    words = name.translate(str.maketrans("_-", "  ")).split()
    return " ".join(word.capitalize() for word in words)


def pick_cover_image(repo_id: str, siblings: list[dict[str, Any]]) -> str:
    """Return the URL of the first usable image file in ``siblings``.

    A file qualifies when its ``rfilename`` ends (case-insensitively) with one
    of ``IMAGE_EXTENSIONS`` and does not start with a dot. The URL points at
    the file under the repo's ``resolve/main`` path. Returns "" when no file
    qualifies.
    """
    for entry in siblings:
        path = str(entry.get("rfilename", ""))
        folded = path.lower()
        if folded.startswith(".") or not folded.endswith(IMAGE_EXTENSIONS):
            continue
        return f"https://huggingface.co/{repo_id}/resolve/main/{path}"
    return ""


def pick_weight_name(siblings: list[dict[str, Any]]) -> str:
    """Choose the ``.safetensors`` file to use as the LoRA weights.

    Files whose path contains ``comfyui/`` (case-insensitive) are ignored.
    Files at the repo root, or whose name starts with ``adapter_model``, are
    preferred; files in sub-directories are only used when no preferred file
    exists. Within a group the lexicographically smallest name wins.
    Returns "" when there is no candidate.
    """
    primary: list[str] = []
    secondary: list[str] = []

    for entry in siblings:
        name = str(entry.get("rfilename", ""))
        folded = name.lower()
        if not folded.endswith(".safetensors") or "comfyui/" in folded:
            continue
        is_primary = folded.startswith("adapter_model") or "/" not in name
        (primary if is_primary else secondary).append(name)

    for bucket in (primary, secondary):
        if bucket:
            return min(bucket)
    return ""


def normalize_trigger(text: str) -> str:
    """Clean up a raw trigger-word candidate.

    Whitespace runs are collapsed to single spaces, then surrounding spaces,
    quotes and ``, ; .`` punctuation are stripped together, so a value such
    as ``"foo",`` does not keep a dangling quote. Placeholder values meaning
    "no trigger" are mapped to the empty string.

    Args:
        text: Raw text captured from a README.

    Returns:
        The cleaned trigger, or "" if nothing meaningful remains.
    """
    cleaned = re.sub(r"\s+", " ", text).strip(" \"',;.")
    # Compared case-insensitively so "None" / "N/A" are caught too; "null"
    # covers a YAML null written as a literal value.
    if cleaned.casefold() in {"-", "none", "n/a", "null"}:
        return ""
    return cleaned


def extract_trigger_from_readme(readme: str) -> str:
    """Search README text for a trigger word.

    Key/value style patterns are tried first against the front matter (the
    text between the first two ``---`` markers when the README starts with
    one, otherwise the whole README). If none yields a non-empty trigger,
    looser prose patterns are tried against the full README. The first
    capture that survives ``normalize_trigger`` is returned, else "".
    """
    header = readme
    if readme.startswith("---"):
        pieces = readme.split("---", 2)
        if len(pieces) >= 3:
            header = pieces[1]

    key_value_patterns = (
        r"(?im)^\s*instance_prompt\s*:\s*(.+?)\s*$",
        r"(?im)^\s*trigger_word\s*:\s*(.+?)\s*$",
        r"(?im)^\s*activation[_ ]token\s*:\s*(.+?)\s*$",
        r"(?im)^\s*trigger[_ ]phrase\s*:\s*(.+?)\s*$",
        r"(?im)^\s*token\s*:\s*(.+?)\s*$",
    )
    prose_patterns = (
        r"(?im)trigger word\s*[:\-]\s*`?([^`\n]+)`?",
        r"(?im)activation token\s*[:\-]\s*`?([^`\n]+)`?",
        r"(?im)use\s+`([^`]+)`\s+in your prompt",
        r"(?im)you can use\s+([^.\n]+)",
    )

    # Order matters: all front-matter patterns run before any prose pattern.
    for haystack, regexes in ((header, key_value_patterns), (readme, prose_patterns)):
        for regex in regexes:
            found = re.search(regex, haystack)
            if found is None:
                continue
            candidate = normalize_trigger(found.group(1))
            if candidate:
                return candidate

    return ""


def fetch_trigger_word(repo_id: str, session: requests.Session) -> str:
    """Download a repo's README and extract its trigger word, best-effort.

    Args:
        repo_id: Hub repository id whose ``README.md`` on ``main`` is fetched.
        session: Session used to perform the HTTP request.

    Returns:
        The extracted trigger word, or "" when the request fails, the
        response status is not 200, or no trigger is found.
    """
    readme_url = f"https://huggingface.co/{repo_id}/raw/main/README.md"
    try:
        response = session.get(readme_url, timeout=30)
    except requests.RequestException:
        # Network problems are expected occasionally and must not abort the
        # whole catalog build; only transport errors are swallowed so that
        # genuine bugs in the parsing code still surface.
        return ""
    if response.status_code != 200:
        return ""
    return extract_trigger_from_readme(response.text)


def build_catalog(
    models: list[dict[str, Any]], existing_triggers: dict[str, str]
) -> list[dict[str, Any]]:
    """Turn raw model records into sorted catalog rows.

    Only records accepted by ``is_t2i_lora`` are kept. For each one the
    trigger word is read from the repo README, falling back to the value in
    ``existing_triggers`` when the README yields nothing.

    Args:
        models: Model records as returned by the Hub listing API.
        existing_triggers: Previously known trigger words keyed by repo id.

    Returns:
        Catalog rows as plain dicts, sorted by family and then by
        case-insensitive title.

    Raises:
        KeyError: If an accepted record has no ``"id"`` field.
    """
    entries: list[LoraEntry] = []

    # The context manager closes the session's pooled connections once all
    # READMEs have been fetched.
    with requests.Session() as session:
        for model in models:
            if not is_t2i_lora(model):
                continue

            repo_id = model["id"]
            tags = [str(tag) for tag in model.get("tags", [])]
            base_model = extract_base_model(tags)
            family = detect_family(base_model, repo_id, tags)
            siblings = model.get("siblings") or []
            trigger_word = fetch_trigger_word(repo_id, session) or existing_triggers.get(
                repo_id, ""
            )

            entries.append(
                LoraEntry(
                    title=infer_title(repo_id),
                    repo=repo_id,
                    trigger_word=trigger_word,
                    family=family,
                    base_model=base_model,
                    image=pick_cover_image(repo_id, siblings),
                    weight_name=pick_weight_name(siblings),
                )
            )

    entries.sort(key=lambda x: (x.family, x.title.lower()))
    return [entry.as_dict() for entry in entries]


def main() -> None:
    """Build the catalog, write it to the output file, and print a summary.

    Trigger words already present in the output file are loaded before it is
    overwritten so they can serve as a fallback during the rebuild.
    """
    cli = parse_args()
    destination = Path(cli.output)

    known_triggers = load_existing_triggers(destination)
    listing = paginated_models(cli.author)
    catalog = build_catalog(listing, known_triggers)

    payload = json.dumps(catalog, indent=2, ensure_ascii=False) + "\n"
    destination.write_text(payload, encoding="utf-8")

    # Tally rows per family for the summary.
    family_totals: dict[str, int] = {}
    for record in catalog:
        label = record["family"]
        family_totals.setdefault(label, 0)
        family_totals[label] += 1
    filled = len([record for record in catalog if record.get("trigger_word")])

    print(f"Saved {len(catalog)} LoRAs to {destination}")
    print(f"Trigger words filled: {filled}")
    print("Family counts:")
    for label in sorted(family_totals):
        print(f"  - {label}: {family_totals[label]}")


# Run the catalog build only when this file is executed as a script, not
# when it is imported as a module.
if __name__ == "__main__":
    main()