"""Scrape websites for the most likely author / headshot image.

Pipeline:
    1. Validate the URL (scheme, no local/private hosts) and check robots.txt.
    2. Collect candidate <img> URLs from author/about-ish containers.
    3. Score candidates by heuristics, then refine scores with real pixel
       dimensions fetched via Pillow.
    4. Download the best candidate.
"""

import io
import ipaddress
import os
import re
from typing import Any, Dict, List, Optional, Tuple
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
from PIL import Image

# Browser-like User-Agent so sites that block default python-requests UAs
# still serve normal HTML.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0 Safari/537.36"
    )
}


# ------------------ Basic utils ------------------ #


def validate_url(url: str) -> str:
    """Validate that *url* is an http(s) URL not pointing at a local/private host.

    Returns the URL unchanged on success; raises ValueError otherwise.
    """
    if not url or not isinstance(url, str):
        raise ValueError("URL must be a non-empty string.")
    parsed = urlparse(url)
    if parsed.scheme not in ("http", "https"):
        raise ValueError("URL must start with http:// or https://")
    blocked_hosts = {"localhost", "127.0.0.1", "0.0.0.0", "::1"}
    if parsed.hostname in blocked_hosts:
        raise ValueError("Local addresses are not allowed.")
    # If the host is a literal IP address, reject private / loopback /
    # link-local / reserved ranges.  (A plain string-prefix check such as
    # "172.16." would miss most of 172.16.0.0/12 and would wrongly block
    # ordinary DNS names like "10.example.com".)
    if parsed.hostname:
        try:
            addr = ipaddress.ip_address(parsed.hostname)
        except ValueError:
            # Not an IP literal -- a regular hostname is acceptable here.
            pass
        else:
            if (
                addr.is_private
                or addr.is_loopback
                or addr.is_link_local
                or addr.is_reserved
            ):
                raise ValueError("Private network addresses are not allowed.")
    return url


def allowed_by_robots(url: str) -> bool:
    """Best-effort robots.txt check for *url*.

    Fails open: if robots.txt cannot be fetched or parsed, returns True.
    Only rules for ``User-agent: *`` or ``scraper-bot`` are honoured.
    """
    parsed = urlparse(url)
    robots_url = f"{parsed.scheme}://{parsed.netloc}/robots.txt"
    try:
        r = requests.get(robots_url, headers=HEADERS, timeout=5)
        if not r.ok:
            return True
        content = r.text.lower()
        lines = content.splitlines()
        applies = False
        disallows: List[str] = []
        for line in lines:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            if line.startswith("user-agent:"):
                ua = line.split(":", 1)[1].strip()
                # Track whether the following Disallow rules apply to us.
                applies = ua in ("*", "scraper-bot")
            elif applies and line.startswith("disallow:"):
                rule = line.split(":", 1)[1].strip()
                disallows.append(rule)
        path = parsed.path or "/"
        for rule in disallows:
            # An empty Disallow rule means "allow everything".
            if rule and path.startswith(rule):
                return False
        return True
    except Exception:
        # Network/parse problems must not block scraping -- fail open.
        return True


def fetch_html(url: str) -> str:
    """Fetch *url* and return its body text; raises on HTTP errors."""
    r = requests.get(url, headers=HEADERS, timeout=10)
    r.raise_for_status()
    return r.text


def normalize_url(src: str, base: str) -> Optional[str]:
    """Resolve *src* against *base*; return None if resolution fails."""
    try:
        return urljoin(base, src)
    except Exception:
        return None


def is_image_candidate(url: str) -> bool:
    """Heuristically decide whether *url* could be a raster image.

    Rejects inline/data URLs and SVGs.  Accepts URLs whose text contains a
    known raster extension anywhere (CDN URLs often carry the extension in
    the query string), or any plain http(s) URL as a fallback.
    """
    if not url:
        return False
    if url.startswith(("data:", "blob:", "about:")):
        return False
    lower = url.lower()
    if lower.endswith(".svg"):
        return False
    exts = [".jpg", ".jpeg", ".png", ".gif", ".webp", ".avif", ".jfif"]
    if any(ext in lower for ext in exts):
        return True
    return lower.startswith("http://") or lower.startswith("https://")


# ------------------ Author/headshot-focused HTML heuristics ------------------ #

# CSS selectors for containers that commonly hold an author bio / headshot.
AUTHOR_SELECTORS: List[str] = [
    ".author",
    ".author-info",
    ".author-bio",
    ".author-box",
    ".about",
    ".about-author",
    ".byline",
    ".post-author",
    ".entry-author",
    ".site-author",
    ".profile",
    ".profile-card",
    ".user-info",
    "[rel='author']",
    "[itemprop='author']",
    "[itemtype*='Person']",
]


def _first_img_src(img) -> Optional[str]:
    """Return the first usable src-like attribute of an <img> tag.

    Checks plain ``src`` first, then common lazy-loading attributes.
    """
    return (
        img.get("src")
        or img.get("data-src")
        or img.get("data-lazy-src")
        or img.get("data-original")
    )


def extract_author_candidates(
    soup: BeautifulSoup, base_url: str
) -> List[Dict[str, Any]]:
    """
    Find images likely to be author / headshot images.
    Returns a list of dicts:
    [{url, score, source, width_attr, height_attr, width_px, height_px}, ...]
    """
    candidates: List[Dict[str, Any]] = []
    seen = set()

    def add_image(
        src: Optional[str], source: str, score_boost: float = 0.0, tag=None
    ) -> None:
        """Normalize, deduplicate, pre-score and record one candidate URL."""
        if not src:
            return
        abs_url = normalize_url(src, base_url)
        if not abs_url or abs_url in seen:
            return
        if not is_image_candidate(abs_url):
            return
        score = 0.0 + score_boost
        width_attr = None
        height_attr = None
        if tag is not None:
            width_attr = tag.get("width")
            height_attr = tag.get("height")
            try:
                w = int(width_attr or 0)
                h = int(height_attr or 0)
                if w > 0 and h > 0:
                    # Headshots tend to be roughly square-to-portrait.
                    aspect = float(h) / float(max(w, 1))
                    if 0.8 <= aspect <= 2.0:
                        score += 2.0
                    # Very wide images are likely banners, not headshots.
                    if w > 800 and w > h * 2:
                        score -= 2.0
            except ValueError:
                # Non-numeric width/height attributes (e.g. "100%") -- ignore.
                pass
        seen.add(abs_url)
        candidates.append(
            {
                "url": abs_url,
                "score": score,
                "source": source,
                "width_attr": width_attr,
                "height_attr": height_attr,
                "width_px": None,
                "height_px": None,
            }
        )

    # 1) Inside author/about-ish containers
    for selector in AUTHOR_SELECTORS:
        for container in soup.select(selector):
            for img in container.find_all("img"):
                add_image(
                    _first_img_src(img),
                    "author:" + selector,
                    score_boost=5.0,
                    tag=img,
                )
            # Some themes put the headshot in an inline background-image.
            style = container.get("style") or ""
            match = re.search(
                r"background-image\s*:\s*url\(([^)]+)\)", style, re.I
            )
            if match:
                raw = match.group(1).strip("'\" ")
                add_image(raw, "author-bg:" + selector, score_boost=4.0, tag=None)

    # 2) Fallback: near "about"/"author" text
    text_blocks = soup.find_all(
        lambda tag: tag.name in ("p", "div", "section")
        and tag.get_text(strip=True)
    )
    for block in text_blocks:
        txt = block.get_text(" ", strip=True).lower()
        if any(
            phrase in txt
            for phrase in [
                "about me",
                "about the author",
                "about the creator",
                "meet",
                "author",
            ]
        ):
            for img in block.find_all("img"):
                add_image(
                    _first_img_src(img), "near-about-text", score_boost=3.0, tag=img
                )

    return candidates


# ------------------ Size-based filtering via Pillow ------------------ #


def measure_image_dimensions(
    url: str, timeout: int = 10
) -> Tuple[Optional[int], Optional[int]]:
    """
    Fetch the image and read dimensions with Pillow.
    Returns (width, height) or (None, None) on failure.
    """
    try:
        resp = requests.get(url, headers=HEADERS, timeout=timeout)
        resp.raise_for_status()
        data = io.BytesIO(resp.content)
        with Image.open(data) as img:
            return img.width, img.height
    except Exception:
        # Unreachable / non-image / corrupt content: no dimensions.
        return None, None


def refine_candidates_with_dimensions(
    candidates: List[Dict[str, Any]], max_to_check: int = 10
) -> List[Dict[str, Any]]:
    """
    For up to `max_to_check` candidates, compute real width/height.
    Adjust scores based on real dimensions.
    """
    candidates_sorted = sorted(candidates, key=lambda c: c["score"], reverse=True)
    # Only fetch the highest-scoring candidates -- each check is a network call.
    for c in candidates_sorted[:max_to_check]:
        w, h = measure_image_dimensions(c["url"])
        c["width_px"] = w
        c["height_px"] = h
        if not w or not h:
            continue
        if w < 80 or h < 80:
            # Too small to be a usable headshot (likely an icon).
            c["score"] -= 4.0
            continue
        aspect = float(h) / float(max(w, 1))
        if 0.7 <= aspect <= 1.8:
            c["score"] += 4.0
        if w > 1000 and w > h * 2.5:
            # Wide banner / hero image.
            c["score"] -= 5.0
        if w > 2500 or h > 2500:
            # Huge images are usually page content, not headshots.
            c["score"] -= 3.0
    return sorted(candidates_sorted, key=lambda c: c["score"], reverse=True)


def pick_best_author_image(
    candidates: List[Dict[str, Any]],
) -> Optional[Dict[str, Any]]:
    """Return the highest-scoring candidate, or None if there are none."""
    if not candidates:
        return None
    return max(candidates, key=lambda c: c["score"])


# ------------------ Main entry: scrape_author_image ------------------ #


def scrape_author_image(url: str) -> Dict[str, Any]:
    """
    Given a URL, return the most likely author/headshot image.

    Returns a dict with keys ``title``, ``author_image_url`` (may be None),
    and ``debug_candidates``.  Raises ValueError for bad URLs and
    PermissionError when robots.txt forbids access.
    """
    url = validate_url(url)
    if not allowed_by_robots(url):
        raise PermissionError("Blocked by robots.txt")
    html = fetch_html(url)
    soup = BeautifulSoup(html, "lxml")

    # Best-effort page title: og/twitter meta, then <h1>, then <title>.
    title_tag = (
        soup.select_one("meta[property='og:title']")
        or soup.select_one("meta[name='twitter:title']")
        or soup.find("h1")
        or soup.title
    )
    if title_tag:
        if hasattr(title_tag, "get"):
            title = title_tag.get("content", "") or title_tag.get_text("", strip=True)
        else:
            title = title_tag.get_text("", strip=True)
    else:
        title = urlparse(url).hostname

    candidates = extract_author_candidates(soup, url)
    candidates_refined = refine_candidates_with_dimensions(candidates, max_to_check=10)
    best = pick_best_author_image(candidates_refined)
    return {
        "title": title,
        "author_image_url": best["url"] if best else None,
        "debug_candidates": candidates_refined,
    }


# ------------------ Download helpers ------------------ #


def download_image(
    image_url: str, out_dir: str = "author_images", filename: Optional[str] = None
) -> str:
    """
    Download a single image URL to `out_dir`, return local file path.
    If `filename` is provided, it is used instead of the remote filename.
    """
    os.makedirs(out_dir, exist_ok=True)
    parsed = urlparse(image_url)
    fallback = os.path.basename(parsed.path) or "image"
    # basename() strips any directory components, so a crafted filename
    # (e.g. "../../etc/x") cannot escape out_dir.
    base_name = os.path.basename(filename) if filename else fallback
    # Ensure file extension
    if not any(
        base_name.lower().endswith(ext)
        for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]
    ):
        base_name += ".jpg"
    file_path = os.path.join(out_dir, base_name)
    resp = requests.get(image_url, headers=HEADERS, timeout=15)
    resp.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(resp.content)
    return file_path


def download_author_image(
    page_url: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    High-level helper:
    1. Scrape the page for the best author/headshot image.
    2. Download that image to `out_dir`.
    """
    result = scrape_author_image(page_url)
    author_url = result["author_image_url"]
    if not author_url:
        return {
            "title": result["title"],
            "author_image_url": None,
            "local_path": None,
        }
    local_path = download_image(author_url, out_dir=out_dir)
    return {
        "title": result["title"],
        "author_image_url": author_url,
        "local_path": local_path,
    }


# ---------- Site normalization & about-page discovery ---------- #


def normalize_site_input(site: str) -> str:
    """
    Allow user to input:
    - damndelicious.net
    - https://damndelicious.net
    - http://www.damndelicious.net/
    and normalize to a base URL like:
    - https://damndelicious.net
    """
    site = site.strip()
    if not site:
        raise ValueError("Site must be a non-empty string.")
    if not site.startswith(("http://", "https://")):
        site = "https://" + site
    parsed = urlparse(site)
    netloc = parsed.netloc
    if netloc.startswith("www."):
        netloc = netloc[4:]
    base_url = f"{parsed.scheme}://{netloc}"
    return base_url


# Common relative paths where sites host their about/bio pages.
ABOUT_PATH_GUESSES: List[str] = [
    "/about",
    "/about/",
    "/about-me",
    "/about-me/",
    "/about-us",
    "/about-us/",
    "/about-the-author",
    "/about-the-author/",
    "/about-the-creator",
    "/about-the-creator/",
    "/meet-the-team",
    "/meet-the-team/",
    "/meet-the-author",
    "/meet-the-author/",
]

# Keywords that flag a homepage link as pointing at an about-like page.
ABOUT_TEXT_KEYWORDS: List[str] = [
    "about",
    "about me",
    "about us",
    "about the author",
    "about the creator",
    "our story",
    "my story",
    "meet",
    "meet the author",
    "meet the team",
    "bio",
]


def url_is_html(url: str) -> bool:
    """
    Quick check that a URL returns HTML (not 404, not an image, etc.).
    """
    try:
        r = requests.get(url, headers=HEADERS, timeout=8)
        if not r.ok:
            return False
        ctype = r.headers.get("Content-Type", "").lower()
        return "text/html" in ctype or "application/xhtml+xml" in ctype
    except Exception:
        return False


def find_about_like_urls(base_url: str, max_links: int = 20) -> List[str]:
    """
    Strategy:
    1. Try common about paths relative to base_url.
    2. Fetch the homepage and look for internal links whose text or href
       contains about-ish keywords.
    """
    candidates: List[str] = []

    # 1) Common about paths
    for path in ABOUT_PATH_GUESSES:
        candidates.append(urljoin(base_url, path))

    # 2) Extract from homepage
    try:
        html = fetch_html(base_url)
    except Exception:
        html = ""
    if html:
        soup = BeautifulSoup(html, "lxml")
        for a in soup.find_all("a", href=True):
            href = a["href"]
            text = a.get_text(" ", strip=True).lower()
            href_lower = href.lower()
            if any(kw in text for kw in ABOUT_TEXT_KEYWORDS) or any(
                kw in href_lower for kw in ["about", "our-story", "my-story", "meet"]
            ):
                abs_url = normalize_url(href, base_url)
                if not abs_url:
                    continue
                candidates.append(abs_url)

    # Deduplicate while preserving order.
    unique = list(dict.fromkeys(candidates))
    return unique[:max_links]


def pick_best_about_url(site_input: str) -> Optional[str]:
    """
    Given a site name or URL, try to find the 'best' about page URL.

    Falls back to the site's base URL when no about-like page responds
    with HTML.  Raises PermissionError when robots.txt blocks the site.
    """
    base_url = normalize_site_input(site_input)
    base_url = validate_url(base_url)
    if not allowed_by_robots(base_url):
        raise PermissionError("Blocked by robots.txt for base site.")
    candidates = find_about_like_urls(base_url)
    for cand in candidates:
        if not url_is_html(cand):
            continue
        # First candidate that actually serves HTML wins.
        return cand
    return base_url


def clean_site_name(site: str) -> str:
    """
    Convert a site or URL into a safe filename component.
    Examples:
    https://www.damndelicious.net -> damndelicious_net
    foodblog.com -> foodblog_com
    """
    site = site.strip().lower()
    if site.startswith("http://"):
        site = site[7:]
    elif site.startswith("https://"):
        site = site[8:]
    site = site.split("/")[0]
    return site.replace(".", "_").replace("-", "_")


def download_author_image_for_site(
    site_input: str, out_dir: str = "author_images"
) -> Dict[str, Any]:
    """
    1. Convert site input into a normalized base URL.
    2. Locate the site's best 'About' page.
    3. Extract the author image.
    4. Download it using a filename that includes the site name.
    """
    base_url = normalize_site_input(site_input)
    about_url = pick_best_about_url(site_input)
    if not about_url:
        return {
            "site_base_url": base_url,
            "about_url": None,
            "title": None,
            "author_image_url": None,
            "local_path": None,
        }
    info = scrape_author_image(about_url)
    author_url = info["author_image_url"]
    if not author_url:
        return {
            "site_base_url": base_url,
            "about_url": about_url,
            "title": info["title"],
            "author_image_url": None,
            "local_path": None,
        }
    safe_name = clean_site_name(base_url)
    filename = safe_name + "_author"
    local_path = download_image(author_url, out_dir=out_dir, filename=filename)
    return {
        "site_base_url": base_url,
        "about_url": about_url,
        "title": info["title"],
        "author_image_url": author_url,
        "local_path": local_path,
    }


# ------------------ Optional CLI test ------------------ #

if __name__ == "__main__":
    site_or_url = input("Enter site (e.g. 'damndelicious.net' or full URL): ").strip()
    result = download_author_image_for_site(site_or_url, out_dir="author_images")
    print("\nBase site:", result["site_base_url"])
    print("About URL:", result["about_url"])
    print("Page title:", result["title"])
    print("Headshot URL:", result["author_image_url"])
    print("Saved to:", result["local_path"])