"""Hacker News scraper — uses the free Algolia HN Search API.

No API key required. Searches for user complaints and frustrations
in HN comments and stories, returning structured comment data compatible
with the pain_point_miner pipeline.

API docs: https://hn.algolia.com/api
"""
from __future__ import annotations

import html
import logging
import re
import time
from dataclasses import dataclass

import diskcache
import requests

from src.config import settings
from src.tools.types import ScrapedComment

# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Disk-backed cache
# Stores decoded API responses under ``settings.cache_dir`` so repeated
# identical requests can be answered without hitting the network.
_CACHE = diskcache.Cache(settings.cache_dir)
# Lifetime of a cache entry in seconds (configured in hours).
_TTL_S: int = settings.cache_ttl_hours * 3600
# Sentinel passed as the ``default`` of ``_CACHE.get`` so that a cache miss
# can be told apart from any value that was actually stored.
_MISSING = object()

# ------------------------------------------------------------------
# Constants
# ------------------------------------------------------------------
# Root of the Algolia-hosted HN search API.
_BASE_URL = "https://hn.algolia.com/api/v1"
# Pause before every uncached HTTP request, in seconds.
_REQUEST_DELAY_S: float = 0.3
# Per-request timeout handed to ``requests.get``, in seconds.
_REQUEST_TIMEOUT: int = 15
# Default page size for comment searches.
_MAX_RESULTS_PER_QUERY: int = 50
# Cleaned comments shorter than this many characters are discarded.
_MIN_COMMENT_LENGTH: int = 40

_SECONDS_PER_DAY: int = 24 * 60 * 60
_LOOKBACK_DAYS: int = 365

# Only content created within roughly the last year (365 days) is searched.
# NOTE: despite its name, this constant is a ONE-year cutoff; the name is kept
# because other functions in this module reference it.
#
# The cutoff is snapped down to a whole-day boundary. It is interpolated into
# the ``numericFilters`` request parameter, which is part of the disk-cache
# key; a second-resolution timestamp would differ on every process start and
# make cached search responses unreachable across runs.
# The value is computed once at import time, so a long-lived process keeps
# using the cutoff from the day it was started.
_TWO_YEARS_AGO = (
    int(time.time()) // _SECONDS_PER_DAY - _LOOKBACK_DAYS
) * _SECONDS_PER_DAY

# Search queries targeting frustration/pain points
# Each entry is a template: the literal ``{domain}`` placeholder is swapped
# for a keyword with ``str.replace`` (not ``str.format``) by
# ``scrape_for_domain``, so the surrounding double quotes end up in the
# query string that is sent to the API.
_SEARCH_QUERIES: list[str] = [
    '"{domain}" frustrated',
    '"{domain}" annoying problem',
    '"{domain}" I wish there was',
    '"{domain}" biggest pain',
    '"{domain}" hate dealing with',
    '"{domain}" waste time',
]

# Domain-specific keyword expansions
# Maps a lowercase domain key to extra search terms tried in addition to the
# caller's own domain string. Dict order matters: the first matching key wins.
_DOMAIN_EXPANSIONS: dict[str, list[str]] = {
    "developer tools": ["devtools", "IDE", "CI/CD", "debugging", "testing framework"],
    "healthcare": ["EHR", "patient portal", "medical software", "telehealth"],
    "finance": ["fintech", "banking app", "payment processing", "accounting software"],
    "education": ["edtech", "LMS", "online learning", "course platform"],
    "e-commerce": ["shopify", "online store", "checkout", "inventory management"],
    "marketing": ["SEO tool", "analytics", "email marketing", "social media management"],
    "ai": ["LLM", "machine learning", "AI tool", "model training"],
    "productivity": ["project management", "task manager", "note-taking", "workflow"],
}

# Keys of at most this many characters must match as a whole word.
_WHOLE_WORD_KEY_MAX_LEN: int = 3


def _domain_mentions(domain_lower: str, key: str) -> bool:
    """Return True when the lowercased domain string refers to *key*.

    Longer keys use plain substring matching so inflected forms still match
    (e.g. "finances" contains "finance"). Very short keys such as "ai" occur
    inside many unrelated words ("retail", "maintenance"), so they are only
    accepted when not directly adjacent to another word character.
    """
    if len(key) > _WHOLE_WORD_KEY_MAX_LEN:
        return key in domain_lower
    pattern = rf"(?<!\w){re.escape(key)}(?!\w)"
    return re.search(pattern, domain_lower) is not None


def _get_domain_keywords(domain: str) -> list[str]:
    """Return the search keywords to try for *domain*.

    The result always starts with *domain* exactly as given. If the
    lowercased domain mentions one of the ``_DOMAIN_EXPANSIONS`` keys, that
    key's expansion terms are appended; only the first matching key (in dict
    order) contributes.

    Args:
        domain: Free-form domain description, e.g. ``"AI tools"``.

    Returns:
        A new list; the module-level expansion lists are never mutated.
    """
    domain_lower = domain.lower()
    keywords = [domain]
    for key, expansions in _DOMAIN_EXPANSIONS.items():
        if _domain_mentions(domain_lower, key):
            keywords.extend(expansions)
            break
    return keywords


def _make_request(url: str, params: dict | None = None) -> dict | None:
    """Fetch *url* as JSON, going through the disk cache first.

    Args:
        url: Full endpoint URL.
        params: Optional query-string parameters; they are part of the
            cache key (order-independent, since the items are sorted).

    Returns:
        The decoded JSON payload, served from cache when available, or
        ``None`` if the request, status check, decoding or cache write
        raised. Failures are logged and never cached.
    """
    params_fingerprint = str(sorted(params.items())) if params else ""
    key = ("hn_api", url, params_fingerprint)

    stored = _CACHE.get(key, default=_MISSING)
    if stored is not _MISSING:
        return stored

    request_headers = {"User-Agent": "ventureforge/0.1.0 (academic research)"}
    try:
        # Throttle only real network calls; cache hits returned above.
        time.sleep(_REQUEST_DELAY_S)
        response = requests.get(
            url,
            params=params,
            headers=request_headers,
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        _CACHE.set(key, payload, expire=_TTL_S)
    except Exception as e:
        logger.warning(f"[hackernews] request error for {url}: {e}")
        return None
    return payload


def _search_comments(query: str, num_results: int = _MAX_RESULTS_PER_QUERY) -> list[dict]:
    """Return raw comment hits for *query* from the ``search_by_date`` endpoint.

    Only items tagged ``comment`` and created after the module's recency
    cutoff are requested. An empty list is returned when the request fails
    or yields an empty payload.
    """
    endpoint = f"{_BASE_URL}/search_by_date"
    recency_filter = f"created_at_i>{_TWO_YEARS_AGO}"
    payload = _make_request(
        endpoint,
        {
            "query": query,
            "tags": "comment",
            "hitsPerPage": num_results,
            "numericFilters": recency_filter,
        },
    )
    return payload.get("hits", []) if payload else []


def _search_stories(query: str, num_results: int = 20) -> list[dict]:
    """Return raw story hits (Ask HN, Show HN, ...) for *query*.

    Uses the ``search`` endpoint restricted to the ``story`` tag and to items
    created after the module's recency cutoff. An empty list is returned when
    the request fails or yields an empty payload.
    """
    endpoint = f"{_BASE_URL}/search"
    recency_filter = f"created_at_i>{_TWO_YEARS_AGO}"
    payload = _make_request(
        endpoint,
        {
            "query": query,
            "tags": "story",
            "hitsPerPage": num_results,
            "numericFilters": recency_filter,
        },
    )
    return payload.get("hits", []) if payload else []


def _hit_to_comment(hit: dict) -> ScrapedComment | None:
    """Convert an Algolia hit into a ``ScrapedComment``.

    The text is taken from ``comment_text``, falling back to ``story_text``.
    HTML tags are replaced by spaces, HTML entities are decoded to the
    characters they stand for, and whitespace runs are collapsed.

    Args:
        hit: One element of the ``hits`` array of an Algolia response.

    Returns:
        The converted comment, or ``None`` when the hit has no text or the
        cleaned text is shorter than ``_MIN_COMMENT_LENGTH`` characters.
    """
    raw_text = hit.get("comment_text") or hit.get("story_text") or ""
    if not raw_text:
        return None

    # Strip tags first, then decode entities: doing it the other way round
    # would turn an escaped "&lt;b&gt;" into a tag and delete it.
    text = re.sub(r"<[^>]+>", " ", raw_text)
    # Decode entities instead of blanking them out, so "&amp;" becomes "&"
    # and numeric entities such as "&#x27;" become an apostrophe rather than
    # being left in the text verbatim.
    text = html.unescape(text)
    text = " ".join(text.split())

    if len(text) < _MIN_COMMENT_LENGTH:
        return None

    object_id = hit.get("objectID", "")
    # Comment hits expose the parent title as ``story_title``; story hits
    # only have ``title``.
    story_title = hit.get("story_title", "") or hit.get("title", "") or ""

    return ScrapedComment(
        text=text,
        url=f"https://news.ycombinator.com/item?id={object_id}",
        subreddit="hackernews",  # reuse field name for compatibility
        post_title=story_title,
    )


def scrape_for_domain(domain: str, max_total_comments: int = 150) -> list[ScrapedComment]:
    """Main entry point: scrape HN comments related to a domain's pain points.

    Two passes are made until ``max_total_comments`` is reached:

    1. Every keyword for the domain is combined with every template in
       ``_SEARCH_QUERIES`` and searched among comments.
    2. "Ask HN" stories about the domain are searched and comments from
       each matching story are collected.

    Hits and stories are de-duplicated by their ``objectID``.

    Args:
        domain: Free-form domain description, e.g. ``"developer tools"``.
        max_total_comments: Hard upper bound on the number of returned
            comments.

    Returns:
        A list of at most ``max_total_comments`` ScrapedComment objects
        compatible with the pain_point_miner pipeline.
    """
    all_comments: list[ScrapedComment] = []
    seen_ids: set[str] = set()

    def limit_reached() -> bool:
        return len(all_comments) >= max_total_comments

    for keyword in _get_domain_keywords(domain):
        if limit_reached():
            break

        for query_template in _SEARCH_QUERIES:
            if limit_reached():
                break

            query = query_template.replace("{domain}", keyword)
            for hit in _search_comments(query, num_results=30):
                oid = hit.get("objectID", "")
                if oid in seen_ids:
                    continue
                seen_ids.add(oid)

                comment = _hit_to_comment(hit)
                if comment:
                    all_comments.append(comment)

                if limit_reached():
                    break

    # Also search "Ask HN" stories which often contain pain points
    ask_hn_queries = [
        f"Ask HN: {domain} frustrating",
        f"Ask HN: {domain} problem",
        f"Ask HN: what tools {domain}",
    ]
    for query in ask_hn_queries:
        if limit_reached():
            break

        for story in _search_stories(query, num_results=10):
            # Check the cap per story, not just per query: otherwise every
            # remaining story costs a request and pushes one more comment
            # past the limit before the inner loop notices.
            if limit_reached():
                break

            story_id = story.get("objectID", "")
            # An empty id would produce a malformed "story_" tag filter.
            if not story_id or story_id in seen_ids:
                continue
            seen_ids.add(story_id)

            # Get comments from this story
            for comment in _get_story_comments(story_id, limit=10):
                # The comment's item id is the value after "=" in its URL.
                comment_id = comment.url.split("=")[-1]
                if comment_id not in seen_ids:
                    seen_ids.add(comment_id)
                    all_comments.append(comment)

                if limit_reached():
                    break

    logger.info(
        "[hackernews] scraped %d comments for domain='%s'",
        len(all_comments),
        domain,
    )
    return all_comments


def _get_story_comments(story_id: str, limit: int = 10) -> list[ScrapedComment]:
    """Collect usable comments belonging to one HN story.

    Requests up to *limit* comment hits tagged with the story's id and
    converts each via ``_hit_to_comment``; hits that do not convert are
    dropped, so fewer than *limit* comments may be returned. A failed or
    empty response yields an empty list.
    """
    payload = _make_request(
        f"{_BASE_URL}/search",
        {
            "tags": f"comment,story_{story_id}",
            "hitsPerPage": limit,
        },
    )
    if not payload:
        return []

    converted = (_hit_to_comment(raw_hit) for raw_hit in payload.get("hits", []))
    return [item for item in converted if item]