Spaces:
Sleeping
Sleeping
| """Hacker News scraper — uses the free Algolia HN Search API. | |
| No API key required. Searches for user complaints and frustrations | |
| in HN comments and stories, returning structured comment data compatible | |
| with the pain_point_miner pipeline. | |
| API docs: https://hn.algolia.com/api | |
| """ | |
| from __future__ import annotations | |
| import logging | |
| import time | |
| from dataclasses import dataclass | |
| import diskcache | |
| import requests | |
| from src.config import settings | |
| from src.tools.types import ScrapedComment | |
logger = logging.getLogger(__name__)

# ------------------------------------------------------------------
# Response cache
# ------------------------------------------------------------------
# API responses are stored on disk (directory taken from settings) so that
# repeated runs can reuse them instead of re-querying the API.
_CACHE = diskcache.Cache(settings.cache_dir)
# Cache entry lifetime in seconds, derived from the configured hour count.
_TTL_S: int = settings.cache_ttl_hours * 3600
# Unique sentinel used to tell "no cache entry" apart from a cached value
# that happens to be falsy.
_MISSING = object()

# ------------------------------------------------------------------
# Constants
# ------------------------------------------------------------------
_BASE_URL = "https://hn.algolia.com/api/v1"
# Pause (seconds) taken before every uncached HTTP request.
_REQUEST_DELAY_S: float = 0.3
# Timeout handed to requests.get for each call.
_REQUEST_TIMEOUT: int = 15
# Default page size for comment searches.
_MAX_RESULTS_PER_QUERY: int = 50
# Cleaned comment texts shorter than this many characters are discarded.
_MIN_COMMENT_LENGTH: int = 40
| # Filter for content from the last 1 years (365 days) | |
| _TWO_YEARS_AGO = int(time.time()) - (365 * 24 * 60 * 60) | |
| # Search queries targeting frustration/pain points | |
| _SEARCH_QUERIES: list[str] = [ | |
| '"{domain}" frustrated', | |
| '"{domain}" annoying problem', | |
| '"{domain}" I wish there was', | |
| '"{domain}" biggest pain', | |
| '"{domain}" hate dealing with', | |
| '"{domain}" waste time', | |
| ] | |
| # Domain-specific keyword expansions | |
| _DOMAIN_EXPANSIONS: dict[str, list[str]] = { | |
| "developer tools": ["devtools", "IDE", "CI/CD", "debugging", "testing framework"], | |
| "healthcare": ["EHR", "patient portal", "medical software", "telehealth"], | |
| "finance": ["fintech", "banking app", "payment processing", "accounting software"], | |
| "education": ["edtech", "LMS", "online learning", "course platform"], | |
| "e-commerce": ["shopify", "online store", "checkout", "inventory management"], | |
| "marketing": ["SEO tool", "analytics", "email marketing", "social media management"], | |
| "ai": ["LLM", "machine learning", "AI tool", "model training"], | |
| "productivity": ["project management", "task manager", "note-taking", "workflow"], | |
| } | |
| def _get_domain_keywords(domain: str) -> list[str]: | |
| """Get additional search keywords based on domain.""" | |
| domain_lower = domain.lower() | |
| keywords = [domain] | |
| for key, expansions in _DOMAIN_EXPANSIONS.items(): | |
| if key in domain_lower: | |
| keywords.extend(expansions) | |
| break | |
| return keywords | |
def _make_request(url: str, params: dict | None = None) -> dict | None:
    """Fetch JSON from *url* with a GET request, going through the disk cache.

    A cached response for the same URL and parameters is returned without any
    network access.  Otherwise the function waits ``_REQUEST_DELAY_S``
    seconds, performs the request, stores the decoded JSON in the cache for
    ``_TTL_S`` seconds and returns it.

    Args:
        url: Endpoint to query.
        params: Optional query-string parameters.

    Returns:
        The decoded JSON payload, or ``None`` if anything went wrong (the
        error is logged as a warning rather than raised).
    """
    # Sorting the items makes the key independent of dict insertion order.
    params_fingerprint = str(sorted(params.items())) if params else ""
    cache_key = ("hn_api", url, params_fingerprint)

    stored = _CACHE.get(cache_key, default=_MISSING)
    if stored is not _MISSING:
        return stored

    try:
        time.sleep(_REQUEST_DELAY_S)
        response = requests.get(
            url,
            params=params,
            headers={"User-Agent": "ventureforge/0.1.0 (academic research)"},
            timeout=_REQUEST_TIMEOUT,
        )
        response.raise_for_status()
        payload = response.json()
        _CACHE.set(cache_key, payload, expire=_TTL_S)
    except Exception as e:
        # Best-effort by design: callers treat None as "no results".
        logger.warning(f"[hackernews] request error for {url}: {e}")
        return None
    return payload
def _search_comments(query: str, num_results: int = _MAX_RESULTS_PER_QUERY) -> list[dict]:
    """Query the ``search_by_date`` endpoint for comments matching *query*.

    Only comments created after the ``_TWO_YEARS_AGO`` cutoff are requested.

    Args:
        query: Search string sent to the API.
        num_results: Value sent as ``hitsPerPage``.

    Returns:
        The raw hit dictionaries, or an empty list when the request failed or
        the response carried no ``hits`` entry.
    """
    response = _make_request(
        f"{_BASE_URL}/search_by_date",
        {
            "query": query,
            "tags": "comment",
            "hitsPerPage": num_results,
            "numericFilters": f"created_at_i>{_TWO_YEARS_AGO}",
        },
    )
    return response.get("hits", []) if response else []
def _search_stories(query: str, num_results: int = 20) -> list[dict]:
    """Query the ``search`` endpoint for stories (Ask HN, Show HN, ...).

    Only stories created after the ``_TWO_YEARS_AGO`` cutoff are requested.

    Args:
        query: Search string sent to the API.
        num_results: Value sent as ``hitsPerPage``.

    Returns:
        The raw hit dictionaries, or an empty list when the request failed or
        the response carried no ``hits`` entry.
    """
    response = _make_request(
        f"{_BASE_URL}/search",
        {
            "query": query,
            "tags": "story",
            "hitsPerPage": num_results,
            "numericFilters": f"created_at_i>{_TWO_YEARS_AGO}",
        },
    )
    return response.get("hits", []) if response else []
def _hit_to_comment(hit: dict) -> ScrapedComment | None:
    """Convert one Algolia hit into a ``ScrapedComment``.

    The text is taken from ``comment_text``, falling back to ``story_text``.
    HTML tags are removed, HTML entities are decoded, and whitespace is
    collapsed to single spaces.

    Args:
        hit: A hit dictionary as returned by the search endpoints.

    Returns:
        A ``ScrapedComment``, or ``None`` when the hit has no text or the
        cleaned text is shorter than ``_MIN_COMMENT_LENGTH`` characters.
    """
    # Function-scope imports, as in the original implementation.
    import html
    import re

    # Either field may be absent or null, so chain with `or`.
    raw_text = hit.get("comment_text") or hit.get("story_text") or ""
    if not raw_text:
        return None

    # The API returns HTML markup in the text fields: drop the tags first ...
    text = re.sub(r"<[^>]+>", " ", raw_text)
    # ... then decode entities.  The previous `&[a-z]+;` substitution deleted
    # named entities (turning "&amp;" into a blank) and left numeric ones such
    # as "&#x27;" in the text.  Decoding happens after tag removal so that
    # escaped angle brackets written by the author are not mistaken for tags.
    text = html.unescape(text)
    # Collapse every run of whitespace (including newlines) to one space.
    text = " ".join(text.split())
    if len(text) < _MIN_COMMENT_LENGTH:
        return None

    object_id = hit.get("objectID", "")
    # Comment hits carry `story_title`, story hits carry `title`; fall back
    # to an empty string so the field is never None.
    story_title = hit.get("story_title") or hit.get("title") or ""

    return ScrapedComment(
        text=text,
        url=f"https://news.ycombinator.com/item?id={object_id}",
        subreddit="hackernews",  # reuse field name for compatibility
        post_title=story_title,
    )
def scrape_for_domain(domain: str, max_total_comments: int = 150) -> list[ScrapedComment]:
    """Main entry point: scrape HN comments related to a domain's pain points.

    Works in two phases, stopping as soon as the cap is reached:

    1. For every keyword from ``_get_domain_keywords`` and every template in
       ``_SEARCH_QUERIES``, search comments and convert the hits.
    2. Search stories with a few "Ask HN" style queries (built from *domain*
       only) and collect comments from each story found.

    Items are de-duplicated by their HN object id across both phases.

    Args:
        domain: Free-text domain to mine, e.g. ``"developer tools"``.
        max_total_comments: Upper bound on the number of comments returned.

    Returns:
        At most ``max_total_comments`` ``ScrapedComment`` objects compatible
        with the pain_point_miner pipeline.
    """
    all_comments: list[ScrapedComment] = []
    seen_ids: set[str] = set()

    def at_capacity() -> bool:
        return len(all_comments) >= max_total_comments

    # Phase 1: keyword x template comment searches.
    for keyword in _get_domain_keywords(domain):
        if at_capacity():
            break
        for query_template in _SEARCH_QUERIES:
            if at_capacity():
                break
            query = query_template.replace("{domain}", keyword)
            for hit in _search_comments(query, num_results=30):
                if at_capacity():
                    break
                oid = hit.get("objectID", "")
                if oid in seen_ids:
                    continue
                seen_ids.add(oid)
                comment = _hit_to_comment(hit)
                if comment:
                    all_comments.append(comment)

    # Phase 2: "Ask HN" stories, which often contain pain points.
    ask_hn_queries = [
        f"Ask HN: {domain} frustrating",
        f"Ask HN: {domain} problem",
        f"Ask HN: what tools {domain}",
    ]
    for query in ask_hn_queries:
        if at_capacity():
            break
        for story in _search_stories(query, num_results=10):
            # Check the cap per story as well: previously only the innermost
            # loop checked it, so every remaining story still triggered a
            # request and could push the result past max_total_comments.
            if at_capacity():
                break
            story_id = story.get("objectID", "")
            # Skip stories without an id (nothing to look up) and repeats.
            if not story_id or story_id in seen_ids:
                continue
            seen_ids.add(story_id)
            for comment in _get_story_comments(story_id, limit=10):
                if at_capacity():
                    break
                # The comment's object id is the value after the last "=" in
                # its item URL.
                comment_id = comment.url.rpartition("=")[2]
                if comment_id in seen_ids:
                    continue
                seen_ids.add(comment_id)
                all_comments.append(comment)

    logger.info(
        "[hackernews] scraped %d comments for domain='%s'",
        len(all_comments),
        domain,
    )
    return all_comments
def _get_story_comments(story_id: str, limit: int = 10) -> list[ScrapedComment]:
    """Fetch comments belonging to one HN story.

    Args:
        story_id: Object id of the story whose comments are wanted.
        limit: Value sent as ``hitsPerPage``.

    Returns:
        The hits that ``_hit_to_comment`` could convert; an empty list when
        the request failed or returned nothing.
    """
    response = _make_request(
        f"{_BASE_URL}/search",
        {
            "tags": f"comment,story_{story_id}",
            "hitsPerPage": limit,
        },
    )
    if not response:
        return []

    converted = (_hit_to_comment(raw_hit) for raw_hit in response.get("hits", []))
    # Drop hits that were rejected (no text / too short).
    return [item for item in converted if item]