#!/usr/bin/env python3 """ Fetch real content items from public APIs and save as static JSON. Sources (all free, no auth): - Hacker News (Firebase API) - arXiv (public API) - DEV.to (public API) - Reddit (public JSON) Run once: python scripts/fetch_data.py Output: data/items.json """ import json import math import time import xml.etree.ElementTree as ET from pathlib import Path from urllib.request import Request, urlopen DATA_DIR = Path(__file__).parent.parent / "data" # Tag extraction keywords TAG_KEYWORDS = { "ai": [ "ai", "artificial intelligence", "machine learning", "ml", "deep learning", "neural", ], "nlp": [ "nlp", "natural language", "language model", "llm", "gpt", "transformer", "bert", ], "web": [ "web", "javascript", "react", "frontend", "css", "html", "browser", "nextjs", "vue", ], "systems": [ "systems", "linux", "kernel", "os", "distributed", "infrastructure", "devops", ], "rust": ["rust", "cargo", "rustc", "borrow checker"], "python": ["python", "pip", "django", "flask", "fastapi", "pytorch"], "go": ["golang", " go ", "goroutine"], "security": [ "security", "vulnerability", "exploit", "crypto", "encryption", "privacy", ], "database": ["database", "sql", "postgres", "mongodb", "redis", "sqlite"], "cloud": ["cloud", "aws", "gcp", "azure", "kubernetes", "docker", "k8s"], "mobile": ["mobile", "ios", "android", "swift", "kotlin", "flutter"], "data": [ "data", "analytics", "visualization", "pandas", "spark", "etl", "pipeline", ], "career": ["career", "hiring", "interview", "salary", "remote", "job"], "startup": ["startup", "funding", "venture", "entrepreneur", "saas", "product"], "open-source": [ "open source", "open-source", "oss", "github", "foss", "mit license", ], "robotics": ["robot", "robotics", "autonomous", "drone", "perception", "slam"], "cv": ["computer vision", "image", "object detection", "segmentation", "diffusion"], } def extract_tags(title: str, summary: str = "") -> list[str]: """Extract topic tags from title and summary text.""" text = f"{title} {summary}".lower() tags = [] for tag, keywords in TAG_KEYWORDS.items(): if any(kw in text for kw in keywords): tags.append(tag) return tags if tags else ["general"] def fetch_json(url: str, headers: dict | None = None) -> dict | list: """Fetch JSON from a URL.""" req = Request(url, headers=headers or {"User-Agent": "Curator/1.0"}) with urlopen(req, timeout=30) as resp: return json.loads(resp.read().decode()) def fetch_text(url: str) -> str: """Fetch raw text from a URL.""" req = Request(url, headers={"User-Agent": "Curator/1.0"}) with urlopen(req, timeout=30) as resp: return resp.read().decode() def fetch_hackernews(count: int = 60) -> list[dict]: """Fetch top stories from Hacker News.""" print(f" Fetching {count} Hacker News stories...") story_ids = fetch_json("https://hacker-news.firebaseio.com/v0/topstories.json") items = [] for sid in story_ids[:count]: try: story = fetch_json(f"https://hacker-news.firebaseio.com/v0/item/{sid}.json") if not story or story.get("type") != "story": continue title = story.get("title", "") url = story.get("url", f"https://news.ycombinator.com/item?id={sid}") items.append( { "id": f"hn_{sid}", "source": "hackernews", "title": title, "summary": title, # HN doesn't have summaries; title is the content "tags": extract_tags(title), "url": url, "author": story.get("by", ""), "score": story.get("score", 0), "reading_time_mins": 5, "content_type": "article", } ) except Exception as e: print(f" Skipping HN story {sid}: {e}") time.sleep(0.05) # Be polite print(f" Got {len(items)} HN items") return items def fetch_arxiv(count: int = 50) -> list[dict]: """Fetch recent AI/ML papers from arXiv.""" print(f" Fetching {count} arXiv papers...") categories = "cat:cs.AI+OR+cat:cs.LG+OR+cat:cs.CL" url = f"https://export.arxiv.org/api/query?search_query={categories}&sortBy=submittedDate&sortOrder=descending&max_results={count}" xml_text = fetch_text(url) root = ET.fromstring(xml_text) ns = {"atom": "http://www.w3.org/2005/Atom"} items = [] for entry in root.findall("atom:entry", ns): try: arxiv_id = entry.find("atom:id", ns).text.split("/abs/")[-1] title = entry.find("atom:title", ns).text.strip().replace("\n", " ") summary = ( entry.find("atom:summary", ns).text.strip().replace("\n", " ")[:300] ) authors = [ a.find("atom:name", ns).text for a in entry.findall("atom:author", ns) ] link = entry.find("atom:id", ns).text # Estimate reading time from summary length word_count = len(summary.split()) reading_time = max(10, word_count // 20) items.append( { "id": f"arxiv_{arxiv_id.replace('/', '_').replace('.', '_')}", "source": "arxiv", "title": title, "summary": summary, "tags": extract_tags(title, summary), "url": link, "author": authors[0] if authors else "", "score": 0, "reading_time_mins": reading_time, "content_type": "paper", } ) except Exception as e: print(f" Skipping arXiv entry: {e}") print(f" Got {len(items)} arXiv items") return items def fetch_devto(count: int = 50) -> list[dict]: """Fetch articles from DEV.to.""" print(f" Fetching {count} DEV.to articles...") items = [] # Fetch from multiple tags to get variety tags_to_fetch = ["programming", "ai", "webdev", "python", "tutorial"] per_tag = math.ceil(count / len(tags_to_fetch)) seen_ids = set() for tag in tags_to_fetch: try: articles = fetch_json( f"https://dev.to/api/articles?per_page={per_tag}&tag={tag}&top=7" ) for article in articles: aid = article["id"] if aid in seen_ids: continue seen_ids.add(aid) title = article.get("title", "") desc = article.get("description", "") tag_list = article.get("tag_list", []) items.append( { "id": f"devto_{aid}", "source": "devto", "title": title, "summary": desc[:300] if desc else title, "tags": extract_tags(title, desc) if not tag_list else [t.lower() for t in tag_list[:5]], "url": article.get("url", ""), "author": article.get("user", {}).get("username", ""), "score": article.get("positive_reactions_count", 0), "reading_time_mins": article.get("reading_time_minutes", 5), "content_type": "tutorial" if "tutorial" in (tag_list or []) else "article", } ) time.sleep(0.2) except Exception as e: print(f" Skipping DEV.to tag {tag}: {e}") items = items[:count] print(f" Got {len(items)} DEV.to items") return items def fetch_reddit(count: int = 40) -> list[dict]: """Fetch posts from Reddit programming subreddits.""" print(f" Fetching {count} Reddit posts...") items = [] subreddits = ["programming", "machinelearning", "webdev"] per_sub = math.ceil(count / len(subreddits)) seen_ids = set() for sub in subreddits: try: data = fetch_json( f"https://www.reddit.com/r/{sub}/hot.json?limit={per_sub}", headers={"User-Agent": "Curator/1.0 (content-curation-research)"}, ) for post in data.get("data", {}).get("children", []): pd = post["data"] rid = pd["id"] if rid in seen_ids or pd.get("stickied"): continue seen_ids.add(rid) title = pd.get("title", "") selftext = pd.get("selftext", "")[:300] items.append( { "id": f"reddit_{rid}", "source": "reddit", "title": title, "summary": selftext if selftext else title, "tags": extract_tags(title, selftext), "url": f"https://reddit.com{pd.get('permalink', '')}", "author": pd.get("author", ""), "score": pd.get("score", 0), "reading_time_mins": max(2, len(selftext.split()) // 200) if selftext else 3, "content_type": "discussion", } ) time.sleep(0.5) except Exception as e: print(f" Skipping Reddit r/{sub}: {e}") items = items[:count] print(f" Got {len(items)} Reddit items") return items def compute_relevance(item: dict, profile: dict) -> float: """Compute relevance score (0-1) of an item for a user profile. Scoring: - 0.50 weight: tag match (sum of matched interest weights / total interest weight) - 0.20 weight: source preference (1.0 if preferred, 0.3 otherwise) - 0.15 weight: community signal (normalized score/upvotes) - 0.10 weight: reading time fit (within budget = 1.0, over = 0.3) - 0.05 weight: content type match (paper for expert, tutorial for beginner) - Penalty: -0.4 for already-read items """ interests = profile["interests"] item_tags = set(item["tags"]) if not interests: return 0.05 # Tag match: how much of the user's interest space does this item cover? total_interest_weight = sum(interests.values()) matched_weight = sum(interests.get(tag, 0.0) for tag in item_tags) tag_score = ( matched_weight / total_interest_weight if total_interest_weight > 0 else 0.0 ) # Source preference preferred = profile.get("preferred_sources", []) source_score = 1.0 if (not preferred or item["source"] in preferred) else 0.3 # Community signal (normalize score: 0-100+ -> 0-1) raw_score = item.get("score", 0) community_score = min(1.0, raw_score / 200) if raw_score > 0 else 0.2 # Reading time fit budget = profile.get("time_budget_mins", 60) per_item_budget = budget / 5 time_score = 1.0 if item["reading_time_mins"] <= per_item_budget else 0.3 # Content type match skill = profile.get("skill_level", "intermediate") ctype = item.get("content_type", "article") if skill == "expert" and ctype == "paper": type_score = 1.0 elif skill == "beginner" and ctype in ("tutorial", "article"): type_score = 1.0 elif skill == "intermediate": type_score = 0.8 else: type_score = 0.5 # Weighted combination relevance = ( 0.50 * tag_score + 0.20 * source_score + 0.15 * community_score + 0.10 * time_score + 0.05 * type_score ) # Already-read penalty if item["id"] in profile.get("read_history", []): relevance -= 0.4 return round(max(0.0, min(1.0, relevance)), 4) def create_tasks() -> list[dict]: """Create task definitions with embedded user profiles for 3 difficulty levels.""" return [ { "task_id": "easy", "difficulty": "easy", "item_count": 20, "max_steps": 10, "sources": ["hackernews"], "recommend_k": 5, "description": "Curate 5 top articles from 20 Hacker News stories for an AI/ML enthusiast.", "profile": { "interests": { "ai": 0.95, "nlp": 0.85, "python": 0.8, "data": 0.7, }, "preferred_sources": ["hackernews", "arxiv"], "time_budget_mins": 120, "read_history": [], "skill_level": "intermediate", }, }, { "task_id": "medium", "difficulty": "medium", "item_count": 50, "max_steps": 20, "sources": ["hackernews", "devto", "arxiv"], "recommend_k": 10, "description": "Curate 10 items from 50 across HN, DEV.to, and arXiv for a senior engineer with broad interests.", "profile": { "interests": { "ai": 0.9, "web": 0.7, "systems": 0.6, "security": 0.5, "python": 0.75, "cloud": 0.4, "open-source": 0.65, "startup": 0.3, }, "preferred_sources": ["hackernews", "devto"], "time_budget_mins": 60, "read_history": [], "skill_level": "expert", }, }, { "task_id": "hard", "difficulty": "hard", "item_count": 100, "max_steps": 30, "sources": ["hackernews", "devto", "arxiv", "reddit"], "recommend_k": 15, "description": "Curate 15 items from 100 across all sources for a beginner with minimal stated preferences. Must infer interests from feedback.", "profile": { "interests": { "rust": 0.5, "systems": 0.4, }, "preferred_sources": [], "time_budget_mins": 30, "read_history": [], "skill_level": "beginner", }, }, ] def main(): DATA_DIR.mkdir(exist_ok=True) print("Fetching real content data from public APIs...\n") # Fetch from all sources all_items = [] all_items.extend(fetch_hackernews(60)) all_items.extend(fetch_arxiv(50)) all_items.extend(fetch_devto(50)) all_items.extend(fetch_reddit(40)) print(f"\nTotal items fetched: {len(all_items)}") # Save items items_path = DATA_DIR / "items.json" with open(items_path, "w") as f: json.dump(all_items, f, indent=2) print(f"Saved items to {items_path}") # Create tasks (profiles are embedded in each task) tasks = create_tasks() # Compute ground truth relevance and set read_history ground_truth = {} for task in tasks: profile = task["profile"] sources = task["sources"] task_items = [it for it in all_items if it["source"] in sources][ : task["item_count"] ] # Set some items as already read for medium/hard tasks if task["task_id"] == "medium" and len(task_items) > 5: profile["read_history"] = [task_items[i]["id"] for i in range(0, 6, 2)] elif task["task_id"] == "hard" and len(task_items) > 10: profile["read_history"] = [task_items[i]["id"] for i in range(0, 10, 3)] relevance = {} for item in task_items: relevance[item["id"]] = round(compute_relevance(item, profile), 4) ground_truth[task["task_id"]] = relevance # Save tasks (with updated read_history in profiles) tasks_path = DATA_DIR / "tasks.json" with open(tasks_path, "w") as f: json.dump(tasks, f, indent=2) print(f"Saved tasks to {tasks_path}") gt_path = DATA_DIR / "ground_truth.json" with open(gt_path, "w") as f: json.dump(ground_truth, f, indent=2) print(f"Saved ground truth to {gt_path}") # Summary print("\n--- Summary ---") for task in tasks: tid = task["task_id"] gt = ground_truth[tid] avg_rel = sum(gt.values()) / len(gt) if gt else 0 high_rel = sum(1 for v in gt.values() if v >= 0.5) print( f" {tid}: {len(gt)} items, avg relevance={avg_rel:.3f}, high-relevance={high_rel}" ) if __name__ == "__main__": main()