Commit 51da887 · Parent(s): d038c44
Add initial vibe-coded shits
- .gitignore +7 -0
- pyproject.toml +34 -0
- src/tiktokify/__init__.py +3 -0
- src/tiktokify/__main__.py +6 -0
- src/tiktokify/cli.py +303 -0
- src/tiktokify/crawler/__init__.py +8 -0
- src/tiktokify/crawler/blog_crawler.py +382 -0
- src/tiktokify/enrichment/__init__.py +20 -0
- src/tiktokify/enrichment/base.py +66 -0
- src/tiktokify/enrichment/llm_enricher.py +180 -0
- src/tiktokify/enrichment/providers/__init__.py +12 -0
- src/tiktokify/enrichment/providers/hackernews.py +242 -0
- src/tiktokify/enrichment/providers/links.py +210 -0
- src/tiktokify/enrichment/providers/wikipedia.py +78 -0
- src/tiktokify/generator/__init__.py +5 -0
- src/tiktokify/generator/html_generator.py +52 -0
- src/tiktokify/generator/templates/swipe.html.jinja2 +1028 -0
- src/tiktokify/models/__init__.py +17 -0
- src/tiktokify/models/post.py +116 -0
- src/tiktokify/recommender/__init__.py +5 -0
- src/tiktokify/recommender/engine.py +59 -0
- src/tiktokify/recommender/metadata.py +51 -0
- src/tiktokify/recommender/tfidf.py +51 -0
- tests/__init__.py +1 -0
.gitignore
CHANGED
@@ -6,6 +6,13 @@ __pycache__/
 # C extensions
 *.so
 
+output/
+html/
+tmp/
+out/
+temp/
+data/
+
 # Distribution / packaging
 .Python
 build/
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
+[project]
+name = "tiktokify"
+version = "0.1.0"
+description = "TikTok-style swipeable blog viewer with recommendations"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "crawl4ai>=0.4.0",
+    "scikit-learn>=1.3.0",
+    "numpy>=1.24.0",
+    "jinja2>=3.1.0",
+    "httpx>=0.25.0",
+    "litellm>=1.0.0",
+    "click>=8.1.0",
+    "pydantic>=2.0.0",
+    "rich>=13.0.0",
+    "loguru>=0.7.0",
+]
+
+[project.scripts]
+tiktokify = "tiktokify.cli:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/tiktokify"]
+
+[dependency-groups]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-asyncio>=0.21.0",
+]
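With the [project.scripts] entry above, the CLI should be reachable as `uv run tiktokify --help` after `uv sync` (or as `python -m tiktokify` via the `__main__.py` added below); the uv commands here are the usual ones, stated as an assumption rather than taken from this commit.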
src/tiktokify/__init__.py
ADDED
@@ -0,0 +1,3 @@
+"""TikTokify - TikTok-style swipeable blog viewer with recommendations."""
+
+__version__ = "0.1.0"
src/tiktokify/__main__.py
ADDED
@@ -0,0 +1,6 @@
+"""Entry point for `python -m tiktokify` or `uv run tiktokify`."""
+
+from tiktokify.cli import main
+
+if __name__ == "__main__":
+    main()
src/tiktokify/cli.py
ADDED
@@ -0,0 +1,303 @@
+"""CLI interface for tiktokify."""
+
+import asyncio
+from pathlib import Path
+
+import click
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+console = Console()
+
+
+@click.command()
+@click.option(
+    "--base-url",
+    "-u",
+    required=True,
+    help="Base URL of the Jekyll blog (e.g., https://nish1001.github.io)",
+)
+@click.option(
+    "--output-html",
+    "-o",
+    required=True,
+    type=click.Path(),
+    help="Output path for generated HTML file",
+)
+@click.option(
+    "--model",
+    "-m",
+    default=None,
+    help="LLM model for enrichment (e.g., gpt-4o-mini, claude-3-haiku-20240307). Skip if not provided.",
+)
+@click.option(
+    "--n-key-points",
+    type=int,
+    default=5,
+    help="Number of key points to generate per post",
+)
+@click.option(
+    "--n-wiki",
+    type=int,
+    default=3,
+    help="Number of Wikipedia articles to suggest per post",
+)
+@click.option(
+    "--sources",
+    type=str,
+    default="",
+    help="Comma-separated external sources to fetch. Available: hackernews (hn), hn-frontpage (frontpage), links (linked)",
+)
+@click.option(
+    "--n-external",
+    type=int,
+    default=3,
+    help="Number of items to fetch per external source",
+)
+@click.option(
+    "--content-weight",
+    type=float,
+    default=0.6,
+    help="Weight for content-based similarity (0-1)",
+)
+@click.option(
+    "--metadata-weight",
+    type=float,
+    default=0.4,
+    help="Weight for tag/category similarity (0-1)",
+)
+@click.option(
+    "--top-k",
+    type=int,
+    default=5,
+    help="Number of recommendations per post",
+)
+@click.option(
+    "--max-concurrent",
+    type=int,
+    default=5,
+    help="Maximum concurrent requests",
+)
+@click.option(
+    "--max-depth",
+    type=int,
+    default=1,
+    help="Spider crawl depth (1=seed only, 2=seed+linked pages, etc.)",
+)
+@click.option(
+    "--verbose",
+    "-v",
+    is_flag=True,
+    help="Enable verbose output",
+)
+def main(
+    base_url: str,
+    output_html: str,
+    model: str | None,
+    n_key_points: int,
+    n_wiki: int,
+    sources: str,
+    n_external: int,
+    content_weight: float,
+    metadata_weight: float,
+    top_k: int,
+    max_concurrent: int,
+    max_depth: int,
+    verbose: bool,
+) -> None:
+    """
+    TikTokify - Generate a TikTok-style swipe interface for your Jekyll blog.
+
+    Example:
+
+        uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html
+
+    With LLM enrichment (key points + Wikipedia):
+
+        uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html -m gpt-4o-mini
+
+    With deeper spider crawling:
+
+        uv run tiktokify -u https://example.com -o output.html --max-depth 2
+    """
+    asyncio.run(
+        _main_async(
+            base_url=base_url,
+            output_html=Path(output_html),
+            model=model,
+            n_key_points=n_key_points,
+            n_wiki=n_wiki,
+            sources=[s.strip() for s in sources.split(",") if s.strip()],
+            n_external=n_external,
+            content_weight=content_weight,
+            metadata_weight=metadata_weight,
+            top_k=top_k,
+            max_concurrent=max_concurrent,
+            max_depth=max_depth,
+            verbose=verbose,
+        )
+    )
+
+
+async def _main_async(
+    base_url: str,
+    output_html: Path,
+    model: str | None,
+    n_key_points: int,
+    n_wiki: int,
+    sources: list[str],
+    n_external: int,
+    content_weight: float,
+    metadata_weight: float,
+    top_k: int,
+    max_concurrent: int,
+    max_depth: int,
+    verbose: bool,
+) -> None:
+    """Async main function."""
+    from tiktokify.crawler import SpiderCrawler
+    from tiktokify.enrichment import (
+        HackerNewsProvider,
+        HNFrontPageProvider,
+        LinkedContentProvider,
+        PostEnricher,
+    )
+    from tiktokify.generator import HTMLGenerator
+    from tiktokify.models import ExternalContentItem
+    from tiktokify.recommender import RecommendationEngine
+
+    # Map source names to provider classes
+    PROVIDERS = {
+        "hackernews": HackerNewsProvider,
+        "hn": HackerNewsProvider,  # alias
+        "hn-frontpage": HNFrontPageProvider,
+        "frontpage": HNFrontPageProvider,  # alias
+        "links": LinkedContentProvider,
+        "linked": LinkedContentProvider,  # alias
+    }
+
+    console.print(f"\n[bold blue]TikTokify[/bold blue] - Generating swipe UI for {base_url}\n")
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        console=console,
+        transient=True,
+    ) as progress:
+        # Step 1: Spider crawl
+        depth_info = f" (depth={max_depth})" if max_depth > 1 else ""
+        task = progress.add_task(f"Spider crawling{depth_info}...", total=None)
+        crawler = SpiderCrawler(
+            base_url=base_url,
+            max_concurrent=max_concurrent,
+            max_depth=max_depth,
+            verbose=verbose,
+        )
+        posts = await crawler.crawl()
+        progress.remove_task(task)
+
+        if not posts:
+            console.print("[red]Error: No posts found![/red]")
+            return
+
+        console.print(f" [green]✓[/green] Found {len(posts)} posts")
+
+        # Step 2: Build recommendations
+        task = progress.add_task("Building recommendation graph...", total=None)
+        engine = RecommendationEngine(
+            content_weight=content_weight,
+            metadata_weight=metadata_weight,
+            top_k=top_k,
+        )
+        graph = engine.build_graph(posts)
+        progress.remove_task(task)
+        console.print(f" [green]✓[/green] Built recommendation graph")
+
+        # Step 3: LLM enrichment (optional)
+        if model:
+            task = progress.add_task(f"Enriching posts with LLM ({model})...", total=None)
+            enricher = PostEnricher(
+                model=model,
+                max_key_points=n_key_points,
+                max_wikipedia=n_wiki,
+                max_concurrent=max_concurrent,
+                verbose=verbose,
+            )
+            await enricher.enrich_posts(list(graph.posts.values()))
+            progress.remove_task(task)
+
+            enriched_count = sum(
+                1 for p in graph.posts.values() if p.key_points
+            )
+            console.print(f" [green]✓[/green] Enriched {enriched_count} posts with key points + Wikipedia")
+        else:
+            console.print(" [dim]⊘ Skipping LLM enrichment (no --model specified)[/dim]")
+
+        # Step 4: External sources (optional)
+        if sources:
+            valid_sources = [s for s in sources if s in PROVIDERS]
+            if valid_sources:
+                task = progress.add_task(f"Fetching from {', '.join(valid_sources)}...", total=None)
+
+                # Build list of (provider, post) pairs for parallel fetching
+                fetch_tasks = []
+                task_info = []  # Track (source_name, post) for each task
+
+                for source_name in valid_sources:
+                    provider_class = PROVIDERS[source_name]
+                    provider = provider_class(max_items=n_external, verbose=verbose)
+
+                    for post in graph.posts.values():
+                        fetch_tasks.append(provider.fetch_for_post(post))
+                        task_info.append((source_name, post))
+
+                # Fetch all in parallel with concurrency limit
+                semaphore = asyncio.Semaphore(max_concurrent)
+
+                async def fetch_with_limit(coro, info):
+                    async with semaphore:
+                        try:
+                            return await coro, info, None
+                        except Exception as e:
+                            return [], info, e
+
+                results = await asyncio.gather(
+                    *[fetch_with_limit(t, info) for t, info in zip(fetch_tasks, task_info)]
+                )
+
+                # Process results
+                for external_items, (source_name, post), error in results:
+                    if error:
+                        if verbose:
+                            console.print(f"[yellow]Warning: {source_name} failed for {post.slug}: {error}[/yellow]")
+                        continue
+
+                    for item in external_items:
+                        post.external_content.append(
+                            ExternalContentItem(
+                                source=item.source,
+                                title=item.title,
+                                url=item.url,
+                                description=item.description,
+                                relevance=item.relevance,
+                                metadata=item.metadata,
+                            )
+                        )
+
+                progress.remove_task(task)
+                console.print(f" [green]✓[/green] Fetched external content from {', '.join(valid_sources)}")
+            else:
+                console.print(f" [yellow]⚠ Unknown sources: {sources}. Available: {list(PROVIDERS.keys())}[/yellow]")
+
+        # Step 5: Generate HTML
+        task = progress.add_task("Generating HTML...", total=None)
+        generator = HTMLGenerator()
+        generator.generate(graph, base_url, output_html)
+        progress.remove_task(task)
+        console.print(f" [green]✓[/green] Generated {output_html}")
+
+    console.print(f"\n[bold green]Done![/bold green] Open {output_html} in a browser to view.\n")
+
+
+if __name__ == "__main__":
+    main()
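A quick, hedged smoke test for the option wiring above (assumes the package and its dependencies are importable; CliRunner is Click's standard in-process test harness, and --help touches no network):

from click.testing import CliRunner

from tiktokify.cli import main

# Invoke the CLI in-process and check the declared options surface in --help.
runner = CliRunner()
result = runner.invoke(main, ["--help"])
assert result.exit_code == 0
assert "--base-url" in result.output
assert "--max-depth" in result.output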
src/tiktokify/crawler/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""Spider crawler module for fetching website content."""
+
+from .blog_crawler import SpiderCrawler
+
+# Backward compatibility alias
+JekyllBlogCrawler = SpiderCrawler
+
+__all__ = ["SpiderCrawler", "JekyllBlogCrawler"]
src/tiktokify/crawler/blog_crawler.py
ADDED
@@ -0,0 +1,382 @@
+"""Async crawler for Jekyll blogs using crawl4ai."""
+
+import asyncio
+import re
+from datetime import datetime
+from urllib.parse import urljoin, urlparse
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from rich.console import Console
+
+from tiktokify.models import Post, PostMetadata
+
+console = Console()
+
+
+class SpiderCrawler:
+    """Async spider crawler for any website with recursive link discovery."""
+
+    def __init__(
+        self,
+        base_url: str,
+        max_concurrent: int = 5,
+        max_depth: int = 1,
+        verbose: bool = False,
+    ):
+        self.base_url = base_url.rstrip("/")
+        self.max_concurrent = max_concurrent
+        self.max_depth = max_depth
+        self.verbose = verbose
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+        self.base_domain = urlparse(self.base_url).netloc
+
+    async def crawl(self) -> list[Post]:
+        """Main entry point - crawls entire blog and returns posts."""
+        browser_config = BrowserConfig(headless=True, verbose=self.verbose)
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            # Step 1: Discover post URLs
+            if self.verbose:
+                console.print("[dim]Discovering post URLs...[/dim]")
+
+            post_urls = await self._discover_post_urls(crawler)
+
+            if self.verbose:
+                console.print(f"[green]Found {len(post_urls)} posts[/green]")
+
+            # Step 2: Crawl individual posts concurrently
+            posts = await self._crawl_posts(crawler, post_urls)
+
+        return posts
+
+    async def _discover_post_urls(self, crawler: AsyncWebCrawler) -> list[str]:
+        """Discover all content URLs using spider-style recursive crawling.
+
+        Starts from base URL and follows internal links up to max_depth levels.
+        - Depth 1: Only links from seed URL (default)
+        - Depth 2: Links from seed + links from those pages
+        - etc.
+        """
+        discovered: set[str] = set()
+        visited: set[str] = set()
+
+        async def crawl_page(url: str, depth: int) -> set[str]:
+            """Crawl a single page and return new URLs found."""
+            if depth > self.max_depth or url in visited:
+                return set()
+
+            visited.add(url)
+            new_urls: set[str] = set()
+
+            try:
+                async with self.semaphore:
+                    result = await crawler.arun(
+                        url=url,
+                        config=CrawlerRunConfig(wait_until="domcontentloaded"),
+                    )
+
+                if not result.success:
+                    return set()
+
+                # Extract links from crawl4ai
+                if result.links:
+                    for link in result.links.get("internal", []):
+                        href = link.get("href", "") if isinstance(link, dict) else str(link)
+                        if self._is_content_url(href, self.base_domain):
+                            full_url = href if href.startswith("http") else urljoin(url, href)
+                            if full_url not in discovered:
+                                new_urls.add(full_url)
+
+                # Also parse HTML directly as fallback
+                if result.html:
+                    hrefs = re.findall(r'href=["\']([^"\']+)["\']', result.html)
+                    for href in hrefs:
+                        if self._is_content_url(href, self.base_domain):
+                            full_url = href if href.startswith("http") else urljoin(url, href)
+                            if full_url not in discovered:
+                                new_urls.add(full_url)
+
+                discovered.update(new_urls)
+
+                if self.verbose and new_urls:
+                    console.print(f"[dim]Depth {depth}: Found {len(new_urls)} URLs from {url}[/dim]")
+
+            except Exception as e:
+                if self.verbose:
+                    console.print(f"[yellow]Warning: Failed to crawl {url}: {e}[/yellow]")
+
+            return new_urls
+
+        # Start with seed URL
+        if self.verbose:
+            console.print(f"[dim]Spider crawling with max_depth={self.max_depth}[/dim]")
+
+        # Depth 1: crawl seed URL
+        current_urls = await crawl_page(self.base_url, 1)
+
+        # Deeper levels: recursively crawl discovered URLs
+        for depth in range(2, self.max_depth + 1):
+            if not current_urls:
+                break
+
+            if self.verbose:
+                console.print(f"[dim]Crawling depth {depth}: {len(current_urls)} URLs to explore[/dim]")
+
+            # Crawl all current URLs in parallel
+            tasks = [crawl_page(url, depth) for url in current_urls]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Collect new URLs for next depth
+            next_urls: set[str] = set()
+            for result in results:
+                if isinstance(result, set):
+                    next_urls.update(result)
+
+            current_urls = next_urls
+
+        if self.verbose:
+            console.print(f"[dim]Total discovered: {len(discovered)} content URLs[/dim]")
+
+        return list(discovered)
+
+    def _is_content_url(self, href: str, base_domain: str) -> bool:
+        """Check if URL is internal content (not static asset or utility page).
+
+        This is a simple filter - accepts anything that's:
+        1. On the same domain
+        2. Not a static asset (css, js, images, fonts)
+        3. Not a utility link (mailto, javascript, anchor)
+        """
+        if not href:
+            return False
+
+        # Skip anchors, mailto, javascript
+        if href.startswith(("#", "mailto:", "javascript:", "tel:")):
+            return False
+
+        # Skip static assets
+        static_extensions = (
+            ".css", ".js", ".json", ".xml", ".rss", ".atom",
+            ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
+            ".woff", ".woff2", ".ttf", ".eot", ".otf",
+            ".pdf", ".zip", ".tar", ".gz",
+            ".mp3", ".mp4", ".webm", ".ogg",
+        )
+        if any(href.lower().endswith(ext) for ext in static_extensions):
+            return False
+
+        # Check if it's an external link
+        if href.startswith(("http://", "https://")):
+            parsed = urlparse(href)
+            if parsed.netloc != base_domain:
+                return False
+
+        # Skip the base URL itself (index page)
+        path = urlparse(href).path if href.startswith("http") else href
+        if path in ("", "/", "/index.html", "/index.htm"):
+            return False
+
+        return True
+
+    async def _crawl_posts(
+        self, crawler: AsyncWebCrawler, urls: list[str]
+    ) -> list[Post]:
+        """Crawl all post URLs concurrently with semaphore."""
+
+        async def crawl_one(url: str) -> Post | None:
+            async with self.semaphore:
+                return await self._crawl_single_post(crawler, url)
+
+        tasks = [crawl_one(url) for url in urls]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        posts = []
+        for i, result in enumerate(results):
+            if isinstance(result, Post):
+                posts.append(result)
+            elif isinstance(result, Exception) and self.verbose:
+                console.print(f"[yellow]Failed to crawl {urls[i]}: {result}[/yellow]")
+
+        return posts
+
+    async def _crawl_single_post(
+        self, crawler: AsyncWebCrawler, url: str
+    ) -> Post | None:
+        """Crawl and parse a single post."""
+        try:
+            result = await crawler.arun(
+                url=url,
+                config=CrawlerRunConfig(wait_until="domcontentloaded"),
+            )
+
+            if not result.success:
+                return None
+
+            # Extract metadata from HTML
+            metadata = self._extract_metadata(result.html, url)
+
+            # Use markdown for clean text (TF-IDF)
+            content_text = result.markdown or ""
+
+            # Calculate reading time (~200 words/min)
+            word_count = len(content_text.split())
+            reading_time = max(1, word_count // 200)
+
+            # Extract slug from URL
+            slug = self._extract_slug(url)
+
+            return Post(
+                url=url,
+                slug=slug,
+                metadata=metadata,
+                content_text=content_text,
+                content_html=result.html or "",
+                reading_time_minutes=reading_time,
+            )
+        except Exception as e:
+            if self.verbose:
+                console.print(f"[yellow]Error parsing {url}: {e}[/yellow]")
+            return None
+
+    def _extract_metadata(self, html: str, url: str) -> PostMetadata:
+        """Extract metadata from rendered HTML using regex (works with various blog themes)."""
+        # Try multiple patterns for title
+        title = "Untitled"
+        title_patterns = [
+            # Jekyll Clean Blog theme
+            r'<h1[^>]*class="[^"]*post-title[^"]*"[^>]*>([^<]+)</h1>',
+            # WordPress/common patterns
+            r'<h1[^>]*class="[^"]*entry-title[^"]*"[^>]*>([^<]+)</h1>',
+            r'<h1[^>]*class="[^"]*article-title[^"]*"[^>]*>([^<]+)</h1>',
+            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)</h1>',
+            # Meta og:title
+            r'<meta[^>]*property="og:title"[^>]*content="([^"]+)"',
+            r'<meta[^>]*name="title"[^>]*content="([^"]+)"',
+            # Generic h1
+            r"<h1[^>]*>([^<]+)</h1>",
+            # Title tag fallback
+            r"<title>([^<|]+)",
+        ]
+        for pattern in title_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                title = re.sub(r"<[^>]+>", "", match.group(1)).strip()
+                if title:
+                    break
+
+        # Try multiple patterns for date; keep a handle on the fallback value
+        # so the "no date found" check below compares against the same object.
+        date = now = datetime.now()
+        date_patterns = [
+            # Various date formats
+            (r"Posted on (\w+ \d+, \d{4})", "%B %d, %Y"),
+            (r'datetime="(\d{4}-\d{2}-\d{2})', "%Y-%m-%d"),
+            (r"(\d{4}-\d{2}-\d{2})", "%Y-%m-%d"),
+            (r"(\w+ \d{1,2}, \d{4})", "%B %d, %Y"),
+            (r"(\d{1,2} \w+ \d{4})", "%d %B %Y"),
+            (r'<time[^>]*>([^<]+)</time>', None),  # Will try multiple formats
+        ]
+        for pattern, fmt in date_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                date_str = match.group(1).strip()
+                if fmt:
+                    try:
+                        date = datetime.strptime(date_str, fmt)
+                        break
+                    except ValueError:
+                        continue
+                else:
+                    # Try common formats
+                    for try_fmt in ["%B %d, %Y", "%Y-%m-%d", "%d %B %Y", "%b %d, %Y"]:
+                        try:
+                            date = datetime.strptime(date_str, try_fmt)
+                            break
+                        except ValueError:
+                            continue
+
+        # Extract date from URL if not found in HTML
+        if date == now:
+            url_date = re.search(r"(20\d{2})[/-](\d{1,2})[/-](\d{1,2})", url)
+            if url_date:
+                try:
+                    date = datetime(int(url_date.group(1)), int(url_date.group(2)), int(url_date.group(3)))
+                except ValueError:
+                    pass
+
+        # Tags from various patterns
+        tags = []
+        tag_patterns = [
+            r'<span[^>]*class="[^"]*badge[^"]*"[^>]*>([^<]+)</span>',
+            r'<a[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</a>',
+            r'rel="tag"[^>]*>([^<]+)</a>',
+            r'<span[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</span>',
+        ]
+        for pattern in tag_patterns:
+            found = re.findall(pattern, html, re.IGNORECASE)
+            tags.extend([t.strip() for t in found if t.strip()])
+        tags = list(set(tags))[:10]  # Dedupe and limit
+
+        # Category from URL
+        path = urlparse(url).path
+        parts = [p for p in path.strip("/").split("/") if p and not re.match(r"^\d+$", p)]
+        # Skip date-like parts and get first meaningful segment
+        categories = []
+        for part in parts[:-1]:  # Exclude last part (the slug)
+            if not re.match(r"^20\d{2}$", part) and part not in ["blog", "posts", "articles"]:
+                categories.append(part)
+                break
+
+        # Header image from various patterns
+        header_img = None
+        img_patterns = [
+            r'class="[^"]*intro-header[^"]*"[^>]*style="[^"]*url\([\'"]?([^\'")\s]+)',
+            r'class="[^"]*featured[^"]*"[^>]*src="([^"]+)"',
+            r'<meta[^>]*property="og:image"[^>]*content="([^"]+)"',
+            r'class="[^"]*post-image[^"]*"[^>]*src="([^"]+)"',
+            r'class="[^"]*hero[^"]*"[^>]*src="([^"]+)"',
+        ]
+        for pattern in img_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                header_img = match.group(1)
+                break
+
+        # Subtitle/description from various patterns
+        subtitle = None
+        subtitle_patterns = [
+            r'<span[^>]*class="[^"]*subheading[^"]*"[^>]*>([^<]+)</span>',
+            r'<p[^>]*class="[^"]*subtitle[^"]*"[^>]*>([^<]+)</p>',
+            r'<meta[^>]*name="description"[^>]*content="([^"]+)"',
+            r'<meta[^>]*property="og:description"[^>]*content="([^"]+)"',
+        ]
+        for pattern in subtitle_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                subtitle = match.group(1).strip()
+                if len(subtitle) > 200:
+                    subtitle = subtitle[:197] + "..."
+                break
+
+        return PostMetadata(
+            title=title,
+            date=date,
+            categories=categories,
+            tags=tags,
+            subtitle=subtitle,
+            header_img=header_img,
+        )
+
+    def _extract_slug(self, url: str) -> str:
+        """Extract slug from post URL."""
+        path = urlparse(url).path
+        # Remove trailing slash and get last meaningful part
+        path = path.rstrip("/")
+        if not path:
+            return "index"
+        slug = path.rsplit("/", 1)[-1]
+        # Remove common extensions
+        for ext in (".html", ".htm", ".php", ".aspx"):
+            if slug.endswith(ext):
+                slug = slug[:-len(ext)]
+                break
+        return slug or "page"
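The _is_content_url filter above is easy to exercise standalone; a hedged sketch (crawl4ai must be installed since this module imports it at the top, but no browser or network is started):

from tiktokify.crawler.blog_crawler import SpiderCrawler

crawler = SpiderCrawler(base_url="https://example.com")
# Static assets and utility links are rejected
assert not crawler._is_content_url("/assets/main.css", "example.com")
assert not crawler._is_content_url("mailto:me@example.com", "example.com")
# Absolute links to another domain are rejected
assert not crawler._is_content_url("https://other.com/post/", "example.com")
# A dated post path on the same domain passes
assert crawler._is_content_url("/2023/05/some-post/", "example.com")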
src/tiktokify/enrichment/__init__.py
ADDED
@@ -0,0 +1,20 @@
+"""Enrichment module for LLM-based post enrichment."""
+
+from .base import ContentProvider, ExternalContent
+from .llm_enricher import PostEnricher
+from .providers import (
+    HackerNewsProvider,
+    HNFrontPageProvider,
+    LinkedContentProvider,
+    WikipediaProvider,
+)
+
+__all__ = [
+    "PostEnricher",
+    "ContentProvider",
+    "ExternalContent",
+    "WikipediaProvider",
+    "HackerNewsProvider",
+    "HNFrontPageProvider",
+    "LinkedContentProvider",
+]
src/tiktokify/enrichment/base.py
ADDED
@@ -0,0 +1,66 @@
+"""Base classes for content providers."""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from pydantic import BaseModel, Field, HttpUrl
+
+from tiktokify.models import Post
+
+
+class ExternalContent(BaseModel):
+    """A piece of external content from any source."""
+
+    source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
+    title: str
+    url: HttpUrl
+    description: str = Field(default="", description="Brief description or excerpt")
+    relevance: str = Field(default="", description="Why this is relevant to the post")
+    metadata: dict = Field(default_factory=dict, description="Source-specific metadata")
+
+
+class ContentProvider(ABC):
+    """Abstract base class for external content providers.
+
+    To add a new source:
+    1. Create a new file (e.g., hackernews.py)
+    2. Subclass ContentProvider
+    3. Implement source_type and fetch_for_post
+    4. Register it in the PROVIDERS map in cli.py
+    """
+
+    def __init__(self, max_items: int = 3, verbose: bool = False):
+        self.max_items = max_items
+        self.verbose = verbose
+
+    @property
+    @abstractmethod
+    def source_type(self) -> str:
+        """Unique identifier for this source (e.g., 'wikipedia', 'hackernews')."""
+        pass
+
+    @abstractmethod
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Fetch relevant external content for a blog post.
+
+        Args:
+            post: The blog post to find related content for
+
+        Returns:
+            List of ExternalContent items (up to max_items)
+        """
+        pass
+
+    async def fetch_for_posts(self, posts: list[Post]) -> dict[str, list[ExternalContent]]:
+        """Fetch content for multiple posts.
+
+        Default implementation calls fetch_for_post sequentially.
+        Override for batch optimization.
+
+        Returns:
+            Dict mapping post slug to list of ExternalContent
+        """
+        results = {}
+        for post in posts:
+            results[post.slug] = await self.fetch_for_post(post)
+        return results
src/tiktokify/enrichment/llm_enricher.py
ADDED
@@ -0,0 +1,180 @@
+"""LLM-based post enrichment using litellm.
+
+This module uses LLM to:
+1. Generate key points/takeaways for each post
+2. Suggest relevant Wikipedia articles
+
+The actual Wikipedia extract fetching is done by providers/wikipedia.py
+"""
+
+import asyncio
+import json
+
+import litellm
+from pydantic import ValidationError
+from rich.console import Console
+
+from tiktokify.models import Post, WikipediaSuggestion
+
+console = Console()
+
+# Disable litellm's verbose logging
+litellm.suppress_debug_info = True
+
+
+class PostEnricher:
+    """Enrich posts with key points and Wikipedia suggestions using LLM."""
+
+    def __init__(
+        self,
+        model: str = "gpt-4o-mini",
+        max_key_points: int = 5,
+        max_wikipedia: int = 3,
+        max_concurrent: int = 3,
+        verbose: bool = False,
+    ):
+        self.model = model
+        self.max_key_points = max_key_points
+        self.max_wikipedia = max_wikipedia
+        self.max_concurrent = max_concurrent
+        self.verbose = verbose
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def enrich_post(self, post: Post) -> None:
+        """Enrich a single post with key points and Wikipedia suggestions."""
+        prompt = self._build_prompt(post)
+
+        try:
+            # Calculate tokens needed: ~50 tokens per key point, ~100 per wiki suggestion
+            estimated_tokens = (self.max_key_points * 50) + (self.max_wikipedia * 100) + 200
+            max_tokens = max(1000, min(estimated_tokens, 4000))
+
+            response = await litellm.acompletion(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3,
+                max_tokens=max_tokens,
+            )
+
+            content = response.choices[0].message.content
+            key_points, wikipedia = self._parse_response(content)
+
+            # Fetch Wikipedia extracts for each suggestion
+            wikipedia_with_extracts = await self._fetch_wiki_extracts(wikipedia)
+
+            post.key_points = key_points
+            post.wikipedia_suggestions = wikipedia_with_extracts
+
+        except Exception as e:
+            if self.verbose:
+                console.print(
+                    f"[yellow]Warning: LLM call failed for {post.slug}: {e}[/yellow]"
+                )
+
+    async def _fetch_wiki_extracts(
+        self, suggestions: list[WikipediaSuggestion]
+    ) -> list[WikipediaSuggestion]:
+        """Fetch Wikipedia extracts for all suggestions concurrently."""
+        from tiktokify.enrichment.providers.wikipedia import WikipediaProvider
+
+        provider = WikipediaProvider(max_items=len(suggestions), verbose=self.verbose)
+
+        async def fetch_one(suggestion: WikipediaSuggestion) -> WikipediaSuggestion:
+            extract = await provider._fetch_extract(
+                provider._extract_title_from_url(str(suggestion.url)) or suggestion.title
+            )
+            return WikipediaSuggestion(
+                title=suggestion.title,
+                url=suggestion.url,
+                relevance=suggestion.relevance,
+                extract=extract,
+            )
+
+        tasks = [fetch_one(s) for s in suggestions]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        return [r for r in results if isinstance(r, WikipediaSuggestion)]
+
+    def _build_prompt(self, post: Post) -> str:
+        """Build LLM prompt for key points and Wikipedia suggestions."""
+        content_excerpt = post.content_text[:2000] if post.content_text else ""
+
+        return f"""Analyze this blog post and provide:
+1. {self.max_key_points} key points/takeaways (concise bullet points)
+2. {self.max_wikipedia} relevant Wikipedia articles for further reading
+
+Title: {post.metadata.title}
+Subtitle: {post.metadata.subtitle or "N/A"}
+Categories: {', '.join(post.metadata.categories)}
+Tags: {', '.join(post.metadata.tags)}
+
+Content:
+{content_excerpt}
+
+Return ONLY valid JSON with this exact structure:
+{{
+  "keyPoints": ["point 1", "point 2", ...],
+  "wikipedia": [
+    {{"title": "Article Title", "url": "https://en.wikipedia.org/wiki/...", "relevance": "Why it's relevant"}}
+  ]
+}}
+
+Guidelines:
+- Key points should be insightful takeaways, not just summaries
+- Each key point should be 1-2 sentences max
+- Wikipedia URLs must be valid (use underscores for spaces)
+- Return ONLY the JSON, no markdown formatting"""
+
+    def _parse_response(self, content: str) -> tuple[list[str], list[WikipediaSuggestion]]:
+        """Parse LLM response into key points and Wikipedia suggestions."""
+        # Clean up response - remove markdown code blocks if present
+        content = content.strip()
+        if content.startswith("```"):
+            lines = content.split("\n")
+            content = "\n".join(
+                line for line in lines if not line.startswith("```")
+            )
+
+        key_points: list[str] = []
+        wikipedia: list[WikipediaSuggestion] = []
+
+        try:
+            data = json.loads(content)
+
+            # Parse key points
+            if "keyPoints" in data and isinstance(data["keyPoints"], list):
+                key_points = [str(p) for p in data["keyPoints"] if p]
+
+            # Parse Wikipedia suggestions
+            if "wikipedia" in data and isinstance(data["wikipedia"], list):
+                for item in data["wikipedia"]:
+                    try:
+                        suggestion = WikipediaSuggestion(
+                            title=item.get("title", ""),
+                            url=item.get("url", ""),
+                            relevance=item.get("relevance", ""),
+                            extract="",  # Will be filled later
+                        )
+                        wikipedia.append(suggestion)
+                    except ValidationError:
+                        continue
+
+        except json.JSONDecodeError as e:
+            if self.verbose:
+                console.print(f"[yellow]JSON parse error: {e}[/yellow]")
+
+        return key_points, wikipedia
+
+    async def enrich_posts(self, posts: list[Post]) -> None:
+        """Enrich all posts concurrently."""
+
+        async def enrich_one(post: Post) -> None:
+            async with self.semaphore:
+                await self.enrich_post(post)
+
+        tasks = [enrich_one(post) for post in posts]
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+
+# Backwards compatibility alias
+WikipediaSuggester = PostEnricher
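A hand-written payload showing the JSON shape _parse_response expects, exercising the fenced-code cleanup path (litellm must be importable since the module configures it at import time; no model is called, and the payload content is illustrative):

from tiktokify.enrichment.llm_enricher import PostEnricher

enricher = PostEnricher()
raw = """```json
{"keyPoints": ["Illustrative takeaway."],
 "wikipedia": [{"title": "Cosine similarity",
   "url": "https://en.wikipedia.org/wiki/Cosine_similarity",
   "relevance": "Underlies TF-IDF recommendations"}]}
```"""
key_points, wiki = enricher._parse_response(raw)
assert key_points == ["Illustrative takeaway."]
assert wiki[0].title == "Cosine similarity"
assert wiki[0].extract == ""  # filled later by _fetch_wiki_extracts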
src/tiktokify/enrichment/providers/__init__.py
ADDED
@@ -0,0 +1,12 @@
+"""Content providers for external sources."""
+
+from .hackernews import HackerNewsProvider, HNFrontPageProvider
+from .links import LinkedContentProvider
+from .wikipedia import WikipediaProvider
+
+__all__ = [
+    "WikipediaProvider",
+    "HackerNewsProvider",
+    "HNFrontPageProvider",
+    "LinkedContentProvider",
+]
src/tiktokify/enrichment/providers/hackernews.py
ADDED
@@ -0,0 +1,242 @@
+"""Hacker News content providers.
+
+Provides two providers:
+- HackerNewsProvider: Keyword-based search for stories related to post topics
+- HNFrontPageProvider: Current front page stories for general interest
+"""
+
+import asyncio
+import re
+
+import httpx
+
+from tiktokify.enrichment.base import ContentProvider, ExternalContent
+from tiktokify.models import Post
+
+
+async def fetch_article_excerpt(url: str, max_chars: int = 800) -> str:
+    """Fetch and extract text excerpt from an article URL."""
+    if not url:
+        return ""
+
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get(
+                url,
+                headers={
+                    "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
+                    "Accept": "text/html,application/xhtml+xml",
+                },
+                timeout=10.0,
+                follow_redirects=True,
+            )
+
+            if response.status_code != 200:
+                return ""
+
+            html = response.text
+
+            # Remove script, style, nav, header, footer tags
+            html = re.sub(r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+
+            # Extract text from paragraph tags (most content is in <p>)
+            paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", html, flags=re.DOTALL | re.IGNORECASE)
+
+            # Clean HTML tags from extracted text
+            text_parts = []
+            for p in paragraphs:
+                clean = re.sub(r"<[^>]+>", " ", p)
+                clean = re.sub(r"\s+", " ", clean).strip()
+                if len(clean) > 50:  # Skip very short paragraphs
+                    text_parts.append(clean)
+
+            if not text_parts:
+                # Fallback: extract all text
+                text = re.sub(r"<[^>]+>", " ", html)
+                text = re.sub(r"\s+", " ", text).strip()
+                return text[:max_chars] + "..." if len(text) > max_chars else text
+
+            excerpt = " ".join(text_parts)
+            if len(excerpt) > max_chars:
+                excerpt = excerpt[:max_chars].rsplit(" ", 1)[0] + "..."
+            return excerpt
+
+    except Exception:
+        return ""
+
+
+class HackerNewsProvider(ContentProvider):
+    """Fetch relevant Hacker News discussions for blog posts.
+
+    Uses the Algolia HN Search API to find related stories by keyword.
+    """
+
+    HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search"
+
+    @property
+    def source_type(self) -> str:
+        return "hackernews"
+
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Search HN for stories related to the post's topics."""
+        # Build search query from post metadata
+        query_parts = []
+
+        # Use tags (most specific)
+        if post.metadata.tags:
+            query_parts.extend(post.metadata.tags[:3])
+
+        # Add key terms from title
+        title_words = [
+            w for w in post.metadata.title.split()
+            if len(w) > 4 and w.lower() not in {"about", "using", "with", "from", "that", "this", "what", "when", "where", "which"}
+        ]
+        query_parts.extend(title_words[:2])
+
+        if not query_parts:
+            return []
+
+        query = " ".join(query_parts)
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    self.HN_SEARCH_URL,
+                    params={
+                        "query": query,
+                        "tags": "story",
+                        "hitsPerPage": self.max_items * 2,  # Fetch extra for filtering
+                    },
+                    timeout=10.0,
+                )
+
+                if response.status_code != 200:
+                    return []
+
+                data = response.json()
+                hits = data.get("hits", [])
+
+                # Prepare hits for parallel fetching
+                selected_hits = hits[: self.max_items]
+                story_urls = [hit.get("url", "") for hit in selected_hits]
+
+                # Fetch all article excerpts in parallel
+                excerpts = await asyncio.gather(
+                    *[fetch_article_excerpt(url) for url in story_urls],
+                    return_exceptions=True,
+                )
+
+                results = []
+                for hit, excerpt in zip(selected_hits, excerpts):
+                    story_id = hit.get("objectID", "")
+                    hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                    title = hit.get("title", "")
+                    points = hit.get("points", 0)
+                    num_comments = hit.get("num_comments", 0)
+                    author = hit.get("author", "")
+                    story_url = hit.get("url", "")
+
+                    # Handle exceptions from parallel fetch
+                    if isinstance(excerpt, Exception) or not excerpt:
+                        excerpt = f"{points} points · {num_comments} comments"
+
+                    results.append(
+                        ExternalContent(
+                            source=self.source_type,
+                            title=title,
+                            url=hn_url,
+                            description=excerpt,
+                            relevance=f"Found via search: {query}",
+                            metadata={
+                                "points": points,
+                                "num_comments": num_comments,
+                                "author": author,
+                                "story_url": story_url,
+                            },
+                        )
+                    )
+
+                return results
+
+        except Exception:
+            return []
+
+
+class HNFrontPageProvider(ContentProvider):
+    """Fetch current Hacker News front page stories.
+
+    Uses the Algolia HN API to get stories currently on the front page.
+    Good for adding general tech interest content to any blog.
+    """
+
+    HN_FRONT_PAGE_URL = "https://hn.algolia.com/api/v1/search"
+
+    @property
+    def source_type(self) -> str:
+        return "hn-frontpage"
+
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Fetch current front page stories (post-independent)."""
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    self.HN_FRONT_PAGE_URL,
+                    params={
+                        "tags": "front_page",
+                        "hitsPerPage": self.max_items,
+                    },
+                    timeout=10.0,
+                )
+
+                if response.status_code != 200:
+                    return []
+
+                data = response.json()
+                hits = data.get("hits", [])
+
+                # Prepare hits for parallel fetching
+                selected_hits = hits[: self.max_items]
+                story_urls = [hit.get("url", "") for hit in selected_hits]
+
+                # Fetch all article excerpts in parallel
+                excerpts = await asyncio.gather(
+                    *[fetch_article_excerpt(url) for url in story_urls],
+                    return_exceptions=True,
+                )
+
+                results = []
+                for hit, excerpt in zip(selected_hits, excerpts):
+                    story_id = hit.get("objectID", "")
+                    hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                    title = hit.get("title", "")
+                    points = hit.get("points", 0)
+                    num_comments = hit.get("num_comments", 0)
+                    author = hit.get("author", "")
+                    story_url = hit.get("url", "")
+
+                    # Handle exceptions from parallel fetch
+                    if isinstance(excerpt, Exception) or not excerpt:
+                        excerpt = f"{points} points · {num_comments} comments"
+
+                    results.append(
+                        ExternalContent(
+                            source=self.source_type,
+                            title=title,
+                            url=hn_url,
+                            description=excerpt,
+                            relevance="Currently on HN front page",
+                            metadata={
+                                "points": points,
+                                "num_comments": num_comments,
+                                "author": author,
+                                "story_url": story_url,
+                            },
+                        )
+                    )
+
+                return results
+
+        except Exception:
+            return []
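Both providers above hit the same Algolia endpoint, so it can be poked standalone; a hedged sketch with an arbitrary query string:

import asyncio

import httpx


async def demo() -> None:
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            "https://hn.algolia.com/api/v1/search",
            params={"query": "static site generator", "tags": "story", "hitsPerPage": 3},
            timeout=10.0,
        )
    for hit in resp.json().get("hits", []):
        # Same discussion-URL scheme the providers build from objectID
        print(hit.get("title"), f"https://news.ycombinator.com/item?id={hit.get('objectID')}")


asyncio.run(demo())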
src/tiktokify/enrichment/providers/links.py
ADDED
|
@@ -0,0 +1,210 @@
"""Link extractor provider for crawling external links from blog posts."""

import asyncio
import re
from urllib.parse import urljoin, urlparse

import httpx

from tiktokify.enrichment.base import ContentProvider, ExternalContent
from tiktokify.models import Post


async def fetch_link_metadata(url: str, max_excerpt_chars: int = 600) -> tuple[str, str]:
    """Fetch title and excerpt from a URL.

    Returns (title, excerpt) tuple.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
                    "Accept": "text/html,application/xhtml+xml",
                },
                timeout=10.0,
                follow_redirects=True,
            )

            if response.status_code != 200:
                return "", ""

            html = response.text

            # Extract title
            title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
            title = ""
            if title_match:
                title = re.sub(r"<[^>]+>", "", title_match.group(1))
                title = re.sub(r"\s+", " ", title).strip()

            # Try meta description first
            meta_desc = re.search(
                r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
                html,
                re.IGNORECASE,
            )
            if meta_desc:
                excerpt = meta_desc.group(1).strip()
                return title, excerpt

            # Remove script, style, nav, header, footer tags
            clean_html = re.sub(
                r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>",
                "",
                html,
                flags=re.DOTALL | re.IGNORECASE,
            )

            # Extract text from paragraph tags
            paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", clean_html, flags=re.DOTALL | re.IGNORECASE)

            text_parts = []
            for p in paragraphs:
                clean = re.sub(r"<[^>]+>", " ", p)
                clean = re.sub(r"\s+", " ", clean).strip()
                if len(clean) > 50:
                    text_parts.append(clean)

            excerpt = " ".join(text_parts)
            if len(excerpt) > max_excerpt_chars:
                excerpt = excerpt[:max_excerpt_chars].rsplit(" ", 1)[0] + "..."

            return title, excerpt

    except Exception:
        return "", ""


class LinkedContentProvider(ContentProvider):
    """Extract and crawl external links from blog post content.

    Finds links within the blog post HTML and fetches their content,
    creating a "spider" of related content from the post's references.
    """

    # Domains to skip (social media, generic sites, etc.)
    SKIP_DOMAINS = {
        "twitter.com",
        "x.com",
        "facebook.com",
        "instagram.com",
        "linkedin.com",
        "youtube.com",
        "youtu.be",
        "github.com",
        "gist.github.com",
        "reddit.com",
        "news.ycombinator.com",
        "google.com",
        "amazon.com",
        "wikipedia.org",  # Already have Wikipedia provider
        "fonts.googleapis.com",
        "cdn.jsdelivr.net",
        "unpkg.com",
        "cloudflare.com",
    }

    @property
    def source_type(self) -> str:
        return "linked"

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """Extract external links from HTML content."""
        # Find all href links
        links = re.findall(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE)

        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        external_links = []
        seen = set()

        for link in links:
            # Skip anchor links, mailto, javascript, etc.
            if link.startswith(("#", "mailto:", "javascript:", "tel:")):
                continue

            # Resolve relative URLs
            if link.startswith("/"):
                link = urljoin(base_url, link)
            elif not link.startswith(("http://", "https://")):
                continue

            # Parse and validate
            parsed = urlparse(link)
            domain = parsed.netloc.lower()

            # Skip internal links
            if domain == base_domain or domain.endswith(f".{base_domain}"):
                continue

            # Skip blocked domains
            if any(skip in domain for skip in self.SKIP_DOMAINS):
                continue

            # Skip non-http(s) links
            if parsed.scheme not in ("http", "https"):
                continue

            # Skip duplicates
            normalized = f"{parsed.scheme}://{domain}{parsed.path}"
            if normalized in seen:
                continue
            seen.add(normalized)

            external_links.append(link)

        return external_links

    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
        """Extract links from post content and fetch their metadata."""
        if not post.content_html:
            return []

        # Extract external links
        links = self._extract_links(post.content_html, post.url)

        if not links:
            return []

        # Limit to max_items and fetch all in parallel
        selected_links = links[: self.max_items]

        metadata_results = await asyncio.gather(
            *[fetch_link_metadata(link) for link in selected_links],
            return_exceptions=True,
        )

        results = []
        for link, meta in zip(selected_links, metadata_results):
            # Handle exceptions
            if isinstance(meta, Exception):
                continue

            title, excerpt = meta

            if not title and not excerpt:
                continue

            # Use URL domain as fallback title
            if not title:
                parsed = urlparse(link)
                title = parsed.netloc

            results.append(
                ExternalContent(
                    source=self.source_type,
                    title=title,
                    url=link,
                    description=excerpt,
                    relevance=f"Referenced in: {post.metadata.title}",
                    metadata={
                        "source_post_slug": post.slug,
                        "link_type": "reference",
                    },
                )
            )

        return results
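
The link filtering in _extract_links is pure string work, so its behavior can be illustrated without the network. A sketch with hypothetical HTML, assuming the ContentProvider base class allows no-argument construction (adjust if base.py requires parameters); same-domain links, SKIP_DOMAINS hits, and anchors are dropped, and relative paths resolve against the post URL:

from tiktokify.enrichment.providers.links import LinkedContentProvider

provider = LinkedContentProvider()  # assumed constructible with defaults
html = (
    '<a href="#top">anchor</a>'
    '<a href="/about">relative, resolves to the blog itself</a>'
    '<a href="https://twitter.com/someone">blocked domain</a>'
    '<a href="https://example.org/paper">kept external link</a>'
)
links = provider._extract_links(html, "https://myblog.dev/posts/hello")
print(links)  # expected: ['https://example.org/paper']
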
src/tiktokify/enrichment/providers/wikipedia.py
ADDED
@@ -0,0 +1,78 @@
"""Wikipedia content provider."""

from urllib.parse import unquote, urlparse

import httpx

from tiktokify.enrichment.base import ContentProvider, ExternalContent
from tiktokify.models import Post


class WikipediaProvider(ContentProvider):
    """Fetch relevant Wikipedia articles for blog posts.

    Uses Wikipedia REST API to fetch article summaries.
    Requires LLM to first suggest relevant articles (see PostEnricher).
    """

    @property
    def source_type(self) -> str:
        return "wikipedia"

    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
        """Fetch Wikipedia extracts for pre-suggested articles."""
        results = []

        for suggestion in post.wikipedia_suggestions[: self.max_items]:
            title = self._extract_title_from_url(str(suggestion.url)) or suggestion.title
            extract = await self._fetch_extract(title)

            results.append(
                ExternalContent(
                    source=self.source_type,
                    title=suggestion.title,
                    url=suggestion.url,
                    description=extract,
                    relevance=suggestion.relevance,
                    metadata={"extract": extract},
                )
            )

        return results

    async def _fetch_extract(self, title: str, max_chars: int = 1500) -> str:
        """Fetch article extract from Wikipedia API."""
        title = title.strip()
        url = "https://en.wikipedia.org/api/rest_v1/page/summary/" + title.replace(" ", "_")

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    url,
                    headers={"User-Agent": "TikTokify/1.0"},
                    timeout=10.0,
                    follow_redirects=True,
                )

                if response.status_code == 200:
                    data = response.json()
                    extract = data.get("extract", "")
                    if len(extract) > max_chars:
                        extract = extract[:max_chars].rsplit(" ", 1)[0] + "..."
                    return extract
        except Exception:
            pass

        return ""

    def _extract_title_from_url(self, url: str) -> str:
        """Extract Wikipedia article title from URL."""
        parsed = urlparse(url)
        if "wikipedia.org" in parsed.netloc:
            path = parsed.path
            if path.startswith("/wiki/"):
                title = path[6:]
                title = unquote(title)
                title = title.replace("_", " ")
                return title
        return ""
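
The URL-to-title mapping is also pure string handling, so a quick offline sketch (again assuming the base class permits default construction) shows what it does: strip the "/wiki/" prefix, undo percent-encoding, and turn underscores into spaces:

from tiktokify.enrichment.providers.wikipedia import WikipediaProvider

provider = WikipediaProvider()  # assumed constructible with defaults
print(provider._extract_title_from_url("https://en.wikipedia.org/wiki/TF%E2%80%93IDF"))
# -> "TF–IDF"
print(provider._extract_title_from_url("https://example.com/wiki/Nope"))
# -> "" (host is not wikipedia.org)
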
src/tiktokify/generator/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""HTML generator module."""

from .html_generator import HTMLGenerator

__all__ = ["HTMLGenerator"]
src/tiktokify/generator/html_generator.py
ADDED
@@ -0,0 +1,52 @@
"""HTML generator for TikTok-style swipe UI."""

import json
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

from tiktokify.models import RecommendationGraph


class HTMLGenerator:
    """Generate standalone HTML with embedded data and swipe UI."""

    def __init__(self, template_dir: Path | None = None):
        if template_dir is None:
            template_dir = Path(__file__).parent / "templates"

        self.env = Environment(
            loader=FileSystemLoader(template_dir),
            autoescape=True,
        )

    def generate(
        self,
        graph: RecommendationGraph,
        base_url: str,
        output_path: Path,
    ) -> None:
        """Generate HTML file with embedded recommendation data."""
        template = self.env.get_template("swipe.html.jinja2")

        # Prepare data for embedding
        graph_data = graph.to_json_for_embed()
        graph_json = json.dumps(graph_data, indent=2)

        # Sort posts by date for initial list
        sorted_posts = sorted(
            graph.posts.values(),
            key=lambda p: p.metadata.date,
            reverse=True,
        )
        post_slugs = [p.slug for p in sorted_posts]

        html = template.render(
            base_url=base_url.rstrip("/"),
            graph_json=graph_json,
            post_slugs_json=json.dumps(post_slugs),
            post_count=len(sorted_posts),
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(html)
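
A sketch of the expected call site, assuming a RecommendationGraph has already been built by the crawler and recommender stages (the graph value here is hypothetical):

from pathlib import Path

from tiktokify.generator import HTMLGenerator
from tiktokify.models import RecommendationGraph

def write_site(graph: RecommendationGraph) -> None:
    generator = HTMLGenerator()  # uses the bundled templates/ directory
    generator.generate(
        graph=graph,
        base_url="https://myblog.dev/",  # trailing slash is stripped before rendering
        output_path=Path("output/index.html"),  # parent dirs are created as needed
    )
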
src/tiktokify/generator/templates/swipe.html.jinja2
ADDED
@@ -0,0 +1,1028 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
    <title>TikTokify - Swipe to Discover</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        html, body {
            height: 100%;
            overflow: hidden;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: #000;
            color: #fff;
        }

        .swipe-container {
            height: 100vh;
            overflow-y: scroll;
            scroll-snap-type: y mandatory;
            -webkit-overflow-scrolling: touch;
        }

        .card {
            height: 100vh;
            scroll-snap-align: start;
            display: flex;
            flex-direction: column;
            position: relative;
            background-size: cover;
            background-position: center;
        }

        .card-overlay {
            position: absolute;
            inset: 0;
            background: linear-gradient(
                to bottom,
                rgba(0,0,0,0.3) 0%,
                rgba(0,0,0,0.1) 30%,
                rgba(0,0,0,0.6) 60%,
                rgba(0,0,0,0.95) 100%
            );
        }

        .card-content {
            position: relative;
            z-index: 1;
            height: 100%;
            display: flex;
            flex-direction: column;
            justify-content: flex-end;
            padding: 20px;
            padding-bottom: 100px;
            overflow-y: auto;
        }

        .card-type {
            display: inline-flex;
            align-items: center;
            gap: 6px;
            background: rgba(255,255,255,0.15);
            padding: 4px 10px;
            border-radius: 20px;
            font-size: 10px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            margin-bottom: 12px;
            width: fit-content;
        }

        .card-type.wiki {
            background: rgba(77, 163, 255, 0.3);
        }

        .card-meta {
            display: flex;
            flex-wrap: wrap;
            gap: 8px;
            margin-bottom: 12px;
        }

        .card-category {
            background: rgba(255,255,255,0.2);
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 11px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }

        .card-title {
            font-size: 24px;
            font-weight: 700;
            margin-bottom: 10px;
            line-height: 1.25;
            text-shadow: 0 2px 4px rgba(0,0,0,0.3);
        }

        .card-subtitle {
            font-size: 14px;
            opacity: 0.85;
            margin-bottom: 12px;
            line-height: 1.5;
        }

        /* Key points */
        .key-points {
            background: rgba(255,255,255,0.08);
            border-radius: 12px;
            padding: 14px;
            margin-bottom: 14px;
        }

        .key-points-title {
            font-size: 11px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            opacity: 0.6;
            margin-bottom: 10px;
        }

        .key-point {
            display: flex;
            gap: 10px;
            margin-bottom: 8px;
            font-size: 13px;
            line-height: 1.5;
        }

        .key-point:last-child {
            margin-bottom: 0;
        }

        .key-point-bullet {
            color: #4da3ff;
            flex-shrink: 0;
        }

        .card-tags {
            display: flex;
            flex-wrap: wrap;
            gap: 6px;
            margin-bottom: 12px;
        }

        .card-tag {
            background: rgba(255,255,255,0.12);
            padding: 4px 10px;
            border-radius: 4px;
            font-size: 11px;
            opacity: 0.9;
        }

        .card-date {
            font-size: 12px;
            opacity: 0.6;
        }

        /* Wikipedia card specific */
        .wiki-card {
            background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
        }

        .wiki-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(15, 52, 96, 0.3) 0%,
                rgba(15, 52, 96, 0.6) 50%,
                rgba(10, 10, 30, 0.95) 100%
            );
        }

        .wiki-excerpt {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.95;
            margin-bottom: 16px;
            max-height: 40vh;
            overflow-y: auto;
        }

        /* Wikipedia card - center content vertically */
        .wiki-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        /* HackerNews card */
        .hn-card {
            background: linear-gradient(135deg, #1a0a00 0%, #ff6600 100%);
        }

        .hn-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(26, 10, 0, 0.4) 0%,
                rgba(26, 10, 0, 0.6) 50%,
                rgba(10, 5, 0, 0.95) 100%
            );
        }

        .hn-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        .card-type.hn {
            background: rgba(255, 102, 0, 0.4);
        }

        .hn-meta {
            display: flex;
            gap: 16px;
            margin-bottom: 16px;
            font-size: 14px;
            opacity: 0.9;
        }

        .hn-meta-item {
            display: flex;
            align-items: center;
            gap: 6px;
        }

        .hn-description {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.9;
            margin-bottom: 16px;
            max-height: 35vh;
            overflow-y: auto;
        }

        .hn-link {
            display: inline-flex;
            align-items: center;
            gap: 8px;
            background: rgba(255, 102, 0, 0.3);
            padding: 10px 16px;
            border-radius: 8px;
            font-size: 13px;
            margin-top: 12px;
            opacity: 0.9;
        }

        .wiki-relevance {
            font-size: 13px;
            line-height: 1.5;
            opacity: 0.7;
            margin-bottom: 12px;
            padding-left: 12px;
            border-left: 2px solid rgba(77, 163, 255, 0.5);
        }

        .wiki-source {
            display: inline-flex;
            align-items: center;
            gap: 6px;
            font-size: 12px;
            opacity: 0.6;
        }

        /* Linked content card (tertiary crawled links) */
        .linked-card {
            background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
        }

        .linked-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(26, 26, 26, 0.3) 0%,
                rgba(26, 26, 26, 0.6) 50%,
                rgba(10, 20, 15, 0.95) 100%
            );
        }

        .linked-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        .card-type.linked {
            background: rgba(46, 204, 113, 0.3);
        }

        .linked-description {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.9;
            margin-bottom: 16px;
            max-height: 35vh;
            overflow-y: auto;
        }

        .rec-item.linked-rec {
            border-left: 3px solid rgba(46, 204, 113, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.linked-thumb {
            background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        /* Action buttons */
        .card-actions {
            position: fixed;
            right: 12px;
            bottom: 120px;
            display: flex;
            flex-direction: column;
            gap: 14px;
            z-index: 10;
        }

        .action-btn {
            width: 46px;
            height: 46px;
            border-radius: 50%;
            background: rgba(255,255,255,0.15);
            backdrop-filter: blur(10px);
            border: none;
            color: #fff;
            font-size: 18px;
            cursor: pointer;
            display: flex;
            align-items: center;
            justify-content: center;
            transition: transform 0.2s, background 0.2s;
        }

        .action-btn:hover {
            transform: scale(1.1);
            background: rgba(255,255,255,0.25);
        }

        .action-btn:active {
            transform: scale(0.95);
        }

        .read-btn {
            background: rgba(255,71,87,0.8);
        }

        .read-btn:hover {
            background: rgba(255,71,87,1);
        }

        /* Panels */
        .panel {
            position: fixed;
            bottom: 0;
            left: 0;
            right: 0;
            background: rgba(20,20,20,0.98);
            backdrop-filter: blur(20px);
            border-radius: 20px 20px 0 0;
            padding: 20px;
            transform: translateY(100%);
            transition: transform 0.3s ease;
            z-index: 20;
            max-height: 60vh;
            overflow-y: auto;
        }

        .panel.open {
            transform: translateY(0);
        }

        .panel-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 16px;
        }

        .panel h3 {
            font-size: 16px;
            font-weight: 600;
        }

        .panel-close {
            background: none;
            border: none;
            color: #fff;
            font-size: 24px;
            cursor: pointer;
            opacity: 0.6;
        }

        .panel-close:hover {
            opacity: 1;
        }

        /* Recommendation items */
        .rec-item {
            display: flex;
            gap: 14px;
            padding: 12px 0;
            border-bottom: 1px solid rgba(255,255,255,0.08);
            cursor: pointer;
            transition: background 0.2s;
        }

        .rec-item:hover {
            background: rgba(255,255,255,0.05);
        }

        .rec-thumb {
            width: 64px;
            height: 64px;
            border-radius: 8px;
            background-size: cover;
            background-position: center;
            flex-shrink: 0;
            background-color: rgba(255,255,255,0.1);
        }

        .rec-info {
            flex: 1;
            min-width: 0;
        }

        .rec-info h4 {
            font-size: 14px;
            font-weight: 600;
            margin-bottom: 4px;
        }

        .rec-info span {
            font-size: 12px;
            opacity: 0.5;
        }

        .rec-section-title {
            font-size: 12px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            opacity: 0.5;
            margin: 16px 0 8px 0;
        }

        .rec-section-title:first-child {
            margin-top: 0;
        }

        .rec-item.wiki-rec {
            border-left: 3px solid rgba(77, 163, 255, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.wiki-thumb {
            background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        .rec-thumb.wiki-thumb svg {
            opacity: 0.6;
        }

        .rec-item.hn-rec {
            border-left: 3px solid rgba(255, 102, 0, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.hn-thumb {
            background: linear-gradient(135deg, #1a0a00 0%, #3d1a00 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        /* Swipe hint */
        .swipe-hint {
            position: fixed;
            bottom: 20px;
            left: 50%;
            transform: translateX(-50%);
            font-size: 12px;
            opacity: 0.4;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 4px;
            animation: fadeOut 3s forwards;
            animation-delay: 2s;
        }

        .swipe-hint-arrow {
            animation: bounce 1.5s infinite;
        }

        @keyframes bounce {
            0%, 100% { transform: translateY(0); }
            50% { transform: translateY(-6px); }
        }

        @keyframes fadeOut {
            to { opacity: 0; pointer-events: none; }
        }

        .empty-state {
            text-align: center;
            padding: 40px 20px;
            opacity: 0.6;
        }

        .panel-overlay {
            position: fixed;
            inset: 0;
            background: rgba(0,0,0,0.5);
            z-index: 15;
            opacity: 0;
            pointer-events: none;
            transition: opacity 0.3s;
        }

        .panel-overlay.visible {
            opacity: 1;
            pointer-events: auto;
        }
    </style>
</head>
<body>
    <div class="swipe-container" id="container"></div>

    <div class="card-actions" id="actions">
        <button class="action-btn" id="recs-btn" title="Similar Posts">
            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                <circle cx="12" cy="12" r="3"/>
                <path d="M12 2v4m0 12v4M2 12h4m12 0h4"/>
            </svg>
        </button>
        <button class="action-btn read-btn" id="read-btn" title="Read Full">
            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
                <polyline points="15,3 21,3 21,9"/>
                <line x1="10" y1="14" x2="21" y2="3"/>
            </svg>
        </button>
    </div>

    <div class="panel-overlay" id="panel-overlay"></div>

    <div class="panel" id="recs-panel">
        <div class="panel-header">
            <h3>Related Content</h3>
            <button class="panel-close" id="recs-close">×</button>
        </div>
        <div id="recs-content"></div>
    </div>

    <div class="swipe-hint" id="swipe-hint">
        <span class="swipe-hint-arrow">↑</span>
        <span>Swipe up for more</span>
    </div>

    <script>
        // Embedded data
        const GRAPH = {{ graph_json|safe }};
        const POST_SLUGS = {{ post_slugs_json|safe }};
        const BASE_URL = "{{ base_url }}";

        // State
        let currentIndex = 0;
        let feedItems = []; // Mixed array of {type: 'post'|'wiki', data: ...}

        // Shuffle array (Fisher-Yates)
        function shuffle(arr) {
            const result = [...arr];
            for (let i = result.length - 1; i > 0; i--) {
                const j = Math.floor(Math.random() * (i + 1));
                [result[i], result[j]] = [result[j], result[i]];
            }
            return result;
        }

        // Format date
        function formatDate(isoDate) {
            const date = new Date(isoDate);
            return date.toLocaleDateString('en-US', {
                year: 'numeric',
                month: 'long',
                day: 'numeric'
            });
        }

        // Build feed with interleaved external content (Wikipedia, HN, etc.)
        function buildFeed() {
            const shuffledSlugs = shuffle(POST_SLUGS);
            feedItems = [];

            // Collect all Wikipedia suggestions
            const allWiki = [];
            // Collect all external content (HN, etc.)
            const allExternal = [];

            for (const slug of shuffledSlugs) {
                const post = GRAPH.posts[slug];
                if (post.wikipedia) {
                    for (const w of post.wikipedia) {
                        allWiki.push({ ...w, sourcePost: post.title });
                    }
                }
                if (post.externalContent) {
                    for (const e of post.externalContent) {
                        allExternal.push({ ...e, sourcePost: post.title });
                    }
                }
            }

            const shuffledWiki = shuffle(allWiki);
            const shuffledExternal = shuffle(allExternal);

            // Interleave content: wiki every 3 posts, external every 4 posts
            let wikiIndex = 0;
            let extIndex = 0;

            for (let i = 0; i < shuffledSlugs.length; i++) {
                feedItems.push({ type: 'post', slug: shuffledSlugs[i] });

                // Insert Wikipedia after every 3 posts
                if ((i + 1) % 3 === 0 && wikiIndex < shuffledWiki.length) {
                    feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
                    wikiIndex++;
                }

                // Insert external content (HN) after every 4 posts
                if ((i + 1) % 4 === 0 && extIndex < shuffledExternal.length) {
                    feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
                    extIndex++;
                }
            }

            // Add remaining content at the end
            while (wikiIndex < shuffledWiki.length) {
                feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
                wikiIndex++;
            }
            while (extIndex < shuffledExternal.length) {
                feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
                extIndex++;
            }
        }

        // Render a blog post card
        function renderPostCard(slug, idx) {
            const post = GRAPH.posts[slug];
            const bgImg = post.headerImg ? `${BASE_URL}/${post.headerImg}` : '';
            const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%)';

            const keyPointsHtml = post.keyPoints && post.keyPoints.length > 0 ? `
                <div class="key-points">
                    <div class="key-points-title">Key Takeaways</div>
                    ${post.keyPoints.slice(0, 4).map(point => `
                        <div class="key-point">
                            <span class="key-point-bullet">•</span>
                            <span>${point}</span>
                        </div>
                    `).join('')}
                </div>
            ` : '';

            return `
                <div class="card" data-type="post" data-slug="${slug}" data-index="${idx}" style="${bgStyle}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type">📝 Blog Post</div>
                        <div class="card-meta">
                            ${post.categories.map(c =>
                                `<span class="card-category">${c}</span>`
                            ).join('')}
                            <span class="card-category">${post.readingTime} min</span>
                        </div>
                        <h1 class="card-title">${post.title}</h1>
                        ${post.subtitle ? `<p class="card-subtitle">${post.subtitle}</p>` : ''}
                        ${keyPointsHtml}
                        <div class="card-tags">
                            ${post.tags.slice(0, 4).map(t =>
                                `<span class="card-tag">#${t}</span>`
                            ).join('')}
                        </div>
                        <div class="card-date">${formatDate(post.date)}</div>
                    </div>
                </div>
            `;
        }

        // Render a Wikipedia card
        function renderWikiCard(wiki, idx) {
            // Prefer extract from Wikipedia API, fall back to LLM-generated relevance
            const excerpt = wiki.extract || wiki.relevance || '';
            return `
                <div class="card wiki-card" data-type="wiki" data-url="${wiki.url}" data-index="${idx}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type wiki">
                            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <circle cx="12" cy="12" r="10"/>
                                <path d="M12 16v-4M12 8h.01"/>
                            </svg>
                            Wikipedia
                        </div>
                        <h1 class="card-title">${wiki.title}</h1>
                        <p class="wiki-excerpt">${excerpt}</p>
                        ${wiki.relevance && wiki.extract ? `<p class="wiki-relevance"><em>${wiki.relevance}</em></p>` : ''}
                        <div class="wiki-source">
                            Related to: ${wiki.sourcePost}
                        </div>
                    </div>
                </div>
            `;
        }

        // Render an external content card (HackerNews, linked, etc.)
        function renderExternalCard(ext, idx) {
            const isHN = ext.source === 'hackernews' || ext.source === 'hn-frontpage';
            const isLinked = ext.source === 'linked';

            let cardClass, typeClass, typeName, descClass;

            if (isHN) {
                cardClass = 'hn-card';
                typeClass = 'hn';
                typeName = ext.source === 'hn-frontpage' ? 'HN Front Page' : 'Hacker News';
                descClass = 'hn-description';
            } else if (isLinked) {
                cardClass = 'linked-card';
                typeClass = 'linked';
                typeName = 'Referenced Link';
                descClass = 'linked-description';
            } else {
                cardClass = 'external-card';
                typeClass = '';
                typeName = ext.source;
                descClass = 'hn-description';
            }

            const points = ext.metadata?.points || 0;
            const comments = ext.metadata?.num_comments || 0;
            const storyUrl = ext.metadata?.story_url || '';

            // Icon based on type
            const icon = isHN
                ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>`
                : isLinked
                    ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/><path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/></svg>`
                    : '';

            return `
                <div class="card ${cardClass}" data-type="external" data-url="${ext.url}" data-index="${idx}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type ${typeClass}">
                            ${icon}
                            ${typeName}
                        </div>
                        <h1 class="card-title">${ext.title}</h1>
                        ${isHN ? `
                            <div class="hn-meta">
                                <div class="hn-meta-item">
                                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M12 2l3.09 6.26L22 9.27l-5 4.87 1.18 6.88L12 17.77l-6.18 3.25L7 14.14 2 9.27l6.91-1.01L12 2z"/>
                                    </svg>
                                    ${points} points
                                </div>
                                <div class="hn-meta-item">
                                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M21 15a2 2 0 01-2 2H7l-4 4V5a2 2 0 012-2h14a2 2 0 012 2z"/>
                                    </svg>
                                    ${comments} comments
                                </div>
                            </div>
                        ` : ''}
                        ${ext.description ? `<p class="${descClass}">${ext.description}</p>` : ''}
                        ${storyUrl ? `
                            <div class="hn-link">
                                <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                    <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
                                    <polyline points="15,3 21,3 21,9"/>
                                    <line x1="10" y1="14" x2="21" y2="3"/>
                                </svg>
                                View original article
                            </div>
                        ` : ''}
                        <div class="wiki-source">
                            ${ext.relevance || `Related to: ${ext.sourcePost}`}
                        </div>
                    </div>
                </div>
            `;
        }

        // Render all slides
        function renderSlides() {
            buildFeed();
            const container = document.getElementById('container');

            container.innerHTML = feedItems.map((item, idx) => {
                if (item.type === 'post') {
                    return renderPostCard(item.slug, idx);
                } else if (item.type === 'wiki') {
                    return renderWikiCard(item.data, idx);
                } else if (item.type === 'external') {
                    return renderExternalCard(item.data, idx);
                }
            }).join('');
        }

        // Track current slide on scroll
        function onScroll() {
            const container = document.getElementById('container');
            const slideHeight = window.innerHeight;
            currentIndex = Math.round(container.scrollTop / slideHeight);

            // Update action buttons visibility based on card type
            const currentItem = feedItems[currentIndex];
            const recsBtn = document.getElementById('recs-btn');
            if (currentItem && (currentItem.type === 'wiki' || currentItem.type === 'external')) {
                recsBtn.style.display = 'none';
            } else {
                recsBtn.style.display = 'flex';
            }
        }

        // Open full content
        function openFullContent() {
            const item = feedItems[currentIndex];
            if (item.type === 'post') {
                const post = GRAPH.posts[item.slug];
                window.open(post.url, '_blank');
            } else if (item.type === 'wiki') {
                window.open(item.data.url, '_blank');
            } else if (item.type === 'external') {
                window.open(item.data.url, '_blank');
            }
        }

        // Close all panels
        function closePanels() {
            document.getElementById('recs-panel').classList.remove('open');
            document.getElementById('panel-overlay').classList.remove('visible');
        }

        // Show recommendations panel
        function showRecsPanel() {
            const item = feedItems[currentIndex];
            if (item.type !== 'post') return;

            const panel = document.getElementById('recs-panel');
            const content = document.getElementById('recs-content');
            const post = GRAPH.posts[item.slug];
            const recs = GRAPH.recommendations[item.slug] || [];
            const wikiSuggestions = post.wikipedia || [];
            const externalContent = post.externalContent || [];

            let html = '';

            // Similar posts section
            if (recs.length > 0) {
                html += '<div class="rec-section-title">Similar Posts</div>';
                html += recs.map(([recSlug, score]) => {
                    const recPost = GRAPH.posts[recSlug];
                    const bgImg = recPost.headerImg ? `${BASE_URL}/${recPost.headerImg}` : '';
                    const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #2d3436 0%, #636e72 100%)';

                    return `
                        <div class="rec-item" data-slug="${recSlug}">
                            <div class="rec-thumb" style="${bgStyle}"></div>
                            <div class="rec-info">
                                <h4>${recPost.title}</h4>
                                <span>${Math.round(score * 100)}% match</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // Wikipedia section
            if (wikiSuggestions.length > 0) {
                html += '<div class="rec-section-title">Wikipedia Articles</div>';
                html += wikiSuggestions.map(wiki => {
                    const excerpt = wiki.extract || wiki.relevance || '';
                    const truncatedExcerpt = excerpt.length > 80 ? excerpt.slice(0, 80) + '...' : excerpt;
                    return `
                        <div class="rec-item wiki-rec" data-url="${wiki.url}">
                            <div class="rec-thumb wiki-thumb">
                                <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
                                    <circle cx="12" cy="12" r="10"/>
                                    <path d="M12 16v-4M12 8h.01"/>
                                </svg>
                            </div>
                            <div class="rec-info">
                                <h4>${wiki.title}</h4>
                                <span>${truncatedExcerpt}</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // External content section (HN, etc.)
            const hnContent = externalContent.filter(e => e.source === 'hackernews' || e.source === 'hn-frontpage');
            if (hnContent.length > 0) {
                html += '<div class="rec-section-title">Hacker News</div>';
                html += hnContent.map(hn => {
                    const points = hn.metadata?.points || 0;
                    const comments = hn.metadata?.num_comments || 0;
                    return `
                        <div class="rec-item hn-rec" data-url="${hn.url}">
                            <div class="rec-thumb hn-thumb">
                                <svg width="20" height="20" viewBox="0 0 24 24" fill="#ff6600"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>
                            </div>
                            <div class="rec-info">
                                <h4>${hn.title}</h4>
                                <span>${points} pts · ${comments} comments</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // Linked content section (tertiary crawled links)
            const linkedContent = externalContent.filter(e => e.source === 'linked');
            if (linkedContent.length > 0) {
                html += '<div class="rec-section-title">Referenced Links</div>';
                html += linkedContent.map(link => {
                    const truncatedDesc = link.description && link.description.length > 80
                        ? link.description.slice(0, 80) + '...'
                        : (link.description || '');
                    return `
                        <div class="rec-item linked-rec" data-url="${link.url}">
                            <div class="rec-thumb linked-thumb">
                                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="#2ecc71" stroke-width="2">
                                    <path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/>
                                    <path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/>
                                </svg>
                            </div>
                            <div class="rec-info">
                                <h4>${link.title}</h4>
                                <span>${truncatedDesc}</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            if (html) {
                content.innerHTML = html;
            } else {
                content.innerHTML = '<div class="empty-state">No related content found.</div>';
            }

            document.getElementById('panel-overlay').classList.add('visible');
            panel.classList.add('open');
        }

        // Jump to a specific post
        function jumpToPost(slug) {
            const idx = feedItems.findIndex(item => item.type === 'post' && item.slug === slug);
            if (idx >= 0) {
                const container = document.getElementById('container');
                container.scrollTo({
                    top: idx * window.innerHeight,
                    behavior: 'smooth'
                });
                closePanels();
            }
        }

        // Event listeners
        document.getElementById('container').addEventListener('scroll', onScroll);
        document.getElementById('read-btn').addEventListener('click', openFullContent);
        document.getElementById('recs-btn').addEventListener('click', showRecsPanel);
        document.getElementById('recs-close').addEventListener('click', closePanels);
        document.getElementById('panel-overlay').addEventListener('click', closePanels);

        document.getElementById('recs-content').addEventListener('click', (e) => {
            const item = e.target.closest('.rec-item');
            if (item) {
                if (item.dataset.url) {
                    // Wikipedia item - open in new tab
                    window.open(item.dataset.url, '_blank');
                    closePanels();
                } else if (item.dataset.slug) {
                    // Blog post - jump to it
                    jumpToPost(item.dataset.slug);
                }
            }
        });

        // Keyboard navigation
        document.addEventListener('keydown', (e) => {
            const container = document.getElementById('container');
            const slideHeight = window.innerHeight;

            if (e.key === 'ArrowDown' || e.key === 'j') {
                container.scrollBy({ top: slideHeight, behavior: 'smooth' });
            } else if (e.key === 'ArrowUp' || e.key === 'k') {
                container.scrollBy({ top: -slideHeight, behavior: 'smooth' });
            } else if (e.key === 'Enter' || e.key === 'o') {
                openFullContent();
            } else if (e.key === 'Escape') {
                closePanels();
            }
        });

        // Hide swipe hint after first scroll
        document.getElementById('container').addEventListener('scroll', () => {
            document.getElementById('swipe-hint').style.display = 'none';
        }, { once: true });

        // Initialize
        renderSlides();
        onScroll(); // Update button visibility for initial state
    </script>
</body>
</html>
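
The insertion cadence in buildFeed() above (a Wikipedia card after every 3rd post, an external card after every 4th) is easier to see in isolation. A Python sketch of the same interleave, with toy values standing in for the embedded graph data:

def interleave(posts, wiki, external):
    """Mirror of the template's buildFeed() insertion cadence."""
    feed, wi, ei = [], 0, 0
    for i, slug in enumerate(posts):
        feed.append(("post", slug))
        if (i + 1) % 3 == 0 and wi < len(wiki):  # wiki card after every 3 posts
            feed.append(("wiki", wiki[wi]))
            wi += 1
        if (i + 1) % 4 == 0 and ei < len(external):  # external card after every 4 posts
            feed.append(("external", external[ei]))
            ei += 1
    feed += [("wiki", w) for w in wiki[wi:]]  # leftovers go at the end
    feed += [("external", e) for e in external[ei:]]
    return feed

print(interleave(["p1", "p2", "p3", "p4"], ["w1"], ["e1"]))
# [('post', 'p1'), ('post', 'p2'), ('post', 'p3'), ('wiki', 'w1'), ('post', 'p4'), ('external', 'e1')]
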
src/tiktokify/models/__init__.py
ADDED
@@ -0,0 +1,17 @@
"""Data models for tiktokify."""

from .post import (
    ExternalContentItem,
    Post,
    PostMetadata,
    RecommendationGraph,
    WikipediaSuggestion,
)

__all__ = [
    "ExternalContentItem",
    "Post",
    "PostMetadata",
    "RecommendationGraph",
    "WikipediaSuggestion",
]
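
A quick sketch of building these models by hand (all values made up), handy for tests; the fields follow the definitions in post.py below:

from datetime import datetime

from tiktokify.models import Post, PostMetadata

post = Post(
    url="https://myblog.dev/posts/hello-world",
    slug="hello-world",
    metadata=PostMetadata(
        title="Hello World",
        date=datetime(2024, 1, 1),
        tags=["intro"],
    ),
    content_text="Plain text used for TF-IDF similarity.",
    reading_time_minutes=3,
)
print(post.slug, post.metadata.title)
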
src/tiktokify/models/post.py
ADDED
@@ -0,0 +1,116 @@
"""Pydantic models for blog posts and recommendation graph."""

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field, HttpUrl


class ExternalContentItem(BaseModel):
    """Generic external content from any source."""

    source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
    title: str
    url: HttpUrl
    description: str = Field(default="", description="Brief description or excerpt")
    relevance: str = Field(default="", description="Why this is relevant to the post")
    metadata: dict = Field(default_factory=dict, description="Source-specific metadata")


class WikipediaSuggestion(BaseModel):
    """A Wikipedia article suggestion for a blog post."""

    title: str
    url: HttpUrl
    relevance: str = Field(description="Brief explanation of why this is relevant")
    extract: str = Field(default="", description="Article summary from Wikipedia API")


class PostMetadata(BaseModel):
    """Metadata extracted from Jekyll post front matter."""

    title: str
    date: datetime
    categories: list[str] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)
    subtitle: Optional[str] = None
    header_img: Optional[str] = None
    last_edited_on: Optional[datetime] = None


class Post(BaseModel):
    """Complete representation of a blog post."""

    url: str
    slug: str
    metadata: PostMetadata
    content_text: str = Field(description="Plain text content for TF-IDF")
    content_html: str = Field(default="", description="Full HTML content")
    reading_time_minutes: int = Field(default=1)

    # Populated during enrichment phase
    key_points: list[str] = Field(
        default_factory=list, description="LLM-generated key points/summary"
    )
    similar_posts: list[str] = Field(
        default_factory=list, description="List of similar post slugs"
    )
    similarity_scores: dict[str, float] = Field(
        default_factory=dict, description="slug -> similarity score"
    )
    wikipedia_suggestions: list[WikipediaSuggestion] = Field(default_factory=list)
    external_content: list[ExternalContentItem] = Field(
        default_factory=list, description="Content from external sources (HN, Reddit, etc.)"
    )


class RecommendationGraph(BaseModel):
    """Graph of posts with recommendation adjacency list."""

    posts: dict[str, Post] = Field(description="slug -> Post mapping")
    adjacency: dict[str, list[tuple[str, float]]] = Field(
        default_factory=dict, description="slug -> [(similar_slug, score), ...]"
    )

    def to_json_for_embed(self) -> dict:
        """Serialize for embedding in HTML (minimal, frontend-friendly format)."""
        return {
            "posts": {
                slug: {
                    "title": p.metadata.title,
                    "subtitle": p.metadata.subtitle,
                    "date": p.metadata.date.isoformat(),
                    "categories": p.metadata.categories,
                    "tags": p.metadata.tags,
                    "url": p.url,
                    "headerImg": p.metadata.header_img,
                    "readingTime": p.reading_time_minutes,
                    "keyPoints": p.key_points,
                    "wikipedia": [
                        {
                            "title": w.title,
                            "url": str(w.url),
                            "relevance": w.relevance,
                            "extract": w.extract,
                        }
                        for w in p.wikipedia_suggestions
                    ],
                    "externalContent": [
                        {
                            "source": e.source,
                            "title": e.title,
                            "url": str(e.url),
                            "description": e.description,
                            "relevance": e.relevance,
                            "metadata": e.metadata,
                        }
                        for e in p.external_content
                    ],
                }
                for slug, p in self.posts.items()
            },
            "recommendations": {
                slug: [(s, round(score, 3)) for s, score in recs]
                for slug, recs in self.adjacency.items()
            },
        }
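These are plain Pydantic v2 models, so constructing and serializing them takes a few lines. A minimal sketch with invented field values:

# Invented example data; only title and date are required in PostMetadata.
from datetime import datetime

from tiktokify.models import Post, PostMetadata, RecommendationGraph

post = Post(
    url="https://example.com/blog/hello-world",
    slug="hello-world",
    metadata=PostMetadata(title="Hello World", date=datetime(2024, 1, 1), tags=["intro"]),
    content_text="A first post about the blog itself.",
)
graph = RecommendationGraph(posts={post.slug: post}, adjacency={post.slug: []})
print(graph.to_json_for_embed()["posts"]["hello-world"]["title"])  # Hello World

Note that to_json_for_embed renames fields to camelCase (headerImg, readingTime) so the embedded blob matches the naming the template's JavaScript expects.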
src/tiktokify/recommender/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""Recommendation engine module."""

from .engine import RecommendationEngine

__all__ = ["RecommendationEngine"]
src/tiktokify/recommender/engine.py
ADDED
@@ -0,0 +1,59 @@
"""Combined recommendation engine."""

from tiktokify.models import Post, RecommendationGraph

from .metadata import MetadataSimilarity
from .tfidf import TFIDFSimilarity


class RecommendationEngine:
    """Hybrid recommendation combining content and metadata similarity."""

    def __init__(
        self,
        content_weight: float = 0.6,
        metadata_weight: float = 0.4,
        top_k: int = 5,
    ):
        self.content_weight = content_weight
        self.metadata_weight = metadata_weight
        self.top_k = top_k

        self.tfidf = TFIDFSimilarity()
        self.metadata = MetadataSimilarity()

    def build_graph(self, posts: list[Post]) -> RecommendationGraph:
        """Build complete recommendation graph."""
        # Fit both models
        self.tfidf.fit(posts)
        self.metadata.fit(posts)

        posts_dict = {p.slug: p for p in posts}
        adjacency: dict[str, list[tuple[str, float]]] = {}

        for post in posts:
            # Get similarities from both sources
            content_sims = dict(self.tfidf.get_similar(post.slug, self.top_k * 2))
            metadata_sims = dict(self.metadata.get_similar(post.slug, self.top_k * 2))

            # Combine scores
            all_slugs = set(content_sims.keys()) | set(metadata_sims.keys())
            combined: list[tuple[str, float]] = []

            for slug in all_slugs:
                c_score = content_sims.get(slug, 0)
                m_score = metadata_sims.get(slug, 0)
                combined_score = (
                    self.content_weight * c_score + self.metadata_weight * m_score
                )
                combined.append((slug, combined_score))

            # Sort and take top_k
            combined.sort(key=lambda x: x[1], reverse=True)
            adjacency[post.slug] = combined[: self.top_k]

            # Update post object with recommendations
            post.similar_posts = [s for s, _ in combined[: self.top_k]]
            post.similarity_scores = dict(combined[: self.top_k])

        return RecommendationGraph(posts=posts_dict, adjacency=adjacency)
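With the default weights, a candidate with a TF-IDF cosine of 0.8 and a metadata similarity of 0.5 scores 0.6 * 0.8 + 0.4 * 0.5 = 0.68. Pulling top_k * 2 candidates from each model before merging lets posts that rank highly on only one signal survive the final cut. A self-contained sketch with invented posts:

# All three posts are invented for illustration.
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender import RecommendationEngine

def make_post(slug: str, text: str, tags: list[str]) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1), tags=tags)
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text=text)

posts = [
    make_post("uv-intro", "python packaging with uv and virtual environments", ["python", "tooling"]),
    make_post("pip-tips", "python packaging tips for pip and wheels", ["python", "tooling"]),
    make_post("rust-cli", "building a command line tool in rust", ["rust"]),
]
graph = RecommendationEngine(top_k=2).build_graph(posts)
print(graph.adjacency["uv-intro"])  # pip-tips should outrank rust-cli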
src/tiktokify/recommender/metadata.py
ADDED
@@ -0,0 +1,51 @@
"""Tag and category based similarity."""

from tiktokify.models import Post


class MetadataSimilarity:
    """Tag and category based Jaccard similarity."""

    def __init__(
        self,
        tag_weight: float = 0.7,
        category_weight: float = 0.3,
    ):
        self.tag_weight = tag_weight
        self.category_weight = category_weight
        self.posts: dict[str, Post] = {}

    def fit(self, posts: list[Post]) -> None:
        """Store posts for similarity computation."""
        self.posts = {p.slug: p for p in posts}

    def compute_similarity(self, slug1: str, slug2: str) -> float:
        """Compute Jaccard-like similarity between two posts."""
        p1, p2 = self.posts.get(slug1), self.posts.get(slug2)
        if not p1 or not p2:
            return 0.0

        # Tag similarity (Jaccard index)
        tags1, tags2 = set(p1.metadata.tags), set(p2.metadata.tags)
        tag_union = tags1 | tags2
        tag_sim = len(tags1 & tags2) / len(tag_union) if tag_union else 0

        # Category similarity (exact match)
        cats1, cats2 = set(p1.metadata.categories), set(p2.metadata.categories)
        cat_union = cats1 | cats2
        cat_sim = len(cats1 & cats2) / len(cat_union) if cat_union else 0

        return self.tag_weight * tag_sim + self.category_weight * cat_sim

    def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Get top-k similar posts based on metadata."""
        if slug not in self.posts:
            return []

        scores = [
            (other_slug, self.compute_similarity(slug, other_slug))
            for other_slug in self.posts
            if other_slug != slug
        ]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]
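Worked by hand with the default weights: tags {python, nlp, tfidf} and {python, nlp, llm} share 2 of 4 distinct tags (Jaccard 0.5), and a matching single category scores 1.0, giving 0.7 * 0.5 + 0.3 * 1.0 = 0.65. The same computation through the class, with invented posts:

# Invented posts; expect roughly 0.65 (floating point).
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender.metadata import MetadataSimilarity

def make_post(slug: str, tags: list[str]) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1), tags=tags, categories=["ml"])
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text="")

sim = MetadataSimilarity()
sim.fit([make_post("a", ["python", "nlp", "tfidf"]), make_post("b", ["python", "nlp", "llm"])])
print(sim.compute_similarity("a", "b"))  # ~0.65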
src/tiktokify/recommender/tfidf.py
ADDED
@@ -0,0 +1,53 @@
"""TF-IDF based content similarity."""

import numpy as np
from scipy.sparse import spmatrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tiktokify.models import Post


class TFIDFSimilarity:
    """Content-based similarity using TF-IDF."""

    def __init__(
        self,
        max_features: int = 5000,
        ngram_range: tuple[int, int] = (1, 2),
    ):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            stop_words="english",
            min_df=1,
            max_df=0.9,
        )
        # fit_transform returns a scipy sparse matrix, not a dense ndarray
        self.tfidf_matrix: spmatrix | None = None
        self.slugs: list[str] = []

    def fit(self, posts: list[Post]) -> None:
        """Fit TF-IDF on post content."""
        self.slugs = [p.slug for p in posts]
        texts = [p.content_text for p in posts]
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)

    def get_similarity_matrix(self) -> np.ndarray:
        """Return the full cosine similarity matrix (recomputed on each call)."""
        if self.tfidf_matrix is None:
            raise ValueError("Must call fit() first")
        return cosine_similarity(self.tfidf_matrix)

    def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Get top-k similar posts for a given slug."""
        if slug not in self.slugs:
            return []

        idx = self.slugs.index(slug)
        sim_matrix = self.get_similarity_matrix()
        scores = sim_matrix[idx]

        # Skip rank 0: self-similarity is 1.0, so the post itself ranks first
        top_indices = np.argsort(scores)[::-1][1 : top_k + 1]
        return [(self.slugs[i], float(scores[i])) for i in top_indices]
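A quick check of the fit/get_similar flow with invented texts; the two gradient-descent posts should score much closer to each other than to the off-topic one, which lands near 0:

# Invented corpus for a smoke test.
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender.tfidf import TFIDFSimilarity

def make_post(slug: str, text: str) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1))
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text=text)

tfidf = TFIDFSimilarity(max_features=100)
tfidf.fit([
    make_post("a", "gradient descent optimizes neural network weights"),
    make_post("b", "stochastic gradient descent for neural networks"),
    make_post("c", "sourdough bread baking and hydration ratios"),
])
print(tfidf.get_similar("a", top_k=2))  # "b" first, "c" near 0.0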
tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Tests for tiktokify."""