nishparadox committed
Commit 51da887 · 1 Parent(s): d038c44

Add initial vibe-coded shits
.gitignore CHANGED
@@ -6,6 +6,13 @@ __pycache__/
  # C extensions
  *.so
 
+ output/
+ html/
+ tmp/
+ out/
+ temp/
+ data/
+
  # Distribution / packaging
  .Python
  build/
pyproject.toml ADDED
@@ -0,0 +1,34 @@
+ [project]
+ name = "tiktokify"
+ version = "0.1.0"
+ description = "TikTok-style swipeable blog viewer with recommendations"
+ readme = "README.md"
+ requires-python = ">=3.11"
+ dependencies = [
+     "crawl4ai>=0.4.0",
+     "scikit-learn>=1.3.0",
+     "numpy>=1.24.0",
+     "jinja2>=3.1.0",
+     "httpx>=0.25.0",
+     "litellm>=1.0.0",
+     "click>=8.1.0",
+     "pydantic>=2.0.0",
+     "rich>=13.0.0",
+     "loguru>=0.7.0",
+ ]
+
+ [project.scripts]
+ tiktokify = "tiktokify.cli:main"
+
+ [build-system]
+ requires = ["hatchling"]
+ build-backend = "hatchling.build"
+
+ [tool.hatch.build.targets.wheel]
+ packages = ["src/tiktokify"]
+
+ [dependency-groups]
+ dev = [
+     "pytest>=7.0.0",
+     "pytest-asyncio>=0.21.0",
+ ]
src/tiktokify/__init__.py ADDED
@@ -0,0 +1,3 @@
+ """TikTokify - TikTok-style swipeable blog viewer with recommendations."""
+
+ __version__ = "0.1.0"
src/tiktokify/__main__.py ADDED
@@ -0,0 +1,6 @@
+ """Entry point for `python -m tiktokify` or `uv run tiktokify`."""
+
+ from tiktokify.cli import main
+
+ if __name__ == "__main__":
+     main()
src/tiktokify/cli.py ADDED
@@ -0,0 +1,303 @@
+ """CLI interface for tiktokify."""
+
+ import asyncio
+ from pathlib import Path
+
+ import click
+ from rich.console import Console
+ from rich.progress import Progress, SpinnerColumn, TextColumn
+
+ console = Console()
+
+
+ @click.command()
+ @click.option(
+     "--base-url",
+     "-u",
+     required=True,
+     help="Base URL of the Jekyll blog (e.g., https://nish1001.github.io)",
+ )
+ @click.option(
+     "--output-html",
+     "-o",
+     required=True,
+     type=click.Path(),
+     help="Output path for generated HTML file",
+ )
+ @click.option(
+     "--model",
+     "-m",
+     default=None,
+     help="LLM model for enrichment (e.g., gpt-4o-mini, claude-3-haiku-20240307). Skip if not provided.",
+ )
+ @click.option(
+     "--n-key-points",
+     type=int,
+     default=5,
+     help="Number of key points to generate per post",
+ )
+ @click.option(
+     "--n-wiki",
+     type=int,
+     default=3,
+     help="Number of Wikipedia articles to suggest per post",
+ )
+ @click.option(
+     "--sources",
+     type=str,
+     default="",
+     help="Comma-separated external sources to fetch. Available: hackernews (hn), hn-frontpage (frontpage), links (linked)",
+ )
+ @click.option(
+     "--n-external",
+     type=int,
+     default=3,
+     help="Number of items to fetch per external source",
+ )
+ @click.option(
+     "--content-weight",
+     type=float,
+     default=0.6,
+     help="Weight for content-based similarity (0-1)",
+ )
+ @click.option(
+     "--metadata-weight",
+     type=float,
+     default=0.4,
+     help="Weight for tag/category similarity (0-1)",
+ )
+ @click.option(
+     "--top-k",
+     type=int,
+     default=5,
+     help="Number of recommendations per post",
+ )
+ @click.option(
+     "--max-concurrent",
+     type=int,
+     default=5,
+     help="Maximum concurrent requests",
+ )
+ @click.option(
+     "--max-depth",
+     type=int,
+     default=1,
+     help="Spider crawl depth (1=seed only, 2=seed+linked pages, etc.)",
+ )
+ @click.option(
+     "--verbose",
+     "-v",
+     is_flag=True,
+     help="Enable verbose output",
+ )
+ def main(
+     base_url: str,
+     output_html: str,
+     model: str | None,
+     n_key_points: int,
+     n_wiki: int,
+     sources: str,
+     n_external: int,
+     content_weight: float,
+     metadata_weight: float,
+     top_k: int,
+     max_concurrent: int,
+     max_depth: int,
+     verbose: bool,
+ ) -> None:
+     """
+     TikTokify - Generate a TikTok-style swipe interface for your Jekyll blog.
+
+     Example:
+
+         uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html
+
+     With LLM enrichment (key points + Wikipedia):
+
+         uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html -m gpt-4o-mini
+
+     With deeper spider crawling:
+
+         uv run tiktokify -u https://example.com -o output.html --max-depth 2
+     """
+     asyncio.run(
+         _main_async(
+             base_url=base_url,
+             output_html=Path(output_html),
+             model=model,
+             n_key_points=n_key_points,
+             n_wiki=n_wiki,
+             sources=[s.strip() for s in sources.split(",") if s.strip()],
+             n_external=n_external,
+             content_weight=content_weight,
+             metadata_weight=metadata_weight,
+             top_k=top_k,
+             max_concurrent=max_concurrent,
+             max_depth=max_depth,
+             verbose=verbose,
+         )
+     )
+
+
+ async def _main_async(
+     base_url: str,
+     output_html: Path,
+     model: str | None,
+     n_key_points: int,
+     n_wiki: int,
+     sources: list[str],
+     n_external: int,
+     content_weight: float,
+     metadata_weight: float,
+     top_k: int,
+     max_concurrent: int,
+     max_depth: int,
+     verbose: bool,
+ ) -> None:
+     """Async main function."""
+     from tiktokify.crawler import SpiderCrawler
+     from tiktokify.enrichment import (
+         HackerNewsProvider,
+         HNFrontPageProvider,
+         LinkedContentProvider,
+         PostEnricher,
+     )
+     from tiktokify.generator import HTMLGenerator
+     from tiktokify.models import ExternalContentItem
+     from tiktokify.recommender import RecommendationEngine
+
+     # Map source names to provider classes
+     PROVIDERS = {
+         "hackernews": HackerNewsProvider,
+         "hn": HackerNewsProvider,  # alias
+         "hn-frontpage": HNFrontPageProvider,
+         "frontpage": HNFrontPageProvider,  # alias
+         "links": LinkedContentProvider,
+         "linked": LinkedContentProvider,  # alias
+     }
+
+     console.print(f"\n[bold blue]TikTokify[/bold blue] - Generating swipe UI for {base_url}\n")
+
+     with Progress(
+         SpinnerColumn(),
+         TextColumn("[progress.description]{task.description}"),
+         console=console,
+         transient=True,
+     ) as progress:
+         # Step 1: Spider crawl
+         depth_info = f" (depth={max_depth})" if max_depth > 1 else ""
+         task = progress.add_task(f"Spider crawling{depth_info}...", total=None)
+         crawler = SpiderCrawler(
+             base_url=base_url,
+             max_concurrent=max_concurrent,
+             max_depth=max_depth,
+             verbose=verbose,
+         )
+         posts = await crawler.crawl()
+         progress.remove_task(task)
+
+         if not posts:
+             console.print("[red]Error: No posts found![/red]")
+             return
+
+         console.print(f"  [green]✓[/green] Found {len(posts)} posts")
+
+         # Step 2: Build recommendations
+         task = progress.add_task("Building recommendation graph...", total=None)
+         engine = RecommendationEngine(
+             content_weight=content_weight,
+             metadata_weight=metadata_weight,
+             top_k=top_k,
+         )
+         graph = engine.build_graph(posts)
+         progress.remove_task(task)
+         console.print("  [green]✓[/green] Built recommendation graph")
+
+         # Step 3: LLM enrichment (optional)
+         if model:
+             task = progress.add_task(f"Enriching posts with LLM ({model})...", total=None)
+             enricher = PostEnricher(
+                 model=model,
+                 max_key_points=n_key_points,
+                 max_wikipedia=n_wiki,
+                 max_concurrent=max_concurrent,
+                 verbose=verbose,
+             )
+             await enricher.enrich_posts(list(graph.posts.values()))
+             progress.remove_task(task)
+
+             enriched_count = sum(1 for p in graph.posts.values() if p.key_points)
+             console.print(f"  [green]✓[/green] Enriched {enriched_count} posts with key points + Wikipedia")
+         else:
+             console.print("  [dim]⊘ Skipping LLM enrichment (no --model specified)[/dim]")
+
+         # Step 4: External sources (optional)
+         if sources:
+             valid_sources = [s for s in sources if s in PROVIDERS]
+             if valid_sources:
+                 task = progress.add_task(f"Fetching from {', '.join(valid_sources)}...", total=None)
+
+                 # Build list of (provider, post) pairs for parallel fetching
+                 fetch_tasks = []
+                 task_info = []  # Track (source_name, post) for each task
+
+                 for source_name in valid_sources:
+                     provider_class = PROVIDERS[source_name]
+                     provider = provider_class(max_items=n_external, verbose=verbose)
+
+                     for post in graph.posts.values():
+                         fetch_tasks.append(provider.fetch_for_post(post))
+                         task_info.append((source_name, post))
+
+                 # Fetch all in parallel with concurrency limit
+                 semaphore = asyncio.Semaphore(max_concurrent)
+
+                 async def fetch_with_limit(coro, info):
+                     async with semaphore:
+                         try:
+                             return await coro, info, None
+                         except Exception as e:
+                             return [], info, e
+
+                 results = await asyncio.gather(
+                     *[fetch_with_limit(t, info) for t, info in zip(fetch_tasks, task_info)]
+                 )
+
+                 # Process results
+                 for external_items, (source_name, post), error in results:
+                     if error:
+                         if verbose:
+                             console.print(f"[yellow]Warning: {source_name} failed for {post.slug}: {error}[/yellow]")
+                         continue
+
+                     for item in external_items:
+                         post.external_content.append(
+                             ExternalContentItem(
+                                 source=item.source,
+                                 title=item.title,
+                                 url=item.url,
+                                 description=item.description,
+                                 relevance=item.relevance,
+                                 metadata=item.metadata,
+                             )
+                         )
+
+                 progress.remove_task(task)
+                 console.print(f"  [green]✓[/green] Fetched external content from {', '.join(valid_sources)}")
+             else:
+                 console.print(f"  [yellow]⚠ Unknown sources: {sources}. Available: {list(PROVIDERS.keys())}[/yellow]")
+
+         # Step 5: Generate HTML
+         task = progress.add_task("Generating HTML...", total=None)
+         generator = HTMLGenerator()
+         generator.generate(graph, base_url, output_html)
+         progress.remove_task(task)
+         console.print(f"  [green]✓[/green] Generated {output_html}")
+
+     console.print(f"\n[bold green]Done![/bold green] Open {output_html} in a browser to view.\n")
+
+
+ if __name__ == "__main__":
+     main()
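
Editor's note: the CLI is a thin wrapper over a crawl → recommend → enrich → render pipeline. A minimal sketch of driving the same steps programmatically, using only classes this commit imports (SpiderCrawler, RecommendationEngine, HTMLGenerator); treat it as illustrative, not a stable API:

import asyncio
from pathlib import Path

from tiktokify.crawler import SpiderCrawler
from tiktokify.generator import HTMLGenerator
from tiktokify.recommender import RecommendationEngine


async def build(base_url: str, out: Path) -> None:
    # Same steps the CLI runs: crawl -> build recommendation graph -> render.
    posts = await SpiderCrawler(base_url=base_url, max_depth=1).crawl()
    engine = RecommendationEngine(content_weight=0.6, metadata_weight=0.4, top_k=5)
    graph = engine.build_graph(posts)
    HTMLGenerator().generate(graph, base_url, out)


asyncio.run(build("https://example.com", Path("output/index.html")))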
src/tiktokify/crawler/__init__.py ADDED
@@ -0,0 +1,8 @@
+ """Spider crawler module for fetching website content."""
+
+ from .blog_crawler import SpiderCrawler
+
+ # Backward compatibility alias
+ JekyllBlogCrawler = SpiderCrawler
+
+ __all__ = ["SpiderCrawler", "JekyllBlogCrawler"]
src/tiktokify/crawler/blog_crawler.py ADDED
@@ -0,0 +1,382 @@
+ """Async crawler for Jekyll blogs using crawl4ai."""
+
+ import asyncio
+ import re
+ from datetime import datetime
+ from urllib.parse import urljoin, urlparse
+
+ from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+ from rich.console import Console
+
+ from tiktokify.models import Post, PostMetadata
+
+ console = Console()
+
+
+ class SpiderCrawler:
+     """Async spider crawler for any website with recursive link discovery."""
+
+     def __init__(
+         self,
+         base_url: str,
+         max_concurrent: int = 5,
+         max_depth: int = 1,
+         verbose: bool = False,
+     ):
+         self.base_url = base_url.rstrip("/")
+         self.max_concurrent = max_concurrent
+         self.max_depth = max_depth
+         self.verbose = verbose
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+         self.base_domain = urlparse(self.base_url).netloc
+
+     async def crawl(self) -> list[Post]:
+         """Main entry point - crawls entire blog and returns posts."""
+         browser_config = BrowserConfig(headless=True, verbose=self.verbose)
+
+         async with AsyncWebCrawler(config=browser_config) as crawler:
+             # Step 1: Discover post URLs
+             if self.verbose:
+                 console.print("[dim]Discovering post URLs...[/dim]")
+
+             post_urls = await self._discover_post_urls(crawler)
+
+             if self.verbose:
+                 console.print(f"[green]Found {len(post_urls)} posts[/green]")
+
+             # Step 2: Crawl individual posts concurrently
+             posts = await self._crawl_posts(crawler, post_urls)
+
+             return posts
+
+     async def _discover_post_urls(self, crawler: AsyncWebCrawler) -> list[str]:
+         """Discover all content URLs using spider-style recursive crawling.
+
+         Starts from base URL and follows internal links up to max_depth levels.
+         - Depth 1: Only links from seed URL (default)
+         - Depth 2: Links from seed + links from those pages
+         - etc.
+         """
+         discovered: set[str] = set()
+         visited: set[str] = set()
+
+         async def crawl_page(url: str, depth: int) -> set[str]:
+             """Crawl a single page and return new URLs found."""
+             if depth > self.max_depth or url in visited:
+                 return set()
+
+             visited.add(url)
+             new_urls: set[str] = set()
+
+             try:
+                 async with self.semaphore:
+                     result = await crawler.arun(
+                         url=url,
+                         config=CrawlerRunConfig(wait_until="domcontentloaded"),
+                     )
+
+                 if not result.success:
+                     return set()
+
+                 # Extract links from crawl4ai
+                 if result.links:
+                     for link in result.links.get("internal", []):
+                         href = link.get("href", "") if isinstance(link, dict) else str(link)
+                         if self._is_content_url(href, self.base_domain):
+                             full_url = href if href.startswith("http") else urljoin(url, href)
+                             if full_url not in discovered:
+                                 new_urls.add(full_url)
+
+                 # Also parse HTML directly as fallback
+                 if result.html:
+                     hrefs = re.findall(r'href=["\']([^"\']+)["\']', result.html)
+                     for href in hrefs:
+                         if self._is_content_url(href, self.base_domain):
+                             full_url = href if href.startswith("http") else urljoin(url, href)
+                             if full_url not in discovered:
+                                 new_urls.add(full_url)
+
+                 discovered.update(new_urls)
+
+                 if self.verbose and new_urls:
+                     console.print(f"[dim]Depth {depth}: Found {len(new_urls)} URLs from {url}[/dim]")
+
+             except Exception as e:
+                 if self.verbose:
+                     console.print(f"[yellow]Warning: Failed to crawl {url}: {e}[/yellow]")
+
+             return new_urls
+
+         # Start with seed URL
+         if self.verbose:
+             console.print(f"[dim]Spider crawling with max_depth={self.max_depth}[/dim]")
+
+         # Depth 1: crawl seed URL
+         current_urls = await crawl_page(self.base_url, 1)
+
+         # Deeper levels: recursively crawl discovered URLs
+         for depth in range(2, self.max_depth + 1):
+             if not current_urls:
+                 break
+
+             if self.verbose:
+                 console.print(f"[dim]Crawling depth {depth}: {len(current_urls)} URLs to explore[/dim]")
+
+             # Crawl all current URLs in parallel
+             tasks = [crawl_page(url, depth) for url in current_urls]
+             results = await asyncio.gather(*tasks, return_exceptions=True)
+
+             # Collect new URLs for next depth
+             next_urls: set[str] = set()
+             for result in results:
+                 if isinstance(result, set):
+                     next_urls.update(result)
+
+             current_urls = next_urls
+
+         if self.verbose:
+             console.print(f"[dim]Total discovered: {len(discovered)} content URLs[/dim]")
+
+         return list(discovered)
+
+     def _is_content_url(self, href: str, base_domain: str) -> bool:
+         """Check if URL is internal content (not static asset or utility page).
+
+         This is a simple filter - accepts anything that's:
+         1. On the same domain
+         2. Not a static asset (css, js, images, fonts)
+         3. Not a utility link (mailto, javascript, anchor)
+         """
+         if not href:
+             return False
+
+         # Skip anchors, mailto, javascript
+         if href.startswith(("#", "mailto:", "javascript:", "tel:")):
+             return False
+
+         # Skip static assets
+         static_extensions = (
+             ".css", ".js", ".json", ".xml", ".rss", ".atom",
+             ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
+             ".woff", ".woff2", ".ttf", ".eot", ".otf",
+             ".pdf", ".zip", ".tar", ".gz",
+             ".mp3", ".mp4", ".webm", ".ogg",
+         )
+         if any(href.lower().endswith(ext) for ext in static_extensions):
+             return False
+
+         # Check if it's an external link
+         if href.startswith(("http://", "https://")):
+             parsed = urlparse(href)
+             if parsed.netloc != base_domain:
+                 return False
+
+         # Skip the base URL itself (index page)
+         path = urlparse(href).path if href.startswith("http") else href
+         if path in ("", "/", "/index.html", "/index.htm"):
+             return False
+
+         return True
+
+     async def _crawl_posts(
+         self, crawler: AsyncWebCrawler, urls: list[str]
+     ) -> list[Post]:
+         """Crawl all post URLs concurrently with semaphore."""
+
+         async def crawl_one(url: str) -> Post | None:
+             async with self.semaphore:
+                 return await self._crawl_single_post(crawler, url)
+
+         tasks = [crawl_one(url) for url in urls]
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         posts = []
+         for i, result in enumerate(results):
+             if isinstance(result, Post):
+                 posts.append(result)
+             elif isinstance(result, Exception) and self.verbose:
+                 console.print(f"[yellow]Failed to crawl {urls[i]}: {result}[/yellow]")
+
+         return posts
+
+     async def _crawl_single_post(
+         self, crawler: AsyncWebCrawler, url: str
+     ) -> Post | None:
+         """Crawl and parse a single post."""
+         try:
+             result = await crawler.arun(
+                 url=url,
+                 config=CrawlerRunConfig(wait_until="domcontentloaded"),
+             )
+
+             if not result.success:
+                 return None
+
+             # Extract metadata from HTML
+             metadata = self._extract_metadata(result.html, url)
+
+             # Use markdown for clean text (TF-IDF)
+             content_text = result.markdown or ""
+
+             # Calculate reading time (~200 words/min)
+             word_count = len(content_text.split())
+             reading_time = max(1, word_count // 200)
+
+             # Extract slug from URL
+             slug = self._extract_slug(url)
+
+             return Post(
+                 url=url,
+                 slug=slug,
+                 metadata=metadata,
+                 content_text=content_text,
+                 content_html=result.html or "",
+                 reading_time_minutes=reading_time,
+             )
+         except Exception as e:
+             if self.verbose:
+                 console.print(f"[yellow]Error parsing {url}: {e}[/yellow]")
+             return None
+
+     def _extract_metadata(self, html: str, url: str) -> PostMetadata:
+         """Extract metadata from rendered HTML using regex (works with various blog themes)."""
+         # Try multiple patterns for title
+         title = "Untitled"
+         title_patterns = [
+             # Jekyll Clean Blog theme
+             r'<h1[^>]*class="[^"]*post-title[^"]*"[^>]*>([^<]+)</h1>',
+             # WordPress/common patterns
+             r'<h1[^>]*class="[^"]*entry-title[^"]*"[^>]*>([^<]+)</h1>',
+             r'<h1[^>]*class="[^"]*article-title[^"]*"[^>]*>([^<]+)</h1>',
+             r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)</h1>',
+             # Meta og:title
+             r'<meta[^>]*property="og:title"[^>]*content="([^"]+)"',
+             r'<meta[^>]*name="title"[^>]*content="([^"]+)"',
+             # Generic h1
+             r"<h1[^>]*>([^<]+)</h1>",
+             # Title tag fallback
+             r"<title>([^<|]+)",
+         ]
+         for pattern in title_patterns:
+             match = re.search(pattern, html, re.IGNORECASE)
+             if match:
+                 title = re.sub(r"<[^>]+>", "", match.group(1)).strip()
+                 if title:
+                     break
+
+         # Try multiple patterns for date; use a None sentinel so the URL fallback
+         # actually triggers (comparing against a fresh datetime.now() never matches)
+         date: datetime | None = None
+         date_patterns = [
+             # Various date formats
+             (r"Posted on (\w+ \d+, \d{4})", "%B %d, %Y"),
+             (r'datetime="(\d{4}-\d{2}-\d{2})', "%Y-%m-%d"),
+             (r"(\d{4}-\d{2}-\d{2})", "%Y-%m-%d"),
+             (r"(\w+ \d{1,2}, \d{4})", "%B %d, %Y"),
+             (r"(\d{1,2} \w+ \d{4})", "%d %B %Y"),
+             (r"<time[^>]*>([^<]+)</time>", None),  # Will try multiple formats
+         ]
+         for pattern, fmt in date_patterns:
+             match = re.search(pattern, html, re.IGNORECASE)
+             if match:
+                 date_str = match.group(1).strip()
+                 if fmt:
+                     try:
+                         date = datetime.strptime(date_str, fmt)
+                         break
+                     except ValueError:
+                         continue
+                 else:
+                     # Try common formats
+                     for try_fmt in ["%B %d, %Y", "%Y-%m-%d", "%d %B %Y", "%b %d, %Y"]:
+                         try:
+                             date = datetime.strptime(date_str, try_fmt)
+                             break
+                         except ValueError:
+                             continue
+
+         # Extract date from URL if not found in HTML
+         if date is None:
+             url_date = re.search(r"(20\d{2})[/-](\d{1,2})[/-](\d{1,2})", url)
+             if url_date:
+                 try:
+                     date = datetime(int(url_date.group(1)), int(url_date.group(2)), int(url_date.group(3)))
+                 except ValueError:
+                     pass
+
+         # Tags from various patterns
+         tags = []
+         tag_patterns = [
+             r'<span[^>]*class="[^"]*badge[^"]*"[^>]*>([^<]+)</span>',
+             r'<a[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</a>',
+             r'rel="tag"[^>]*>([^<]+)</a>',
+             r'<span[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</span>',
+         ]
+         for pattern in tag_patterns:
+             found = re.findall(pattern, html, re.IGNORECASE)
+             tags.extend([t.strip() for t in found if t.strip()])
+         tags = list(set(tags))[:10]  # Dedupe and limit
+
+         # Category from URL
+         path = urlparse(url).path
+         parts = [p for p in path.strip("/").split("/") if p and not re.match(r"^\d+$", p)]
+         # Skip date-like parts and get first meaningful segment
+         categories = []
+         for part in parts[:-1]:  # Exclude last part (the slug)
+             if not re.match(r"^20\d{2}$", part) and part not in ["blog", "posts", "articles"]:
+                 categories.append(part)
+                 break
+
+         # Header image from various patterns
+         header_img = None
+         img_patterns = [
+             r'class="[^"]*intro-header[^"]*"[^>]*style="[^"]*url\([\'"]?([^\'")\s]+)',
+             r'class="[^"]*featured[^"]*"[^>]*src="([^"]+)"',
+             r'<meta[^>]*property="og:image"[^>]*content="([^"]+)"',
+             r'class="[^"]*post-image[^"]*"[^>]*src="([^"]+)"',
+             r'class="[^"]*hero[^"]*"[^>]*src="([^"]+)"',
+         ]
+         for pattern in img_patterns:
+             match = re.search(pattern, html, re.IGNORECASE)
+             if match:
+                 header_img = match.group(1)
+                 break
+
+         # Subtitle/description from various patterns
+         subtitle = None
+         subtitle_patterns = [
+             r'<span[^>]*class="[^"]*subheading[^"]*"[^>]*>([^<]+)</span>',
+             r'<p[^>]*class="[^"]*subtitle[^"]*"[^>]*>([^<]+)</p>',
+             r'<meta[^>]*name="description"[^>]*content="([^"]+)"',
+             r'<meta[^>]*property="og:description"[^>]*content="([^"]+)"',
+         ]
+         for pattern in subtitle_patterns:
+             match = re.search(pattern, html, re.IGNORECASE)
+             if match:
+                 subtitle = match.group(1).strip()
+                 if len(subtitle) > 200:
+                     subtitle = subtitle[:197] + "..."
+                 break
+
+         return PostMetadata(
+             title=title,
+             date=date or datetime.now(),
+             categories=categories,
+             tags=tags,
+             subtitle=subtitle,
+             header_img=header_img,
+         )
+
+     def _extract_slug(self, url: str) -> str:
+         """Extract slug from post URL."""
+         path = urlparse(url).path
+         # Remove trailing slash and get last meaningful part
+         path = path.rstrip("/")
+         if not path:
+             return "index"
+         slug = path.rsplit("/", 1)[-1]
+         # Remove common extensions
+         for ext in (".html", ".htm", ".php", ".aspx"):
+             if slug.endswith(ext):
+                 slug = slug[: -len(ext)]
+                 break
+         return slug or "page"
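
Editor's note: a quick usage sketch for the crawler on its own, with the depth semantics documented above (depth 1 follows only links found on the seed page); the target URL is a placeholder:

import asyncio

from tiktokify.crawler import SpiderCrawler


async def demo() -> None:
    crawler = SpiderCrawler("https://example.com", max_concurrent=5, max_depth=2, verbose=True)
    posts = await crawler.crawl()
    for post in posts[:5]:
        # Each Post carries extracted metadata plus a reading-time estimate.
        print(post.slug, post.metadata.title, f"~{post.reading_time_minutes} min")


asyncio.run(demo())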
src/tiktokify/enrichment/__init__.py ADDED
@@ -0,0 +1,20 @@
+ """Enrichment module for LLM-based post enrichment."""
+
+ from .base import ContentProvider, ExternalContent
+ from .llm_enricher import PostEnricher
+ from .providers import (
+     HackerNewsProvider,
+     HNFrontPageProvider,
+     LinkedContentProvider,
+     WikipediaProvider,
+ )
+
+ __all__ = [
+     "PostEnricher",
+     "ContentProvider",
+     "ExternalContent",
+     "WikipediaProvider",
+     "HackerNewsProvider",
+     "HNFrontPageProvider",
+     "LinkedContentProvider",
+ ]
src/tiktokify/enrichment/base.py ADDED
@@ -0,0 +1,66 @@
+ """Base classes for content providers."""
+
+ from abc import ABC, abstractmethod
+ from typing import Optional
+
+ from pydantic import BaseModel, Field, HttpUrl
+
+ from tiktokify.models import Post
+
+
+ class ExternalContent(BaseModel):
+     """A piece of external content from any source."""
+
+     source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
+     title: str
+     url: HttpUrl
+     description: str = Field(default="", description="Brief description or excerpt")
+     relevance: str = Field(default="", description="Why this is relevant to the post")
+     metadata: dict = Field(default_factory=dict, description="Source-specific metadata")
+
+
+ class ContentProvider(ABC):
+     """Abstract base class for external content providers.
+
+     To add a new source:
+     1. Create a new file (e.g., hackernews.py)
+     2. Subclass ContentProvider
+     3. Implement source_type and fetch_for_post
+     4. Register in enricher.py
+     """
+
+     def __init__(self, max_items: int = 3, verbose: bool = False):
+         self.max_items = max_items
+         self.verbose = verbose
+
+     @property
+     @abstractmethod
+     def source_type(self) -> str:
+         """Unique identifier for this source (e.g., 'wikipedia', 'hackernews')."""
+         pass
+
+     @abstractmethod
+     async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+         """Fetch relevant external content for a blog post.
+
+         Args:
+             post: The blog post to find related content for
+
+         Returns:
+             List of ExternalContent items (up to max_items)
+         """
+         pass
+
+     async def fetch_for_posts(self, posts: list[Post]) -> dict[str, list[ExternalContent]]:
+         """Fetch content for multiple posts.
+
+         Default implementation calls fetch_for_post sequentially.
+         Override for batch optimization.
+
+         Returns:
+             Dict mapping post slug to list of ExternalContent
+         """
+         results = {}
+         for post in posts:
+             results[post.slug] = await self.fetch_for_post(post)
+         return results
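
Editor's note: following the four steps in the ContentProvider docstring, a minimal custom provider might look like the sketch below; ExampleProvider and its single hard-coded item are hypothetical, not part of this commit:

from tiktokify.enrichment.base import ContentProvider, ExternalContent
from tiktokify.models import Post


class ExampleProvider(ContentProvider):
    """Toy provider returning one static item per post (hypothetical)."""

    @property
    def source_type(self) -> str:
        return "example"

    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
        items = [
            ExternalContent(
                source=self.source_type,
                title=f"More on {post.metadata.title}",
                url="https://example.com/related",  # placeholder URL
                description="Placeholder related reading.",
                relevance="Demonstrates the provider contract",
            )
        ]
        # Respect the max_items limit from the base class.
        return items[: self.max_items]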
src/tiktokify/enrichment/llm_enricher.py ADDED
@@ -0,0 +1,180 @@
+ """LLM-based post enrichment using litellm.
+
+ This module uses LLM to:
+ 1. Generate key points/takeaways for each post
+ 2. Suggest relevant Wikipedia articles
+
+ The actual Wikipedia extract fetching is done by providers/wikipedia.py
+ """
+
+ import asyncio
+ import json
+
+ import litellm
+ from pydantic import ValidationError
+ from rich.console import Console
+
+ from tiktokify.models import Post, WikipediaSuggestion
+
+ console = Console()
+
+ # Disable litellm's verbose logging
+ litellm.suppress_debug_info = True
+
+
+ class PostEnricher:
+     """Enrich posts with key points and Wikipedia suggestions using LLM."""
+
+     def __init__(
+         self,
+         model: str = "gpt-4o-mini",
+         max_key_points: int = 5,
+         max_wikipedia: int = 3,
+         max_concurrent: int = 3,
+         verbose: bool = False,
+     ):
+         self.model = model
+         self.max_key_points = max_key_points
+         self.max_wikipedia = max_wikipedia
+         self.max_concurrent = max_concurrent
+         self.verbose = verbose
+         self.semaphore = asyncio.Semaphore(max_concurrent)
+
+     async def enrich_post(self, post: Post) -> None:
+         """Enrich a single post with key points and Wikipedia suggestions."""
+         prompt = self._build_prompt(post)
+
+         try:
+             # Calculate tokens needed: ~50 tokens per key point, ~80 per wiki suggestion
+             estimated_tokens = (self.max_key_points * 50) + (self.max_wikipedia * 100) + 200
+             max_tokens = max(1000, min(estimated_tokens, 4000))
+
+             response = await litellm.acompletion(
+                 model=self.model,
+                 messages=[{"role": "user", "content": prompt}],
+                 temperature=0.3,
+                 max_tokens=max_tokens,
+             )
+
+             content = response.choices[0].message.content
+             key_points, wikipedia = self._parse_response(content)
+
+             # Fetch Wikipedia extracts for each suggestion
+             wikipedia_with_extracts = await self._fetch_wiki_extracts(wikipedia)
+
+             post.key_points = key_points
+             post.wikipedia_suggestions = wikipedia_with_extracts
+
+         except Exception as e:
+             if self.verbose:
+                 console.print(
+                     f"[yellow]Warning: LLM call failed for {post.slug}: {e}[/yellow]"
+                 )
+
+     async def _fetch_wiki_extracts(
+         self, suggestions: list[WikipediaSuggestion]
+     ) -> list[WikipediaSuggestion]:
+         """Fetch Wikipedia extracts for all suggestions concurrently."""
+         from tiktokify.enrichment.providers.wikipedia import WikipediaProvider
+
+         provider = WikipediaProvider(max_items=len(suggestions), verbose=self.verbose)
+
+         async def fetch_one(suggestion: WikipediaSuggestion) -> WikipediaSuggestion:
+             extract = await provider._fetch_extract(
+                 provider._extract_title_from_url(str(suggestion.url)) or suggestion.title
+             )
+             return WikipediaSuggestion(
+                 title=suggestion.title,
+                 url=suggestion.url,
+                 relevance=suggestion.relevance,
+                 extract=extract,
+             )
+
+         tasks = [fetch_one(s) for s in suggestions]
+         results = await asyncio.gather(*tasks, return_exceptions=True)
+
+         return [r for r in results if isinstance(r, WikipediaSuggestion)]
+
+     def _build_prompt(self, post: Post) -> str:
+         """Build LLM prompt for key points and Wikipedia suggestions."""
+         content_excerpt = post.content_text[:2000] if post.content_text else ""
+
+         return f"""Analyze this blog post and provide:
+ 1. {self.max_key_points} key points/takeaways (concise bullet points)
+ 2. {self.max_wikipedia} relevant Wikipedia articles for further reading
+
+ Title: {post.metadata.title}
+ Subtitle: {post.metadata.subtitle or "N/A"}
+ Categories: {', '.join(post.metadata.categories)}
+ Tags: {', '.join(post.metadata.tags)}
+
+ Content:
+ {content_excerpt}
+
+ Return ONLY valid JSON with this exact structure:
+ {{
+   "keyPoints": ["point 1", "point 2", ...],
+   "wikipedia": [
+     {{"title": "Article Title", "url": "https://en.wikipedia.org/wiki/...", "relevance": "Why it's relevant"}}
+   ]
+ }}
+
+ Guidelines:
+ - Key points should be insightful takeaways, not just summaries
+ - Each key point should be 1-2 sentences max
+ - Wikipedia URLs must be valid (use underscores for spaces)
+ - Return ONLY the JSON, no markdown formatting"""
+
+     def _parse_response(self, content: str) -> tuple[list[str], list[WikipediaSuggestion]]:
+         """Parse LLM response into key points and Wikipedia suggestions."""
+         # Clean up response - remove markdown code blocks if present
+         content = content.strip()
+         if content.startswith("```"):
+             lines = content.split("\n")
+             content = "\n".join(
+                 line for line in lines if not line.startswith("```")
+             )
+
+         key_points: list[str] = []
+         wikipedia: list[WikipediaSuggestion] = []
+
+         try:
+             data = json.loads(content)
+
+             # Parse key points
+             if "keyPoints" in data and isinstance(data["keyPoints"], list):
+                 key_points = [str(p) for p in data["keyPoints"] if p]
+
+             # Parse Wikipedia suggestions
+             if "wikipedia" in data and isinstance(data["wikipedia"], list):
+                 for item in data["wikipedia"]:
+                     try:
+                         suggestion = WikipediaSuggestion(
+                             title=item.get("title", ""),
+                             url=item.get("url", ""),
+                             relevance=item.get("relevance", ""),
+                             extract="",  # Will be filled later
+                         )
+                         wikipedia.append(suggestion)
+                     except ValidationError:
+                         continue
+
+         except json.JSONDecodeError as e:
+             if self.verbose:
+                 console.print(f"[yellow]JSON parse error: {e}[/yellow]")
+
+         return key_points, wikipedia
+
+     async def enrich_posts(self, posts: list[Post]) -> None:
+         """Enrich all posts concurrently."""
+
+         async def enrich_one(post: Post) -> None:
+             async with self.semaphore:
+                 await self.enrich_post(post)
+
+         tasks = [enrich_one(post) for post in posts]
+         await asyncio.gather(*tasks, return_exceptions=True)
+
+
+ # Backwards compatibility alias
+ WikipediaSuggester = PostEnricher
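
Editor's note: the JSON contract in _build_prompt is the whole interface between model and parser. A sketch of what _parse_response accepts, calling the private method directly purely for illustration (the sample payload and the WikipediaSuggestion field shapes are assumptions based on this commit):

from tiktokify.enrichment import PostEnricher

sample = """{
  "keyPoints": ["Takeaway one.", "Takeaway two."],
  "wikipedia": [
    {"title": "Tf-idf", "url": "https://en.wikipedia.org/wiki/Tf%E2%80%93idf",
     "relevance": "Basis of the content similarity"}
  ]
}"""

enricher = PostEnricher(model="gpt-4o-mini")
key_points, wiki = enricher._parse_response(sample)
assert key_points == ["Takeaway one.", "Takeaway two."]
assert wiki[0].title == "Tf-idf"  # extract is "" until _fetch_wiki_extracts runs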
src/tiktokify/enrichment/providers/__init__.py ADDED
@@ -0,0 +1,12 @@
+ """Content providers for external sources."""
+
+ from .hackernews import HackerNewsProvider, HNFrontPageProvider
+ from .links import LinkedContentProvider
+ from .wikipedia import WikipediaProvider
+
+ __all__ = [
+     "WikipediaProvider",
+     "HackerNewsProvider",
+     "HNFrontPageProvider",
+     "LinkedContentProvider",
+ ]
src/tiktokify/enrichment/providers/hackernews.py ADDED
@@ -0,0 +1,242 @@
+ """Hacker News content providers.
+
+ Provides two providers:
+ - HackerNewsProvider: Keyword-based search for stories related to post topics
+ - HNFrontPageProvider: Current front page stories for general interest
+ """
+
+ import asyncio
+ import re
+
+ import httpx
+
+ from tiktokify.enrichment.base import ContentProvider, ExternalContent
+ from tiktokify.models import Post
+
+
+ async def fetch_article_excerpt(url: str, max_chars: int = 800) -> str:
+     """Fetch and extract text excerpt from an article URL."""
+     if not url:
+         return ""
+
+     try:
+         async with httpx.AsyncClient() as client:
+             response = await client.get(
+                 url,
+                 headers={
+                     "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
+                     "Accept": "text/html,application/xhtml+xml",
+                 },
+                 timeout=10.0,
+                 follow_redirects=True,
+             )
+
+         if response.status_code != 200:
+             return ""
+
+         html = response.text
+
+         # Remove script, style, nav, header, footer tags
+         html = re.sub(r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+
+         # Extract text from paragraph tags (most content is in <p>)
+         paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", html, flags=re.DOTALL | re.IGNORECASE)
+
+         # Clean HTML tags from extracted text
+         text_parts = []
+         for p in paragraphs:
+             clean = re.sub(r"<[^>]+>", " ", p)
+             clean = re.sub(r"\s+", " ", clean).strip()
+             if len(clean) > 50:  # Skip very short paragraphs
+                 text_parts.append(clean)
+
+         if not text_parts:
+             # Fallback: extract all text
+             text = re.sub(r"<[^>]+>", " ", html)
+             text = re.sub(r"\s+", " ", text).strip()
+             return text[:max_chars] + "..." if len(text) > max_chars else text
+
+         excerpt = " ".join(text_parts)
+         if len(excerpt) > max_chars:
+             excerpt = excerpt[:max_chars].rsplit(" ", 1)[0] + "..."
+         return excerpt
+
+     except Exception:
+         return ""
+
+
+ class HackerNewsProvider(ContentProvider):
+     """Fetch relevant Hacker News discussions for blog posts.
+
+     Uses the Algolia HN Search API to find related stories by keyword.
+     """
+
+     HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search"
+
+     @property
+     def source_type(self) -> str:
+         return "hackernews"
+
+     async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+         """Search HN for stories related to the post's topics."""
+         # Build search query from post metadata
+         query_parts = []
+
+         # Use tags (most specific)
+         if post.metadata.tags:
+             query_parts.extend(post.metadata.tags[:3])
+
+         # Add key terms from title
+         title_words = [
+             w for w in post.metadata.title.split()
+             if len(w) > 4 and w.lower() not in {"about", "using", "with", "from", "that", "this", "what", "when", "where", "which"}
+         ]
+         query_parts.extend(title_words[:2])
+
+         if not query_parts:
+             return []
+
+         query = " ".join(query_parts)
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(
+                     self.HN_SEARCH_URL,
+                     params={
+                         "query": query,
+                         "tags": "story",
+                         "hitsPerPage": self.max_items * 2,  # Fetch extra for filtering
+                     },
+                     timeout=10.0,
+                 )
+
+             if response.status_code != 200:
+                 return []
+
+             data = response.json()
+             hits = data.get("hits", [])
+
+             # Prepare hits for parallel fetching
+             selected_hits = hits[: self.max_items]
+             story_urls = [hit.get("url", "") for hit in selected_hits]
+
+             # Fetch all article excerpts in parallel
+             excerpts = await asyncio.gather(
+                 *[fetch_article_excerpt(url) for url in story_urls],
+                 return_exceptions=True,
+             )
+
+             results = []
+             for hit, excerpt in zip(selected_hits, excerpts):
+                 story_id = hit.get("objectID", "")
+                 hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                 title = hit.get("title", "")
+                 points = hit.get("points", 0)
+                 num_comments = hit.get("num_comments", 0)
+                 author = hit.get("author", "")
+                 story_url = hit.get("url", "")
+
+                 # Handle exceptions from parallel fetch
+                 if isinstance(excerpt, Exception) or not excerpt:
+                     excerpt = f"{points} points · {num_comments} comments"
+
+                 results.append(
+                     ExternalContent(
+                         source=self.source_type,
+                         title=title,
+                         url=hn_url,
+                         description=excerpt,
+                         relevance=f"Found via search: {query}",
+                         metadata={
+                             "points": points,
+                             "num_comments": num_comments,
+                             "author": author,
+                             "story_url": story_url,
+                         },
+                     )
+                 )
+
+             return results
+
+         except Exception:
+             return []
+
+
+ class HNFrontPageProvider(ContentProvider):
+     """Fetch current Hacker News front page stories.
+
+     Uses the Algolia HN API to get stories currently on the front page.
+     Good for adding general tech interest content to any blog.
+     """
+
+     HN_FRONT_PAGE_URL = "https://hn.algolia.com/api/v1/search"
+
+     @property
+     def source_type(self) -> str:
+         return "hn-frontpage"
+
+     async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+         """Fetch current front page stories (post-independent)."""
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(
+                     self.HN_FRONT_PAGE_URL,
+                     params={
+                         "tags": "front_page",
+                         "hitsPerPage": self.max_items,
+                     },
+                     timeout=10.0,
+                 )
+
+             if response.status_code != 200:
+                 return []
+
+             data = response.json()
+             hits = data.get("hits", [])
+
+             # Prepare hits for parallel fetching
+             selected_hits = hits[: self.max_items]
+             story_urls = [hit.get("url", "") for hit in selected_hits]
+
+             # Fetch all article excerpts in parallel
+             excerpts = await asyncio.gather(
+                 *[fetch_article_excerpt(url) for url in story_urls],
+                 return_exceptions=True,
+             )
+
+             results = []
+             for hit, excerpt in zip(selected_hits, excerpts):
+                 story_id = hit.get("objectID", "")
+                 hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                 title = hit.get("title", "")
+                 points = hit.get("points", 0)
+                 num_comments = hit.get("num_comments", 0)
+                 author = hit.get("author", "")
+                 story_url = hit.get("url", "")
+
+                 # Handle exceptions from parallel fetch
+                 if isinstance(excerpt, Exception) or not excerpt:
+                     excerpt = f"{points} points · {num_comments} comments"
+
+                 results.append(
+                     ExternalContent(
+                         source=self.source_type,
+                         title=title,
+                         url=hn_url,
+                         description=excerpt,
+                         relevance="Currently on HN front page",
+                         metadata={
+                             "points": points,
+                             "num_comments": num_comments,
+                             "author": author,
+                             "story_url": story_url,
+                         },
+                     )
+                 )
+
+             return results
+
+         except Exception:
+             return []
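
Editor's note: the Algolia call both providers wrap can be exercised standalone with the same parameters the code above uses; a minimal sketch (query string illustrative):

import asyncio

import httpx


async def search_hn(query: str, n: int = 3) -> list[dict]:
    async with httpx.AsyncClient() as client:
        response = await client.get(
            "https://hn.algolia.com/api/v1/search",
            params={"query": query, "tags": "story", "hitsPerPage": n},
            timeout=10.0,
        )
        response.raise_for_status()
        return response.json().get("hits", [])


for hit in asyncio.run(search_hn("recommender systems")):
    print(hit.get("points"), hit.get("title"))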
src/tiktokify/enrichment/providers/links.py ADDED
@@ -0,0 +1,210 @@
+ """Link extractor provider for crawling external links from blog posts."""
+
+ import asyncio
+ import re
+ from urllib.parse import urljoin, urlparse
+
+ import httpx
+
+ from tiktokify.enrichment.base import ContentProvider, ExternalContent
+ from tiktokify.models import Post
+
+
+ async def fetch_link_metadata(url: str, max_excerpt_chars: int = 600) -> tuple[str, str]:
+     """Fetch title and excerpt from a URL.
+
+     Returns (title, excerpt) tuple.
+     """
+     try:
+         async with httpx.AsyncClient() as client:
+             response = await client.get(
+                 url,
+                 headers={
+                     "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
+                     "Accept": "text/html,application/xhtml+xml",
+                 },
+                 timeout=10.0,
+                 follow_redirects=True,
+             )
+
+         if response.status_code != 200:
+             return "", ""
+
+         html = response.text
+
+         # Extract title
+         title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
+         title = ""
+         if title_match:
+             title = re.sub(r"<[^>]+>", "", title_match.group(1))
+             title = re.sub(r"\s+", " ", title).strip()
+
+         # Try meta description first
+         meta_desc = re.search(
+             r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
+             html,
+             re.IGNORECASE,
+         )
+         if meta_desc:
+             excerpt = meta_desc.group(1).strip()
+             return title, excerpt
+
+         # Remove script, style, nav, header, footer tags
+         clean_html = re.sub(
+             r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>",
+             "",
+             html,
+             flags=re.DOTALL | re.IGNORECASE,
+         )
+
+         # Extract text from paragraph tags
+         paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", clean_html, flags=re.DOTALL | re.IGNORECASE)
+
+         text_parts = []
+         for p in paragraphs:
+             clean = re.sub(r"<[^>]+>", " ", p)
+             clean = re.sub(r"\s+", " ", clean).strip()
+             if len(clean) > 50:
+                 text_parts.append(clean)
+
+         excerpt = " ".join(text_parts)
+         if len(excerpt) > max_excerpt_chars:
+             excerpt = excerpt[:max_excerpt_chars].rsplit(" ", 1)[0] + "..."
+
+         return title, excerpt
+
+     except Exception:
+         return "", ""
+
+
+ class LinkedContentProvider(ContentProvider):
+     """Extract and crawl external links from blog post content.
+
+     Finds links within the blog post HTML and fetches their content,
+     creating a "spider" of related content from the post's references.
+     """
+
+     # Domains to skip (social media, generic sites, etc.)
+     SKIP_DOMAINS = {
+         "twitter.com",
+         "x.com",
+         "facebook.com",
+         "instagram.com",
+         "linkedin.com",
+         "youtube.com",
+         "youtu.be",
+         "github.com",
+         "gist.github.com",
+         "reddit.com",
+         "news.ycombinator.com",
+         "google.com",
+         "amazon.com",
+         "wikipedia.org",  # Already have Wikipedia provider
+         "fonts.googleapis.com",
+         "cdn.jsdelivr.net",
+         "unpkg.com",
+         "cloudflare.com",
+     }
+
+     @property
+     def source_type(self) -> str:
+         return "linked"
+
+     def _extract_links(self, html: str, base_url: str) -> list[str]:
+         """Extract external links from HTML content."""
+         # Find all href links
+         links = re.findall(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE)
+
+         parsed_base = urlparse(base_url)
+         base_domain = parsed_base.netloc.lower()
+
+         external_links = []
+         seen = set()
+
+         for link in links:
+             # Skip anchor links, mailto, javascript, etc.
+             if link.startswith(("#", "mailto:", "javascript:", "tel:")):
+                 continue
+
+             # Resolve relative URLs
+             if link.startswith("/"):
+                 link = urljoin(base_url, link)
+             elif not link.startswith(("http://", "https://")):
+                 continue
+
+             # Parse and validate
+             parsed = urlparse(link)
+             domain = parsed.netloc.lower()
+
+             # Skip internal links
+             if domain == base_domain or domain.endswith(f".{base_domain}"):
+                 continue
+
+             # Skip blocked domains
+             if any(skip in domain for skip in self.SKIP_DOMAINS):
+                 continue
+
+             # Skip non-http(s) links
+             if parsed.scheme not in ("http", "https"):
+                 continue
+
+             # Skip duplicates
+             normalized = f"{parsed.scheme}://{domain}{parsed.path}"
+             if normalized in seen:
+                 continue
+             seen.add(normalized)
+
+             external_links.append(link)
+
+         return external_links
+
+     async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+         """Extract links from post content and fetch their metadata."""
+         if not post.content_html:
+             return []
+
+         # Extract external links
+         links = self._extract_links(post.content_html, post.url)
+
+         if not links:
+             return []
+
+         # Limit to max_items and fetch all in parallel
+         selected_links = links[: self.max_items]
+
+         metadata_results = await asyncio.gather(
+             *[fetch_link_metadata(link) for link in selected_links],
+             return_exceptions=True,
+         )
+
+         results = []
+         for link, meta in zip(selected_links, metadata_results):
+             # Handle exceptions
+             if isinstance(meta, Exception):
+                 continue
+
+             title, excerpt = meta
+
+             if not title and not excerpt:
+                 continue
+
+             # Use URL domain as fallback title
+             if not title:
+                 parsed = urlparse(link)
+                 title = parsed.netloc
+
+             results.append(
+                 ExternalContent(
+                     source=self.source_type,
+                     title=title,
+                     url=link,
+                     description=excerpt,
+                     relevance=f"Referenced in: {post.metadata.title}",
+                     metadata={
+                         "source_post_slug": post.slug,
+                         "link_type": "reference",
+                     },
+                 )
+             )
+
+         return results
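
Editor's note: the domain filter is easiest to see on a toy snippet. A sketch calling the private _extract_links helper directly (all domains invented): the relative link resolves to the blog's own domain and is dropped as internal, twitter.com is dropped via SKIP_DOMAINS, and only the third link survives.

from tiktokify.enrichment.providers.links import LinkedContentProvider

provider = LinkedContentProvider(max_items=3)
html = (
    '<a href="/about">internal</a> '
    '<a href="https://twitter.com/someone">skipped</a> '
    '<a href="https://papers.example.org/p1">external</a>'
)
links = provider._extract_links(html, "https://myblog.example.com/post")
print(links)  # ['https://papers.example.org/p1']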
src/tiktokify/enrichment/providers/wikipedia.py ADDED
@@ -0,0 +1,78 @@
+ """Wikipedia content provider."""
+
+ from urllib.parse import unquote, urlparse
+
+ import httpx
+
+ from tiktokify.enrichment.base import ContentProvider, ExternalContent
+ from tiktokify.models import Post
+
+
+ class WikipediaProvider(ContentProvider):
+     """Fetch relevant Wikipedia articles for blog posts.
+
+     Uses Wikipedia REST API to fetch article summaries.
+     Requires LLM to first suggest relevant articles (see PostEnricher).
+     """
+
+     @property
+     def source_type(self) -> str:
+         return "wikipedia"
+
+     async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+         """Fetch Wikipedia extracts for pre-suggested articles."""
+         results = []
+
+         for suggestion in post.wikipedia_suggestions[: self.max_items]:
+             title = self._extract_title_from_url(str(suggestion.url)) or suggestion.title
+             extract = await self._fetch_extract(title)
+
+             results.append(
+                 ExternalContent(
+                     source=self.source_type,
+                     title=suggestion.title,
+                     url=suggestion.url,
+                     description=extract,
+                     relevance=suggestion.relevance,
+                     metadata={"extract": extract},
+                 )
+             )
+
+         return results
+
+     async def _fetch_extract(self, title: str, max_chars: int = 1500) -> str:
+         """Fetch article extract from Wikipedia API."""
+         title = title.strip()
+         url = "https://en.wikipedia.org/api/rest_v1/page/summary/" + title.replace(" ", "_")
+
+         try:
+             async with httpx.AsyncClient() as client:
+                 response = await client.get(
+                     url,
+                     headers={"User-Agent": "TikTokify/1.0"},
+                     timeout=10.0,
+                     follow_redirects=True,
+                 )
+
+                 if response.status_code == 200:
+                     data = response.json()
+                     extract = data.get("extract", "")
+                     if len(extract) > max_chars:
+                         extract = extract[:max_chars].rsplit(" ", 1)[0] + "..."
+                     return extract
+         except Exception:
+             pass
+
+         return ""
+
+     def _extract_title_from_url(self, url: str) -> str:
+         """Extract Wikipedia article title from URL."""
+         parsed = urlparse(url)
+         if "wikipedia.org" in parsed.netloc:
+             path = parsed.path
+             if path.startswith("/wiki/"):
+                 title = path[6:]
+                 title = unquote(title)
+                 title = title.replace("_", " ")
+                 return title
+         return ""
src/tiktokify/generator/__init__.py ADDED
@@ -0,0 +1,5 @@
+ """HTML generator module."""
+
+ from .html_generator import HTMLGenerator
+
+ __all__ = ["HTMLGenerator"]
src/tiktokify/generator/html_generator.py ADDED
@@ -0,0 +1,52 @@
+ """HTML generator for TikTok-style swipe UI."""
+
+ import json
+ from pathlib import Path
+
+ from jinja2 import Environment, FileSystemLoader
+
+ from tiktokify.models import RecommendationGraph
+
+
+ class HTMLGenerator:
+     """Generate standalone HTML with embedded data and swipe UI."""
+
+     def __init__(self, template_dir: Path | None = None):
+         if template_dir is None:
+             template_dir = Path(__file__).parent / "templates"
+
+         self.env = Environment(
+             loader=FileSystemLoader(template_dir),
+             autoescape=True,
+         )
+
+     def generate(
+         self,
+         graph: RecommendationGraph,
+         base_url: str,
+         output_path: Path,
+     ) -> None:
+         """Generate HTML file with embedded recommendation data."""
+         template = self.env.get_template("swipe.html.jinja2")
+
+         # Prepare data for embedding
+         graph_data = graph.to_json_for_embed()
+         graph_json = json.dumps(graph_data, indent=2)
+
+         # Sort posts by date for initial list
+         sorted_posts = sorted(
+             graph.posts.values(),
+             key=lambda p: p.metadata.date,
+             reverse=True,
+         )
+         post_slugs = [p.slug for p in sorted_posts]
+
+         html = template.render(
+             base_url=base_url.rstrip("/"),
+             graph_json=graph_json,
+             post_slugs_json=json.dumps(post_slugs),
+             post_count=len(sorted_posts),
+         )
+
+         output_path.parent.mkdir(parents=True, exist_ok=True)
+         output_path.write_text(html)
src/tiktokify/generator/templates/swipe.html.jinja2 ADDED
@@ -0,0 +1,1028 @@
+ <!DOCTYPE html>
+ <html lang="en">
+ <head>
+     <meta charset="UTF-8">
+     <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
+     <title>TikTokify - Swipe to Discover</title>
+     <style>
+         * {
+             margin: 0;
+             padding: 0;
+             box-sizing: border-box;
+         }
+
+         html, body {
+             height: 100%;
+             overflow: hidden;
+             font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
+             background: #000;
+             color: #fff;
+         }
+
+         .swipe-container {
+             height: 100vh;
+             overflow-y: scroll;
+             scroll-snap-type: y mandatory;
+             -webkit-overflow-scrolling: touch;
+         }
+
+         .card {
+             height: 100vh;
+             scroll-snap-align: start;
+             display: flex;
+             flex-direction: column;
+             position: relative;
+             background-size: cover;
+             background-position: center;
+         }
+
+         .card-overlay {
+             position: absolute;
+             inset: 0;
+             background: linear-gradient(
+                 to bottom,
+                 rgba(0,0,0,0.3) 0%,
+                 rgba(0,0,0,0.1) 30%,
+                 rgba(0,0,0,0.6) 60%,
+                 rgba(0,0,0,0.95) 100%
+             );
+         }
+
+         .card-content {
+             position: relative;
+             z-index: 1;
+             height: 100%;
+             display: flex;
+             flex-direction: column;
+             justify-content: flex-end;
+             padding: 20px;
+             padding-bottom: 100px;
+             overflow-y: auto;
+         }
+
+         .card-type {
+             display: inline-flex;
+             align-items: center;
+             gap: 6px;
+             background: rgba(255,255,255,0.15);
+             padding: 4px 10px;
+             border-radius: 20px;
+             font-size: 10px;
+             text-transform: uppercase;
+             letter-spacing: 0.5px;
+             margin-bottom: 12px;
+             width: fit-content;
+         }
+
+         .card-type.wiki {
+             background: rgba(77, 163, 255, 0.3);
+         }
+
+         .card-meta {
+             display: flex;
+             flex-wrap: wrap;
+             gap: 8px;
+             margin-bottom: 12px;
+         }
+
+         .card-category {
+             background: rgba(255,255,255,0.2);
+             padding: 4px 12px;
+             border-radius: 20px;
+             font-size: 11px;
+             text-transform: uppercase;
+             letter-spacing: 0.5px;
+         }
+
+         .card-title {
+             font-size: 24px;
+             font-weight: 700;
+             margin-bottom: 10px;
+             line-height: 1.25;
+             text-shadow: 0 2px 4px rgba(0,0,0,0.3);
+         }
+
+         .card-subtitle {
+             font-size: 14px;
+             opacity: 0.85;
+             margin-bottom: 12px;
+             line-height: 1.5;
+         }
+
+         /* Key points */
+         .key-points {
+             background: rgba(255,255,255,0.08);
+             border-radius: 12px;
+             padding: 14px;
+             margin-bottom: 14px;
+         }
+
+         .key-points-title {
+             font-size: 11px;
+             text-transform: uppercase;
+             letter-spacing: 0.5px;
+             opacity: 0.6;
+             margin-bottom: 10px;
+         }
+
+         .key-point {
+             display: flex;
+             gap: 10px;
+             margin-bottom: 8px;
+             font-size: 13px;
+             line-height: 1.5;
+         }
+
+         .key-point:last-child {
+             margin-bottom: 0;
+         }
+
+         .key-point-bullet {
+             color: #4da3ff;
+             flex-shrink: 0;
+         }
+
+         .card-tags {
+             display: flex;
+             flex-wrap: wrap;
+             gap: 6px;
+             margin-bottom: 12px;
+         }
+
+         .card-tag {
+             background: rgba(255,255,255,0.12);
+             padding: 4px 10px;
+             border-radius: 4px;
+             font-size: 11px;
+             opacity: 0.9;
+         }
+
+         .card-date {
+             font-size: 12px;
+             opacity: 0.6;
+         }
+
+         /* Wikipedia card specific */
+         .wiki-card {
+             background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
+         }
+
+         .wiki-card .card-overlay {
+             background: linear-gradient(
+                 to bottom,
+                 rgba(15, 52, 96, 0.3) 0%,
+                 rgba(15, 52, 96, 0.6) 50%,
+                 rgba(10, 10, 30, 0.95) 100%
+             );
+         }
+
+         .wiki-excerpt {
+             font-size: 15px;
+             line-height: 1.7;
+             opacity: 0.95;
+             margin-bottom: 16px;
+             max-height: 40vh;
+             overflow-y: auto;
+         }
+
+         /* Wikipedia card - center content vertically */
+         .wiki-card .card-content {
+             justify-content: center;
+             padding-top: 80px;
+         }
+
+         /* HackerNews card */
+         .hn-card {
+             background: linear-gradient(135deg, #1a0a00 0%, #ff6600 100%);
+         }
+
+         .hn-card .card-overlay {
+             background: linear-gradient(
+                 to bottom,
+                 rgba(26, 10, 0, 0.4) 0%,
+                 rgba(26, 10, 0, 0.6) 50%,
+                 rgba(10, 5, 0, 0.95) 100%
+             );
+         }
+
+         .hn-card .card-content {
+             justify-content: center;
+             padding-top: 80px;
+         }
+
+         .card-type.hn {
+             background: rgba(255, 102, 0, 0.4);
+         }
+
+         .hn-meta {
+             display: flex;
+             gap: 16px;
+             margin-bottom: 16px;
+             font-size: 14px;
+             opacity: 0.9;
+         }
+
+         .hn-meta-item {
+             display: flex;
+             align-items: center;
+             gap: 6px;
+         }
+
+         .hn-description {
+             font-size: 15px;
+             line-height: 1.7;
+             opacity: 0.9;
+             margin-bottom: 16px;
+             max-height: 35vh;
+             overflow-y: auto;
+ overflow-y: auto;
238
+ }
239
+
240
+ .hn-link {
241
+ display: inline-flex;
242
+ align-items: center;
243
+ gap: 8px;
244
+ background: rgba(255, 102, 0, 0.3);
245
+ padding: 10px 16px;
246
+ border-radius: 8px;
247
+ font-size: 13px;
248
+ margin-top: 12px;
249
+ opacity: 0.9;
250
+ }
251
+
252
+ .wiki-relevance {
253
+ font-size: 13px;
254
+ line-height: 1.5;
255
+ opacity: 0.7;
256
+ margin-bottom: 12px;
257
+ padding-left: 12px;
258
+ border-left: 2px solid rgba(77, 163, 255, 0.5);
259
+ }
260
+
261
+ .wiki-source {
262
+ display: inline-flex;
263
+ align-items: center;
264
+ gap: 6px;
265
+ font-size: 12px;
266
+ opacity: 0.6;
267
+ }
268
+
269
+ /* Linked content card (tertiary crawled links) */
270
+ .linked-card {
271
+ background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
272
+ }
273
+
274
+ .linked-card .card-overlay {
275
+ background: linear-gradient(
276
+ to bottom,
277
+ rgba(26, 26, 26, 0.3) 0%,
278
+ rgba(26, 26, 26, 0.6) 50%,
279
+ rgba(10, 20, 15, 0.95) 100%
280
+ );
281
+ }
282
+
283
+ .linked-card .card-content {
284
+ justify-content: center;
285
+ padding-top: 80px;
286
+ }
287
+
288
+ .card-type.linked {
289
+ background: rgba(46, 204, 113, 0.3);
290
+ }
291
+
292
+ .linked-description {
293
+ font-size: 15px;
294
+ line-height: 1.7;
295
+ opacity: 0.9;
296
+ margin-bottom: 16px;
297
+ max-height: 35vh;
298
+ overflow-y: auto;
299
+ }
300
+
301
+ .rec-item.linked-rec {
302
+ border-left: 3px solid rgba(46, 204, 113, 0.6);
303
+ padding-left: 12px;
304
+ }
305
+
306
+ .rec-thumb.linked-thumb {
307
+ background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
308
+ display: flex;
309
+ align-items: center;
310
+ justify-content: center;
311
+ }
312
+
313
+ /* Action buttons */
314
+ .card-actions {
315
+ position: fixed;
316
+ right: 12px;
317
+ bottom: 120px;
318
+ display: flex;
319
+ flex-direction: column;
320
+ gap: 14px;
321
+ z-index: 10;
322
+ }
323
+
324
+ .action-btn {
325
+ width: 46px;
326
+ height: 46px;
327
+ border-radius: 50%;
328
+ background: rgba(255,255,255,0.15);
329
+ backdrop-filter: blur(10px);
330
+ border: none;
331
+ color: #fff;
332
+ font-size: 18px;
333
+ cursor: pointer;
334
+ display: flex;
335
+ align-items: center;
336
+ justify-content: center;
337
+ transition: transform 0.2s, background 0.2s;
338
+ }
339
+
340
+ .action-btn:hover {
341
+ transform: scale(1.1);
342
+ background: rgba(255,255,255,0.25);
343
+ }
344
+
345
+ .action-btn:active {
346
+ transform: scale(0.95);
347
+ }
348
+
349
+ .read-btn {
350
+ background: rgba(255,71,87,0.8);
351
+ }
352
+
353
+ .read-btn:hover {
354
+ background: rgba(255,71,87,1);
355
+ }
356
+
357
+ /* Panels */
358
+ .panel {
359
+ position: fixed;
360
+ bottom: 0;
361
+ left: 0;
362
+ right: 0;
363
+ background: rgba(20,20,20,0.98);
364
+ backdrop-filter: blur(20px);
365
+ border-radius: 20px 20px 0 0;
366
+ padding: 20px;
367
+ transform: translateY(100%);
368
+ transition: transform 0.3s ease;
369
+ z-index: 20;
370
+ max-height: 60vh;
371
+ overflow-y: auto;
372
+ }
373
+
374
+ .panel.open {
375
+ transform: translateY(0);
376
+ }
377
+
378
+ .panel-header {
379
+ display: flex;
380
+ justify-content: space-between;
381
+ align-items: center;
382
+ margin-bottom: 16px;
383
+ }
384
+
385
+ .panel h3 {
386
+ font-size: 16px;
387
+ font-weight: 600;
388
+ }
389
+
390
+ .panel-close {
391
+ background: none;
392
+ border: none;
393
+ color: #fff;
394
+ font-size: 24px;
395
+ cursor: pointer;
396
+ opacity: 0.6;
397
+ }
398
+
399
+ .panel-close:hover {
400
+ opacity: 1;
401
+ }
402
+
403
+ /* Recommendation items */
404
+ .rec-item {
405
+ display: flex;
406
+ gap: 14px;
407
+ padding: 12px 0;
408
+ border-bottom: 1px solid rgba(255,255,255,0.08);
409
+ cursor: pointer;
410
+ transition: background 0.2s;
411
+ }
412
+
413
+ .rec-item:hover {
414
+ background: rgba(255,255,255,0.05);
415
+ }
416
+
417
+ .rec-thumb {
418
+ width: 64px;
419
+ height: 64px;
420
+ border-radius: 8px;
421
+ background-size: cover;
422
+ background-position: center;
423
+ flex-shrink: 0;
424
+ background-color: rgba(255,255,255,0.1);
425
+ }
426
+
427
+ .rec-info {
428
+ flex: 1;
429
+ min-width: 0;
430
+ }
431
+
432
+ .rec-info h4 {
433
+ font-size: 14px;
434
+ font-weight: 600;
435
+ margin-bottom: 4px;
436
+ }
437
+
438
+ .rec-info span {
439
+ font-size: 12px;
440
+ opacity: 0.5;
441
+ }
442
+
443
+ .rec-section-title {
444
+ font-size: 12px;
445
+ text-transform: uppercase;
446
+ letter-spacing: 0.5px;
447
+ opacity: 0.5;
448
+ margin: 16px 0 8px 0;
449
+ }
450
+
451
+ .rec-section-title:first-child {
452
+ margin-top: 0;
453
+ }
454
+
455
+ .rec-item.wiki-rec {
456
+ border-left: 3px solid rgba(77, 163, 255, 0.6);
457
+ padding-left: 12px;
458
+ }
459
+
460
+ .rec-thumb.wiki-thumb {
461
+ background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
462
+ display: flex;
463
+ align-items: center;
464
+ justify-content: center;
465
+ }
466
+
467
+ .rec-thumb.wiki-thumb svg {
468
+ opacity: 0.6;
469
+ }
470
+
471
+ .rec-item.hn-rec {
472
+ border-left: 3px solid rgba(255, 102, 0, 0.6);
473
+ padding-left: 12px;
474
+ }
475
+
476
+ .rec-thumb.hn-thumb {
477
+ background: linear-gradient(135deg, #1a0a00 0%, #3d1a00 100%);
478
+ display: flex;
479
+ align-items: center;
480
+ justify-content: center;
481
+ }
482
+
483
+ /* Swipe hint */
484
+ .swipe-hint {
485
+ position: fixed;
486
+ bottom: 20px;
487
+ left: 50%;
488
+ transform: translateX(-50%);
489
+ font-size: 12px;
490
+ opacity: 0.4;
491
+ display: flex;
492
+ flex-direction: column;
493
+ align-items: center;
494
+ gap: 4px;
495
+ animation: fadeOut 3s forwards;
496
+ animation-delay: 2s;
497
+ }
498
+
499
+ .swipe-hint-arrow {
500
+ animation: bounce 1.5s infinite;
501
+ }
502
+
503
+ @keyframes bounce {
504
+ 0%, 100% { transform: translateY(0); }
505
+ 50% { transform: translateY(-6px); }
506
+ }
507
+
508
+ @keyframes fadeOut {
509
+ to { opacity: 0; pointer-events: none; }
510
+ }
511
+
512
+ .empty-state {
513
+ text-align: center;
514
+ padding: 40px 20px;
515
+ opacity: 0.6;
516
+ }
517
+
518
+ .panel-overlay {
519
+ position: fixed;
520
+ inset: 0;
521
+ background: rgba(0,0,0,0.5);
522
+ z-index: 15;
523
+ opacity: 0;
524
+ pointer-events: none;
525
+ transition: opacity 0.3s;
526
+ }
527
+
528
+ .panel-overlay.visible {
529
+ opacity: 1;
530
+ pointer-events: auto;
531
+ }
532
+ </style>
533
+ </head>
534
+ <body>
535
+ <div class="swipe-container" id="container"></div>
536
+
537
+ <div class="card-actions" id="actions">
538
+ <button class="action-btn" id="recs-btn" title="Similar Posts">
539
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
540
+ <circle cx="12" cy="12" r="3"/>
541
+ <path d="M12 2v4m0 12v4M2 12h4m12 0h4"/>
542
+ </svg>
543
+ </button>
544
+ <button class="action-btn read-btn" id="read-btn" title="Read Full">
545
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
546
+ <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
547
+ <polyline points="15,3 21,3 21,9"/>
548
+ <line x1="10" y1="14" x2="21" y2="3"/>
549
+ </svg>
550
+ </button>
551
+ </div>
552
+
553
+ <div class="panel-overlay" id="panel-overlay"></div>
554
+
555
+ <div class="panel" id="recs-panel">
556
+ <div class="panel-header">
557
+ <h3>Related Content</h3>
558
+ <button class="panel-close" id="recs-close">&times;</button>
559
+ </div>
560
+ <div id="recs-content"></div>
561
+ </div>
562
+
563
+ <div class="swipe-hint" id="swipe-hint">
564
+ <span class="swipe-hint-arrow">&#8593;</span>
565
+ <span>Swipe up for more</span>
566
+ </div>
567
+
568
+ <script>
569
+ // Embedded data
570
+ const GRAPH = {{ graph_json|safe }};
571
+ const POST_SLUGS = {{ post_slugs_json|safe }};
572
+ const BASE_URL = "{{ base_url }}";
573
+
574
+ // State
575
+ let currentIndex = 0;
576
+ let feedItems = []; // Mixed: {type: 'post', slug} or {type: 'wiki'|'external', data}
577
+
578
+ // Shuffle array (Fisher-Yates)
579
+ function shuffle(arr) {
580
+ const result = [...arr];
581
+ for (let i = result.length - 1; i > 0; i--) {
582
+ const j = Math.floor(Math.random() * (i + 1));
583
+ [result[i], result[j]] = [result[j], result[i]];
584
+ }
585
+ return result;
586
+ }
587
+
588
+ // Format date
589
+ function formatDate(isoDate) {
590
+ const date = new Date(isoDate);
591
+ return date.toLocaleDateString('en-US', {
592
+ year: 'numeric',
593
+ month: 'long',
594
+ day: 'numeric'
595
+ });
596
+ }
597
+
598
+ // Build feed with interleaved external content (Wikipedia, HN, etc.)
599
+ function buildFeed() {
600
+ const shuffledSlugs = shuffle(POST_SLUGS);
601
+ feedItems = [];
602
+
603
+ // Collect all Wikipedia suggestions
604
+ const allWiki = [];
605
+ // Collect all external content (HN, etc.)
606
+ const allExternal = [];
607
+
608
+ for (const slug of shuffledSlugs) {
609
+ const post = GRAPH.posts[slug];
610
+ if (post.wikipedia) {
611
+ for (const w of post.wikipedia) {
612
+ allWiki.push({ ...w, sourcePost: post.title });
613
+ }
614
+ }
615
+ if (post.externalContent) {
616
+ for (const e of post.externalContent) {
617
+ allExternal.push({ ...e, sourcePost: post.title });
618
+ }
619
+ }
620
+ }
621
+
622
+ const shuffledWiki = shuffle(allWiki);
623
+ const shuffledExternal = shuffle(allExternal);
624
+
625
+ // Interleave content: wiki every 3 posts, external every 4 posts
626
+ let wikiIndex = 0;
627
+ let extIndex = 0;
628
+
629
+ for (let i = 0; i < shuffledSlugs.length; i++) {
630
+ feedItems.push({ type: 'post', slug: shuffledSlugs[i] });
631
+
632
+ // Insert Wikipedia after every 3 posts
633
+ if ((i + 1) % 3 === 0 && wikiIndex < shuffledWiki.length) {
634
+ feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
635
+ wikiIndex++;
636
+ }
637
+
638
+ // Insert external content (HN) after every 4 posts
639
+ if ((i + 1) % 4 === 0 && extIndex < shuffledExternal.length) {
640
+ feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
641
+ extIndex++;
642
+ }
643
+ }
644
+
645
+ // Add remaining content at the end
646
+ while (wikiIndex < shuffledWiki.length) {
647
+ feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
648
+ wikiIndex++;
649
+ }
650
+ while (extIndex < shuffledExternal.length) {
651
+ feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
652
+ extIndex++;
653
+ }
654
+ }
655
+
656
+ // Render a blog post card
657
+ function renderPostCard(slug, idx) {
658
+ const post = GRAPH.posts[slug];
659
+ const bgImg = post.headerImg ? `${BASE_URL}/${post.headerImg}` : '';
660
+ const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%)';
661
+
662
+ const keyPointsHtml = post.keyPoints && post.keyPoints.length > 0 ? `
663
+ <div class="key-points">
664
+ <div class="key-points-title">Key Takeaways</div>
665
+ ${post.keyPoints.slice(0, 4).map(point => `
666
+ <div class="key-point">
667
+ <span class="key-point-bullet">•</span>
668
+ <span>${point}</span>
669
+ </div>
670
+ `).join('')}
671
+ </div>
672
+ ` : '';
673
+
674
+ return `
675
+ <div class="card" data-type="post" data-slug="${slug}" data-index="${idx}" style="${bgStyle}">
676
+ <div class="card-overlay"></div>
677
+ <div class="card-content">
678
+ <div class="card-type">📝 Blog Post</div>
679
+ <div class="card-meta">
680
+ ${post.categories.map(c =>
681
+ `<span class="card-category">${c}</span>`
682
+ ).join('')}
683
+ <span class="card-category">${post.readingTime} min</span>
684
+ </div>
685
+ <h1 class="card-title">${post.title}</h1>
686
+ ${post.subtitle ? `<p class="card-subtitle">${post.subtitle}</p>` : ''}
687
+ ${keyPointsHtml}
688
+ <div class="card-tags">
689
+ ${post.tags.slice(0, 4).map(t =>
690
+ `<span class="card-tag">#${t}</span>`
691
+ ).join('')}
692
+ </div>
693
+ <div class="card-date">${formatDate(post.date)}</div>
694
+ </div>
695
+ </div>
696
+ `;
697
+ }
698
+
699
+ // Render a Wikipedia card
700
+ function renderWikiCard(wiki, idx) {
701
+ // Prefer extract from Wikipedia API, fall back to LLM-generated relevance
702
+ const excerpt = wiki.extract || wiki.relevance || '';
703
+ return `
704
+ <div class="card wiki-card" data-type="wiki" data-url="${wiki.url}" data-index="${idx}">
705
+ <div class="card-overlay"></div>
706
+ <div class="card-content">
707
+ <div class="card-type wiki">
708
+ <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
709
+ <circle cx="12" cy="12" r="10"/>
710
+ <path d="M12 16v-4M12 8h.01"/>
711
+ </svg>
712
+ Wikipedia
713
+ </div>
714
+ <h1 class="card-title">${wiki.title}</h1>
715
+ <p class="wiki-excerpt">${excerpt}</p>
716
+ ${wiki.relevance && wiki.extract ? `<p class="wiki-relevance"><em>${wiki.relevance}</em></p>` : ''}
717
+ <div class="wiki-source">
718
+ Related to: ${wiki.sourcePost}
719
+ </div>
720
+ </div>
721
+ </div>
722
+ `;
723
+ }
724
+
725
+ // Render an external content card (HackerNews, linked, etc.)
726
+ function renderExternalCard(ext, idx) {
727
+ const isHN = ext.source === 'hackernews' || ext.source === 'hn-frontpage';
728
+ const isLinked = ext.source === 'linked';
729
+
730
+ let cardClass, typeClass, typeName, descClass;
731
+
732
+ if (isHN) {
733
+ cardClass = 'hn-card';
734
+ typeClass = 'hn';
735
+ typeName = ext.source === 'hn-frontpage' ? 'HN Front Page' : 'Hacker News';
736
+ descClass = 'hn-description';
737
+ } else if (isLinked) {
738
+ cardClass = 'linked-card';
739
+ typeClass = 'linked';
740
+ typeName = 'Referenced Link';
741
+ descClass = 'linked-description';
742
+ } else {
743
+ cardClass = 'external-card';
744
+ typeClass = '';
745
+ typeName = ext.source;
746
+ descClass = 'hn-description';
747
+ }
748
+
749
+ const points = ext.metadata?.points || 0;
750
+ const comments = ext.metadata?.num_comments || 0;
751
+ const storyUrl = ext.metadata?.story_url || '';
752
+
753
+ // Icon based on type
754
+ const icon = isHN
755
+ ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>`
756
+ : isLinked
757
+ ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/><path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/></svg>`
758
+ : '';
759
+
760
+ return `
761
+ <div class="card ${cardClass}" data-type="external" data-url="${ext.url}" data-index="${idx}">
762
+ <div class="card-overlay"></div>
763
+ <div class="card-content">
764
+ <div class="card-type ${typeClass}">
765
+ ${icon}
766
+ ${typeName}
767
+ </div>
768
+ <h1 class="card-title">${ext.title}</h1>
769
+ ${isHN ? `
770
+ <div class="hn-meta">
771
+ <div class="hn-meta-item">
772
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
773
+ <path d="M12 2l3.09 6.26L22 9.27l-5 4.87 1.18 6.88L12 17.77l-6.18 3.25L7 14.14 2 9.27l6.91-1.01L12 2z"/>
774
+ </svg>
775
+ ${points} points
776
+ </div>
777
+ <div class="hn-meta-item">
778
+ <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
779
+ <path d="M21 15a2 2 0 01-2 2H7l-4 4V5a2 2 0 012-2h14a2 2 0 012 2z"/>
780
+ </svg>
781
+ ${comments} comments
782
+ </div>
783
+ </div>
784
+ ` : ''}
785
+ ${ext.description ? `<p class="${descClass}">${ext.description}</p>` : ''}
786
+ ${storyUrl ? `
787
+ <div class="hn-link">
788
+ <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
789
+ <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
790
+ <polyline points="15,3 21,3 21,9"/>
791
+ <line x1="10" y1="14" x2="21" y2="3"/>
792
+ </svg>
793
+ View original article
794
+ </div>
795
+ ` : ''}
796
+ <div class="wiki-source">
797
+ ${ext.relevance || `Related to: ${ext.sourcePost}`}
798
+ </div>
799
+ </div>
800
+ </div>
801
+ `;
802
+ }
803
+
804
+ // Render all slides
805
+ function renderSlides() {
806
+ buildFeed();
807
+ const container = document.getElementById('container');
808
+
809
+ container.innerHTML = feedItems.map((item, idx) => {
810
+ if (item.type === 'post') {
811
+ return renderPostCard(item.slug, idx);
812
+ } else if (item.type === 'wiki') {
813
+ return renderWikiCard(item.data, idx);
814
+ } else if (item.type === 'external') {
815
+ return renderExternalCard(item.data, idx);
816
+ }
817
+ }).join('');
818
+ }
819
+
820
+ // Track current slide on scroll
821
+ function onScroll() {
822
+ const container = document.getElementById('container');
823
+ const slideHeight = window.innerHeight;
824
+ currentIndex = Math.round(container.scrollTop / slideHeight);
825
+
826
+ // Update action buttons visibility based on card type
827
+ const currentItem = feedItems[currentIndex];
828
+ const recsBtn = document.getElementById('recs-btn');
829
+ if (currentItem && (currentItem.type === 'wiki' || currentItem.type === 'external')) {
830
+ recsBtn.style.display = 'none';
831
+ } else {
832
+ recsBtn.style.display = 'flex';
833
+ }
834
+ }
835
+
836
+ // Open full content
837
+ function openFullContent() {
838
+ const item = feedItems[currentIndex];
839
+ if (item.type === 'post') {
840
+ const post = GRAPH.posts[item.slug];
841
+ window.open(post.url, '_blank');
842
+ } else if (item.type === 'wiki') {
843
+ window.open(item.data.url, '_blank');
844
+ } else if (item.type === 'external') {
845
+ window.open(item.data.url, '_blank');
846
+ }
847
+ }
848
+
849
+ // Close all panels
850
+ function closePanels() {
851
+ document.getElementById('recs-panel').classList.remove('open');
852
+ document.getElementById('panel-overlay').classList.remove('visible');
853
+ }
854
+
855
+ // Show recommendations panel
856
+ function showRecsPanel() {
857
+ const item = feedItems[currentIndex];
858
+ if (item.type !== 'post') return;
859
+
860
+ const panel = document.getElementById('recs-panel');
861
+ const content = document.getElementById('recs-content');
862
+ const post = GRAPH.posts[item.slug];
863
+ const recs = GRAPH.recommendations[item.slug] || [];
864
+ const wikiSuggestions = post.wikipedia || [];
865
+ const externalContent = post.externalContent || [];
866
+
867
+ let html = '';
868
+
869
+ // Similar posts section
870
+ if (recs.length > 0) {
871
+ html += '<div class="rec-section-title">Similar Posts</div>';
872
+ html += recs.map(([recSlug, score]) => {
873
+ const recPost = GRAPH.posts[recSlug];
874
+ const bgImg = recPost.headerImg ? `${BASE_URL}/${recPost.headerImg}` : '';
875
+ const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #2d3436 0%, #636e72 100%)';
876
+
877
+ return `
878
+ <div class="rec-item" data-slug="${recSlug}">
879
+ <div class="rec-thumb" style="${bgStyle}"></div>
880
+ <div class="rec-info">
881
+ <h4>${recPost.title}</h4>
882
+ <span>${Math.round(score * 100)}% match</span>
883
+ </div>
884
+ </div>
885
+ `;
886
+ }).join('');
887
+ }
888
+
889
+ // Wikipedia section
890
+ if (wikiSuggestions.length > 0) {
891
+ html += '<div class="rec-section-title">Wikipedia Articles</div>';
892
+ html += wikiSuggestions.map(wiki => {
893
+ const excerpt = wiki.extract || wiki.relevance || '';
894
+ const truncatedExcerpt = excerpt.length > 80 ? excerpt.slice(0, 80) + '...' : excerpt;
895
+ return `
896
+ <div class="rec-item wiki-rec" data-url="${wiki.url}">
897
+ <div class="rec-thumb wiki-thumb">
898
+ <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
899
+ <circle cx="12" cy="12" r="10"/>
900
+ <path d="M12 16v-4M12 8h.01"/>
901
+ </svg>
902
+ </div>
903
+ <div class="rec-info">
904
+ <h4>${wiki.title}</h4>
905
+ <span>${truncatedExcerpt}</span>
906
+ </div>
907
+ </div>
908
+ `;
909
+ }).join('');
910
+ }
911
+
912
+ // External content section (HN, etc.)
913
+ const hnContent = externalContent.filter(e => e.source === 'hackernews' || e.source === 'hn-frontpage');
914
+ if (hnContent.length > 0) {
915
+ html += '<div class="rec-section-title">Hacker News</div>';
916
+ html += hnContent.map(hn => {
917
+ const points = hn.metadata?.points || 0;
918
+ const comments = hn.metadata?.num_comments || 0;
919
+ return `
920
+ <div class="rec-item hn-rec" data-url="${hn.url}">
921
+ <div class="rec-thumb hn-thumb">
922
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="#ff6600"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>
923
+ </div>
924
+ <div class="rec-info">
925
+ <h4>${hn.title}</h4>
926
+ <span>${points} pts · ${comments} comments</span>
927
+ </div>
928
+ </div>
929
+ `;
930
+ }).join('');
931
+ }
932
+
933
+ // Linked content section (tertiary crawled links)
934
+ const linkedContent = externalContent.filter(e => e.source === 'linked');
935
+ if (linkedContent.length > 0) {
936
+ html += '<div class="rec-section-title">Referenced Links</div>';
937
+ html += linkedContent.map(link => {
938
+ const truncatedDesc = link.description && link.description.length > 80
939
+ ? link.description.slice(0, 80) + '...'
940
+ : (link.description || '');
941
+ return `
942
+ <div class="rec-item linked-rec" data-url="${link.url}">
943
+ <div class="rec-thumb linked-thumb">
944
+ <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="#2ecc71" stroke-width="2">
945
+ <path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/>
946
+ <path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/>
947
+ </svg>
948
+ </div>
949
+ <div class="rec-info">
950
+ <h4>${link.title}</h4>
951
+ <span>${truncatedDesc}</span>
952
+ </div>
953
+ </div>
954
+ `;
955
+ }).join('');
956
+ }
957
+
958
+ if (html) {
959
+ content.innerHTML = html;
960
+ } else {
961
+ content.innerHTML = '<div class="empty-state">No related content found.</div>';
962
+ }
963
+
964
+ document.getElementById('panel-overlay').classList.add('visible');
965
+ panel.classList.add('open');
966
+ }
967
+
968
+ // Jump to a specific post
969
+ function jumpToPost(slug) {
970
+ const idx = feedItems.findIndex(item => item.type === 'post' && item.slug === slug);
971
+ if (idx >= 0) {
972
+ const container = document.getElementById('container');
973
+ container.scrollTo({
974
+ top: idx * window.innerHeight,
975
+ behavior: 'smooth'
976
+ });
977
+ closePanels();
978
+ }
979
+ }
980
+
981
+ // Event listeners
982
+ document.getElementById('container').addEventListener('scroll', onScroll);
983
+ document.getElementById('read-btn').addEventListener('click', openFullContent);
984
+ document.getElementById('recs-btn').addEventListener('click', showRecsPanel);
985
+ document.getElementById('recs-close').addEventListener('click', closePanels);
986
+ document.getElementById('panel-overlay').addEventListener('click', closePanels);
987
+
988
+ document.getElementById('recs-content').addEventListener('click', (e) => {
989
+ const item = e.target.closest('.rec-item');
990
+ if (item) {
991
+ if (item.dataset.url) {
992
+ // External item (Wikipedia/HN/linked) - open in new tab
993
+ window.open(item.dataset.url, '_blank');
994
+ closePanels();
995
+ } else if (item.dataset.slug) {
996
+ // Blog post - jump to it
997
+ jumpToPost(item.dataset.slug);
998
+ }
999
+ }
1000
+ });
1001
+
1002
+ // Keyboard navigation
1003
+ document.addEventListener('keydown', (e) => {
1004
+ const container = document.getElementById('container');
1005
+ const slideHeight = window.innerHeight;
1006
+
1007
+ if (e.key === 'ArrowDown' || e.key === 'j') {
1008
+ container.scrollBy({ top: slideHeight, behavior: 'smooth' });
1009
+ } else if (e.key === 'ArrowUp' || e.key === 'k') {
1010
+ container.scrollBy({ top: -slideHeight, behavior: 'smooth' });
1011
+ } else if (e.key === 'Enter' || e.key === 'o') {
1012
+ openFullContent();
1013
+ } else if (e.key === 'Escape') {
1014
+ closePanels();
1015
+ }
1016
+ });
1017
+
1018
+ // Hide swipe hint after first scroll
1019
+ document.getElementById('container').addEventListener('scroll', () => {
1020
+ document.getElementById('swipe-hint').style.display = 'none';
1021
+ }, { once: true });
1022
+
1023
+ // Initialize
1024
+ renderSlides();
1025
+ onScroll(); // Update button visibility for initial state
1026
+ </script>
1027
+ </body>
1028
+ </html>
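
The template expects three Jinja2 context variables: graph_json, post_slugs_json, and base_url. The generator code that renders it is not shown in this excerpt; the sketch below is a hypothetical driver, assuming the graph object implements to_json_for_embed() as defined in src/tiktokify/models/post.py further down.

import json
from pathlib import Path

from jinja2 import Environment, FileSystemLoader


def render_swipe_page(graph, base_url: str, output_html: Path) -> None:
    # Load swipe.html.jinja2 from this package's templates directory.
    env = Environment(loader=FileSystemLoader(Path(__file__).parent / "templates"))
    template = env.get_template("swipe.html.jinja2")

    # to_json_for_embed() yields the {posts, recommendations} payload the
    # front-end reads as GRAPH; POST_SLUGS is simply the list of slugs.
    payload = graph.to_json_for_embed()
    html = template.render(
        graph_json=json.dumps(payload),
        post_slugs_json=json.dumps(list(payload["posts"].keys())),
        base_url=base_url.rstrip("/"),  # template appends its own slashes
    )
    output_html.write_text(html, encoding="utf-8")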
src/tiktokify/models/__init__.py ADDED
@@ -0,0 +1,17 @@
1
+ """Data models for tiktokify."""
2
+
3
+ from .post import (
4
+ ExternalContentItem,
5
+ Post,
6
+ PostMetadata,
7
+ RecommendationGraph,
8
+ WikipediaSuggestion,
9
+ )
10
+
11
+ __all__ = [
12
+ "ExternalContentItem",
13
+ "Post",
14
+ "PostMetadata",
15
+ "RecommendationGraph",
16
+ "WikipediaSuggestion",
17
+ ]
src/tiktokify/models/post.py ADDED
@@ -0,0 +1,116 @@
1
+ """Pydantic models for blog posts and recommendation graph."""
2
+
3
+ from datetime import datetime
4
+ from typing import Optional
5
+
6
+ from pydantic import BaseModel, Field, HttpUrl
7
+
8
+
9
+ class ExternalContentItem(BaseModel):
10
+ """Generic external content from any source."""
11
+
12
+ source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
13
+ title: str
14
+ url: HttpUrl
15
+ description: str = Field(default="", description="Brief description or excerpt")
16
+ relevance: str = Field(default="", description="Why this is relevant to the post")
17
+ metadata: dict = Field(default_factory=dict, description="Source-specific metadata")
18
+
19
+
20
+ class WikipediaSuggestion(BaseModel):
21
+ """A Wikipedia article suggestion for a blog post."""
22
+
23
+ title: str
24
+ url: HttpUrl
25
+ relevance: str = Field(description="Brief explanation of why this is relevant")
26
+ extract: str = Field(default="", description="Article summary from Wikipedia API")
27
+
28
+
29
+ class PostMetadata(BaseModel):
30
+ """Metadata extracted from Jekyll post front matter."""
31
+
32
+ title: str
33
+ date: datetime
34
+ categories: list[str] = Field(default_factory=list)
35
+ tags: list[str] = Field(default_factory=list)
36
+ subtitle: Optional[str] = None
37
+ header_img: Optional[str] = None
38
+ last_edited_on: Optional[datetime] = None
39
+
40
+
41
+ class Post(BaseModel):
42
+ """Complete representation of a blog post."""
43
+
44
+ url: str
45
+ slug: str
46
+ metadata: PostMetadata
47
+ content_text: str = Field(description="Plain text content for TF-IDF")
48
+ content_html: str = Field(default="", description="Full HTML content")
49
+ reading_time_minutes: int = Field(default=1)
50
+
51
+ # Populated during enrichment phase
52
+ key_points: list[str] = Field(
53
+ default_factory=list, description="LLM-generated key points/summary"
54
+ )
55
+ similar_posts: list[str] = Field(
56
+ default_factory=list, description="List of similar post slugs"
57
+ )
58
+ similarity_scores: dict[str, float] = Field(
59
+ default_factory=dict, description="slug -> similarity score"
60
+ )
61
+ wikipedia_suggestions: list[WikipediaSuggestion] = Field(default_factory=list)
62
+ external_content: list[ExternalContentItem] = Field(
63
+ default_factory=list, description="Content from external sources (HN, Reddit, etc.)"
64
+ )
65
+
66
+
67
+ class RecommendationGraph(BaseModel):
68
+ """Graph of posts with recommendation adjacency list."""
69
+
70
+ posts: dict[str, Post] = Field(description="slug -> Post mapping")
71
+ adjacency: dict[str, list[tuple[str, float]]] = Field(
72
+ default_factory=dict, description="slug -> [(similar_slug, score), ...]"
73
+ )
74
+
75
+ def to_json_for_embed(self) -> dict:
76
+ """Serialize for embedding in HTML (minimal, frontend-friendly format)."""
77
+ return {
78
+ "posts": {
79
+ slug: {
80
+ "title": p.metadata.title,
81
+ "subtitle": p.metadata.subtitle,
82
+ "date": p.metadata.date.isoformat(),
83
+ "categories": p.metadata.categories,
84
+ "tags": p.metadata.tags,
85
+ "url": p.url,
86
+ "headerImg": p.metadata.header_img,
87
+ "readingTime": p.reading_time_minutes,
88
+ "keyPoints": p.key_points,
89
+ "wikipedia": [
90
+ {
91
+ "title": w.title,
92
+ "url": str(w.url),
93
+ "relevance": w.relevance,
94
+ "extract": w.extract,
95
+ }
96
+ for w in p.wikipedia_suggestions
97
+ ],
98
+ "externalContent": [
99
+ {
100
+ "source": e.source,
101
+ "title": e.title,
102
+ "url": str(e.url),
103
+ "description": e.description,
104
+ "relevance": e.relevance,
105
+ "metadata": e.metadata,
106
+ }
107
+ for e in p.external_content
108
+ ],
109
+ }
110
+ for slug, p in self.posts.items()
111
+ },
112
+ "recommendations": {
113
+ slug: [(s, round(score, 3)) for s, score in recs]
114
+ for slug, recs in self.adjacency.items()
115
+ },
116
+ }
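
For reference, a minimal sketch of the models in use; the slug, URL, and field values are illustrative. Only url, slug, metadata, and content_text are required; everything the enrichment phase populates defaults to empty.

from datetime import datetime

from tiktokify.models import Post, PostMetadata, RecommendationGraph

post = Post(
    url="https://example.github.io/2024/01/01/hello/",  # illustrative URL
    slug="hello",
    metadata=PostMetadata(title="Hello", date=datetime(2024, 1, 1), tags=["intro"]),
    content_text="Plain-text body used later for TF-IDF similarity.",
)
graph = RecommendationGraph(posts={"hello": post}, adjacency={"hello": []})
embed = graph.to_json_for_embed()

# Enrichment has not run, so LLM-populated fields serialize as empty defaults.
assert embed["posts"]["hello"]["keyPoints"] == []
assert embed["recommendations"]["hello"] == []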
src/tiktokify/recommender/__init__.py ADDED
@@ -0,0 +1,5 @@
1
+ """Recommendation engine module."""
2
+
3
+ from .engine import RecommendationEngine
4
+
5
+ __all__ = ["RecommendationEngine"]
src/tiktokify/recommender/engine.py ADDED
@@ -0,0 +1,59 @@
1
+ """Combined recommendation engine."""
2
+
3
+ from tiktokify.models import Post, RecommendationGraph
4
+
5
+ from .metadata import MetadataSimilarity
6
+ from .tfidf import TFIDFSimilarity
7
+
8
+
9
+ class RecommendationEngine:
10
+ """Hybrid recommendation combining content and metadata similarity."""
11
+
12
+ def __init__(
13
+ self,
14
+ content_weight: float = 0.6,
15
+ metadata_weight: float = 0.4,
16
+ top_k: int = 5,
17
+ ):
18
+ self.content_weight = content_weight
19
+ self.metadata_weight = metadata_weight
20
+ self.top_k = top_k
21
+
22
+ self.tfidf = TFIDFSimilarity()
23
+ self.metadata = MetadataSimilarity()
24
+
25
+ def build_graph(self, posts: list[Post]) -> RecommendationGraph:
26
+ """Build complete recommendation graph."""
27
+ # Fit both models
28
+ self.tfidf.fit(posts)
29
+ self.metadata.fit(posts)
30
+
31
+ posts_dict = {p.slug: p for p in posts}
32
+ adjacency: dict[str, list[tuple[str, float]]] = {}
33
+
34
+ for post in posts:
35
+ # Get similarities from both sources
36
+ content_sims = dict(self.tfidf.get_similar(post.slug, self.top_k * 2))
37
+ metadata_sims = dict(self.metadata.get_similar(post.slug, self.top_k * 2))
38
+
39
+ # Combine scores
40
+ all_slugs = set(content_sims.keys()) | set(metadata_sims.keys())
41
+ combined: list[tuple[str, float]] = []
42
+
43
+ for slug in all_slugs:
44
+ c_score = content_sims.get(slug, 0)
45
+ m_score = metadata_sims.get(slug, 0)
46
+ combined_score = (
47
+ self.content_weight * c_score + self.metadata_weight * m_score
48
+ )
49
+ combined.append((slug, combined_score))
50
+
51
+ # Sort and take top_k
52
+ combined.sort(key=lambda x: x[1], reverse=True)
53
+ adjacency[post.slug] = combined[: self.top_k]
54
+
55
+ # Update post object with recommendations
56
+ post.similar_posts = [s for s, _ in combined[: self.top_k]]
57
+ post.similarity_scores = dict(combined[: self.top_k])
58
+
59
+ return RecommendationGraph(posts=posts_dict, adjacency=adjacency)
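
A usage sketch for context; the posts list comes from the crawl step, which is outside this excerpt. Note the design choice of pulling top_k * 2 candidates from each signal before blending: it widens the candidate pool so a post that is strong on only one signal can still reach the final top_k.

from tiktokify.recommender import RecommendationEngine

engine = RecommendationEngine(content_weight=0.6, metadata_weight=0.4, top_k=5)
graph = engine.build_graph(posts)  # posts: list[Post] from the crawler

# adjacency maps each slug to its blended top-k as (slug, score) pairs.
for slug, score in graph.adjacency[posts[0].slug]:
    print(f"{slug}: {score:.3f}")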
src/tiktokify/recommender/metadata.py ADDED
@@ -0,0 +1,51 @@
1
+ """Tag and category based similarity."""
2
+
3
+ from tiktokify.models import Post
4
+
5
+
6
+ class MetadataSimilarity:
7
+ """Tag and category based Jaccard similarity."""
8
+
9
+ def __init__(
10
+ self,
11
+ tag_weight: float = 0.7,
12
+ category_weight: float = 0.3,
13
+ ):
14
+ self.tag_weight = tag_weight
15
+ self.category_weight = category_weight
16
+ self.posts: dict[str, Post] = {}
17
+
18
+ def fit(self, posts: list[Post]) -> None:
19
+ """Store posts for similarity computation."""
20
+ self.posts = {p.slug: p for p in posts}
21
+
22
+ def compute_similarity(self, slug1: str, slug2: str) -> float:
23
+ """Compute Jaccard-like similarity between two posts."""
24
+ p1, p2 = self.posts.get(slug1), self.posts.get(slug2)
25
+ if not p1 or not p2:
26
+ return 0.0
27
+
28
+ # Tag similarity (Jaccard index)
29
+ tags1, tags2 = set(p1.metadata.tags), set(p2.metadata.tags)
30
+ tag_union = tags1 | tags2
31
+ tag_sim = len(tags1 & tags2) / len(tag_union) if tag_union else 0
32
+
33
+ # Category similarity (exact match)
34
+ cats1, cats2 = set(p1.metadata.categories), set(p2.metadata.categories)
35
+ cat_union = cats1 | cats2
36
+ cat_sim = len(cats1 & cats2) / len(cat_union) if cat_union else 0
37
+
38
+ return self.tag_weight * tag_sim + self.category_weight * cat_sim
39
+
40
+ def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
41
+ """Get top-k similar posts based on metadata."""
42
+ if slug not in self.posts:
43
+ return []
44
+
45
+ scores = [
46
+ (other_slug, self.compute_similarity(slug, other_slug))
47
+ for other_slug in self.posts
48
+ if other_slug != slug
49
+ ]
50
+ scores.sort(key=lambda x: x[1], reverse=True)
51
+ return scores[:top_k]
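
A worked example with illustrative tag and category sets: two posts sharing 2 of 4 distinct tags and the same single category blend to 0.7 * 0.5 + 0.3 * 1.0 = 0.65.

tags1, tags2 = {"python", "ml", "nlp"}, {"ml", "nlp", "vision"}
cats1, cats2 = {"machine-learning"}, {"machine-learning"}

tag_sim = len(tags1 & tags2) / len(tags1 | tags2)  # 2 / 4 = 0.5
cat_sim = len(cats1 & cats2) / len(cats1 | cats2)  # 1 / 1 = 1.0
score = 0.7 * tag_sim + 0.3 * cat_sim              # 0.65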
src/tiktokify/recommender/tfidf.py ADDED
@@ -0,0 +1,51 @@
1
+ """TF-IDF based content similarity."""
2
+
3
+ import numpy as np
4
+ from sklearn.feature_extraction.text import TfidfVectorizer
5
+ from sklearn.metrics.pairwise import cosine_similarity
6
+
7
+ from tiktokify.models import Post
8
+
9
+
10
+ class TFIDFSimilarity:
11
+ """Content-based similarity using TF-IDF."""
12
+
13
+ def __init__(
14
+ self,
15
+ max_features: int = 5000,
16
+ ngram_range: tuple[int, int] = (1, 2),
17
+ ):
18
+ self.vectorizer = TfidfVectorizer(
19
+ max_features=max_features,
20
+ ngram_range=ngram_range,
21
+ stop_words="english",
22
+ min_df=1,
23
+ max_df=0.9,
24
+ )
25
+ self.tfidf_matrix = None  # scipy sparse matrix returned by fit_transform()
26
+ self.slugs: list[str] = []
27
+
28
+ def fit(self, posts: list[Post]) -> None:
29
+ """Fit TF-IDF on post content."""
30
+ self.slugs = [p.slug for p in posts]
31
+ texts = [p.content_text for p in posts]
32
+ self.tfidf_matrix = self.vectorizer.fit_transform(texts)
33
+
34
+ def get_similarity_matrix(self) -> np.ndarray:
35
+ """Return full cosine similarity matrix."""
36
+ if self.tfidf_matrix is None:
37
+ raise ValueError("Must call fit() first")
38
+ return cosine_similarity(self.tfidf_matrix)
39
+
40
+ def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
41
+ """Get top-k similar posts for a given slug."""
42
+ if slug not in self.slugs:
43
+ return []
44
+
45
+ idx = self.slugs.index(slug)
46
+ sim_matrix = self.get_similarity_matrix()
47
+ scores = sim_matrix[idx]
48
+
49
+ # Get top-k (excluding self)
50
+ top_indices = np.argsort(scores)[::-1][1 : top_k + 1]
51
+ return [(self.slugs[i], float(scores[i])) for i in top_indices]
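
One subtlety: get_similar() discards the first entry of the descending sort on the assumption that a post's nearest neighbor is itself (cosine similarity 1.0), which holds unless two posts have identical TF-IDF vectors. A standalone usage sketch, with posts again assumed to come from the crawl step:

tfidf = TFIDFSimilarity(max_features=5000, ngram_range=(1, 2))
tfidf.fit(posts)

# Top 3 content-similar posts for the first post, as (slug, score) pairs.
print(tfidf.get_similar(posts[0].slug, top_k=3))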
tests/__init__.py ADDED
@@ -0,0 +1 @@
1
+ """Tests for tiktokify."""