Commit 51da887 · Parent(s): d038c44
Add initial vibe-coded shits
- .gitignore +7 -0
- pyproject.toml +34 -0
- src/tiktokify/__init__.py +3 -0
- src/tiktokify/__main__.py +6 -0
- src/tiktokify/cli.py +303 -0
- src/tiktokify/crawler/__init__.py +8 -0
- src/tiktokify/crawler/blog_crawler.py +382 -0
- src/tiktokify/enrichment/__init__.py +20 -0
- src/tiktokify/enrichment/base.py +66 -0
- src/tiktokify/enrichment/llm_enricher.py +180 -0
- src/tiktokify/enrichment/providers/__init__.py +12 -0
- src/tiktokify/enrichment/providers/hackernews.py +242 -0
- src/tiktokify/enrichment/providers/links.py +210 -0
- src/tiktokify/enrichment/providers/wikipedia.py +78 -0
- src/tiktokify/generator/__init__.py +5 -0
- src/tiktokify/generator/html_generator.py +52 -0
- src/tiktokify/generator/templates/swipe.html.jinja2 +1028 -0
- src/tiktokify/models/__init__.py +17 -0
- src/tiktokify/models/post.py +116 -0
- src/tiktokify/recommender/__init__.py +5 -0
- src/tiktokify/recommender/engine.py +59 -0
- src/tiktokify/recommender/metadata.py +51 -0
- src/tiktokify/recommender/tfidf.py +51 -0
- tests/__init__.py +1 -0
.gitignore
CHANGED
@@ -6,6 +6,13 @@ __pycache__/
 # C extensions
 *.so
 
+output/
+html/
+tmp/
+out/
+temp/
+data/
+
 # Distribution / packaging
 .Python
 build/
pyproject.toml
ADDED
@@ -0,0 +1,34 @@
+[project]
+name = "tiktokify"
+version = "0.1.0"
+description = "TikTok-style swipeable blog viewer with recommendations"
+readme = "README.md"
+requires-python = ">=3.11"
+dependencies = [
+    "crawl4ai>=0.4.0",
+    "scikit-learn>=1.3.0",
+    "numpy>=1.24.0",
+    "jinja2>=3.1.0",
+    "httpx>=0.25.0",
+    "litellm>=1.0.0",
+    "click>=8.1.0",
+    "pydantic>=2.0.0",
+    "rich>=13.0.0",
+    "loguru>=0.7.0",
+]
+
+[project.scripts]
+tiktokify = "tiktokify.cli:main"
+
+[build-system]
+requires = ["hatchling"]
+build-backend = "hatchling.build"
+
+[tool.hatch.build.targets.wheel]
+packages = ["src/tiktokify"]
+
+[dependency-groups]
+dev = [
+    "pytest>=7.0.0",
+    "pytest-asyncio>=0.21.0",
+]
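With the [project.scripts] entry above, the CLI should be reachable as `uv run tiktokify --help` after `uv sync` (or as `python -m tiktokify` via the `__main__.py` added below); the uv commands here are the usual ones, stated as an assumption rather than taken from this commit.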
src/tiktokify/__init__.py
ADDED
@@ -0,0 +1,3 @@
+"""TikTokify - TikTok-style swipeable blog viewer with recommendations."""
+
+__version__ = "0.1.0"
src/tiktokify/__main__.py
ADDED
@@ -0,0 +1,6 @@
+"""Entry point for `python -m tiktokify` or `uv run tiktokify`."""
+
+from tiktokify.cli import main
+
+if __name__ == "__main__":
+    main()
src/tiktokify/cli.py
ADDED
@@ -0,0 +1,303 @@
+"""CLI interface for tiktokify."""
+
+import asyncio
+from pathlib import Path
+
+import click
+from rich.console import Console
+from rich.progress import Progress, SpinnerColumn, TextColumn
+
+console = Console()
+
+
+@click.command()
+@click.option(
+    "--base-url",
+    "-u",
+    required=True,
+    help="Base URL of the Jekyll blog (e.g., https://nish1001.github.io)",
+)
+@click.option(
+    "--output-html",
+    "-o",
+    required=True,
+    type=click.Path(),
+    help="Output path for generated HTML file",
+)
+@click.option(
+    "--model",
+    "-m",
+    default=None,
+    help="LLM model for enrichment (e.g., gpt-4o-mini, claude-3-haiku-20240307). Skip if not provided.",
+)
+@click.option(
+    "--n-key-points",
+    type=int,
+    default=5,
+    help="Number of key points to generate per post",
+)
+@click.option(
+    "--n-wiki",
+    type=int,
+    default=3,
+    help="Number of Wikipedia articles to suggest per post",
+)
+@click.option(
+    "--sources",
+    type=str,
+    default="",
+    help="Comma-separated external sources to fetch. Available: hackernews (hn), hn-frontpage (frontpage), links (linked)",
+)
+@click.option(
+    "--n-external",
+    type=int,
+    default=3,
+    help="Number of items to fetch per external source",
+)
+@click.option(
+    "--content-weight",
+    type=float,
+    default=0.6,
+    help="Weight for content-based similarity (0-1)",
+)
+@click.option(
+    "--metadata-weight",
+    type=float,
+    default=0.4,
+    help="Weight for tag/category similarity (0-1)",
+)
+@click.option(
+    "--top-k",
+    type=int,
+    default=5,
+    help="Number of recommendations per post",
+)
+@click.option(
+    "--max-concurrent",
+    type=int,
+    default=5,
+    help="Maximum concurrent requests",
+)
+@click.option(
+    "--max-depth",
+    type=int,
+    default=1,
+    help="Spider crawl depth (1=seed only, 2=seed+linked pages, etc.)",
+)
+@click.option(
+    "--verbose",
+    "-v",
+    is_flag=True,
+    help="Enable verbose output",
+)
+def main(
+    base_url: str,
+    output_html: str,
+    model: str | None,
+    n_key_points: int,
+    n_wiki: int,
+    sources: str,
+    n_external: int,
+    content_weight: float,
+    metadata_weight: float,
+    top_k: int,
+    max_concurrent: int,
+    max_depth: int,
+    verbose: bool,
+) -> None:
+    """
+    TikTokify - Generate a TikTok-style swipe interface for your Jekyll blog.
+
+    Example:
+
+        uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html
+
+    With LLM enrichment (key points + Wikipedia):
+
+        uv run tiktokify -u https://nish1001.github.io -o ./tiktokify/index.html -m gpt-4o-mini
+
+    With deeper spider crawling:
+
+        uv run tiktokify -u https://example.com -o output.html --max-depth 2
+    """
+    asyncio.run(
+        _main_async(
+            base_url=base_url,
+            output_html=Path(output_html),
+            model=model,
+            n_key_points=n_key_points,
+            n_wiki=n_wiki,
+            sources=[s.strip() for s in sources.split(",") if s.strip()],
+            n_external=n_external,
+            content_weight=content_weight,
+            metadata_weight=metadata_weight,
+            top_k=top_k,
+            max_concurrent=max_concurrent,
+            max_depth=max_depth,
+            verbose=verbose,
+        )
+    )
+
+
+async def _main_async(
+    base_url: str,
+    output_html: Path,
+    model: str | None,
+    n_key_points: int,
+    n_wiki: int,
+    sources: list[str],
+    n_external: int,
+    content_weight: float,
+    metadata_weight: float,
+    top_k: int,
+    max_concurrent: int,
+    max_depth: int,
+    verbose: bool,
+) -> None:
+    """Async main function."""
+    from tiktokify.crawler import SpiderCrawler
+    from tiktokify.enrichment import (
+        HackerNewsProvider,
+        HNFrontPageProvider,
+        LinkedContentProvider,
+        PostEnricher,
+    )
+    from tiktokify.generator import HTMLGenerator
+    from tiktokify.models import ExternalContentItem
+    from tiktokify.recommender import RecommendationEngine
+
+    # Map source names to provider classes
+    PROVIDERS = {
+        "hackernews": HackerNewsProvider,
+        "hn": HackerNewsProvider,  # alias
+        "hn-frontpage": HNFrontPageProvider,
+        "frontpage": HNFrontPageProvider,  # alias
+        "links": LinkedContentProvider,
+        "linked": LinkedContentProvider,  # alias
+    }
+
+    console.print(f"\n[bold blue]TikTokify[/bold blue] - Generating swipe UI for {base_url}\n")
+
+    with Progress(
+        SpinnerColumn(),
+        TextColumn("[progress.description]{task.description}"),
+        console=console,
+        transient=True,
+    ) as progress:
+        # Step 1: Spider crawl
+        depth_info = f" (depth={max_depth})" if max_depth > 1 else ""
+        task = progress.add_task(f"Spider crawling{depth_info}...", total=None)
+        crawler = SpiderCrawler(
+            base_url=base_url,
+            max_concurrent=max_concurrent,
+            max_depth=max_depth,
+            verbose=verbose,
+        )
+        posts = await crawler.crawl()
+        progress.remove_task(task)
+
+        if not posts:
+            console.print("[red]Error: No posts found![/red]")
+            return
+
+        console.print(f" [green]✓[/green] Found {len(posts)} posts")
+
+        # Step 2: Build recommendations
+        task = progress.add_task("Building recommendation graph...", total=None)
+        engine = RecommendationEngine(
+            content_weight=content_weight,
+            metadata_weight=metadata_weight,
+            top_k=top_k,
+        )
+        graph = engine.build_graph(posts)
+        progress.remove_task(task)
+        console.print(f" [green]✓[/green] Built recommendation graph")
+
+        # Step 3: LLM enrichment (optional)
+        if model:
+            task = progress.add_task(f"Enriching posts with LLM ({model})...", total=None)
+            enricher = PostEnricher(
+                model=model,
+                max_key_points=n_key_points,
+                max_wikipedia=n_wiki,
+                max_concurrent=max_concurrent,
+                verbose=verbose,
+            )
+            await enricher.enrich_posts(list(graph.posts.values()))
+            progress.remove_task(task)
+
+            enriched_count = sum(
+                1 for p in graph.posts.values() if p.key_points
+            )
+            console.print(f" [green]✓[/green] Enriched {enriched_count} posts with key points + Wikipedia")
+        else:
+            console.print(" [dim]⊘ Skipping LLM enrichment (no --model specified)[/dim]")
+
+        # Step 4: External sources (optional)
+        if sources:
+            valid_sources = [s for s in sources if s in PROVIDERS]
+            if valid_sources:
+                task = progress.add_task(f"Fetching from {', '.join(valid_sources)}...", total=None)
+
+                # Build list of (provider, post) pairs for parallel fetching
+                fetch_tasks = []
+                task_info = []  # Track (source_name, post) for each task
+
+                for source_name in valid_sources:
+                    provider_class = PROVIDERS[source_name]
+                    provider = provider_class(max_items=n_external, verbose=verbose)
+
+                    for post in graph.posts.values():
+                        fetch_tasks.append(provider.fetch_for_post(post))
+                        task_info.append((source_name, post))
+
+                # Fetch all in parallel with concurrency limit
+                semaphore = asyncio.Semaphore(max_concurrent)
+
+                async def fetch_with_limit(coro, info):
+                    async with semaphore:
+                        try:
+                            return await coro, info, None
+                        except Exception as e:
+                            return [], info, e
+
+                results = await asyncio.gather(
+                    *[fetch_with_limit(t, info) for t, info in zip(fetch_tasks, task_info)]
+                )
+
+                # Process results
+                for external_items, (source_name, post), error in results:
+                    if error:
+                        if verbose:
+                            console.print(f"[yellow]Warning: {source_name} failed for {post.slug}: {error}[/yellow]")
+                        continue
+
+                    for item in external_items:
+                        post.external_content.append(
+                            ExternalContentItem(
+                                source=item.source,
+                                title=item.title,
+                                url=item.url,
+                                description=item.description,
+                                relevance=item.relevance,
+                                metadata=item.metadata,
+                            )
+                        )
+
+                progress.remove_task(task)
+                console.print(f" [green]✓[/green] Fetched external content from {', '.join(valid_sources)}")
+            else:
+                console.print(f" [yellow]⚠ Unknown sources: {sources}. Available: {list(PROVIDERS.keys())}[/yellow]")
+
+        # Step 5: Generate HTML
+        task = progress.add_task("Generating HTML...", total=None)
+        generator = HTMLGenerator()
+        generator.generate(graph, base_url, output_html)
+        progress.remove_task(task)
+        console.print(f" [green]✓[/green] Generated {output_html}")
+
+    console.print(f"\n[bold green]Done![/bold green] Open {output_html} in a browser to view.\n")
+
+
+if __name__ == "__main__":
+    main()
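A quick, hedged smoke test for the option wiring above (assumes the package and its dependencies are importable; CliRunner is Click's standard in-process test harness, and --help touches no network):

from click.testing import CliRunner

from tiktokify.cli import main

# Invoke the CLI in-process and check the declared options surface in --help.
runner = CliRunner()
result = runner.invoke(main, ["--help"])
assert result.exit_code == 0
assert "--base-url" in result.output
assert "--max-depth" in result.output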
src/tiktokify/crawler/__init__.py
ADDED
@@ -0,0 +1,8 @@
+"""Spider crawler module for fetching website content."""
+
+from .blog_crawler import SpiderCrawler
+
+# Backward compatibility alias
+JekyllBlogCrawler = SpiderCrawler
+
+__all__ = ["SpiderCrawler", "JekyllBlogCrawler"]
src/tiktokify/crawler/blog_crawler.py
ADDED
@@ -0,0 +1,382 @@
+"""Async crawler for Jekyll blogs using crawl4ai."""
+
+import asyncio
+import re
+from datetime import datetime
+from urllib.parse import urljoin, urlparse
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+from rich.console import Console
+
+from tiktokify.models import Post, PostMetadata
+
+console = Console()
+
+
+class SpiderCrawler:
+    """Async spider crawler for any website with recursive link discovery."""
+
+    def __init__(
+        self,
+        base_url: str,
+        max_concurrent: int = 5,
+        max_depth: int = 1,
+        verbose: bool = False,
+    ):
+        self.base_url = base_url.rstrip("/")
+        self.max_concurrent = max_concurrent
+        self.max_depth = max_depth
+        self.verbose = verbose
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+        self.base_domain = urlparse(self.base_url).netloc
+
+    async def crawl(self) -> list[Post]:
+        """Main entry point - crawls entire blog and returns posts."""
+        browser_config = BrowserConfig(headless=True, verbose=self.verbose)
+
+        async with AsyncWebCrawler(config=browser_config) as crawler:
+            # Step 1: Discover post URLs
+            if self.verbose:
+                console.print("[dim]Discovering post URLs...[/dim]")
+
+            post_urls = await self._discover_post_urls(crawler)
+
+            if self.verbose:
+                console.print(f"[green]Found {len(post_urls)} posts[/green]")
+
+            # Step 2: Crawl individual posts concurrently
+            posts = await self._crawl_posts(crawler, post_urls)
+
+        return posts
+
+    async def _discover_post_urls(self, crawler: AsyncWebCrawler) -> list[str]:
+        """Discover all content URLs using spider-style recursive crawling.
+
+        Starts from base URL and follows internal links up to max_depth levels.
+        - Depth 1: Only links from seed URL (default)
+        - Depth 2: Links from seed + links from those pages
+        - etc.
+        """
+        discovered: set[str] = set()
+        visited: set[str] = set()
+
+        async def crawl_page(url: str, depth: int) -> set[str]:
+            """Crawl a single page and return new URLs found."""
+            if depth > self.max_depth or url in visited:
+                return set()
+
+            visited.add(url)
+            new_urls: set[str] = set()
+
+            try:
+                async with self.semaphore:
+                    result = await crawler.arun(
+                        url=url,
+                        config=CrawlerRunConfig(wait_until="domcontentloaded"),
+                    )
+
+                if not result.success:
+                    return set()
+
+                # Extract links from crawl4ai
+                if result.links:
+                    for link in result.links.get("internal", []):
+                        href = link.get("href", "") if isinstance(link, dict) else str(link)
+                        if self._is_content_url(href, self.base_domain):
+                            full_url = href if href.startswith("http") else urljoin(url, href)
+                            if full_url not in discovered:
+                                new_urls.add(full_url)
+
+                # Also parse HTML directly as fallback
+                if result.html:
+                    hrefs = re.findall(r'href=["\']([^"\']+)["\']', result.html)
+                    for href in hrefs:
+                        if self._is_content_url(href, self.base_domain):
+                            full_url = href if href.startswith("http") else urljoin(url, href)
+                            if full_url not in discovered:
+                                new_urls.add(full_url)
+
+                discovered.update(new_urls)
+
+                if self.verbose and new_urls:
+                    console.print(f"[dim]Depth {depth}: Found {len(new_urls)} URLs from {url}[/dim]")
+
+            except Exception as e:
+                if self.verbose:
+                    console.print(f"[yellow]Warning: Failed to crawl {url}: {e}[/yellow]")
+
+            return new_urls
+
+        # Start with seed URL
+        if self.verbose:
+            console.print(f"[dim]Spider crawling with max_depth={self.max_depth}[/dim]")
+
+        # Depth 1: crawl seed URL
+        current_urls = await crawl_page(self.base_url, 1)
+
+        # Deeper levels: recursively crawl discovered URLs
+        for depth in range(2, self.max_depth + 1):
+            if not current_urls:
+                break
+
+            if self.verbose:
+                console.print(f"[dim]Crawling depth {depth}: {len(current_urls)} URLs to explore[/dim]")
+
+            # Crawl all current URLs in parallel
+            tasks = [crawl_page(url, depth) for url in current_urls]
+            results = await asyncio.gather(*tasks, return_exceptions=True)
+
+            # Collect new URLs for next depth
+            next_urls: set[str] = set()
+            for result in results:
+                if isinstance(result, set):
+                    next_urls.update(result)
+
+            current_urls = next_urls
+
+        if self.verbose:
+            console.print(f"[dim]Total discovered: {len(discovered)} content URLs[/dim]")
+
+        return list(discovered)
+
+    def _is_content_url(self, href: str, base_domain: str) -> bool:
+        """Check if URL is internal content (not static asset or utility page).
+
+        This is a simple filter - accepts anything that's:
+        1. On the same domain
+        2. Not a static asset (css, js, images, fonts)
+        3. Not a utility link (mailto, javascript, anchor)
+        """
+        if not href:
+            return False
+
+        # Skip anchors, mailto, javascript
+        if href.startswith(("#", "mailto:", "javascript:", "tel:")):
+            return False
+
+        # Skip static assets
+        static_extensions = (
+            ".css", ".js", ".json", ".xml", ".rss", ".atom",
+            ".png", ".jpg", ".jpeg", ".gif", ".svg", ".ico", ".webp",
+            ".woff", ".woff2", ".ttf", ".eot", ".otf",
+            ".pdf", ".zip", ".tar", ".gz",
+            ".mp3", ".mp4", ".webm", ".ogg",
+        )
+        if any(href.lower().endswith(ext) for ext in static_extensions):
+            return False
+
+        # Check if it's an external link
+        if href.startswith(("http://", "https://")):
+            parsed = urlparse(href)
+            if parsed.netloc != base_domain:
+                return False
+
+        # Skip the base URL itself (index page)
+        path = urlparse(href).path if href.startswith("http") else href
+        if path in ("", "/", "/index.html", "/index.htm"):
+            return False
+
+        return True
+
+    async def _crawl_posts(
+        self, crawler: AsyncWebCrawler, urls: list[str]
+    ) -> list[Post]:
+        """Crawl all post URLs concurrently with semaphore."""
+
+        async def crawl_one(url: str) -> Post | None:
+            async with self.semaphore:
+                return await self._crawl_single_post(crawler, url)
+
+        tasks = [crawl_one(url) for url in urls]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        posts = []
+        for i, result in enumerate(results):
+            if isinstance(result, Post):
+                posts.append(result)
+            elif isinstance(result, Exception) and self.verbose:
+                console.print(f"[yellow]Failed to crawl {urls[i]}: {result}[/yellow]")
+
+        return posts
+
+    async def _crawl_single_post(
+        self, crawler: AsyncWebCrawler, url: str
+    ) -> Post | None:
+        """Crawl and parse a single post."""
+        try:
+            result = await crawler.arun(
+                url=url,
+                config=CrawlerRunConfig(wait_until="domcontentloaded"),
+            )
+
+            if not result.success:
+                return None
+
+            # Extract metadata from HTML
+            metadata = self._extract_metadata(result.html, url)
+
+            # Use markdown for clean text (TF-IDF)
+            content_text = result.markdown or ""
+
+            # Calculate reading time (~200 words/min)
+            word_count = len(content_text.split())
+            reading_time = max(1, word_count // 200)
+
+            # Extract slug from URL
+            slug = self._extract_slug(url)
+
+            return Post(
+                url=url,
+                slug=slug,
+                metadata=metadata,
+                content_text=content_text,
+                content_html=result.html or "",
+                reading_time_minutes=reading_time,
+            )
+        except Exception as e:
+            if self.verbose:
+                console.print(f"[yellow]Error parsing {url}: {e}[/yellow]")
+            return None
+
+    def _extract_metadata(self, html: str, url: str) -> PostMetadata:
+        """Extract metadata from rendered HTML using regex (works with various blog themes)."""
+        # Try multiple patterns for title
+        title = "Untitled"
+        title_patterns = [
+            # Jekyll Clean Blog theme
+            r'<h1[^>]*class="[^"]*post-title[^"]*"[^>]*>([^<]+)</h1>',
+            # WordPress/common patterns
+            r'<h1[^>]*class="[^"]*entry-title[^"]*"[^>]*>([^<]+)</h1>',
+            r'<h1[^>]*class="[^"]*article-title[^"]*"[^>]*>([^<]+)</h1>',
+            r'<h1[^>]*class="[^"]*title[^"]*"[^>]*>([^<]+)</h1>',
+            # Meta og:title
+            r'<meta[^>]*property="og:title"[^>]*content="([^"]+)"',
+            r'<meta[^>]*name="title"[^>]*content="([^"]+)"',
+            # Generic h1
+            r"<h1[^>]*>([^<]+)</h1>",
+            # Title tag fallback
+            r"<title>([^<|]+)",
+        ]
+        for pattern in title_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                title = re.sub(r"<[^>]+>", "", match.group(1)).strip()
+                if title:
+                    break
+
+        # Try multiple patterns for date; keep a handle on the fallback value
+        # so the "no date found" check below compares against the same object.
+        date = now = datetime.now()
+        date_patterns = [
+            # Various date formats
+            (r"Posted on (\w+ \d+, \d{4})", "%B %d, %Y"),
+            (r'datetime="(\d{4}-\d{2}-\d{2})', "%Y-%m-%d"),
+            (r"(\d{4}-\d{2}-\d{2})", "%Y-%m-%d"),
+            (r"(\w+ \d{1,2}, \d{4})", "%B %d, %Y"),
+            (r"(\d{1,2} \w+ \d{4})", "%d %B %Y"),
+            (r'<time[^>]*>([^<]+)</time>', None),  # Will try multiple formats
+        ]
+        for pattern, fmt in date_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                date_str = match.group(1).strip()
+                if fmt:
+                    try:
+                        date = datetime.strptime(date_str, fmt)
+                        break
+                    except ValueError:
+                        continue
+                else:
+                    # Try common formats
+                    for try_fmt in ["%B %d, %Y", "%Y-%m-%d", "%d %B %Y", "%b %d, %Y"]:
+                        try:
+                            date = datetime.strptime(date_str, try_fmt)
+                            break
+                        except ValueError:
+                            continue
+
+        # Extract date from URL if not found in HTML
+        if date == now:
+            url_date = re.search(r"(20\d{2})[/-](\d{1,2})[/-](\d{1,2})", url)
+            if url_date:
+                try:
+                    date = datetime(int(url_date.group(1)), int(url_date.group(2)), int(url_date.group(3)))
+                except ValueError:
+                    pass
+
+        # Tags from various patterns
+        tags = []
+        tag_patterns = [
+            r'<span[^>]*class="[^"]*badge[^"]*"[^>]*>([^<]+)</span>',
+            r'<a[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</a>',
+            r'rel="tag"[^>]*>([^<]+)</a>',
+            r'<span[^>]*class="[^"]*tag[^"]*"[^>]*>([^<]+)</span>',
+        ]
+        for pattern in tag_patterns:
+            found = re.findall(pattern, html, re.IGNORECASE)
+            tags.extend([t.strip() for t in found if t.strip()])
+        tags = list(set(tags))[:10]  # Dedupe and limit
+
+        # Category from URL
+        path = urlparse(url).path
+        parts = [p for p in path.strip("/").split("/") if p and not re.match(r"^\d+$", p)]
+        # Skip date-like parts and get first meaningful segment
+        categories = []
+        for part in parts[:-1]:  # Exclude last part (the slug)
+            if not re.match(r"^20\d{2}$", part) and part not in ["blog", "posts", "articles"]:
+                categories.append(part)
+                break
+
+        # Header image from various patterns
+        header_img = None
+        img_patterns = [
+            r'class="[^"]*intro-header[^"]*"[^>]*style="[^"]*url\([\'"]?([^\'")\s]+)',
+            r'class="[^"]*featured[^"]*"[^>]*src="([^"]+)"',
+            r'<meta[^>]*property="og:image"[^>]*content="([^"]+)"',
+            r'class="[^"]*post-image[^"]*"[^>]*src="([^"]+)"',
+            r'class="[^"]*hero[^"]*"[^>]*src="([^"]+)"',
+        ]
+        for pattern in img_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                header_img = match.group(1)
+                break
+
+        # Subtitle/description from various patterns
+        subtitle = None
+        subtitle_patterns = [
+            r'<span[^>]*class="[^"]*subheading[^"]*"[^>]*>([^<]+)</span>',
+            r'<p[^>]*class="[^"]*subtitle[^"]*"[^>]*>([^<]+)</p>',
+            r'<meta[^>]*name="description"[^>]*content="([^"]+)"',
+            r'<meta[^>]*property="og:description"[^>]*content="([^"]+)"',
+        ]
+        for pattern in subtitle_patterns:
+            match = re.search(pattern, html, re.IGNORECASE)
+            if match:
+                subtitle = match.group(1).strip()
+                if len(subtitle) > 200:
+                    subtitle = subtitle[:197] + "..."
+                break
+
+        return PostMetadata(
+            title=title,
+            date=date,
+            categories=categories,
+            tags=tags,
+            subtitle=subtitle,
+            header_img=header_img,
+        )
+
+    def _extract_slug(self, url: str) -> str:
+        """Extract slug from post URL."""
+        path = urlparse(url).path
+        # Remove trailing slash and get last meaningful part
+        path = path.rstrip("/")
+        if not path:
+            return "index"
+        slug = path.rsplit("/", 1)[-1]
+        # Remove common extensions
+        for ext in (".html", ".htm", ".php", ".aspx"):
+            if slug.endswith(ext):
+                slug = slug[:-len(ext)]
+                break
+        return slug or "page"
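The _is_content_url filter above is easy to exercise standalone; a hedged sketch (crawl4ai must be installed since this module imports it at the top, but no browser or network is started):

from tiktokify.crawler.blog_crawler import SpiderCrawler

crawler = SpiderCrawler(base_url="https://example.com")
# Static assets and utility links are rejected
assert not crawler._is_content_url("/assets/main.css", "example.com")
assert not crawler._is_content_url("mailto:me@example.com", "example.com")
# Absolute links to another domain are rejected
assert not crawler._is_content_url("https://other.com/post/", "example.com")
# A dated post path on the same domain passes
assert crawler._is_content_url("/2023/05/some-post/", "example.com")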
src/tiktokify/enrichment/__init__.py
ADDED
@@ -0,0 +1,20 @@
+"""Enrichment module for LLM-based post enrichment."""
+
+from .base import ContentProvider, ExternalContent
+from .llm_enricher import PostEnricher
+from .providers import (
+    HackerNewsProvider,
+    HNFrontPageProvider,
+    LinkedContentProvider,
+    WikipediaProvider,
+)
+
+__all__ = [
+    "PostEnricher",
+    "ContentProvider",
+    "ExternalContent",
+    "WikipediaProvider",
+    "HackerNewsProvider",
+    "HNFrontPageProvider",
+    "LinkedContentProvider",
+]
src/tiktokify/enrichment/base.py
ADDED
@@ -0,0 +1,66 @@
+"""Base classes for content providers."""
+
+from abc import ABC, abstractmethod
+from typing import Optional
+
+from pydantic import BaseModel, Field, HttpUrl
+
+from tiktokify.models import Post
+
+
+class ExternalContent(BaseModel):
+    """A piece of external content from any source."""
+
+    source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
+    title: str
+    url: HttpUrl
+    description: str = Field(default="", description="Brief description or excerpt")
+    relevance: str = Field(default="", description="Why this is relevant to the post")
+    metadata: dict = Field(default_factory=dict, description="Source-specific metadata")
+
+
+class ContentProvider(ABC):
+    """Abstract base class for external content providers.
+
+    To add a new source:
+    1. Create a new file (e.g., hackernews.py)
+    2. Subclass ContentProvider
+    3. Implement source_type and fetch_for_post
+    4. Register it in the PROVIDERS map in cli.py
+    """
+
+    def __init__(self, max_items: int = 3, verbose: bool = False):
+        self.max_items = max_items
+        self.verbose = verbose
+
+    @property
+    @abstractmethod
+    def source_type(self) -> str:
+        """Unique identifier for this source (e.g., 'wikipedia', 'hackernews')."""
+        pass
+
+    @abstractmethod
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Fetch relevant external content for a blog post.
+
+        Args:
+            post: The blog post to find related content for
+
+        Returns:
+            List of ExternalContent items (up to max_items)
+        """
+        pass
+
+    async def fetch_for_posts(self, posts: list[Post]) -> dict[str, list[ExternalContent]]:
+        """Fetch content for multiple posts.
+
+        Default implementation calls fetch_for_post sequentially.
+        Override for batch optimization.
+
+        Returns:
+            Dict mapping post slug to list of ExternalContent
+        """
+        results = {}
+        for post in posts:
+            results[post.slug] = await self.fetch_for_post(post)
+        return results
src/tiktokify/enrichment/llm_enricher.py
ADDED
@@ -0,0 +1,180 @@
+"""LLM-based post enrichment using litellm.
+
+This module uses LLM to:
+1. Generate key points/takeaways for each post
+2. Suggest relevant Wikipedia articles
+
+The actual Wikipedia extract fetching is done by providers/wikipedia.py
+"""
+
+import asyncio
+import json
+
+import litellm
+from pydantic import ValidationError
+from rich.console import Console
+
+from tiktokify.models import Post, WikipediaSuggestion
+
+console = Console()
+
+# Disable litellm's verbose logging
+litellm.suppress_debug_info = True
+
+
+class PostEnricher:
+    """Enrich posts with key points and Wikipedia suggestions using LLM."""
+
+    def __init__(
+        self,
+        model: str = "gpt-4o-mini",
+        max_key_points: int = 5,
+        max_wikipedia: int = 3,
+        max_concurrent: int = 3,
+        verbose: bool = False,
+    ):
+        self.model = model
+        self.max_key_points = max_key_points
+        self.max_wikipedia = max_wikipedia
+        self.max_concurrent = max_concurrent
+        self.verbose = verbose
+        self.semaphore = asyncio.Semaphore(max_concurrent)
+
+    async def enrich_post(self, post: Post) -> None:
+        """Enrich a single post with key points and Wikipedia suggestions."""
+        prompt = self._build_prompt(post)
+
+        try:
+            # Calculate tokens needed: ~50 tokens per key point, ~100 per wiki suggestion
+            estimated_tokens = (self.max_key_points * 50) + (self.max_wikipedia * 100) + 200
+            max_tokens = max(1000, min(estimated_tokens, 4000))
+
+            response = await litellm.acompletion(
+                model=self.model,
+                messages=[{"role": "user", "content": prompt}],
+                temperature=0.3,
+                max_tokens=max_tokens,
+            )
+
+            content = response.choices[0].message.content
+            key_points, wikipedia = self._parse_response(content)
+
+            # Fetch Wikipedia extracts for each suggestion
+            wikipedia_with_extracts = await self._fetch_wiki_extracts(wikipedia)
+
+            post.key_points = key_points
+            post.wikipedia_suggestions = wikipedia_with_extracts
+
+        except Exception as e:
+            if self.verbose:
+                console.print(
+                    f"[yellow]Warning: LLM call failed for {post.slug}: {e}[/yellow]"
+                )
+
+    async def _fetch_wiki_extracts(
+        self, suggestions: list[WikipediaSuggestion]
+    ) -> list[WikipediaSuggestion]:
+        """Fetch Wikipedia extracts for all suggestions concurrently."""
+        from tiktokify.enrichment.providers.wikipedia import WikipediaProvider
+
+        provider = WikipediaProvider(max_items=len(suggestions), verbose=self.verbose)
+
+        async def fetch_one(suggestion: WikipediaSuggestion) -> WikipediaSuggestion:
+            extract = await provider._fetch_extract(
+                provider._extract_title_from_url(str(suggestion.url)) or suggestion.title
+            )
+            return WikipediaSuggestion(
+                title=suggestion.title,
+                url=suggestion.url,
+                relevance=suggestion.relevance,
+                extract=extract,
+            )
+
+        tasks = [fetch_one(s) for s in suggestions]
+        results = await asyncio.gather(*tasks, return_exceptions=True)
+
+        return [r for r in results if isinstance(r, WikipediaSuggestion)]
+
+    def _build_prompt(self, post: Post) -> str:
+        """Build LLM prompt for key points and Wikipedia suggestions."""
+        content_excerpt = post.content_text[:2000] if post.content_text else ""
+
+        return f"""Analyze this blog post and provide:
+1. {self.max_key_points} key points/takeaways (concise bullet points)
+2. {self.max_wikipedia} relevant Wikipedia articles for further reading
+
+Title: {post.metadata.title}
+Subtitle: {post.metadata.subtitle or "N/A"}
+Categories: {', '.join(post.metadata.categories)}
+Tags: {', '.join(post.metadata.tags)}
+
+Content:
+{content_excerpt}
+
+Return ONLY valid JSON with this exact structure:
+{{
+  "keyPoints": ["point 1", "point 2", ...],
+  "wikipedia": [
+    {{"title": "Article Title", "url": "https://en.wikipedia.org/wiki/...", "relevance": "Why it's relevant"}}
+  ]
+}}
+
+Guidelines:
+- Key points should be insightful takeaways, not just summaries
+- Each key point should be 1-2 sentences max
+- Wikipedia URLs must be valid (use underscores for spaces)
+- Return ONLY the JSON, no markdown formatting"""
+
+    def _parse_response(self, content: str) -> tuple[list[str], list[WikipediaSuggestion]]:
+        """Parse LLM response into key points and Wikipedia suggestions."""
+        # Clean up response - remove markdown code blocks if present
+        content = content.strip()
+        if content.startswith("```"):
+            lines = content.split("\n")
+            content = "\n".join(
+                line for line in lines if not line.startswith("```")
+            )
+
+        key_points: list[str] = []
+        wikipedia: list[WikipediaSuggestion] = []
+
+        try:
+            data = json.loads(content)
+
+            # Parse key points
+            if "keyPoints" in data and isinstance(data["keyPoints"], list):
+                key_points = [str(p) for p in data["keyPoints"] if p]
+
+            # Parse Wikipedia suggestions
+            if "wikipedia" in data and isinstance(data["wikipedia"], list):
+                for item in data["wikipedia"]:
+                    try:
+                        suggestion = WikipediaSuggestion(
+                            title=item.get("title", ""),
+                            url=item.get("url", ""),
+                            relevance=item.get("relevance", ""),
+                            extract="",  # Will be filled later
+                        )
+                        wikipedia.append(suggestion)
+                    except ValidationError:
+                        continue
+
+        except json.JSONDecodeError as e:
+            if self.verbose:
+                console.print(f"[yellow]JSON parse error: {e}[/yellow]")
+
+        return key_points, wikipedia
+
+    async def enrich_posts(self, posts: list[Post]) -> None:
+        """Enrich all posts concurrently."""
+
+        async def enrich_one(post: Post) -> None:
+            async with self.semaphore:
+                await self.enrich_post(post)
+
+        tasks = [enrich_one(post) for post in posts]
+        await asyncio.gather(*tasks, return_exceptions=True)
+
+
+# Backwards compatibility alias
+WikipediaSuggester = PostEnricher
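A hand-written payload showing the JSON shape _parse_response expects, exercising the fenced-code cleanup path (litellm must be importable since the module configures it at import time; no model is called, and the payload content is illustrative):

from tiktokify.enrichment.llm_enricher import PostEnricher

enricher = PostEnricher()
raw = """```json
{"keyPoints": ["Illustrative takeaway."],
 "wikipedia": [{"title": "Cosine similarity",
   "url": "https://en.wikipedia.org/wiki/Cosine_similarity",
   "relevance": "Underlies TF-IDF recommendations"}]}
```"""
key_points, wiki = enricher._parse_response(raw)
assert key_points == ["Illustrative takeaway."]
assert wiki[0].title == "Cosine similarity"
assert wiki[0].extract == ""  # filled later by _fetch_wiki_extracts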
src/tiktokify/enrichment/providers/__init__.py
ADDED
@@ -0,0 +1,12 @@
+"""Content providers for external sources."""
+
+from .hackernews import HackerNewsProvider, HNFrontPageProvider
+from .links import LinkedContentProvider
+from .wikipedia import WikipediaProvider
+
+__all__ = [
+    "WikipediaProvider",
+    "HackerNewsProvider",
+    "HNFrontPageProvider",
+    "LinkedContentProvider",
+]
src/tiktokify/enrichment/providers/hackernews.py
ADDED
@@ -0,0 +1,242 @@
+"""Hacker News content providers.
+
+Provides two providers:
+- HackerNewsProvider: Keyword-based search for stories related to post topics
+- HNFrontPageProvider: Current front page stories for general interest
+"""
+
+import asyncio
+import re
+
+import httpx
+
+from tiktokify.enrichment.base import ContentProvider, ExternalContent
+from tiktokify.models import Post
+
+
+async def fetch_article_excerpt(url: str, max_chars: int = 800) -> str:
+    """Fetch and extract text excerpt from an article URL."""
+    if not url:
+        return ""
+
+    try:
+        async with httpx.AsyncClient() as client:
+            response = await client.get(
+                url,
+                headers={
+                    "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
+                    "Accept": "text/html,application/xhtml+xml",
+                },
+                timeout=10.0,
+                follow_redirects=True,
+            )
+
+            if response.status_code != 200:
+                return ""
+
+            html = response.text
+
+            # Remove script, style, nav, header, footer tags
+            html = re.sub(r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>", "", html, flags=re.DOTALL | re.IGNORECASE)
+
+            # Extract text from paragraph tags (most content is in <p>)
+            paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", html, flags=re.DOTALL | re.IGNORECASE)
+
+            # Clean HTML tags from extracted text
+            text_parts = []
+            for p in paragraphs:
+                clean = re.sub(r"<[^>]+>", " ", p)
+                clean = re.sub(r"\s+", " ", clean).strip()
+                if len(clean) > 50:  # Skip very short paragraphs
+                    text_parts.append(clean)
+
+            if not text_parts:
+                # Fallback: extract all text
+                text = re.sub(r"<[^>]+>", " ", html)
+                text = re.sub(r"\s+", " ", text).strip()
+                return text[:max_chars] + "..." if len(text) > max_chars else text
+
+            excerpt = " ".join(text_parts)
+            if len(excerpt) > max_chars:
+                excerpt = excerpt[:max_chars].rsplit(" ", 1)[0] + "..."
+            return excerpt
+
+    except Exception:
+        return ""
+
+
+class HackerNewsProvider(ContentProvider):
+    """Fetch relevant Hacker News discussions for blog posts.
+
+    Uses the Algolia HN Search API to find related stories by keyword.
+    """
+
+    HN_SEARCH_URL = "https://hn.algolia.com/api/v1/search"
+
+    @property
+    def source_type(self) -> str:
+        return "hackernews"
+
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Search HN for stories related to the post's topics."""
+        # Build search query from post metadata
+        query_parts = []
+
+        # Use tags (most specific)
+        if post.metadata.tags:
+            query_parts.extend(post.metadata.tags[:3])
+
+        # Add key terms from title
+        title_words = [
+            w for w in post.metadata.title.split()
+            if len(w) > 4 and w.lower() not in {"about", "using", "with", "from", "that", "this", "what", "when", "where", "which"}
+        ]
+        query_parts.extend(title_words[:2])
+
+        if not query_parts:
+            return []
+
+        query = " ".join(query_parts)
+
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    self.HN_SEARCH_URL,
+                    params={
+                        "query": query,
+                        "tags": "story",
+                        "hitsPerPage": self.max_items * 2,  # Fetch extra for filtering
+                    },
+                    timeout=10.0,
+                )
+
+                if response.status_code != 200:
+                    return []
+
+                data = response.json()
+                hits = data.get("hits", [])
+
+                # Prepare hits for parallel fetching
+                selected_hits = hits[: self.max_items]
+                story_urls = [hit.get("url", "") for hit in selected_hits]
+
+                # Fetch all article excerpts in parallel
+                excerpts = await asyncio.gather(
+                    *[fetch_article_excerpt(url) for url in story_urls],
+                    return_exceptions=True,
+                )
+
+                results = []
+                for hit, excerpt in zip(selected_hits, excerpts):
+                    story_id = hit.get("objectID", "")
+                    hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                    title = hit.get("title", "")
+                    points = hit.get("points", 0)
+                    num_comments = hit.get("num_comments", 0)
+                    author = hit.get("author", "")
+                    story_url = hit.get("url", "")
+
+                    # Handle exceptions from parallel fetch
+                    if isinstance(excerpt, Exception) or not excerpt:
+                        excerpt = f"{points} points · {num_comments} comments"
+
+                    results.append(
+                        ExternalContent(
+                            source=self.source_type,
+                            title=title,
+                            url=hn_url,
+                            description=excerpt,
+                            relevance=f"Found via search: {query}",
+                            metadata={
+                                "points": points,
+                                "num_comments": num_comments,
+                                "author": author,
+                                "story_url": story_url,
+                            },
+                        )
+                    )
+
+                return results
+
+        except Exception:
+            return []
+
+
+class HNFrontPageProvider(ContentProvider):
+    """Fetch current Hacker News front page stories.
+
+    Uses the Algolia HN API to get stories currently on the front page.
+    Good for adding general tech interest content to any blog.
+    """
+
+    HN_FRONT_PAGE_URL = "https://hn.algolia.com/api/v1/search"
+
+    @property
+    def source_type(self) -> str:
+        return "hn-frontpage"
+
+    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
+        """Fetch current front page stories (post-independent)."""
+        try:
+            async with httpx.AsyncClient() as client:
+                response = await client.get(
+                    self.HN_FRONT_PAGE_URL,
+                    params={
+                        "tags": "front_page",
+                        "hitsPerPage": self.max_items,
+                    },
+                    timeout=10.0,
+                )
+
+                if response.status_code != 200:
+                    return []
+
+                data = response.json()
+                hits = data.get("hits", [])
+
+                # Prepare hits for parallel fetching
+                selected_hits = hits[: self.max_items]
+                story_urls = [hit.get("url", "") for hit in selected_hits]
+
+                # Fetch all article excerpts in parallel
+                excerpts = await asyncio.gather(
+                    *[fetch_article_excerpt(url) for url in story_urls],
+                    return_exceptions=True,
+                )
+
+                results = []
+                for hit, excerpt in zip(selected_hits, excerpts):
+                    story_id = hit.get("objectID", "")
+                    hn_url = f"https://news.ycombinator.com/item?id={story_id}"
+
+                    title = hit.get("title", "")
+                    points = hit.get("points", 0)
+                    num_comments = hit.get("num_comments", 0)
+                    author = hit.get("author", "")
+                    story_url = hit.get("url", "")
+
+                    # Handle exceptions from parallel fetch
+                    if isinstance(excerpt, Exception) or not excerpt:
+                        excerpt = f"{points} points · {num_comments} comments"
+
+                    results.append(
+                        ExternalContent(
+                            source=self.source_type,
+                            title=title,
+                            url=hn_url,
+                            description=excerpt,
+                            relevance="Currently on HN front page",
+                            metadata={
+                                "points": points,
+                                "num_comments": num_comments,
+                                "author": author,
+                                "story_url": story_url,
+                            },
+                        )
+                    )
+
+                return results
+
+        except Exception:
+            return []
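Both providers above hit the same Algolia endpoint, so it can be poked standalone; a hedged sketch with an arbitrary query string:

import asyncio

import httpx


async def demo() -> None:
    async with httpx.AsyncClient() as client:
        resp = await client.get(
            "https://hn.algolia.com/api/v1/search",
            params={"query": "static site generator", "tags": "story", "hitsPerPage": 3},
            timeout=10.0,
        )
    for hit in resp.json().get("hits", []):
        # Same discussion-URL scheme the providers build from objectID
        print(hit.get("title"), f"https://news.ycombinator.com/item?id={hit.get('objectID')}")


asyncio.run(demo())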
src/tiktokify/enrichment/providers/links.py
ADDED
|
@@ -0,0 +1,210 @@
"""Link extractor provider for crawling external links from blog posts."""

import asyncio
import re
from urllib.parse import urljoin, urlparse

import httpx

from tiktokify.enrichment.base import ContentProvider, ExternalContent
from tiktokify.models import Post


async def fetch_link_metadata(url: str, max_excerpt_chars: int = 600) -> tuple[str, str]:
    """Fetch title and excerpt from a URL.

    Returns (title, excerpt) tuple.
    """
    try:
        async with httpx.AsyncClient() as client:
            response = await client.get(
                url,
                headers={
                    "User-Agent": "TikTokify/1.0 (Mozilla/5.0 compatible)",
                    "Accept": "text/html,application/xhtml+xml",
                },
                timeout=10.0,
                follow_redirects=True,
            )

            if response.status_code != 200:
                return "", ""

            html = response.text

            # Extract title
            title_match = re.search(r"<title[^>]*>(.*?)</title>", html, re.IGNORECASE | re.DOTALL)
            title = ""
            if title_match:
                title = re.sub(r"<[^>]+>", "", title_match.group(1))
                title = re.sub(r"\s+", " ", title).strip()

            # Try meta description first
            meta_desc = re.search(
                r'<meta[^>]*name=["\']description["\'][^>]*content=["\'](.*?)["\']',
                html,
                re.IGNORECASE,
            )
            if meta_desc:
                excerpt = meta_desc.group(1).strip()
                return title, excerpt

            # Remove script, style, nav, header, footer tags
            clean_html = re.sub(
                r"<(script|style|nav|header|footer|aside)[^>]*>.*?</\1>",
                "",
                html,
                flags=re.DOTALL | re.IGNORECASE,
            )

            # Extract text from paragraph tags
            paragraphs = re.findall(r"<p[^>]*>(.*?)</p>", clean_html, flags=re.DOTALL | re.IGNORECASE)

            text_parts = []
            for p in paragraphs:
                clean = re.sub(r"<[^>]+>", " ", p)
                clean = re.sub(r"\s+", " ", clean).strip()
                if len(clean) > 50:
                    text_parts.append(clean)

            excerpt = " ".join(text_parts)
            if len(excerpt) > max_excerpt_chars:
                excerpt = excerpt[:max_excerpt_chars].rsplit(" ", 1)[0] + "..."

            return title, excerpt

    except Exception:
        return "", ""


class LinkedContentProvider(ContentProvider):
    """Extract and crawl external links from blog post content.

    Finds links within the blog post HTML and fetches their content,
    creating a "spider" of related content from the post's references.
    """

    # Domains to skip (social media, generic sites, etc.)
    SKIP_DOMAINS = {
        "twitter.com",
        "x.com",
        "facebook.com",
        "instagram.com",
        "linkedin.com",
        "youtube.com",
        "youtu.be",
        "github.com",
        "gist.github.com",
        "reddit.com",
        "news.ycombinator.com",
        "google.com",
        "amazon.com",
        "wikipedia.org",  # Already have Wikipedia provider
        "fonts.googleapis.com",
        "cdn.jsdelivr.net",
        "unpkg.com",
        "cloudflare.com",
    }

    @property
    def source_type(self) -> str:
        return "linked"

    def _extract_links(self, html: str, base_url: str) -> list[str]:
        """Extract external links from HTML content."""
        # Find all href links
        links = re.findall(r'href=["\']([^"\']+)["\']', html, re.IGNORECASE)

        parsed_base = urlparse(base_url)
        base_domain = parsed_base.netloc.lower()

        external_links = []
        seen = set()

        for link in links:
            # Skip anchor links, mailto, javascript, etc.
            if link.startswith(("#", "mailto:", "javascript:", "tel:")):
                continue

            # Resolve relative URLs
            if link.startswith("/"):
                link = urljoin(base_url, link)
            elif not link.startswith(("http://", "https://")):
                continue

            # Parse and validate
            parsed = urlparse(link)
            domain = parsed.netloc.lower()

            # Skip internal links
            if domain == base_domain or domain.endswith(f".{base_domain}"):
                continue

            # Skip blocked domains
            if any(skip in domain for skip in self.SKIP_DOMAINS):
                continue

            # Skip non-http(s) links
            if parsed.scheme not in ("http", "https"):
                continue

            # Skip duplicates
            normalized = f"{parsed.scheme}://{domain}{parsed.path}"
            if normalized in seen:
                continue
            seen.add(normalized)

            external_links.append(link)

        return external_links

    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
        """Extract links from post content and fetch their metadata."""
        if not post.content_html:
            return []

        # Extract external links
        links = self._extract_links(post.content_html, post.url)

        if not links:
            return []

        # Limit to max_items and fetch all in parallel
        selected_links = links[: self.max_items]

        metadata_results = await asyncio.gather(
            *[fetch_link_metadata(link) for link in selected_links],
            return_exceptions=True,
        )

        results = []
        for link, meta in zip(selected_links, metadata_results):
            # Handle exceptions
            if isinstance(meta, Exception):
                continue

            title, excerpt = meta

            if not title and not excerpt:
                continue

            # Use URL domain as fallback title
            if not title:
                parsed = urlparse(link)
                title = parsed.netloc

            results.append(
                ExternalContent(
                    source=self.source_type,
                    title=title,
                    url=link,
                    description=excerpt,
                    relevance=f"Referenced in: {post.metadata.title}",
                    metadata={
                        "source_post_slug": post.slug,
                        "link_type": "reference",
                    },
                )
            )

        return results
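
The link filtering in _extract_links is pure string work, so its behavior can be illustrated without the network. A sketch with hypothetical HTML, assuming the ContentProvider base class allows no-argument construction (adjust if base.py requires parameters); same-domain links, SKIP_DOMAINS hits, and anchors are dropped, and relative paths resolve against the post URL:

from tiktokify.enrichment.providers.links import LinkedContentProvider

provider = LinkedContentProvider()  # assumed constructible with defaults
html = (
    '<a href="#top">anchor</a>'
    '<a href="/about">relative, resolves to the blog itself</a>'
    '<a href="https://twitter.com/someone">blocked domain</a>'
    '<a href="https://example.org/paper">kept external link</a>'
)
links = provider._extract_links(html, "https://myblog.dev/posts/hello")
print(links)  # expected: ['https://example.org/paper']
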
src/tiktokify/enrichment/providers/wikipedia.py
ADDED
@@ -0,0 +1,78 @@
"""Wikipedia content provider."""

from urllib.parse import unquote, urlparse

import httpx

from tiktokify.enrichment.base import ContentProvider, ExternalContent
from tiktokify.models import Post


class WikipediaProvider(ContentProvider):
    """Fetch relevant Wikipedia articles for blog posts.

    Uses Wikipedia REST API to fetch article summaries.
    Requires LLM to first suggest relevant articles (see PostEnricher).
    """

    @property
    def source_type(self) -> str:
        return "wikipedia"

    async def fetch_for_post(self, post: Post) -> list[ExternalContent]:
        """Fetch Wikipedia extracts for pre-suggested articles."""
        results = []

        for suggestion in post.wikipedia_suggestions[: self.max_items]:
            title = self._extract_title_from_url(str(suggestion.url)) or suggestion.title
            extract = await self._fetch_extract(title)

            results.append(
                ExternalContent(
                    source=self.source_type,
                    title=suggestion.title,
                    url=suggestion.url,
                    description=extract,
                    relevance=suggestion.relevance,
                    metadata={"extract": extract},
                )
            )

        return results

    async def _fetch_extract(self, title: str, max_chars: int = 1500) -> str:
        """Fetch article extract from Wikipedia API."""
        title = title.strip()
        url = "https://en.wikipedia.org/api/rest_v1/page/summary/" + title.replace(" ", "_")

        try:
            async with httpx.AsyncClient() as client:
                response = await client.get(
                    url,
                    headers={"User-Agent": "TikTokify/1.0"},
                    timeout=10.0,
                    follow_redirects=True,
                )

                if response.status_code == 200:
                    data = response.json()
                    extract = data.get("extract", "")
                    if len(extract) > max_chars:
                        extract = extract[:max_chars].rsplit(" ", 1)[0] + "..."
                    return extract
        except Exception:
            pass

        return ""

    def _extract_title_from_url(self, url: str) -> str:
        """Extract Wikipedia article title from URL."""
        parsed = urlparse(url)
        if "wikipedia.org" in parsed.netloc:
            path = parsed.path
            if path.startswith("/wiki/"):
                title = path[6:]
                title = unquote(title)
                title = title.replace("_", " ")
                return title
        return ""
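
The URL-to-title mapping is also pure string handling, so a quick offline sketch (again assuming the base class permits default construction) shows what it does: strip the "/wiki/" prefix, undo percent-encoding, and turn underscores into spaces:

from tiktokify.enrichment.providers.wikipedia import WikipediaProvider

provider = WikipediaProvider()  # assumed constructible with defaults
print(provider._extract_title_from_url("https://en.wikipedia.org/wiki/TF%E2%80%93IDF"))
# -> "TF–IDF"
print(provider._extract_title_from_url("https://example.com/wiki/Nope"))
# -> "" (host is not wikipedia.org)
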
src/tiktokify/generator/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""HTML generator module."""

from .html_generator import HTMLGenerator

__all__ = ["HTMLGenerator"]
src/tiktokify/generator/html_generator.py
ADDED
@@ -0,0 +1,52 @@
"""HTML generator for TikTok-style swipe UI."""

import json
from pathlib import Path

from jinja2 import Environment, FileSystemLoader

from tiktokify.models import RecommendationGraph


class HTMLGenerator:
    """Generate standalone HTML with embedded data and swipe UI."""

    def __init__(self, template_dir: Path | None = None):
        if template_dir is None:
            template_dir = Path(__file__).parent / "templates"

        self.env = Environment(
            loader=FileSystemLoader(template_dir),
            autoescape=True,
        )

    def generate(
        self,
        graph: RecommendationGraph,
        base_url: str,
        output_path: Path,
    ) -> None:
        """Generate HTML file with embedded recommendation data."""
        template = self.env.get_template("swipe.html.jinja2")

        # Prepare data for embedding
        graph_data = graph.to_json_for_embed()
        graph_json = json.dumps(graph_data, indent=2)

        # Sort posts by date for initial list
        sorted_posts = sorted(
            graph.posts.values(),
            key=lambda p: p.metadata.date,
            reverse=True,
        )
        post_slugs = [p.slug for p in sorted_posts]

        html = template.render(
            base_url=base_url.rstrip("/"),
            graph_json=graph_json,
            post_slugs_json=json.dumps(post_slugs),
            post_count=len(sorted_posts),
        )

        output_path.parent.mkdir(parents=True, exist_ok=True)
        output_path.write_text(html)
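
A sketch of the expected call site, assuming a RecommendationGraph has already been built by the crawler and recommender stages (the graph value here is hypothetical):

from pathlib import Path

from tiktokify.generator import HTMLGenerator
from tiktokify.models import RecommendationGraph

def write_site(graph: RecommendationGraph) -> None:
    generator = HTMLGenerator()  # uses the bundled templates/ directory
    generator.generate(
        graph=graph,
        base_url="https://myblog.dev/",  # trailing slash is stripped before rendering
        output_path=Path("output/index.html"),  # parent dirs are created as needed
    )
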
src/tiktokify/generator/templates/swipe.html.jinja2
ADDED
@@ -0,0 +1,1028 @@
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0, maximum-scale=1.0, user-scalable=no">
    <title>TikTokify - Swipe to Discover</title>
    <style>
        * {
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }

        html, body {
            height: 100%;
            overflow: hidden;
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            background: #000;
            color: #fff;
        }

        .swipe-container {
            height: 100vh;
            overflow-y: scroll;
            scroll-snap-type: y mandatory;
            -webkit-overflow-scrolling: touch;
        }

        .card {
            height: 100vh;
            scroll-snap-align: start;
            display: flex;
            flex-direction: column;
            position: relative;
            background-size: cover;
            background-position: center;
        }

        .card-overlay {
            position: absolute;
            inset: 0;
            background: linear-gradient(
                to bottom,
                rgba(0,0,0,0.3) 0%,
                rgba(0,0,0,0.1) 30%,
                rgba(0,0,0,0.6) 60%,
                rgba(0,0,0,0.95) 100%
            );
        }

        .card-content {
            position: relative;
            z-index: 1;
            height: 100%;
            display: flex;
            flex-direction: column;
            justify-content: flex-end;
            padding: 20px;
            padding-bottom: 100px;
            overflow-y: auto;
        }

        .card-type {
            display: inline-flex;
            align-items: center;
            gap: 6px;
            background: rgba(255,255,255,0.15);
            padding: 4px 10px;
            border-radius: 20px;
            font-size: 10px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            margin-bottom: 12px;
            width: fit-content;
        }

        .card-type.wiki {
            background: rgba(77, 163, 255, 0.3);
        }

        .card-meta {
            display: flex;
            flex-wrap: wrap;
            gap: 8px;
            margin-bottom: 12px;
        }

        .card-category {
            background: rgba(255,255,255,0.2);
            padding: 4px 12px;
            border-radius: 20px;
            font-size: 11px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
        }

        .card-title {
            font-size: 24px;
            font-weight: 700;
            margin-bottom: 10px;
            line-height: 1.25;
            text-shadow: 0 2px 4px rgba(0,0,0,0.3);
        }

        .card-subtitle {
            font-size: 14px;
            opacity: 0.85;
            margin-bottom: 12px;
            line-height: 1.5;
        }

        /* Key points */
        .key-points {
            background: rgba(255,255,255,0.08);
            border-radius: 12px;
            padding: 14px;
            margin-bottom: 14px;
        }

        .key-points-title {
            font-size: 11px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            opacity: 0.6;
            margin-bottom: 10px;
        }

        .key-point {
            display: flex;
            gap: 10px;
            margin-bottom: 8px;
            font-size: 13px;
            line-height: 1.5;
        }

        .key-point:last-child {
            margin-bottom: 0;
        }

        .key-point-bullet {
            color: #4da3ff;
            flex-shrink: 0;
        }

        .card-tags {
            display: flex;
            flex-wrap: wrap;
            gap: 6px;
            margin-bottom: 12px;
        }

        .card-tag {
            background: rgba(255,255,255,0.12);
            padding: 4px 10px;
            border-radius: 4px;
            font-size: 11px;
            opacity: 0.9;
        }

        .card-date {
            font-size: 12px;
            opacity: 0.6;
        }

        /* Wikipedia card specific */
        .wiki-card {
            background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
        }

        .wiki-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(15, 52, 96, 0.3) 0%,
                rgba(15, 52, 96, 0.6) 50%,
                rgba(10, 10, 30, 0.95) 100%
            );
        }

        .wiki-excerpt {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.95;
            margin-bottom: 16px;
            max-height: 40vh;
            overflow-y: auto;
        }

        /* Wikipedia card - center content vertically */
        .wiki-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        /* HackerNews card */
        .hn-card {
            background: linear-gradient(135deg, #1a0a00 0%, #ff6600 100%);
        }

        .hn-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(26, 10, 0, 0.4) 0%,
                rgba(26, 10, 0, 0.6) 50%,
                rgba(10, 5, 0, 0.95) 100%
            );
        }

        .hn-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        .card-type.hn {
            background: rgba(255, 102, 0, 0.4);
        }

        .hn-meta {
            display: flex;
            gap: 16px;
            margin-bottom: 16px;
            font-size: 14px;
            opacity: 0.9;
        }

        .hn-meta-item {
            display: flex;
            align-items: center;
            gap: 6px;
        }

        .hn-description {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.9;
            margin-bottom: 16px;
            max-height: 35vh;
            overflow-y: auto;
        }

        .hn-link {
            display: inline-flex;
            align-items: center;
            gap: 8px;
            background: rgba(255, 102, 0, 0.3);
            padding: 10px 16px;
            border-radius: 8px;
            font-size: 13px;
            margin-top: 12px;
            opacity: 0.9;
        }

        .wiki-relevance {
            font-size: 13px;
            line-height: 1.5;
            opacity: 0.7;
            margin-bottom: 12px;
            padding-left: 12px;
            border-left: 2px solid rgba(77, 163, 255, 0.5);
        }

        .wiki-source {
            display: inline-flex;
            align-items: center;
            gap: 6px;
            font-size: 12px;
            opacity: 0.6;
        }

        /* Linked content card (tertiary crawled links) */
        .linked-card {
            background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
        }

        .linked-card .card-overlay {
            background: linear-gradient(
                to bottom,
                rgba(26, 26, 26, 0.3) 0%,
                rgba(26, 26, 26, 0.6) 50%,
                rgba(10, 20, 15, 0.95) 100%
            );
        }

        .linked-card .card-content {
            justify-content: center;
            padding-top: 80px;
        }

        .card-type.linked {
            background: rgba(46, 204, 113, 0.3);
        }

        .linked-description {
            font-size: 15px;
            line-height: 1.7;
            opacity: 0.9;
            margin-bottom: 16px;
            max-height: 35vh;
            overflow-y: auto;
        }

        .rec-item.linked-rec {
            border-left: 3px solid rgba(46, 204, 113, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.linked-thumb {
            background: linear-gradient(135deg, #1a1a1a 0%, #2d4a3e 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        /* Action buttons */
        .card-actions {
            position: fixed;
            right: 12px;
            bottom: 120px;
            display: flex;
            flex-direction: column;
            gap: 14px;
            z-index: 10;
        }

        .action-btn {
            width: 46px;
            height: 46px;
            border-radius: 50%;
            background: rgba(255,255,255,0.15);
            backdrop-filter: blur(10px);
            border: none;
            color: #fff;
            font-size: 18px;
            cursor: pointer;
            display: flex;
            align-items: center;
            justify-content: center;
            transition: transform 0.2s, background 0.2s;
        }

        .action-btn:hover {
            transform: scale(1.1);
            background: rgba(255,255,255,0.25);
        }

        .action-btn:active {
            transform: scale(0.95);
        }

        .read-btn {
            background: rgba(255,71,87,0.8);
        }

        .read-btn:hover {
            background: rgba(255,71,87,1);
        }

        /* Panels */
        .panel {
            position: fixed;
            bottom: 0;
            left: 0;
            right: 0;
            background: rgba(20,20,20,0.98);
            backdrop-filter: blur(20px);
            border-radius: 20px 20px 0 0;
            padding: 20px;
            transform: translateY(100%);
            transition: transform 0.3s ease;
            z-index: 20;
            max-height: 60vh;
            overflow-y: auto;
        }

        .panel.open {
            transform: translateY(0);
        }

        .panel-header {
            display: flex;
            justify-content: space-between;
            align-items: center;
            margin-bottom: 16px;
        }

        .panel h3 {
            font-size: 16px;
            font-weight: 600;
        }

        .panel-close {
            background: none;
            border: none;
            color: #fff;
            font-size: 24px;
            cursor: pointer;
            opacity: 0.6;
        }

        .panel-close:hover {
            opacity: 1;
        }

        /* Recommendation items */
        .rec-item {
            display: flex;
            gap: 14px;
            padding: 12px 0;
            border-bottom: 1px solid rgba(255,255,255,0.08);
            cursor: pointer;
            transition: background 0.2s;
        }

        .rec-item:hover {
            background: rgba(255,255,255,0.05);
        }

        .rec-thumb {
            width: 64px;
            height: 64px;
            border-radius: 8px;
            background-size: cover;
            background-position: center;
            flex-shrink: 0;
            background-color: rgba(255,255,255,0.1);
        }

        .rec-info {
            flex: 1;
            min-width: 0;
        }

        .rec-info h4 {
            font-size: 14px;
            font-weight: 600;
            margin-bottom: 4px;
        }

        .rec-info span {
            font-size: 12px;
            opacity: 0.5;
        }

        .rec-section-title {
            font-size: 12px;
            text-transform: uppercase;
            letter-spacing: 0.5px;
            opacity: 0.5;
            margin: 16px 0 8px 0;
        }

        .rec-section-title:first-child {
            margin-top: 0;
        }

        .rec-item.wiki-rec {
            border-left: 3px solid rgba(77, 163, 255, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.wiki-thumb {
            background: linear-gradient(135deg, #1a1a2e 0%, #0f3460 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        .rec-thumb.wiki-thumb svg {
            opacity: 0.6;
        }

        .rec-item.hn-rec {
            border-left: 3px solid rgba(255, 102, 0, 0.6);
            padding-left: 12px;
        }

        .rec-thumb.hn-thumb {
            background: linear-gradient(135deg, #1a0a00 0%, #3d1a00 100%);
            display: flex;
            align-items: center;
            justify-content: center;
        }

        /* Swipe hint */
        .swipe-hint {
            position: fixed;
            bottom: 20px;
            left: 50%;
            transform: translateX(-50%);
            font-size: 12px;
            opacity: 0.4;
            display: flex;
            flex-direction: column;
            align-items: center;
            gap: 4px;
            animation: fadeOut 3s forwards;
            animation-delay: 2s;
        }

        .swipe-hint-arrow {
            animation: bounce 1.5s infinite;
        }

        @keyframes bounce {
            0%, 100% { transform: translateY(0); }
            50% { transform: translateY(-6px); }
        }

        @keyframes fadeOut {
            to { opacity: 0; pointer-events: none; }
        }

        .empty-state {
            text-align: center;
            padding: 40px 20px;
            opacity: 0.6;
        }

        .panel-overlay {
            position: fixed;
            inset: 0;
            background: rgba(0,0,0,0.5);
            z-index: 15;
            opacity: 0;
            pointer-events: none;
            transition: opacity 0.3s;
        }

        .panel-overlay.visible {
            opacity: 1;
            pointer-events: auto;
        }
    </style>
</head>
<body>
    <div class="swipe-container" id="container"></div>

    <div class="card-actions" id="actions">
        <button class="action-btn" id="recs-btn" title="Similar Posts">
            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                <circle cx="12" cy="12" r="3"/>
                <path d="M12 2v4m0 12v4M2 12h4m12 0h4"/>
            </svg>
        </button>
        <button class="action-btn read-btn" id="read-btn" title="Read Full">
            <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
                <polyline points="15,3 21,3 21,9"/>
                <line x1="10" y1="14" x2="21" y2="3"/>
            </svg>
        </button>
    </div>

    <div class="panel-overlay" id="panel-overlay"></div>

    <div class="panel" id="recs-panel">
        <div class="panel-header">
            <h3>Related Content</h3>
            <button class="panel-close" id="recs-close">×</button>
        </div>
        <div id="recs-content"></div>
    </div>

    <div class="swipe-hint" id="swipe-hint">
        <span class="swipe-hint-arrow">↑</span>
        <span>Swipe up for more</span>
    </div>

    <script>
        // Embedded data
        const GRAPH = {{ graph_json|safe }};
        const POST_SLUGS = {{ post_slugs_json|safe }};
        const BASE_URL = "{{ base_url }}";

        // State
        let currentIndex = 0;
        let feedItems = []; // Mixed array of {type: 'post'|'wiki', data: ...}

        // Shuffle array (Fisher-Yates)
        function shuffle(arr) {
            const result = [...arr];
            for (let i = result.length - 1; i > 0; i--) {
                const j = Math.floor(Math.random() * (i + 1));
                [result[i], result[j]] = [result[j], result[i]];
            }
            return result;
        }

        // Format date
        function formatDate(isoDate) {
            const date = new Date(isoDate);
            return date.toLocaleDateString('en-US', {
                year: 'numeric',
                month: 'long',
                day: 'numeric'
            });
        }

        // Build feed with interleaved external content (Wikipedia, HN, etc.)
        function buildFeed() {
            const shuffledSlugs = shuffle(POST_SLUGS);
            feedItems = [];

            // Collect all Wikipedia suggestions
            const allWiki = [];
            // Collect all external content (HN, etc.)
            const allExternal = [];

            for (const slug of shuffledSlugs) {
                const post = GRAPH.posts[slug];
                if (post.wikipedia) {
                    for (const w of post.wikipedia) {
                        allWiki.push({ ...w, sourcePost: post.title });
                    }
                }
                if (post.externalContent) {
                    for (const e of post.externalContent) {
                        allExternal.push({ ...e, sourcePost: post.title });
                    }
                }
            }

            const shuffledWiki = shuffle(allWiki);
            const shuffledExternal = shuffle(allExternal);

            // Interleave content: wiki every 3 posts, external every 4 posts
            let wikiIndex = 0;
            let extIndex = 0;

            for (let i = 0; i < shuffledSlugs.length; i++) {
                feedItems.push({ type: 'post', slug: shuffledSlugs[i] });

                // Insert Wikipedia after every 3 posts
                if ((i + 1) % 3 === 0 && wikiIndex < shuffledWiki.length) {
                    feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
                    wikiIndex++;
                }

                // Insert external content (HN) after every 4 posts
                if ((i + 1) % 4 === 0 && extIndex < shuffledExternal.length) {
                    feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
                    extIndex++;
                }
            }

            // Add remaining content at the end
            while (wikiIndex < shuffledWiki.length) {
                feedItems.push({ type: 'wiki', data: shuffledWiki[wikiIndex] });
                wikiIndex++;
            }
            while (extIndex < shuffledExternal.length) {
                feedItems.push({ type: 'external', data: shuffledExternal[extIndex] });
                extIndex++;
            }
        }

        // Render a blog post card
        function renderPostCard(slug, idx) {
            const post = GRAPH.posts[slug];
            const bgImg = post.headerImg ? `${BASE_URL}/${post.headerImg}` : '';
            const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #1a1a2e 0%, #16213e 100%)';

            const keyPointsHtml = post.keyPoints && post.keyPoints.length > 0 ? `
                <div class="key-points">
                    <div class="key-points-title">Key Takeaways</div>
                    ${post.keyPoints.slice(0, 4).map(point => `
                        <div class="key-point">
                            <span class="key-point-bullet">•</span>
                            <span>${point}</span>
                        </div>
                    `).join('')}
                </div>
            ` : '';

            return `
                <div class="card" data-type="post" data-slug="${slug}" data-index="${idx}" style="${bgStyle}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type">📝 Blog Post</div>
                        <div class="card-meta">
                            ${post.categories.map(c =>
                                `<span class="card-category">${c}</span>`
                            ).join('')}
                            <span class="card-category">${post.readingTime} min</span>
                        </div>
                        <h1 class="card-title">${post.title}</h1>
                        ${post.subtitle ? `<p class="card-subtitle">${post.subtitle}</p>` : ''}
                        ${keyPointsHtml}
                        <div class="card-tags">
                            ${post.tags.slice(0, 4).map(t =>
                                `<span class="card-tag">#${t}</span>`
                            ).join('')}
                        </div>
                        <div class="card-date">${formatDate(post.date)}</div>
                    </div>
                </div>
            `;
        }

        // Render a Wikipedia card
        function renderWikiCard(wiki, idx) {
            // Prefer extract from Wikipedia API, fall back to LLM-generated relevance
            const excerpt = wiki.extract || wiki.relevance || '';
            return `
                <div class="card wiki-card" data-type="wiki" data-url="${wiki.url}" data-index="${idx}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type wiki">
                            <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                <circle cx="12" cy="12" r="10"/>
                                <path d="M12 16v-4M12 8h.01"/>
                            </svg>
                            Wikipedia
                        </div>
                        <h1 class="card-title">${wiki.title}</h1>
                        <p class="wiki-excerpt">${excerpt}</p>
                        ${wiki.relevance && wiki.extract ? `<p class="wiki-relevance"><em>${wiki.relevance}</em></p>` : ''}
                        <div class="wiki-source">
                            Related to: ${wiki.sourcePost}
                        </div>
                    </div>
                </div>
            `;
        }

        // Render an external content card (HackerNews, linked, etc.)
        function renderExternalCard(ext, idx) {
            const isHN = ext.source === 'hackernews' || ext.source === 'hn-frontpage';
            const isLinked = ext.source === 'linked';

            let cardClass, typeClass, typeName, descClass;

            if (isHN) {
                cardClass = 'hn-card';
                typeClass = 'hn';
                typeName = ext.source === 'hn-frontpage' ? 'HN Front Page' : 'Hacker News';
                descClass = 'hn-description';
            } else if (isLinked) {
                cardClass = 'linked-card';
                typeClass = 'linked';
                typeName = 'Referenced Link';
                descClass = 'linked-description';
            } else {
                cardClass = 'external-card';
                typeClass = '';
                typeName = ext.source;
                descClass = 'hn-description';
            }

            const points = ext.metadata?.points || 0;
            const comments = ext.metadata?.num_comments || 0;
            const storyUrl = ext.metadata?.story_url || '';

            // Icon based on type
            const icon = isHN
                ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="currentColor"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>`
                : isLinked
                    ? `<svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/><path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/></svg>`
                    : '';

            return `
                <div class="card ${cardClass}" data-type="external" data-url="${ext.url}" data-index="${idx}">
                    <div class="card-overlay"></div>
                    <div class="card-content">
                        <div class="card-type ${typeClass}">
                            ${icon}
                            ${typeName}
                        </div>
                        <h1 class="card-title">${ext.title}</h1>
                        ${isHN ? `
                            <div class="hn-meta">
                                <div class="hn-meta-item">
                                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M12 2l3.09 6.26L22 9.27l-5 4.87 1.18 6.88L12 17.77l-6.18 3.25L7 14.14 2 9.27l6.91-1.01L12 2z"/>
                                    </svg>
                                    ${points} points
                                </div>
                                <div class="hn-meta-item">
                                    <svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                        <path d="M21 15a2 2 0 01-2 2H7l-4 4V5a2 2 0 012-2h14a2 2 0 012 2z"/>
                                    </svg>
                                    ${comments} comments
                                </div>
                            </div>
                        ` : ''}
                        ${ext.description ? `<p class="${descClass}">${ext.description}</p>` : ''}
                        ${storyUrl ? `
                            <div class="hn-link">
                                <svg width="14" height="14" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2">
                                    <path d="M18 13v6a2 2 0 01-2 2H5a2 2 0 01-2-2V8a2 2 0 012-2h6"/>
                                    <polyline points="15,3 21,3 21,9"/>
                                    <line x1="10" y1="14" x2="21" y2="3"/>
                                </svg>
                                View original article
                            </div>
                        ` : ''}
                        <div class="wiki-source">
                            ${ext.relevance || `Related to: ${ext.sourcePost}`}
                        </div>
                    </div>
                </div>
            `;
        }

        // Render all slides
        function renderSlides() {
            buildFeed();
            const container = document.getElementById('container');

            container.innerHTML = feedItems.map((item, idx) => {
                if (item.type === 'post') {
                    return renderPostCard(item.slug, idx);
                } else if (item.type === 'wiki') {
                    return renderWikiCard(item.data, idx);
                } else if (item.type === 'external') {
                    return renderExternalCard(item.data, idx);
                }
            }).join('');
        }

        // Track current slide on scroll
        function onScroll() {
            const container = document.getElementById('container');
            const slideHeight = window.innerHeight;
            currentIndex = Math.round(container.scrollTop / slideHeight);

            // Update action buttons visibility based on card type
            const currentItem = feedItems[currentIndex];
            const recsBtn = document.getElementById('recs-btn');
            if (currentItem && (currentItem.type === 'wiki' || currentItem.type === 'external')) {
                recsBtn.style.display = 'none';
            } else {
                recsBtn.style.display = 'flex';
            }
        }

        // Open full content
        function openFullContent() {
            const item = feedItems[currentIndex];
            if (item.type === 'post') {
                const post = GRAPH.posts[item.slug];
                window.open(post.url, '_blank');
            } else if (item.type === 'wiki') {
                window.open(item.data.url, '_blank');
            } else if (item.type === 'external') {
                window.open(item.data.url, '_blank');
            }
        }

        // Close all panels
        function closePanels() {
            document.getElementById('recs-panel').classList.remove('open');
            document.getElementById('panel-overlay').classList.remove('visible');
        }

        // Show recommendations panel
        function showRecsPanel() {
            const item = feedItems[currentIndex];
            if (item.type !== 'post') return;

            const panel = document.getElementById('recs-panel');
            const content = document.getElementById('recs-content');
            const post = GRAPH.posts[item.slug];
            const recs = GRAPH.recommendations[item.slug] || [];
            const wikiSuggestions = post.wikipedia || [];
            const externalContent = post.externalContent || [];

            let html = '';

            // Similar posts section
            if (recs.length > 0) {
                html += '<div class="rec-section-title">Similar Posts</div>';
                html += recs.map(([recSlug, score]) => {
                    const recPost = GRAPH.posts[recSlug];
                    const bgImg = recPost.headerImg ? `${BASE_URL}/${recPost.headerImg}` : '';
                    const bgStyle = bgImg ? `background-image: url('${bgImg}')` : 'background: linear-gradient(135deg, #2d3436 0%, #636e72 100%)';

                    return `
                        <div class="rec-item" data-slug="${recSlug}">
                            <div class="rec-thumb" style="${bgStyle}"></div>
                            <div class="rec-info">
                                <h4>${recPost.title}</h4>
                                <span>${Math.round(score * 100)}% match</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // Wikipedia section
            if (wikiSuggestions.length > 0) {
                html += '<div class="rec-section-title">Wikipedia Articles</div>';
                html += wikiSuggestions.map(wiki => {
                    const excerpt = wiki.extract || wiki.relevance || '';
                    const truncatedExcerpt = excerpt.length > 80 ? excerpt.slice(0, 80) + '...' : excerpt;
                    return `
                        <div class="rec-item wiki-rec" data-url="${wiki.url}">
                            <div class="rec-thumb wiki-thumb">
                                <svg width="24" height="24" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="1.5">
                                    <circle cx="12" cy="12" r="10"/>
                                    <path d="M12 16v-4M12 8h.01"/>
                                </svg>
                            </div>
                            <div class="rec-info">
                                <h4>${wiki.title}</h4>
                                <span>${truncatedExcerpt}</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // External content section (HN, etc.)
            const hnContent = externalContent.filter(e => e.source === 'hackernews' || e.source === 'hn-frontpage');
            if (hnContent.length > 0) {
                html += '<div class="rec-section-title">Hacker News</div>';
                html += hnContent.map(hn => {
                    const points = hn.metadata?.points || 0;
                    const comments = hn.metadata?.num_comments || 0;
                    return `
                        <div class="rec-item hn-rec" data-url="${hn.url}">
                            <div class="rec-thumb hn-thumb">
                                <svg width="20" height="20" viewBox="0 0 24 24" fill="#ff6600"><path d="M0 24V0h24v24H0zM6.951 5.896l4.112 7.708v5.064h1.583v-4.972l4.148-7.799h-1.749l-2.457 4.875c-.372.745-.688 1.434-.688 1.434s-.297-.708-.651-1.434L8.831 5.896h-1.88z"/></svg>
                            </div>
                            <div class="rec-info">
                                <h4>${hn.title}</h4>
                                <span>${points} pts · ${comments} comments</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            // Linked content section (tertiary crawled links)
            const linkedContent = externalContent.filter(e => e.source === 'linked');
            if (linkedContent.length > 0) {
                html += '<div class="rec-section-title">Referenced Links</div>';
                html += linkedContent.map(link => {
                    const truncatedDesc = link.description && link.description.length > 80
                        ? link.description.slice(0, 80) + '...'
                        : (link.description || '');
                    return `
                        <div class="rec-item linked-rec" data-url="${link.url}">
                            <div class="rec-thumb linked-thumb">
                                <svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="#2ecc71" stroke-width="2">
                                    <path d="M10 13a5 5 0 007.54.54l3-3a5 5 0 00-7.07-7.07l-1.72 1.71"/>
                                    <path d="M14 11a5 5 0 00-7.54-.54l-3 3a5 5 0 007.07 7.07l1.71-1.71"/>
                                </svg>
                            </div>
                            <div class="rec-info">
                                <h4>${link.title}</h4>
                                <span>${truncatedDesc}</span>
                            </div>
                        </div>
                    `;
                }).join('');
            }

            if (html) {
                content.innerHTML = html;
            } else {
                content.innerHTML = '<div class="empty-state">No related content found.</div>';
            }

            document.getElementById('panel-overlay').classList.add('visible');
            panel.classList.add('open');
        }

        // Jump to a specific post
        function jumpToPost(slug) {
            const idx = feedItems.findIndex(item => item.type === 'post' && item.slug === slug);
            if (idx >= 0) {
                const container = document.getElementById('container');
                container.scrollTo({
                    top: idx * window.innerHeight,
                    behavior: 'smooth'
                });
                closePanels();
            }
        }

        // Event listeners
        document.getElementById('container').addEventListener('scroll', onScroll);
        document.getElementById('read-btn').addEventListener('click', openFullContent);
        document.getElementById('recs-btn').addEventListener('click', showRecsPanel);
        document.getElementById('recs-close').addEventListener('click', closePanels);
        document.getElementById('panel-overlay').addEventListener('click', closePanels);

        document.getElementById('recs-content').addEventListener('click', (e) => {
            const item = e.target.closest('.rec-item');
            if (item) {
                if (item.dataset.url) {
                    // Wikipedia item - open in new tab
                    window.open(item.dataset.url, '_blank');
                    closePanels();
                } else if (item.dataset.slug) {
                    // Blog post - jump to it
                    jumpToPost(item.dataset.slug);
                }
            }
        });

        // Keyboard navigation
        document.addEventListener('keydown', (e) => {
            const container = document.getElementById('container');
            const slideHeight = window.innerHeight;

            if (e.key === 'ArrowDown' || e.key === 'j') {
                container.scrollBy({ top: slideHeight, behavior: 'smooth' });
            } else if (e.key === 'ArrowUp' || e.key === 'k') {
                container.scrollBy({ top: -slideHeight, behavior: 'smooth' });
            } else if (e.key === 'Enter' || e.key === 'o') {
                openFullContent();
            } else if (e.key === 'Escape') {
                closePanels();
            }
        });

        // Hide swipe hint after first scroll
        document.getElementById('container').addEventListener('scroll', () => {
            document.getElementById('swipe-hint').style.display = 'none';
        }, { once: true });

        // Initialize
        renderSlides();
        onScroll(); // Update button visibility for initial state
    </script>
</body>
</html>
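
The insertion cadence in buildFeed() above (a Wikipedia card after every 3rd post, an external card after every 4th) is easier to see in isolation. A Python sketch of the same interleave, with toy values standing in for the embedded graph data:

def interleave(posts, wiki, external):
    """Mirror of the template's buildFeed() insertion cadence."""
    feed, wi, ei = [], 0, 0
    for i, slug in enumerate(posts):
        feed.append(("post", slug))
        if (i + 1) % 3 == 0 and wi < len(wiki):  # wiki card after every 3 posts
            feed.append(("wiki", wiki[wi]))
            wi += 1
        if (i + 1) % 4 == 0 and ei < len(external):  # external card after every 4 posts
            feed.append(("external", external[ei]))
            ei += 1
    feed += [("wiki", w) for w in wiki[wi:]]  # leftovers go at the end
    feed += [("external", e) for e in external[ei:]]
    return feed

print(interleave(["p1", "p2", "p3", "p4"], ["w1"], ["e1"]))
# [('post', 'p1'), ('post', 'p2'), ('post', 'p3'), ('wiki', 'w1'), ('post', 'p4'), ('external', 'e1')]
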
src/tiktokify/models/__init__.py
ADDED
@@ -0,0 +1,17 @@
"""Data models for tiktokify."""

from .post import (
    ExternalContentItem,
    Post,
    PostMetadata,
    RecommendationGraph,
    WikipediaSuggestion,
)

__all__ = [
    "ExternalContentItem",
    "Post",
    "PostMetadata",
    "RecommendationGraph",
    "WikipediaSuggestion",
]
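
A quick sketch of building these models by hand (all values made up), handy for tests; the fields follow the definitions in post.py below:

from datetime import datetime

from tiktokify.models import Post, PostMetadata

post = Post(
    url="https://myblog.dev/posts/hello-world",
    slug="hello-world",
    metadata=PostMetadata(
        title="Hello World",
        date=datetime(2024, 1, 1),
        tags=["intro"],
    ),
    content_text="Plain text used for TF-IDF similarity.",
    reading_time_minutes=3,
)
print(post.slug, post.metadata.title)
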
src/tiktokify/models/post.py
ADDED
@@ -0,0 +1,116 @@
"""Pydantic models for blog posts and recommendation graph."""

from datetime import datetime
from typing import Optional

from pydantic import BaseModel, Field, HttpUrl


class ExternalContentItem(BaseModel):
    """Generic external content from any source."""

    source: str = Field(description="Source type: 'wikipedia', 'hackernews', 'reddit', etc.")
    title: str
    url: HttpUrl
    description: str = Field(default="", description="Brief description or excerpt")
    relevance: str = Field(default="", description="Why this is relevant to the post")
    metadata: dict = Field(default_factory=dict, description="Source-specific metadata")


class WikipediaSuggestion(BaseModel):
    """A Wikipedia article suggestion for a blog post."""

    title: str
    url: HttpUrl
    relevance: str = Field(description="Brief explanation of why this is relevant")
    extract: str = Field(default="", description="Article summary from Wikipedia API")


class PostMetadata(BaseModel):
    """Metadata extracted from Jekyll post front matter."""

    title: str
    date: datetime
    categories: list[str] = Field(default_factory=list)
    tags: list[str] = Field(default_factory=list)
    subtitle: Optional[str] = None
    header_img: Optional[str] = None
    last_edited_on: Optional[datetime] = None


class Post(BaseModel):
    """Complete representation of a blog post."""

    url: str
    slug: str
    metadata: PostMetadata
    content_text: str = Field(description="Plain text content for TF-IDF")
    content_html: str = Field(default="", description="Full HTML content")
    reading_time_minutes: int = Field(default=1)

    # Populated during enrichment phase
    key_points: list[str] = Field(
        default_factory=list, description="LLM-generated key points/summary"
    )
    similar_posts: list[str] = Field(
        default_factory=list, description="List of similar post slugs"
    )
    similarity_scores: dict[str, float] = Field(
        default_factory=dict, description="slug -> similarity score"
    )
    wikipedia_suggestions: list[WikipediaSuggestion] = Field(default_factory=list)
    external_content: list[ExternalContentItem] = Field(
        default_factory=list, description="Content from external sources (HN, Reddit, etc.)"
    )


class RecommendationGraph(BaseModel):
    """Graph of posts with recommendation adjacency list."""

    posts: dict[str, Post] = Field(description="slug -> Post mapping")
    adjacency: dict[str, list[tuple[str, float]]] = Field(
        default_factory=dict, description="slug -> [(similar_slug, score), ...]"
    )

    def to_json_for_embed(self) -> dict:
        """Serialize for embedding in HTML (minimal, frontend-friendly format)."""
        return {
            "posts": {
                slug: {
                    "title": p.metadata.title,
                    "subtitle": p.metadata.subtitle,
                    "date": p.metadata.date.isoformat(),
                    "categories": p.metadata.categories,
                    "tags": p.metadata.tags,
                    "url": p.url,
                    "headerImg": p.metadata.header_img,
                    "readingTime": p.reading_time_minutes,
                    "keyPoints": p.key_points,
                    "wikipedia": [
                        {
                            "title": w.title,
                            "url": str(w.url),
                            "relevance": w.relevance,
                            "extract": w.extract,
                        }
                        for w in p.wikipedia_suggestions
                    ],
                    "externalContent": [
                        {
                            "source": e.source,
                            "title": e.title,
                            "url": str(e.url),
                            "description": e.description,
                            "relevance": e.relevance,
                            "metadata": e.metadata,
                        }
                        for e in p.external_content
                    ],
                }
                for slug, p in self.posts.items()
            },
            "recommendations": {
                slug: [(s, round(score, 3)) for s, score in recs]
                for slug, recs in self.adjacency.items()
            },
        }
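These are plain Pydantic v2 models, so constructing and serializing them takes a few lines. A minimal sketch with invented field values:

# Invented example data; only title and date are required in PostMetadata.
from datetime import datetime

from tiktokify.models import Post, PostMetadata, RecommendationGraph

post = Post(
    url="https://example.com/blog/hello-world",
    slug="hello-world",
    metadata=PostMetadata(title="Hello World", date=datetime(2024, 1, 1), tags=["intro"]),
    content_text="A first post about the blog itself.",
)
graph = RecommendationGraph(posts={post.slug: post}, adjacency={post.slug: []})
print(graph.to_json_for_embed()["posts"]["hello-world"]["title"])  # Hello World

Note that to_json_for_embed renames fields to camelCase (headerImg, readingTime) so the embedded blob matches the naming the template's JavaScript expects.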
src/tiktokify/recommender/__init__.py
ADDED
@@ -0,0 +1,5 @@
"""Recommendation engine module."""

from .engine import RecommendationEngine

__all__ = ["RecommendationEngine"]
src/tiktokify/recommender/engine.py
ADDED
@@ -0,0 +1,59 @@
"""Combined recommendation engine."""

from tiktokify.models import Post, RecommendationGraph

from .metadata import MetadataSimilarity
from .tfidf import TFIDFSimilarity


class RecommendationEngine:
    """Hybrid recommendation combining content and metadata similarity."""

    def __init__(
        self,
        content_weight: float = 0.6,
        metadata_weight: float = 0.4,
        top_k: int = 5,
    ):
        self.content_weight = content_weight
        self.metadata_weight = metadata_weight
        self.top_k = top_k

        self.tfidf = TFIDFSimilarity()
        self.metadata = MetadataSimilarity()

    def build_graph(self, posts: list[Post]) -> RecommendationGraph:
        """Build complete recommendation graph."""
        # Fit both models
        self.tfidf.fit(posts)
        self.metadata.fit(posts)

        posts_dict = {p.slug: p for p in posts}
        adjacency: dict[str, list[tuple[str, float]]] = {}

        for post in posts:
            # Get similarities from both sources
            content_sims = dict(self.tfidf.get_similar(post.slug, self.top_k * 2))
            metadata_sims = dict(self.metadata.get_similar(post.slug, self.top_k * 2))

            # Combine scores
            all_slugs = set(content_sims.keys()) | set(metadata_sims.keys())
            combined: list[tuple[str, float]] = []

            for slug in all_slugs:
                c_score = content_sims.get(slug, 0)
                m_score = metadata_sims.get(slug, 0)
                combined_score = (
                    self.content_weight * c_score + self.metadata_weight * m_score
                )
                combined.append((slug, combined_score))

            # Sort and take top_k
            combined.sort(key=lambda x: x[1], reverse=True)
            adjacency[post.slug] = combined[: self.top_k]

            # Update post object with recommendations
            post.similar_posts = [s for s, _ in combined[: self.top_k]]
            post.similarity_scores = dict(combined[: self.top_k])

        return RecommendationGraph(posts=posts_dict, adjacency=adjacency)
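With the default weights, a candidate with a TF-IDF cosine of 0.8 and a metadata similarity of 0.5 scores 0.6 * 0.8 + 0.4 * 0.5 = 0.68. Pulling top_k * 2 candidates from each model before merging lets posts that rank highly on only one signal survive the final cut. A self-contained sketch with invented posts:

# All three posts are invented for illustration.
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender import RecommendationEngine

def make_post(slug: str, text: str, tags: list[str]) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1), tags=tags)
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text=text)

posts = [
    make_post("uv-intro", "python packaging with uv and virtual environments", ["python", "tooling"]),
    make_post("pip-tips", "python packaging tips for pip and wheels", ["python", "tooling"]),
    make_post("rust-cli", "building a command line tool in rust", ["rust"]),
]
graph = RecommendationEngine(top_k=2).build_graph(posts)
print(graph.adjacency["uv-intro"])  # pip-tips should outrank rust-cli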
src/tiktokify/recommender/metadata.py
ADDED
@@ -0,0 +1,51 @@
"""Tag and category based similarity."""

from tiktokify.models import Post


class MetadataSimilarity:
    """Tag and category based Jaccard similarity."""

    def __init__(
        self,
        tag_weight: float = 0.7,
        category_weight: float = 0.3,
    ):
        self.tag_weight = tag_weight
        self.category_weight = category_weight
        self.posts: dict[str, Post] = {}

    def fit(self, posts: list[Post]) -> None:
        """Store posts for similarity computation."""
        self.posts = {p.slug: p for p in posts}

    def compute_similarity(self, slug1: str, slug2: str) -> float:
        """Compute Jaccard-like similarity between two posts."""
        p1, p2 = self.posts.get(slug1), self.posts.get(slug2)
        if not p1 or not p2:
            return 0.0

        # Tag similarity (Jaccard index)
        tags1, tags2 = set(p1.metadata.tags), set(p2.metadata.tags)
        tag_union = tags1 | tags2
        tag_sim = len(tags1 & tags2) / len(tag_union) if tag_union else 0

        # Category similarity (exact match)
        cats1, cats2 = set(p1.metadata.categories), set(p2.metadata.categories)
        cat_union = cats1 | cats2
        cat_sim = len(cats1 & cats2) / len(cat_union) if cat_union else 0

        return self.tag_weight * tag_sim + self.category_weight * cat_sim

    def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Get top-k similar posts based on metadata."""
        if slug not in self.posts:
            return []

        scores = [
            (other_slug, self.compute_similarity(slug, other_slug))
            for other_slug in self.posts
            if other_slug != slug
        ]
        scores.sort(key=lambda x: x[1], reverse=True)
        return scores[:top_k]
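Worked by hand with the default weights: tags {python, nlp, tfidf} and {python, nlp, llm} share 2 of 4 distinct tags (Jaccard 0.5), and a matching single category scores 1.0, giving 0.7 * 0.5 + 0.3 * 1.0 = 0.65. The same computation through the class, with invented posts:

# Invented posts; expect roughly 0.65 (floating point).
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender.metadata import MetadataSimilarity

def make_post(slug: str, tags: list[str]) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1), tags=tags, categories=["ml"])
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text="")

sim = MetadataSimilarity()
sim.fit([make_post("a", ["python", "nlp", "tfidf"]), make_post("b", ["python", "nlp", "llm"])])
print(sim.compute_similarity("a", "b"))  # ~0.65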
src/tiktokify/recommender/tfidf.py
ADDED
@@ -0,0 +1,53 @@
"""TF-IDF based content similarity."""

import numpy as np
from scipy.sparse import spmatrix
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

from tiktokify.models import Post


class TFIDFSimilarity:
    """Content-based similarity using TF-IDF."""

    def __init__(
        self,
        max_features: int = 5000,
        ngram_range: tuple[int, int] = (1, 2),
    ):
        self.vectorizer = TfidfVectorizer(
            max_features=max_features,
            ngram_range=ngram_range,
            stop_words="english",
            min_df=1,
            max_df=0.9,
        )
        # fit_transform returns a scipy sparse matrix, not a dense ndarray
        self.tfidf_matrix: spmatrix | None = None
        self.slugs: list[str] = []

    def fit(self, posts: list[Post]) -> None:
        """Fit TF-IDF on post content."""
        self.slugs = [p.slug for p in posts]
        texts = [p.content_text for p in posts]
        self.tfidf_matrix = self.vectorizer.fit_transform(texts)

    def get_similarity_matrix(self) -> np.ndarray:
        """Return the full cosine similarity matrix (recomputed on each call)."""
        if self.tfidf_matrix is None:
            raise ValueError("Must call fit() first")
        return cosine_similarity(self.tfidf_matrix)

    def get_similar(self, slug: str, top_k: int = 5) -> list[tuple[str, float]]:
        """Get top-k similar posts for a given slug."""
        if slug not in self.slugs:
            return []

        idx = self.slugs.index(slug)
        sim_matrix = self.get_similarity_matrix()
        scores = sim_matrix[idx]

        # Skip rank 0: self-similarity is 1.0, so the post itself ranks first
        top_indices = np.argsort(scores)[::-1][1 : top_k + 1]
        return [(self.slugs[i], float(scores[i])) for i in top_indices]
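A quick check of the fit/get_similar flow with invented texts; the two gradient-descent posts should score much closer to each other than to the off-topic one, which lands near 0:

# Invented corpus for a smoke test.
from datetime import datetime

from tiktokify.models import Post, PostMetadata
from tiktokify.recommender.tfidf import TFIDFSimilarity

def make_post(slug: str, text: str) -> Post:
    meta = PostMetadata(title=slug, date=datetime(2024, 1, 1))
    return Post(url=f"https://example.com/{slug}", slug=slug, metadata=meta, content_text=text)

tfidf = TFIDFSimilarity(max_features=100)
tfidf.fit([
    make_post("a", "gradient descent optimizes neural network weights"),
    make_post("b", "stochastic gradient descent for neural networks"),
    make_post("c", "sourdough bread baking and hydration ratios"),
])
print(tfidf.get_similar("a", top_k=2))  # "b" first, "c" near 0.0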
tests/__init__.py
ADDED
@@ -0,0 +1 @@
"""Tests for tiktokify."""