# daily-snorter/scraper/extract_jokes.py
# (Initial deployment of The Daily Snorter)
"""Extract standalone one-liner jokes from long-form articles.
Two-stage pipeline:
Stage 1: Regex/heuristic filtering (6,500 sentences → ~300 candidates)
Stage 2: Scoring candidates for standalone-ness (requires manual or AI review)
For now, Stage 1 runs automatically. Stage 2 candidates are stored with
content_type='extracted_candidate' for review in the web dashboard.
"""
import hashlib
import logging
import re

from scraper.db import get_db, insert_entry
logger = logging.getLogger("joke-corpus")
# Minimum article length to bother extracting from
MIN_ARTICLE_LENGTH = 500
# Sentence length bounds for one-liners
MIN_JOKE_LENGTH = 40
MAX_JOKE_LENGTH = 300
# Patterns that strongly indicate a standalone joke (Frankie Boyle style)
STRONG_PATTERNS = [
re.compile(r"looks?\s+like\s+", re.IGNORECASE), # "looks like a..."
re.compile(r"look\s+as\s+if\s+", re.IGNORECASE), # "look as if..."
re.compile(r"the\s+face\s+of\s+", re.IGNORECASE), # "the face of someone who..."
re.compile(r"the\s+kind\s+of\s+\w+\s+who", re.IGNORECASE), # "the kind of man who..."
re.compile(r"the\s+sort\s+of\s+\w+\s+who", re.IGNORECASE),
re.compile(r"so\s+\w+\s+that\s+", re.IGNORECASE), # "so X that..."
re.compile(r"like\s+a\s+.+\s+in\s+a\s+", re.IGNORECASE), # "like a X in a Y"
re.compile(r"like\s+watching\s+", re.IGNORECASE), # "like watching..."
re.compile(r"imagine\s+", re.IGNORECASE), # "Imagine..."
]
# Weak signals — help score but don't guarantee standalone-ness
WEAK_PATTERNS = [
re.compile(r"\(.*\)", re.IGNORECASE), # parenthetical aside
re.compile(r"—.+—", re.IGNORECASE), # em-dash aside
re.compile(r"perhaps\s+", re.IGNORECASE), # dry understatement
re.compile(r"presumably\s+", re.IGNORECASE),
re.compile(r"naturally\s+", re.IGNORECASE),
re.compile(r"of\s+course\s+", re.IGNORECASE),
]
# Sentences starting with these are likely context-dependent
SKIP_STARTS = re.compile(
r"^(But |And |Yet |So |Or |However,|Meanwhile,|"
r"He |She |It |They |This |That |These |Those |"
r"His |Her |Its |Their |"
r"The (article|piece|column|book|report|story|film|show) )",
re.IGNORECASE,
)
# Named entities / proper nouns that suggest the subject is identified
HAS_NAMED_SUBJECT = re.compile(
r"^[A-Z][a-z]+ [A-Z]|" # First Last at start
r"^(Boris|Trump|Corbyn|Farage|May|Johnson|Cameron|Starmer|Sunak|"
r"Blair|Thatcher|Obama|Biden|Putin|Musk|Patel|Gove|Rees-Mogg|"
r"Cummings|Hancock|Truss|Sturgeon|Salmond|"
r"Britain|England|Scotland|America|Labour|Tory|Conservative|"
r"The (Queen|King|PM|BBC|NHS|Guardian|Sun|Mail|Times))\b",
)
def _split_sentences(text):
"""Split article text into sentences, handling common abbreviations."""
# Remove markdown headers
text = re.sub(r"^#+\s+.*$", "", text, flags=re.MULTILINE)
# Remove URLs
text = re.sub(r"https?://\S+", "", text)
# Protect common abbreviations
text = text.replace("Mr.", "Mr").replace("Mrs.", "Mrs").replace("Ms.", "Ms")
text = text.replace("Dr.", "Dr").replace("St.", "St").replace("Prof.", "Prof")
text = text.replace("etc.", "etc").replace("eg.", "eg").replace("ie.", "ie")
text = text.replace("...", "…")
# Split on sentence endings
sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z"\'])', text)
return [s.strip() for s in sentences if s.strip()]
def _score_candidate(sentence):
"""Score a sentence for standalone joke potential. Higher = more likely."""
score = 0
# Length sweet spot (80-200 chars is ideal for a one-liner)
length = len(sentence)
if 80 <= length <= 200:
score += 2
elif 60 <= length <= 250:
score += 1
# Strong pattern matches
for pattern in STRONG_PATTERNS:
if pattern.search(sentence):
score += 3
break # Only count once
# Weak pattern matches
for pattern in WEAK_PATTERNS:
if pattern.search(sentence):
score += 1
# Has a named subject (not pronoun-dependent)
if HAS_NAMED_SUBJECT.match(sentence):
score += 2
# Starts with skip words (context-dependent)
if SKIP_STARTS.match(sentence):
score -= 2
# Contains a quote (often setup-punchline)
if '"' in sentence or '\u201c' in sentence:
score += 1
# Ends with a strong punchline indicator
if sentence.rstrip().endswith((".", "…", "?")):
score += 1
return score
def extract_from_article(entry_id, text, source_id, author=None,
topic=None, min_score=3):
"""Extract joke candidates from a long-form article.
Returns list of (sentence, score) tuples that were inserted.
"""
if len(text) < MIN_ARTICLE_LENGTH:
return []
# Get article title for attribution
title = ""
lines = text.split("\n")
for line in lines:
if line.strip().startswith("#"):
title = line.strip().lstrip("#").strip()
break
sentences = _split_sentences(text)
extracted = []
for sentence in sentences:
# Basic length filter
if len(sentence) < MIN_JOKE_LENGTH or len(sentence) > MAX_JOKE_LENGTH:
continue
score = _score_candidate(sentence)
if score >= min_score:
# Build attribution prefix if needed
attributed = sentence
# Store as extracted candidate
platform_id = f"extract-{entry_id}-{hash(sentence) & 0xFFFFFFFF:08x}"
eid = insert_entry(
source_id=source_id,
platform="extracted",
text=attributed,
platform_entry_id=platform_id,
author=author,
content_type="extracted_candidate",
topic=topic,
)
if eid:
extracted.append((sentence, score))
return extracted
def run_extraction(min_score=3):
"""Extract jokes from all long-form articles in the corpus."""
with get_db() as conn:
# Find articles that haven't been extracted yet
articles = conn.execute("""
SELECT e.id, e.text, e.source_id, e.author, e.topic
FROM entries e
WHERE e.content_type = 'article'
AND LENGTH(e.text) > ?
AND e.id NOT IN (
SELECT DISTINCT CAST(
SUBSTR(platform_entry_id, 9,
INSTR(SUBSTR(platform_entry_id, 9), '-') - 1)
AS INTEGER)
FROM entries
WHERE platform = 'extracted'
)
""", (MIN_ARTICLE_LENGTH,)).fetchall()
total_extracted = 0
for article in articles:
extracted = extract_from_article(
entry_id=article["id"],
text=article["text"],
source_id=article["source_id"],
author=article["author"],
topic=article["topic"],
min_score=min_score,
)
if extracted:
logger.info(
f" Article {article['id']} ({article['author']}): "
f"extracted {len(extracted)} candidates"
)
total_extracted += len(extracted)
return total_extracted