| """Extract standalone one-liner jokes from long-form articles. | |
| Two-stage pipeline: | |
| Stage 1: Regex/heuristic filtering (6,500 sentences → ~300 candidates) | |
| Stage 2: Scoring candidates for standalone-ness (requires manual or AI review) | |
| For now, Stage 1 runs automatically. Stage 2 candidates are stored with | |
| content_type='extracted_candidate' for review in the web dashboard. | |
| """ | |

import hashlib
import logging
import re

from scraper.db import get_db, insert_entry

logger = logging.getLogger("joke-corpus")

# Minimum article length to bother extracting from
MIN_ARTICLE_LENGTH = 500

# Sentence length bounds for one-liners
MIN_JOKE_LENGTH = 40
MAX_JOKE_LENGTH = 300

# Patterns that strongly indicate a standalone joke (Frankie Boyle style)
STRONG_PATTERNS = [
    re.compile(r"looks?\s+like\s+", re.IGNORECASE),             # "looks like a..."
    re.compile(r"look\s+as\s+if\s+", re.IGNORECASE),            # "look as if..."
    re.compile(r"the\s+face\s+of\s+", re.IGNORECASE),           # "the face of someone who..."
    re.compile(r"the\s+kind\s+of\s+\w+\s+who", re.IGNORECASE),  # "the kind of man who..."
    re.compile(r"the\s+sort\s+of\s+\w+\s+who", re.IGNORECASE),
    re.compile(r"so\s+\w+\s+that\s+", re.IGNORECASE),           # "so X that..."
    re.compile(r"like\s+a\s+.+\s+in\s+a\s+", re.IGNORECASE),    # "like a X in a Y"
    re.compile(r"like\s+watching\s+", re.IGNORECASE),           # "like watching..."
    re.compile(r"imagine\s+", re.IGNORECASE),                   # "Imagine..."
]
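
# Quick illustration (hypothetical sentence, not drawn from the corpus):
# a single strong match is enough for the +3 bonus in _score_candidate.
#
#   >>> s = "Gove has the face of a man who knows where the bodies are buried."
#   >>> any(p.search(s) for p in STRONG_PATTERNS)
#   True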

# Weak signals — help score but don't guarantee standalone-ness
WEAK_PATTERNS = [
    re.compile(r"\(.*\)"),                       # parenthetical aside
    re.compile(r"—.+—"),                         # em-dash aside
    re.compile(r"perhaps\s+", re.IGNORECASE),    # dry understatement
    re.compile(r"presumably\s+", re.IGNORECASE),
    re.compile(r"naturally\s+", re.IGNORECASE),
    re.compile(r"of\s+course\s+", re.IGNORECASE),
]

# Sentences starting with these are likely context-dependent
SKIP_STARTS = re.compile(
    r"^(But |And |Yet |So |Or |However,|Meanwhile,|"
    r"He |She |It |They |This |That |These |Those |"
    r"His |Her |Its |Their |"
    r"The (article|piece|column|book|report|story|film|show) )",
    re.IGNORECASE,
)
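
# Example (made-up sentences): context-dependent openers are penalised,
# scene-setting ones are not.
#
#   >>> bool(SKIP_STARTS.match("But that was only the beginning."))
#   True
#   >>> bool(SKIP_STARTS.match("Westminster has the air of a hostage video."))
#   False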

# Named entities / proper nouns that suggest the subject is identified
HAS_NAMED_SUBJECT = re.compile(
    r"^[A-Z][a-z]+ [A-Z]|"  # First Last at start
    r"^(Boris|Trump|Corbyn|Farage|May|Johnson|Cameron|Starmer|Sunak|"
    r"Blair|Thatcher|Obama|Biden|Putin|Musk|Patel|Gove|Rees-Mogg|"
    r"Cummings|Hancock|Truss|Sturgeon|Salmond|"
    r"Britain|England|Scotland|America|Labour|Tory|Conservative|"
    r"The (Queen|King|PM|BBC|NHS|Guardian|Sun|Mail|Times))\b",
)
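
# Sanity check (hypothetical examples): a named subject at the start matches,
# a bare pronoun does not.
#
#   >>> bool(HAS_NAMED_SUBJECT.match("Boris Johnson looks like a haystack."))
#   True
#   >>> bool(HAS_NAMED_SUBJECT.match("He looks like a haystack."))
#   False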


def _split_sentences(text):
    """Split article text into sentences, handling common abbreviations."""
    # Remove markdown headers
    text = re.sub(r"^#+\s+.*$", "", text, flags=re.MULTILINE)
    # Remove URLs
    text = re.sub(r"https?://\S+", "", text)
    # Protect common abbreviations so their stops don't end a sentence.
    # Word-bounded, so words that merely end in "eg"/"ie" (e.g. "movie.")
    # keep their full stops.
    text = re.sub(r"\b(Mr|Mrs|Ms|Dr|St|Prof|etc|eg|ie)\.", r"\1", text)
    text = text.replace("e.g.", "eg").replace("i.e.", "ie")
    text = text.replace("...", "…")
    # Split on sentence-ending punctuation followed by a capital or quote
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z"\'])', text)
    return [s.strip() for s in sentences if s.strip()]
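

# Doctest-style sketch of the splitter on made-up input:
#
#   >>> _split_sentences('Dr. Smith spoke. "No," she said! Then he left.')
#   ['Dr Smith spoke.', '"No," she said!', 'Then he left.']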


def _score_candidate(sentence):
    """Score a sentence for standalone joke potential. Higher = more likely."""
    score = 0

    # Length sweet spot (80-200 chars is ideal for a one-liner)
    length = len(sentence)
    if 80 <= length <= 200:
        score += 2
    elif 60 <= length <= 250:
        score += 1

    # Strong pattern matches
    for pattern in STRONG_PATTERNS:
        if pattern.search(sentence):
            score += 3
            break  # Only count once

    # Weak pattern matches
    for pattern in WEAK_PATTERNS:
        if pattern.search(sentence):
            score += 1

    # Has a named subject (not pronoun-dependent)
    if HAS_NAMED_SUBJECT.match(sentence):
        score += 2

    # Starts with skip words (context-dependent)
    if SKIP_STARTS.match(sentence):
        score -= 2

    # Contains a quote (often setup-punchline)
    if '"' in sentence or '\u201c' in sentence:
        score += 1

    # Ends with terminal punctuation, i.e. reads as a complete sentence
    if sentence.rstrip().endswith((".", "…", "?")):
        score += 1

    return score
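

# Worked example (hypothetical sentence): length in 80-200 (+2), strong
# "looks like" match (+3), named subject (+2), terminal full stop (+1) => 8.
#
#   >>> _score_candidate("Boris Johnson looks like a scarecrow that has "
#   ...                  "been dressed by a committee of angry wasps.")
#   8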


def extract_from_article(entry_id, text, source_id, author=None,
                         topic=None, min_score=3):
    """Extract joke candidates from a long-form article.

    Returns list of (sentence, score) tuples that were inserted.
    """
    if len(text) < MIN_ARTICLE_LENGTH:
        return []

    sentences = _split_sentences(text)
    extracted = []

    for sentence in sentences:
        # Basic length filter
        if len(sentence) < MIN_JOKE_LENGTH or len(sentence) > MAX_JOKE_LENGTH:
            continue

        score = _score_candidate(sentence)
        if score >= min_score:
            # Store the sentence verbatim; attribution happens at review time
            # in the dashboard. Use a stable content digest for the platform
            # ID -- built-in hash() is salted per process, so it would mint
            # new IDs (and duplicate rows) on every run.
            digest = hashlib.md5(sentence.encode("utf-8")).hexdigest()[:8]
            platform_id = f"extract-{entry_id}-{digest}"
            eid = insert_entry(
                source_id=source_id,
                platform="extracted",
                text=sentence,
                platform_entry_id=platform_id,
                author=author,
                content_type="extracted_candidate",
                topic=topic,
            )
            if eid:
                extracted.append((sentence, score))

    return extracted
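

# Usage sketch (illustrative IDs and author; assumes the scraper DB is
# initialised so insert_entry can write):
#
#   candidates = extract_from_article(
#       entry_id=42, text=article_text, source_id=7, author="Example Author",
#   )
#   for sentence, score in sorted(candidates, key=lambda c: -c[1]):
#       print(score, sentence[:60])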


def run_extraction(min_score=3):
    """Extract jokes from all long-form articles in the corpus."""
    with get_db() as conn:
        # Find articles that haven't been extracted yet. Candidate rows carry
        # platform_entry_id = 'extract-<article_id>-<digest>', so the source
        # article ID is the text between the first and second hyphens
        # (position 9 onwards, 'extract-' being 8 characters).
        articles = conn.execute("""
            SELECT e.id, e.text, e.source_id, e.author, e.topic
            FROM entries e
            WHERE e.content_type = 'article'
              AND LENGTH(e.text) >= ?
              AND e.id NOT IN (
                  SELECT DISTINCT CAST(
                      SUBSTR(platform_entry_id, 9,
                             INSTR(SUBSTR(platform_entry_id, 9), '-') - 1)
                      AS INTEGER)
                  FROM entries
                  WHERE platform = 'extracted'
              )
        """, (MIN_ARTICLE_LENGTH,)).fetchall()

    total_extracted = 0
    for article in articles:
        extracted = extract_from_article(
            entry_id=article["id"],
            text=article["text"],
            source_id=article["source_id"],
            author=article["author"],
            topic=article["topic"],
            min_score=min_score,
        )
        if extracted:
            logger.info(
                f" Article {article['id']} ({article['author']}): "
                f"extracted {len(extracted)} candidates"
            )
            total_extracted += len(extracted)

    return total_extracted
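

if __name__ == "__main__":
    # Minimal CLI sketch (an assumption; the real entry point may live
    # elsewhere in the scraper package).
    logging.basicConfig(level=logging.INFO)
    total = run_extraction()
    logger.info(f"Stage 1 complete: {total} candidates stored for review")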