| """Extract standalone one-liner jokes from long-form articles. | |
| Two-stage pipeline: | |
| Stage 1: Regex/heuristic filtering (6,500 sentences → ~300 candidates) | |
| Stage 2: Scoring candidates for standalone-ness (requires manual or AI review) | |
| For now, Stage 1 runs automatically. Stage 2 candidates are stored with | |
| content_type='extracted_candidate' for review in the web dashboard. | |
| """ | |

import hashlib
import logging
import re

from scraper.db import get_db, insert_entry

logger = logging.getLogger("joke-corpus")

# Minimum article length to bother extracting from
MIN_ARTICLE_LENGTH = 500

# Sentence length bounds for one-liners
MIN_JOKE_LENGTH = 40
MAX_JOKE_LENGTH = 300

# Patterns that strongly indicate a standalone joke (Frankie Boyle style)
STRONG_PATTERNS = [
    re.compile(r"looks?\s+like\s+", re.IGNORECASE),             # "looks like a..."
    re.compile(r"look\s+as\s+if\s+", re.IGNORECASE),            # "look as if..."
    re.compile(r"the\s+face\s+of\s+", re.IGNORECASE),           # "the face of someone who..."
    re.compile(r"the\s+kind\s+of\s+\w+\s+who", re.IGNORECASE),  # "the kind of man who..."
    re.compile(r"the\s+sort\s+of\s+\w+\s+who", re.IGNORECASE),
    re.compile(r"so\s+\w+\s+that\s+", re.IGNORECASE),           # "so X that..."
    re.compile(r"like\s+a\s+.+\s+in\s+a\s+", re.IGNORECASE),    # "like a X in a Y"
    re.compile(r"like\s+watching\s+", re.IGNORECASE),           # "like watching..."
    re.compile(r"imagine\s+", re.IGNORECASE),                   # "Imagine..."
]
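
# Quick illustration (hypothetical sentence, not drawn from the corpus):
# a single strong match is enough for the +3 bonus in _score_candidate.
#
#   >>> s = "Gove has the face of a man who knows where the bodies are buried."
#   >>> any(p.search(s) for p in STRONG_PATTERNS)
#   True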

# Weak signals — help score but don't guarantee standalone-ness
WEAK_PATTERNS = [
    re.compile(r"\(.*\)"),                       # parenthetical aside
    re.compile(r"—.+—"),                         # em-dash aside
    re.compile(r"perhaps\s+", re.IGNORECASE),    # dry understatement
    re.compile(r"presumably\s+", re.IGNORECASE),
    re.compile(r"naturally\s+", re.IGNORECASE),
    re.compile(r"of\s+course\s+", re.IGNORECASE),
]

# Sentences starting with these are likely context-dependent
SKIP_STARTS = re.compile(
    r"^(But |And |Yet |So |Or |However,|Meanwhile,|"
    r"He |She |It |They |This |That |These |Those |"
    r"His |Her |Its |Their |"
    r"The (article|piece|column|book|report|story|film|show) )",
    re.IGNORECASE,
)
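
# Example (made-up sentences): context-dependent openers are penalised,
# scene-setting ones are not.
#
#   >>> bool(SKIP_STARTS.match("But that was only the beginning."))
#   True
#   >>> bool(SKIP_STARTS.match("Westminster has the air of a hostage video."))
#   False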

# Named entities / proper nouns that suggest the subject is identified
HAS_NAMED_SUBJECT = re.compile(
    r"^[A-Z][a-z]+ [A-Z]|"  # First Last at start
    r"^(Boris|Trump|Corbyn|Farage|May|Johnson|Cameron|Starmer|Sunak|"
    r"Blair|Thatcher|Obama|Biden|Putin|Musk|Patel|Gove|Rees-Mogg|"
    r"Cummings|Hancock|Truss|Sturgeon|Salmond|"
    r"Britain|England|Scotland|America|Labour|Tory|Conservative|"
    r"The (Queen|King|PM|BBC|NHS|Guardian|Sun|Mail|Times))\b",
)
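
# Sanity check (hypothetical examples): a named subject at the start matches,
# a bare pronoun does not.
#
#   >>> bool(HAS_NAMED_SUBJECT.match("Boris Johnson looks like a haystack."))
#   True
#   >>> bool(HAS_NAMED_SUBJECT.match("He looks like a haystack."))
#   False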


def _split_sentences(text):
    """Split article text into sentences, handling common abbreviations."""
    # Remove markdown headers
    text = re.sub(r"^#+\s+.*$", "", text, flags=re.MULTILINE)
    # Remove URLs
    text = re.sub(r"https?://\S+", "", text)
    # Protect common abbreviations so their stops don't end a sentence.
    # Word-bounded, so words that merely end in "eg"/"ie" (e.g. "movie.")
    # keep their full stops.
    text = re.sub(r"\b(Mr|Mrs|Ms|Dr|St|Prof|etc|eg|ie)\.", r"\1", text)
    text = text.replace("e.g.", "eg").replace("i.e.", "ie")
    text = text.replace("...", "…")
    # Split on sentence-ending punctuation followed by a capital or quote
    sentences = re.split(r'(?<=[.!?])\s+(?=[A-Z"\'])', text)
    return [s.strip() for s in sentences if s.strip()]
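

# Doctest-style sketch of the splitter on made-up input:
#
#   >>> _split_sentences('Dr. Smith spoke. "No," she said! Then he left.')
#   ['Dr Smith spoke.', '"No," she said!', 'Then he left.']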


def _score_candidate(sentence):
    """Score a sentence for standalone joke potential. Higher = more likely."""
    score = 0

    # Length sweet spot (80-200 chars is ideal for a one-liner)
    length = len(sentence)
    if 80 <= length <= 200:
        score += 2
    elif 60 <= length <= 250:
        score += 1

    # Strong pattern matches
    for pattern in STRONG_PATTERNS:
        if pattern.search(sentence):
            score += 3
            break  # Only count once

    # Weak pattern matches
    for pattern in WEAK_PATTERNS:
        if pattern.search(sentence):
            score += 1

    # Has a named subject (not pronoun-dependent)
    if HAS_NAMED_SUBJECT.match(sentence):
        score += 2

    # Starts with skip words (context-dependent)
    if SKIP_STARTS.match(sentence):
        score -= 2

    # Contains a quote (often setup-punchline)
    if '"' in sentence or '\u201c' in sentence:
        score += 1

    # Ends with terminal punctuation, i.e. reads as a complete sentence
    if sentence.rstrip().endswith((".", "…", "?")):
        score += 1

    return score
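

# Worked example (hypothetical sentence): length in 80-200 (+2), strong
# "looks like" match (+3), named subject (+2), terminal full stop (+1) => 8.
#
#   >>> _score_candidate("Boris Johnson looks like a scarecrow that has "
#   ...                  "been dressed by a committee of angry wasps.")
#   8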


def extract_from_article(entry_id, text, source_id, author=None,
                         topic=None, min_score=3):
    """Extract joke candidates from a long-form article.

    Returns list of (sentence, score) tuples that were inserted.
    """
    if len(text) < MIN_ARTICLE_LENGTH:
        return []

    sentences = _split_sentences(text)
    extracted = []

    for sentence in sentences:
        # Basic length filter
        if len(sentence) < MIN_JOKE_LENGTH or len(sentence) > MAX_JOKE_LENGTH:
            continue

        score = _score_candidate(sentence)
        if score >= min_score:
            # Store the sentence verbatim; attribution happens at review time
            # in the dashboard. Use a stable content digest for the platform
            # ID -- built-in hash() is salted per process, so it would mint
            # new IDs (and duplicate rows) on every run.
            digest = hashlib.md5(sentence.encode("utf-8")).hexdigest()[:8]
            platform_id = f"extract-{entry_id}-{digest}"
            eid = insert_entry(
                source_id=source_id,
                platform="extracted",
                text=sentence,
                platform_entry_id=platform_id,
                author=author,
                content_type="extracted_candidate",
                topic=topic,
            )
            if eid:
                extracted.append((sentence, score))

    return extracted
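

# Usage sketch (illustrative IDs and author; assumes the scraper DB is
# initialised so insert_entry can write):
#
#   candidates = extract_from_article(
#       entry_id=42, text=article_text, source_id=7, author="Example Author",
#   )
#   for sentence, score in sorted(candidates, key=lambda c: -c[1]):
#       print(score, sentence[:60])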


def run_extraction(min_score=3):
    """Extract jokes from all long-form articles in the corpus."""
    with get_db() as conn:
        # Find articles that haven't been extracted yet. Candidate rows carry
        # platform_entry_id = 'extract-<article_id>-<digest>', so the source
        # article ID is the text between the first and second hyphens
        # (position 9 onwards, 'extract-' being 8 characters).
        articles = conn.execute("""
            SELECT e.id, e.text, e.source_id, e.author, e.topic
            FROM entries e
            WHERE e.content_type = 'article'
              AND LENGTH(e.text) >= ?
              AND e.id NOT IN (
                  SELECT DISTINCT CAST(
                      SUBSTR(platform_entry_id, 9,
                             INSTR(SUBSTR(platform_entry_id, 9), '-') - 1)
                      AS INTEGER)
                  FROM entries
                  WHERE platform = 'extracted'
              )
        """, (MIN_ARTICLE_LENGTH,)).fetchall()

    total_extracted = 0
    for article in articles:
        extracted = extract_from_article(
            entry_id=article["id"],
            text=article["text"],
            source_id=article["source_id"],
            author=article["author"],
            topic=article["topic"],
            min_score=min_score,
        )
        if extracted:
            logger.info(
                f" Article {article['id']} ({article['author']}): "
                f"extracted {len(extracted)} candidates"
            )
            total_extracted += len(extracted)

    return total_extracted
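

if __name__ == "__main__":
    # Minimal CLI sketch (an assumption; the real entry point may live
    # elsewhere in the scraper package).
    logging.basicConfig(level=logging.INFO)
    total = run_extraction()
    logger.info(f"Stage 1 complete: {total} candidates stored for review")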