Spaces:

gohilnath2
/

newspaper-api

Sleeping

App Files Files Community

newspaper-api / summarizer.py

gohilnath2

Priority queue, single SambaNova provider, sequential page processing

15f3011 about 2 months ago

raw

history blame contribute delete

11.1 kB

	# =============================================================================
	# 📰 Newspaper Summarizer
	# Scans all pages for headlines using font size, then ranks via LLM.
	# No YOLO, no vision model, no OCR. Fast and standalone.
	#
	# Usage:
	# from summarizer import NewspaperSummarizer
	# summarizer = NewspaperSummarizer(api_key="...")
	# result = summarizer.summarize("newspaper.pdf")
	# =============================================================================

	import json
	import time
	import re
	import fitz
	from openai import OpenAI
	import logging

	from config import (
	LLM_BASE_URL, TEXT_MODEL,
	HEADLINE_MIN_FONT_SIZE, TOP_ARTICLES_COUNT,
	SUMMARY_PROMPT, SECTION_NAMES, SKIP_SECTIONS, SECTION_TIERS,
	)

	# Module-level logger used by NewspaperSummarizer for its info, warning and error messages.
	logger = logging.getLogger("newspaper_summarizer")


	class NewspaperSummarizer:
	    """Scan a newspaper PDF for headlines and produce importance-ranked summaries.

	    Headlines are detected purely from font size (spans at or above
	    ``HEADLINE_MIN_FONT_SIZE``), tagged with the section found in each page
	    header, and then sent to a text LLM that returns the ranking as JSON.
	    """

	    # Two spans belong to the same headline when their font sizes differ by
	    # less than this many points ...
	    _SIZE_TOLERANCE = 2
	    # ... and their baselines are less than this far apart vertically.
	    _LINE_GAP = 20
	    # Headlines shorter than this are treated as page numbers / labels.
	    _MIN_HEADLINE_CHARS = 10
	    # Snippet limits: maximum characters kept and maximum spans inspected.
	    _SNIPPET_CHARS = 200
	    _SNIPPET_SPAN_WINDOW = 15
	    # Fraction of the page height (from the top) searched for a section header.
	    _HEADER_FRACTION = 0.12
	    # Seconds to wait after a rate-limit error that carries no usable hint.
	    _DEFAULT_RETRY_WAIT = 60

	    # Text that is headline-sized but is not an article headline.
	    # Compiled once here instead of on every candidate headline.
	    _SKIP_PATTERNS = tuple(
	        re.compile(pattern, re.IGNORECASE)
	        for pattern in (
	            r"^the\s+hindu$",
	            r"^page\s+\d+",
	            r"^\d+$",
	            r"^www\.",
	            r"^https?://",
	            r"continued\s+on",
	            r"continued\s+from",
	        )
	    )

	    # Matches a duration in seconds such as "20s", "12.5 s" or "3 seconds".
	    # The trailing word boundary keeps it from firing on e.g. "gpt-4 service".
	    _RETRY_WAIT_RE = re.compile(r"(\d+(?:\.\d+)?)\s*(?:seconds?|secs?|s)\b")

	    def __init__(self, api_key):
	        """Create the OpenAI-compatible client pointed at ``LLM_BASE_URL``."""
	        self.llm_client = OpenAI(base_url=LLM_BASE_URL, api_key=api_key)
	        logger.info("✅ Summarizer initialized")

	    def summarize(self, pdf_path):
	        """
	        Full pipeline: detect sections → scan headlines → rank via LLM.

	        Returns: {"important_articles": [...], "total_headlines_found": int, "sections": {...}}
	        When no headline is found the LLM is not called and
	        ``important_articles`` is an empty list.
	        """
	        sections = self._detect_page_sections(pdf_path)
	        logger.info(f"Detected sections: {sections}")

	        headlines = self._scan_headlines(pdf_path, sections)
	        logger.info(f"Scanned {len(headlines)} headlines from PDF")

	        if not headlines:
	            return {"important_articles": [], "total_headlines_found": 0, "sections": sections}

	        ranked = self._rank_headlines(headlines)
	        ranked["total_headlines_found"] = len(headlines)
	        ranked["sections"] = sections
	        return ranked

	    def _detect_page_sections(self, pdf_path):
	        """
	        Detect section names from page headers.

	        For each page, the top strip of the page is searched for any name in
	        ``SECTION_NAMES`` (case-insensitive substring match); when several
	        match, the one set in the largest font wins. A first page without a
	        match is labelled "FRONT PAGE"; other unmatched pages are omitted.

	        Returns dict: {page_num (1-indexed): section_name}
	        """
	        sections = {}
	        section_names_lower = {s.lower(): s for s in SECTION_NAMES}

	        doc = fitz.open(pdf_path)
	        try:
	            for page_num in range(doc.page_count):
	                page = doc[page_num]
	                header_rect = fitz.Rect(
	                    0, 0, page.rect.width, page.rect.height * self._HEADER_FRACTION
	                )
	                blocks = page.get_text("dict", clip=header_rect)["blocks"]

	                best_match = None
	                best_size = 0

	                for block in blocks:
	                    if block.get("type") != 0:  # type 0 = text block
	                        continue
	                    for line in block.get("lines", []):
	                        for span in line.get("spans", []):
	                            text = span.get("text", "").strip().lower()
	                            size = span.get("size", 0)

	                            # Check against known section names
	                            for known_lower, known_original in section_names_lower.items():
	                                if known_lower in text and size > best_size:
	                                    best_match = known_original
	                                    best_size = size

	                if best_match:
	                    sections[page_num + 1] = best_match
	                elif page_num == 0:
	                    sections[1] = "FRONT PAGE"
	        finally:
	            # Always release the document, even if a page fails to parse.
	            doc.close()

	        return sections

	    def _get_tier(self, section_name):
	        """Get tier for a section from ``SECTION_TIERS``. Returns 3 as default.

	        The comparison is case-insensitive. An empty or ``None`` section name
	        yields the default without consulting ``SECTION_TIERS``.
	        """
	        if not section_name:
	            return 3
	        upper = section_name.upper()
	        for tier, names in SECTION_TIERS.items():
	            if any(upper == name.upper() for name in names):
	                return tier
	        return 3

	    @staticmethod
	    def _collect_page_spans(page):
	        """Return every non-empty text span on ``page`` in extraction order.

	        Each entry is a dict with the stripped ``text``, the font ``size`` and
	        baseline ``y`` (both rounded to one decimal) and the span ``flags``.
	        """
	        spans = []
	        for block in page.get_text("dict")["blocks"]:
	            if block.get("type") != 0:  # type 0 = text block
	                continue
	            for line in block.get("lines", []):
	                for span in line.get("spans", []):
	                    text = span.get("text", "").strip()
	                    if not text:
	                        continue
	                    spans.append({
	                        "text": text,
	                        "size": round(span.get("size", 0), 1),
	                        "y": round(span.get("origin", [0, 0])[1], 1),
	                        "flags": span.get("flags", 0),  # bold, italic etc.
	                    })
	        return spans

	    @classmethod
	    def _group_headlines(cls, page_texts, min_font_size):
	        """Group one page's spans into headlines with a body-text snippet.

	        A headline starts at a span whose size is at least ``min_font_size``
	        and absorbs the following spans while each one has a similar font
	        size (relative to the first span) and sits close below the span before
	        it. Very short results and known non-article text are dropped.

	        Returns a list of dicts with keys ``headline``, ``snippet`` and
	        ``font_size`` in page order.
	        """
	        found = []
	        total = len(page_texts)
	        i = 0
	        while i < total:
	            first = page_texts[i]
	            if first["size"] < min_font_size:
	                i += 1
	                continue

	            # Collect consecutive headline-sized spans into one headline.
	            headline_size = first["size"]
	            headline_parts = [first["text"]]
	            j = i + 1
	            while j < total:
	                candidate = page_texts[j]
	                similar_size = abs(candidate["size"] - headline_size) < cls._SIZE_TOLERANCE
	                close_below = abs(candidate["y"] - page_texts[j - 1]["y"]) < cls._LINE_GAP
	                if not (similar_size and close_below):
	                    break
	                headline_parts.append(candidate["text"])
	                j += 1

	            # Whatever happens to this candidate, scanning resumes after it.
	            i = j

	            headline_text = " ".join(headline_parts).strip()

	            # Skip very short headlines (likely page numbers, labels)
	            if len(headline_text) < cls._MIN_HEADLINE_CHARS:
	                continue

	            # Skip common non-article text
	            if any(pattern.search(headline_text) for pattern in cls._SKIP_PATTERNS):
	                continue

	            # Grab snippet: first ~200 chars of body-sized text among the
	            # spans that follow this headline.
	            snippet = ""
	            for k in range(j, min(j + cls._SNIPPET_SPAN_WINDOW, total)):
	                body_span = page_texts[k]
	                if body_span["size"] < min_font_size:
	                    snippet += body_span["text"] + " "
	                if len(snippet) >= cls._SNIPPET_CHARS:
	                    break

	            found.append({
	                "headline": headline_text,
	                "snippet": snippet[:cls._SNIPPET_CHARS].strip(),
	                "font_size": headline_size,
	            })

	        return found

	    def _scan_headlines(self, pdf_path, sections=None):
	        """
	        Extract headlines from all pages using font size detection.

	        ``sections`` maps 1-indexed page numbers to section names; pages
	        missing from it are labelled "UNKNOWN". Pages whose lower-cased
	        section is in ``SKIP_SECTIONS`` are skipped entirely.

	        Returns a list of dicts (``page``, ``headline``, ``snippet``,
	        ``font_size``, ``section``, ``tier``), de-duplicated per page and
	        sorted by font size, largest first.
	        """
	        if sections is None:
	            sections = {}

	        headlines = []
	        doc = fitz.open(pdf_path)
	        try:
	            for page_num in range(doc.page_count):
	                section = sections.get(page_num + 1, "UNKNOWN")

	                # Skip low-value sections entirely
	                if section.lower() in SKIP_SECTIONS:
	                    continue

	                page_texts = self._collect_page_spans(doc[page_num])
	                if not page_texts:
	                    continue

	                tier = self._get_tier(section)
	                for item in self._group_headlines(page_texts, HEADLINE_MIN_FONT_SIZE):
	                    headlines.append({
	                        "page": page_num + 1,
	                        "headline": item["headline"],
	                        "snippet": item["snippet"],
	                        "font_size": item["font_size"],
	                        "section": section,
	                        "tier": tier,
	                    })
	        finally:
	            # Always release the document, even if a page fails to parse.
	            doc.close()

	        # Deduplicate headlines (same text on same page)
	        seen = set()
	        unique = []
	        for h in headlines:
	            key = (h["page"], h["headline"].lower())
	            if key not in seen:
	                seen.add(key)
	                unique.append(h)

	        # Sort by font size descending (bigger headline = more important)
	        unique.sort(key=lambda h: h["font_size"], reverse=True)

	        return unique

	    @staticmethod
	    def _parse_llm_json(raw):
	        """Decode the LLM reply as JSON, tolerating a Markdown code fence.

	        Handles both a multi-line fence (the whole opening line, including any
	        language tag, is dropped) and a single-line fence such as
	        ```` ```{"a": 1}``` ````. Raises ``json.JSONDecodeError`` when the
	        remaining text is not valid JSON.
	        """
	        text = raw.strip()
	        if text.startswith("```"):
	            opening, newline, remainder = text.partition("\n")
	            if newline:
	                text = remainder
	            else:
	                # Everything is on one line: strip the fence and optional tag.
	                text = re.sub(r"^```(?:json)?", "", opening, flags=re.IGNORECASE)
	            text = text.rsplit("```", 1)[0]
	        return json.loads(text)

	    @classmethod
	    def _parse_retry_wait(cls, message, default=None):
	        """Return how many seconds to wait after a rate-limit error.

	        Uses the first duration in seconds found in ``message`` plus a
	        2-second safety margin, or ``default`` (60 when omitted) if the
	        message carries no such hint.
	        """
	        if default is None:
	            default = cls._DEFAULT_RETRY_WAIT
	        match = cls._RETRY_WAIT_RE.search(message)
	        if match is None:
	            return default
	        return float(match.group(1)) + 2

	    def _rank_headlines(self, headlines, max_retries=3):
	        """Send headlines to text LLM for importance ranking and summary.

	        Each headline becomes one prompt line carrying its tier, page and
	        section, followed by the snippet when there is one. The call is
	        retried up to ``max_retries`` times, but only for errors that look
	        like rate limiting; any other API error, and any failure to parse the
	        reply, is logged and re-raised immediately.

	        Returns the decoded JSON reply.
	        Raises ``RuntimeError`` when every attempt was rate limited.
	        """
	        # Build headline list with section and tier context
	        lines = []
	        for h in headlines:
	            tier = h.get("tier", 3)
	            section = h.get("section", "UNKNOWN")
	            line = f"[Tier {tier}] Page {h['page']} — {section}: \"{h['headline']}\""
	            snippet = h.get("snippet")
	            if snippet:
	                line += f" — {snippet}"
	            lines.append(line)

	        prompt = SUMMARY_PROMPT.format(
	            headlines_list="\n".join(lines),
	            count=TOP_ARTICLES_COUNT,
	        )

	        last_error = None
	        for attempt in range(max_retries):
	            # Only the network call is guarded by the rate-limit check, so a
	            # parse error that merely mentions "429" (e.g. a column number)
	            # is never mistaken for throttling.
	            try:
	                response = self.llm_client.chat.completions.create(
	                    model=TEXT_MODEL,
	                    messages=[
	                        {
	                            "role": "system",
	                            "content": "You are a newspaper editor. Respond with valid JSON only, no markdown fences.",
	                        },
	                        {"role": "user", "content": prompt},
	                    ],
	                    temperature=0.1,
	                    max_tokens=4096,
	                )
	            except Exception as e:
	                message = str(e)
	                # Substring heuristic: the client's error text is the only
	                # signal inspected here.
	                if "429" in message or "rate" in message.lower():
	                    last_error = e
	                    # Sleeping is pointless when no attempt is left.
	                    if attempt + 1 < max_retries:
	                        wait = self._parse_retry_wait(message)
	                        logger.warning(f"Rate limited, waiting {wait:.0f}s (attempt {attempt + 1})")
	                        time.sleep(wait)
	                    continue
	                logger.error(f"LLM ranking failed: {e}")
	                raise

	            try:
	                result = self._parse_llm_json(response.choices[0].message.content)
	                ranked_count = len(result.get("important_articles", []))
	            except Exception as e:
	                logger.error(f"LLM ranking failed: {e}")
	                raise

	            logger.info(
	                f"LLM ranked {ranked_count} important articles"
	            )
	            return result

	        raise RuntimeError("Summary LLM failed after retries") from last_error