SummarizerApp / app /api /v4 /schemas.py
ming
Migrate to Ruff for linting/formatting and add comprehensive import tests
29ed661
"""
Request and response schemas for V4 structured summarization API.
"""
import re
from enum import Enum
from pydantic import BaseModel, Field, field_validator, model_validator
class SummarizationStyle(str, Enum):
"""Available summarization styles."""
SKIMMER = "skimmer" # Brief, fact-focused
EXECUTIVE = "executive" # Business-focused, strategic
ELI5 = "eli5" # Simple, easy-to-understand
class Sentiment(str, Enum):
"""Sentiment classification."""
POSITIVE = "positive"
NEGATIVE = "negative"
NEUTRAL = "neutral"
class StructuredSummaryRequest(BaseModel):
"""Request schema for V4 structured summarization."""
url: str | None = Field(
None,
description="URL of article to scrape and summarize",
example="https://example.com/article",
)
text: str | None = Field(
None,
description="Direct text to summarize (alternative to URL)",
example="Your article text here...",
)
style: SummarizationStyle = Field(
default=SummarizationStyle.EXECUTIVE,
description="Summarization style to apply",
)
max_tokens: int | None = Field(
default=1024, ge=128, le=2048, description="Maximum tokens to generate"
)
include_metadata: bool | None = Field(
default=True, description="Include scraping metadata in first SSE event"
)
use_cache: bool | None = Field(
default=True, description="Use cached content if available (URL mode only)"
)
@model_validator(mode="after")
def check_url_or_text(self):
"""Ensure exactly one of url or text is provided."""
if not self.url and not self.text:
raise ValueError('Either "url" or "text" must be provided')
if self.url and self.text:
raise ValueError('Provide either "url" OR "text", not both')
return self
@field_validator("url")
@classmethod
def validate_url(cls, v: str | None) -> str | None:
"""Validate URL format and security."""
if v is None:
return v
# Basic URL pattern validation
url_pattern = re.compile(
r"^https?://" # http:// or https://
r"(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|" # domain
r"localhost|" # localhost
r"\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})" # or IP
r"(?::\d+)?" # optional port
r"(?:/?|[/?]\S+)$",
re.IGNORECASE,
)
if not url_pattern.match(v):
raise ValueError("Invalid URL format. Must start with http:// or https://")
# SSRF protection - block localhost and private IPs
v_lower = v.lower()
if "localhost" in v_lower or "127.0.0.1" in v_lower:
raise ValueError("Cannot scrape localhost URLs")
# Block common private IP ranges
from urllib.parse import urlparse
parsed = urlparse(v)
hostname = parsed.hostname
# Check for private IP ranges
if hostname and (
hostname.startswith("10.")
or hostname.startswith("192.168.")
or hostname.startswith("172.16.")
or hostname.startswith("172.17.")
or hostname.startswith("172.18.")
or hostname.startswith("172.19.")
or hostname.startswith("172.20.")
or hostname.startswith("172.21.")
or hostname.startswith("172.22.")
or hostname.startswith("172.23.")
or hostname.startswith("172.24.")
or hostname.startswith("172.25.")
or hostname.startswith("172.26.")
or hostname.startswith("172.27.")
or hostname.startswith("172.28.")
or hostname.startswith("172.29.")
or hostname.startswith("172.30.")
or hostname.startswith("172.31.")
):
raise ValueError("Cannot scrape private IP addresses")
# Block file:// and other dangerous schemes
if not v.startswith(("http://", "https://")):
raise ValueError("Only HTTP and HTTPS URLs are allowed")
# Limit URL length
if len(v) > 2000:
raise ValueError("URL too long (maximum 2000 characters)")
return v
@field_validator("text")
@classmethod
def validate_text(cls, v: str | None) -> str | None:
"""Validate text content if provided."""
if v is None:
return v
if len(v) < 50:
raise ValueError("Text too short (minimum 50 characters)")
if len(v) > 50000:
raise ValueError("Text too long (maximum 50,000 characters)")
# Check for mostly whitespace
non_whitespace = len(v.replace(" ", "").replace("\n", "").replace("\t", ""))
if non_whitespace < 30:
raise ValueError("Text contains mostly whitespace")
return v
class StructuredSummary(BaseModel):
"""Structured summary output schema (for documentation and validation)."""
title: str = Field(..., description="A click-worthy, engaging title")
main_summary: str = Field(..., description="The main summary content")
key_points: list[str] = Field(..., description="List of 3-5 distinct key facts")
category: str = Field(
..., description="Topic category (e.g., Tech, Politics, Health)"
)
sentiment: Sentiment = Field(..., description="Overall sentiment of the article")
read_time_min: int = Field(
..., description="Estimated minutes to read the original article", ge=1
)