"""
Knowledge Universe API — Pydantic Models

Full multi-format taxonomy: 50+ source types, no API keys required.
Rick's architecture: every format that exists on the internet.
"""
from pydantic import BaseModel, Field, validator
from typing import List, Optional, Dict, Any
from enum import Enum
from datetime import datetime

# NOTE(review): this module mixes pydantic v1 validator style
# (`@validator(..., pre=True)`) with the v2 config key `populate_by_name`
# (v1 spells it `allow_population_by_field_name`). Confirm the installed
# pydantic version and align — on v1 the alias population flag is a no-op.


# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================

class SourceFormat(str, Enum):
    """
    Complete taxonomy of discoverable content types.

    Maps to crawlers, scoring logic, and UI rendering.
    All lowercase string values so JSON round-trips safely
    (str-Enum members compare equal to their plain-string values).
    """

    # ---------- TEXTUAL (8) ----------
    pdf = "pdf"                    # Academic papers, textbooks (arXiv, LibGen)
    markdown = "markdown"          # GitHub READMEs, wikis, GitBook
    html = "html"                  # Web articles, tutorials, MIT OCW
    text = "text"                  # Raw plain-text documents
    epub = "epub"                  # E-books (Open Library, Standard Ebooks)
    docx = "docx"                  # Word documents
    latex = "latex"                # LaTeX source (arXiv raw)
    xml = "xml"                    # Structured documents, OAI-PMH feeds

    # ---------- VISUAL (6) ----------
    image = "image"                # PNG, JPG, WebP (Wikimedia Commons, Unsplash)
    image_set = "image_set"        # Slide decks, image collections
    svg = "svg"                    # Vector diagrams (ObservableHQ, D3 gallery)
    infographic = "infographic"    # Data visualizations, posters
    diagram = "diagram"            # Flowcharts, UML, network diagrams
    screenshot = "screenshot"      # Code screenshots, UI mockups

    # ---------- AUDIO (3) ----------
    audio = "audio"                # MP3, WAV, M4A — podcast lectures
    transcript = "transcript"      # Timestamped transcriptions (YouTube captions)
    podcast = "podcast"            # Podcast RSS feed metadata (Podcast Index)

    # ---------- VIDEO (4) ----------
    video = "video"                # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live = "live"                  # Live streams, webinars
    animation = "animation"        # GIF, animated SVG, Manim exports

    # ---------- INTERACTIVE (7) ----------
    jupyter = "jupyter"            # .ipynb (Kaggle, GitHub, Binder)
    colab = "colab"                # Google Colab notebooks
    rmarkdown = "rmarkdown"        # R Markdown / Quarto notebooks
    sandbox = "sandbox"            # CodePen, Replit, JSFiddle
    simulation = "simulation"      # Physics / ML simulations
    quiz = "quiz"                  # Interactive quizzes
    game = "game"                  # Educational games

    # ---------- AR / VR (2) ----------
    ar = "ar"                      # Augmented reality experiences
    vr = "vr"                      # VR simulations

    # ---------- CODE & PLATFORM (16) ----------
    github = "github"              # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # Issues, Discussions, Q&A
    gist = "gist"                  # GitHub Gists (code snippets)
    kaggle = "kaggle"              # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow"  # Stack Exchange Q&A
    # ── Block 2 new platforms ──────────────────────────────────
    documentation = "documentation"
    paperswithcode = "paperswithcode"
    semantic_scholar = "semantic_scholar"
    distill = "distill"
    observablehq = "observablehq"
    sketchfab = "sketchfab"
    freesound = "freesound"
    wolfram = "wolfram"
    api_docs = "api_docs"          # API references (Read the Docs)
    dataset = "dataset"            # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo = "repo"                  # Generic code repository

    # ---------- 3D & SPATIAL (5) ----------
    # Member names differ from values here: Python identifiers cannot
    # start with a digit, so "3d_model" becomes `model_3d`.
    model_3d = "3d_model"          # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud = "pointcloud"      # LiDAR, 3D scans (.las, .pcd)
    cad = "cad"                    # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric = "volumetric"      # Medical imaging (MRI, CT — TCIA)

    # ---------- GRAPH & VISUALIZATION (4) ----------
    graph = "graph"                # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network = "network"            # Force-directed graphs
    geo_map = "map"                # Geographic visualizations (GeoJSON)

    # ---------- DATA & TIME-SERIES (3) ----------
    timeseries = "timeseries"      # Stock data, sensor logs
    dashboard = "dashboard"        # Tableau, Looker, Metabase dashboards
    dataframe = "dataframe"        # Pandas/Polars dataframe snapshots

    # ---------- ASSESSMENT (4) ----------
    flashcards = "flashcards"      # Spaced repetition (Anki decks)
    problem_set = "problem_set"    # Homework problems + solutions
    lab = "lab"                    # Hands-on lab exercises
    exam = "exam"                  # Practice / certification exams


# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================
# NOTE(review): the Block-2 platform formats (documentation,
# paperswithcode, semantic_scholar, distill, observablehq, sketchfab,
# freesound, wolfram) belong to no group and therefore get no group-based
# routing — confirm whether they should join "code" or a new "platform"
# group before relying on FORMAT_GROUPS for coverage.

FORMAT_GROUPS = {
    "textual": [SourceFormat.pdf, SourceFormat.markdown, SourceFormat.html,
                SourceFormat.text, SourceFormat.epub, SourceFormat.latex,
                SourceFormat.xml, SourceFormat.docx],
    "visual": [SourceFormat.image, SourceFormat.image_set, SourceFormat.svg,
               SourceFormat.infographic, SourceFormat.diagram,
               SourceFormat.screenshot],
    "audio": [SourceFormat.audio, SourceFormat.transcript,
              SourceFormat.podcast],
    "video": [SourceFormat.video, SourceFormat.video_playlist,
              SourceFormat.live, SourceFormat.animation],
    "interactive": [SourceFormat.jupyter, SourceFormat.colab,
                    SourceFormat.rmarkdown, SourceFormat.sandbox,
                    SourceFormat.simulation, SourceFormat.quiz,
                    SourceFormat.game],
    "code": [SourceFormat.github, SourceFormat.github_discussion,
             SourceFormat.gist, SourceFormat.kaggle,
             SourceFormat.stackoverflow, SourceFormat.api_docs,
             SourceFormat.dataset, SourceFormat.repo],
    "spatial": [SourceFormat.model_3d, SourceFormat.model_3d_interactive,
                SourceFormat.pointcloud, SourceFormat.cad,
                SourceFormat.volumetric],
    "graph": [SourceFormat.graph, SourceFormat.knowledge_graph,
              SourceFormat.network, SourceFormat.geo_map],
    "data": [SourceFormat.timeseries, SourceFormat.dashboard,
             SourceFormat.dataframe],
    "assessment": [SourceFormat.flashcards, SourceFormat.problem_set,
                   SourceFormat.lab, SourceFormat.exam],
}


# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — no API keys)
# ============================================================

CRAWLER_FORMAT_MAP = {
    # bulk / protocol access — zero API key
    "common_crawl": [SourceFormat.html, SourceFormat.pdf,
                     SourceFormat.markdown],
    "arxiv": [SourceFormat.pdf, SourceFormat.latex, SourceFormat.html],
    "gharchive": [SourceFormat.github, SourceFormat.markdown,
                  SourceFormat.jupyter],
    "stackoverflow": [SourceFormat.stackoverflow, SourceFormat.html],
    "wikipedia": [SourceFormat.html, SourceFormat.markdown,
                  SourceFormat.image, SourceFormat.knowledge_graph],
    "openlibrary": [SourceFormat.epub, SourceFormat.pdf, SourceFormat.html],
    "libgen": [SourceFormat.pdf, SourceFormat.epub, SourceFormat.docx],
    "mit_ocw": [SourceFormat.html, SourceFormat.pdf, SourceFormat.video,
                SourceFormat.problem_set],
    "huggingface": [SourceFormat.dataset, SourceFormat.jupyter,
                    SourceFormat.markdown, SourceFormat.dataframe],
    "podcast": [SourceFormat.podcast, SourceFormat.audio,
                SourceFormat.transcript],
    # API-key crawlers (existing)
    "github": [SourceFormat.github, SourceFormat.markdown,
               SourceFormat.jupyter, SourceFormat.dataset,
               SourceFormat.gist, SourceFormat.repo],
    "kaggle": [SourceFormat.kaggle, SourceFormat.jupyter,
               SourceFormat.dataset],
    "youtube": [SourceFormat.video, SourceFormat.video_playlist,
                SourceFormat.transcript],
    # Block 2 platforms
    "paperswithcode": [SourceFormat.pdf, SourceFormat.github],
    "documentation": [SourceFormat.html],
    "semantic_scholar": [SourceFormat.pdf, SourceFormat.html],
    "distill": [SourceFormat.html, SourceFormat.simulation],
    "observablehq": [SourceFormat.sandbox, SourceFormat.html],
    "sketchfab": [SourceFormat.model_3d, SourceFormat.model_3d_interactive],
    "freesound": [SourceFormat.audio],
    "wolfram": [SourceFormat.simulation, SourceFormat.sandbox],
}


# ============================================================
# OTHER ENUMS
# ============================================================

class LearningStyle(str, Enum):
    """Learner modality preference used to bias format selection."""
    visual = "visual"
    textual = "textual"
    kinesthetic = "kinesthetic"
    auditory = "auditory"


class OutputFormat(str, Enum):
    """Serialization formats the API can emit for a discovery response."""
    json = "json"
    html = "html"
    embeddings = "embeddings"
    pdf = "pdf"
    streaming = "streaming"


# ============================================================
# REQUEST MODELS
# ============================================================

class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery."""

    topic: str = Field(..., min_length=2, max_length=200)
    difficulty: int = Field(..., ge=1, le=5)
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [SourceFormat.pdf, SourceFormat.video,
                                 SourceFormat.github, SourceFormat.jupyter]
    )
    prerequisites: List[str] = Field(default_factory=list)
    learning_style: Optional[LearningStyle] = None
    language: str = Field(default="en")
    max_results: int = Field(default=10, ge=1, le=50)

    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )

    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)

    min_freshness: Optional[float] = Field(
        default=None, ge=0.0, le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )

    @validator("topic")
    def normalize_topic(cls, v):
        """Trim surrounding whitespace; reject whitespace-only topics.

        Runs after min_length, so "  " (2 chars) still gets caught here.
        """
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        return v

    @validator("output", pre=True)
    def validate_output(cls, v):
        """Reject invalid output formats with a human-readable message.

        Intercepts BEFORE enum coercion (pre=True) so clients get a 422
        listing the valid values and an example payload, instead of
        pydantic's terser default enum error.
        """
        valid = [e.value for e in OutputFormat]
        # str-Enum members compare equal to their values, so `in` also
        # accepts OutputFormat instances passed directly from Python.
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v


# ============================================================
# SOURCE MODELS
# ============================================================

class SourceLink(BaseModel):
    """One retrievable artifact (URL) belonging to a discovered source."""
    type: str
    url: str
    format: SourceFormat
    size_bytes: Optional[int] = None
    # direct | git_clone | streaming | torrent | api
    access_method: str = "direct"


class Source(BaseModel):
    """Discovered source with full metadata."""

    id: str
    title: str
    authors: List[str] = Field(default_factory=list)
    quality_score: float = Field(..., ge=0, le=10)
    pedagogical_fit: float = Field(..., ge=0, le=1)
    difficulty: int = Field(..., ge=1, le=5)
    links: List[SourceLink]
    formats: List[SourceFormat]
    retraction_status: Optional[Dict[str, Any]] = None
    related_sources: Optional[Dict[str, Any]] = None
    summary: str = ""
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = "en"

    # Quality & Freshness signals (NEW: Decay Report)
    citation_count: int = 0
    peer_reviewed: bool = False
    open_access: bool = True
    decay_report: Optional[Dict[str, Any]] = None
    # Accepts the wire name "_ranking_signals" via alias (see Config below).
    ranking_signals: Optional[Dict[str, Any]] = Field(
        default=None, alias="_ranking_signals"
    )
    publication_date: Optional[datetime] = None
    last_updated: Optional[datetime] = None

    # Engagement metrics
    views: Optional[int] = None
    likes: Optional[int] = None
    rating: Optional[float] = Field(None, ge=0, le=5)
    stars: Optional[int] = None
    forks: Optional[int] = None
    downloads: Optional[int] = None

    # Platform
    source_platform: str
    thumbnail_url: Optional[str] = None
    url: str

    # Format-specific metadata
    duration_seconds: Optional[int] = None   # video / audio
    file_size_bytes: Optional[int] = None    # pdf / epub / dataset
    page_count: Optional[int] = None         # pdf / epub
    kernel_type: Optional[str] = None        # jupyter (Python / R / Julia)
    dataset_rows: Optional[int] = None       # dataset
    dataset_cols: Optional[int] = None       # dataset
    license: Optional[str] = None            # dataset / repo / epub

    class Config:
        # Allow constructing with either field name (ranking_signals)
        # or alias (_ranking_signals). Pydantic-v2 spelling — see the
        # module-level NOTE about the v1/v2 API mix.
        populate_by_name = True


# ============================================================
# RESPONSE MODELS
# ============================================================

class DiscoveryResponse(BaseModel):
    """Top-level response for a discovery query, with paging metadata."""

    query: str
    domain: Optional[str] = None
    total_found: int
    sources: List[Source]
    formats_found: Dict[str, int] = Field(default_factory=dict)
    cache_hit: bool
    processing_time_ms: float
    page: int = 1
    total_pages: int = 1

    # --- ENTERPRISE TEMPORAL RISK FIELDS (v2.2) ---
    # First-class fields for enterprise clients — no adapter derivation
    # needed. Dwayne's pattern:
    #   temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None, ge=0.0, le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        )
    )
    avg_decay_score: Optional[float] = Field(
        default=None, ge=0.0, le=1.0,
        description=("Mean decay score across all sources. "
                     "Use for trend monitoring.")
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the "
            "stale tail source."
        )
    )
    stale_count: int = Field(
        default=0,
        description=("Number of sources labeled 'stale' or 'decayed' "
                     "in this response.")
    )


class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for AI system consumption.

    This is the v1 contract output for serious clients.
    """

    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat

    # Scores
    quality_score: float
    pedagogical_fit: float
    freshness_score: float

    # Provenance
    authors: List[str]
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool

    # Embedding (populated when output=embeddings)
    embedding: Optional[List[float]] = None

    # Raw content for downstream use
    summary: str
    tags: List[str]


class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.

    Matches the locked contract from the planning doc.
    """

    knowledge_objects: List[KnowledgeObject]
    scores: Dict[str, float]          # aggregate scoring breakdown
    source_links: List[str]           # all URLs in order
    last_updated: datetime
    confidence_level: str             # "high" | "medium" | "low"
    domain: Optional[str]
    formats_distribution: Dict[str, int]


class CacheStats(BaseModel):
    """Cache health snapshot (keys, memory, hit rate, TTL spread)."""
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float
    hit_rate: float = Field(..., ge=0, le=1)
    hits: int
    misses: int
    evictions: int
    ttl_distribution: Dict[str, int] = Field(default_factory=dict)


class ErrorResponse(BaseModel):
    """Standard error envelope for API failures."""
    error: str
    message: str
    details: Optional[Dict[str, Any]] = None
    # TODO(review): datetime.utcnow is deprecated (3.12+) and produces a
    # naive datetime; migrating to datetime.now(timezone.utc) changes the
    # serialized value (adds offset) — coordinate with clients first.
    timestamp: datetime = Field(default_factory=datetime.utcnow)