# Knowledge-Universe / src/api/models.py
# Author: vlsiddarth
# feat: max_decay_detected as first-class field in /v1/discover (v2.2)
# Commit: e5cdd9c
"""
Knowledge Universe API — Pydantic Models
Full multi-format taxonomy: 50+ source types, no API keys required
Rick's architecture: every format that exists on the internet
"""
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, validator
# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================
class SourceFormat(str, Enum):
    """
    Complete taxonomy of discoverable content types.
    Maps to crawlers, scoring logic, and UI rendering.
    All lowercase strings so JSON round-trips safely.

    NOTE: for the 3D formats the member NAME differs from its VALUE
    (e.g. ``model_3d`` -> "3d_model") because a Python identifier cannot
    start with a digit; JSON payloads always carry the string value.
    """
    # ---------- TEXTUAL (8) ----------
    pdf = "pdf"                        # Academic papers, textbooks (arXiv, LibGen)
    markdown = "markdown"              # GitHub READMEs, wikis, GitBook
    html = "html"                      # Web articles, tutorials, MIT OCW
    text = "text"                      # Raw plain-text documents
    epub = "epub"                      # E-books (Open Library, Standard Ebooks)
    docx = "docx"                      # Word documents
    latex = "latex"                    # LaTeX source (arXiv raw)
    xml = "xml"                        # Structured documents, OAI-PMH feeds
    # ---------- VISUAL (6) ----------
    image = "image"                    # PNG, JPG, WebP (Wikimedia Commons, Unsplash)
    image_set = "image_set"            # Slide decks, image collections
    svg = "svg"                        # Vector diagrams (ObservableHQ, D3 gallery)
    infographic = "infographic"        # Data visualizations, posters
    diagram = "diagram"                # Flowcharts, UML, network diagrams
    screenshot = "screenshot"          # Code screenshots, UI mockups
    # ---------- AUDIO (3) ----------
    audio = "audio"                    # MP3, WAV, M4A — podcast lectures
    transcript = "transcript"          # Timestamped transcriptions (YouTube captions)
    podcast = "podcast"                # Podcast RSS feed metadata (Podcast Index)
    # ---------- VIDEO (4) ----------
    video = "video"                    # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live = "live"                      # Live streams, webinars
    animation = "animation"            # GIF, animated SVG, Manim exports
    # ---------- INTERACTIVE (7) ----------
    jupyter = "jupyter"                # .ipynb (Kaggle, GitHub, Binder)
    colab = "colab"                    # Google Colab notebooks
    rmarkdown = "rmarkdown"            # R Markdown / Quarto notebooks
    sandbox = "sandbox"                # CodePen, Replit, JSFiddle
    simulation = "simulation"          # Physics / ML simulations
    quiz = "quiz"                      # Interactive quizzes
    game = "game"                      # Educational games
    # ---------- AR / VR (2) ----------
    ar = "ar"                          # Augmented reality experiences
    vr = "vr"                          # VR simulations
    # ---------- CODE & PLATFORM (16) ----------
    # (an earlier revision of this header said "(8)"; the section has since
    # grown to 16 members with the Block-2 platform additions below)
    github = "github"                  # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # Issues, Discussions, Q&A
    gist = "gist"                      # GitHub Gists (code snippets)
    kaggle = "kaggle"                  # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow"    # Stack Exchange Q&A
    # ── Block 2 new platforms ──────────────────────────────────
    # NOTE(review): presumed meanings inferred from member names and the
    # routing in CRAWLER_FORMAT_MAP — confirm against the actual crawlers.
    documentation = "documentation"          # official docs sites
    paperswithcode = "paperswithcode"        # Papers with Code entries
    semantic_scholar = "semantic_scholar"    # Semantic Scholar records
    distill = "distill"                      # Distill-style interactive articles
    observablehq = "observablehq"            # Observable notebooks
    sketchfab = "sketchfab"                  # Sketchfab 3D content
    freesound = "freesound"                  # Freesound audio clips
    wolfram = "wolfram"                      # Wolfram demonstrations
    api_docs = "api_docs"              # API references (Read the Docs)
    dataset = "dataset"                # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo = "repo"                      # Generic code repository
    # ---------- 3D & SPATIAL (5) ----------
    model_3d = "3d_model"                    # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud = "pointcloud"          # LiDAR, 3D scans (.las, .pcd)
    cad = "cad"                        # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric = "volumetric"          # Medical imaging (MRI, CT — TCIA)
    # ---------- GRAPH & VISUALIZATION (4) ----------
    graph = "graph"                    # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network = "network"                # Force-directed graphs
    geo_map = "map"                    # Geographic visualizations (GeoJSON)
    # ---------- DATA & TIME-SERIES (3) ----------
    timeseries = "timeseries"          # Stock data, sensor logs
    dashboard = "dashboard"            # Tableau, Looker, Metabase dashboards
    dataframe = "dataframe"            # Pandas/Polars dataframe snapshots
    # ---------- ASSESSMENT (4) ----------
    flashcards = "flashcards"          # Spaced repetition (Anki decks)
    problem_set = "problem_set"        # Homework problems + solutions
    lab = "lab"                        # Hands-on lab exercises
    exam = "exam"                      # Practice / certification exams
# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================
# Coarse group name -> list of SourceFormat members belonging to it.
# NOTE(review): the Block-2 platform formats (documentation, paperswithcode,
# semantic_scholar, distill, observablehq, sketchfab, freesound, wolfram)
# are not assigned to any group here even though CRAWLER_FORMAT_MAP routes
# them — confirm whether they should join "code" (or get their own group).
FORMAT_GROUPS = {
    "textual": [SourceFormat.pdf, SourceFormat.markdown, SourceFormat.html,
                SourceFormat.text, SourceFormat.epub, SourceFormat.latex,
                SourceFormat.xml, SourceFormat.docx],
    "visual": [SourceFormat.image, SourceFormat.image_set, SourceFormat.svg,
               SourceFormat.infographic, SourceFormat.diagram, SourceFormat.screenshot],
    "audio": [SourceFormat.audio, SourceFormat.transcript, SourceFormat.podcast],
    "video": [SourceFormat.video, SourceFormat.video_playlist,
              SourceFormat.live, SourceFormat.animation],
    "interactive": [SourceFormat.jupyter, SourceFormat.colab, SourceFormat.rmarkdown,
                    SourceFormat.sandbox, SourceFormat.simulation,
                    SourceFormat.quiz, SourceFormat.game],
    "code": [SourceFormat.github, SourceFormat.github_discussion,
             SourceFormat.gist, SourceFormat.kaggle,
             SourceFormat.stackoverflow, SourceFormat.api_docs,
             SourceFormat.dataset, SourceFormat.repo],
    "spatial": [SourceFormat.model_3d, SourceFormat.model_3d_interactive,
                SourceFormat.pointcloud, SourceFormat.cad, SourceFormat.volumetric],
    "graph": [SourceFormat.graph, SourceFormat.knowledge_graph,
              SourceFormat.network, SourceFormat.geo_map],
    "data": [SourceFormat.timeseries, SourceFormat.dashboard, SourceFormat.dataframe],
    "assessment": [SourceFormat.flashcards, SourceFormat.problem_set,
                   SourceFormat.lab, SourceFormat.exam],
}
# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — no API keys)
# ============================================================
# Crawler name -> list of SourceFormat values that crawler can produce.
CRAWLER_FORMAT_MAP = {
    # bulk / protocol access — zero API key
    "common_crawl": [SourceFormat.html, SourceFormat.pdf, SourceFormat.markdown],
    "arxiv": [SourceFormat.pdf, SourceFormat.latex, SourceFormat.html],
    "gharchive": [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter],
    "stackoverflow": [SourceFormat.stackoverflow, SourceFormat.html],
    "wikipedia": [SourceFormat.html, SourceFormat.markdown, SourceFormat.image,
                  SourceFormat.knowledge_graph],
    "openlibrary": [SourceFormat.epub, SourceFormat.pdf, SourceFormat.html],
    "libgen": [SourceFormat.pdf, SourceFormat.epub, SourceFormat.docx],
    "mit_ocw": [SourceFormat.html, SourceFormat.pdf, SourceFormat.video,
                SourceFormat.problem_set],
    "huggingface": [SourceFormat.dataset, SourceFormat.jupyter, SourceFormat.markdown,
                    SourceFormat.dataframe],
    "podcast": [SourceFormat.podcast, SourceFormat.audio, SourceFormat.transcript],
    # API-key crawlers (existing)
    "github": [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter,
               SourceFormat.dataset, SourceFormat.gist, SourceFormat.repo],
    "kaggle": [SourceFormat.kaggle, SourceFormat.jupyter, SourceFormat.dataset],
    "youtube": [SourceFormat.video, SourceFormat.video_playlist, SourceFormat.transcript],
    # Block-2 platform crawlers
    # (removed a leftover patch-instruction comment that used to sit here)
    "paperswithcode": [SourceFormat.pdf, SourceFormat.github],
    "documentation": [SourceFormat.html],
    "semantic_scholar": [SourceFormat.pdf, SourceFormat.html],
    "distill": [SourceFormat.html, SourceFormat.simulation],
    "observablehq": [SourceFormat.sandbox, SourceFormat.html],
    "sketchfab": [SourceFormat.model_3d, SourceFormat.model_3d_interactive],
    "freesound": [SourceFormat.audio],
    "wolfram": [SourceFormat.simulation, SourceFormat.sandbox],
}
# ============================================================
# OTHER ENUMS
# ============================================================
class LearningStyle(str, Enum):
    """Learner preference used to bias format selection (VARK-style)."""
    visual = "visual"            # diagrams, video, infographics
    textual = "textual"          # articles, papers, books
    kinesthetic = "kinesthetic"  # hands-on labs, sandboxes, notebooks
    auditory = "auditory"        # podcasts, lectures, transcripts
class OutputFormat(str, Enum):
    """Response rendering requested by the client (see DiscoveryRequest.output)."""
    json = "json"              # default structured response
    html = "html"              # rendered HTML
    embeddings = "embeddings"  # includes vector embeddings per object
    pdf = "pdf"                # exported document
    streaming = "streaming"    # streamed delivery
# ============================================================
# REQUEST MODELS
# ============================================================
class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery.

    ``topic`` and ``difficulty`` are required; every other field has a
    sensible default so a minimal request is just those two keys.
    """
    topic: str = Field(..., min_length=2, max_length=200)  # subject to search for
    difficulty: int = Field(..., ge=1, le=5)               # 1 = intro … 5 = expert
    # Requested content types; default covers text, video and code.
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [SourceFormat.pdf, SourceFormat.video,
                                 SourceFormat.github, SourceFormat.jupyter]
    )
    prerequisites: List[str] = Field(default_factory=list)  # topics already known
    learning_style: Optional[LearningStyle] = None          # optional format bias
    language: str = Field(default="en")  # presumably an ISO 639-1 code — confirm
    max_results: int = Field(default=10, ge=1, le=50)
    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )
    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)
    min_freshness: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )
    @validator("topic")
    def normalize_topic(cls, v: str) -> str:
        # Strip surrounding whitespace; min_length runs on the raw value,
        # so an all-whitespace topic must be rejected here explicitly.
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        return v
    @validator("output", pre=True)
    def validate_output(cls, v):
        # pre=True runs before enum coercion so an invalid value raises a
        # 422 with a human-readable message that lists the valid choices
        # and an example, instead of Pydantic's terser default enum error.
        # (An earlier comment here claimed invalid values were silently
        # coerced to the default — Pydantic actually raises for them; this
        # validator exists purely for the friendlier message.)
        valid = [e.value for e in OutputFormat]
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v
# ============================================================
# SOURCE MODELS
# ============================================================
class SourceLink(BaseModel):
    """A single retrievable link attached to a Source."""
    type: str                        # link role (e.g. primary, mirror) — confirm
    url: str
    format: SourceFormat             # content type behind this URL
    size_bytes: Optional[int] = None
    access_method: str = "direct"    # direct | git_clone | streaming | torrent | api
class Source(BaseModel):
    """Discovered source with full metadata.

    One ranked search hit: identity, scoring, links, freshness/decay
    signals, engagement metrics and format-specific extras.
    """
    id: str
    title: str
    authors: List[str] = Field(default_factory=list)
    quality_score: float = Field(..., ge=0, le=10)    # 0–10 overall quality
    pedagogical_fit: float = Field(..., ge=0, le=1)   # 0–1 fit to the query
    difficulty: int = Field(..., ge=1, le=5)          # same scale as the request
    links: List[SourceLink]
    formats: List[SourceFormat]
    retraction_status: Optional[Dict[str, Any]] = None
    related_sources: Optional[Dict[str, Any]] = None
    summary: str = ""
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = "en"
    # Quality & Freshness signals (NEW: Decay Report)
    citation_count: int = 0
    peer_reviewed: bool = False
    open_access: bool = True
    decay_report: Optional[Dict[str, Any]] = None
    # Accepts the underscore-prefixed wire key "_ranking_signals" via alias
    # (Pydantic skips underscore-named fields, hence the aliased name).
    ranking_signals: Optional[Dict[str, Any]] = Field(default=None, alias="_ranking_signals")
    publication_date: Optional[datetime] = None
    last_updated: Optional[datetime] = None
    # Engagement metrics
    views: Optional[int] = None
    likes: Optional[int] = None
    rating: Optional[float] = Field(None, ge=0, le=5)
    stars: Optional[int] = None
    forks: Optional[int] = None
    downloads: Optional[int] = None
    # Platform
    source_platform: str
    thumbnail_url: Optional[str] = None
    url: str
    # Format-specific metadata
    duration_seconds: Optional[int] = None  # video / audio
    file_size_bytes: Optional[int] = None   # pdf / epub / dataset
    page_count: Optional[int] = None        # pdf / epub
    kernel_type: Optional[str] = None       # jupyter (Python / R / Julia)
    dataset_rows: Optional[int] = None      # dataset
    dataset_cols: Optional[int] = None      # dataset
    license: Optional[str] = None           # dataset / repo / epub
    class Config:
        # Allow population by field name as well as by the alias above.
        # NOTE(review): `populate_by_name` is the Pydantic v2 spelling while
        # this file also uses v1-style `@validator` — confirm the pinned
        # Pydantic major version (v1 expects allow_population_by_field_name).
        populate_by_name = True
# ============================================================
# RESPONSE MODELS
# ============================================================
class DiscoveryResponse(BaseModel):
    """Top-level /v1/discover response: ranked sources plus aggregate
    temporal-decay telemetry (first-class since v2.2)."""
    query: str                      # presumably echoes the requested topic — confirm
    domain: Optional[str] = None    # domain lock applied, if any
    total_found: int                # total matches before pagination
    sources: List[Source]           # ranked results for this page
    formats_found: Dict[str, int] = Field(default_factory=dict)  # format -> count
    cache_hit: bool
    processing_time_ms: float
    page: int = 1
    total_pages: int = 1
    # --- ENTERPRISE TEMPORAL RISK FIELDS (v2.2) ---
    # First-class fields for enterprise clients — no adapter derivation needed.
    # Dwayne's pattern: temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        )
    )
    avg_decay_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Mean decay score across all sources. Use for trend monitoring."
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the stale tail source."
        )
    )
    stale_count: int = Field(
        default=0,
        description="Number of sources labeled 'stale' or 'decayed' in this response."
    )
class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for AI system consumption.
    This is the v1 contract output for serious clients.

    NOTE(review): ``publication_date`` and ``license`` are annotated
    Optional but have NO default — in Pydantic v2 that makes them
    required-but-nullable (callers must pass them, possibly as None).
    Confirm that is the intended contract.
    """
    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat
    # Scores
    quality_score: float
    pedagogical_fit: float
    freshness_score: float
    # Provenance
    authors: List[str]
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool
    # Embedding (populated when output=embeddings)
    embedding: Optional[List[float]] = None
    # Raw content for downstream use
    summary: str
    tags: List[str]
class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.
    Matches the locked contract from the planning doc.

    NOTE(review): ``domain`` is Optional with no default, i.e.
    required-but-nullable under Pydantic v2 — confirm intended.
    """
    knowledge_objects: List[KnowledgeObject]
    scores: Dict[str, float]        # aggregate scoring breakdown
    source_links: List[str]         # all URLs in order
    last_updated: datetime
    confidence_level: str           # "high" | "medium" | "low"
    domain: Optional[str]
    formats_distribution: Dict[str, int]  # format value -> count
class CacheStats(BaseModel):
    """Cache health/usage snapshot (presumably Redis-backed — confirm)."""
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float
    hit_rate: float = Field(..., ge=0, le=1)  # fraction of lookups served from cache
    hits: int
    misses: int
    evictions: int
    ttl_distribution: Dict[str, int] = Field(default_factory=dict)  # TTL bucket -> key count
class ErrorResponse(BaseModel):
    """Standard error envelope returned by failing endpoints."""
    error: str                                 # machine-readable error code/category
    message: str                               # human-readable explanation
    details: Optional[Dict[str, Any]] = None   # optional structured context
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated since
    # Python 3.12 and returns a *naive* datetime; now(timezone.utc) yields
    # an aware one that serializes with an explicit +00:00 offset.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))