# Knowledge-Universe / src/api/models.py
# Author: vlsiddarth
# feat: max_decay_detected as first-class field in /v1/discover (v2.2)
# Commit: e5cdd9c
"""
Knowledge Universe API — Pydantic Models
Full multi-format taxonomy: 50+ source types, no API keys required
Rick's architecture: every format that exists on the internet
"""
from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, validator
# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================
class SourceFormat(str, Enum):
    """
    Complete taxonomy of discoverable content types.
    Maps to crawlers, scoring logic, and UI rendering.
    All lowercase strings so JSON round-trips safely.

    NOTE: for the 3D formats the member NAME differs from its VALUE
    (e.g. ``model_3d`` -> "3d_model") because a Python identifier cannot
    start with a digit; JSON payloads always carry the string value.
    """
    # ---------- TEXTUAL (8) ----------
    pdf = "pdf"                        # Academic papers, textbooks (arXiv, LibGen)
    markdown = "markdown"              # GitHub READMEs, wikis, GitBook
    html = "html"                      # Web articles, tutorials, MIT OCW
    text = "text"                      # Raw plain-text documents
    epub = "epub"                      # E-books (Open Library, Standard Ebooks)
    docx = "docx"                      # Word documents
    latex = "latex"                    # LaTeX source (arXiv raw)
    xml = "xml"                        # Structured documents, OAI-PMH feeds
    # ---------- VISUAL (6) ----------
    image = "image"                    # PNG, JPG, WebP (Wikimedia Commons, Unsplash)
    image_set = "image_set"            # Slide decks, image collections
    svg = "svg"                        # Vector diagrams (ObservableHQ, D3 gallery)
    infographic = "infographic"        # Data visualizations, posters
    diagram = "diagram"                # Flowcharts, UML, network diagrams
    screenshot = "screenshot"          # Code screenshots, UI mockups
    # ---------- AUDIO (3) ----------
    audio = "audio"                    # MP3, WAV, M4A — podcast lectures
    transcript = "transcript"          # Timestamped transcriptions (YouTube captions)
    podcast = "podcast"                # Podcast RSS feed metadata (Podcast Index)
    # ---------- VIDEO (4) ----------
    video = "video"                    # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live = "live"                      # Live streams, webinars
    animation = "animation"            # GIF, animated SVG, Manim exports
    # ---------- INTERACTIVE (7) ----------
    jupyter = "jupyter"                # .ipynb (Kaggle, GitHub, Binder)
    colab = "colab"                    # Google Colab notebooks
    rmarkdown = "rmarkdown"            # R Markdown / Quarto notebooks
    sandbox = "sandbox"                # CodePen, Replit, JSFiddle
    simulation = "simulation"          # Physics / ML simulations
    quiz = "quiz"                      # Interactive quizzes
    game = "game"                      # Educational games
    # ---------- AR / VR (2) ----------
    ar = "ar"                          # Augmented reality experiences
    vr = "vr"                          # VR simulations
    # ---------- CODE & PLATFORM (16) ----------
    # (an earlier revision of this header said "(8)"; the section has since
    # grown to 16 members with the Block-2 platform additions below)
    github = "github"                  # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # Issues, Discussions, Q&A
    gist = "gist"                      # GitHub Gists (code snippets)
    kaggle = "kaggle"                  # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow"    # Stack Exchange Q&A
    # ── Block 2 new platforms ──────────────────────────────────
    # NOTE(review): presumed meanings inferred from member names and the
    # routing in CRAWLER_FORMAT_MAP — confirm against the actual crawlers.
    documentation = "documentation"          # official docs sites
    paperswithcode = "paperswithcode"        # Papers with Code entries
    semantic_scholar = "semantic_scholar"    # Semantic Scholar records
    distill = "distill"                      # Distill-style interactive articles
    observablehq = "observablehq"            # Observable notebooks
    sketchfab = "sketchfab"                  # Sketchfab 3D content
    freesound = "freesound"                  # Freesound audio clips
    wolfram = "wolfram"                      # Wolfram demonstrations
    api_docs = "api_docs"              # API references (Read the Docs)
    dataset = "dataset"                # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo = "repo"                      # Generic code repository
    # ---------- 3D & SPATIAL (5) ----------
    model_3d = "3d_model"                    # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud = "pointcloud"          # LiDAR, 3D scans (.las, .pcd)
    cad = "cad"                        # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric = "volumetric"          # Medical imaging (MRI, CT — TCIA)
    # ---------- GRAPH & VISUALIZATION (4) ----------
    graph = "graph"                    # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network = "network"                # Force-directed graphs
    geo_map = "map"                    # Geographic visualizations (GeoJSON)
    # ---------- DATA & TIME-SERIES (3) ----------
    timeseries = "timeseries"          # Stock data, sensor logs
    dashboard = "dashboard"            # Tableau, Looker, Metabase dashboards
    dataframe = "dataframe"            # Pandas/Polars dataframe snapshots
    # ---------- ASSESSMENT (4) ----------
    flashcards = "flashcards"          # Spaced repetition (Anki decks)
    problem_set = "problem_set"        # Homework problems + solutions
    lab = "lab"                        # Hands-on lab exercises
    exam = "exam"                      # Practice / certification exams
# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================
# Coarse group name -> list of SourceFormat members belonging to it.
# NOTE(review): the Block-2 platform formats (documentation, paperswithcode,
# semantic_scholar, distill, observablehq, sketchfab, freesound, wolfram)
# are not assigned to any group here even though CRAWLER_FORMAT_MAP routes
# them — confirm whether they should join "code" (or get their own group).
FORMAT_GROUPS = {
    "textual": [SourceFormat.pdf, SourceFormat.markdown, SourceFormat.html,
                SourceFormat.text, SourceFormat.epub, SourceFormat.latex,
                SourceFormat.xml, SourceFormat.docx],
    "visual": [SourceFormat.image, SourceFormat.image_set, SourceFormat.svg,
               SourceFormat.infographic, SourceFormat.diagram, SourceFormat.screenshot],
    "audio": [SourceFormat.audio, SourceFormat.transcript, SourceFormat.podcast],
    "video": [SourceFormat.video, SourceFormat.video_playlist,
              SourceFormat.live, SourceFormat.animation],
    "interactive": [SourceFormat.jupyter, SourceFormat.colab, SourceFormat.rmarkdown,
                    SourceFormat.sandbox, SourceFormat.simulation,
                    SourceFormat.quiz, SourceFormat.game],
    "code": [SourceFormat.github, SourceFormat.github_discussion,
             SourceFormat.gist, SourceFormat.kaggle,
             SourceFormat.stackoverflow, SourceFormat.api_docs,
             SourceFormat.dataset, SourceFormat.repo],
    "spatial": [SourceFormat.model_3d, SourceFormat.model_3d_interactive,
                SourceFormat.pointcloud, SourceFormat.cad, SourceFormat.volumetric],
    "graph": [SourceFormat.graph, SourceFormat.knowledge_graph,
              SourceFormat.network, SourceFormat.geo_map],
    "data": [SourceFormat.timeseries, SourceFormat.dashboard, SourceFormat.dataframe],
    "assessment": [SourceFormat.flashcards, SourceFormat.problem_set,
                   SourceFormat.lab, SourceFormat.exam],
}
# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — no API keys)
# ============================================================
# Crawler name -> list of SourceFormat values that crawler can produce.
CRAWLER_FORMAT_MAP = {
    # bulk / protocol access — zero API key
    "common_crawl": [SourceFormat.html, SourceFormat.pdf, SourceFormat.markdown],
    "arxiv": [SourceFormat.pdf, SourceFormat.latex, SourceFormat.html],
    "gharchive": [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter],
    "stackoverflow": [SourceFormat.stackoverflow, SourceFormat.html],
    "wikipedia": [SourceFormat.html, SourceFormat.markdown, SourceFormat.image,
                  SourceFormat.knowledge_graph],
    "openlibrary": [SourceFormat.epub, SourceFormat.pdf, SourceFormat.html],
    "libgen": [SourceFormat.pdf, SourceFormat.epub, SourceFormat.docx],
    "mit_ocw": [SourceFormat.html, SourceFormat.pdf, SourceFormat.video,
                SourceFormat.problem_set],
    "huggingface": [SourceFormat.dataset, SourceFormat.jupyter, SourceFormat.markdown,
                    SourceFormat.dataframe],
    "podcast": [SourceFormat.podcast, SourceFormat.audio, SourceFormat.transcript],
    # API-key crawlers (existing)
    "github": [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter,
               SourceFormat.dataset, SourceFormat.gist, SourceFormat.repo],
    "kaggle": [SourceFormat.kaggle, SourceFormat.jupyter, SourceFormat.dataset],
    "youtube": [SourceFormat.video, SourceFormat.video_playlist, SourceFormat.transcript],
    # Block-2 platform crawlers
    # (removed a leftover patch-instruction comment that used to sit here)
    "paperswithcode": [SourceFormat.pdf, SourceFormat.github],
    "documentation": [SourceFormat.html],
    "semantic_scholar": [SourceFormat.pdf, SourceFormat.html],
    "distill": [SourceFormat.html, SourceFormat.simulation],
    "observablehq": [SourceFormat.sandbox, SourceFormat.html],
    "sketchfab": [SourceFormat.model_3d, SourceFormat.model_3d_interactive],
    "freesound": [SourceFormat.audio],
    "wolfram": [SourceFormat.simulation, SourceFormat.sandbox],
}
# ============================================================
# OTHER ENUMS
# ============================================================
class LearningStyle(str, Enum):
    """Learner preference used to bias format selection (VARK-style)."""
    visual = "visual"            # diagrams, video, infographics
    textual = "textual"          # articles, papers, books
    kinesthetic = "kinesthetic"  # hands-on labs, sandboxes, notebooks
    auditory = "auditory"        # podcasts, lectures, transcripts
class OutputFormat(str, Enum):
    """Response rendering requested by the client (see DiscoveryRequest.output)."""
    json = "json"              # default structured response
    html = "html"              # rendered HTML
    embeddings = "embeddings"  # includes vector embeddings per object
    pdf = "pdf"                # exported document
    streaming = "streaming"    # streamed delivery
# ============================================================
# REQUEST MODELS
# ============================================================
class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery.

    ``topic`` and ``difficulty`` are required; every other field has a
    sensible default so a minimal request is just those two keys.
    """
    topic: str = Field(..., min_length=2, max_length=200)  # subject to search for
    difficulty: int = Field(..., ge=1, le=5)               # 1 = intro … 5 = expert
    # Requested content types; default covers text, video and code.
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [SourceFormat.pdf, SourceFormat.video,
                                 SourceFormat.github, SourceFormat.jupyter]
    )
    prerequisites: List[str] = Field(default_factory=list)  # topics already known
    learning_style: Optional[LearningStyle] = None          # optional format bias
    language: str = Field(default="en")  # presumably an ISO 639-1 code — confirm
    max_results: int = Field(default=10, ge=1, le=50)
    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )
    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)
    min_freshness: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )
    @validator("topic")
    def normalize_topic(cls, v: str) -> str:
        # Strip surrounding whitespace; min_length runs on the raw value,
        # so an all-whitespace topic must be rejected here explicitly.
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        return v
    @validator("output", pre=True)
    def validate_output(cls, v):
        # pre=True runs before enum coercion so an invalid value raises a
        # 422 with a human-readable message that lists the valid choices
        # and an example, instead of Pydantic's terser default enum error.
        # (An earlier comment here claimed invalid values were silently
        # coerced to the default — Pydantic actually raises for them; this
        # validator exists purely for the friendlier message.)
        valid = [e.value for e in OutputFormat]
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v
# ============================================================
# SOURCE MODELS
# ============================================================
class SourceLink(BaseModel):
    """A single retrievable link attached to a Source."""
    type: str                        # link role (e.g. primary, mirror) — confirm
    url: str
    format: SourceFormat             # content type behind this URL
    size_bytes: Optional[int] = None
    access_method: str = "direct"    # direct | git_clone | streaming | torrent | api
class Source(BaseModel):
    """Discovered source with full metadata.

    One ranked search hit: identity, scoring, links, freshness/decay
    signals, engagement metrics and format-specific extras.
    """
    id: str
    title: str
    authors: List[str] = Field(default_factory=list)
    quality_score: float = Field(..., ge=0, le=10)    # 0–10 overall quality
    pedagogical_fit: float = Field(..., ge=0, le=1)   # 0–1 fit to the query
    difficulty: int = Field(..., ge=1, le=5)          # same scale as the request
    links: List[SourceLink]
    formats: List[SourceFormat]
    retraction_status: Optional[Dict[str, Any]] = None
    related_sources: Optional[Dict[str, Any]] = None
    summary: str = ""
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = "en"
    # Quality & Freshness signals (NEW: Decay Report)
    citation_count: int = 0
    peer_reviewed: bool = False
    open_access: bool = True
    decay_report: Optional[Dict[str, Any]] = None
    # Accepts the underscore-prefixed wire key "_ranking_signals" via alias
    # (Pydantic skips underscore-named fields, hence the aliased name).
    ranking_signals: Optional[Dict[str, Any]] = Field(default=None, alias="_ranking_signals")
    publication_date: Optional[datetime] = None
    last_updated: Optional[datetime] = None
    # Engagement metrics
    views: Optional[int] = None
    likes: Optional[int] = None
    rating: Optional[float] = Field(None, ge=0, le=5)
    stars: Optional[int] = None
    forks: Optional[int] = None
    downloads: Optional[int] = None
    # Platform
    source_platform: str
    thumbnail_url: Optional[str] = None
    url: str
    # Format-specific metadata
    duration_seconds: Optional[int] = None  # video / audio
    file_size_bytes: Optional[int] = None   # pdf / epub / dataset
    page_count: Optional[int] = None        # pdf / epub
    kernel_type: Optional[str] = None       # jupyter (Python / R / Julia)
    dataset_rows: Optional[int] = None      # dataset
    dataset_cols: Optional[int] = None      # dataset
    license: Optional[str] = None           # dataset / repo / epub
    class Config:
        # Allow population by field name as well as by the alias above.
        # NOTE(review): `populate_by_name` is the Pydantic v2 spelling while
        # this file also uses v1-style `@validator` — confirm the pinned
        # Pydantic major version (v1 expects allow_population_by_field_name).
        populate_by_name = True
# ============================================================
# RESPONSE MODELS
# ============================================================
class DiscoveryResponse(BaseModel):
    """Top-level /v1/discover response: ranked sources plus aggregate
    temporal-decay telemetry (first-class since v2.2)."""
    query: str                      # presumably echoes the requested topic — confirm
    domain: Optional[str] = None    # domain lock applied, if any
    total_found: int                # total matches before pagination
    sources: List[Source]           # ranked results for this page
    formats_found: Dict[str, int] = Field(default_factory=dict)  # format -> count
    cache_hit: bool
    processing_time_ms: float
    page: int = 1
    total_pages: int = 1
    # --- ENTERPRISE TEMPORAL RISK FIELDS (v2.2) ---
    # First-class fields for enterprise clients — no adapter derivation needed.
    # Dwayne's pattern: temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        )
    )
    avg_decay_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Mean decay score across all sources. Use for trend monitoring."
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the stale tail source."
        )
    )
    stale_count: int = Field(
        default=0,
        description="Number of sources labeled 'stale' or 'decayed' in this response."
    )
class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for AI system consumption.
    This is the v1 contract output for serious clients.

    NOTE(review): ``publication_date`` and ``license`` are annotated
    Optional but have NO default — in Pydantic v2 that makes them
    required-but-nullable (callers must pass them, possibly as None).
    Confirm that is the intended contract.
    """
    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat
    # Scores
    quality_score: float
    pedagogical_fit: float
    freshness_score: float
    # Provenance
    authors: List[str]
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool
    # Embedding (populated when output=embeddings)
    embedding: Optional[List[float]] = None
    # Raw content for downstream use
    summary: str
    tags: List[str]
class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.
    Matches the locked contract from the planning doc.

    NOTE(review): ``domain`` is Optional with no default, i.e.
    required-but-nullable under Pydantic v2 — confirm intended.
    """
    knowledge_objects: List[KnowledgeObject]
    scores: Dict[str, float]        # aggregate scoring breakdown
    source_links: List[str]         # all URLs in order
    last_updated: datetime
    confidence_level: str           # "high" | "medium" | "low"
    domain: Optional[str]
    formats_distribution: Dict[str, int]  # format value -> count
class CacheStats(BaseModel):
    """Cache health/usage snapshot (presumably Redis-backed — confirm)."""
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float
    hit_rate: float = Field(..., ge=0, le=1)  # fraction of lookups served from cache
    hits: int
    misses: int
    evictions: int
    ttl_distribution: Dict[str, int] = Field(default_factory=dict)  # TTL bucket -> key count
class ErrorResponse(BaseModel):
    """Standard error envelope returned by failing endpoints."""
    error: str                                 # machine-readable error code/category
    message: str                               # human-readable explanation
    details: Optional[Dict[str, Any]] = None   # optional structured context
    # Timezone-aware UTC timestamp. datetime.utcnow() is deprecated since
    # Python 3.12 and returns a *naive* datetime; now(timezone.utc) yields
    # an aware one that serializes with an explicit +00:00 offset.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))