Spaces:
Running
Running
"""
Knowledge Universe API — Pydantic Models
Full multi-format taxonomy: 50+ source types, no API keys required
Rick's architecture: every format that exists on the internet
"""
| from pydantic import BaseModel, Field, validator | |
| from typing import List, Optional, Dict, Any | |
| from enum import Enum | |
| from datetime import datetime | |
# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================
class SourceFormat(str, Enum):
    """
    Complete taxonomy of discoverable content types.
    Maps to crawlers, scoring logic, and UI rendering.
    All lowercase strings so JSON round-trips safely.
    """

    # Declaration order is kept stable because iterating the enum yields
    # members in this order.

    # ----- Textual documents -----
    pdf = "pdf"                    # papers and textbooks (arXiv, LibGen)
    markdown = "markdown"          # READMEs, wikis, GitBook pages
    html = "html"                  # web articles, tutorials, MIT OCW
    text = "text"                  # raw plain-text documents
    epub = "epub"                  # e-books (Open Library, Standard Ebooks)
    docx = "docx"                  # Word documents
    latex = "latex"                # LaTeX source (arXiv raw)
    xml = "xml"                    # structured documents, OAI-PMH feeds

    # ----- Still visuals -----
    image = "image"                # PNG / JPG / WebP (Wikimedia Commons, Unsplash)
    image_set = "image_set"        # slide decks, image collections
    svg = "svg"                    # vector diagrams (ObservableHQ, D3 gallery)
    infographic = "infographic"    # data visualizations, posters
    diagram = "diagram"            # flowcharts, UML, network diagrams
    screenshot = "screenshot"      # code screenshots, UI mockups

    # ----- Audio -----
    audio = "audio"                # MP3 / WAV / M4A podcast lectures
    transcript = "transcript"      # timestamped transcriptions (YouTube captions)
    podcast = "podcast"            # podcast RSS feed metadata (Podcast Index)

    # ----- Video -----
    video = "video"                    # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live = "live"                      # live streams, webinars
    animation = "animation"            # GIF, animated SVG, Manim exports

    # ----- Interactive -----
    jupyter = "jupyter"            # .ipynb (Kaggle, GitHub, Binder)
    colab = "colab"                # Google Colab notebooks
    rmarkdown = "rmarkdown"        # R Markdown / Quarto notebooks
    sandbox = "sandbox"            # CodePen, Replit, JSFiddle
    simulation = "simulation"      # physics / ML simulations
    quiz = "quiz"                  # interactive quizzes
    game = "game"                  # educational games

    # ----- AR / VR -----
    ar = "ar"                      # augmented reality experiences
    vr = "vr"                      # VR simulations

    # ----- Code & platforms -----
    github = "github"                        # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # issues, discussions, Q&A
    gist = "gist"                            # GitHub Gists (code snippets)
    kaggle = "kaggle"                        # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow"          # Stack Exchange Q&A
    # "Block 2" platform-specific formats; each name also appears as a
    # crawler key in CRAWLER_FORMAT_MAP.
    documentation = "documentation"
    paperswithcode = "paperswithcode"
    semantic_scholar = "semantic_scholar"
    distill = "distill"
    observablehq = "observablehq"
    sketchfab = "sketchfab"
    freesound = "freesound"
    wolfram = "wolfram"
    api_docs = "api_docs"          # API references (Read the Docs)
    dataset = "dataset"            # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo = "repo"                  # generic code repository

    # ----- 3D & spatial -----
    # The wire values start with a digit, which is not a legal Python
    # identifier, so the member names are prefixed with "model_".
    model_3d = "3d_model"                    # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud = "pointcloud"                # LiDAR, 3D scans (.las, .pcd)
    cad = "cad"                              # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric = "volumetric"                # medical imaging (MRI, CT — TCIA)

    # ----- Graphs & visualization -----
    graph = "graph"                      # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network = "network"                  # force-directed graphs
    # Member is named geo_map while the wire value is "map"
    # (presumably to avoid shadowing the builtin — TODO confirm).
    geo_map = "map"                      # geographic visualizations (GeoJSON)

    # ----- Data & time series -----
    timeseries = "timeseries"      # stock data, sensor logs
    dashboard = "dashboard"        # Tableau, Looker, Metabase dashboards
    dataframe = "dataframe"        # Pandas / Polars dataframe snapshots

    # ----- Assessment -----
    flashcards = "flashcards"      # spaced repetition (Anki decks)
    problem_set = "problem_set"    # homework problems + solutions
    lab = "lab"                    # hands-on lab exercises
    exam = "exam"                  # practice / certification exams
# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================
# Coarse category name -> the SourceFormat members that belong to it.
# NOTE(review): SourceFormat.ar / .vr and the platform formats
# (documentation, paperswithcode, semantic_scholar, distill, observablehq,
# sketchfab, freesound, wolfram) are not listed in any group — confirm
# whether that is intentional.
FORMAT_GROUPS = {
    "textual": [
        SourceFormat.pdf,
        SourceFormat.markdown,
        SourceFormat.html,
        SourceFormat.text,
        SourceFormat.epub,
        SourceFormat.latex,
        SourceFormat.xml,
        SourceFormat.docx,
    ],
    "visual": [
        SourceFormat.image,
        SourceFormat.image_set,
        SourceFormat.svg,
        SourceFormat.infographic,
        SourceFormat.diagram,
        SourceFormat.screenshot,
    ],
    "audio": [
        SourceFormat.audio,
        SourceFormat.transcript,
        SourceFormat.podcast,
    ],
    "video": [
        SourceFormat.video,
        SourceFormat.video_playlist,
        SourceFormat.live,
        SourceFormat.animation,
    ],
    "interactive": [
        SourceFormat.jupyter,
        SourceFormat.colab,
        SourceFormat.rmarkdown,
        SourceFormat.sandbox,
        SourceFormat.simulation,
        SourceFormat.quiz,
        SourceFormat.game,
    ],
    "code": [
        SourceFormat.github,
        SourceFormat.github_discussion,
        SourceFormat.gist,
        SourceFormat.kaggle,
        SourceFormat.stackoverflow,
        SourceFormat.api_docs,
        SourceFormat.dataset,
        SourceFormat.repo,
    ],
    "spatial": [
        SourceFormat.model_3d,
        SourceFormat.model_3d_interactive,
        SourceFormat.pointcloud,
        SourceFormat.cad,
        SourceFormat.volumetric,
    ],
    "graph": [
        SourceFormat.graph,
        SourceFormat.knowledge_graph,
        SourceFormat.network,
        SourceFormat.geo_map,
    ],
    "data": [
        SourceFormat.timeseries,
        SourceFormat.dashboard,
        SourceFormat.dataframe,
    ],
    "assessment": [
        SourceFormat.flashcards,
        SourceFormat.problem_set,
        SourceFormat.lab,
        SourceFormat.exam,
    ],
}
# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — keyless bulk/protocol
#  sources as well as the existing API-key crawlers)
# ============================================================
# Crawler name -> the SourceFormat members that crawler can return.
CRAWLER_FORMAT_MAP = {
    # --- Bulk / protocol access: zero API key ---
    "common_crawl": [
        SourceFormat.html,
        SourceFormat.pdf,
        SourceFormat.markdown,
    ],
    "arxiv": [
        SourceFormat.pdf,
        SourceFormat.latex,
        SourceFormat.html,
    ],
    "gharchive": [
        SourceFormat.github,
        SourceFormat.markdown,
        SourceFormat.jupyter,
    ],
    "stackoverflow": [
        SourceFormat.stackoverflow,
        SourceFormat.html,
    ],
    "wikipedia": [
        SourceFormat.html,
        SourceFormat.markdown,
        SourceFormat.image,
        SourceFormat.knowledge_graph,
    ],
    "openlibrary": [
        SourceFormat.epub,
        SourceFormat.pdf,
        SourceFormat.html,
    ],
    "libgen": [
        SourceFormat.pdf,
        SourceFormat.epub,
        SourceFormat.docx,
    ],
    "mit_ocw": [
        SourceFormat.html,
        SourceFormat.pdf,
        SourceFormat.video,
        SourceFormat.problem_set,
    ],
    "huggingface": [
        SourceFormat.dataset,
        SourceFormat.jupyter,
        SourceFormat.markdown,
        SourceFormat.dataframe,
    ],
    "podcast": [
        SourceFormat.podcast,
        SourceFormat.audio,
        SourceFormat.transcript,
    ],
    # --- API-key crawlers (existing) ---
    "github": [
        SourceFormat.github,
        SourceFormat.markdown,
        SourceFormat.jupyter,
        SourceFormat.dataset,
        SourceFormat.gist,
        SourceFormat.repo,
    ],
    "kaggle": [
        SourceFormat.kaggle,
        SourceFormat.jupyter,
        SourceFormat.dataset,
    ],
    "youtube": [
        SourceFormat.video,
        SourceFormat.video_playlist,
        SourceFormat.transcript,
    ],
    # --- Platform crawlers added after the initial set ---
    "paperswithcode": [
        SourceFormat.pdf,
        SourceFormat.github,
    ],
    "documentation": [
        SourceFormat.html,
    ],
    "semantic_scholar": [
        SourceFormat.pdf,
        SourceFormat.html,
    ],
    "distill": [
        SourceFormat.html,
        SourceFormat.simulation,
    ],
    "observablehq": [
        SourceFormat.sandbox,
        SourceFormat.html,
    ],
    "sketchfab": [
        SourceFormat.model_3d,
        SourceFormat.model_3d_interactive,
    ],
    "freesound": [
        SourceFormat.audio,
    ],
    "wolfram": [
        SourceFormat.simulation,
        SourceFormat.sandbox,
    ],
}
# ============================================================
# OTHER ENUMS
# ============================================================
# Learner preference accepted by DiscoveryRequest.learning_style.
# String-valued so members compare equal to their plain-text form.
class LearningStyle(str, Enum):
    visual = "visual"
    textual = "textual"
    kinesthetic = "kinesthetic"
    auditory = "auditory"
# Response encodings a client may request via DiscoveryRequest.output.
# String-valued so members compare equal to their plain-text form.
class OutputFormat(str, Enum):
    json = "json"
    html = "html"
    embeddings = "embeddings"
    pdf = "pdf"
    streaming = "streaming"
# ============================================================
# REQUEST MODELS
# ============================================================
class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery"""

    # Subject to search for; 2–200 characters, stripped by normalize_topic.
    topic: str = Field(..., min_length=2, max_length=200)
    # Target difficulty on a 1 (lowest) to 5 (highest) scale.
    difficulty: int = Field(..., ge=1, le=5)
    # Formats to search; a fresh default list is built per request.
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [
            SourceFormat.pdf,
            SourceFormat.video,
            SourceFormat.github,
            SourceFormat.jupyter,
        ]
    )
    prerequisites: List[str] = Field(default_factory=list)
    learning_style: Optional[LearningStyle] = None
    language: str = Field(default="en")
    max_results: int = Field(default=10, ge=1, le=50)

    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )

    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)

    min_freshness: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )

    # The two methods below had no decorator, so Pydantic never invoked
    # them. They are registered as field validators here.

    @validator("topic")
    def normalize_topic(cls, v):
        """Strip surrounding whitespace from ``topic``.

        Raises:
            ValueError: if nothing is left after stripping, e.g. a topic
                made only of spaces that still met the minimum length.
        """
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        return v

    @validator("output", pre=True)
    def validate_output(cls, v):
        """Reject unknown ``output`` values with a readable error.

        Runs before enum coercion (``pre=True``) so the error message can
        list the accepted values and show an example payload.

        Raises:
            ValueError: if ``v`` is not one of the OutputFormat values.
        """
        # OutputFormat is a str-based enum, so both raw strings and enum
        # members compare equal to entries of this list.
        valid = [e.value for e in OutputFormat]
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v
# ============================================================
# SOURCE MODELS
# ============================================================
# One retrievable link attached to a Source (see Source.links).
class SourceLink(BaseModel):
    type: str
    url: str
    format: SourceFormat
    # Size of the linked resource in bytes, when known.
    size_bytes: Optional[int] = Field(default=None)
    # One of: direct | git_clone | streaming | torrent | api
    access_method: str = Field(default="direct")
class Source(BaseModel):
    """Discovered source with full metadata"""

    # --- Identity ---
    id: str
    title: str
    authors: List[str] = Field(default_factory=list)

    # --- Scores (ranges enforced by the Field bounds) ---
    quality_score: float = Field(..., ge=0, le=10)
    pedagogical_fit: float = Field(..., ge=0, le=1)
    difficulty: int = Field(..., ge=1, le=5)

    # --- Content ---
    links: List[SourceLink]
    formats: List[SourceFormat]
    retraction_status: Optional[Dict[str, Any]] = Field(default=None)
    related_sources: Optional[Dict[str, Any]] = Field(default=None)
    summary: str = Field(default="")
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = Field(default="en")

    # --- Quality & freshness signals (includes the decay report) ---
    citation_count: int = Field(default=0)
    peer_reviewed: bool = Field(default=False)
    open_access: bool = Field(default=True)
    decay_report: Optional[Dict[str, Any]] = Field(default=None)
    # Exchanged under the key "_ranking_signals" via the field alias.
    ranking_signals: Optional[Dict[str, Any]] = Field(
        default=None,
        alias="_ranking_signals",
    )
    publication_date: Optional[datetime] = Field(default=None)
    last_updated: Optional[datetime] = Field(default=None)

    # --- Engagement metrics ---
    views: Optional[int] = Field(default=None)
    likes: Optional[int] = Field(default=None)
    rating: Optional[float] = Field(default=None, ge=0, le=5)
    stars: Optional[int] = Field(default=None)
    forks: Optional[int] = Field(default=None)
    downloads: Optional[int] = Field(default=None)

    # --- Platform ---
    source_platform: str
    thumbnail_url: Optional[str] = Field(default=None)
    url: str

    # --- Format-specific metadata ---
    duration_seconds: Optional[int] = Field(default=None)  # video / audio
    file_size_bytes: Optional[int] = Field(default=None)   # pdf / epub / dataset
    page_count: Optional[int] = Field(default=None)        # pdf / epub
    kernel_type: Optional[str] = Field(default=None)       # jupyter (Python / R / Julia)
    dataset_rows: Optional[int] = Field(default=None)      # dataset
    dataset_cols: Optional[int] = Field(default=None)      # dataset
    license: Optional[str] = Field(default=None)           # dataset / repo / epub

    class Config:
        # Presumably lets ranking_signals be populated by its field name as
        # well as by the "_ranking_signals" alias — verify against the
        # installed Pydantic version.
        populate_by_name = True
# ============================================================
# RESPONSE MODELS
# ============================================================
# Discovery result: the matched sources plus paging, cache and
# decay-summary fields.
class DiscoveryResponse(BaseModel):
    query: str
    domain: Optional[str] = Field(default=None)
    total_found: int
    sources: List[Source]
    formats_found: Dict[str, int] = Field(default_factory=dict)
    cache_hit: bool
    processing_time_ms: float
    page: int = Field(default=1)
    total_pages: int = Field(default=1)

    # --- ENTERPRISE TEMPORAL RISK FIELDS (v2.2) ---
    # Exposed as first-class fields so enterprise clients need not derive
    # them in an adapter.
    # Dwayne's pattern: temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        ),
    )
    avg_decay_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Mean decay score across all sources. Use for trend monitoring.",
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the stale tail source."
        ),
    )
    stale_count: int = Field(
        default=0,
        description="Number of sources labeled 'stale' or 'decayed' in this response.",
    )
class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for AI system consumption.
    This is the v1 contract output for serious clients.
    """

    # --- Identity ---
    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat

    # --- Scores ---
    quality_score: float
    pedagogical_fit: float
    freshness_score: float

    # --- Provenance ---
    authors: List[str]
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool

    # --- Embedding: populated when output=embeddings ---
    embedding: Optional[List[float]] = Field(default=None)

    # --- Raw content for downstream use ---
    summary: str
    tags: List[str]
class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.
    Matches the locked contract from the planning doc.
    """

    knowledge_objects: List[KnowledgeObject]
    # Aggregate scoring breakdown.
    scores: Dict[str, float]
    # All URLs, in order.
    source_links: List[str]
    last_updated: datetime
    # One of: "high" | "medium" | "low"
    confidence_level: str
    domain: Optional[str]
    formats_distribution: Dict[str, int]
# Cache statistics: key count, memory use, and hit/miss/eviction counters.
class CacheStats(BaseModel):
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float
    # Constrained to the range 0..1.
    hit_rate: float = Field(..., ge=0, le=1)
    hits: int
    misses: int
    evictions: int
    # Defaults to an empty dict built per instance.
    ttl_distribution: Dict[str, int] = Field(default_factory=dict)
# Error payload; `timestamp` is filled in at construction time if omitted.
class ErrorResponse(BaseModel):
    error: str
    message: str
    details: Optional[Dict[str, Any]] = Field(default=None)
    # NOTE(review): datetime.utcnow returns a naive datetime and is
    # deprecated since Python 3.12. Moving to an aware UTC timestamp would
    # change the serialized value, so confirm with API consumers first.
    timestamp: datetime = Field(default_factory=datetime.utcnow)