# Spaces: vlsiddarth / Knowledge-Universe (Running)
# File size: 17,718 Bytes

"""
Knowledge Universe API — Pydantic Models
Full multi-format taxonomy: 50+ source types, no API keys required
Rick's architecture: every format that exists on the internet
"""

from datetime import datetime, timezone
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, Field, validator


# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================

class SourceFormat(str, Enum):
    """
    Taxonomy of discoverable content types.

    Every member is also a ``str``, and every value is lowercase so it
    survives a JSON round-trip unchanged. For most members the value equals
    the member name; the exceptions are ``model_3d`` ("3d_model"),
    ``model_3d_interactive`` ("3d_interactive") and ``geo_map`` ("map").

    Iteration follows declaration order, so members must not be reordered.
    """

    # ---- Textual ----
    pdf = "pdf"  # academic papers, textbooks (arXiv, LibGen)
    markdown = "markdown"  # GitHub READMEs, wikis, GitBook
    html = "html"  # web articles, tutorials, MIT OCW
    text = "text"  # raw plain-text documents
    epub = "epub"  # e-books (Open Library, Standard Ebooks)
    docx = "docx"  # Word documents
    latex = "latex"  # LaTeX source (arXiv raw)
    xml = "xml"  # structured documents, OAI-PMH feeds

    # ---- Visual ----
    image = "image"  # PNG, JPG, WebP (Wikimedia Commons, Unsplash)
    image_set = "image_set"  # slide decks, image collections
    svg = "svg"  # vector diagrams (ObservableHQ, D3 gallery)
    infographic = "infographic"  # data visualizations, posters
    diagram = "diagram"  # flowcharts, UML, network diagrams
    screenshot = "screenshot"  # code screenshots, UI mockups

    # ---- Audio ----
    audio = "audio"  # MP3, WAV, M4A (podcast lectures)
    transcript = "transcript"  # timestamped transcriptions (YouTube captions)
    podcast = "podcast"  # podcast RSS feed metadata (Podcast Index)

    # ---- Video ----
    video = "video"  # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live = "live"  # live streams, webinars
    animation = "animation"  # GIF, animated SVG, Manim exports

    # ---- Interactive ----
    jupyter = "jupyter"  # .ipynb (Kaggle, GitHub, Binder)
    colab = "colab"  # Google Colab notebooks
    rmarkdown = "rmarkdown"  # R Markdown / Quarto notebooks
    sandbox = "sandbox"  # CodePen, Replit, JSFiddle
    simulation = "simulation"  # physics / ML simulations
    quiz = "quiz"  # interactive quizzes
    game = "game"  # educational games

    # ---- AR / VR ----
    ar = "ar"  # augmented reality experiences
    vr = "vr"  # VR simulations

    # ---- Code & platform ----
    github = "github"  # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # issues, discussions, Q&A
    gist = "gist"  # GitHub Gists (code snippets)
    kaggle = "kaggle"  # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow"  # Stack Exchange Q&A

    # "Block 2" platform members. Each name matches a crawler key in
    # CRAWLER_FORMAT_MAP.
    documentation = "documentation"
    paperswithcode = "paperswithcode"
    semantic_scholar = "semantic_scholar"
    distill = "distill"
    observablehq = "observablehq"
    sketchfab = "sketchfab"
    freesound = "freesound"
    wolfram = "wolfram"

    api_docs = "api_docs"  # API references (Read the Docs)
    dataset = "dataset"  # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo = "repo"  # generic code repository

    # ---- 3D & spatial ----
    # Python identifiers cannot start with a digit, hence the differing names.
    model_3d = "3d_model"  # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud = "pointcloud"  # LiDAR, 3D scans (.las, .pcd)
    cad = "cad"  # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric = "volumetric"  # medical imaging (MRI, CT; TCIA)

    # ---- Graph & visualization ----
    graph = "graph"  # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network = "network"  # force-directed graphs
    geo_map = "map"  # geographic visualizations (GeoJSON)

    # ---- Data & time-series ----
    timeseries = "timeseries"  # stock data, sensor logs
    dashboard = "dashboard"  # Tableau, Looker, Metabase dashboards
    dataframe = "dataframe"  # Pandas/Polars dataframe snapshots

    # ---- Assessment ----
    flashcards = "flashcards"  # spaced repetition (Anki decks)
    problem_set = "problem_set"  # homework problems + solutions
    lab = "lab"  # hands-on lab exercises
    exam = "exam"  # practice / certification exams


# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================

# Group name -> member formats, in the order they should be considered.
#
# NOTE(review): SourceFormat.ar, SourceFormat.vr and the eight platform
# members (documentation, paperswithcode, semantic_scholar, distill,
# observablehq, sketchfab, freesound, wolfram) appear in no group here.
# Confirm whether that is intentional.
FORMAT_GROUPS: Dict[str, List[SourceFormat]] = {
    "textual": [
        SourceFormat.pdf,
        SourceFormat.markdown,
        SourceFormat.html,
        SourceFormat.text,
        SourceFormat.epub,
        SourceFormat.latex,
        SourceFormat.xml,
        SourceFormat.docx,
    ],
    "visual": [
        SourceFormat.image,
        SourceFormat.image_set,
        SourceFormat.svg,
        SourceFormat.infographic,
        SourceFormat.diagram,
        SourceFormat.screenshot,
    ],
    "audio": [
        SourceFormat.audio,
        SourceFormat.transcript,
        SourceFormat.podcast,
    ],
    "video": [
        SourceFormat.video,
        SourceFormat.video_playlist,
        SourceFormat.live,
        SourceFormat.animation,
    ],
    "interactive": [
        SourceFormat.jupyter,
        SourceFormat.colab,
        SourceFormat.rmarkdown,
        SourceFormat.sandbox,
        SourceFormat.simulation,
        SourceFormat.quiz,
        SourceFormat.game,
    ],
    "code": [
        SourceFormat.github,
        SourceFormat.github_discussion,
        SourceFormat.gist,
        SourceFormat.kaggle,
        SourceFormat.stackoverflow,
        SourceFormat.api_docs,
        SourceFormat.dataset,
        SourceFormat.repo,
    ],
    "spatial": [
        SourceFormat.model_3d,
        SourceFormat.model_3d_interactive,
        SourceFormat.pointcloud,
        SourceFormat.cad,
        SourceFormat.volumetric,
    ],
    "graph": [
        SourceFormat.graph,
        SourceFormat.knowledge_graph,
        SourceFormat.network,
        SourceFormat.geo_map,
    ],
    "data": [
        SourceFormat.timeseries,
        SourceFormat.dashboard,
        SourceFormat.dataframe,
    ],
    "assessment": [
        SourceFormat.flashcards,
        SourceFormat.problem_set,
        SourceFormat.lab,
        SourceFormat.exam,
    ],
}


# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — key-less crawlers first, then API-key crawlers)
# ============================================================

# Crawler key -> formats that crawler is expected to yield.
CRAWLER_FORMAT_MAP: Dict[str, List[SourceFormat]] = {
    # --- Bulk / protocol access: zero API key ---
    "common_crawl": [SourceFormat.html, SourceFormat.pdf, SourceFormat.markdown],
    "arxiv": [SourceFormat.pdf, SourceFormat.latex, SourceFormat.html],
    "gharchive": [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter],
    "stackoverflow": [SourceFormat.stackoverflow, SourceFormat.html],
    "wikipedia": [
        SourceFormat.html,
        SourceFormat.markdown,
        SourceFormat.image,
        SourceFormat.knowledge_graph,
    ],
    "openlibrary": [SourceFormat.epub, SourceFormat.pdf, SourceFormat.html],
    "libgen": [SourceFormat.pdf, SourceFormat.epub, SourceFormat.docx],
    "mit_ocw": [
        SourceFormat.html,
        SourceFormat.pdf,
        SourceFormat.video,
        SourceFormat.problem_set,
    ],
    "huggingface": [
        SourceFormat.dataset,
        SourceFormat.jupyter,
        SourceFormat.markdown,
        SourceFormat.dataframe,
    ],
    "podcast": [SourceFormat.podcast, SourceFormat.audio, SourceFormat.transcript],
    # --- API-key crawlers (existing) ---
    "github": [
        SourceFormat.github,
        SourceFormat.markdown,
        SourceFormat.jupyter,
        SourceFormat.dataset,
        SourceFormat.gist,
        SourceFormat.repo,
    ],
    "kaggle": [SourceFormat.kaggle, SourceFormat.jupyter, SourceFormat.dataset],
    "youtube": [
        SourceFormat.video,
        SourceFormat.video_playlist,
        SourceFormat.transcript,
    ],
    # --- Block 2 platforms ---
    "paperswithcode": [SourceFormat.pdf, SourceFormat.github],
    "documentation": [SourceFormat.html],
    "semantic_scholar": [SourceFormat.pdf, SourceFormat.html],
    "distill": [SourceFormat.html, SourceFormat.simulation],
    "observablehq": [SourceFormat.sandbox, SourceFormat.html],
    "sketchfab": [SourceFormat.model_3d, SourceFormat.model_3d_interactive],
    "freesound": [SourceFormat.audio],
    "wolfram": [SourceFormat.simulation, SourceFormat.sandbox],
}


# ============================================================
# OTHER ENUMS
# ============================================================

class LearningStyle(str, Enum):
    """Learner modality accepted by ``DiscoveryRequest.learning_style``."""

    visual = "visual"
    textual = "textual"
    kinesthetic = "kinesthetic"
    auditory = "auditory"


class OutputFormat(str, Enum):
    """Response rendering accepted by ``DiscoveryRequest.output``."""

    json = "json"
    html = "html"
    embeddings = "embeddings"
    pdf = "pdf"
    streaming = "streaming"


# ============================================================
# REQUEST MODELS
# ============================================================

class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery.

    Validation beyond the declared field constraints:

    * ``topic`` is stripped of surrounding whitespace, and the
      2-character minimum is re-checked on the stripped value.
    * ``output`` is checked against ``OutputFormat`` before enum coercion
      so an unknown value is rejected with a message listing valid values.
    """

    topic: str = Field(..., min_length=2, max_length=200)
    difficulty: int = Field(..., ge=1, le=5)

    # Formats to search when the caller does not specify any.
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [SourceFormat.pdf, SourceFormat.video,
                                 SourceFormat.github, SourceFormat.jupyter]
    )

    prerequisites: List[str] = Field(default_factory=list)
    learning_style: Optional[LearningStyle] = None
    language: str = Field(default="en")
    max_results: int = Field(default=10, ge=1, le=50)

    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )

    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)

    min_freshness: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )

    @validator("topic")
    def normalize_topic(cls, v):
        """Strip surrounding whitespace and re-check the minimum length.

        Raises:
            ValueError: if the stripped topic is empty or shorter than
                two characters.
        """
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        # Field(min_length=2) is applied to the raw value before this
        # validator runs, so padded input such as " a" would otherwise be
        # accepted and stored as a one-character topic.
        if len(v) < 2:
            raise ValueError(
                "Topic must be at least 2 characters after trimming whitespace"
            )
        return v

    @validator("output", pre=True)
    def validate_output(cls, v):
        """Reject unknown output formats with an explicit message.

        Runs before enum coercion (``pre=True``), so ``v`` is the raw
        input. ``OutputFormat`` members also pass, because they compare
        equal to their string values.

        Raises:
            ValueError: if ``v`` is not one of the ``OutputFormat`` values.
        """
        valid = [e.value for e in OutputFormat]
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v


# ============================================================
# SOURCE MODELS
# ============================================================

class SourceLink(BaseModel):
    """One retrievable location for a source, tagged with its format."""

    type: str
    url: str
    format: SourceFormat
    size_bytes: Optional[int] = None
    # Expected values: direct | git_clone | streaming | torrent | api
    access_method: str = "direct"


class Source(BaseModel):
    """A discovered source together with its scoring and metadata."""

    # --- Identity ---
    id: str
    title: str
    authors: List[str] = Field(default_factory=list)

    # --- Scores (bounds enforced by the Field constraints) ---
    quality_score: float = Field(..., ge=0, le=10)
    pedagogical_fit: float = Field(..., ge=0, le=1)
    difficulty: int = Field(..., ge=1, le=5)

    # --- Content description ---
    links: List[SourceLink]
    formats: List[SourceFormat]
    retraction_status: Optional[Dict[str, Any]] = None
    related_sources: Optional[Dict[str, Any]] = None
    summary: str = ""
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = "en"

    # --- Quality and freshness signals ---
    citation_count: int = 0
    peer_reviewed: bool = False
    open_access: bool = True

    decay_report: Optional[Dict[str, Any]] = None
    # Exposed under the alias "_ranking_signals".
    ranking_signals: Optional[Dict[str, Any]] = Field(
        default=None,
        alias="_ranking_signals",
    )

    publication_date: Optional[datetime] = None
    last_updated: Optional[datetime] = None

    # --- Engagement metrics ---
    views: Optional[int] = None
    likes: Optional[int] = None
    rating: Optional[float] = Field(default=None, ge=0, le=5)
    stars: Optional[int] = None
    forks: Optional[int] = None
    downloads: Optional[int] = None

    # --- Platform ---
    source_platform: str
    thumbnail_url: Optional[str] = None
    url: str

    # --- Format-specific metadata ---
    duration_seconds: Optional[int] = None  # video / audio
    file_size_bytes: Optional[int] = None  # pdf / epub / dataset
    page_count: Optional[int] = None  # pdf / epub
    kernel_type: Optional[str] = None  # jupyter (Python / R / Julia)
    dataset_rows: Optional[int] = None  # dataset
    dataset_cols: Optional[int] = None  # dataset
    license: Optional[str] = None  # dataset / repo / epub

    class Config:
        # Lets ranking_signals be populated by field name as well as alias.
        # NOTE(review): populate_by_name is the Pydantic v2 setting name;
        # confirm the installed Pydantic version honours it.
        populate_by_name = True


# ============================================================
# RESPONSE MODELS
# ============================================================

class DiscoveryResponse(BaseModel):
    """Discovery result: sources plus cache, paging and decay aggregates."""

    query: str
    domain: Optional[str] = None
    total_found: int
    sources: List[Source]

    # Per-format result counts.
    formats_found: Dict[str, int] = Field(default_factory=dict)

    cache_hit: bool
    processing_time_ms: float

    # Paging
    page: int = 1
    total_pages: int = 1

    # --- Enterprise temporal risk fields (v2.2) ---
    # First-class fields, so enterprise clients need no adapter derivation.
    # Intended usage (Dwayne's pattern):
    #     temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        ),
    )
    avg_decay_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Mean decay score across all sources. Use for trend monitoring.",
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the stale tail source."
        ),
    )
    stale_count: int = Field(
        default=0,
        description="Number of sources labeled 'stale' or 'decayed' in this response.",
    )


class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for consumption by AI systems.

    This is the v1 contract output for serious clients.
    """

    # --- Identity ---
    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat

    # --- Scores ---
    quality_score: float
    pedagogical_fit: float
    freshness_score: float

    # --- Provenance ---
    authors: List[str]
    # NOTE(review): Optional without a default. Whether the caller may omit
    # these two fields depends on the Pydantic major version; confirm.
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool

    # --- Embedding (populated when output=embeddings) ---
    embedding: Optional[List[float]] = None

    # --- Raw content for downstream use ---
    summary: str
    tags: List[str]


class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.

    Matches the locked contract from the planning doc.
    """

    knowledge_objects: List[KnowledgeObject]
    # Aggregate scoring breakdown.
    scores: Dict[str, float]
    # All URLs, in order.
    source_links: List[str]
    last_updated: datetime
    # One of "high" | "medium" | "low".
    confidence_level: str
    domain: Optional[str]
    formats_distribution: Dict[str, int]


class CacheStats(BaseModel):
    """Cache statistics: size, memory use, hit/miss counters, TTL spread."""

    # --- Size and memory ---
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float

    # --- Effectiveness ---
    # Constrained to the closed interval [0, 1].
    hit_rate: float = Field(..., ge=0, le=1)
    hits: int
    misses: int
    evictions: int

    ttl_distribution: Dict[str, int] = Field(default_factory=dict)


class ErrorResponse(BaseModel):
    """Error payload with an optional detail mapping and a creation time."""

    error: str
    message: str
    details: Optional[Dict[str, Any]] = None
    # Timezone-aware UTC. datetime.utcnow() is deprecated since Python 3.12
    # and returns a naive value that downstream code can misread as local
    # time.
    timestamp: datetime = Field(
        default_factory=lambda: datetime.now(timezone.utc)
    )