File size: 17,718 Bytes
3acb982
0733aae
 
 
3acb982
dc6eb4c
3acb982
 
 
 
 
 
0733aae
 
 
dc6eb4c
3acb982
0733aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9142739
 
 
 
 
 
 
 
 
0733aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9142739
 
 
 
 
 
 
 
 
 
0733aae
 
 
 
 
 
3acb982
 
0733aae
 
dc6eb4c
0733aae
 
 
 
 
 
 
 
 
dc6eb4c
3acb982
0733aae
dc6eb4c
0733aae
3acb982
 
0733aae
dc6eb4c
 
 
 
3acb982
0733aae
 
3acb982
 
dc6eb4c
 
 
 
 
0733aae
 
 
 
 
 
 
 
 
7abb8fd
 
 
 
 
 
 
 
 
 
 
 
dc6eb4c
 
 
 
 
 
 
9795dbe
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dc6eb4c
0733aae
dc6eb4c
0733aae
3acb982
 
dc6eb4c
 
 
 
0733aae
3acb982
 
 
0733aae
3acb982
dc6eb4c
 
 
 
 
 
 
 
 
 
d64545e
 
dc6eb4c
 
 
 
 
54ce8e5
dc6eb4c
 
 
54ce8e5
 
 
dc6eb4c
 
 
 
0733aae
 
 
 
 
 
 
dc6eb4c
 
 
 
 
 
0733aae
 
 
 
 
 
 
 
 
54ce8e5
 
 
dc6eb4c
0733aae
dc6eb4c
0733aae
3acb982
 
dc6eb4c
0733aae
dc6eb4c
 
 
e5cdd9c
0733aae
dc6eb4c
 
 
 
 
3acb982
e5cdd9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3acb982
0733aae
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3acb982
dc6eb4c
 
 
 
 
 
 
 
 
3acb982
 
 
dc6eb4c
 
 
0733aae
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
"""
Knowledge Universe API — Pydantic Models
Full multi-format taxonomy: 50+ source types, no API keys required
Rick's architecture: every format that exists on the internet
"""

from datetime import datetime, timezone
from enum import Enum
from typing import List, Optional, Dict, Any

from pydantic import BaseModel, Field, validator


# ============================================================
# MASTER FORMAT ENUM — Every format type on the internet
# ============================================================

class SourceFormat(str, Enum):
    """
    Complete taxonomy of discoverable content types.
    Maps to crawlers, scoring logic, and UI rendering.
    All lowercase strings so JSON round-trips safely.
    """

    # Member name and value are the same string except where noted below
    # (model_3d, model_3d_interactive, geo_map).

    # ---------- TEXTUAL (8) ----------
    pdf          = "pdf"           # Academic papers, textbooks (arXiv, LibGen)
    markdown     = "markdown"      # GitHub READMEs, wikis, GitBook
    html         = "html"          # Web articles, tutorials, MIT OCW
    text         = "text"          # Raw plain-text documents
    epub         = "epub"          # E-books (Open Library, Standard Ebooks)
    docx         = "docx"          # Word documents
    latex        = "latex"         # LaTeX source (arXiv raw)
    xml          = "xml"           # Structured documents, OAI-PMH feeds

    # ---------- VISUAL (6) ----------
    image        = "image"         # PNG, JPG, WebP (Wikimedia Commons, Unsplash)
    image_set    = "image_set"     # Slide decks, image collections
    svg          = "svg"           # Vector diagrams (ObservableHQ, D3 gallery)
    infographic  = "infographic"   # Data visualizations, posters
    diagram      = "diagram"       # Flowcharts, UML, network diagrams
    screenshot   = "screenshot"    # Code screenshots, UI mockups

    # ---------- AUDIO (3) ----------
    audio        = "audio"         # MP3, WAV, M4A — podcast lectures
    transcript   = "transcript"    # Timestamped transcriptions (YouTube captions)
    podcast      = "podcast"       # Podcast RSS feed metadata (Podcast Index)

    # ---------- VIDEO (4) ----------
    video        = "video"         # MP4, WebM, YouTube, Vimeo
    video_playlist = "video_playlist"  # YouTube playlist / course series
    live         = "live"          # Live streams, webinars
    animation    = "animation"     # GIF, animated SVG, Manim exports

    # ---------- INTERACTIVE (7) ----------
    jupyter      = "jupyter"       # .ipynb (Kaggle, GitHub, Binder)
    colab        = "colab"         # Google Colab notebooks
    rmarkdown    = "rmarkdown"     # R Markdown / Quarto notebooks
    sandbox      = "sandbox"       # CodePen, Replit, JSFiddle
    simulation   = "simulation"    # Physics / ML simulations
    quiz         = "quiz"          # Interactive quizzes
    game         = "game"          # Educational games

    # ---------- AR / VR (2) ----------
    ar           = "ar"            # Augmented reality experiences
    vr           = "vr"            # VR simulations

    # ---------- CODE & PLATFORM (16) ----------
    github       = "github"        # GitHub repositories (GH Archive)
    github_discussion = "github_discussion"  # Issues, Discussions, Q&A
    gist         = "gist"          # GitHub Gists (code snippets)
    kaggle       = "kaggle"        # Kaggle notebooks + datasets
    stackoverflow = "stackoverflow" # Stack Exchange Q&A
    # ── Block 2 new platforms ──────────────────────────────────
    # These eight values share their names with keys of CRAWLER_FORMAT_MAP.
    documentation    = "documentation"
    paperswithcode   = "paperswithcode"
    semantic_scholar = "semantic_scholar"
    distill          = "distill"
    observablehq     = "observablehq"
    sketchfab        = "sketchfab"
    freesound        = "freesound"
    wolfram          = "wolfram"
    api_docs     = "api_docs"      # API references (Read the Docs)
    dataset      = "dataset"       # CSV, Parquet, HDF5 (HuggingFace, OpenML)
    repo         = "repo"          # Generic code repository

    # ---------- 3D & SPATIAL (5) ----------
    # The first two member names carry a "model_" prefix; their serialized
    # values are "3d_model" and "3d_interactive".
    model_3d     = "3d_model"      # GLTF, OBJ, FBX (Sketchfab, Thingiverse)
    model_3d_interactive = "3d_interactive"  # Three.js, Babylon.js scenes
    pointcloud   = "pointcloud"    # LiDAR, 3D scans (.las, .pcd)
    cad          = "cad"           # AutoCAD, SolidWorks (.dwg, .stp)
    volumetric   = "volumetric"    # Medical imaging (MRI, CT — TCIA)

    # ---------- GRAPH & VISUALIZATION (4) ----------
    graph        = "graph"         # D3.js, Plotly, ObservableHQ
    knowledge_graph = "knowledge_graph"  # RDF, Wikidata, semantic networks
    network      = "network"       # Force-directed graphs
    # Member is named geo_map but serializes as "map".
    geo_map      = "map"           # Geographic visualizations (GeoJSON)

    # ---------- DATA & TIME-SERIES (3) ----------
    timeseries   = "timeseries"    # Stock data, sensor logs
    dashboard    = "dashboard"     # Tableau, Looker, Metabase dashboards
    dataframe    = "dataframe"     # Pandas/Polars dataframe snapshots

    # ---------- ASSESSMENT (4) ----------
    flashcards   = "flashcards"    # Spaced repetition (Anki decks)
    problem_set  = "problem_set"   # Homework problems + solutions
    lab          = "lab"           # Hands-on lab exercises
    exam         = "exam"          # Practice / certification exams


# ============================================================
# FORMAT GROUPS — used for crawler routing
# ============================================================

# Maps a group name to the SourceFormat members that belong to it.
# NOTE(review): SourceFormat.ar, SourceFormat.vr and the eight "Block 2"
# members (documentation, paperswithcode, semantic_scholar, distill,
# observablehq, sketchfab, freesound, wolfram) are not listed in any group
# here — confirm whether that omission is intentional.
FORMAT_GROUPS = {
    "textual":     [SourceFormat.pdf, SourceFormat.markdown, SourceFormat.html,
                    SourceFormat.text, SourceFormat.epub, SourceFormat.latex,
                    SourceFormat.xml, SourceFormat.docx],

    "visual":      [SourceFormat.image, SourceFormat.image_set, SourceFormat.svg,
                    SourceFormat.infographic, SourceFormat.diagram, SourceFormat.screenshot],

    "audio":       [SourceFormat.audio, SourceFormat.transcript, SourceFormat.podcast],

    "video":       [SourceFormat.video, SourceFormat.video_playlist,
                    SourceFormat.live, SourceFormat.animation],

    "interactive": [SourceFormat.jupyter, SourceFormat.colab, SourceFormat.rmarkdown,
                    SourceFormat.sandbox, SourceFormat.simulation,
                    SourceFormat.quiz, SourceFormat.game],

    "code":        [SourceFormat.github, SourceFormat.github_discussion,
                    SourceFormat.gist, SourceFormat.kaggle,
                    SourceFormat.stackoverflow, SourceFormat.api_docs,
                    SourceFormat.dataset, SourceFormat.repo],

    "spatial":     [SourceFormat.model_3d, SourceFormat.model_3d_interactive,
                    SourceFormat.pointcloud, SourceFormat.cad, SourceFormat.volumetric],

    "graph":       [SourceFormat.graph, SourceFormat.knowledge_graph,
                    SourceFormat.network, SourceFormat.geo_map],

    "data":        [SourceFormat.timeseries, SourceFormat.dashboard, SourceFormat.dataframe],

    "assessment":  [SourceFormat.flashcards, SourceFormat.problem_set,
                    SourceFormat.lab, SourceFormat.exam],
}


# ============================================================
# CRAWLER → FORMAT ROUTING MAP
# (Which crawler handles which formats — no API keys)
# ============================================================

# Maps a crawler name to the list of SourceFormat members it can produce.
CRAWLER_FORMAT_MAP = {
    # bulk / protocol access — zero API key
    "common_crawl":    [SourceFormat.html, SourceFormat.pdf, SourceFormat.markdown],
    "arxiv":           [SourceFormat.pdf, SourceFormat.latex, SourceFormat.html],
    "gharchive":       [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter],
    "stackoverflow":   [SourceFormat.stackoverflow, SourceFormat.html],
    "wikipedia":       [SourceFormat.html, SourceFormat.markdown, SourceFormat.image,
                        SourceFormat.knowledge_graph],
    "openlibrary":     [SourceFormat.epub, SourceFormat.pdf, SourceFormat.html],
    "libgen":          [SourceFormat.pdf, SourceFormat.epub, SourceFormat.docx],
    "mit_ocw":         [SourceFormat.html, SourceFormat.pdf, SourceFormat.video,
                        SourceFormat.problem_set],
    "huggingface":     [SourceFormat.dataset, SourceFormat.jupyter, SourceFormat.markdown,
                        SourceFormat.dataframe],
    "podcast":         [SourceFormat.podcast, SourceFormat.audio, SourceFormat.transcript],

    # API-key crawlers (existing)
    "github":          [SourceFormat.github, SourceFormat.markdown, SourceFormat.jupyter,
                        SourceFormat.dataset, SourceFormat.gist, SourceFormat.repo],
    "kaggle":          [SourceFormat.kaggle, SourceFormat.jupyter, SourceFormat.dataset],
    "youtube":         [SourceFormat.video, SourceFormat.video_playlist, SourceFormat.transcript],

    # Block 2 platform crawlers. Each key matches a SourceFormat member name,
    # but the mapped lists use the generic formats rather than that member.
    "paperswithcode":  [SourceFormat.pdf, SourceFormat.github],
    "documentation":   [SourceFormat.html],
    "semantic_scholar":[SourceFormat.pdf, SourceFormat.html],
    "distill":         [SourceFormat.html, SourceFormat.simulation],
    "observablehq":    [SourceFormat.sandbox, SourceFormat.html],
    "sketchfab":       [SourceFormat.model_3d, SourceFormat.model_3d_interactive],
    "freesound":       [SourceFormat.audio],
    "wolfram":         [SourceFormat.simulation, SourceFormat.sandbox],
}


# ============================================================
# OTHER ENUMS
# ============================================================

class LearningStyle(str, Enum):
    """Learning-style options accepted by ``DiscoveryRequest.learning_style``."""
    visual      = "visual"
    textual     = "textual"
    kinesthetic = "kinesthetic"
    auditory    = "auditory"


class OutputFormat(str, Enum):
    """Output representations accepted by ``DiscoveryRequest.output``."""
    json        = "json"
    html        = "html"
    embeddings  = "embeddings"
    pdf         = "pdf"
    streaming   = "streaming"


# ============================================================
# REQUEST MODELS
# ============================================================

class DiscoveryRequest(BaseModel):
    """Request model for multi-format source discovery"""

    # Search topic. The length bounds are checked against the raw input; the
    # ``normalize_topic`` validator then trims it and re-checks the lower bound.
    topic: str = Field(..., min_length=2, max_length=200)
    # Requested difficulty, 1 through 5 inclusive.
    difficulty: int = Field(..., ge=1, le=5)

    # Formats to search for; defaults to pdf, video, github and jupyter.
    formats: List[SourceFormat] = Field(
        default_factory=lambda: [SourceFormat.pdf, SourceFormat.video,
                                 SourceFormat.github, SourceFormat.jupyter]
    )

    prerequisites: List[str] = Field(default_factory=list)
    learning_style: Optional[LearningStyle] = None
    language: str = Field(default="en")
    max_results: int = Field(default=10, ge=1, le=50)

    # Domain lock (v1 scope)
    domain: Optional[str] = Field(
        default=None,
        description="ai_engineering | fintech_ai | None (unrestricted)"
    )

    # Output format for AI systems
    output: OutputFormat = Field(default=OutputFormat.json)

    min_freshness: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "Minimum freshness score (0.0–1.0). "
            "Sources with freshness below this value are excluded. "
            "Example: 0.5 = exclude anything more than half-decayed. "
            "Use 0.75 to only get fresh/recently-updated sources."
        )
    )

    @validator("topic")
    def normalize_topic(cls, v):
        """Trim surrounding whitespace and re-check the minimum length.

        The field's ``min_length`` constraint is evaluated on the untrimmed
        value, so padded input such as ``" a "`` would pass it and then shrink
        below the limit once stripped. The length is therefore checked again
        here on the trimmed value.

        Raises:
            ValueError: if the trimmed topic is empty or shorter than 2
                characters.
        """
        v = v.strip()
        if not v:
            raise ValueError("Topic cannot be empty")
        # Keep this bound in sync with ``min_length`` on the ``topic`` field.
        if len(v) < 2:
            raise ValueError(
                "Topic must be at least 2 characters after trimming whitespace"
            )
        return v

    @validator("output", pre=True)
    def validate_output(cls, v):
        """Reject unsupported output formats with a descriptive message.

        Runs before enum coercion (``pre=True``) so the raw input value is
        inspected and the error lists the accepted values.

        Raises:
            ValueError: if ``v`` does not equal any ``OutputFormat`` value.
        """
        valid = [e.value for e in OutputFormat]
        # OutputFormat subclasses str, so an enum member passed directly
        # compares equal to its value and is accepted here as well.
        if v not in valid:
            raise ValueError(
                f"Invalid output format '{v}'. "
                f"Valid values: {valid}. "
                f"Example: {{\"output\": \"embeddings\"}}"
            )
        return v


# ============================================================
# SOURCE MODELS
# ============================================================

class SourceLink(BaseModel):
    """One retrievable link for a source, with its format and access method."""
    type: str
    url: str
    format: SourceFormat
    size_bytes: Optional[int] = None
    access_method: str = "direct"   # direct | git_clone | streaming | torrent | api


class Source(BaseModel):
    """Discovered source with full metadata"""

    id: str
    title: str
    authors: List[str] = Field(default_factory=list)

    # Required scores and their validated ranges.
    quality_score: float = Field(..., ge=0, le=10)   # 0-10
    pedagogical_fit: float = Field(..., ge=0, le=1)  # 0-1
    difficulty: int = Field(..., ge=1, le=5)         # 1-5

    links: List[SourceLink]
    formats: List[SourceFormat]
    # Free-form dictionaries; their key schema is not defined in this module.
    retraction_status: Optional[Dict[str, Any]] = None
    related_sources: Optional[Dict[str, Any]] = None
    summary: str = ""
    prerequisites: List[str] = Field(default_factory=list)
    tags: List[str] = Field(default_factory=list)
    language: str = "en"

    # Quality & Freshness signals (NEW: Decay Report)
    citation_count: int = 0
    peer_reviewed: bool = False
    open_access: bool = True

    decay_report: Optional[Dict[str, Any]] = None
    # Exposed under the alias "_ranking_signals"; see Config below.
    ranking_signals: Optional[Dict[str, Any]] = Field(default=None, alias="_ranking_signals")

    publication_date: Optional[datetime] = None
    last_updated: Optional[datetime] = None

    # Engagement metrics
    views:    Optional[int]   = None
    likes:    Optional[int]   = None
    rating:   Optional[float] = Field(None, ge=0, le=5)
    stars:    Optional[int]   = None
    forks:    Optional[int]   = None
    downloads: Optional[int]  = None

    # Platform
    source_platform: str
    thumbnail_url: Optional[str] = None
    url: str

    # Format-specific metadata
    duration_seconds:  Optional[int]   = None   # video / audio
    file_size_bytes:   Optional[int]   = None   # pdf / epub / dataset
    page_count:        Optional[int]   = None   # pdf / epub
    kernel_type:       Optional[str]   = None   # jupyter (Python / R / Julia)
    dataset_rows:      Optional[int]   = None   # dataset
    dataset_cols:      Optional[int]   = None   # dataset
    license:           Optional[str]   = None   # dataset / repo / epub

    class Config:
        # Intended to let ``ranking_signals`` be populated by field name as
        # well as by its alias.
        # NOTE(review): ``populate_by_name`` is the pydantic v2 setting name,
        # while this module imports the v1-style ``validator`` — confirm which
        # pydantic major version is targeted.
        populate_by_name = True


# ============================================================
# RESPONSE MODELS
# ============================================================

class DiscoveryResponse(BaseModel):
    """Discovery result set with paging, cache info, and aggregate decay fields."""
    query: str
    domain: Optional[str] = None
    total_found: int
    sources: List[Source]

    # Count of results keyed by a string label (presumably the format value —
    # verify against the code that fills it).
    formats_found: Dict[str, int] = Field(default_factory=dict)

    cache_hit: bool
    processing_time_ms: float

    page: int = 1
    total_pages: int = 1

    # --- ENTERPRISE TEMPORAL RISK FIELDS (v2.2) ---
    # First-class fields for enterprise clients — no adapter derivation needed.
    # Dwayne's pattern: temporal_decay = max_decay_detected ?? avg_decay_score
    max_decay_detected: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description=(
            "The highest decay_score across all sources in this response. "
            "Use for boundary-stamped edge risk in TrustGraph pipelines. "
            "Range: 0.0 (all fresh) → 1.0 (worst source fully decayed). "
            "This is the field to use for propagation risk gates."
        )
    )
    avg_decay_score: Optional[float] = Field(
        default=None,
        ge=0.0,
        le=1.0,
        description="Mean decay score across all sources. Use for trend monitoring."
    )
    worst_source_id: Optional[str] = Field(
        default=None,
        description=(
            "Source ID with the highest decay_score. "
            "Use as TrustGraph edge label or tooltip to identify the stale tail source."
        )
    )
    stale_count: int = Field(
        default=0,
        description="Number of sources labeled 'stale' or 'decayed' in this response."
    )


class KnowledgeObject(BaseModel):
    """
    Enterprise-grade knowledge object for AI system consumption.
    This is the v1 contract output for serious clients.
    """
    source_id: str
    title: str
    url: str
    platform: str
    format: SourceFormat

    # Scores (no range constraints are declared on this model)
    quality_score: float
    pedagogical_fit: float
    freshness_score: float

    # Provenance
    # NOTE(review): publication_date and license are Optional but have no
    # default; whether they may be omitted depends on the pydantic major
    # version in use — confirm and add ``= None`` if omission should be allowed.
    authors: List[str]
    publication_date: Optional[datetime]
    license: Optional[str]
    open_access: bool

    # Embedding (populated when output=embeddings)
    embedding: Optional[List[float]] = None

    # Raw content for downstream use
    summary: str
    tags: List[str]


class EnterpriseResponse(BaseModel):
    """
    Full v1 contract response for enterprise clients.
    Matches the locked contract from the planning doc.
    """
    knowledge_objects: List[KnowledgeObject]
    scores: Dict[str, float]          # aggregate scoring breakdown
    source_links: List[str]            # all URLs in order
    last_updated: datetime
    confidence_level: str              # "high" | "medium" | "low"
    # NOTE(review): Optional without a default — whether this may be omitted
    # depends on the pydantic major version; DiscoveryResponse.domain uses
    # ``= None``. Confirm the intended contract.
    domain: Optional[str]
    formats_distribution: Dict[str, int]


class CacheStats(BaseModel):
    """Cache statistics: key count, memory usage, and hit/miss/eviction counters."""
    total_keys: int
    memory_used_mb: float
    memory_used_percent: float
    hit_rate: float = Field(..., ge=0, le=1)  # fraction, validated to 0-1

    hits: int
    misses: int
    evictions: int
    # Counts keyed by a string label; the labels are not defined in this module.
    ttl_distribution: Dict[str, int] = Field(default_factory=dict)


class ErrorResponse(BaseModel):
    """Error payload with a message, optional details, and a creation timestamp."""
    error: str
    message: str
    details: Optional[Dict[str, Any]] = None
    # Timezone-aware UTC time at which the instance is created.
    # ``datetime.utcnow`` is deprecated since Python 3.12 and returns a naive
    # datetime that carries no UTC marker, so ``datetime.now(timezone.utc)``
    # is used instead.
    timestamp: datetime = Field(default_factory=lambda: datetime.now(timezone.utc))