"""
SPARKNET API Schemas
Pydantic models for request/response validation.
"""
| |
|
from datetime import datetime
from enum import Enum
from typing import Any, Dict, List, Optional

from pydantic import BaseModel, ConfigDict, Field
| |
|
| |
|
| | |
| |
|
class DocumentStatus(str, Enum):
    # String-valued enum of document states. Because of the ``str`` mixin each
    # member compares equal to its plain lowercase value (e.g. "pending").
    # NOTE(review): the order in which a document moves through these states
    # is not defined in this file — confirm against the processing code.
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    INDEXED = "indexed"
    ERROR = "error"
| |
|
| |
|
class QueryIntentType(str, Enum):
    # String-valued enum of query intent labels. The ``str`` mixin makes each
    # member compare equal to its lowercase value (e.g. "multi_hop").
    FACTOID = "factoid"
    COMPARISON = "comparison"
    AGGREGATION = "aggregation"
    CAUSAL = "causal"
    PROCEDURAL = "procedural"
    DEFINITION = "definition"
    LIST = "list"
    MULTI_HOP = "multi_hop"
| |
|
| |
|
class AnswerFormat(str, Enum):
    # String-valued enum of answer layouts. The ``str`` mixin makes each
    # member compare equal to its lowercase value (e.g. "bullet_points").
    PROSE = "prose"
    BULLET_POINTS = "bullet_points"
    TABLE = "table"
    STEP_BY_STEP = "step_by_step"
| |
|
| |
|
| | |
| |
|
class DocumentUploadResponse(BaseModel):
    """Response after uploading a document."""

    # from_attributes=True lets the model be validated from an arbitrary
    # object's attributes (model_validate(obj)), not only from mappings.
    model_config = ConfigDict(from_attributes=True)

    # The Ellipsis default marks each of these fields as required.
    doc_id: str = Field(..., description="Unique document identifier")
    filename: str = Field(..., description="Original filename")
    status: DocumentStatus = Field(..., description="Document status")
    message: str = Field(..., description="Status message")
    # Filled in at construction time when omitted. datetime.now() without a tz
    # argument yields a timezone-naive local timestamp.
    # NOTE(review): consider a timezone-aware default if clients compare
    # timestamps across hosts — left unchanged to keep the wire format stable.
    created_at: datetime = Field(default_factory=datetime.now)
| |
|
| |
|
class DocumentMetadata(BaseModel):
    """Document metadata information."""

    # from_attributes=True lets the model be validated from an arbitrary
    # object's attributes (model_validate(obj)), not only from mappings.
    model_config = ConfigDict(from_attributes=True)

    # Required identification fields.
    doc_id: str
    filename: str
    file_type: str

    # Size counters; each defaults to 0 when not supplied.
    page_count: int = 0
    chunk_count: int = 0
    text_length: int = 0

    # Required status, plus indexing flags that default to "not indexed".
    status: DocumentStatus
    indexed: bool = False
    indexed_chunks: int = 0

    # Optional timing value; None when absent.
    # NOTE(review): unit (seconds vs. ms) is not stated in this file.
    processing_time: Optional[float] = None

    # created_at is required; updated_at defaults to None.
    created_at: datetime
    updated_at: Optional[datetime] = None
| |
|
| |
|
class DocumentResponse(BaseModel):
    """Full document response with metadata."""

    # from_attributes=True lets the model be validated from an arbitrary
    # object's attributes (model_validate(obj)), not only from mappings.
    model_config = ConfigDict(from_attributes=True)

    # Required top-level fields.
    doc_id: str
    filename: str
    file_type: str
    status: DocumentStatus
    # Nested, required metadata model.
    metadata: DocumentMetadata
    # Both text fields are optional and default to None.
    raw_text: Optional[str] = Field(None, description="Full extracted text (if requested)")
    preview: Optional[str] = Field(None, description="Text preview (first 500 chars)")
| |
|
| |
|
class ChunkInfo(BaseModel):
    """Information about a document chunk."""

    # from_attributes=True lets the model be validated from an arbitrary
    # object's attributes (model_validate(obj)), not only from mappings.
    model_config = ConfigDict(from_attributes=True)

    # Required fields.
    chunk_id: str
    doc_id: str
    text: str
    # Defaults to "text" when no type is given.
    chunk_type: str = "text"
    # Optional page number; None when absent.
    page_num: Optional[int] = None
    # Defaults to 1.0; no range constraint is enforced on this field.
    confidence: float = 1.0
    # Optional mapping of string keys to floats.
    # NOTE(review): presumably bounding-box coordinates — the expected key set
    # is not defined in this file.
    bbox: Optional[Dict[str, float]] = None
    # default_factory gives every instance its own empty dict.
    metadata: Dict[str, Any] = Field(default_factory=dict)
| |
|
| |
|
class ChunksResponse(BaseModel):
    """Response containing document chunks."""

    # All three fields are required.
    doc_id: str
    # Supplied by the caller; not derived from len(chunks) by this model.
    total_chunks: int
    chunks: List[ChunkInfo]
| |
|
| |
|
class OCRRegionInfo(BaseModel):
    """OCR region information."""

    # Every field is required; none has a default.
    region_id: str
    text: str
    # No range constraint is enforced on this field.
    confidence: float
    page_num: int
    # Mapping of string keys to floats.
    # NOTE(review): presumably bounding-box coordinates — the expected key set
    # is not defined in this file.
    bbox: Dict[str, float]
| |
|
| |
|
class LayoutRegionInfo(BaseModel):
    """Layout region information."""

    # Every field is required; none has a default.
    region_id: str
    # Free-form string; the set of valid region types is not defined here.
    region_type: str
    # No range constraint is enforced on this field.
    confidence: float
    page_num: int
    # Mapping of string keys to floats.
    # NOTE(review): presumably bounding-box coordinates — the expected key set
    # is not defined in this file.
    bbox: Dict[str, float]
| |
|
| |
|
class DocumentDetailResponse(BaseModel):
    """Detailed document response with all extracted data."""

    # Required fields.
    doc_id: str
    filename: str
    status: DocumentStatus
    metadata: DocumentMetadata
    chunks: List[ChunkInfo]
    # Region lists default to a fresh empty list per instance.
    ocr_regions: List[OCRRegionInfo] = Field(default_factory=list)
    layout_regions: List[LayoutRegionInfo] = Field(default_factory=list)
| |
|
| |
|
| | |
| |
|
class QueryRequest(BaseModel):
    """RAG query request."""

    # Required; must be 1..2000 characters long.
    query: str = Field(..., min_length=1, max_length=2000, description="Query text")
    # Optional filter; None when not supplied.
    doc_ids: Optional[List[str]] = Field(None, description="Filter by document IDs")
    # Defaults to 5; validated to the inclusive range 1..20.
    top_k: int = Field(5, ge=1, le=20, description="Number of chunks to retrieve")
    # Defaults to prose output.
    answer_format: AnswerFormat = Field(AnswerFormat.PROSE, description="Desired answer format")
    # Defaults to True.
    include_sources: bool = Field(True, description="Include source citations")
    # Defaults to 0.5; validated to the inclusive range 0.0..1.0.
    min_confidence: float = Field(0.5, ge=0.0, le=1.0, description="Minimum confidence threshold")
    # Defaults to True.
    use_cache: bool = Field(True, description="Use cached results if available")
| |
|
| |
|
class Citation(BaseModel):
    """Citation/source reference."""

    # Required integer citation number.
    citation_id: int = Field(..., description="Citation number [1], [2], etc.")
    # Required source identification and text.
    doc_id: str
    document_name: str
    chunk_id: str
    chunk_text: str
    # Optional page number; None when absent.
    page_num: Optional[int] = None
    # Required; no range constraint is enforced on this field.
    relevance_score: float
    # Optional mapping of string keys to floats.
    # NOTE(review): presumably bounding-box coordinates — the expected key set
    # is not defined in this file.
    bbox: Optional[Dict[str, float]] = None
| |
|
| |
|
class QueryPlan(BaseModel):
    """Query planning information."""

    # Required intent label.
    intent: QueryIntentType
    # Both lists default to a fresh empty list per instance.
    sub_queries: List[str] = Field(default_factory=list)
    keywords: List[str] = Field(default_factory=list)
    # Defaults to "hybrid"; the set of other accepted strategy names is not
    # constrained by this model.
    strategy: str = "hybrid"
| |
|
| |
|
class RAGResponse(BaseModel):
    """Complete RAG response."""

    # Required fields.
    query: str
    answer: str
    # Required; validated to the inclusive range 0.0..1.0.
    confidence: float = Field(..., ge=0.0, le=1.0)
    # Defaults to a fresh empty list per instance.
    citations: List[Citation] = Field(default_factory=list)
    # Defaults to 0; not derived from len(citations) by this model.
    source_count: int = 0
    # Optional planning details; None when absent.
    query_plan: Optional[QueryPlan] = None
    # Defaults to False.
    from_cache: bool = False
    # Optional free-form mapping; its schema is not defined in this file.
    validation: Optional[Dict[str, Any]] = None
    # Optional latency value; None when absent.
    latency_ms: Optional[float] = None
    # Defaults to 0.
    revision_count: int = 0
| |
|
| |
|
class SearchRequest(BaseModel):
    """Semantic search request."""

    # Required; must be 1..1000 characters long.
    query: str = Field(..., min_length=1, max_length=1000)
    # Optional list of document IDs; None when not supplied.
    doc_ids: Optional[List[str]] = None
    # Defaults to 10; validated to the inclusive range 1..50.
    top_k: int = Field(10, ge=1, le=50)
    # Defaults to 0.0; validated to the inclusive range 0.0..1.0.
    min_score: float = Field(0.0, ge=0.0, le=1.0)
| |
|
| |
|
class SearchResult(BaseModel):
    """Single search result."""

    # Required fields.
    chunk_id: str
    doc_id: str
    document_name: str
    text: str
    # Required; no range constraint is enforced on this field.
    score: float
    # Optional page number; None when absent.
    page_num: Optional[int] = None
    # Defaults to "text" when no type is given.
    chunk_type: str = "text"
| |
|
| |
|
class SearchResponse(BaseModel):
    """Search response with results."""

    # Every field is required; none has a default.
    query: str
    # Supplied by the caller; not derived from len(results) by this model.
    total_results: int
    results: List[SearchResult]
    latency_ms: float
| |
|
| |
|
| | |
| |
|
class IndexRequest(BaseModel):
    """Request to index a document."""

    # Required.
    doc_id: str = Field(..., description="Document ID to index")
    # Defaults to False.
    force_reindex: bool = Field(False, description="Force reindexing if already indexed")
| |
|
| |
|
class IndexResponse(BaseModel):
    """Indexing response."""

    # Every field is required; none has a default.
    doc_id: str
    # Plain string, not a DocumentStatus member; accepted values are not
    # constrained by this model.
    status: str
    chunks_indexed: int
    message: str
| |
|
| |
|
class BatchIndexRequest(BaseModel):
    """Batch indexing request."""

    # Required list of document IDs. No minimum length is enforced, so an
    # empty list passes validation.
    doc_ids: List[str]
    # Defaults to False.
    force_reindex: bool = False
| |
|
| |
|
class BatchIndexResponse(BaseModel):
    """Batch indexing response."""

    # Every field is required. The three counters are supplied by the caller;
    # this model does not check that they agree with ``results``.
    total_requested: int
    successful: int
    failed: int
    # One IndexResponse entry per element supplied by the caller.
    results: List[IndexResponse]
| |
|
| |
|
| | |
| |
|
class HealthResponse(BaseModel):
    """Health check response."""

    # Required. The description lists the intended values, but the field is a
    # plain str, so other strings are not rejected by validation.
    status: str = Field(..., description="healthy, degraded, or unhealthy")
    # Required version string.
    version: str
    # Required mapping of string keys to booleans.
    # NOTE(review): presumably component name -> is-healthy flag; verify
    # against the endpoint that builds it.
    components: Dict[str, bool]
| |
|
| |
|
class SystemStatus(BaseModel):
    """Detailed system status."""

    # Every field is required; none has a default.
    status: str
    version: str
    uptime_seconds: float
    # Mapping of string keys to booleans.
    components: Dict[str, bool]
    # Free-form mapping; its schema is not defined in this file.
    statistics: Dict[str, Any]
    # Mapping of string keys to string values.
    # NOTE(review): presumably role -> model name; verify against the caller.
    models: Dict[str, str]
| |
|
| |
|
class CollectionInfo(BaseModel):
    """Vector store collection information."""

    # Every field is required; none has a default and no numeric bounds are
    # enforced.
    name: str
    document_count: int
    chunk_count: int
    embedding_dimension: int
| |
|
| |
|
class StoreStatus(BaseModel):
    """Vector store status."""

    # Every field is required; none has a default.
    status: str
    collections: List[CollectionInfo]
    # Totals are supplied by the caller; this model does not compute them
    # from ``collections``.
    total_documents: int
    total_chunks: int
| |
|
| |
|
| | |
| |
|
class UserCreate(BaseModel):
    """User creation request."""

    # Required; must be 3..50 characters long.
    username: str = Field(..., min_length=3, max_length=50)
    # Required plain string — no e-mail format validation is applied here.
    # NOTE(review): consider validating the address format upstream or with
    # an e-mail type if the project adopts one.
    email: str
    # Required; at least 8 characters, with no upper bound enforced.
    password: str = Field(..., min_length=8)
| |
|
| |
|
class UserResponse(BaseModel):
    """User response (no password)."""

    # Required fields; note there is deliberately no password attribute.
    user_id: str
    username: str
    email: str
    # Defaults to True.
    is_active: bool = True
    # Required creation timestamp.
    created_at: datetime
| |
|
| |
|
class Token(BaseModel):
    """JWT token response."""

    # Required token string.
    access_token: str
    # Defaults to "bearer".
    token_type: str = "bearer"
    # Required integer.
    # NOTE(review): presumably a lifetime in seconds — the unit is not stated
    # in this file.
    expires_in: int
| |
|
| |
|
class TokenData(BaseModel):
    """Token payload data."""

    # Both identifiers are optional and default to None.
    username: Optional[str] = None
    user_id: Optional[str] = None
    # Defaults to a fresh empty list per instance.
    scopes: List[str] = Field(default_factory=list)
| |
|