# File size: 2,575 Bytes (revisions: 7d07e42, 149c85a)
from pydantic import BaseModel, Field, HttpUrl
from typing import List, Optional
# === Shared ===
class BBoxSchema(BaseModel):
x1: float
y1: float
x2: float
y2: float
width: float
height: float
class DetectionSchema(BaseModel):
label: str
confidence: float
bbox: BBoxSchema
class_id: int
class OCRBoxSchema(BaseModel):
text: str
confidence: float
bbox: list
# === Requests ===
class AnalyzeURLRequest(BaseModel):
url: str = Field(..., description="URL gambar yang akan dianalisis")
run_caption: bool = Field(True, description="Generate image caption")
run_detection: bool = Field(False, description="Deteksi objek dengan YOLO (off by default — opt-in)")
run_ocr: bool = Field(False, description="Ekstrak teks dari gambar")
classification_labels: Optional[List[str]] = Field(
None,
description="Label untuk zero-shot CLIP classification, e.g. ['kucing','anjing']",
example=["indoor", "outdoor", "nature", "city"],
)
class ClassifyRequest(BaseModel):
url: str
labels: List[str] = Field(..., min_length=2, description="Minimal 2 label kandidat")
class SimilarityRequest(BaseModel):
url: str
text: str = Field(..., min_length=1)
class VisualQARequest(BaseModel):
url: str
question: str = Field(..., description="Pertanyaan tentang isi gambar")
# === Responses ===
class CaptionResponse(BaseModel):
caption: str
model: str
class DetectionResponse(BaseModel):
detections: List[DetectionSchema]
count: int
labels_summary: dict
image_width: int
image_height: int
inference_time_ms: float
class ClassificationResponse(BaseModel):
top_label: str
top_score: float
labels: List[str]
probabilities: List[float]
class OCRResponse(BaseModel):
full_text: str
boxes: List[OCRBoxSchema]
word_count: int
language: str
engine: str
class FullAnalysisResponse(BaseModel):
image_width: int
image_height: int
source: str
caption: Optional[CaptionResponse] = None
detections: Optional[DetectionResponse] = None
classification: Optional[ClassificationResponse] = None
ocr: Optional[OCRResponse] = None
summary_text: str = Field(..., description="Ringkasan teks dari semua model — siap dipakai sebagai konteks LLM")
models_used: List[str]
total_latency_ms: float
class SimilarityResponse(BaseModel):
similarity_score: float
text: str
interpretation: str
class VisualQAResponse(BaseModel):
question: str
answer: str
|