File size: 2,575 Bytes
7d07e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
149c85a
7d07e42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
from pydantic import BaseModel, Field, HttpUrl
from typing import List, Optional


# === Shared ===

# Bounding-box payload: two corner coordinates plus an explicit width and
# height, all floats. Nothing here validates that width/height agree with the
# corners or that x2 >= x1 / y2 >= y1.
# NOTE(review): presumably pixel units with (x1, y1) top-left and (x2, y2)
# bottom-right — confirm against the detector that fills this in.
class BBoxSchema(BaseModel):
    x1: float
    y1: float
    x2: float
    y2: float
    width: float
    height: float


# A single detection entry: class label, confidence score, bounding box and
# the numeric class id. Used as the element type of
# DetectionResponse.detections.
# NOTE(review): confidence is an unconstrained float; presumably in [0, 1] —
# confirm with the producer.
class DetectionSchema(BaseModel):
    label: str
    confidence: float
    bbox: BBoxSchema
    class_id: int


# A single OCR text region: the recognised text, its confidence score and the
# region geometry. Used as the element type of OCRResponse.boxes.
class OCRBoxSchema(BaseModel):
    text: str
    confidence: float
    # Deliberately or accidentally untyped: any list is accepted and its
    # elements are not validated.
    # NOTE(review): shape presumably depends on the OCR engine (e.g. corner
    # points) — confirm before tightening the annotation.
    bbox: list


# === Requests ===

# Request body for running the analysis steps on an image referenced by URL.
#
# Fields:
#   url                    required; URL of the image to analyse (plain str,
#                          no URL validation is performed by this schema).
#   run_caption            generate an image caption; defaults to True.
#   run_detection          run object detection; defaults to False (opt-in).
#   run_ocr                extract text from the image; defaults to False.
#   classification_labels  optional candidate labels for zero-shot
#                          classification; None (the default) means no labels
#                          were supplied.
#
# The description strings are user-facing (they appear in the generated API
# schema) and are intentionally left in their original language.
class AnalyzeURLRequest(BaseModel):
    url: str = Field(..., description="URL gambar yang akan dianalisis")
    run_caption: bool = Field(True, description="Generate image caption")
    run_detection: bool = Field(False, description="Deteksi objek dengan YOLO (off by default — opt-in)")
    run_ocr: bool = Field(False, description="Ekstrak teks dari gambar")
    classification_labels: Optional[List[str]] = Field(
        None,
        description="Label untuk zero-shot CLIP classification, e.g. ['kucing','anjing']",
        # Pydantic v2 deprecates arbitrary extra keyword arguments on Field
        # (such as `example=`) and warns on every import. `json_schema_extra`
        # is the supported way to add the same "example" key to the schema.
        # NOTE(review): assumes Pydantic v2, as implied by `min_length` being
        # used on a list field elsewhere in this module — confirm.
        json_schema_extra={"example": ["indoor", "outdoor", "nature", "city"]},
    )


# Request body for classifying an image against caller-supplied labels.
class ClassifyRequest(BaseModel):
    # Image location as a plain string; no URL validation happens here.
    url: str
    # Required; the description reads "at least 2 candidate labels".
    # NOTE(review): under Pydantic v2 min_length=2 constrains the list length;
    # Pydantic v1 would treat it differently — confirm the installed version.
    labels: List[str] = Field(..., min_length=2, description="Minimal 2 label kandidat")


# Request body pairing an image URL with a text to compare it against.
class SimilarityRequest(BaseModel):
    # Image location as a plain string; no URL validation happens here.
    url: str
    # Required and must be non-empty (min_length=1).
    text: str = Field(..., min_length=1)


# Request body for asking a question about an image.
class VisualQARequest(BaseModel):
    # Image location as a plain string; no URL validation happens here.
    url: str
    # Required; the description reads "question about the image content".
    # NOTE(review): unlike SimilarityRequest.text there is no min_length, so an
    # empty question passes validation — confirm whether that is intended.
    question: str = Field(..., description="Pertanyaan tentang isi gambar")


# === Responses ===

# Captioning result: the generated caption text and a string naming the model.
# Also embedded as the optional `caption` section of FullAnalysisResponse.
class CaptionResponse(BaseModel):
    caption: str
    model: str


# Object-detection result: the list of detections plus summary fields, the
# image dimensions and the inference time in milliseconds.
# Also embedded as the optional `detections` section of FullAnalysisResponse.
class DetectionResponse(BaseModel):
    detections: List[DetectionSchema]
    # NOTE(review): presumably len(detections); the schema does not enforce it.
    count: int
    # Untyped dict: keys and values are not validated.
    # NOTE(review): presumably maps label -> occurrence count — verify against
    # the code that builds this response.
    labels_summary: dict
    image_width: int
    image_height: int
    inference_time_ms: float


# Classification result: the winning label and its score, plus the full label
# and probability lists.
# Also embedded as the optional `classification` section of
# FullAnalysisResponse.
class ClassificationResponse(BaseModel):
    top_label: str
    top_score: float
    # NOTE(review): labels and probabilities are presumably parallel lists of
    # equal length (probabilities[i] belongs to labels[i]); nothing in this
    # schema enforces that — confirm with the producer.
    labels: List[str]
    probabilities: List[float]


# OCR result: the full extracted text, the per-region boxes, a word count, and
# strings identifying the language and the OCR engine.
# Also embedded as the optional `ocr` section of FullAnalysisResponse.
class OCRResponse(BaseModel):
    full_text: str
    boxes: List[OCRBoxSchema]
    # NOTE(review): presumably derived from full_text; not enforced here.
    word_count: int
    language: str
    engine: str


# Combined analysis result. The image dimensions, source, summary text, model
# list and total latency are required; each per-model section is optional and
# defaults to None.
# NOTE(review): a None section presumably means that step was not requested or
# produced no result — confirm with the endpoint that builds this response.
class FullAnalysisResponse(BaseModel):
    image_width: int
    image_height: int
    source: str
    caption: Optional[CaptionResponse] = None
    detections: Optional[DetectionResponse] = None
    classification: Optional[ClassificationResponse] = None
    ocr: Optional[OCRResponse] = None
    # Required. The description reads "text summary from all models — ready to
    # be used as LLM context".
    summary_text: str = Field(..., description="Ringkasan teks dari semua model — siap dipakai sebagai konteks LLM")
    models_used: List[str]
    total_latency_ms: float


# Image/text similarity result: a numeric score, a text field and a string
# interpretation of the score.
# NOTE(review): `text` presumably echoes SimilarityRequest.text, and the score
# range is not constrained here — confirm with the producer.
class SimilarityResponse(BaseModel):
    similarity_score: float
    text: str
    interpretation: str


# Visual question-answering result: a question string and an answer string.
# NOTE(review): `question` presumably echoes VisualQARequest.question —
# confirm with the producer.
class VisualQAResponse(BaseModel):
    question: str
    answer: str