File size: 1,681 Bytes
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97eaafb
 
 
 
 
 
 
 
 
 
 
 
 
 
0534c29
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
from pydantic import BaseModel
from typing import Dict, List, Optional


# Pydantic model for the base response: the status/message pair that
# WelcomeResponse and ModelInfoResponse in this module extend.
class BaseResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Optional free-form message; defaults to None when omitted.
    message: Optional[str] = None


# Response schema that extends BaseResponse (status + optional message) with
# a string-to-string mapping of endpoints.
class WelcomeResponse(BaseResponse):
    # Required str -> str mapping.
    # NOTE(review): presumably endpoint name/path -> description; confirm
    # against the handler that builds this response.
    endpoints: Dict[str, str]


# Response schema describing a model; extends BaseResponse (status + optional
# message) with four required fields and two optional ones.
#
# NOTE(review): some pydantic v2 releases treat the "model_" prefix as a
# protected namespace and emit a UserWarning for fields such as `model_name`
# and `model_version` unless `protected_namespaces` is overridden in the model
# config -- confirm which pydantic version this project pins.
class ModelInfoResponse(BaseResponse):
    # Required model identifier and version, both plain strings.
    model_name: str
    model_version: str
    # Required list of strings naming the supported formats.
    supported_formats: List[str]
    # Required integer size limit; the field name indicates megabytes.
    max_file_size_mb: int
    # Optional untyped dict (no key/value schema enforced); defaults to None.
    training_info: Optional[Dict] = None
    # Optional string; no date/time format is enforced here. Defaults to None.
    last_updated: Optional[str] = None


# Pydantic model for the prediction response.
# Derives directly from BaseModel (not BaseResponse), so it declares its own
# `status` and has no `message` field.
class PredictionResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required lyrics text.
    lyrics: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Optional untyped dict (no key/value schema enforced); defaults to None.
    results: Optional[Dict] = None


# Response schema whose fields are currently identical, name for name and
# type for type, to PredictionResponse in this module. It is a separate class
# deriving directly from BaseModel, so the two schemas can diverge independently.
class PredictionXAIResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required lyrics text.
    lyrics: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Optional untyped dict (no key/value schema enforced); defaults to None.
    results: Optional[Dict] = None


# Response schema with audio file metadata and results but no `lyrics` field.
# Derives directly from BaseModel, so there is no `message` field either.
class AudioOnlyPredictionResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Required dict with no default and no key/value schema. Note this differs
    # from PredictionResponse, where `results` is Optional and defaults to None.
    results: dict


# Response schema whose fields are currently identical to
# AudioOnlyPredictionResponse in this module: audio file metadata plus a
# required `results` dict, with no `lyrics` and no `message` field.
class AudioOnlyPredictionXAIResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Required dict with no default and no key/value schema enforced.
    results: dict


# Response schema carrying lyrics, audio file metadata, and a required
# `results` dict. Derives directly from BaseModel, so there is no `message`
# field.
class CombinedExplanationResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required lyrics text.
    lyrics: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Required dict with no default. The key layout is not enforced by this
    # model; verify the exact keys against the code that builds the response.
    results: dict  # Contains both multimodal and audio_only results


# Response schema carrying lyrics, audio file metadata, and a required
# `results` dict. Its declared fields are currently identical to
# CombinedExplanationResponse in this module; there is no `message` field.
class CombinedPredictionResponse(BaseModel):
    # Required status string; allowed values are not constrained by this model.
    status: str
    # Required lyrics text.
    lyrics: str
    # Required metadata about the audio file: name, content type, and size.
    audio_file_name: str
    audio_content_type: str
    # Integer size. NOTE(review): presumably bytes -- confirm with the caller.
    audio_file_size: int
    # Required dict with no default. The key layout is not enforced by this
    # model; verify the exact keys against the code that builds the response.
    results: dict  # Contains both multimodal and audio_only predictions