Spaces:
Build error
Build error
Updated pydantic models to support lists
Browse files
- base/ocr.py +7 -7
- data_models.py +17 -17
base/ocr.py
CHANGED
|
@@ -2,7 +2,7 @@
|
|
| 2 |
Custom types for dealing with the Google Vision API JSON output.
|
| 3 |
"""
|
| 4 |
from enum import IntEnum
|
| 5 |
-
from typing import Any, Optional
|
| 6 |
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
|
@@ -36,7 +36,7 @@ class DetectedLanguage(BaseModel):
|
|
| 36 |
|
| 37 |
|
| 38 |
class TextProperty(BaseModel):
|
| 39 |
-
detectedLanguages: list[DetectedLanguage]
|
| 40 |
detectedBreak: Optional[DetectedBreak]
|
| 41 |
|
| 42 |
|
|
@@ -50,21 +50,21 @@ class Symbol(BaseModel):
|
|
| 50 |
class Word(BaseModel):
|
| 51 |
property: Optional[TextProperty]
|
| 52 |
boundingBox: Any
|
| 53 |
-
symbols: list[Symbol]
|
| 54 |
confidence: float
|
| 55 |
|
| 56 |
|
| 57 |
class Paragraph(BaseModel):
|
| 58 |
property: Optional[TextProperty]
|
| 59 |
boundingBox: Any
|
| 60 |
-
words: list[Word]
|
| 61 |
confidence: float
|
| 62 |
|
| 63 |
|
| 64 |
class Block(BaseModel):
|
| 65 |
property: Optional[TextProperty]
|
| 66 |
boundingBox: Any
|
| 67 |
-
paragraphs: list[Paragraph]
|
| 68 |
blockType: BlockType
|
| 69 |
confidence: float
|
| 70 |
|
|
@@ -73,12 +73,12 @@ class Page(BaseModel):
|
|
| 73 |
property: Optional[TextProperty]
|
| 74 |
width: int
|
| 75 |
height: int
|
| 76 |
-
blocks: list[Block]
|
| 77 |
confidence: float
|
| 78 |
|
| 79 |
|
| 80 |
class TextAnnotation(BaseModel):
|
| 81 |
-
pages: list[Page]
|
| 82 |
text: str
|
| 83 |
|
| 84 |
|
|
|
|
| 2 |
Custom types for dealing with the Google Vision API JSON output.
|
| 3 |
"""
|
| 4 |
from enum import IntEnum
|
| 5 |
+
from typing import Any, Optional, List
|
| 6 |
|
| 7 |
from pydantic import BaseModel
|
| 8 |
|
|
|
|
| 36 |
|
| 37 |
|
| 38 |
class TextProperty(BaseModel):
|
| 39 |
+
detectedLanguages: List[DetectedLanguage]
|
| 40 |
detectedBreak: Optional[DetectedBreak]
|
| 41 |
|
| 42 |
|
|
|
|
| 50 |
class Word(BaseModel):
|
| 51 |
property: Optional[TextProperty]
|
| 52 |
boundingBox: Any
|
| 53 |
+
symbols: List[Symbol]
|
| 54 |
confidence: float
|
| 55 |
|
| 56 |
|
| 57 |
class Paragraph(BaseModel):
|
| 58 |
property: Optional[TextProperty]
|
| 59 |
boundingBox: Any
|
| 60 |
+
words: List[Word]
|
| 61 |
confidence: float
|
| 62 |
|
| 63 |
|
| 64 |
class Block(BaseModel):
|
| 65 |
property: Optional[TextProperty]
|
| 66 |
boundingBox: Any
|
| 67 |
+
paragraphs: List[Paragraph]
|
| 68 |
blockType: BlockType
|
| 69 |
confidence: float
|
| 70 |
|
|
|
|
| 73 |
property: Optional[TextProperty]
|
| 74 |
width: int
|
| 75 |
height: int
|
| 76 |
+
blocks: List[Block]
|
| 77 |
confidence: float
|
| 78 |
|
| 79 |
|
| 80 |
class TextAnnotation(BaseModel):
|
| 81 |
+
pages: List[Page]
|
| 82 |
text: str
|
| 83 |
|
| 84 |
|
data_models.py
CHANGED
|
@@ -4,7 +4,7 @@ Pydantic models used throughout the codebase.
|
|
| 4 |
In particular, these are the types that are used as input and output of each step of the pipeline.
|
| 5 |
"""
|
| 6 |
import json
|
| 7 |
-
from typing import Any, Optional, Union
|
| 8 |
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
|
@@ -65,7 +65,7 @@ class NutrientTable(BaseModel):
|
|
| 65 |
nutrientBasisQuantityValue: Optional[str]
|
| 66 |
nutrientBasisQuantityMeasurementUnitCode: Optional[str]
|
| 67 |
preperationStateCode: Optional[str]
|
| 68 |
-
values: list[NutrientTableElement]
|
| 69 |
|
| 70 |
def __str__(self):
|
| 71 |
top = "Nutrients per " + " ".join(
|
|
@@ -83,7 +83,7 @@ class Attribute(BaseModel):
|
|
| 83 |
coordinates: str
|
| 84 |
entity: str
|
| 85 |
probability: float
|
| 86 |
-
value: Union[str, list[NutrientTable]]
|
| 87 |
model: str
|
| 88 |
|
| 89 |
|
|
@@ -116,13 +116,13 @@ class NetContentAttribute(BaseModel):
|
|
| 116 |
|
| 117 |
class AllergensOut(BaseModel):
|
| 118 |
entity: str
|
| 119 |
-
values: list[AttributeAllergen]
|
| 120 |
model: str
|
| 121 |
|
| 122 |
|
| 123 |
class CommunicationChannelsOut(BaseModel):
|
| 124 |
entity: str
|
| 125 |
-
values: list[AttributeCommunicationChannel]
|
| 126 |
model: str
|
| 127 |
|
| 128 |
|
|
@@ -131,7 +131,7 @@ class PipelineInput(BaseModel):
|
|
| 131 |
|
| 132 |
|
| 133 |
class PipelineOutput(BaseModel):
|
| 134 |
-
attributes: list[
|
| 135 |
Union[Attribute, CommunicationChannelsOut, AllergensOut, NetContentAttribute]
|
| 136 |
]
|
| 137 |
job_id: str = Field(alias="job-id")
|
|
@@ -147,13 +147,13 @@ class TextWithLanguage(BaseModel):
|
|
| 147 |
|
| 148 |
|
| 149 |
class OCRTextOut(BaseModel):
|
| 150 |
-
blocks: list[str]
|
| 151 |
full_text: str
|
| 152 |
-
sentences: list[TextWithLanguage]
|
| 153 |
|
| 154 |
|
| 155 |
class OCRTableOut(BaseModel):
|
| 156 |
-
tables: list[list[list[str]]]
|
| 157 |
|
| 158 |
|
| 159 |
class OCROut(BaseModel):
|
|
@@ -162,7 +162,7 @@ class OCROut(BaseModel):
|
|
| 162 |
|
| 163 |
|
| 164 |
class OCROutList(BaseModel):
|
| 165 |
-
__root__: list[OCROut]
|
| 166 |
|
| 167 |
def __iter__(self):
|
| 168 |
return iter(self.__root__)
|
|
@@ -172,11 +172,11 @@ class OCROutList(BaseModel):
|
|
| 172 |
|
| 173 |
|
| 174 |
class OCRWrapperOut(BaseModel):
|
| 175 |
-
blocks: list[str]
|
| 176 |
full_text: str
|
| 177 |
job_id: str
|
| 178 |
-
sentences: list[TextWithLanguage]
|
| 179 |
-
tables: list[list[list[str]]]
|
| 180 |
|
| 181 |
|
| 182 |
class ClassifiedText(BaseModel):
|
|
@@ -210,8 +210,8 @@ class NetContent(BaseModel):
|
|
| 210 |
|
| 211 |
|
| 212 |
class ModelOut(BaseModel):
|
| 213 |
-
blocks: list[Union[NetContent, Allergen, CommunicationChannels, ClassifiedText]]
|
| 214 |
-
tables: Optional[list[NutrientTable]]
|
| 215 |
job_id: str
|
| 216 |
model: str
|
| 217 |
full_text: str
|
|
@@ -221,7 +221,7 @@ class ModelOut(BaseModel):
|
|
| 221 |
|
| 222 |
|
| 223 |
class ModelOutList(BaseModel):
|
| 224 |
-
__root__: list[ModelOut]
|
| 225 |
|
| 226 |
def __iter__(self):
|
| 227 |
return iter(self.__root__)
|
|
@@ -231,6 +231,6 @@ class ModelOutList(BaseModel):
|
|
| 231 |
|
| 232 |
|
| 233 |
class TrainModelOut(BaseModel):
|
| 234 |
-
# To be defined later when we have a list of accepted formats
|
| 235 |
model: Optional[Any] = None
|
| 236 |
artifacts: Optional[Any] = None
|
|
|
|
| 4 |
In particular, these are the types that are used as input and output of each step of the pipeline.
|
| 5 |
"""
|
| 6 |
import json
|
| 7 |
+
from typing import Any, Optional, Union, List
|
| 8 |
|
| 9 |
from pydantic import BaseModel, Field
|
| 10 |
|
|
|
|
| 65 |
nutrientBasisQuantityValue: Optional[str]
|
| 66 |
nutrientBasisQuantityMeasurementUnitCode: Optional[str]
|
| 67 |
preperationStateCode: Optional[str]
|
| 68 |
+
values: List[NutrientTableElement]
|
| 69 |
|
| 70 |
def __str__(self):
|
| 71 |
top = "Nutrients per " + " ".join(
|
|
|
|
| 83 |
coordinates: str
|
| 84 |
entity: str
|
| 85 |
probability: float
|
| 86 |
+
value: Union[str, List[NutrientTable]]
|
| 87 |
model: str
|
| 88 |
|
| 89 |
|
|
|
|
| 116 |
|
| 117 |
class AllergensOut(BaseModel):
|
| 118 |
entity: str
|
| 119 |
+
values: List[AttributeAllergen]
|
| 120 |
model: str
|
| 121 |
|
| 122 |
|
| 123 |
class CommunicationChannelsOut(BaseModel):
|
| 124 |
entity: str
|
| 125 |
+
values: List[AttributeCommunicationChannel]
|
| 126 |
model: str
|
| 127 |
|
| 128 |
|
|
|
|
| 131 |
|
| 132 |
|
| 133 |
class PipelineOutput(BaseModel):
|
| 134 |
+
attributes: List[
|
| 135 |
Union[Attribute, CommunicationChannelsOut, AllergensOut, NetContentAttribute]
|
| 136 |
]
|
| 137 |
job_id: str = Field(alias="job-id")
|
|
|
|
| 147 |
|
| 148 |
|
| 149 |
class OCRTextOut(BaseModel):
|
| 150 |
+
blocks: List[str]
|
| 151 |
full_text: str
|
| 152 |
+
sentences: List[TextWithLanguage]
|
| 153 |
|
| 154 |
|
| 155 |
class OCRTableOut(BaseModel):
|
| 156 |
+
tables: List[List[List[str]]]
|
| 157 |
|
| 158 |
|
| 159 |
class OCROut(BaseModel):
|
|
|
|
| 162 |
|
| 163 |
|
| 164 |
class OCROutList(BaseModel):
|
| 165 |
+
__root__: List[OCROut]
|
| 166 |
|
| 167 |
def __iter__(self):
|
| 168 |
return iter(self.__root__)
|
|
|
|
| 172 |
|
| 173 |
|
| 174 |
class OCRWrapperOut(BaseModel):
|
| 175 |
+
blocks: List[str]
|
| 176 |
full_text: str
|
| 177 |
job_id: str
|
| 178 |
+
sentences: List[TextWithLanguage]
|
| 179 |
+
tables: List[List[List[str]]]
|
| 180 |
|
| 181 |
|
| 182 |
class ClassifiedText(BaseModel):
|
|
|
|
| 210 |
|
| 211 |
|
| 212 |
class ModelOut(BaseModel):
|
| 213 |
+
blocks: List[Union[NetContent, Allergen, CommunicationChannels, ClassifiedText]]
|
| 214 |
+
tables: Optional[List[NutrientTable]]
|
| 215 |
job_id: str
|
| 216 |
model: str
|
| 217 |
full_text: str
|
|
|
|
| 221 |
|
| 222 |
|
| 223 |
class ModelOutList(BaseModel):
|
| 224 |
+
__root__: List[ModelOut]
|
| 225 |
|
| 226 |
def __iter__(self):
|
| 227 |
return iter(self.__root__)
|
|
|
|
| 231 |
|
| 232 |
|
| 233 |
class TrainModelOut(BaseModel):
|
| 234 |
+
# To be defined later when we have a List of accepted formats
|
| 235 |
model: Optional[Any] = None
|
| 236 |
artifacts: Optional[Any] = None
|