Spaces:
Sleeping
Sleeping
Dynamically created a schema - MongoDB
Browse files
app.py
CHANGED
|
@@ -115,102 +115,7 @@ def extract_invoice_data(file_data, content_type):
|
|
| 115 |
],
|
| 116 |
response_format={
|
| 117 |
"type": "json_schema",
|
| 118 |
-
"json_schema":
|
| 119 |
-
"name": "invoice",
|
| 120 |
-
"strict": True,
|
| 121 |
-
"schema": {
|
| 122 |
-
"type": "object",
|
| 123 |
-
"title": "Invoice Information Extractor",
|
| 124 |
-
"$schema": "http://json-schema.org/draft-07/schema#",
|
| 125 |
-
"properties": {
|
| 126 |
-
"LineItems": {
|
| 127 |
-
"type": "array",
|
| 128 |
-
"items": {
|
| 129 |
-
"type": "object",
|
| 130 |
-
"required": [
|
| 131 |
-
"ProductCode",
|
| 132 |
-
"Description",
|
| 133 |
-
"Amount"
|
| 134 |
-
],
|
| 135 |
-
"properties": {
|
| 136 |
-
"ProductCode": {
|
| 137 |
-
"type": "string",
|
| 138 |
-
"title": "Product Code",
|
| 139 |
-
"description": "The code of the product"
|
| 140 |
-
},
|
| 141 |
-
"Description": {
|
| 142 |
-
"type": "string",
|
| 143 |
-
"title": "Description",
|
| 144 |
-
"description": "Description of the product"
|
| 145 |
-
},
|
| 146 |
-
"Amount": {
|
| 147 |
-
"type": "number",
|
| 148 |
-
"title": "Amount",
|
| 149 |
-
"description": "The amount of the product"
|
| 150 |
-
}
|
| 151 |
-
},
|
| 152 |
-
"additionalProperties": False
|
| 153 |
-
},
|
| 154 |
-
"title": "Line Items",
|
| 155 |
-
"description": "List of line items on the invoice"
|
| 156 |
-
},
|
| 157 |
-
"TaxAmount": {
|
| 158 |
-
"type": "number",
|
| 159 |
-
"title": "Tax Amount",
|
| 160 |
-
"description": "The tax amount on the invoice"
|
| 161 |
-
},
|
| 162 |
-
"VendorGST": {
|
| 163 |
-
"type": "string",
|
| 164 |
-
"title": "Vendor GST",
|
| 165 |
-
"description": "The GST number of the vendor"
|
| 166 |
-
},
|
| 167 |
-
"VendorName": {
|
| 168 |
-
"type": "string",
|
| 169 |
-
"title": "Vendor Name",
|
| 170 |
-
"description": "The name of the vendor"
|
| 171 |
-
},
|
| 172 |
-
"InvoiceDate": {
|
| 173 |
-
"type": "string",
|
| 174 |
-
"title": "Invoice Date",
|
| 175 |
-
"description": "The date of the invoice (format: dd-MMM-yyyy)"
|
| 176 |
-
},
|
| 177 |
-
"TotalAmount": {
|
| 178 |
-
"type": "number",
|
| 179 |
-
"title": "Total Amount",
|
| 180 |
-
"description": "The total amount on the invoice"
|
| 181 |
-
},
|
| 182 |
-
"InvoiceNumber": {
|
| 183 |
-
"type": "string",
|
| 184 |
-
"title": "Invoice Number",
|
| 185 |
-
"description": "The number of the invoice"
|
| 186 |
-
},
|
| 187 |
-
"VendorAddress": {
|
| 188 |
-
"type": "string",
|
| 189 |
-
"title": "Vendor Address",
|
| 190 |
-
"description": "The address of the vendor"
|
| 191 |
-
},
|
| 192 |
-
"InvoiceCurrency": {
|
| 193 |
-
"type": "string",
|
| 194 |
-
"title": "Invoice Currency",
|
| 195 |
-
"description": "The currency used in the invoice (e.g., USD, INR, AUD)"
|
| 196 |
-
}
|
| 197 |
-
},
|
| 198 |
-
"required": [
|
| 199 |
-
"LineItems",
|
| 200 |
-
"TaxAmount",
|
| 201 |
-
"VendorGST",
|
| 202 |
-
"VendorName",
|
| 203 |
-
"InvoiceDate",
|
| 204 |
-
"TotalAmount",
|
| 205 |
-
"InvoiceNumber",
|
| 206 |
-
"VendorAddress",
|
| 207 |
-
"InvoiceCurrency"
|
| 208 |
-
],
|
| 209 |
-
"additionalProperties": False,
|
| 210 |
-
"description": "Schema for extracting structured invoice data"
|
| 211 |
-
}
|
| 212 |
-
}
|
| 213 |
-
},
|
| 214 |
temperature=0.5,
|
| 215 |
max_tokens=16384
|
| 216 |
)
|
|
@@ -265,13 +170,19 @@ def extract_text_from_file(
|
|
| 265 |
"message": "Document Retrieved from MongoDB.",
|
| 266 |
"document": existing_document
|
| 267 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 268 |
|
| 269 |
# Retrieve file from S3 and determine content type
|
| 270 |
content_type = get_content_type_from_s3(file_key)
|
| 271 |
|
| 272 |
# Extract and parse invoice data
|
| 273 |
file_data, _ = fetch_file_from_s3(file_key)
|
| 274 |
-
extracted_data = extract_invoice_data(file_data, content_type)
|
| 275 |
|
| 276 |
# Store extracted data in MongoDB
|
| 277 |
document = {
|
|
|
|
| 115 |
],
|
| 116 |
response_format={
|
| 117 |
"type": "json_schema",
|
| 118 |
+
"json_schema": json_schema
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
temperature=0.5,
|
| 120 |
max_tokens=16384
|
| 121 |
)
|
|
|
|
| 170 |
"message": "Document Retrieved from MongoDB.",
|
| 171 |
"document": existing_document
|
| 172 |
}
|
| 173 |
+
# Fetch dynamic schema based on document type
|
| 174 |
+
schema_doc = db.schema_collection.find_one({"document_type": document_type})
|
| 175 |
+
if not schema_doc:
|
| 176 |
+
raise ValueError("No schema found for the given document type")
|
| 177 |
+
|
| 178 |
+
json_schema = schema_doc.get("json_schema")
|
| 179 |
|
| 180 |
# Retrieve file from S3 and determine content type
|
| 181 |
content_type = get_content_type_from_s3(file_key)
|
| 182 |
|
| 183 |
# Extract and parse invoice data
|
| 184 |
file_data, _ = fetch_file_from_s3(file_key)
|
| 185 |
+
extracted_data = extract_invoice_data(file_data, content_type,json_schema)
|
| 186 |
|
| 187 |
# Store extracted data in MongoDB
|
| 188 |
document = {
|