vkumartr committed on
Commit
c431613
·
verified ·
1 Parent(s): 89d454b

Dynamically created a schema - MongoDB

Browse files
Files changed (1) hide show
  1. app.py +8 -97
app.py CHANGED
@@ -115,102 +115,7 @@ def extract_invoice_data(file_data, content_type):
115
  ],
116
  response_format={
117
  "type": "json_schema",
118
- "json_schema": {
119
- "name": "invoice",
120
- "strict": True,
121
- "schema": {
122
- "type": "object",
123
- "title": "Invoice Information Extractor",
124
- "$schema": "http://json-schema.org/draft-07/schema#",
125
- "properties": {
126
- "LineItems": {
127
- "type": "array",
128
- "items": {
129
- "type": "object",
130
- "required": [
131
- "ProductCode",
132
- "Description",
133
- "Amount"
134
- ],
135
- "properties": {
136
- "ProductCode": {
137
- "type": "string",
138
- "title": "Product Code",
139
- "description": "The code of the product"
140
- },
141
- "Description": {
142
- "type": "string",
143
- "title": "Description",
144
- "description": "Description of the product"
145
- },
146
- "Amount": {
147
- "type": "number",
148
- "title": "Amount",
149
- "description": "The amount of the product"
150
- }
151
- },
152
- "additionalProperties": False
153
- },
154
- "title": "Line Items",
155
- "description": "List of line items on the invoice"
156
- },
157
- "TaxAmount": {
158
- "type": "number",
159
- "title": "Tax Amount",
160
- "description": "The tax amount on the invoice"
161
- },
162
- "VendorGST": {
163
- "type": "string",
164
- "title": "Vendor GST",
165
- "description": "The GST number of the vendor"
166
- },
167
- "VendorName": {
168
- "type": "string",
169
- "title": "Vendor Name",
170
- "description": "The name of the vendor"
171
- },
172
- "InvoiceDate": {
173
- "type": "string",
174
- "title": "Invoice Date",
175
- "description": "The date of the invoice (format: dd-MMM-yyyy)"
176
- },
177
- "TotalAmount": {
178
- "type": "number",
179
- "title": "Total Amount",
180
- "description": "The total amount on the invoice"
181
- },
182
- "InvoiceNumber": {
183
- "type": "string",
184
- "title": "Invoice Number",
185
- "description": "The number of the invoice"
186
- },
187
- "VendorAddress": {
188
- "type": "string",
189
- "title": "Vendor Address",
190
- "description": "The address of the vendor"
191
- },
192
- "InvoiceCurrency": {
193
- "type": "string",
194
- "title": "Invoice Currency",
195
- "description": "The currency used in the invoice (e.g., USD, INR, AUD)"
196
- }
197
- },
198
- "required": [
199
- "LineItems",
200
- "TaxAmount",
201
- "VendorGST",
202
- "VendorName",
203
- "InvoiceDate",
204
- "TotalAmount",
205
- "InvoiceNumber",
206
- "VendorAddress",
207
- "InvoiceCurrency"
208
- ],
209
- "additionalProperties": False,
210
- "description": "Schema for extracting structured invoice data"
211
- }
212
- }
213
- },
214
  temperature=0.5,
215
  max_tokens=16384
216
  )
@@ -265,13 +170,19 @@ def extract_text_from_file(
265
  "message": "Document Retrieved from MongoDB.",
266
  "document": existing_document
267
  }
 
 
 
 
 
 
268
 
269
  # Retrieve file from S3 and determine content type
270
  content_type = get_content_type_from_s3(file_key)
271
 
272
  # Extract and parse invoice data
273
  file_data, _ = fetch_file_from_s3(file_key)
274
- extracted_data = extract_invoice_data(file_data, content_type)
275
 
276
  # Store extracted data in MongoDB
277
  document = {
 
115
  ],
116
  response_format={
117
  "type": "json_schema",
118
+ "json_schema": json_schema
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  temperature=0.5,
120
  max_tokens=16384
121
  )
 
170
  "message": "Document Retrieved from MongoDB.",
171
  "document": existing_document
172
  }
173
+ # Fetch dynamic schema based on document type
174
+ schema_doc = db.schema_collection.find_one({"document_type": document_type})
175
+ if not schema_doc:
176
+ raise ValueError("No schema found for the given document type")
177
+
178
+ json_schema = schema_doc.get("json_schema")
179
 
180
  # Retrieve file from S3 and determine content type
181
  content_type = get_content_type_from_s3(file_key)
182
 
183
  # Extract and parse invoice data
184
  file_data, _ = fetch_file_from_s3(file_key)
185
+ extracted_data = extract_invoice_data(file_data, content_type,json_schema)
186
 
187
  # Store extracted data in MongoDB
188
  document = {