vkumartr committed on
Commit
961cc29
·
verified ·
1 Parent(s): b415d4c

JSON Schema modified

Browse files
Files changed (1) hide show
  1. app.py +207 -87
app.py CHANGED
@@ -118,8 +118,7 @@ You should extract this data and structure it into a table-like format in the fo
118
  # "InvoiceCurrency": "",
119
  # "BaseAmount": "",
120
  # "TaxAmount": "",
121
- # "TotalInvoiceAmt": "",
122
- # "TypeofInvoice": "",
123
  # "CustomerName": "",
124
  # "CustomerAddress": "",
125
  # "CustomerGSTNO": "",
@@ -137,69 +136,107 @@ You should extract this data and structure it into a table-like format in the fo
137
  # }
138
 
139
  {
140
- "$schema": "http://json-schema.org/draft-07/schema#",
141
- "type": "object",
142
- "title": "Invoice Information Extractor",
143
- "description": "Schema for extracting specific information from invoices",
144
- "properties": {
145
- "invoice_headers": {
146
- "type": "object",
147
- "properties": {
148
- "VendorName": { "type": "string", "description": "The name of the vendor" },
149
- "VendorAddress": { "type": "string", "description": "The address of the vendor" },
150
- "VendorGSTNo": { "type": "string", "description": "The GST number of the vendor" },
151
- "InvoiceNo": { "type": "string", "description": "The number of the invoice" },
152
- "InvoiceDate": { "type": "string", "format": "date", "description": "The date of the invoice in dd-MMM-yyyy format" },
153
- "InvoiceCurrency": { "type": "string", "description": "The currency used in the invoice (e.g., USD, INR, AUD)" },
154
- "BaseAmount": { "type": "number", "description": "The base amount before tax" },
155
- "TaxAmount": { "type": "number", "description": "The tax amount on the invoice" },
156
- "TotalInvoiceAmt": { "type": "number", "description": "The total amount on the invoice" },
157
- "TypeofInvoice": { "type": "string", "description": "Type of invoice (e.g., Tax Invoice, Proforma Invoice)" },
158
- "CustomerName": { "type": "string", "description": "The name of the customer" },
159
- "CustomerAddress": { "type": "string", "description": "The address of the customer" },
160
- "CustomerGSTNO": { "type": "string", "description": "The GST number of the customer" },
161
- "RefNo": { "type": "string", "description": "Reference number related to shipping or order" },
162
- "ShippingOrder": { "type": "string", "description": "Shipping order details" }
163
- },
164
- "required": [
165
- "VendorName",
166
- "VendorAddress",
167
- "VendorGSTNo",
168
- "InvoiceNo",
169
- "InvoiceDate",
170
- "InvoiceCurrency",
171
- "BaseAmount",
172
- "TaxAmount",
173
- "TotalInvoiceAmt",
174
- "TypeofInvoice",
175
- "CustomerName",
176
- "CustomerAddress",
177
- "CustomerGSTNO",
178
- "RefNo",
179
- "ShippingOrder"
180
- ],
181
- "additionalProperties": false
182
- },
183
- "line_items": {
184
  "type": "array",
185
- "description": "List of line items on the invoice",
186
  "items": {
187
- "type": "object",
188
- "properties": {
189
- "Description": { "type": "string", "description": "Description of the product/service" },
190
- "TaxPercentage": { "type": "number", "description": "Tax percentage applied to the line item" },
191
- "TaxAmount": { "type": "number", "description": "Tax amount for the line item" },
192
- "Amount": { "type": "number", "description": "The total amount for the line item" }
 
 
 
 
 
193
  },
194
- "required": ["Description", "TaxPercentage", "TaxAmount", "Amount"],
195
- "additionalProperties": false
196
- }
197
- }
198
- },
199
- "required": ["invoice_headers", "line_items"],
200
- "additionalProperties": false
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  }
202
 
 
203
  Guidelines for Processing:
204
 
205
  Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
@@ -244,6 +281,78 @@ def verify_api_key(api_key: str = Header(...)):
244
  def read_root():
245
  return {"message": "Welcome to the Invoice Summarization API!"}
246
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
247
  @app.get("/ocr/extraction")
248
  def ocr_from_s3(
249
  api_key: str = Depends(verify_api_key),
@@ -252,69 +361,80 @@ def ocr_from_s3(
252
  entity_ref_key: str = Query(..., description="Entity Reference Key")
253
  ):
254
  """
255
- (PDF or Image) stored in S3 and summarize the text using GPT.
 
 
256
  """
257
  try:
258
  # Fetch file from S3
259
  file_data, content_type = fetch_file_from_s3_file(file_key)
260
 
261
  extracted_text = []
262
- base64Data = base64.b64encode(file_data).decode('utf-8')
263
-
264
- # Process PDF or Image file
265
  if content_type.startswith("image/"): # Image file
266
- image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
267
- image_np = np.array(image) # Convert to NumPy array
 
 
 
268
  base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
 
269
  elif content_type == "application/pdf": # PDF file
270
- # Open PDF using PyMuPDF
271
  pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
272
- extracted_text = []
273
- # Process each page in the PDF
274
- for page_number in range(len(pdf_document)):
275
  page = pdf_document[page_number]
276
- extracted_text.append(page.get_text("text")) # Extract text from PDF
 
277
  pdf_document.close()
278
- base64DataResp = f"data:application/pdf;base64,{base64Data}"
279
- else:
280
- return {"error": f"Unsupported file type: {content_type}"}
281
 
282
- # Combine extracted text
283
- full_text = " ".join(extracted_text)
 
 
284
 
285
- # Summarize the extracted text
286
- summary = summarize_text(full_text)
 
 
287
 
288
- # Document structure for MongoDB
 
 
 
289
  document = {
290
  "file_key": file_key,
291
  "file_type": content_type,
292
  "document_type": document_type,
293
- "entityrefkey": entity_ref_key,
294
- "base64DataResp": base64DataResp,
295
- "extracted_text": full_text,
296
- "summary": summary,
 
297
  }
298
 
299
- # Insert into MongoDB
300
  inserted_doc = invoice_collection.insert_one(document)
301
- document_id = str(inserted_doc.inserted_id) # Convert ObjectId to string
302
 
303
  return {
304
  "message": "Document successfully stored in MongoDB",
305
  "document_id": document_id,
306
  "file_key": file_key,
307
- "summary": summary
 
308
  }
309
 
310
  except Exception as e:
311
- # Detailed error information
312
  error_details = {
313
  "error_type": type(e).__name__,
314
  "error_message": str(e),
315
  "traceback": traceback.format_exc()
316
  }
317
  return {"error": error_details}
 
318
 
319
  # Serve the output folder as static files
320
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 
118
  # "InvoiceCurrency": "",
119
  # "BaseAmount": "",
120
  # "TaxAmount": "",
121
+ # "TotalInvoiceAmt": "",
 
122
  # "CustomerName": "",
123
  # "CustomerAddress": "",
124
  # "CustomerGSTNO": "",
 
136
  # }
137
 
138
  {
139
+ "response_format": {
140
+ "type": "json_schema",
141
+ "json_schema": {
142
+ "name": "invoice",
143
+ "strict": true,
144
+ "schema": {
145
+ "type": "object",
146
+ "title": "Invoice Information Extractor",
147
+ "$schema": "http://json-schema.org/draft-07/schema#",
148
+ "properties": {
149
+ "LineItems": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
  "type": "array",
 
151
  "items": {
152
+ "type": "object",
153
+ "required": [
154
+ "ProductCode",
155
+ "Description",
156
+ "Amount"
157
+ ],
158
+ "properties": {
159
+ "Amount": {
160
+ "type": "number",
161
+ "title": "Amount",
162
+ "description": "The amount of the product"
163
  },
164
+ "Description": {
165
+ "type": "string",
166
+ "title": "Description",
167
+ "description": "Description of the product"
168
+ },
169
+ "ProductCode": {
170
+ "type": "string",
171
+ "title": "Product Code",
172
+ "description": "The code of the product"
173
+ }
174
+ },
175
+ "additionalProperties": false
176
+ },
177
+ "title": "Line Items",
178
+ "description": "List of line items on the invoice"
179
+ },
180
+ "TaxAmount": {
181
+ "type": "number",
182
+ "title": "Tax Amount",
183
+ "description": "The tax amount on the invoice"
184
+ },
185
+ "VendorGST": {
186
+ "type": "string",
187
+ "title": "Vendor GST",
188
+ "description": "The GST number of the vendor"
189
+ },
190
+ "VendorName": {
191
+ "type": "string",
192
+ "title": "Vendor Name",
193
+ "description": "The name of the vendor"
194
+ },
195
+ "InvoiceDate": {
196
+ "type": "string",
197
+ "title": "Invoice Date",
198
+ "description": "The date of the invoice in dd-MMM-yyyy format"
199
+ },
200
+ "TotalAmount": {
201
+ "type": "number",
202
+ "title": "Total Amount",
203
+ "description": "The total amount on the invoice"
204
+ },
205
+ "InvoiceNumber": {
206
+ "type": "string",
207
+ "title": "Invoice Number",
208
+ "description": "The number of the invoice"
209
+ },
210
+ "VendorAddress": {
211
+ "type": "string",
212
+ "title": "Vendor Address",
213
+ "description": "The address of the vendor"
214
+ },
215
+ "InvoiceCurrency": {
216
+ "type": "string",
217
+ "title": "Invoice Currency",
218
+ "description": "The currency used in the invoice, e.g., USD, INR, AUD"
219
+ }
220
+ },
221
+ "required": [
222
+ "LineItems",
223
+ "TaxAmount",
224
+ "VendorGST",
225
+ "VendorName",
226
+ "InvoiceDate",
227
+ "TotalAmount",
228
+ "InvoiceNumber",
229
+ "VendorAddress",
230
+ "InvoiceCurrency"
231
+ ],
232
+ "additionalProperties": false,
233
+ "description": "Schema for extracting specific information from invoices"
234
+ }
235
+ }
236
+ }
237
  }
238
 
239
+
240
  Guidelines for Processing:
241
 
242
  Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
 
281
  def read_root():
282
  return {"message": "Welcome to the Invoice Summarization API!"}
283
 
284
+ # @app.get("/ocr/extraction")
285
+ # def ocr_from_s3(
286
+ # api_key: str = Depends(verify_api_key),
287
+ # file_key: str = Query(..., description="S3 file key for the file"),
288
+ # document_type: str = Query(..., description="Type of document"),
289
+ # entity_ref_key: str = Query(..., description="Entity Reference Key")
290
+ # ):
291
+ # """
292
+ # Process a file (PDF or Image) stored in S3 and summarize the text using GPT.
293
+ # """
294
+ # try:
295
+ # # Fetch file from S3
296
+ # file_data, content_type = fetch_file_from_s3_file(file_key)
297
+
298
+ # extracted_text = []
299
+ # base64Data = base64.b64encode(file_data).decode('utf-8')
300
+
301
+ # # Process PDF or Image file
302
+ # if content_type.startswith("image/"): # Image file
303
+ # image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
304
+ # image_np = np.array(image) # Convert to NumPy array
305
+ # base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
306
+ # elif content_type == "application/pdf": # PDF file
307
+ # # Open PDF using PyMuPDF
308
+ # pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
309
+ # extracted_text = []
310
+ # # Process each page in the PDF
311
+ # for page_number in range(len(pdf_document)):
312
+ # page = pdf_document[page_number]
313
+ # extracted_text.append(page.get_text("text")) # Extract text from PDF
314
+ # pdf_document.close()
315
+ # base64DataResp = f"data:application/pdf;base64,{base64Data}"
316
+ # else:
317
+ # return {"error": f"Unsupported file type: {content_type}"}
318
+
319
+ # # Combine extracted text
320
+ # full_text = " ".join(extracted_text)
321
+
322
+ # # Summarize the extracted text
323
+ # summary = summarize_text(full_text)
324
+
325
+ # # Document structure for MongoDB
326
+ # document = {
327
+ # "file_key": file_key,
328
+ # "file_type": content_type,
329
+ # "document_type": document_type,
330
+ # "entityrefkey": entity_ref_key,
331
+ # "base64DataResp": base64DataResp,
332
+ # "extracted_text": full_text,
333
+ # "summary": summary,
334
+ # }
335
+
336
+ # # Insert into MongoDB
337
+ # inserted_doc = invoice_collection.insert_one(document)
338
+ # document_id = str(inserted_doc.inserted_id) # Convert ObjectId to string
339
+
340
+ # return {
341
+ # "message": "Document successfully stored in MongoDB",
342
+ # "document_id": document_id,
343
+ # "file_key": file_key,
344
+ # "summary": summary
345
+ # }
346
+
347
+ # except Exception as e:
348
+ # # Detailed error information
349
+ # error_details = {
350
+ # "error_type": type(e).__name__,
351
+ # "error_message": str(e),
352
+ # "traceback": traceback.format_exc()
353
+ # }
354
+ # return {"error": error_details}
355
+
356
  @app.get("/ocr/extraction")
357
  def ocr_from_s3(
358
  api_key: str = Depends(verify_api_key),
 
361
  entity_ref_key: str = Query(..., description="Entity Reference Key")
362
  ):
363
  """
364
+ Extract text from a PDF or Image stored in S3 and process it based on document size.
365
+ For PDFs with more than 2 pages, skip Base64 conversion and summarization; images are Base64-encoded but never summarized.
366
+ Store extracted data in MongoDB.
367
  """
368
  try:
369
  # Fetch file from S3
370
  file_data, content_type = fetch_file_from_s3_file(file_key)
371
 
372
  extracted_text = []
373
+ base64DataResp = None
374
+ summary = None
375
+
376
  if content_type.startswith("image/"): # Image file
377
+ image = Image.open(io.BytesIO(file_data)).convert("RGB")
378
+ extracted_text.append(pytesseract.image_to_string(image)) # Extract text using OCR
379
+
380
+ # If single image, store Base64
381
+ base64Data = base64.b64encode(file_data).decode('utf-8')
382
  base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
383
+
384
  elif content_type == "application/pdf": # PDF file
 
385
  pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
386
+ num_pages = len(pdf_document)
387
+
388
+ for page_number in range(num_pages):
389
  page = pdf_document[page_number]
390
+ extracted_text.append(page.get_text("text"))
391
+
392
  pdf_document.close()
 
 
 
393
 
394
+ # If 2 pages or less, store Base64
395
+ if num_pages <= 2:
396
+ base64Data = base64.b64encode(file_data).decode('utf-8')
397
+ base64DataResp = f"data:application/pdf;base64,{base64Data}"
398
 
399
+ # If 2 pages or less, generate summary
400
+ if num_pages <= 2:
401
+ full_text = " ".join(extracted_text)
402
+ summary = summarize_text(full_text)
403
 
404
+ else:
405
+ return {"error": f"Unsupported file type: {content_type}"}
406
+
407
+ # Store extracted data in MongoDB
408
  document = {
409
  "file_key": file_key,
410
  "file_type": content_type,
411
  "document_type": document_type,
412
+ "entity_ref_key": entity_ref_key,
413
+ "num_pages": len(extracted_text), # Store page count
414
+ "base64DataResp": base64DataResp, # Only for small files
415
+ "extracted_text": " ".join(extracted_text),
416
+ "summary": summary, # Only for small files
417
  }
418
 
 
419
  inserted_doc = invoice_collection.insert_one(document)
420
+ document_id = str(inserted_doc.inserted_id)
421
 
422
  return {
423
  "message": "Document successfully stored in MongoDB",
424
  "document_id": document_id,
425
  "file_key": file_key,
426
+ "num_pages": len(extracted_text),
427
+ "summary": summary if summary else "Skipped for large documents"
428
  }
429
 
430
  except Exception as e:
 
431
  error_details = {
432
  "error_type": type(e).__name__,
433
  "error_message": str(e),
434
  "traceback": traceback.format_exc()
435
  }
436
  return {"error": error_details}
437
+
438
 
439
  # Serve the output folder as static files
440
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")