vkumartr committed on
Commit
3990e38
·
verified ·
1 Parent(s): 961cc29

Prompting

Browse files
Files changed (1) hide show
  1. app.py +49 -171
app.py CHANGED
@@ -82,58 +82,32 @@ def fetch_file_from_s3_file(file_key):
82
 
83
  # Function to summarize text using OpenAI GPT
84
  def summarize_text(text):
85
- system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:
86
-
87
- Vendor Information:
88
-
89
- Vendor Name
90
- Vendor Address
91
- Vendor GST No.
92
- Invoice Details:
93
-
94
- Invoice No.
95
- Invoice Date Considered as InvoiceDate (formatted as dd-MMM-yyyy).
96
- Invoice Currency/Currency
97
- Base Amount/Amount
98
- Tax Amount
99
- Total Invoice Amount
100
- Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
101
- Customer Information:
102
-
103
- Customer Name
104
- Customer Address
105
- Customer GST No.
106
- Shipping and References:
107
-
108
- MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
109
- Shipping Order
110
- You should extract this data and structure it into a table-like format in the following JSON format:
111
- # {
112
- # "invoice_headers": {
113
- # "VendorName": "",
114
- # "VendorAddress": "",
115
- # "VendorGSTNo": "",
116
- # "InvoiceNo": "",
117
- # "InvoiceDate": "",
118
- # "InvoiceCurrency": "",
119
- # "BaseAmount": "",
120
- # "TaxAmount": "",
121
- # "TotalInvoiceAmt": "",x
122
- # "CustomerName": "",
123
- # "CustomerAddress": "",
124
- # "CustomerGSTNO": "",
125
- # "RefNo": "",
126
- # "ShippingOrder": ""
127
- # },
128
- # "line_items": [
129
- # {
130
- # "Description": "",
131
- # "TaxPercentage": "",
132
- # "TaxAmount": "",
133
- # "Amount": 0
134
- # }
135
- # ]
136
- # }
137
 
138
  {
139
  "response_format": {
@@ -150,84 +124,62 @@ You should extract this data and structure it into a table-like format in the fo
150
  "type": "array",
151
  "items": {
152
  "type": "object",
153
- "required": [
154
- "ProductCode",
155
- "Description",
156
- "Amount"
157
- ],
158
  "properties": {
159
  "Amount": {
160
  "type": "number",
161
- "title": "Amount",
162
- "description": "The amount of the product"
163
  },
164
  "Description": {
165
  "type": "string",
166
- "title": "Description",
167
- "description": "Description of the product"
168
  },
169
  "ProductCode": {
170
  "type": "string",
171
- "title": "Product Code",
172
- "description": "The code of the product"
173
  }
174
  },
175
  "additionalProperties": false
176
  },
177
- "title": "Line Items",
178
- "description": "List of line items on the invoice"
179
  },
180
  "TaxAmount": {
181
  "type": "number",
182
- "title": "Tax Amount",
183
- "description": "The tax amount on the invoice"
184
  },
185
  "VendorGST": {
186
  "type": "string",
187
- "title": "Vendor GST",
188
- "description": "The GST number of the vendor"
189
  },
190
  "VendorName": {
191
  "type": "string",
192
- "title": "Vendor Name",
193
- "description": "The name of the vendor"
194
  },
195
  "InvoiceDate": {
196
  "type": "string",
197
- "title": "Invoice Date",
198
- "description": "The date of the invoice in dd-MMM-yyyy format"
199
  },
200
  "TotalAmount": {
201
  "type": "number",
202
- "title": "Total Amount",
203
- "description": "The total amount on the invoice"
204
  },
205
  "InvoiceNumber": {
206
  "type": "string",
207
- "title": "Invoice Number",
208
- "description": "The number of the invoice"
209
  },
210
  "VendorAddress": {
211
  "type": "string",
212
- "title": "Vendor Address",
213
- "description": "The address of the vendor"
214
  },
215
  "InvoiceCurrency": {
216
  "type": "string",
217
- "title": "Invoice Currency",
218
- "description": "The currency used in the invoice, e.g., USD, INR, AUD"
219
  }
220
  },
221
  "required": [
222
- "LineItems",
223
- "TaxAmount",
224
- "VendorGST",
225
- "VendorName",
226
- "InvoiceDate",
227
- "TotalAmount",
228
- "InvoiceNumber",
229
- "VendorAddress",
230
- "InvoiceCurrency"
231
  ],
232
  "additionalProperties": false,
233
  "description": "Schema for extracting specific information from invoices"
@@ -236,15 +188,13 @@ You should extract this data and structure it into a table-like format in the fo
236
  }
237
  }
238
 
239
-
240
- Guidelines for Processing:
241
-
242
- Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
243
- Convert the Invoice Date to the specified dd-MMM-yyyy format.
244
- Use the correct currency and amounts for each invoice field.
245
- For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
246
- If certain values are missing or not applicable, leave them empty or set them as null where necessary.
247
- This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. No extra content should not provide."""
248
  try:
249
  response = openai.ChatCompletion.create(
250
  model="gpt-4o-mini",
@@ -281,78 +231,6 @@ def verify_api_key(api_key: str = Header(...)):
281
  def read_root():
282
  return {"message": "Welcome to the Invoice Summarization API!"}
283
 
284
- # @app.get("/ocr/extraction")
285
- # def ocr_from_s3(
286
- # api_key: str = Depends(verify_api_key),
287
- # file_key: str = Query(..., description="S3 file key for the file"),
288
- # document_type: str = Query(..., description="Type of document"),
289
- # entity_ref_key: str = Query(..., description="Entity Reference Key")
290
- # ):
291
- # """
292
- # (PDF or Image) stored in S3 and summarize the text using GPT.
293
- # """
294
- # try:
295
- # # Fetch file from S3
296
- # file_data, content_type = fetch_file_from_s3_file(file_key)
297
-
298
- # extracted_text = []
299
- # base64Data = base64.b64encode(file_data).decode('utf-8')
300
-
301
- # # Process PDF or Image file
302
- # if content_type.startswith("image/"): # Image file
303
- # image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
304
- # image_np = np.array(image) # Convert to NumPy array
305
- # base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
306
- # elif content_type == "application/pdf": # PDF file
307
- # # Open PDF using PyMuPDF
308
- # pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
309
- # extracted_text = []
310
- # # Process each page in the PDF
311
- # for page_number in range(len(pdf_document)):
312
- # page = pdf_document[page_number]
313
- # extracted_text.append(page.get_text("text")) # Extract text from PDF
314
- # pdf_document.close()
315
- # base64DataResp = f"data:application/pdf;base64,{base64Data}"
316
- # else:
317
- # return {"error": f"Unsupported file type: {content_type}"}
318
-
319
- # # Combine extracted text
320
- # full_text = " ".join(extracted_text)
321
-
322
- # # Summarize the extracted text
323
- # summary = summarize_text(full_text)
324
-
325
- # # Document structure for MongoDB
326
- # document = {
327
- # "file_key": file_key,
328
- # "file_type": content_type,
329
- # "document_type": document_type,
330
- # "entityrefkey": entity_ref_key,
331
- # "base64DataResp": base64DataResp,
332
- # "extracted_text": full_text,
333
- # "summary": summary,
334
- # }
335
-
336
- # # Insert into MongoDB
337
- # inserted_doc = invoice_collection.insert_one(document)
338
- # document_id = str(inserted_doc.inserted_id) # Convert ObjectId to string
339
-
340
- # return {
341
- # "message": "Document successfully stored in MongoDB",
342
- # "document_id": document_id,
343
- # "file_key": file_key,
344
- # "summary": summary
345
- # }
346
-
347
- # except Exception as e:
348
- # # Detailed error information
349
- # error_details = {
350
- # "error_type": type(e).__name__,
351
- # "error_message": str(e),
352
- # "traceback": traceback.format_exc()
353
- # }
354
- # return {"error": error_details}
355
-
356
  @app.get("/ocr/extraction")
357
  def ocr_from_s3(
358
  api_key: str = Depends(verify_api_key),
 
82
 
83
  # Function to summarize text using OpenAI GPT
84
  def summarize_text(text):
85
+ system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice into a standardized JSON format. The invoice headers should include the following details:
86
+
87
+ - **Vendor Information**:
88
+ - Vendor Name
89
+ - Vendor Address
90
+ - Vendor GST No.
91
+
92
+ - **Invoice Details**:
93
+ - Invoice No.
94
+ - Invoice Date (formatted as dd-MMM-yyyy)
95
+ - Invoice Currency (e.g., USD, INR, etc.)
96
+ - Base Amount/Amount
97
+ - Tax Amount
98
+ - Total Invoice Amount
99
+ - Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
100
+
101
+ - **Customer Information**:
102
+ - Customer Name
103
+ - Customer Address
104
+ - Customer GST No.
105
+
106
+ - **Shipping and References**:
107
+ - MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
108
+ - Shipping Order
109
+
110
+ You should extract this data and structure it into the following JSON format:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
 
112
  {
113
  "response_format": {
 
124
  "type": "array",
125
  "items": {
126
  "type": "object",
127
+ "required": ["ProductCode", "Description", "Amount"],
 
 
 
 
128
  "properties": {
129
  "Amount": {
130
  "type": "number",
131
+ "description": "Amount for the line item"
 
132
  },
133
  "Description": {
134
  "type": "string",
135
+ "description": "Description of the line item"
 
136
  },
137
  "ProductCode": {
138
  "type": "string",
139
+ "description": "Product or service code for the line item"
 
140
  }
141
  },
142
  "additionalProperties": false
143
  },
144
+ "title": "Line Items"
 
145
  },
146
  "TaxAmount": {
147
  "type": "number",
148
+ "description": "Total tax amount for the invoice"
 
149
  },
150
  "VendorGST": {
151
  "type": "string",
152
+ "description": "Vendor's GST number"
 
153
  },
154
  "VendorName": {
155
  "type": "string",
156
+ "description": "Name of the vendor"
 
157
  },
158
  "InvoiceDate": {
159
  "type": "string",
160
+ "format": "date",
161
+ "description": "Invoice date in dd-MMM-yyyy format"
162
  },
163
  "TotalAmount": {
164
  "type": "number",
165
+ "description": "Total amount for the invoice"
 
166
  },
167
  "InvoiceNumber": {
168
  "type": "string",
169
+ "description": "Invoice number"
 
170
  },
171
  "VendorAddress": {
172
  "type": "string",
173
+ "description": "Vendor's address"
 
174
  },
175
  "InvoiceCurrency": {
176
  "type": "string",
177
+ "description": "Currency used in the invoice"
 
178
  }
179
  },
180
  "required": [
181
+ "LineItems", "TaxAmount", "VendorGST", "VendorName",
182
+ "InvoiceDate", "TotalAmount", "InvoiceNumber", "VendorAddress", "InvoiceCurrency"
 
 
 
 
 
 
 
183
  ],
184
  "additionalProperties": false,
185
  "description": "Schema for extracting specific information from invoices"
 
188
  }
189
  }
190
 
191
+ ### Guidelines for Processing:
192
+ - Extract all relevant data from the invoice using naming conventions such as "Bill to" or "Taxpayer Name" for Vendor and Customer info.
193
+ - Convert the **Invoice Date** to dd-MMM-yyyy format.
194
+ - Ensure correct handling of amounts (e.g., **Amount**, **Tax Amount**, **Total Invoice Amount**).
195
+ - For line items, include descriptions, tax percentages, tax amounts, and amounts.
196
+ - If certain values are missing, leave them empty or set them as null.
197
+ - This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. No extra content should not provide."""
 
 
198
  try:
199
  response = openai.ChatCompletion.create(
200
  model="gpt-4o-mini",
 
231
  def read_root():
232
  return {"message": "Welcome to the Invoice Summarization API!"}
233
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
234
  @app.get("/ocr/extraction")
235
  def ocr_from_s3(
236
  api_key: str = Depends(verify_api_key),