kmuthudurai committed on
Commit
a5afdd2
·
verified ·
1 Parent(s): f95c2fe

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +65 -5
app.py CHANGED
@@ -62,7 +62,67 @@ def fetch_file_from_s3_file(file_key):
62
 
63
  # Function to summarize text using OpenAI GPT
64
  def summarize_text(text):
65
- system_prompt = "You are a helpful assistant that summarizes extracted Invoice OCR text into JSON format always."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
66
  try:
67
  response = openai.ChatCompletion.create(
68
  model="gpt-4o-mini",
@@ -74,14 +134,14 @@ def summarize_text(text):
74
  max_tokens=16384
75
  )
76
  content = response.choices[0].message.content.strip()
77
- cleaned_content = re.sub(r'^```json\n', '', content) # Remove '```json\n' at the beginning
78
- cleaned_content = re.sub(r'\n```$', '', cleaned_content) # Remove '\n```' at the end
79
 
80
  # Step 2: Parse the cleaned content as JSON
81
- parsed_content = json.loads(cleaned_content)
82
 
83
  # Step 3: Print the parsed JSON object
84
- return parsed_content
85
  except Exception as e:
86
  return f"Error in summarization: {str(e)}"
87
  # Dependency to check API Key
 
62
 
63
  # Function to summarize text using OpenAI GPT
64
  def summarize_text(text):
65
+ system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice in a standardized JSON format for storing invoice headers and line items. The invoice headers should include the following details:
66
+
67
+ Vendor Information:
68
+
69
+ Vendor Name
70
+ Vendor Address
71
+ Vendor GST No.
72
+ Invoice Details:
73
+
74
+ Invoice No./Bill No./Consecutive Serial No./Serial No. of Invoice/INVOICE → Considered as InvoiceNo.
75
+ Invoice Date/Date/Date of Supply/Bill Date/Issuing Date/Dated → Considered as InvoiceDate (formatted as dd-MMM-yyyy).
76
+ Invoice Currency/Currency
77
+ Base Amount/Amount
78
+ Tax Amount
79
+ Total Invoice Amount
80
+ Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
81
+ Billing Party Information:
82
+
83
+ Invoice Party/Bill To Name/Sold-to-Party/Taxpayer Name/M/s./CB No./Buyer (Bill to)/Billing Party/Customer Name & Address/Name → Considered as BillToName.
84
+ Invoice Party to / Bill To Address
85
+ Invoice Party to / Bill To GST No.
86
+ Shipping and References:
87
+
88
+ MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
89
+ Shipping Order
90
+ You should extract this data and structure it into a table-like format in the following JSON format:
91
+ {
92
+ "invoice_headers": {
93
+ "VendorName": "",
94
+ "VendorAddress": "",
95
+ "VendorGSTNo": "",
96
+ "InvoiceNo": "",
97
+ "InvoiceDate": "",
98
+ "InvoiceCurrency": "",
99
+ "BaseAmount": "",
100
+ "TaxAmount": "",
101
+ "TotalInvoiceAmt": "",
102
+ "TypeofInvoice": "",
103
+ "BillToName": "",
104
+ "BillToAddress": "",
105
+ "BillToGSTNO": "",
106
+ "RefNo": "",
107
+ "ShippingOrder": ""
108
+ },
109
+ "line_items": [
110
+ {
111
+ "Description": "",
112
+ "TaxPercentage": "",
113
+ "TaxAmount": "",
114
+ "Amount": 0
115
+ }
116
+ ]
117
+ }
118
+ Guidelines for Processing:
119
+
120
+ Ensure accurate extraction of data from the invoice by recognizing alternative naming conventions (e.g., Bill to, Taxpayer Name, etc.).
121
+ Convert the Invoice Date to the specified dd-MMM-yyyy format.
122
+ Use the correct currency and amounts for each invoice field.
123
+ For each line item, provide the Description, Tax Percentage, Tax Amount, and Amount.
124
+ If certain values are missing or not applicable, leave them empty or set them as null where necessary.
125
+ This JSON format will be used to store and manage invoices in a structured and uniform way."""
126
  try:
127
  response = openai.ChatCompletion.create(
128
  model="gpt-4o-mini",
 
134
  max_tokens=16384
135
  )
136
  content = response.choices[0].message.content.strip()
137
+ # cleaned_content = re.sub(r'^```json\n', '', content) # Remove '```json\n' at the beginning
138
+ #cleaned_content = re.sub(r'\n```$', '', cleaned_content) # Remove '\n```' at the end
139
 
140
  # Step 2: Parse the cleaned content as JSON
141
+ #parsed_content = json.loads(cleaned_content)
142
 
143
  # Step 3: Print the parsed JSON object
144
+ return content # parsed_content
145
  except Exception as e:
146
  return f"Error in summarization: {str(e)}"
147
  # Dependency to check API Key