vkumartr committed on
Commit
682c3df
·
verified ·
1 Parent(s): 6b32371

Modified based on JSON Schema

Browse files
Files changed (1) hide show
  1. app.py +158 -174
app.py CHANGED
@@ -19,6 +19,7 @@ from dotenv import load_dotenv
19
  import base64
20
  from bson.objectid import ObjectId
21
 
 
22
  load_dotenv()
23
 
24
  # Set up logging
@@ -30,28 +31,28 @@ MONGODB_URI = os.getenv("MONGODB_URI")
30
  DATABASE_NAME = os.getenv("DATABASE_NAME")
31
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
32
 
33
- app = FastAPI(docs_url='/')
34
- use_gpu = False
35
- output_dir = 'output'
36
 
37
  # Check if environment variables are set
38
  if not MONGODB_URI:
39
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
40
 
41
  # Initialize MongoDB Connection
42
- client = MongoClient(MONGODB_URI)
43
- db = client[DATABASE_NAME]
44
  invoice_collection = db[COLLECTION_NAME]
 
 
 
45
 
46
  @app.on_event("startup")
47
  def startup_db():
48
  try:
49
- # Create a MongoDB client and fetch server info to verify connection
50
- client = MongoClient(MONGODB_URI)
51
- server_info = client.server_info() # This is a blocking call
52
- print("MongoDB connection successful:", server_info)
53
  except Exception as e:
54
- print("MongoDB connection failed:", str(e))
55
 
56
  # AWS S3 Configuration
57
  API_KEY = os.getenv("API_KEY")
@@ -70,7 +71,7 @@ s3_client = boto3.client(
70
  )
71
 
72
  # Function to fetch file from S3
73
- def fetch_file_from_s3_file(file_key):
74
  try:
75
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
76
  content_type = response['ContentType'] # Retrieve MIME type
@@ -80,146 +81,153 @@ def fetch_file_from_s3_file(file_key):
80
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
81
 
82
  # Function to summarize text using OpenAI GPT
83
- def summarize_text(text):
84
- system_prompt = """You are tasked with extracting and structuring all relevant information from an invoice into a standardized JSON format. The invoice headers should include the following details:
85
-
86
- - **Vendor Information**:
87
- - Vendor Name
88
- - Vendor Address
89
- - Vendor GST No.
90
-
91
- - **Invoice Details**:
92
- - Invoice No.
93
- - Invoice Date (formatted as dd-MMM-yyyy)
94
- - Invoice Currency (e.g., USD, INR, etc.)
95
- - Base Amount/Amount
96
- - Tax Amount
97
- - Total Invoice Amount
98
- - Type of Invoice (e.g., "Tax Invoice", "Proforma Invoice", etc.)
99
-
100
- - **Customer Information**:
101
- - Customer Name
102
- - Customer Address
103
- - Customer GST No.
104
-
105
- - **Shipping and References**:
106
- - MBL No./HBL No./Container No./Shipping Bill No./Shipper Invoice No./Manifest No./MAWB/HAWB/OBL No./Bill of Lading Number/REF/Ocean Bill of Lading/House Bill of Lading/BL No./Job No. → Considered as RefNo.
107
- - Shipping Order
108
-
109
- You should extract this data and structure it into the following JSON format:
110
-
111
- {
112
- "response_format": {
113
- "type": "json_schema",
114
- "json_schema": {
115
- "name": "invoice",
116
- "strict": true,
117
- "schema": {
118
- "type": "object",
119
- "title": "Invoice Information Extractor",
120
- "$schema": "http://json-schema.org/draft-07/schema#",
121
- "properties": {
122
- "LineItems": {
123
- "type": "array",
124
- "items": {
125
- "type": "object",
126
- "required": ["ProductCode", "Description", "Amount"],
127
- "properties": {
128
- "Amount": {
129
- "type": "number",
130
- "description": "Amount for the line item"
131
- },
132
- "Description": {
133
- "type": "string",
134
- "description": "Description of the line item"
135
- },
136
- "ProductCode": {
137
- "type": "string",
138
- "description": "Product or service code for the line item"
139
- }
140
- },
141
- "additionalProperties": false
142
- },
143
- "title": "Line Items"
144
- },
145
- "TaxAmount": {
146
- "type": "number",
147
- "description": "Total tax amount for the invoice"
148
- },
149
- "VendorGST": {
150
- "type": "string",
151
- "description": "Vendor's GST number"
152
- },
153
- "VendorName": {
154
- "type": "string",
155
- "description": "Name of the vendor"
156
- },
157
- "InvoiceDate": {
158
- "type": "string",
159
- "format": "date",
160
- "description": "Invoice date in dd-MMM-yyyy format"
161
- },
162
- "TotalAmount": {
163
- "type": "number",
164
- "description": "Total amount for the invoice"
165
- },
166
- "InvoiceNumber": {
167
- "type": "string",
168
- "description": "Invoice number"
169
- },
170
- "VendorAddress": {
171
- "type": "string",
172
- "description": "Vendor's address"
173
- },
174
- "InvoiceCurrency": {
175
- "type": "string",
176
- "description": "Currency used in the invoice"
177
- }
178
- },
179
- "required": [
180
- "LineItems", "TaxAmount", "VendorGST", "VendorName",
181
- "InvoiceDate", "TotalAmount", "InvoiceNumber", "VendorAddress", "InvoiceCurrency"
182
- ],
183
- "additionalProperties": false,
184
- "description": "Schema for extracting specific information from invoices"
185
- }
186
- }
187
- }
188
- }
189
-
190
- ### Guidelines for Processing:
191
- - Extract all relevant data from the invoice using naming conventions such as "Bill to" or "Taxpayer Name" for Vendor and Customer info.
192
- - Convert the **Invoice Date** to dd-MMM-yyyy format.
193
- - Ensure correct handling of amounts (e.g., **Amount**, **Tax Amount**, **Total Invoice Amount**).
194
- - For line items, include descriptions, tax percentages, tax amounts, and amounts.
195
- - If certain values are missing, leave them empty or set them as null.
196
- - This JSON format will be used to store and manage invoices in a structured and uniform way. Please ensure only return JSON format. No extra content should not provide."""
197
  try:
198
  response = openai.ChatCompletion.create(
199
  model="gpt-4o-mini",
200
  messages=[
201
  {"role": "system", "content": system_prompt},
202
- {"role": "user", "content": f"{text}"}
 
 
 
 
 
 
 
 
 
 
203
  ],
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
204
  temperature=0.5,
205
  max_tokens=16384
206
  )
 
 
207
  content = response.choices[0].message.content.strip()
208
- print("Before content:", content)
209
- cleaned_content = re.sub(r'^.*```json\n', '', content) # Remove '```json\n' at the beginning
210
- cleaned_content = re.sub(r'\n```$', '', cleaned_content) # Remove '\n```' at the end
211
 
212
- # Step 2: Parse the cleaned content as JSON
213
  try:
214
  parsed_content = json.loads(cleaned_content)
215
  return parsed_content
216
  except json.JSONDecodeError as e:
217
- print("Error parsing JSON:", e)
218
- # Optionally, print the cleaned content to debug
219
- print("Cleaned content:", cleaned_content)
220
  return None
 
221
  except Exception as e:
222
- return f"Error in summarization: {str(e)}"
 
223
 
224
  # Dependency to check API Key
225
  def verify_api_key(api_key: str = Header(...)):
@@ -239,50 +247,27 @@ def extract_text_from_file(
239
  ):
240
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
241
  try:
242
- existing_document = invoice_collection.find_one({"entityrefkey":entity_ref_key})
243
 
244
- if(existing_document):
245
  existing_document["_id"] = str(existing_document["_id"])
246
  return {
247
  "message": "Document Retrieved from MongoDB.",
248
  "document": existing_document
249
  }
250
- # Fetch file from S3
251
- file_data, content_type = fetch_file_from_s3_file(file_key)
252
-
253
- extracted_text = []
254
- base64DataResp = None
255
- summary = None
256
-
257
- if content_type.startswith("image/"): # Image file
258
- image = Image.open(io.BytesIO(file_data)).convert("RGB")
259
- extracted_text.append(pytesseract.image_to_string(image))
260
-
261
- # If single image, store Base64
262
- base64Data = base64.b64encode(file_data).decode('utf-8')
263
- base64DataResp = f"data:image/{content_type.lower()};base64,{base64Data}"
264
 
265
- elif content_type == "application/pdf": # PDF file
266
- pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
267
- num_pages = len(pdf_document)
268
 
269
- for page_number in range(num_pages):
270
- page = pdf_document[page_number]
271
- extracted_text.append(page.get_text("text"))
272
 
273
- pdf_document.close()
274
-
275
- # If 2 pages or less, store Base64
276
- if num_pages <= 2:
277
- base64Data = base64.b64encode(file_data).decode('utf-8')
278
- base64DataResp = f"data:application/pdf;base64,{base64Data}"
279
-
280
- # If 2 pages or less, generate summary
281
- if num_pages <= 2:
282
- full_text = " ".join(extracted_text)
283
- summary = summarize_text(full_text)
284
- else:
285
- return {"error": f"Unsupported file type: {content_type}"}
286
 
287
  # Store extracted data in MongoDB
288
  document = {
@@ -290,20 +275,20 @@ def extract_text_from_file(
290
  "file_type": content_type,
291
  "document_type": document_type,
292
  "entityrefkey": entity_ref_key,
293
- "num_pages": len(extracted_text), # Store page count
294
  "base64DataResp": base64DataResp, # Only for small files
295
- "extracted_text": " ".join(extracted_text),
296
  "summary": summary, # Only for small files
297
  }
298
 
299
  inserted_doc = invoice_collection.insert_one(document)
300
- document_id = str(inserted_doc.inserted_id)
301
 
302
  return {
303
  "message": "Document successfully stored in MongoDB",
304
  "document_id": document_id,
305
  "file_key": file_key,
306
- "num_pages": len(extracted_text),
307
  "summary": summary if summary else "Skipped for large documents"
308
  }
309
 
@@ -314,7 +299,6 @@ def extract_text_from_file(
314
  "traceback": traceback.format_exc()
315
  }
316
  return {"error": error_details}
317
-
318
 
319
  # Serve the output folder as static files
320
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 
19
  import base64
20
  from bson.objectid import ObjectId
21
 
22
+ db_client = None
23
  load_dotenv()
24
 
25
  # Set up logging
 
31
  DATABASE_NAME = os.getenv("DATABASE_NAME")
32
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
33
 
34
+ # use_gpu = False
35
+ # output_dir = 'output'
 
36
 
37
  # Check if environment variables are set
38
  if not MONGODB_URI:
39
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
40
 
41
  # Initialize MongoDB Connection
42
+ db_client = MongoClient(MONGODB_URI)
43
+ db = db_client[DATABASE_NAME]
44
  invoice_collection = db[COLLECTION_NAME]
45
+ openai.api_key = OPENAI_API_KEY
46
+
47
+ app = FastAPI(docs_url='/')
48
 
49
  @app.on_event("startup")
50
  def startup_db():
51
  try:
52
+ db_client.server_info()
53
+ logger.info("MongoDB connection successful")
 
 
54
  except Exception as e:
55
+ logger.error(f"MongoDB connection failed: {str(e)}")
56
 
57
  # AWS S3 Configuration
58
  API_KEY = os.getenv("API_KEY")
 
71
  )
72
 
73
  # Function to fetch file from S3
74
+ def fetch_file_from_s3(file_key):
75
  try:
76
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
77
  content_type = response['ContentType'] # Retrieve MIME type
 
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
  # Function to summarize text using OpenAI GPT
84
+ def extract_invoice_data(file_data, content_type):
85
+ system_prompt = "You are an expert in document data extraction."
86
+
87
+ # Convert file to Base64
88
+ base64_encoded = base64.b64encode(file_data).decode('utf-8')
89
+
90
+ # Determine the correct MIME type for OpenAI
91
+ if content_type.startswith("image/"):
92
+ mime_type = content_type # e.g., image/png, image/jpeg
93
+ elif content_type == "application/pdf":
94
+ mime_type = "application/pdf"
95
+ else:
96
+ raise ValueError(f"Unsupported content type: {content_type}")
97
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  try:
99
  response = openai.ChatCompletion.create(
100
  model="gpt-4o-mini",
101
  messages=[
102
  {"role": "system", "content": system_prompt},
103
+ {
104
+ "role": "user",
105
+ "content": [
106
+ {
107
+ "type": "image_url",
108
+ "image_url": {
109
+ "url": f"data:{mime_type};base64,{base64_encoded}"
110
+ }
111
+ }
112
+ ]
113
+ }
114
  ],
115
+ response_format={
116
+ "type": "json_schema",
117
+ "json_schema": {
118
+ "name": "invoice",
119
+ "strict": True,
120
+ "schema": {
121
+ "type": "object",
122
+ "title": "Invoice Information Extractor",
123
+ "$schema": "http://json-schema.org/draft-07/schema#",
124
+ "properties": {
125
+ "LineItems": {
126
+ "type": "array",
127
+ "items": {
128
+ "type": "object",
129
+ "required": [
130
+ "ProductCode",
131
+ "Description",
132
+ "Amount"
133
+ ],
134
+ "properties": {
135
+ "ProductCode": {
136
+ "type": "string",
137
+ "title": "Product Code",
138
+ "description": "The code of the product"
139
+ },
140
+ "Description": {
141
+ "type": "string",
142
+ "title": "Description",
143
+ "description": "Description of the product"
144
+ },
145
+ "Amount": {
146
+ "type": "number",
147
+ "title": "Amount",
148
+ "description": "The amount of the product"
149
+ }
150
+ },
151
+ "additionalProperties": False
152
+ },
153
+ "title": "Line Items",
154
+ "description": "List of line items on the invoice"
155
+ },
156
+ "TaxAmount": {
157
+ "type": "number",
158
+ "title": "Tax Amount",
159
+ "description": "The tax amount on the invoice"
160
+ },
161
+ "VendorGST": {
162
+ "type": "string",
163
+ "title": "Vendor GST",
164
+ "description": "The GST number of the vendor"
165
+ },
166
+ "VendorName": {
167
+ "type": "string",
168
+ "title": "Vendor Name",
169
+ "description": "The name of the vendor"
170
+ },
171
+ "InvoiceDate": {
172
+ "type": "string",
173
+ "title": "Invoice Date",
174
+ "description": "The date of the invoice (format: dd-MMM-yyyy)"
175
+ },
176
+ "TotalAmount": {
177
+ "type": "number",
178
+ "title": "Total Amount",
179
+ "description": "The total amount on the invoice"
180
+ },
181
+ "InvoiceNumber": {
182
+ "type": "string",
183
+ "title": "Invoice Number",
184
+ "description": "The number of the invoice"
185
+ },
186
+ "VendorAddress": {
187
+ "type": "string",
188
+ "title": "Vendor Address",
189
+ "description": "The address of the vendor"
190
+ },
191
+ "InvoiceCurrency": {
192
+ "type": "string",
193
+ "title": "Invoice Currency",
194
+ "description": "The currency used in the invoice (e.g., USD, INR, AUD)"
195
+ }
196
+ },
197
+ "required": [
198
+ "LineItems",
199
+ "TaxAmount",
200
+ "VendorGST",
201
+ "VendorName",
202
+ "InvoiceDate",
203
+ "TotalAmount",
204
+ "InvoiceNumber",
205
+ "VendorAddress",
206
+ "InvoiceCurrency"
207
+ ],
208
+ "additionalProperties": False,
209
+ "description": "Schema for extracting structured invoice data"
210
+ }
211
+ }
212
+ },
213
  temperature=0.5,
214
  max_tokens=16384
215
  )
216
+
217
+ # Clean and parse JSON output
218
  content = response.choices[0].message.content.strip()
219
+ cleaned_content = content.strip().strip('```json').strip('```')
 
 
220
 
 
221
  try:
222
  parsed_content = json.loads(cleaned_content)
223
  return parsed_content
224
  except json.JSONDecodeError as e:
225
+ logger.error(f"JSON Parse Error: {e}")
 
 
226
  return None
227
+
228
  except Exception as e:
229
+ logger.error(f"Error in data extraction: {e}")
230
+ return {"error": str(e)}
231
 
232
  # Dependency to check API Key
233
  def verify_api_key(api_key: str = Header(...)):
 
247
  ):
248
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
249
  try:
250
+ existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
251
 
252
+ if existing_document:
253
  existing_document["_id"] = str(existing_document["_id"])
254
  return {
255
  "message": "Document Retrieved from MongoDB.",
256
  "document": existing_document
257
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
 
259
+ # Retrieve file from S3 and determine content type (Ensure this step is implemented)
260
+ content_type = get_content_type_from_s3(file_key) # Implement this function
 
261
 
262
+ # Extract text (Ensure Extraction function is implemented)
263
+ extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
 
264
 
265
+ # Define values for small/large files
266
+ base64DataResp = None
267
+ summary = None
268
+ if num_pages <= 2:
269
+ base64DataResp = convert_to_base64(file_key) # Implement this function
270
+ summary = generate_summary(extracted_text) # Implement this function
 
 
 
 
 
 
 
271
 
272
  # Store extracted data in MongoDB
273
  document = {
 
275
  "file_type": content_type,
276
  "document_type": document_type,
277
  "entityrefkey": entity_ref_key,
278
+ "num_pages": num_pages,
279
  "base64DataResp": base64DataResp, # Only for small files
280
+ "extracted_text": extracted_text,
281
  "summary": summary, # Only for small files
282
  }
283
 
284
  inserted_doc = invoice_collection.insert_one(document)
285
+ document_id = str(inserted_doc.inserted_id)
286
 
287
  return {
288
  "message": "Document successfully stored in MongoDB",
289
  "document_id": document_id,
290
  "file_key": file_key,
291
+ "num_pages": num_pages,
292
  "summary": summary if summary else "Skipped for large documents"
293
  }
294
 
 
299
  "traceback": traceback.format_exc()
300
  }
301
  return {"error": error_details}
 
302
 
303
  # Serve the output folder as static files
304
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")