vkumartr committed on
Commit
b27007b
·
verified ·
1 Parent(s): d11faed

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -8
app.py CHANGED
@@ -176,7 +176,7 @@ def extract_text_from_file(
176
  document_type: str = Query(..., description="Type of document"),
177
  entity_ref_key: str = Query(..., description="Entity Reference Key")
178
  ):
179
- """Extract text from a PDF or Image stored in S3 and process it based on document size."""
180
  try:
181
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
182
  if existing_document:
@@ -185,7 +185,8 @@ def extract_text_from_file(
185
  "message": "Document Retrieved from MongoDB.",
186
  "document": existing_document
187
  }
188
- # Fetch dynamic schema based on document type
 
189
  schema_doc = schema_collection.find_one({"document_type": document_type})
190
  if not schema_doc:
191
  raise ValueError("No schema found for the given document type")
@@ -193,18 +194,20 @@ def extract_text_from_file(
193
  json_schema = schema_doc.get("json_schema")
194
  if not json_schema:
195
  raise ValueError("Schema is empty or not properly defined.")
196
-
197
- # Retrieve file from S3 and determine content type
198
  content_type = get_content_type_from_s3(file_key)
199
  file_data, _ = fetch_file_from_s3(file_key)
200
- extracted_data,base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
201
 
202
- # Build document for insertion
 
 
 
203
  document = {
204
  "file_key": file_key,
205
  "file_type": content_type,
206
  "document_type": document_type,
207
- "base64dataResp":base64dataresp,
208
  "entityrefkey": entity_ref_key,
209
  "extracted_data": extracted_data
210
  }
@@ -221,7 +224,7 @@ def extract_text_from_file(
221
  "message": "Document successfully stored in MongoDB",
222
  "document_id": document_id,
223
  "entityrefkey": entity_ref_key,
224
- "base64dataResp":base64dataresp,
225
  "extracted_data": extracted_data
226
  }
227
 
 
176
  document_type: str = Query(..., description="Type of document"),
177
  entity_ref_key: str = Query(..., description="Entity Reference Key")
178
  ):
179
+ """Extract text from a PDF or Image stored in S3 and process it accordingly."""
180
  try:
181
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
182
  if existing_document:
 
185
  "message": "Document Retrieved from MongoDB.",
186
  "document": existing_document
187
  }
188
+
189
+ # Fetch schema for the document type
190
  schema_doc = schema_collection.find_one({"document_type": document_type})
191
  if not schema_doc:
192
  raise ValueError("No schema found for the given document type")
 
194
  json_schema = schema_doc.get("json_schema")
195
  if not json_schema:
196
  raise ValueError("Schema is empty or not properly defined.")
197
+
198
+ # Retrieve file from S3
199
  content_type = get_content_type_from_s3(file_key)
200
  file_data, _ = fetch_file_from_s3(file_key)
 
201
 
202
+ # Extract data from the document (PDF or Image)
203
+ extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
204
+
205
+ # Build and store document in MongoDB
206
  document = {
207
  "file_key": file_key,
208
  "file_type": content_type,
209
  "document_type": document_type,
210
+ "base64dataResp": base64dataresp,
211
  "entityrefkey": entity_ref_key,
212
  "extracted_data": extracted_data
213
  }
 
224
  "message": "Document successfully stored in MongoDB",
225
  "document_id": document_id,
226
  "entityrefkey": entity_ref_key,
227
+ "base64dataResp": base64dataresp,
228
  "extracted_data": extracted_data
229
  }
230