vkumartr committed on
Commit
f66ab35
·
verified ·
1 Parent(s): 8b0fe14

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -20
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
 
4
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
5
  from PIL import Image
6
  import io
@@ -30,6 +31,9 @@ MONGODB_URI = os.getenv("MONGODB_URI")
30
  DATABASE_NAME = os.getenv("DATABASE_NAME")
31
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
32
 
 
 
 
33
  # Check if environment variables are set
34
  if not MONGODB_URI:
35
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
@@ -103,7 +107,7 @@ def extract_invoice_data(file_data, content_type):
103
  {
104
  "type": "image_url",
105
  "image_url": {
106
- "url": f"data:image/{mime_type};base64,{base64_encoded}"
107
  }
108
  }
109
  ]
@@ -213,7 +217,7 @@ def extract_invoice_data(file_data, content_type):
213
 
214
  # Clean and parse JSON output
215
  content = response.choices[0].message.content.strip()
216
- return json.loads(content)
217
 
218
  try:
219
  parsed_content = json.loads(cleaned_content)
@@ -226,15 +230,6 @@ def extract_invoice_data(file_data, content_type):
226
  logger.error(f"Error in data extraction: {e}")
227
  return {"error": str(e)}
228
 
229
- def get_content_type_from_s3(file_key):
230
- """Fetch the content type (MIME type) of a file stored in S3."""
231
- try:
232
- response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
233
- return response.get('ContentType', 'application/octet-stream') # Default to binary if not found
234
- except Exception as e:
235
- raise Exception(f"Failed to get content type from S3: {str(e)}")
236
-
237
-
238
  # Dependency to check API Key
239
  def verify_api_key(api_key: str = Header(...)):
240
  if api_key != API_KEY:
@@ -262,12 +257,18 @@ def extract_text_from_file(
262
  "document": existing_document
263
  }
264
 
265
- # Retrieve file from S3 and determine content type
266
- content_type = get_content_type_from_s3(file_key)
 
 
 
267
 
268
- # Extract and parse invoice data
269
- file_data, _ = fetch_file_from_s3(file_key)
270
- extracted_data = extract_invoice_data(file_data, content_type)
 
 
 
271
 
272
  # Store extracted data in MongoDB
273
  document = {
@@ -275,7 +276,10 @@ def extract_text_from_file(
275
  "file_type": content_type,
276
  "document_type": document_type,
277
  "entityrefkey": entity_ref_key,
278
- "extracted_data": extracted_data
 
 
 
279
  }
280
 
281
  inserted_doc = invoice_collection.insert_one(document)
@@ -284,8 +288,9 @@ def extract_text_from_file(
284
  return {
285
  "message": "Document successfully stored in MongoDB",
286
  "document_id": document_id,
287
- "entityrefkey":entity_ref_key,
288
- "extracted_data": extracted_data
 
289
  }
290
 
291
  except Exception as e:
@@ -295,7 +300,6 @@ def extract_text_from_file(
295
  "traceback": traceback.format_exc()
296
  }
297
  return {"error": error_details}
298
-
299
 
300
  # Serve the output folder as static files
301
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 
1
  import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
+ from enum import Enum
5
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
  from PIL import Image
7
  import io
 
31
  DATABASE_NAME = os.getenv("DATABASE_NAME")
32
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
33
 
34
+ # use_gpu = False
35
+ # output_dir = 'output'
36
+
37
  # Check if environment variables are set
38
  if not MONGODB_URI:
39
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
 
107
  {
108
  "type": "image_url",
109
  "image_url": {
110
+ "url": f"data:{mime_type};base64,{base64_encoded}"
111
  }
112
  }
113
  ]
 
217
 
218
  # Clean and parse JSON output
219
  content = response.choices[0].message.content.strip()
220
+ #cleaned_content = content.strip().strip('```json').strip('```')
221
 
222
  try:
223
  parsed_content = json.loads(cleaned_content)
 
230
  logger.error(f"Error in data extraction: {e}")
231
  return {"error": str(e)}
232
 
 
 
 
 
 
 
 
 
 
233
  # Dependency to check API Key
234
  def verify_api_key(api_key: str = Header(...)):
235
  if api_key != API_KEY:
 
257
  "document": existing_document
258
  }
259
 
260
+ # Retrieve file from S3 and determine content type (Ensure this step is implemented)
261
+ content_type = get_content_type_from_s3(file_key) # Implement this function
262
+
263
+ # Extract text (Ensure Extraction function is implemented)
264
+ extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
265
 
266
+ # Define values for small/large files
267
+ base64DataResp = None
268
+ summary = None
269
+ if num_pages <= 2:
270
+ base64DataResp = convert_to_base64(file_key) # Implement this function
271
+ summary = generate_summary(extracted_text) # Implement this function
272
 
273
  # Store extracted data in MongoDB
274
  document = {
 
276
  "file_type": content_type,
277
  "document_type": document_type,
278
  "entityrefkey": entity_ref_key,
279
+ "num_pages": num_pages,
280
+ "base64DataResp": base64DataResp, # Only for small files
281
+ "extracted_text": extracted_text,
282
+ "summary": summary, # Only for small files
283
  }
284
 
285
  inserted_doc = invoice_collection.insert_one(document)
 
288
  return {
289
  "message": "Document successfully stored in MongoDB",
290
  "document_id": document_id,
291
+ "file_key": file_key,
292
+ "num_pages": num_pages,
293
+ "summary": summary if summary else "Skipped for large documents"
294
  }
295
 
296
  except Exception as e:
 
300
  "traceback": traceback.format_exc()
301
  }
302
  return {"error": error_details}
 
303
 
304
  # Serve the output folder as static files
305
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")