vkumartr committed on
Commit
6b32371
·
verified ·
1 Parent(s): aa47259

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +11 -4
app.py CHANGED
@@ -5,7 +5,6 @@ from enum import Enum
5
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
  from PIL import Image
7
  import io
8
- import numpy as np
9
  import fitz # PyMuPDF for PDF handling
10
  import logging
11
  from pymongo import MongoClient
@@ -240,6 +239,14 @@ def extract_text_from_file(
240
  ):
241
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
242
  try:
 
 
 
 
 
 
 
 
243
  # Fetch file from S3
244
  file_data, content_type = fetch_file_from_s3_file(file_key)
245
 
@@ -248,8 +255,8 @@ def extract_text_from_file(
248
  summary = None
249
 
250
  if content_type.startswith("image/"): # Image file
251
- # image = Image.open(io.BytesIO(file_data)).convert("RGB")
252
- # extracted_text.append(pytesseract.image_to_string(image))
253
 
254
  # If single image, store Base64
255
  base64Data = base64.b64encode(file_data).decode('utf-8')
@@ -290,7 +297,7 @@ def extract_text_from_file(
290
  }
291
 
292
  inserted_doc = invoice_collection.insert_one(document)
293
- document_id = str(inserted_doc.inserted_id)
294
 
295
  return {
296
  "message": "Document successfully stored in MongoDB",
 
5
  from fastapi import FastAPI, Header, Query, Depends, HTTPException
6
  from PIL import Image
7
  import io
 
8
  import fitz # PyMuPDF for PDF handling
9
  import logging
10
  from pymongo import MongoClient
 
239
  ):
240
  """Extract text from a PDF or Image stored in S3 and process it based on document size."""
241
  try:
242
+ existing_document = invoice_collection.find_one({"entityrefkey":entity_ref_key})
243
+
244
+ if(existing_document):
245
+ existing_document["_id"] = str(existing_document["_id"])
246
+ return {
247
+ "message": "Document Retrieved from MongoDB.",
248
+ "document": existing_document
249
+ }
250
  # Fetch file from S3
251
  file_data, content_type = fetch_file_from_s3_file(file_key)
252
 
 
255
  summary = None
256
 
257
  if content_type.startswith("image/"): # Image file
258
+ image = Image.open(io.BytesIO(file_data)).convert("RGB")
259
+ extracted_text.append(pytesseract.image_to_string(image))
260
 
261
  # If single image, store Base64
262
  base64Data = base64.b64encode(file_data).decode('utf-8')
 
297
  }
298
 
299
  inserted_doc = invoice_collection.insert_one(document)
300
+ document_id = str(inserted_doc.inserted_id)
301
 
302
  return {
303
  "message": "Document successfully stored in MongoDB",