Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,7 +5,6 @@ from enum import Enum
|
|
| 5 |
from fastapi import FastAPI, Header, Query, Depends, HTTPException
|
| 6 |
from PIL import Image
|
| 7 |
import io
|
| 8 |
-
import numpy as np
|
| 9 |
import fitz # PyMuPDF for PDF handling
|
| 10 |
import logging
|
| 11 |
from pymongo import MongoClient
|
|
@@ -240,6 +239,14 @@ def extract_text_from_file(
|
|
| 240 |
):
|
| 241 |
"""Extract text from a PDF or Image stored in S3 and process it based on document size."""
|
| 242 |
try:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
# Fetch file from S3
|
| 244 |
file_data, content_type = fetch_file_from_s3_file(file_key)
|
| 245 |
|
|
@@ -248,8 +255,8 @@ def extract_text_from_file(
|
|
| 248 |
summary = None
|
| 249 |
|
| 250 |
if content_type.startswith("image/"): # Image file
|
| 251 |
-
|
| 252 |
-
|
| 253 |
|
| 254 |
# If single image, store Base64
|
| 255 |
base64Data = base64.b64encode(file_data).decode('utf-8')
|
|
@@ -290,7 +297,7 @@ def extract_text_from_file(
|
|
| 290 |
}
|
| 291 |
|
| 292 |
inserted_doc = invoice_collection.insert_one(document)
|
| 293 |
-
document_id = str(inserted_doc.inserted_id)
|
| 294 |
|
| 295 |
return {
|
| 296 |
"message": "Document successfully stored in MongoDB",
|
|
|
|
| 5 |
from fastapi import FastAPI, Header, Query, Depends, HTTPException
|
| 6 |
from PIL import Image
|
| 7 |
import io
|
|
|
|
| 8 |
import fitz # PyMuPDF for PDF handling
|
| 9 |
import logging
|
| 10 |
from pymongo import MongoClient
|
|
|
|
| 239 |
):
|
| 240 |
"""Extract text from a PDF or Image stored in S3 and process it based on document size."""
|
| 241 |
try:
|
| 242 |
+
existing_document = invoice_collection.find_one({"entityrefkey":entity_ref_key})
|
| 243 |
+
|
| 244 |
+
if(existing_document):
|
| 245 |
+
existing_document["_id"] = str(existing_document["_id"])
|
| 246 |
+
return {
|
| 247 |
+
"message": "Document Retrieved from MongoDB.",
|
| 248 |
+
"document": existing_document
|
| 249 |
+
}
|
| 250 |
# Fetch file from S3
|
| 251 |
file_data, content_type = fetch_file_from_s3_file(file_key)
|
| 252 |
|
|
|
|
| 255 |
summary = None
|
| 256 |
|
| 257 |
if content_type.startswith("image/"): # Image file
|
| 258 |
+
image = Image.open(io.BytesIO(file_data)).convert("RGB")
|
| 259 |
+
extracted_text.append(pytesseract.image_to_string(image))
|
| 260 |
|
| 261 |
# If single image, store Base64
|
| 262 |
base64Data = base64.b64encode(file_data).decode('utf-8')
|
|
|
|
| 297 |
}
|
| 298 |
|
| 299 |
inserted_doc = invoice_collection.insert_one(document)
|
| 300 |
+
document_id = str(inserted_doc.inserted_id)
|
| 301 |
|
| 302 |
return {
|
| 303 |
"message": "Document successfully stored in MongoDB",
|