Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,6 +18,7 @@ import re
|
|
| 18 |
import json
|
| 19 |
from dotenv import load_dotenv
|
| 20 |
import uvicorn
|
|
|
|
| 21 |
|
| 22 |
load_dotenv()
|
| 23 |
|
|
@@ -55,7 +56,7 @@ def fetch_file_from_s3_file(file_key):
|
|
| 55 |
response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
|
| 56 |
content_type = response['ContentType'] # Retrieve MIME type
|
| 57 |
file_data = response['Body'].read()
|
| 58 |
-
return
|
| 59 |
except Exception as e:
|
| 60 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 61 |
|
|
@@ -102,13 +103,13 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
|
|
| 102 |
file_data, content_type = fetch_file_from_s3_file(file_key)
|
| 103 |
|
| 104 |
extracted_text = []
|
| 105 |
-
|
| 106 |
# Determine file type based on MIME type
|
| 107 |
if content_type.startswith("image/"): # Image file
|
| 108 |
-
image = Image.open(file_data).convert("RGB") # Use BytesIO stream directly
|
| 109 |
image_np = np.array(image) # Convert to NumPy array
|
| 110 |
result = ocr.ocr(image_np, cls=True)
|
| 111 |
-
|
| 112 |
# Extract text from OCR results
|
| 113 |
for line in result:
|
| 114 |
for word_info in line:
|
|
@@ -116,7 +117,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
|
|
| 116 |
|
| 117 |
elif content_type == "application/pdf": # PDF file
|
| 118 |
# Open PDF using PyMuPDF
|
| 119 |
-
pdf_document = fitz.open(stream=file_data, filetype="pdf")
|
| 120 |
|
| 121 |
extracted_text = []
|
| 122 |
|
|
@@ -138,6 +139,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
|
|
| 138 |
extracted_text.append(word_info[1][0])
|
| 139 |
|
| 140 |
pdf_document.close()
|
|
|
|
| 141 |
else:
|
| 142 |
return {"error": f"Unsupported file type: {content_type}"}
|
| 143 |
|
|
@@ -150,6 +152,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
|
|
| 150 |
return {
|
| 151 |
"file_key": file_key,
|
| 152 |
"file_type": content_type,
|
|
|
|
| 153 |
"extracted_text": full_text,
|
| 154 |
"summary": summary
|
| 155 |
}
|
|
|
|
| 18 |
import json
|
| 19 |
from dotenv import load_dotenv
|
| 20 |
import uvicorn
|
| 21 |
+
import base64
|
| 22 |
|
| 23 |
load_dotenv()
|
| 24 |
|
|
|
|
| 56 |
response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
|
| 57 |
content_type = response['ContentType'] # Retrieve MIME type
|
| 58 |
file_data = response['Body'].read()
|
| 59 |
+
return file_data, content_type # Return the raw data read from the S3 body (not a BytesIO; callers wrap it) and its ContentType
|
| 60 |
except Exception as e:
|
| 61 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 62 |
|
|
|
|
| 103 |
file_data, content_type = fetch_file_from_s3_file(file_key)
|
| 104 |
|
| 105 |
extracted_text = []
|
| 106 |
+
# Base64-encode the fetched file once; the name must be `base64_data` because that is what the image and PDF branches interpolate into the data URI
base64_data = base64.b64encode(file_data).decode('utf-8')
|
| 107 |
# Determine file type based on MIME type
|
| 108 |
if content_type.startswith("image/"): # Image file
|
| 109 |
+
image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
|
| 110 |
image_np = np.array(image) # Convert to NumPy array
|
| 111 |
result = ocr.ocr(image_np, cls=True)
|
| 112 |
+
# content_type already starts with "image/" (checked by the enclosing branch), so use it as the full media type instead of prefixing a second "image/"
base64DataResp = f"data:{content_type.lower()};base64,{base64_data}"
|
| 113 |
# Extract text from OCR results
|
| 114 |
for line in result:
|
| 115 |
for word_info in line:
|
|
|
|
| 117 |
|
| 118 |
elif content_type == "application/pdf": # PDF file
|
| 119 |
# Open PDF using PyMuPDF
|
| 120 |
+
pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
|
| 121 |
|
| 122 |
extracted_text = []
|
| 123 |
|
|
|
|
| 139 |
extracted_text.append(word_info[1][0])
|
| 140 |
|
| 141 |
pdf_document.close()
|
| 142 |
+
base64DataResp = f"data:application/pdf;base64,{base64_data}"
|
| 143 |
else:
|
| 144 |
return {"error": f"Unsupported file type: {content_type}"}
|
| 145 |
|
|
|
|
| 152 |
return {
|
| 153 |
"file_key": file_key,
|
| 154 |
"file_type": content_type,
|
| 155 |
+
"base64DataResp":base64DataResp,
|
| 156 |
"extracted_text": full_text,
|
| 157 |
"summary": summary
|
| 158 |
}
|