Spaces: Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import uvicorn
|
| 2 |
from fastapi.staticfiles import StaticFiles
|
| 3 |
import hashlib
|
|
|
|
| 4 |
from fastapi import FastAPI, Header, Query, Depends, HTTPException
|
| 5 |
from PIL import Image
|
| 6 |
import io
|
|
@@ -30,6 +31,9 @@ MONGODB_URI = os.getenv("MONGODB_URI")
|
|
| 30 |
DATABASE_NAME = os.getenv("DATABASE_NAME")
|
| 31 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
|
| 32 |
|
|
|
|
|
|
|
|
|
|
| 33 |
# Check if environment variables are set
|
| 34 |
if not MONGODB_URI:
|
| 35 |
raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
|
|
@@ -103,7 +107,7 @@ def extract_invoice_data(file_data, content_type):
|
|
| 103 |
{
|
| 104 |
"type": "image_url",
|
| 105 |
"image_url": {
|
| 106 |
-
"url": f"data:
|
| 107 |
}
|
| 108 |
}
|
| 109 |
]
|
|
@@ -213,7 +217,7 @@ def extract_invoice_data(file_data, content_type):
|
|
| 213 |
|
| 214 |
# Clean and parse JSON output
|
| 215 |
content = response.choices[0].message.content.strip()
|
| 216 |
-
|
| 217 |
|
| 218 |
try:
|
| 219 |
parsed_content = json.loads(cleaned_content)
|
|
@@ -226,15 +230,6 @@ def extract_invoice_data(file_data, content_type):
|
|
| 226 |
logger.error(f"Error in data extraction: {e}")
|
| 227 |
return {"error": str(e)}
|
| 228 |
|
| 229 |
-
def get_content_type_from_s3(file_key):
|
| 230 |
-
"""Fetch the content type (MIME type) of a file stored in S3."""
|
| 231 |
-
try:
|
| 232 |
-
response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
|
| 233 |
-
return response.get('ContentType', 'application/octet-stream') # Default to binary if not found
|
| 234 |
-
except Exception as e:
|
| 235 |
-
raise Exception(f"Failed to get content type from S3: {str(e)}")
|
| 236 |
-
|
| 237 |
-
|
| 238 |
# Dependency to check API Key
|
| 239 |
def verify_api_key(api_key: str = Header(...)):
|
| 240 |
if api_key != API_KEY:
|
|
@@ -262,12 +257,18 @@ def extract_text_from_file(
|
|
| 262 |
"document": existing_document
|
| 263 |
}
|
| 264 |
|
| 265 |
-
# Retrieve file from S3 and determine content type
|
| 266 |
-
content_type = get_content_type_from_s3(file_key)
|
|
|
|
|
|
|
|
|
|
| 267 |
|
| 268 |
-
#
|
| 269 |
-
|
| 270 |
-
|
|
|
|
|
|
|
|
|
|
| 271 |
|
| 272 |
# Store extracted data in MongoDB
|
| 273 |
document = {
|
|
@@ -275,7 +276,10 @@ def extract_text_from_file(
|
|
| 275 |
"file_type": content_type,
|
| 276 |
"document_type": document_type,
|
| 277 |
"entityrefkey": entity_ref_key,
|
| 278 |
-
"
|
|
|
|
|
|
|
|
|
|
| 279 |
}
|
| 280 |
|
| 281 |
inserted_doc = invoice_collection.insert_one(document)
|
|
@@ -284,8 +288,9 @@ def extract_text_from_file(
|
|
| 284 |
return {
|
| 285 |
"message": "Document successfully stored in MongoDB",
|
| 286 |
"document_id": document_id,
|
| 287 |
-
"
|
| 288 |
-
"
|
|
|
|
| 289 |
}
|
| 290 |
|
| 291 |
except Exception as e:
|
|
@@ -295,7 +300,6 @@ def extract_text_from_file(
|
|
| 295 |
"traceback": traceback.format_exc()
|
| 296 |
}
|
| 297 |
return {"error": error_details}
|
| 298 |
-
|
| 299 |
|
| 300 |
# Serve the output folder as static files
|
| 301 |
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
|
|
|
|
| 1 |
import uvicorn
|
| 2 |
from fastapi.staticfiles import StaticFiles
|
| 3 |
import hashlib
|
| 4 |
+
from enum import Enum
|
| 5 |
from fastapi import FastAPI, Header, Query, Depends, HTTPException
|
| 6 |
from PIL import Image
|
| 7 |
import io
|
|
|
|
| 31 |
DATABASE_NAME = os.getenv("DATABASE_NAME")
|
| 32 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
|
| 33 |
|
| 34 |
+
# use_gpu = False
|
| 35 |
+
# output_dir = 'output'
|
| 36 |
+
|
| 37 |
# Check if environment variables are set
|
| 38 |
if not MONGODB_URI:
|
| 39 |
raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
|
|
|
|
| 107 |
{
|
| 108 |
"type": "image_url",
|
| 109 |
"image_url": {
|
| 110 |
+
"url": f"data:{mime_type};base64,{base64_encoded}"
|
| 111 |
}
|
| 112 |
}
|
| 113 |
]
|
|
|
|
| 217 |
|
| 218 |
# Clean and parse JSON output
|
| 219 |
content = response.choices[0].message.content.strip()
|
| 220 |
+
#cleaned_content = content.strip().strip('```json').strip('```')
|
| 221 |
|
| 222 |
try:
|
| 223 |
parsed_content = json.loads(cleaned_content)
|
|
|
|
| 230 |
logger.error(f"Error in data extraction: {e}")
|
| 231 |
return {"error": str(e)}
|
| 232 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
# Dependency to check API Key
|
| 234 |
def verify_api_key(api_key: str = Header(...)):
|
| 235 |
if api_key != API_KEY:
|
|
|
|
| 257 |
"document": existing_document
|
| 258 |
}
|
| 259 |
|
| 260 |
+
# Retrieve file from S3 and determine content type (Ensure this step is implemented)
|
| 261 |
+
content_type = get_content_type_from_s3(file_key) # Implement this function
|
| 262 |
+
|
| 263 |
+
# Extract text (Ensure Extraction function is implemented)
|
| 264 |
+
extracted_text, num_pages = extract_text_from_s3(file_key, content_type)
|
| 265 |
|
| 266 |
+
# Define values for small/large files
|
| 267 |
+
base64DataResp = None
|
| 268 |
+
summary = None
|
| 269 |
+
if num_pages <= 2:
|
| 270 |
+
base64DataResp = convert_to_base64(file_key) # Implement this function
|
| 271 |
+
summary = generate_summary(extracted_text) # Implement this function
|
| 272 |
|
| 273 |
# Store extracted data in MongoDB
|
| 274 |
document = {
|
|
|
|
| 276 |
"file_type": content_type,
|
| 277 |
"document_type": document_type,
|
| 278 |
"entityrefkey": entity_ref_key,
|
| 279 |
+
"num_pages": num_pages,
|
| 280 |
+
"base64DataResp": base64DataResp, # Only for small files
|
| 281 |
+
"extracted_text": extracted_text,
|
| 282 |
+
"summary": summary, # Only for small files
|
| 283 |
}
|
| 284 |
|
| 285 |
inserted_doc = invoice_collection.insert_one(document)
|
|
|
|
| 288 |
return {
|
| 289 |
"message": "Document successfully stored in MongoDB",
|
| 290 |
"document_id": document_id,
|
| 291 |
+
"file_key": file_key,
|
| 292 |
+
"num_pages": num_pages,
|
| 293 |
+
"summary": summary if summary else "Skipped for large documents"
|
| 294 |
}
|
| 295 |
|
| 296 |
except Exception as e:
|
|
|
|
| 300 |
"traceback": traceback.format_exc()
|
| 301 |
}
|
| 302 |
return {"error": error_details}
|
|
|
|
| 303 |
|
| 304 |
# Serve the output folder as static files
|
| 305 |
app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
|