Spaces:
Sleeping
Sleeping
Changes updated
Browse files
app.py
CHANGED
|
@@ -30,18 +30,17 @@ logger = logging.getLogger(__name__)
|
|
| 30 |
MONGODB_URI = os.getenv("MONGODB_URI")
|
| 31 |
DATABASE_NAME = os.getenv("DATABASE_NAME")
|
| 32 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
|
| 33 |
-
|
| 34 |
-
# use_gpu = False
|
| 35 |
-
# output_dir = 'output'
|
| 36 |
|
| 37 |
# Check if environment variables are set
|
| 38 |
if not MONGODB_URI:
|
| 39 |
-
raise ValueError("
|
| 40 |
|
| 41 |
# Initialize MongoDB Connection
|
| 42 |
db_client = MongoClient(MONGODB_URI)
|
| 43 |
db = db_client[DATABASE_NAME]
|
| 44 |
invoice_collection = db[COLLECTION_NAME]
|
|
|
|
| 45 |
|
| 46 |
app = FastAPI(docs_url='/')
|
| 47 |
use_gpu = False
|
|
@@ -82,7 +81,7 @@ def fetch_file_from_s3(file_key):
|
|
| 82 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 83 |
|
| 84 |
# Function to extract structured invoice data from a file using OpenAI GPT
|
| 85 |
-
def extract_invoice_data(file_data, content_type):
|
| 86 |
system_prompt = "You are an expert in document data extraction."
|
| 87 |
|
| 88 |
# Convert file to Base64
|
|
@@ -123,7 +122,7 @@ def extract_invoice_data(file_data, content_type):
|
|
| 123 |
|
| 124 |
# Clean and parse JSON output
|
| 125 |
content = response.choices[0].message.content.strip()
|
| 126 |
-
|
| 127 |
|
| 128 |
try:
|
| 129 |
parsed_content = json.loads(cleaned_content)
|
|
@@ -136,7 +135,7 @@ def extract_invoice_data(file_data, content_type):
|
|
| 136 |
logger.error(f"Error in data extraction: {e}")
|
| 137 |
return {"error": str(e)}
|
| 138 |
|
| 139 |
-
|
| 140 |
"""Fetch the content type (MIME type) of a file stored in S3."""
|
| 141 |
try:
|
| 142 |
response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
|
|
@@ -144,7 +143,6 @@ def extract_invoice_data(file_data, content_type):
|
|
| 144 |
except Exception as e:
|
| 145 |
raise Exception(f"Failed to get content type from S3: {str(e)}")
|
| 146 |
|
| 147 |
-
|
| 148 |
# Dependency to check API Key
|
| 149 |
def verify_api_key(api_key: str = Header(...)):
|
| 150 |
if api_key != API_KEY:
|
|
@@ -164,7 +162,6 @@ def extract_text_from_file(
|
|
| 164 |
"""Extract text from a PDF or Image stored in S3 and process it based on document size."""
|
| 165 |
try:
|
| 166 |
existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
|
| 167 |
-
|
| 168 |
if existing_document:
|
| 169 |
existing_document["_id"] = str(existing_document["_id"])
|
| 170 |
return {
|
|
@@ -172,7 +169,7 @@ def extract_text_from_file(
|
|
| 172 |
"document": existing_document
|
| 173 |
}
|
| 174 |
# Fetch dynamic schema based on document type
|
| 175 |
-
schema_doc =
|
| 176 |
if not schema_doc:
|
| 177 |
raise ValueError("No schema found for the given document type")
|
| 178 |
|
|
@@ -182,12 +179,10 @@ def extract_text_from_file(
|
|
| 182 |
|
| 183 |
# Retrieve file from S3 and determine content type
|
| 184 |
content_type = get_content_type_from_s3(file_key)
|
| 185 |
-
|
| 186 |
-
# Extract and parse invoice data
|
| 187 |
file_data, _ = fetch_file_from_s3(file_key)
|
| 188 |
extracted_data = extract_invoice_data(file_data, content_type, json_schema)
|
| 189 |
|
| 190 |
-
#
|
| 191 |
document = {
|
| 192 |
"file_key": file_key,
|
| 193 |
"file_type": content_type,
|
|
@@ -196,7 +191,6 @@ def extract_text_from_file(
|
|
| 196 |
"extracted_data": extracted_data
|
| 197 |
}
|
| 198 |
|
| 199 |
-
# Insert document into MongoDB
|
| 200 |
try:
|
| 201 |
inserted_doc = invoice_collection.insert_one(document)
|
| 202 |
document_id = str(inserted_doc.inserted_id)
|
|
@@ -208,7 +202,7 @@ def extract_text_from_file(
|
|
| 208 |
return {
|
| 209 |
"message": "Document successfully stored in MongoDB",
|
| 210 |
"document_id": document_id,
|
| 211 |
-
"entityrefkey":entity_ref_key,
|
| 212 |
"extracted_data": extracted_data
|
| 213 |
}
|
| 214 |
|
|
|
|
| 30 |
MONGODB_URI = os.getenv("MONGODB_URI")
|
| 31 |
DATABASE_NAME = os.getenv("DATABASE_NAME")
|
| 32 |
COLLECTION_NAME = os.getenv("COLLECTION_NAME")
|
| 33 |
+
SCHEMA = os.getenv("SCHEMA")
|
|
|
|
|
|
|
| 34 |
|
| 35 |
# Check if environment variables are set
|
| 36 |
if not MONGODB_URI:
|
| 37 |
+
raise ValueError("MONGODB_URI is not set. Please add it to your secrets.")
|
| 38 |
|
| 39 |
# Initialize MongoDB Connection
|
| 40 |
db_client = MongoClient(MONGODB_URI)
|
| 41 |
db = db_client[DATABASE_NAME]
|
| 42 |
invoice_collection = db[COLLECTION_NAME]
|
| 43 |
+
schema_collection = db[SCHEMA]
|
| 44 |
|
| 45 |
app = FastAPI(docs_url='/')
|
| 46 |
use_gpu = False
|
|
|
|
| 81 |
raise Exception(f"Failed to fetch file from S3: {str(e)}")
|
| 82 |
|
| 83 |
# Function to extract structured invoice data from a file using OpenAI GPT
|
| 84 |
+
def extract_invoice_data(file_data, content_type, json_schema):
|
| 85 |
system_prompt = "You are an expert in document data extraction."
|
| 86 |
|
| 87 |
# Convert file to Base64
|
|
|
|
| 122 |
|
| 123 |
# Clean and parse JSON output
|
| 124 |
content = response.choices[0].message.content.strip()
|
| 125 |
+
cleaned_content = content.strip().strip('```json').strip('```')
|
| 126 |
|
| 127 |
try:
|
| 128 |
parsed_content = json.loads(cleaned_content)
|
|
|
|
| 135 |
logger.error(f"Error in data extraction: {e}")
|
| 136 |
return {"error": str(e)}
|
| 137 |
|
| 138 |
+
def get_content_type_from_s3(file_key):
|
| 139 |
"""Fetch the content type (MIME type) of a file stored in S3."""
|
| 140 |
try:
|
| 141 |
response = s3_client.head_object(Bucket=S3_BUCKET_NAME, Key=file_key)
|
|
|
|
| 143 |
except Exception as e:
|
| 144 |
raise Exception(f"Failed to get content type from S3: {str(e)}")
|
| 145 |
|
|
|
|
| 146 |
# Dependency to check API Key
|
| 147 |
def verify_api_key(api_key: str = Header(...)):
|
| 148 |
if api_key != API_KEY:
|
|
|
|
| 162 |
"""Extract text from a PDF or Image stored in S3 and process it based on document size."""
|
| 163 |
try:
|
| 164 |
existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
|
|
|
|
| 165 |
if existing_document:
|
| 166 |
existing_document["_id"] = str(existing_document["_id"])
|
| 167 |
return {
|
|
|
|
| 169 |
"document": existing_document
|
| 170 |
}
|
| 171 |
# Fetch dynamic schema based on document type
|
| 172 |
+
schema_doc = schema_collection.find_one({"document_type": document_type})
|
| 173 |
if not schema_doc:
|
| 174 |
raise ValueError("No schema found for the given document type")
|
| 175 |
|
|
|
|
| 179 |
|
| 180 |
# Retrieve file from S3 and determine content type
|
| 181 |
content_type = get_content_type_from_s3(file_key)
|
|
|
|
|
|
|
| 182 |
file_data, _ = fetch_file_from_s3(file_key)
|
| 183 |
extracted_data = extract_invoice_data(file_data, content_type, json_schema)
|
| 184 |
|
| 185 |
+
# Build document for insertion
|
| 186 |
document = {
|
| 187 |
"file_key": file_key,
|
| 188 |
"file_type": content_type,
|
|
|
|
| 191 |
"extracted_data": extracted_data
|
| 192 |
}
|
| 193 |
|
|
|
|
| 194 |
try:
|
| 195 |
inserted_doc = invoice_collection.insert_one(document)
|
| 196 |
document_id = str(inserted_doc.inserted_id)
|
|
|
|
| 202 |
return {
|
| 203 |
"message": "Document successfully stored in MongoDB",
|
| 204 |
"document_id": document_id,
|
| 205 |
+
"entityrefkey": entity_ref_key,
|
| 206 |
"extracted_data": extracted_data
|
| 207 |
}
|
| 208 |
|