vkumartr committed on
Commit
35fd719
·
verified ·
1 Parent(s): b27007b

Newly added

Browse files
Files changed (1) hide show
  1. app.py +31 -22
app.py CHANGED
@@ -94,20 +94,41 @@ def extract_pdf_text(file_data):
94
 
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
- system_prompt = "You are an expert in document data extraction."
 
 
 
98
 
99
  # Convert file to Base64
100
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
101
- base64dataresp = f"data:{content_type};base64,{base64_encoded}"
102
 
 
103
  if content_type == "application/pdf":
104
  extracted_text = extract_pdf_text(file_data)
105
  if not extracted_text:
106
  return {"error": "Failed to extract text from PDF"}, base64dataresp
107
 
108
- return {"extracted_text": extracted_text}, base64dataresp # Return plain text for PDFs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
- # Handle Images using OpenAI
111
  elif content_type.startswith("image/"):
112
  try:
113
  response = openai.ChatCompletion.create(
@@ -126,25 +147,13 @@ def extract_invoice_data(file_data, content_type, json_schema):
126
  ]
127
  }
128
  ],
129
- response_format={
130
- "type": "json_schema",
131
- "json_schema": json_schema
132
- },
133
  temperature=0.5,
134
  max_tokens=16384
135
  )
136
 
137
- # Clean and parse JSON output
138
- content = response.choices[0].message.content.strip()
139
- cleaned_content = content.strip().strip('```json').strip('```')
140
-
141
- try:
142
- parsed_content = json.loads(cleaned_content)
143
- return parsed_content, base64dataresp # Return extracted structured data
144
- except json.JSONDecodeError as e:
145
- logger.error(f"JSON Parse Error: {e}")
146
- return None, base64dataresp
147
-
148
  except Exception as e:
149
  logger.error(f"Error in OpenAI image processing: {e}")
150
  return {"error": str(e)}, base64dataresp
@@ -176,7 +185,7 @@ def extract_text_from_file(
176
  document_type: str = Query(..., description="Type of document"),
177
  entity_ref_key: str = Query(..., description="Entity Reference Key")
178
  ):
179
- """Extract text from a PDF or Image stored in S3 and process it accordingly."""
180
  try:
181
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
182
  if existing_document:
@@ -186,7 +195,7 @@ def extract_text_from_file(
186
  "document": existing_document
187
  }
188
 
189
- # Fetch schema for the document type
190
  schema_doc = schema_collection.find_one({"document_type": document_type})
191
  if not schema_doc:
192
  raise ValueError("No schema found for the given document type")
@@ -199,7 +208,7 @@ def extract_text_from_file(
199
  content_type = get_content_type_from_s3(file_key)
200
  file_data, _ = fetch_file_from_s3(file_key)
201
 
202
- # Extract data from the document (PDF or Image)
203
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
204
 
205
  # Build and store document in MongoDB
 
94
 
95
  # Function to summarize text using OpenAI GPT
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
+ """
98
+ Extracts data from a PDF or image and returns structured JSON based on the provided schema.
99
+ """
100
+ system_prompt = "You are an expert in document data extraction. Extract relevant fields from the document and return structured JSON based on the provided schema."
101
 
102
  # Convert file to Base64
103
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
104
+ base64dataresp = f"data:{content_type};base64,{base64_encoded}"
105
 
106
+ # Handle PDF Extraction & Format to JSON Schema
107
  if content_type == "application/pdf":
108
  extracted_text = extract_pdf_text(file_data)
109
  if not extracted_text:
110
  return {"error": "Failed to extract text from PDF"}, base64dataresp
111
 
112
+ try:
113
+ # Send extracted text to OpenAI for structured JSON conversion
114
+ response = openai.ChatCompletion.create(
115
+ model="gpt-4o-mini",
116
+ messages=[
117
+ {"role": "system", "content": system_prompt},
118
+ {"role": "user", "content": extracted_text}
119
+ ],
120
+ response_format={"type": "json_schema", "json_schema": json_schema},
121
+ temperature=0.5,
122
+ max_tokens=16384
123
+ )
124
+
125
+ parsed_content = json.loads(response.choices[0].message.content.strip())
126
+ return parsed_content, base64dataresp # Return structured JSON
127
+ except Exception as e:
128
+ logger.error(f"Error in OpenAI text-to-JSON conversion: {e}")
129
+ return {"error": str(e)}, base64dataresp
130
 
131
+ # Handle Image Extraction using OpenAI Vision API
132
  elif content_type.startswith("image/"):
133
  try:
134
  response = openai.ChatCompletion.create(
 
147
  ]
148
  }
149
  ],
150
+ response_format={"type": "json_schema", "json_schema": json_schema},
 
 
 
151
  temperature=0.5,
152
  max_tokens=16384
153
  )
154
 
155
+ parsed_content = json.loads(response.choices[0].message.content.strip())
156
+ return parsed_content, base64dataresp # Return structured JSON
 
 
 
 
 
 
 
 
 
157
  except Exception as e:
158
  logger.error(f"Error in OpenAI image processing: {e}")
159
  return {"error": str(e)}, base64dataresp
 
185
  document_type: str = Query(..., description="Type of document"),
186
  entity_ref_key: str = Query(..., description="Entity Reference Key")
187
  ):
188
+ """Extract structured data from a PDF or Image stored in S3."""
189
  try:
190
  existing_document = invoice_collection.find_one({"entityrefkey": entity_ref_key})
191
  if existing_document:
 
195
  "document": existing_document
196
  }
197
 
198
+ # Fetch JSON schema for the document type
199
  schema_doc = schema_collection.find_one({"document_type": document_type})
200
  if not schema_doc:
201
  raise ValueError("No schema found for the given document type")
 
208
  content_type = get_content_type_from_s3(file_key)
209
  file_data, _ = fetch_file_from_s3(file_key)
210
 
211
+ # Extract structured data from the document
212
  extracted_data, base64dataresp = extract_invoice_data(file_data, content_type, json_schema)
213
 
214
  # Build and store document in MongoDB