vkumartr committed on
Commit
d11faed
·
verified ·
1 Parent(s): cb92879

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +59 -43
app.py CHANGED
@@ -80,6 +80,18 @@ def fetch_file_from_s3(file_key):
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # Function to extract structured invoice data from a document using OpenAI GPT
84
  def extract_invoice_data(file_data, content_type, json_schema):
85
  system_prompt = "You are an expert in document data extraction."
@@ -88,53 +100,57 @@ def extract_invoice_data(file_data, content_type, json_schema):
88
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
89
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
90
 
91
- # Determine the correct MIME type for OpenAI
92
- if content_type.startswith("image/"):
93
- mime_type = content_type # e.g., image/png, image/jpeg
94
- elif content_type == "application/pdf":
95
- mime_type = "application/pdf"
96
- else:
97
- raise ValueError(f"Unsupported content type: {content_type}")
98
 
99
- try:
100
- response = openai.ChatCompletion.create(
101
- model="gpt-4o-mini",
102
- messages=[
103
- {"role": "system", "content": system_prompt},
104
- {
105
- "role": "user",
106
- "content": [
107
- {
108
- "type": "image_url",
109
- "image_url": {
110
- "url": f"data:{mime_type};base64,{base64_encoded}"
111
- }
112
- }
113
- ]
114
- }
115
- ],
116
- response_format={
117
- "type": "json_schema",
118
- "json_schema": json_schema
119
- },
120
- temperature=0.5,
121
- max_tokens=16384
122
- )
123
-
124
- # Clean and parse JSON output
125
- content = response.choices[0].message.content.strip()
126
- cleaned_content = content.strip().strip('```json').strip('```')
127
 
 
 
128
  try:
129
- parsed_content = json.loads(cleaned_content)
130
- return parsed_content,base64dataresp
131
- except json.JSONDecodeError as e:
132
- logger.error(f"JSON Parse Error: {e}")
133
- return None,base64dataresp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- except Exception as e:
136
- logger.error(f"Error in data extraction: {e}")
137
- return {"error": str(e)},base64dataresp
 
 
 
138
 
139
  def get_content_type_from_s3(file_key):
140
  """Fetch the content type (MIME type) of a file stored in S3."""
 
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
def extract_pdf_text(file_data):
    """Extract the plain text of a PDF using PyMuPDF (fitz).

    Args:
        file_data: In-memory PDF content, handed to ``fitz.open`` as ``stream``.

    Returns:
        The text of every page joined with newlines, or ``None`` when the
        document cannot be opened or read (the error is logged, not raised).
    """
    try:
        pdf_document = fitz.open(stream=file_data, filetype="pdf")
        try:
            return "\n".join(page.get_text("text") for page in pdf_document)
        finally:
            # Always release the document's native resources, even if a page
            # fails to parse; the original never closed the document.
            pdf_document.close()
    except Exception as e:
        # Best-effort contract: callers treat a falsy result as "extraction failed".
        logger.error(f"PDF Extraction Error: {e}")
        return None
94
+
95
  # Function to extract invoice data from a PDF (text via PyMuPDF) or an image (via OpenAI GPT)
96
  def extract_invoice_data(file_data, content_type, json_schema):
97
  system_prompt = "You are an expert in document data extraction."
 
100
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
101
  base64dataresp = f"data:{content_type};base64,{base64_encoded}"
102
 
103
+ if content_type == "application/pdf":
104
+ extracted_text = extract_pdf_text(file_data)
105
+ if not extracted_text:
106
+ return {"error": "Failed to extract text from PDF"}, base64dataresp
 
 
 
107
 
108
+ return {"extracted_text": extracted_text}, base64dataresp # Return plain text for PDFs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
 
110
+ # Handle Images using OpenAI
111
+ elif content_type.startswith("image/"):
112
  try:
113
+ response = openai.ChatCompletion.create(
114
+ model="gpt-4o-mini",
115
+ messages=[
116
+ {"role": "system", "content": system_prompt},
117
+ {
118
+ "role": "user",
119
+ "content": [
120
+ {
121
+ "type": "image_url",
122
+ "image_url": {
123
+ "url": f"data:{content_type};base64,{base64_encoded}"
124
+ }
125
+ }
126
+ ]
127
+ }
128
+ ],
129
+ response_format={
130
+ "type": "json_schema",
131
+ "json_schema": json_schema
132
+ },
133
+ temperature=0.5,
134
+ max_tokens=16384
135
+ )
136
+
137
+ # Clean and parse JSON output
138
+ content = response.choices[0].message.content.strip()
139
+ cleaned_content = content.strip().strip('```json').strip('```')
140
+
141
+ try:
142
+ parsed_content = json.loads(cleaned_content)
143
+ return parsed_content, base64dataresp # Return extracted structured data
144
+ except json.JSONDecodeError as e:
145
+ logger.error(f"JSON Parse Error: {e}")
146
+ return None, base64dataresp
147
 
148
+ except Exception as e:
149
+ logger.error(f"Error in OpenAI image processing: {e}")
150
+ return {"error": str(e)}, base64dataresp
151
+
152
+ else:
153
+ raise ValueError(f"Unsupported content type: {content_type}")
154
 
155
  def get_content_type_from_s3(file_key):
156
  """Fetch the content type (MIME type) of a file stored in S3."""