vkumartr committed on
Commit
8b0fe14
·
verified ·
1 Parent(s): e502243

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -17
app.py CHANGED
@@ -77,36 +77,33 @@ def fetch_file_from_s3(file_key):
77
  except Exception as e:
78
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
79
 
80
- def extract_text_from_pdf(file_data):
81
- try:
82
- doc = fitz.open(stream=file_data, filetype="pdf")
83
- return "\n".join([page.get_text("text") for page in doc]).strip()
84
- except Exception as e:
85
- logger.error(f"Failed to extract text from PDF: {e}")
86
- return ""
87
-
88
  # Function to summarize text using OpenAI GPT
89
  def extract_invoice_data(file_data, content_type):
90
- if content_type == "application/pdf":
91
- text = extract_text_from_pdf(file_data)
92
- if len(text.split()) > 500: # Large document handling
93
- return {"extracted_text": text}
94
-
95
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
96
- mime_type = content_type if content_type.startswith("image/") else "application/pdf"
97
-
 
 
 
 
 
 
 
98
  try:
99
  response = openai.ChatCompletion.create(
100
  model="gpt-4o-mini",
101
  messages=[
102
- {"role": "system", "content": "You are an expert in document data extraction."},
103
  {
104
  "role": "user",
105
  "content": [
106
  {
107
  "type": "image_url",
108
  "image_url": {
109
- "url": f"data:{mime_type};base64,{base64_encoded}"
110
  }
111
  }
112
  ]
 
77
  except Exception as e:
78
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
79
 
 
 
 
 
 
 
 
 
80
  # Function to summarize text using OpenAI GPT
81
  def extract_invoice_data(file_data, content_type):
82
+ system_prompt = "You are an expert in document data extraction."
83
+
84
+ # Convert file to Base64
 
 
85
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
86
+
87
+ # Determine the correct MIME type for OpenAI
88
+ if content_type.startswith("image/"):
89
+ mime_type = content_type # e.g., image/png, image/jpeg
90
+ elif content_type == "application/pdf":
91
+ mime_type = "application/pdf"
92
+ else:
93
+ raise ValueError(f"Unsupported content type: {content_type}")
94
+
95
  try:
96
  response = openai.ChatCompletion.create(
97
  model="gpt-4o-mini",
98
  messages=[
99
+ {"role": "system", "content": system_prompt},
100
  {
101
  "role": "user",
102
  "content": [
103
  {
104
  "type": "image_url",
105
  "image_url": {
106
+ "url": f"data:image/{mime_type};base64,{base64_encoded}"
107
  }
108
  }
109
  ]