vkumartr committed on
Commit
e9cefc0
·
verified ·
1 Parent(s): 24259cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +40 -1
app.py CHANGED
@@ -93,15 +93,49 @@ def extract_pdf_text(file_data):
93
  logger.error(f"PDF Extraction Error: {e}")
94
  return None
95
 
 
 
 
 
 
 
 
 
 
 
 
 
96
  # Function to extract invoice data from a PDF or image using OpenAI GPT
97
  def extract_invoice_data(file_data, content_type, json_schema):
98
  """
99
  Extracts data from a PDF (converted to images) or an image.
100
  Only PDFs with 1 or 2 pages are allowed.
101
  """
102
- system_prompt = "You are an expert in document data extraction."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
  base64_images = []
104
  base64DataResp = []
 
105
 
106
  if content_type == "application/pdf":
107
  try:
@@ -127,12 +161,17 @@ def extract_invoice_data(file_data, content_type, json_schema):
127
  return {"error": "Failed to process PDF"}, None
128
 
129
  elif content_type.startswith("image/"):
 
130
  # Handle direct image files
131
  base64_pdf = base64.b64encode(file_data).decode('utf-8')
132
  base64DataResp.append(f"data:{content_type};base64,{base64_pdf}")
 
133
  else:
134
  return {"error": f"Unsupported file type: {content_type}"}
135
 
 
 
 
136
  # Prepare OpenAI request
137
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
138
 
 
93
  logger.error(f"PDF Extraction Error: {e}")
94
  return None
95
 
96
+ # def extract_text_from_image(file_data):
97
+ # """
98
+ # Extracts text from an image file using pytesseract
99
+ # """
100
+ # try:
101
+ # image = Image.open(io.BytesIO(file_data))
102
+ # extracted_text = pytesseract.image_to_string(image)
103
+ # return extracted_text.strip()
104
+ # except Exception as e:
105
+ # logger.error(f"Image Extraction Error: {e}")
106
+ # return None
107
+
108
  # Function to extract invoice data from a PDF or image using OpenAI GPT
109
  def extract_invoice_data(file_data, content_type, json_schema):
110
  """
111
  Extracts data from a PDF (converted to images) or an image.
112
  Only PDFs with 1 or 2 pages are allowed.
113
  """
114
+ system_prompt = """You are an expert in invoice data extraction.
115
+ Your task is to extract key fields from an invoice image. Ensure accurate extraction and return the data in JSON format.
116
+
117
+ Extract the following fields:
118
+ 1. Line Items: A list containing:
119
+ - Product Code
120
+ - Description
121
+ - Amount (numeric)
122
+ 2. Tax Amount (if available)
123
+ 3. Vendor GST (if available)
124
+ 4. Vendor Name
125
+ 5. Invoice Date (format: "DD-MMM-YYYY", e.g., "15-Sep-2023")
126
+ 6. Total Amount (numeric)
127
+ 7. Invoice Number (e.g., "INV-2023-001")
128
+ 8. Vendor Address
129
+ 9. Invoice Currency (e.g., "USD", "EUR")
130
+
131
+ Ensure that:
132
+ - All extracted fields match the invoice.
133
+ - If any field is missing, return null instead of hallucinating data.
134
+ - Do not generate synthetic values—only extract real information from the image.
135
+ """
136
  base64_images = []
137
  base64DataResp = []
138
+ extracted_text = ""
139
 
140
  if content_type == "application/pdf":
141
  try:
 
161
  return {"error": "Failed to process PDF"}, None
162
 
163
  elif content_type.startswith("image/"):
164
+ # extracted_text = extract_text_from_image(file_data) # OCR extraction
165
  # Handle direct image files
166
  base64_pdf = base64.b64encode(file_data).decode('utf-8')
167
  base64DataResp.append(f"data:{content_type};base64,{base64_pdf}")
168
+
169
  else:
170
  return {"error": f"Unsupported file type: {content_type}"}
171
 
172
+ if extracted_text:
173
+ return {"extracted_text": extracted_text}, base64DataResp
174
+
175
  # Prepare OpenAI request
176
  openai_content = [{"type": "image_url", "image_url": {"url": img_base64}} for img_base64 in base64_images]
177