vkumartr committed on
Commit
e502243
·
verified ·
1 Parent(s): f4a4d82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -31
app.py CHANGED
@@ -30,9 +30,6 @@ MONGODB_URI = os.getenv("MONGODB_URI")
30
  DATABASE_NAME = os.getenv("DATABASE_NAME")
31
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
32
 
33
- # use_gpu = False
34
- # output_dir = 'output'
35
-
36
  # Check if environment variables are set
37
  if not MONGODB_URI:
38
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
@@ -80,48 +77,36 @@ def fetch_file_from_s3(file_key):
80
  except Exception as e:
81
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
82
 
83
- # def extract_text_from_pdf(file_data):
84
- # """Extracts text from a PDF file."""
85
- # try:
86
- # doc = fitz.open(stream=file_data, filetype="pdf")
87
- # text = "\n".join([page.get_text("text") for page in doc])
88
- # return text.strip()
89
- # except Exception as e:
90
- # logger.error(f"Failed to extract text from PDF: {e}")
91
- # return ""
92
 
93
  # Function to summarize text using OpenAI GPT
94
  def extract_invoice_data(file_data, content_type):
95
- system_prompt = "You are an expert in document data extraction."
96
-
97
- # Convert file to Base64
 
 
98
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
99
-
100
- # Determine the correct MIME type for OpenAI
101
- if content_type.startswith("image/"):
102
- mime_type = content_type # e.g., image/png, image/jpeg
103
- elif content_type == "application/pdf":
104
- mime_type = "application/pdf"
105
- # text = extract_text_from_pdf(file_data)
106
- # mime_type = [{"role": "user", "content": text}]
107
- else:
108
- raise ValueError(f"Unsupported content type: {content_type}")
109
-
110
  try:
111
  response = openai.ChatCompletion.create(
112
  model="gpt-4o-mini",
113
  messages=[
114
- {"role": "system", "content": system_prompt},
115
  {
116
  "role": "user",
117
  "content": [
118
  {
119
  "type": "image_url",
120
  "image_url": {
121
- "url": f"data:image/{mime_type};base64,{base64_encoded}"
122
- },
123
- "image_url": {
124
- "url": f"data:application/pdf;base64,{base64_encoded}"
125
  }
126
  }
127
  ]
 
30
  DATABASE_NAME = os.getenv("DATABASE_NAME")
31
  COLLECTION_NAME = os.getenv("COLLECTION_NAME", "invoice_collection")
32
 
 
 
 
33
  # Check if environment variables are set
34
  if not MONGODB_URI:
35
  raise ValueError("MONGODB_URL is not set. Please add it to Hugging Face secrets.")
 
77
  except Exception as e:
78
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
79
 
80
+ def extract_text_from_pdf(file_data):
81
+ try:
82
+ doc = fitz.open(stream=file_data, filetype="pdf")
83
+ return "\n".join([page.get_text("text") for page in doc]).strip()
84
+ except Exception as e:
85
+ logger.error(f"Failed to extract text from PDF: {e}")
86
+ return ""
 
 
87
 
88
  # Function to summarize text using OpenAI GPT
89
  def extract_invoice_data(file_data, content_type):
90
+ if content_type == "application/pdf":
91
+ text = extract_text_from_pdf(file_data)
92
+ if len(text.split()) > 500: # Large document handling
93
+ return {"extracted_text": text}
94
+
95
  base64_encoded = base64.b64encode(file_data).decode('utf-8')
96
+ mime_type = content_type if content_type.startswith("image/") else "application/pdf"
97
+
 
 
 
 
 
 
 
 
 
98
  try:
99
  response = openai.ChatCompletion.create(
100
  model="gpt-4o-mini",
101
  messages=[
102
+ {"role": "system", "content": "You are an expert in document data extraction."},
103
  {
104
  "role": "user",
105
  "content": [
106
  {
107
  "type": "image_url",
108
  "image_url": {
109
+ "url": f"data:{mime_type};base64,{base64_encoded}"
 
 
 
110
  }
111
  }
112
  ]