kmuthudurai committed on
Commit
5e852da
·
verified ·
1 Parent(s): c1c647b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +8 -5
app.py CHANGED
@@ -18,6 +18,7 @@ import re
18
  import json
19
  from dotenv import load_dotenv
20
  import uvicorn
 
21
 
22
  load_dotenv()
23
 
@@ -55,7 +56,7 @@ def fetch_file_from_s3_file(file_key):
55
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
56
  content_type = response['ContentType'] # Retrieve MIME type
57
  file_data = response['Body'].read()
58
- return io.BytesIO(file_data), content_type # Return file data as BytesIO
59
  except Exception as e:
60
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
61
 
@@ -102,13 +103,13 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
102
  file_data, content_type = fetch_file_from_s3_file(file_key)
103
 
104
  extracted_text = []
105
-
106
  # Determine file type based on MIME type
107
  if content_type.startswith("image/"): # Image file
108
- image = Image.open(file_data).convert("RGB") # Use BytesIO stream directly
109
  image_np = np.array(image) # Convert to NumPy array
110
  result = ocr.ocr(image_np, cls=True)
111
-
112
  # Extract text from OCR results
113
  for line in result:
114
  for word_info in line:
@@ -116,7 +117,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
116
 
117
  elif content_type == "application/pdf": # PDF file
118
  # Open PDF using PyMuPDF
119
- pdf_document = fitz.open(stream=file_data, filetype="pdf")
120
 
121
  extracted_text = []
122
 
@@ -138,6 +139,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
138
  extracted_text.append(word_info[1][0])
139
 
140
  pdf_document.close()
 
141
  else:
142
  return {"error": f"Unsupported file type: {content_type}"}
143
 
@@ -150,6 +152,7 @@ def ocr_from_s3(api_key: str = Depends(verify_api_key),file_key: str = Query(...
150
  return {
151
  "file_key": file_key,
152
  "file_type": content_type,
 
153
  "extracted_text": full_text,
154
  "summary": summary
155
  }
 
18
  import json
19
  from dotenv import load_dotenv
20
  import uvicorn
21
+ import base64
22
 
23
  load_dotenv()
24
 
 
56
  response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
57
  content_type = response['ContentType'] # Retrieve MIME type
58
  file_data = response['Body'].read()
59
+ return file_data, content_type # Return file data as BytesIO
60
  except Exception as e:
61
  raise Exception(f"Failed to fetch file from S3: {str(e)}")
62
 
 
103
  file_data, content_type = fetch_file_from_s3_file(file_key)
104
 
105
  extracted_text = []
106
+ base64Data = base64.b64encode(file_data).decode('utf-8')
107
  # Determine file type based on MIME type
108
  if content_type.startswith("image/"): # Image file
109
+ image = Image.open(io.BytesIO(file_data)).convert("RGB") # Use BytesIO stream directly
110
  image_np = np.array(image) # Convert to NumPy array
111
  result = ocr.ocr(image_np, cls=True)
112
+ base64DataResp = f"data:image/{content_type.lower()};base64,{base64_data}"
113
  # Extract text from OCR results
114
  for line in result:
115
  for word_info in line:
 
117
 
118
  elif content_type == "application/pdf": # PDF file
119
  # Open PDF using PyMuPDF
120
+ pdf_document = fitz.open(stream=io.BytesIO(file_data), filetype="pdf")
121
 
122
  extracted_text = []
123
 
 
139
  extracted_text.append(word_info[1][0])
140
 
141
  pdf_document.close()
142
+ base64DataResp = f"data:application/pdf;base64,{base64_data}"
143
  else:
144
  return {"error": f"Unsupported file type: {content_type}"}
145
 
 
152
  return {
153
  "file_key": file_key,
154
  "file_type": content_type,
155
+ "base64DataResp":base64DataResp,
156
  "extracted_text": full_text,
157
  "summary": summary
158
  }