kmuthudurai committed on
Commit
4434125
·
verified ·
1 Parent(s): c088f72

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +136 -103
app.py CHANGED
@@ -2,7 +2,7 @@ import uvicorn
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
- from fastapi import FastAPI, UploadFile, File, HTTPException
6
  from paddleocr import PaddleOCR, PPStructure, save_structure_res
7
  from PIL import Image
8
  import io
@@ -10,6 +10,17 @@ import numpy as np
10
  import fitz # PyMuPDF for PDF handling
11
  import logging
12
 
 
 
 
 
 
 
 
 
 
 
 
13
  # Set up logging
14
  logging.basicConfig(level=logging.INFO)
15
  logger = logging.getLogger(__name__)
@@ -18,117 +29,139 @@ app = FastAPI(docs_url='/')
18
  use_gpu = False
19
  output_dir = 'output'
20
 
21
- class LangEnum(str, Enum):
22
- ch = "ch"
23
- en = "en"
24
 
25
- # Cache with ocr
26
- ocr_cache = {}
 
 
 
27
 
28
- # Get OCR instance
29
- def get_ocr(lang, use_gpu=False):
30
- if not ocr_cache.get(lang):
31
- ocr_cache[lang] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
32
-
33
- return ocr_cache.get(lang)
34
 
35
- # Function to extract images from PDF
36
- # Function to extract images from PDF
37
- def pdf_to_images(uploaded_file):
 
 
 
 
 
 
 
38
  try:
39
- # Read the file content
40
- file_data = uploaded_file.read()
41
- logger.info(f"Received file of size {len(file_data)} bytes.")
42
-
43
- if len(file_data) == 0:
44
- raise HTTPException(status_code=400, detail="Uploaded PDF is empty.")
45
-
46
- # Open the PDF using fitz (PyMuPDF) from the byte stream
47
- doc = fitz.open(stream=file_data, filetype="pdf")
48
-
49
- # Check if the document has pages
50
- if len(doc) == 0:
51
- raise HTTPException(status_code=400, detail="The PDF document is empty.")
52
-
53
- logger.info(f"PDF loaded successfully with {len(doc)} pages.")
54
-
55
- image_parts = []
56
- for page_number in range(len(doc)):
57
- page = doc.load_page(page_number)
58
- pix = page.get_pixmap()
59
- image_data = pix.tobytes("png")
60
-
61
- # Log progress for each page
62
- logger.info(f"Processed page {page_number + 1}/{len(doc)}.")
63
-
64
- image_parts.append({
65
- "mime_type": "image/png",
66
- "data": image_data
67
- })
68
-
69
- logger.info(f"PDF to image conversion completed with {len(image_parts)} images.")
70
- return image_parts
71
-
72
  except Exception as e:
73
- logger.error(f"Error processing PDF: {str(e)}")
74
- raise HTTPException(status_code=500, detail="Error processing PDF file")
75
-
76
-
77
- @app.post("/ocr")
78
- async def create_upload_file(
79
- file: UploadFile = File(...),
80
- lang: LangEnum = LangEnum.ch,
81
- ):
82
  try:
83
- # Read the file contents
84
- contents = await file.read()
85
-
86
- # Log the file size
87
- logger.info(f"Received file of size {len(contents)} bytes.")
88
-
89
- # Ensure file is not empty
90
- if len(contents) == 0:
91
- raise HTTPException(status_code=400, detail="Uploaded file is empty.")
92
-
93
- # Determine if the uploaded file is a PDF or an image
94
- if file.content_type == "application/pdf":
95
- images = pdf_to_images(file) # No need to await this since it's not async
96
- elif file.content_type.startswith("image/"):
97
- # If it's an image file, process it
98
- image = Image.open(io.BytesIO(contents))
99
- images = [image]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  else:
101
- raise HTTPException(status_code=400, detail="Unsupported file type")
102
-
103
- # Initialize OCR model for the chosen language
104
- ocr = get_ocr(lang=lang, use_gpu=use_gpu)
105
-
106
- final_results = []
107
-
108
- # Iterate over the images and process with OCR
109
- for image in images:
110
- img2np = np.array(image)
111
- result = ocr.ocr(img2np, cls=True)
112
-
113
- if result:
114
- result = result[0] # Extract the result for this image
115
-
116
- boxes = [line[0] for line in result]
117
- txts = [line[1][0] for line in result]
118
- scores = [line[1][1] for line in result]
119
-
120
- # Combine results into a list of dictionaries
121
- final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
122
- final_results.extend(final_result)
123
- else:
124
- logger.warning("OCR did not return any results for the image.")
125
-
126
- return final_results
127
 
128
  except Exception as e:
129
- # Log the error and raise a 500 HTTP error
130
- logger.error(f"Error processing file: {str(e)}")
131
- raise HTTPException(status_code=500, detail="Internal server error while processing the file")
 
 
 
 
132
 
133
  # Serve the output folder as static files
134
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")
 
2
  from fastapi.staticfiles import StaticFiles
3
  import hashlib
4
  from enum import Enum
5
+ from fastapi import FastAPI,Header, Query,Depends,HTTPException
6
  from paddleocr import PaddleOCR, PPStructure, save_structure_res
7
  from PIL import Image
8
  import io
 
10
  import fitz # PyMuPDF for PDF handling
11
  import logging
12
 
13
+ import boto3
14
+ import openai
15
+ import os
16
+ import traceback # For detailed traceback of errors
17
+ import re
18
+ import json
19
+ from dotenv import load_dotenv
20
+ import uvicorn
21
+
22
+ load_dotenv()
23
+
24
  # Set up logging
25
  logging.basicConfig(level=logging.INFO)
26
  logger = logging.getLogger(__name__)
 
29
  use_gpu = False
30
  output_dir = 'output'
31
 
32
# Shared OCR engine: a single English PaddleOCR instance with the text-angle
# classifier enabled, created once at import time and reused by every request.
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# Settings pulled from the environment (populated by load_dotenv() above).
# Each value is None when the corresponding variable is not set.
API_KEY = os.environ.get("API_KEY")  # compared against the request header in verify_api_key
AWS_ACCESS_KEY = os.environ.get("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.environ.get("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.environ.get("S3_BUCKET_NAME")

# Credentials for the OpenAI SDK used by summarize_text.
openai.api_key = os.environ.get("OPENAI_API_KEY")

# S3 client built from the explicit key pair read above.
s3_client = boto3.client(
    's3',
    aws_secret_access_key=AWS_SECRET_KEY,
    aws_access_key_id=AWS_ACCESS_KEY,
)
50
+
51
+ # Function to fetch file from S3
52
+
53
def fetch_file_from_s3_file(file_key):
    """Download an object from the configured S3 bucket into memory.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(stream, content_type)`` tuple: an ``io.BytesIO`` holding the
        complete object body, and the ``ContentType`` value from the S3
        response.

    Raises:
        Exception: If the S3 request or the body read fails. The message is
            prefixed with "Failed to fetch file from S3" and the underlying
            error is attached as ``__cause__``.
    """
    try:
        response = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        content_type = response['ContentType']  # MIME type stored with the object
        file_data = response['Body'].read()
        return io.BytesIO(file_data), content_type
    except Exception as e:
        # Chain explicitly so the original boto3/botocore error stays visible
        # as the direct cause in tracebacks.
        raise Exception(f"Failed to fetch file from S3: {str(e)}") from e
61
+
62
+ # Function to summarize text using OpenAI GPT
63
def summarize_text(text):
    """Ask the OpenAI chat model to summarize OCR text as JSON.

    Args:
        text: The extracted OCR text to summarize.

    Returns:
        The parsed JSON value produced by the model on success. On any
        failure (API error, missing content, invalid JSON) the function does
        not raise; it returns the string ``"Error in summarization: <reason>"``
        so the caller can still build its response.
    """
    system_prompt = "You are a helpful assistant that summarizes extracted OCR text into JSON format always"
    try:
        # NOTE(review): openai.ChatCompletion looks like the pre-1.0 SDK
        # interface — confirm the pinned openai version still provides it.
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Summarize the following text and provide the JSON format always: {text}"}
            ],
            temperature=0.5,
            max_tokens=16384
        )
        content = response.choices[0].message.content.strip()

        # The model often wraps its answer in a Markdown code fence. Strip an
        # opening fence with or without a language tag (```json, ```JSON, ```)
        # and the closing fence, tolerating any surrounding whitespace.
        cleaned_content = re.sub(r'^```[A-Za-z]*\s*', '', content)
        cleaned_content = re.sub(r'\s*```$', '', cleaned_content)

        # Parse what remains as JSON and hand the resulting object back.
        return json.loads(cleaned_content)
    except Exception as e:
        # Best-effort by design: report the failure in-band, but record the
        # full traceback server-side so it is not silently lost.
        logger.exception("Summarization failed")
        return f"Error in summarization: {str(e)}"
86
+ # Dependency to check API Key
87
def verify_api_key(api_key: str = Header(...)):
    """FastAPI dependency that rejects requests without the correct API key.

    Args:
        api_key: Value of the required request header that FastAPI derives
            from this parameter's name.

    Raises:
        HTTPException: 401 with detail "Invalid API Key" when the supplied
            value does not match ``API_KEY``, or when ``API_KEY`` is unset or
            empty (fail closed: an unconfigured server accepts no key).
    """
    # Local import keeps this block self-contained; `secrets` is stdlib.
    import secrets

    expected = API_KEY
    # compare_digest runs in time independent of where the values differ, so
    # the key cannot be recovered by timing repeated guesses. Encoding to
    # bytes first avoids its TypeError on non-ASCII str input.
    if not expected or not secrets.compare_digest(
        api_key.encode("utf-8"), expected.encode("utf-8")
    ):
        raise HTTPException(status_code=401, detail="Invalid API Key")
90
+
91
@app.get("/")
def read_root():
    """Return the static welcome payload for the root path."""
    greeting = "Welcome to the PaddleOCR with S3 and GPT Summarization API!"
    return dict(message=greeting)
94
+
95
def _ocr_image_text(image):
    """Run the shared PaddleOCR engine on a PIL image and return its text fragments."""
    # PaddleOCR is fed an RGB NumPy array.
    result = ocr.ocr(np.array(image.convert("RGB")), cls=True)

    fragments = []
    for line in result or []:
        # An image with no detected text can come back as a None entry;
        # skip it instead of failing on iteration.
        if not line:
            continue
        # Each detection is [box, (text, score)]; keep only the text.
        fragments.extend(word_info[1][0] for word_info in line)
    return fragments


@app.get("/ocr/extraction")
def ocr_from_s3(api_key: str = Depends(verify_api_key), file_key: str = Query(..., description="S3 file key for the file")):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Args:
        api_key: Result of the ``verify_api_key`` dependency; present only so
            the key check runs before the handler body.
        file_key: S3 object key of the file to process.

    Returns:
        On success, a dict with ``file_key``, ``file_type`` (the S3 content
        type), ``extracted_text`` (all fragments joined by single spaces) and
        ``summary`` (whatever ``summarize_text`` returns).
        For a content type that is neither ``image/*`` nor
        ``application/pdf``, ``{"error": "Unsupported file type: ..."}``.
        On any exception, ``{"error": {...}}`` with ``error_type``,
        ``error_message`` and ``traceback``.
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        if content_type.startswith("image/"):  # Image file
            extracted_text = _ocr_image_text(Image.open(file_data))

        elif content_type == "application/pdf":  # PDF file
            extracted_text = []
            # The context manager closes the document even if OCR fails on
            # one of the pages.
            with fitz.open(stream=file_data, filetype="pdf") as pdf_document:
                for page in pdf_document:
                    # Render the page to PNG bytes and OCR it like any image.
                    pix = page.get_pixmap()
                    page_image = Image.open(io.BytesIO(pix.tobytes("png")))
                    extracted_text.extend(_ocr_image_text(page_image))

        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)

        # Summarize the extracted text
        summary = summarize_text(full_text)

        return {
            "file_key": file_key,
            "file_type": content_type,
            "extracted_text": full_text,
            "summary": summary
        }

    except Exception as e:
        # Keep a server-side record of the failure.
        logger.exception("OCR extraction failed for file_key=%s", file_key)
        # NOTE(review): the response shape is kept for existing clients, but
        # returning a traceback to the caller exposes internal paths and
        # code; consider dropping it and returning a non-200 status.
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
165
 
166
  # Serve the output folder as static files
167
  app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")