document-extraction

Sleeping

App Files Files Community

kmuthudurai committed on Dec 18, 2024

Commit

4434125

verified ·

1 Parent(s): c088f72

Update app.py

Browse files

Files changed (1) hide show

app.py +136 -103

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import uvicorn
 from fastapi.staticfiles import StaticFiles
 import hashlib
 from enum import Enum
-from fastapi import FastAPI, UploadFile, File, HTTPException
 from paddleocr import PaddleOCR, PPStructure, save_structure_res
 from PIL import Image
 import io
@@ -10,6 +10,17 @@ import numpy as np
 import fitz  # PyMuPDF for PDF handling
 import logging
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -18,117 +29,139 @@ app = FastAPI(docs_url='/')
 use_gpu = False
 output_dir = 'output'
class LangEnum(str, Enum):
    """Language codes accepted by the OCR endpoint.

    Members are also plain ``str`` instances, so each one compares equal to
    its code and can be passed wherever a string language code is expected.
    """

    # The two codes this service exposes.
    ch = "ch"
    en = "en"
# Cache of PaddleOCR engines, keyed by (language, use_gpu).
ocr_cache = {}


# Get OCR instance
def get_ocr(lang, use_gpu=False):
    """Return a PaddleOCR engine for ``lang``, creating it on first use.

    Engines are cached per ``(lang, use_gpu)`` pair so that asking for the
    same language with a different device setting never hands back an engine
    that was built for the other setting.

    Args:
        lang: Language code forwarded to ``PaddleOCR``.
        use_gpu: Whether the engine should be created with GPU support.

    Returns:
        The cached (or newly created) ``PaddleOCR`` instance.
    """
    cache_key = (lang, use_gpu)
    if cache_key not in ocr_cache:
        ocr_cache[cache_key] = PaddleOCR(use_angle_cls=True, lang=lang, use_gpu=use_gpu)
    return ocr_cache[cache_key]
# Function to extract images from PDF
def pdf_to_images(uploaded_file):
    """Render every page of an uploaded PDF to PNG bytes.

    Args:
        uploaded_file: Object whose synchronous ``read()`` returns the PDF
            content as bytes.

    Returns:
        A list with one ``{"mime_type": "image/png", "data": <png bytes>}``
        dict per page, in page order.

    Raises:
        HTTPException: 400 when the upload or the document is empty; 500 for
            any other failure while opening or rendering the PDF.
    """
    try:
        # Read the file content
        file_data = uploaded_file.read()
        logger.info(f"Received file of size {len(file_data)} bytes.")
        if not file_data:
            raise HTTPException(status_code=400, detail="Uploaded PDF is empty.")

        # Open the PDF using fitz (PyMuPDF) from the byte stream
        doc = fitz.open(stream=file_data, filetype="pdf")
        try:
            page_count = len(doc)
            if page_count == 0:
                raise HTTPException(status_code=400, detail="The PDF document is empty.")
            logger.info(f"PDF loaded successfully with {page_count} pages.")

            image_parts = []
            for page_number in range(page_count):
                page = doc.load_page(page_number)
                image_data = page.get_pixmap().tobytes("png")
                # Log progress for each page
                logger.info(f"Processed page {page_number + 1}/{page_count}.")
                image_parts.append({
                    "mime_type": "image/png",
                    "data": image_data,
                })
        finally:
            # Release the document even when a page fails to render.
            doc.close()

        logger.info(f"PDF to image conversion completed with {len(image_parts)} images.")
        return image_parts
    except HTTPException:
        # Keep the deliberate 400 responses instead of rewriting them as 500.
        raise
    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing PDF file") from e
@app.post("/ocr")
async def create_upload_file(
    file: UploadFile = File(...),
    lang: LangEnum = LangEnum.ch,
):
    """Run OCR on an uploaded image or PDF.

    Args:
        file: The uploaded image (``image/*``) or PDF (``application/pdf``).
        lang: Language of the OCR model to use.

    Returns:
        A flat list of ``{"boxes": ..., "txt": ..., "score": ...}`` dicts, one
        per detected text line, across all pages/images in upload order.

    Raises:
        HTTPException: 400 for an empty upload or unsupported content type;
            500 for any unexpected processing failure.
    """
    try:
        # Read the file contents
        contents = await file.read()
        logger.info(f"Received file of size {len(contents)} bytes.")

        # Ensure file is not empty
        if not contents:
            raise HTTPException(status_code=400, detail="Uploaded file is empty.")

        # content_type can be missing on some uploads; treat that as unsupported.
        content_type = file.content_type or ""
        if content_type == "application/pdf":
            # The upload stream has already been consumed above and its read()
            # is a coroutine, so hand the helper an in-memory stream instead.
            page_parts = pdf_to_images(io.BytesIO(contents))
            # The helper returns PNG bytes wrapped in dicts; decode them into
            # PIL images so both branches feed the OCR loop the same type.
            images = [Image.open(io.BytesIO(part["data"])) for part in page_parts]
        elif content_type.startswith("image/"):
            images = [Image.open(io.BytesIO(contents))]
        else:
            raise HTTPException(status_code=400, detail="Unsupported file type")

        # Initialize OCR model for the chosen language
        ocr = get_ocr(lang=lang, use_gpu=use_gpu)

        final_results = []
        for image in images:
            # Normalise to 3-channel RGB so palette/alpha images are handled too.
            img2np = np.array(image.convert("RGB"))
            result = ocr.ocr(img2np, cls=True)
            # The per-image entry may be None/empty when no text was detected.
            page_lines = result[0] if result else None
            if not page_lines:
                logger.warning("OCR did not return any results for the image.")
                continue
            final_results.extend(
                dict(boxes=line[0], txt=line[1][0], score=line[1][1])
                for line in page_lines
            )
        return final_results
    except HTTPException:
        # Preserve intentional 4xx responses raised above or by pdf_to_images.
        raise
    except Exception as e:
        # Log the error and raise a 500 HTTP error
        logger.error(f"Error processing file: {str(e)}")
        raise HTTPException(status_code=500, detail="Internal server error while processing the file") from e
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")

 from fastapi.staticfiles import StaticFiles
 import hashlib
 from enum import Enum
+from fastapi import FastAPI,Header, Query,Depends,HTTPException
 from paddleocr import PaddleOCR, PPStructure, save_structure_res
 from PIL import Image
 import io
 import fitz  # PyMuPDF for PDF handling
 import logging
+import boto3
+import openai
+import os
+import traceback  # For detailed traceback of errors
+import re
+import json
+from dotenv import load_dotenv
+import uvicorn
+load_dotenv()
 # Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 use_gpu = False
 output_dir = 'output'
# Shared PaddleOCR engine (English, with text-angle classification), created
# once at import time. It honours the module-level use_gpu flag instead of
# silently falling back to the library default.
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu=use_gpu)

# Shared secret that incoming requests must present (see verify_api_key).
API_KEY = os.getenv("API_KEY")

# AWS S3 configuration
AWS_ACCESS_KEY = os.getenv("AWS_ACCESS_KEY")
AWS_SECRET_KEY = os.getenv("AWS_SECRET_KEY")
S3_BUCKET_NAME = os.getenv("S3_BUCKET_NAME")

# OpenAI configuration
openai.api_key = os.getenv("OPENAI_API_KEY")

# S3 client built from the credentials read above
s3_client = boto3.client(
    's3',
    aws_access_key_id=AWS_ACCESS_KEY,
    aws_secret_access_key=AWS_SECRET_KEY,
)
# Function to fetch file from S3
def fetch_file_from_s3_file(file_key):
    """Download one object from the configured S3 bucket.

    Args:
        file_key: Key of the object inside ``S3_BUCKET_NAME``.

    Returns:
        A ``(stream, content_type)`` tuple: the object's bytes wrapped in an
        ``io.BytesIO`` and the MIME type reported by S3.

    Raises:
        Exception: With the message "Failed to fetch file from S3: ..." if
            anything goes wrong while fetching or reading the object.
    """
    try:
        s3_object = s3_client.get_object(Bucket=S3_BUCKET_NAME, Key=file_key)
        mime_type = s3_object['ContentType']
        payload = s3_object['Body'].read()
        stream = io.BytesIO(payload)
        return stream, mime_type
    except Exception as e:
        message = f"Failed to fetch file from S3: {str(e)}"
        raise Exception(message)
# Function to summarize text using OpenAI GPT
def summarize_text(text):
    """Summarize OCR text with an OpenAI chat model and parse the reply as JSON.

    Args:
        text: The OCR text to summarize.

    Returns:
        The value decoded from the model's JSON reply. If the request or the
        JSON parsing fails, the string ``"Error in summarization: <reason>"``
        is returned instead of raising, so the caller can still respond with
        the extracted text.
    """
    system_prompt = "You are a helpful assistant that summarizes extracted OCR text into JSON format always"
    try:
        response = openai.ChatCompletion.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": f"Summarize the following text and provide the JSON format always: {text}"}
            ],
            temperature=0.5,
            max_tokens=16384
        )
        content = response.choices[0].message.content.strip()
        # The reply may be wrapped in a Markdown code fence. Strip an opening
        # fence with any (or no) language tag and a closing fence, tolerating
        # trailing spaces and CRLF line endings.
        cleaned_content = re.sub(r'^```[A-Za-z]*[ \t]*\r?\n', '', content)
        cleaned_content = re.sub(r'\r?\n[ \t]*```$', '', cleaned_content)
        # Parse the cleaned content as JSON and hand it back to the caller.
        return json.loads(cleaned_content)
    except Exception as e:
        # Best-effort by design: record the failure, then report it in-band.
        logger.exception("Summarization failed")
        return f"Error in summarization: {str(e)}"
# Dependency to check API Key
def verify_api_key(api_key: str = Header(...)):
    """Reject the request unless it carries the configured API key.

    Args:
        api_key: Value of the request's API key header (required).

    Raises:
        HTTPException: 401 when the key does not match, or when no non-empty
            ``API_KEY`` is configured (the check fails closed).
    """
    # Local import keeps this block self-contained; hmac is standard library.
    import hmac

    # Compare as bytes in constant time: avoids leaking how much of the key
    # matched, and compare_digest rejects non-ASCII str arguments.
    key_matches = bool(API_KEY) and hmac.compare_digest(
        api_key.encode("utf-8"), API_KEY.encode("utf-8")
    )
    if not key_matches:
        raise HTTPException(status_code=401, detail="Invalid API Key")
@app.get("/")
def read_root():
    """Return the static welcome payload for the service root."""
    # NOTE(review): the app appears to be created with docs_url='/', which
    # would claim this same path - confirm which handler actually serves '/'.
    greeting = "Welcome to the PaddleOCR with S3 and GPT Summarization API!"
    return {"message": greeting}
def _collect_ocr_text(image):
    """Run the shared OCR engine on a PIL image and return its text fragments."""
    result = ocr.ocr(np.array(image.convert("RGB")), cls=True)
    # A per-image entry may be None/empty when no text is detected; skip those
    # instead of failing while iterating them.
    return [
        word_info[1][0]
        for line in (result or [])
        if line
        for word_info in line
    ]


@app.get("/ocr/extraction")
def ocr_from_s3(
    api_key: str = Depends(verify_api_key),
    file_key: str = Query(..., description="S3 file key for the file"),
):
    """
    Perform OCR on a file (PDF or Image) stored in S3 and summarize the text using GPT.

    Args:
        api_key: Result of the ``verify_api_key`` dependency; present only so
            the key check runs before the handler.
        file_key: Key of the S3 object to process.

    Returns:
        On success, a dict with ``file_key``, ``file_type``, ``extracted_text``
        and ``summary``. For an unsupported MIME type or any failure, a dict
        with a single ``error`` entry (a message string, or a dict holding the
        error type, message and traceback).
    """
    try:
        # Fetch file from S3
        file_data, content_type = fetch_file_from_s3_file(file_key)

        # Determine file type based on MIME type
        if content_type.startswith("image/"):  # Image file
            extracted_text = _collect_ocr_text(Image.open(file_data))
        elif content_type == "application/pdf":  # PDF file
            extracted_text = []
            pdf_document = fitz.open(stream=file_data, filetype="pdf")
            try:
                for page in pdf_document:
                    # Render the page to PNG and OCR the resulting image.
                    pix = page.get_pixmap()
                    page_image = Image.open(io.BytesIO(pix.tobytes("png")))
                    extracted_text.extend(_collect_ocr_text(page_image))
            finally:
                # Close the document even if a page fails to render or OCR.
                pdf_document.close()
        else:
            return {"error": f"Unsupported file type: {content_type}"}

        # Combine extracted text
        full_text = " ".join(extracted_text)
        # Summarize the extracted text
        summary = summarize_text(full_text)
        return {
            "file_key": file_key,
            "file_type": content_type,
            "extracted_text": full_text,
            "summary": summary
        }
    except Exception as e:
        # Record the failure server-side; previously it was only returned.
        logger.exception("OCR extraction failed for file_key=%s", file_key)
        # NOTE(review): the response shape is kept for existing clients, but
        # returning a traceback exposes server internals - consider dropping it.
        error_details = {
            "error_type": type(e).__name__,
            "error_message": str(e),
            "traceback": traceback.format_exc()
        }
        return {"error": error_details}
 # Serve the output folder as static files
 app.mount("/output", StaticFiles(directory="output", follow_symlink=True, html=True), name="output")