document-extraction

Sleeping

App Files Community

kmuthudurai committed on Dec 13, 2024

Commit

9bcc761

verified ·

1 Parent(s): 53a4f24

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -4

app.py CHANGED Viewed

@@ -33,10 +33,11 @@ def get_ocr(lang, use_gpu=False):
     return ocr_cache.get(lang)
 # Function to extract images from PDF
-async def pdf_to_images(uploaded_file):
     try:
-        # Read file content and log the size of the file
-        file_data = await uploaded_file.read()
         logger.info(f"Received file of size {len(file_data)} bytes.")
         if len(file_data) == 0:
@@ -45,6 +46,7 @@ async def pdf_to_images(uploaded_file):
         # Open the PDF using fitz (PyMuPDF) from the byte stream
         doc = fitz.open(stream=file_data, filetype="pdf")
         if len(doc) == 0:
             raise HTTPException(status_code=400, detail="The PDF document is empty.")
@@ -90,7 +92,63 @@ async def create_upload_file(
         # Determine if the uploaded file is a PDF or an image
         if file.content_type == "application/pdf":
-            images = pdf_to_images(file)
         elif file.content_type.startswith("image/"):
             # If it's an image file, process it
             image = Image.open(io.BytesIO(contents))

     return ocr_cache.get(lang)
 # Function to extract images from PDF
+# Function to extract images from PDF
+def pdf_to_images(uploaded_file):
     try:
+        # Read the file content
+        file_data = uploaded_file.file.read()
         logger.info(f"Received file of size {len(file_data)} bytes.")
         if len(file_data) == 0:
         # Open the PDF using fitz (PyMuPDF) from the byte stream
         doc = fitz.open(stream=file_data, filetype="pdf")
+        # Check if the document has pages
         if len(doc) == 0:
             raise HTTPException(status_code=400, detail="The PDF document is empty.")
         # Determine if the uploaded file is a PDF or an image
         if file.content_type == "application/pdf":
+            images = pdf_to_images(file)  # No need to await this since it's not async
+        elif file.content_type.startswith("image/"):
+            # If it's an image file, process it
+            image = Image.open(io.BytesIO(contents))
+            images = [image]
+        else:
+            raise HTTPException(status_code=400, detail="Unsupported file type")
+        # Initialize OCR model for the chosen language
+        ocr = get_ocr(lang=lang, use_gpu=use_gpu)
+        final_results = []
+        # Iterate over the images and process with OCR
+        for image in images:
+            img2np = np.array(image)
+            result = ocr.ocr(img2np, cls=True)
+            if result:
+                result = result[0]  # Extract the result for this image
+                boxes = [line[0] for line in result]
+                txts = [line[1][0] for line in result]
+                scores = [line[1][1] for line in result]
+                # Combine results into a list of dictionaries
+                final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
+                final_results.extend(final_result)
+            else:
+                logger.warning("OCR did not return any results for the image.")
+        return final_results
+    except Exception as e:
+        # Log the error and raise a 500 HTTP error
+        logger.error(f"Error processing file: {str(e)}")
+        raise HTTPException(status_code=500, detail="Internal server error while processing the file")
+@app.post("/ocr")
+async def create_upload_file(
+    file: UploadFile = File(...),
+    lang: LangEnum = LangEnum.ch,
+):
+    try:
+        # Read the file contents
+        contents = await file.read()
+        # Log the file size
+        logger.info(f"Received file of size {len(contents)} bytes.")
+        # Ensure file is not empty
+        if len(contents) == 0:
+            raise HTTPException(status_code=400, detail="Uploaded file is empty.")
+        # Determine if the uploaded file is a PDF or an image
+        if file.content_type == "application/pdf":
+            images = pdf_to_images(file)  # No need to await this since it's not async
         elif file.content_type.startswith("image/"):
             # If it's an image file, process it
             image = Image.open(io.BytesIO(contents))