document-extraction

Sleeping

App Files Files Community

kmuthudurai committed on Dec 13, 2024

Commit

3aa7abf

verified ·

1 Parent(s): 58b796b

Update app.py

Browse files

Files changed (1) hide show

app.py +44 -17

app.py CHANGED Viewed

@@ -35,11 +35,19 @@ def get_ocr(lang, use_gpu=False):
 # Function to extract images from PDF
 def pdf_to_images(uploaded_file):
     try:
-        # Read the uploaded file as bytes
-        file_data = uploaded_file.file.read()  # This returns the file as bytes
-        # Open the PDF using fitz (PyMuPDF) from the bytes data
         doc = fitz.open(stream=file_data, filetype="pdf")
         logger.info(f"PDF loaded successfully with {len(doc)} pages.")
         image_parts = []
@@ -70,34 +78,53 @@ async def create_upload_file(
     lang: LangEnum = LangEnum.ch,
 ):
     try:
         contents = await file.read()
-        # Determine if the uploaded file is a PDF or image
         if file.content_type == "application/pdf":
             images = pdf_to_images(file)
         else:
-            # If it's an image file
-            images = [Image.open(io.BytesIO(contents))]
         ocr = get_ocr(lang=lang, use_gpu=use_gpu)
-        final_results = []
         for image in images:
             img2np = np.array(image)
-            result = ocr.ocr(img2np, cls=True)[0]
-            boxes = [line[0] for line in result]
-            txts = [line[1][0] for line in result]
-            scores = [line[1][1] for line in result]
-            # 识别结果
-            final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
-            final_results.extend(final_result)
         return final_results
     except Exception as e:
         logger.error(f"Error processing file: {str(e)}")
         raise HTTPException(status_code=500, detail="Internal server error while processing the file")

 # Function to extract images from PDF
 def pdf_to_images(uploaded_file):
     try:
+        # Read file content and log the size of the file
+        file_data = uploaded_file.file.read()
+        logger.info(f"Received file of size {len(file_data)} bytes.")
+        if len(file_data) == 0:
+            raise HTTPException(status_code=400, detail="Uploaded PDF is empty.")
+        # Open the PDF using fitz (PyMuPDF) from the byte stream
         doc = fitz.open(stream=file_data, filetype="pdf")
+        if len(doc) == 0:
+            raise HTTPException(status_code=400, detail="The PDF document is empty.")
         logger.info(f"PDF loaded successfully with {len(doc)} pages.")
         image_parts = []
     lang: LangEnum = LangEnum.ch,
 ):
     try:
+        # Read the file contents
         contents = await file.read()
+        # Log the file size
+        logger.info(f"Received file of size {len(contents)} bytes.")
+        # Ensure file is not empty
+        if len(contents) == 0:
+            raise HTTPException(status_code=400, detail="Uploaded file is empty.")
+        # Determine if the uploaded file is a PDF or an image
         if file.content_type == "application/pdf":
             images = pdf_to_images(file)
+        elif file.content_type.startswith("image/"):
+            # If it's an image file, process it
+            image = Image.open(io.BytesIO(contents))
+            images = [image]
         else:
+            raise HTTPException(status_code=400, detail="Unsupported file type")
+        # Initialize OCR model for the chosen language
         ocr = get_ocr(lang=lang, use_gpu=use_gpu)
+        final_results = []
+        # Iterate over the images and process with OCR
         for image in images:
             img2np = np.array(image)
+            result = ocr.ocr(img2np, cls=True)
+            if result:
+                result = result[0]  # Extract the result for this image
+                boxes = [line[0] for line in result]
+                txts = [line[1][0] for line in result]
+                scores = [line[1][1] for line in result]
+                # Combine results into a list of dictionaries
+                final_result = [dict(boxes=box, txt=txt, score=score) for box, txt, score in zip(boxes, txts, scores)]
+                final_results.extend(final_result)
+            else:
+                logger.warning("OCR did not return any results for the image.")
         return final_results
     except Exception as e:
+        # Log the error and raise a 500 HTTP error
         logger.error(f"Error processing file: {str(e)}")
         raise HTTPException(status_code=500, detail="Internal server error while processing the file")