Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -33,15 +33,22 @@ def get_ocr(lang, use_gpu=False):
|
|
| 33 |
return ocr_cache.get(lang)
|
| 34 |
|
| 35 |
# Function to extract images from PDF
|
| 36 |
-
def pdf_to_images(
|
| 37 |
try:
|
| 38 |
-
doc = fitz.open(
|
| 39 |
-
|
| 40 |
-
|
|
|
|
|
|
|
| 41 |
pix = page.get_pixmap()
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
except Exception as e:
|
| 46 |
logger.error(f"Error processing PDF: {str(e)}")
|
| 47 |
raise HTTPException(status_code=500, detail="Error processing PDF file")
|
|
@@ -56,7 +63,7 @@ async def create_upload_file(
|
|
| 56 |
|
| 57 |
# Determine if the uploaded file is a PDF or image
|
| 58 |
if file.content_type == "application/pdf":
|
| 59 |
-
images = pdf_to_images(
|
| 60 |
else:
|
| 61 |
# If it's an image file
|
| 62 |
images = [Image.open(io.BytesIO(contents))]
|
|
|
|
| 33 |
return ocr_cache.get(lang)
|
| 34 |
|
| 35 |
# Function to extract images from PDF
|
| 36 |
+
def pdf_to_images(uploaded_file):
    """Render every page of an uploaded PDF to PNG-encoded bytes.

    Args:
        uploaded_file: Object whose ``read()`` returns the raw bytes of the
            PDF. NOTE(review): if this is a FastAPI ``UploadFile``, ``read()``
            is a coroutine and the stream may already have been consumed by
            the caller -- confirm against the call site.

    Returns:
        A list with one dict per page, in page order, each shaped as
        ``{"mime_type": "image/png", "data": <PNG bytes>}``.
        NOTE(review): the non-PDF branch of the caller builds a list of PIL
        images; verify that downstream code accepts these dicts as well.

    Raises:
        HTTPException: Status 500 if the PDF cannot be opened or rendered.
    """
    try:
        # Iterate the opened document itself (bound to ``doc``); the context
        # manager closes it even when rendering a page fails.
        with fitz.open(stream=uploaded_file.read(), filetype="pdf") as doc:
            image_parts = []
            for page in doc:
                pix = page.get_pixmap()
                image_parts.append({
                    "mime_type": "image/png",
                    "data": pix.tobytes("png"),
                })
            return image_parts
    except Exception as e:
        logger.error(f"Error processing PDF: {str(e)}")
        raise HTTPException(status_code=500, detail="Error processing PDF file") from e
|
|
|
|
| 63 |
|
| 64 |
# Determine if the uploaded file is a PDF or image
|
| 65 |
if file.content_type == "application/pdf":
|
| 66 |
+
images = pdf_to_images(file)
|
| 67 |
else:
|
| 68 |
# If it's an image file
|
| 69 |
images = [Image.open(io.BytesIO(contents))]
|