# Scraped page header (not part of the module): Spaces status "Sleeping";
# file size 3,870 bytes; revision e158d2f; line-number gutter 1-135.
"""
Text extraction from PDFs and images using EasyOCR
Smart extraction: tries text layer first, falls back to OCR
"""
import fitz # PyMuPDF
import easyocr
from PIL import Image
from pdf2image import convert_from_bytes
import io
import numpy as np
from typing import Tuple, Optional
# Build the shared EasyOCR reader once, at import time. Every OCR function in
# this module reads the module-level `reader`; when construction fails it is
# left as None so callers can detect that OCR is unavailable.
print("Initializing EasyOCR Reader...")
try:
    reader = easyocr.Reader(
        ['en'],
        gpu=False,
        verbose=False,
    )
    print("β EasyOCR Reader initialized successfully")
except Exception as init_error:
    # Deliberately broad: any failure here degrades to "no OCR" instead of
    # making the whole module unimportable.
    print(f"β EasyOCR initialization failed: {init_error}")
    reader = None
def extract_text_from_pdf(pdf_bytes: bytes) -> Tuple[Optional[str], bool]:
    """
    Extract text from a PDF, falling back to OCR when the text layer is unusable.

    The embedded text layer is read first because it is fast. If it yields 50
    characters or fewer after stripping whitespace, the PDF is treated as
    having no text layer and is passed to extract_text_from_pdf_via_ocr.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        (extracted_text, ocr_used).
        extracted_text is None when the input is empty, when any step raises,
        or when the OCR fallback itself returns None.
        ocr_used is True only when the OCR fallback was called and returned
        without raising.
    """
    if not pdf_bytes:
        return None, False

    try:
        # Try extracting the text layer first (fast)
        doc = fitz.open(stream=pdf_bytes, filetype="pdf")
        try:
            full_text = "".join(page.get_text() for page in doc)
        finally:
            # Release the document even if a page fails to extract.
            doc.close()

        # Check if meaningful text was extracted
        if len(full_text.strip()) > 50:
            print(f"β Extracted {len(full_text)} chars from text layer")
            return full_text.strip(), False

        # No text layer - use OCR
        print("β No text layer detected, using EasyOCR...")
        text = extract_text_from_pdf_via_ocr(pdf_bytes)
        return text, True
    except Exception as e:
        # Best-effort API: report the failure and signal it with None.
        print(f"β Error in PDF text extraction: {e}")
        return None, False
def extract_text_from_pdf_via_ocr(pdf_bytes: bytes) -> Optional[str]:
    """
    Rasterize every PDF page and run EasyOCR over each resulting image.

    Args:
        pdf_bytes: Raw contents of the PDF file.

    Returns:
        The text of all pages, each page followed by a blank line, with
        leading/trailing whitespace stripped. None if rasterization or OCR
        raises.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        # Render the PDF pages to PIL images at 300 DPI.
        pages = convert_from_bytes(pdf_bytes, dpi=300)
        page_count = len(pages)

        chunks = []
        for page_number, page_image in enumerate(pages, start=1):
            print(f" OCR processing page {page_number}/{page_count}...")
            # The reader is fed a numpy array rather than the PIL image.
            pixels = np.array(page_image)
            fragments = reader.readtext(pixels, detail=0, paragraph=True)
            # One chunk per page: its fragments on one line, then a blank line.
            chunks.append(' '.join(fragments) + "\n\n")

        combined = "".join(chunks)
        print(f"β EasyOCR extracted {len(combined)} chars from {page_count} pages")
        return combined.strip()
    except Exception as err:
        print(f"β OCR failed: {err}")
        return None
def extract_text_from_image(image_bytes: bytes) -> Optional[str]:
    """
    Run EasyOCR over an encoded image and return the recognized text.

    Args:
        image_bytes: Raw contents of the image file.

    Returns:
        The recognized fragments joined by single spaces and stripped of
        leading/trailing whitespace. None if decoding or OCR raises.

    Raises:
        RuntimeError: If the module-level EasyOCR reader is not available.
    """
    if not reader:
        raise RuntimeError("EasyOCR not initialized")

    try:
        print("Processing image with EasyOCR...")

        decoded = Image.open(io.BytesIO(image_bytes))
        # Normalize to RGB; images already in RGB are passed through untouched.
        rgb_image = decoded if decoded.mode == 'RGB' else decoded.convert('RGB')

        # The reader is fed a numpy array rather than the PIL image.
        fragments = reader.readtext(np.array(rgb_image), detail=0, paragraph=True)
        joined = ' '.join(fragments)

        print(f"β EasyOCR extracted {len(joined)} chars from image")
        return joined.strip()
    except Exception as err:
        print(f"β Image OCR failed: {err}")
        return None
def get_ocr_confidence(image_array: np.ndarray) -> list:
    """
    Get detailed OCR results with confidence scores.

    Args:
        image_array: Image pixels as a numpy array; passed unchanged to
            reader.readtext.

    Returns:
        A list with one dict per recognized item, holding the keys "text",
        "confidence" (rounded to 3 decimal places) and "bbox" (returned as
        produced by the reader). An empty list when the module-level reader
        is unavailable or when OCR fails.
    """
    if not reader:
        return []

    try:
        results = reader.readtext(image_array, detail=1)
        return [
            {
                "text": text,
                # Coerce to a built-in float so the rounded value is a plain
                # Python number whatever numeric type the reader yields.
                "confidence": round(float(conf), 3),
                "bbox": bbox
            }
            for bbox, text, conf in results
        ]
    except Exception as e:
        # Still best-effort (returns []), but no longer a bare `except:`:
        # KeyboardInterrupt/SystemExit now propagate and the failure is
        # reported instead of being silently discarded.
        print(f"OCR confidence extraction failed: {e}")
        return []
# End of scraped file listing.