"""
OCR module adapted for HuggingFace Spaces.
Uses Google Cloud Vision API for text detection.
"""

from PIL import Image, ImageDraw
from google.cloud import vision
import io
import os
import json
import tempfile

from py_files.bounding_clustering import QuadTree, Node


def change_contrast(img, level):
    """Adjust image contrast for better OCR results."""
    factor = (259 * (level + 255)) / (255 * (259 - level))

    def contrast(c):
        return 128 + factor * (c - 128)

    return img.point(contrast)
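
# Illustrative usage (a sketch, not part of the original pipeline): this is the
# common contrast formula, where level 0 leaves the image unchanged and positive
# values push pixel values away from mid-gray (128). "scan.png" is hypothetical.
#
#     img = Image.open("scan.png").convert("RGB")
#     img = change_contrast(img, 20)  # mild boost, as used below before OCR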


def get_bounding_box_doc(blk):
    """Extract [x0, y0, x1, y1] bounding box coordinates from a document text block."""
    # Vertices 0 and 2 are opposite corners of the bounding quad
    vertices = [int(blk.bounding_box.vertices[0].x), int(blk.bounding_box.vertices[0].y),
                int(blk.bounding_box.vertices[2].x), int(blk.bounding_box.vertices[2].y)]
    return vertices


def get_text_from_image_doc(img, debug=False, get_response=False, resp=None, max_dist=20):
    """
    Extract text from an image using Google Cloud Vision document text detection.
    Adapted for the HuggingFace Spaces environment.
    """
    response = resp
    if resp is None:
        # Initialize the client with credentials from the environment
        try:
            # Try to get credentials from an environment variable
            google_creds = os.environ.get('GOOGLE_CLOUD_CREDENTIALS')
            if google_creds:
                # Write the JSON credentials to a temporary file
                creds_data = json.loads(google_creds)
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                    json.dump(creds_data, f)
                    creds_path = f.name
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path
            client = vision.ImageAnnotatorClient()
            # Enhance image contrast for better OCR
            img = change_contrast(img, 20)
            # Convert the PIL image to PNG bytes
            img_byte_arr = io.BytesIO()
            img.save(img_byte_arr, format='PNG')
            image = vision.Image(content=img_byte_arr.getvalue())
            # Perform document text detection
            response = client.document_text_detection(image=image)
            # Clean up the temporary credentials file
            if google_creds and 'creds_path' in locals():
                try:
                    os.unlink(creds_path)
                except OSError:
                    pass
        except Exception as e:
            # Fallback: create a dummy response so the demo still works
            print(f"Warning: Google Cloud Vision not available: {e}")
            response = create_dummy_ocr_response(img)

    # Process the response
    word_boxes = []
    if hasattr(response, 'full_text_annotation') and response.full_text_annotation:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                if block.confidence < 0.9:
                    continue
                if debug:
                    print(f"\nBlock confidence: {block.confidence}")
                    print(f"Block box: {get_bounding_box_doc(block)}")
                words = ""
                fonts = []
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join([symbol.text for symbol in word.symbols])
                        words += word_text + " "
                        word_bbox = get_bounding_box_doc(word)
                        # Word height approximates the font size
                        fonts.append(abs(word_bbox[3] - word_bbox[1]))
                if debug:
                    print(f"Words: {words}")
                if fonts:  # Only add blocks with font information
                    word_boxes.append([words.strip()] + get_bounding_box_doc(block)
                                      + [sum(fonts) // len(fonts)])

    # If no text was detected, create a minimal placeholder entry
    if not word_boxes:
        word_boxes.append(["No text detected", 0, 0, 100, 20, 12])

    # Cluster nearby text boxes with a quadtree
    tree = QuadTree(max_dist=max_dist)
    for word_box in word_boxes:
        tree.insert(Node(*word_box))

    if get_response:
        return tree, response
    return tree, {}
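
# Illustrative usage (a sketch; "receipt.png" is a hypothetical file, and the
# GOOGLE_CLOUD_CREDENTIALS environment variable must hold service-account JSON
# for real OCR — otherwise the dummy-response fallback is used):
#
#     img = Image.open("receipt.png").convert("RGB")
#     tree, response = get_text_from_image_doc(img, get_response=True)
#     # The raw response can be reused to avoid a second API call:
#     tree2, _ = get_text_from_image_doc(img, resp=response)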


def create_dummy_ocr_response(img):
    """
    Create a dummy OCR response for demo purposes when Google Cloud Vision is not available.
    This allows the demo to work without requiring actual OCR credentials.
    """
    # A minimal mock exposing the one attribute the response processing checks for
    class MockResponse:
        def __init__(self):
            self.full_text_annotation = None

    # For demo purposes, return an empty response.
    # In a real scenario, you might want to use an alternative OCR library like pytesseract.
    return MockResponse()
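
# Hedged sketch of the pytesseract fallback mentioned above (an assumption, not
# part of the original module; it requires the `pytesseract` package and a local
# Tesseract install). It would yield word boxes in the same
# [text, x0, y0, x1, y1, font_height] shape that the response processing builds:
#
#     import pytesseract
#
#     def tesseract_word_boxes(img, min_conf=60):
#         data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
#         boxes = []
#         for text, conf, x, y, w, h in zip(data["text"], data["conf"],
#                                           data["left"], data["top"],
#                                           data["width"], data["height"]):
#             if text.strip() and float(conf) >= min_conf:
#                 boxes.append([text, x, y, x + w, y + h, h])
#         return boxes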


def draw_boxes(img, bound, color, width=5):
    """Draw a padded bounding box on a copy of the image for visualization."""
    _img = img.copy()
    draw = ImageDraw.Draw(_img)
    # Pad the box slightly so it does not touch the text
    x0 = min(bound[0], bound[2]) - 7
    x1 = max(bound[0], bound[2]) + 10
    y0 = min(bound[1], bound[3]) - 7
    y1 = max(bound[1], bound[3]) + 10
    draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    return _img, x0, y0, x1, y1
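
# Illustrative usage (a sketch): draw a red box around a region given as
# [x0, y0, x1, y1]; the -7/+10 offsets pad the box beyond the glyphs.
#
#     boxed, x0, y0, x1, y1 = draw_boxes(img, [40, 60, 220, 95], color="red")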


def get_image_with_boxes_doc(image, color='red', width=5, get_response=False, response=None):
    """Return the image with clustered OCR bounding boxes drawn on it."""
    tree, resp = get_text_from_image_doc(image, get_response=get_response, resp=response)
    bxs = tree.get_children(False)
    for bx in bxs:
        image, x0, y0, x1, y1 = draw_boxes(image, bx, color, width)
    if get_response:
        return image, resp
    return image
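

if __name__ == "__main__":
    # Minimal smoke test (a sketch: "sample.png" is a hypothetical local image;
    # without Google Cloud credentials the dummy-response fallback kicks in and
    # a single "No text detected" placeholder box is drawn).
    sample = Image.open("sample.png").convert("RGB")
    annotated = get_image_with_boxes_doc(sample)
    annotated.save("sample_annotated.png")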