| """ | |
| OCR module adapted for HuggingFace Spaces. | |
| Uses Google Cloud Vision API for text detection. | |
| """ | |
| from PIL import Image, ImageDraw, ImageFilter | |
| from google.cloud import vision | |
| import numpy as np | |
| import io | |
| import os | |
| import json | |
| import tempfile | |
| from py_files.bounding_clustering import QuadTree, Node | |


def change_contrast(img, level):
    """Adjust image contrast for better OCR results."""
    factor = (259 * (level + 255)) / (255 * (259 - level))

    def contrast(c):
        return 128 + factor * (c - 128)

    return img.point(contrast)


def get_bounding_box_doc(blk):
    """Extract bounding box coordinates [x0, y0, x1, y1] from a document text block."""
    vertices = [int(blk.bounding_box.vertices[0].x), int(blk.bounding_box.vertices[0].y),
                int(blk.bounding_box.vertices[2].x), int(blk.bounding_box.vertices[2].y)]
    return vertices


def get_text_from_image_doc(img, debug=False, get_response=False, resp=None, max_dist=20):
    """
    Extract text from an image using Google Cloud Vision document text detection.
    Adapted for the HuggingFace Spaces environment.
    """
    response = resp
    if resp is None:
        # Initialize the client with credentials from the environment
        try:
            # Try to get credentials from an environment variable
            google_creds = os.environ.get('GOOGLE_CLOUD_CREDENTIALS')
            if google_creds:
                # Write the credentials to a temporary file
                creds_data = json.loads(google_creds)
                with tempfile.NamedTemporaryFile(mode='w', suffix='.json', delete=False) as f:
                    json.dump(creds_data, f)
                    creds_path = f.name
                os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = creds_path

            client = vision.ImageAnnotatorClient()

            # Enhance image contrast for better OCR
            img = change_contrast(img, 20)

            # Convert the PIL image to bytes
            imgByteArr = io.BytesIO()
            img.save(imgByteArr, format='PNG')
            image = vision.Image(content=imgByteArr.getvalue())

            # Perform document text detection
            response = client.document_text_detection(image=image)

            # Clean up the temporary credentials file
            if google_creds and 'creds_path' in locals():
                try:
                    os.unlink(creds_path)
                except OSError:
                    pass
        except Exception as e:
            # Fallback: create a dummy response for demo purposes
            print(f"Warning: Google Cloud Vision not available: {e}")
            response = create_dummy_ocr_response(img)

    # Process the response
    word_boxes = []
    if hasattr(response, 'full_text_annotation') and response.full_text_annotation:
        for page in response.full_text_annotation.pages:
            for block in page.blocks:
                if block.confidence < 0.9:
                    continue
                if debug:
                    print(f"\nBlock confidence: {block.confidence}")
                    print(f"Block box: {get_bounding_box_doc(block)}")
                words = ""
                fonts = []
                for paragraph in block.paragraphs:
                    for word in paragraph.words:
                        word_text = "".join([symbol.text for symbol in word.symbols])
                        words += word_text + " "
                        # Use the word's bounding-box height as a proxy for font size
                        word_bbox = get_bounding_box_doc(word)
                        fonts.append(abs(word_bbox[3] - word_bbox[1]))
                if debug:
                    print(f"Words: {words}")
                if fonts:  # Only add the block if we have font-size information
                    word_boxes.append(
                        [words.strip()] + get_bounding_box_doc(block) + [sum(fonts) // len(fonts)]
                    )

    # If no text was detected, create a minimal entry
    if not word_boxes:
        word_boxes.append(["No text detected", 0, 0, 100, 20, 12])

    # Create a QuadTree for clustering nearby text boxes
    tree = QuadTree(max_dist=max_dist)
    for i in range(len(word_boxes)):
        tree.insert(Node(*tuple(word_boxes[i])))

    if get_response:
        return tree, response
    return tree, {}
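

# Note on credentials: the function above expects the full service-account JSON (not a
# file path) in the GOOGLE_CLOUD_CREDENTIALS environment variable, e.g. set as a Space
# secret. A local sketch (the filename is illustrative only, not shipped with the module):
#
#   os.environ["GOOGLE_CLOUD_CREDENTIALS"] = open("service_account.json").read()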


def create_dummy_ocr_response(img):
    """
    Create a dummy OCR response for demo purposes when Google Cloud Vision is not available.
    This allows the demo to work without requiring actual OCR credentials.
    """
    W, H = img.size

    # Create a simple mock response object
    class MockResponse:
        def __init__(self):
            self.full_text_annotation = None

    # For demo purposes, we just return an empty response.
    # In a real scenario, you might want to use an alternative OCR library like pytesseract.
    return MockResponse()
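

# Sketch of the pytesseract alternative mentioned above: a local, credential-free
# fallback, assuming the optional pytesseract package and the Tesseract binary are
# installed. It returns entries in the same [text, x0, y0, x1, y1, font_height]
# layout that get_text_from_image_doc builds, not a Vision-style response object,
# so wiring it into the pipeline would still need a small adapter. The function name
# and threshold below are hypothetical, not part of the original module.
def create_tesseract_word_boxes(img, min_conf=60):
    """Hypothetical offline fallback using pytesseract instead of Cloud Vision."""
    import pytesseract  # optional dependency, imported lazily

    data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    word_boxes = []
    for i in range(len(data['text'])):
        text = data['text'][i].strip()
        # Skip empty detections and low-confidence words
        if not text or float(data['conf'][i]) < min_conf:
            continue
        x, y = data['left'][i], data['top'][i]
        w, h = data['width'][i], data['height'][i]
        word_boxes.append([text, x, y, x + w, y + h, h])
    return word_boxes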


def draw_boxes(img, bound, color, width=5):
    """Draw a bounding box on a copy of the image for visualization."""
    _img = img.copy()
    draw = ImageDraw.Draw(_img)
    x0 = min(bound[0], bound[2]) - 7
    x1 = max(bound[0], bound[2]) + 10
    y0 = min(bound[1], bound[3]) - 7
    y1 = max(bound[1], bound[3]) + 10
    draw.rectangle([x0, y0, x1, y1], outline=color, width=width)
    return _img, x0, y0, x1, y1


def get_image_with_boxes_doc(image, color='red', width=5, get_response=False, response=None):
    """Return the image with OCR bounding boxes drawn on it."""
    tree, resp = get_text_from_image_doc(image, get_response=get_response, resp=response)
    bxs = tree.get_children(False)
    for bx in bxs:
        image, x0, y0, x1, y1 = draw_boxes(image, bx, color, width)
    if get_response:
        return image, resp
    return image
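

# Minimal usage sketch, assuming a local test image; "sample.png" and the output
# filename are illustrative placeholders, not files shipped with this Space.
if __name__ == "__main__":
    demo_img = Image.open("sample.png").convert("RGB")

    # Draw the clustered OCR boxes and save the annotated copy for inspection
    annotated = get_image_with_boxes_doc(demo_img, color='red', width=5)
    annotated.save("sample_with_boxes.png")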