Spaces:
Build error
Build error
| """ | |
| audiolens β j2 image preprocessing | |
| prepares a raw phone-captured document image for ocr. | |
| each preprocessing step is a separate function so they can be | |
| tested, tuned, or swapped out individually as needed. | |
| pipeline order: | |
| 1. to_grayscale β converts colour input to grayscale | |
| 2. deskew β corrects tilt from phone capture angle | |
| 3. denoise β removes grain and compression artifacts | |
| 4. enhance_contrast β applies clahe for local contrast improvement | |
| 5. binarise β converts to clean black/white via otsu threshold | |
| 6. preprocess β runs all steps in order (main entry point) | |
| no downloads needed. import preprocess() directly into the pipeline. | |
| """ | |
| import numpy as np | |
| import cv2 | |
| def to_grayscale(image): | |
| """ | |
| converts a bgr colour image to grayscale. | |
| if image is already grayscale, returns a copy unchanged. | |
| """ | |
| if len(image.shape) == 3 and image.shape[2] == 3: | |
| return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) | |
| return image.copy() | |
| def deskew(gray): | |
| """ | |
| detects and corrects the dominant tilt angle of the document. | |
| common when a user photographs a document at a slight angle. | |
| uses the minimum area bounding box of dark pixel clusters to | |
| estimate the skew angle, then rotates to correct it. | |
| angles under 0.5 degrees are ignored to avoid introducing | |
| unnecessary interpolation artifacts on near-straight images. | |
| """ | |
| coords = np.column_stack(np.where(gray < 128)) | |
| # not enough dark pixels to estimate angle reliably | |
| if len(coords) < 50: | |
| return gray | |
| angle = cv2.minAreaRect(coords)[-1] | |
| # minAreaRect returns angles in [-90, 0) β normalise to [-45, 45] | |
| if angle < -45: | |
| angle = 90 + angle | |
| # skip tiny corrections | |
| if abs(angle) < 0.5: | |
| return gray | |
| h, w = gray.shape | |
| center = (w // 2, h // 2) | |
| matrix = cv2.getRotationMatrix2D(center, angle, 1.0) | |
| rotated = cv2.warpAffine( | |
| gray, matrix, (w, h), | |
| flags=cv2.INTER_CUBIC, | |
| borderMode=cv2.BORDER_REPLICATE, | |
| ) | |
| return rotated | |
| def denoise(gray): | |
| """ | |
| removes noise, grain, and jpeg compression artifacts from the image. | |
| uses opencv's non-local means denoising which is effective on | |
| document scans and phone camera captures without blurring text edges. | |
| h=10 is a conservative strength β enough to clean grain but | |
| not so aggressive that it softens thin strokes in small text. | |
| """ | |
| return cv2.fastNlMeansDenoising(gray, h=10) | |
| def enhance_contrast(gray): | |
| """ | |
| applies clahe (contrast limited adaptive histogram equalisation). | |
| unlike global histogram equalisation, clahe works on small tiles | |
| so it handles documents with uneven lighting β e.g. a shadow | |
| across part of a medicine label or a receipt photographed in dim light. | |
| cliplimit=2.0 prevents over-amplification of noise in flat regions. | |
| tileGridSize=(8, 8) gives a good balance between local and global correction. | |
| """ | |
| clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8)) | |
| return clahe.apply(gray) | |
| def binarise(gray): | |
| """ | |
| converts the grayscale image to a clean black and white binary image. | |
| uses otsu's method which automatically finds the optimal threshold | |
| value based on the image's intensity histogram β no manual tuning needed. | |
| binarisation removes any remaining grey tones and produces the | |
| high-contrast input that ocr models perform best on. | |
| """ | |
| _, binary = cv2.threshold( | |
| gray, 0, 255, | |
| cv2.THRESH_BINARY + cv2.THRESH_OTSU, | |
| ) | |
| return binary | |
| def preprocess(image): | |
| """ | |
| runs the full preprocessing pipeline on a raw document image. | |
| this is the main entry point called from the audiolens pipeline. | |
| input: numpy array β bgr colour or grayscale, any resolution | |
| output: numpy array β grayscale binarised image, same resolution | |
| pipeline: grayscale β deskew β denoise β enhance_contrast β binarise | |
| """ | |
| image = to_grayscale(image) | |
| image = deskew(image) | |
| image = denoise(image) | |
| image = enhance_contrast(image) | |
| image = binarise(image) | |
| return image | |