import functools
import io
import os
import re
import time

import pandas as pd

# The OCR backends are optional dependencies: a missing package must not make
# the pure post-processing helpers below unimportable.  Each backend checks for
# its package at call time and raises a clear ImportError instead.
try:
    import cv2
except ImportError:
    cv2 = None

try:
    import easyocr
except ImportError:
    easyocr = None

try:
    import pytesseract
except ImportError:
    pytesseract = None

try:
    from azure.ai.vision.imageanalysis import ImageAnalysisClient
    from azure.ai.vision.imageanalysis.models import VisualFeatures
    from azure.core.credentials import AzureKeyCredential
except ImportError:
    ImageAnalysisClient = None
    VisualFeatures = None
    AzureKeyCredential = None


# ---------------- OCR MODELS ------------------------------

def _require(dependency, package_name):
    """Raise ImportError if an optional backend package failed to import."""
    if dependency is None:
        raise ImportError(
            f"The '{package_name}' package is required for this operation "
            "but is not installed."
        )


def _read_image(image_path):
    """Load an image with OpenCV, raising instead of returning None.

    Raises:
        - ImportError: OpenCV is not installed.
        - FileNotFoundError: the file is missing or cannot be decoded.
    """
    _require(cv2, "opencv-python")
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Could not read image file: {image_path!r}")
    return image


def run_ocr(ocr_model: str, image_path, api_key=None, endpoint=None):
    """Run the selected OCR backend on an image.

    Args:
        - ocr_model: one of "Azure", "EasyOCR" or "Pytesseract".
        - image_path: path to the image file.
        - api_key: Azure key (only used by the "Azure" backend).
        - endpoint: Azure endpoint (only used by the "Azure" backend).

    Returns:
        - A list of (text, (x1, y1, x2, y2)) tuples.

    Raises:
        - ValueError: ocr_model is not a known backend name.
    """
    if ocr_model == "Azure":
        return azure_ocr(image_path, api_key, endpoint)
    if ocr_model == "EasyOCR":
        return easy_ocr_detection(image_path)
    if ocr_model == "Pytesseract":
        return pytesseract_ocr_detection(image_path)
    # Previously an unknown name silently returned None.
    raise ValueError(
        f"Unknown OCR model {ocr_model!r}; "
        "expected 'Azure', 'EasyOCR' or 'Pytesseract'."
    )


def azure_ocr(image_path, api_key, endpoint):
    """Detect text lines with the Azure Image Analysis READ feature.

    Args:
        - image_path: path to the image file.
        - api_key: Azure subscription key.
        - endpoint: Azure endpoint URL.

    Returns:
        - A list of (text, (x1, y1, x2, y2)) tuples, one per detected line.
          Empty if the service returns no text.

    Raises:
        - ValueError: api_key or endpoint is missing, or JPEG encoding failed.
        - ImportError: the Azure SDK or OpenCV is not installed.
        - FileNotFoundError: the image could not be read.
    """
    if not api_key or not endpoint:
        raise ValueError("Azure OCR requires both 'api_key' and 'endpoint'.")
    _require(ImageAnalysisClient, "azure-ai-vision-imageanalysis")

    image = _read_image(image_path)
    encoded_ok, buffer = cv2.imencode(".jpg", image)
    if not encoded_ok:
        raise ValueError(f"Could not encode image as JPEG: {image_path!r}")

    client = ImageAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key),
    )
    result = client.analyze(
        image_data=buffer.tobytes(),
        visual_features=[VisualFeatures.READ],
    )

    detected_text = []
    if result.read is None:
        return detected_text

    # Walk every block: indexing blocks[0] fails when no text is found and
    # would drop any text reported in later blocks.
    for block in result.read.blocks:
        for line in block.lines:
            x_coords = [point['x'] for point in line.bounding_polygon]
            y_coords = [point['y'] for point in line.bounding_polygon]
            rect_bbox = (
                min(x_coords), min(y_coords), max(x_coords), max(y_coords)
            )
            detected_text.append((line.text, rect_bbox))
    return detected_text


@functools.lru_cache(maxsize=4)
def _get_easyocr_reader(languages):
    """Return a cached EasyOCR reader for a tuple of language codes."""
    _require(easyocr, "easyocr")
    return easyocr.Reader(list(languages))


def easy_ocr_detection(image_path, width_ths=0.6, languages=("no",)):
    """Detect text with EasyOCR.

    Args:
        - image_path: path to the image file.
        - width_ths: maximum horizontal distance to merge boxes, passed to
          EasyOCR's readtext.
        - languages: EasyOCR language codes.

    Returns:
        - A list of (text, (x1, y1, x2, y2)) tuples with integer coordinates.
    """
    image = _read_image(image_path)
    # Reuse the reader between calls instead of rebuilding it per image.
    reader = _get_easyocr_reader(tuple(languages))

    detected_text = []
    for bbox, text, _prob in reader.readtext(image, width_ths=width_ths):
        # bbox: [[x1,y1],[x2,y2], [x3,y3], [x4,y4]]
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]
        # Cast to int so the box has the same shape as the other backends;
        # the original computed this and then overwrote it with raw values.
        rect_bbox = (
            int(min(x_coords)),
            int(min(y_coords)),
            int(max(x_coords)),
            int(max(y_coords)),
        )
        detected_text.append((text, rect_bbox))
    return detected_text


def pytesseract_ocr_detection(image_path):
    """Detect words with Tesseract on an Otsu-binarised image.

    Args:
        - image_path: path to the image file.

    Returns:
        - A list of (text, (x1, y1, x2, y2)) tuples, one per non-empty entry
          reported by Tesseract.
    """
    _require(pytesseract, "pytesseract")
    image = _read_image(image_path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # With THRESH_OTSU the threshold is computed from the image, so the 180
    # passed here is not the value actually applied.
    _, thresh = cv2.threshold(
        gray, 180, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU
    )
    data = pytesseract.image_to_data(
        thresh, output_type=pytesseract.Output.DICT
    )

    detected_text = []
    fields = zip(
        data['text'], data['left'], data['top'], data['width'], data['height']
    )
    for raw_text, x, y, w, h in fields:
        text = raw_text.strip()
        if text:
            detected_text.append((text, (x, y, x + w, y + h)))
    return detected_text


def plot_text_bboxes(image_path, detected_text):
    """Placeholder for drawing OCR boxes on an image.

    Currently only loads the image; nothing is drawn and None is returned.
    """
    _require(cv2, "opencv-python")
    # TODO: draw detected_text boxes on img and decide how to return them.
    img = cv2.imread(image_path)


# ------------ REGEX POST-PROCESSING OF TEXT ----------------------------------

def ocr_to_pandas(detected_text):
    """Store OCR results in a pandas DataFrame.

    Args:
        - detected_text: a list of tuples with OCR text and bounding box.
          Ex.: [("sov", (x1,y1,x2,y2))]

    Returns:
        - DataFrame with columns "text" and "box".  The columns are present
          even when detected_text is empty.
    """
    list_of_dicts = [{'text': text, 'box': box} for text, box in detected_text]
    # Fixing the columns keeps df["text"] valid for an empty OCR result.
    return pd.DataFrame(list_of_dicts, columns=['text', 'box'])


def regex_from_list(df, text_list, ignore_case=True):
    """Keep rows whose text starts with any of the given patterns.

    Args:
        - df: DataFrame containing the column "text" from OCR.
        - text_list: strings to match, joined as regex alternatives.
          Ex: ["sov", "stue", "kjøkken"]
        - ignore_case: bool. Accept both lower and upper case.

    Returns:
        - The filtered DataFrame.  Empty if text_list has no non-empty entry.
    """
    # An empty alternative matches every string, so drop empty entries.
    alternatives = [text for text in text_list if text]
    if not alternatives:
        return df.iloc[0:0]

    # Group the alternation so anchoring applies to every alternative.
    pattern = "(?:" + "|".join(alternatives) + ")"
    # na=False: rows without text are excluded instead of breaking the mask.
    mask = df["text"].str.match(pattern, case=not ignore_case, na=False)
    return df[mask]


def regex_from_pandas(df, pattern):
    """Keep rows whose lower-cased text matches pattern from the start.

    Args:
        - df: DataFrame containing the column "text" from OCR.
        - pattern: regex applied to the lower-cased text.

    Returns:
        - The filtered DataFrame.
    """
    text_column = df["text"].str.lower()
    mask = text_column.str.match(pattern, na=False)
    return df[mask]


def drop_duplicate_boxes(df, box_col="box"):
    """Drop rows whose bounding box duplicates an earlier row.

    Args:
        - df: DataFrame with a bounding-box column, or None.
        - box_col: name of the bounding-box column.

    Returns:
        - A new DataFrame with the first row of each box kept and the index
          reset.  A copy of df if it is empty or lacks box_col.  None if df
          is None.
    """
    if df is None:
        # The original called df.copy() here and raised AttributeError.
        return None
    if df.empty or box_col not in df.columns:
        return df.copy()

    # Boxes may be lists, which are unhashable; compare them as tuples.
    box_keys = df[box_col].map(tuple)
    keep = ~box_keys.duplicated(keep="first")
    return df.loc[keep].reset_index(drop=True)


# -------- NOTE: legacy helpers below - candidates for removal --------

def _load_txt_files(file_path):
    """Read a UTF-8 text file into a list of stripped, non-empty lines."""
    with open(file_path, "r", encoding="utf-8") as f:
        stripped = (line.strip() for line in f)
        return [line for line in stripped if line]


def _find_matches(target_text, ocr_text):
    """Return the (text, box) pairs whose text contains a target word.

    Args:
        - target_text: literal words to look for, matched case-insensitively
          between word boundaries.
        - ocr_text: a list of (text, box) tuples.

    Returns:
        - A list of the matching (text, box) tuples.
    """
    # Longest first so a longer name wins over a shorter one it starts with.
    targets = sorted((t for t in target_text if t), key=len, reverse=True)
    if not targets:
        return []

    # Targets are literal names, so escape regex metacharacters.
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(t) for t in targets) + r')\b',
        re.IGNORECASE,
    )
    # Keep each pair intact; extend() flattened text and box into one list.
    return [(text, box) for text, box in ocr_text if pattern.search(text)]


def get_rooms_text(ocr_results, file_path):
    """Filter OCR results to those naming a room listed in a text file.

    Args:
        - ocr_results: a list of (text, box) tuples.
        - file_path: path to the room list, one name per line, resolved
          relative to this module's directory.

    Returns:
        - A list of the matching (text, box) tuples.
    """
    text_path = os.path.join(os.path.dirname(__file__), file_path)
    valid_rooms = _load_txt_files(text_path)
    return _find_matches(valid_rooms, ocr_results)


def get_byggarealer(byggareal_text, arealer_text):
    """Not implemented yet; currently returns None."""
    pass