Spaces:
Sleeping
Sleeping
| import os | |
| import time | |
| import easyocr | |
| import cv2 | |
| import io | |
| import re | |
| import pandas as pd | |
| import pytesseract | |
| from azure.ai.vision.imageanalysis import ImageAnalysisClient | |
| from azure.ai.vision.imageanalysis.models import VisualFeatures | |
| from azure.core.credentials import AzureKeyCredential | |
| # ---------------- OCR MODELS ------------------------------ | |
def run_ocr(ocr_model:str, image_path, api_key=None, endpoint=None):
    """Dispatch an image to the OCR backend selected by name.

    Args:
        ocr_model: Backend name; one of "Azure", "EasyOCR" or "Pytesseract"
            (compared case-sensitively).
        image_path: Path of the image to run OCR on.
        api_key: Key forwarded to the Azure backend only.
        endpoint: Endpoint forwarded to the Azure backend only.

    Returns:
        Whatever the selected backend returns, or None when ``ocr_model``
        is not one of the recognised names.
    """
    # Lambdas defer the backend call until the matching entry is chosen.
    backends = {
        "Azure": lambda: azure_ocr(image_path, api_key, endpoint),
        "EasyOCR": lambda: easy_ocr_detection(image_path),
        "Pytesseract": lambda: pytesseract_ocr_detection(image_path),
    }
    selected = backends.get(ocr_model)
    if selected is None:
        return None
    return selected()
def azure_ocr(image_path,api_key, endpoint):
    """Run the Azure AI Vision READ feature on an image.

    Args:
        image_path: Path of the image file; loaded with ``cv2.imread``.
        api_key: Subscription key used to build the ``AzureKeyCredential``.
        endpoint: Endpoint URL of the Azure AI Vision resource.

    Returns:
        A list of ``(text, (x1, y1, x2, y2))`` tuples, one per detected text
        line, where the box is the axis-aligned rectangle enclosing the
        line's bounding polygon. The list is empty when the service returns
        no READ result or no text blocks.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    client = ImageAnalysisClient(
        endpoint=endpoint,
        credential=AzureKeyCredential(api_key)
    )

    # The image is re-encoded as JPEG in memory and sent as a byte stream.
    _, buffer = cv2.imencode('.jpg', image)
    image_data = io.BytesIO(buffer.tobytes())
    result = client.analyze(
        image_data=image_data,
        visual_features=[VisualFeatures.READ]
    )

    detected_text = []
    if result.read is None:
        return detected_text

    # Walk every block instead of indexing blocks[0], which raised
    # IndexError on an empty block list and skipped any further blocks.
    for block in result.read.blocks:
        for line in block.lines:
            x_coords = [point['x'] for point in line.bounding_polygon]
            y_coords = [point['y'] for point in line.bounding_polygon]
            rect_bbox = (min(x_coords), min(y_coords), max(x_coords), max(y_coords))
            detected_text.append((line.text, rect_bbox))
    return detected_text
def easy_ocr_detection(image_path):
    """Run EasyOCR on an image and return text with rectangular boxes.

    The reader is created with the language list ``['no']`` and
    ``readtext`` is called with ``width_ths=0.6``.

    width_ths (float, default = 0.5) - Maximum horizontal distance to merge boxes.

    Args:
        image_path: Path of the image file; loaded with ``cv2.imread``.

    Returns:
        A list of ``(text, (x1, y1, x2, y2))`` tuples with integer
        coordinates, where the box is the axis-aligned rectangle enclosing
        the four-point polygon reported for each detection.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    reader = easyocr.Reader(['no'])
    results = reader.readtext(image, width_ths=0.6)

    detected_text = []
    for bbox, text, _prob in results:
        # bbox: [[x1,y1],[x2,y2], [x3,y3], [x4,y4]]
        x_coords = [point[0] for point in bbox]
        y_coords = [point[1] for point in bbox]
        # Keep the int conversion: the original computed it and then
        # overwrote it with the raw values on the very next line.
        rect_bbox = (
            int(min(x_coords)),
            int(min(y_coords)),
            int(max(x_coords)),
            int(max(y_coords)),
        )
        detected_text.append((text, rect_bbox))
    return detected_text
def pytesseract_ocr_detection(image_path):
    """Run Tesseract (via pytesseract) on a binarised copy of an image.

    The image is converted to grayscale and thresholded with
    ``THRESH_BINARY_INV + THRESH_OTSU`` before being passed to
    ``pytesseract.image_to_data``.

    Args:
        image_path: Path of the image file; loaded with ``cv2.imread``.

    Returns:
        A list of ``(text, (x1, y1, x2, y2))`` tuples, one per entry whose
        text is non-empty after stripping whitespace. The box is built from
        the reported left/top/width/height values.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports a missing/unreadable file by returning None.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # NOTE(review): per the OpenCV docs the Otsu flag selects the threshold
    # automatically, so the 180 argument should have no effect - confirm.
    _, thresh = cv2.threshold(gray, 180, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    data = pytesseract.image_to_data(thresh, output_type=pytesseract.Output.DICT)

    detected_text = []
    entries = zip(data['text'], data['left'], data['top'], data['width'], data['height'])
    for raw_text, x, y, w, h in entries:
        text = raw_text.strip()
        if text != "":
            detected_text.append((text, (x, y, x + w, y + h)))
    return detected_text
def plot_text_bboxes(image_path,detected_text):
    """Unfinished placeholder: loads the image and does nothing else.

    Args:
        image_path: Path of the image file; loaded with ``cv2.imread``.
        detected_text: Currently unused.

    Returns:
        None. Nothing is drawn or displayed yet.
    """
    # NOTE(review): the name suggests boxes should be drawn on the image,
    # but no drawing code exists here - implement or remove.
    cv2.imread(image_path)
    return None
| # ------------ REGEX POST-PROCESSING OF TEXT ---------------------------------- | |
def ocr_to_pandas(detected_text):
    """
    Stores results from OCR in Pandas Dataframe

    Args:
    - detected_text: A List with tuples containing OCR text and bounding boxes.
    Ex.: [("sov", (x1,y1,x2,y2))]

    Returns:
    - Pandas Dataframe with columns "text" and "box". The columns are
      present even when detected_text is empty, so later lookups such as
      df["text"] do not fail on an image without any detected text.
    """
    list_of_dicts = [{'text': text, 'box': box} for text, box in detected_text]
    # Naming the columns explicitly keeps the schema stable for empty input.
    df = pd.DataFrame(list_of_dicts, columns=['text', 'box'])
    return df
def regex_from_list(df, text_list, ignore_case = True):
    """
    Use regex to find text in dataframe.

    Args:
    - df: dataframe containing column "text" from OCR
    - text_list: a list with strings we want to match with. Ex: ["sov", "stue", "kjøkken"].
      The entries are joined with "|" and used as a regular expression, so
      regex metacharacters in them keep their special meaning.
    - ignore_case: bool. Accept both lower and upper case

    Returns:
    - The filtered dataframe with the rows whose text matches one of the
      entries at the start of the string. Rows with missing text never
      match. An empty text_list yields an empty dataframe.
    """
    if not text_list:
        # An empty alternation would compile to "" and match every row.
        return df.iloc[0:0].copy()

    flags = re.IGNORECASE if ignore_case else 0
    # na=False keeps the mask strictly boolean when a text cell is missing;
    # otherwise the NaN in the mask makes the row selection raise.
    match = df["text"].str.match("|".join(text_list), flags=flags, na=False)
    df_filtered = df[match]
    return df_filtered
def regex_from_pandas(df, pattern):
    """
    Filter a dataframe by matching a regex against the lower-cased text.

    Args:
    - df: dataframe containing column "text" from OCR
    - pattern: regular expression matched at the start of each text value.
      The text is lower-cased before matching, so the pattern should be
      written in lower case.

    Returns:
    - The filtered dataframe (original, non-lower-cased rows). Rows with
      missing text never match.
    """
    text_column = df["text"].str.lower()
    # na=False keeps the mask strictly boolean when a text cell is missing;
    # otherwise the NaN in the mask makes the row selection raise.
    match = text_column.str.match(pattern, na=False)
    df_filtered = df[match]
    return df_filtered
def drop_duplicate_boxes(df, box_col="box"):
    """
    Remove rows whose bounding box duplicates an earlier row's box.

    Args:
    - df: dataframe with a column of bounding boxes, or None
    - box_col: name of the column holding the boxes

    Returns:
    - None when df is None. A copy of df when it is empty or has no
      box_col column. Otherwise a new dataframe that keeps the first row
      for every distinct box, with the index reset. The input dataframe is
      not modified.
    """
    if df is None:
        # The original guard fell through to df.copy() and raised
        # AttributeError for None.
        return None
    if df.empty or box_col not in df.columns:
        return df.copy()

    out = df.copy()
    # Boxes may be stored as lists, which are unhashable; a tuple key makes
    # them comparable for drop_duplicates.
    out["__box_key"] = out[box_col].apply(lambda bl: tuple(bl))
    out = (
        out
        .drop_duplicates(subset="__box_key", keep="first")
        .reset_index(drop=True)
        .drop(columns="__box_key")
    )
    return out
| # -------- OBS! OLD -> REMOVE? --------- | |
def _load_txt_files(file_path):
    """
    Read a text file into a list of stripped, non-empty lines.

    Args:
    - file_path: path of the text file to read

    Returns:
    - A list with one string per non-blank line, surrounding whitespace
      removed.
    """
    # Read as UTF-8 explicitly so non-ASCII words (e.g. "kjøkken") do not
    # depend on the platform's default encoding.
    with open(file_path, "r", encoding="utf-8") as f:
        # Blank lines are skipped: an empty entry would later become an
        # empty regex alternative that matches any text.
        text = [stripped for stripped in (line.strip() for line in f) if stripped]
    return text
def _find_matches(target_text, ocr_text):
    """
    Keep the OCR results whose text contains one of the target words.

    Args:
    - target_text: list of literal words to look for. Ex: ["sov", "stue"]
    - ocr_text: list of (text, box) tuples from OCR

    Returns:
    - A list of the (text, box) tuples whose text contains a target as a
      whole word, ignoring case. Empty when there are no usable targets.
    """
    # Longest first so a longer word is tried before a shorter one it
    # starts with; empty targets are dropped so they cannot match
    # everything.
    target_sorted = sorted((t for t in target_text if t), key=len, reverse=True)
    if not target_sorted:
        return []

    # Targets are literal words, so escape regex metacharacters.
    pattern = re.compile(
        r'\b(' + '|'.join(re.escape(t) for t in target_sorted) + r')\b',
        re.IGNORECASE,
    )
    # append-style collection keeps each (text, box) pair intact; the
    # original extend() flattened the pairs into [text, box, text, box, ...].
    return [(text, box) for text, box in ocr_text if pattern.search(text)]
def get_rooms_text(ocr_results, file_path):
    """
    Filter OCR results down to those matching the words listed in a file.

    Args:
    - ocr_results: list of (text, box) tuples from OCR
    - file_path: path of the word-list file, relative to this module's
      directory

    Returns:
    - The result of _find_matches for the loaded words and ocr_results
    """
    module_dir = os.path.dirname(__file__)
    room_names = _load_txt_files(os.path.join(module_dir, file_path))
    return _find_matches(room_names, ocr_results)
def get_byggarealer(byggareal_text, arealer_text):
    """Placeholder that is not implemented yet; always returns None.

    Args:
        byggareal_text: Currently unused.
        arealer_text: Currently unused.
    """
    # NOTE(review): no body was ever written for this function - implement
    # or remove.
    return None