Spaces:
Sleeping
Sleeping
| import re | |
# Matches lines that start with "Rec", "Ree", "Reo" or "Rea" (case-sensitive).
# The "ceived" alternative never changes the outcome because "c" is tried
# first; the pattern is kept exactly as originally written.
# NOTE(review): the e/o/a variants presumably cover OCR misreads of "Rec'd" /
# "Received" -- confirm against real scans.
_RECEIVED_PATTERN = re.compile(r'Re(?:c|ceived|e|o|a)')

# Matches lines that start with "BOOK" or "B00K" (zeros instead of letter O),
# in any letter case.
_BOOK_PATTERN = re.compile(r'B(?:OOK|00K)', re.IGNORECASE)


def locate(ocr_text):
    """Scan OCR output line by line for page numbers, dates and book numbers.

    Each line is stripped of surrounding whitespace before it is tested, so
    CRLF line endings, trailing spaces and indentation do not hide a match.
    The collected strings are the stripped lines.

    Args:
        ocr_text: String produced by the OCR; lines are separated by "\n".

    Returns:
        A tuple of three lists of strings:
        (1) possible page numbers -- lines consisting only of digits (may
            include false positives),
        (2) possible dates -- whole lines that start with a "Received"-like
            prefix,
        (3) possible book numbers -- whole lines that start with "BOOK" or
            "B00K" (case-insensitive).
        A list that would otherwise be empty contains the single string
        "Null" instead.
    """
    possible_pages = []
    possible_dates = []
    possible_book = []

    for raw_line in ocr_text.split("\n"):
        # Without stripping, "12\r" or " Received ..." would be missed:
        # str.isdigit() rejects whitespace and re.match() is anchored at
        # the first character.
        line = raw_line.strip()

        # checks for possible page numbers
        if line.isdigit():
            possible_pages.append(line)

        # checks for rec'd dates; the entire line is kept for human judgement
        # as OCR fails to correctly translate years in a few cases
        if _RECEIVED_PATTERN.match(line):
            possible_dates.append(line)

        # checks for book numbers
        if _BOOK_PATTERN.match(line):
            possible_book.append(line)

    # Callers receive a "Null" placeholder rather than an empty list.
    for candidates in (possible_pages, possible_dates, possible_book):
        if not candidates:
            candidates.append("Null")

    return possible_pages, possible_dates, possible_book