| import re |
| from docx import Document |
| from helpers import get_doc_blocks |
|
|
def get_ikz_pdf(pdf_blocks):
    """Collect the long digit runs (IKZ candidates) found in PDF text blocks.

    Every non-overlapping run of 32 to 40 consecutive digits in each block
    is collected. Matching is greedy, so a run longer than 40 digits is cut
    into a 40-digit match followed by whatever further matches the remaining
    digits still allow.

    Args:
        pdf_blocks: Iterable of strings to scan.

    Returns:
        A set of the matched digit strings; empty when nothing matches.
    """
    # Raw string: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    ikz_pattern = re.compile(r"\d{32,40}")

    ikz_pdf = set()
    for block in pdf_blocks:
        ikz_pdf.update(ikz_pattern.findall(block))
    return ikz_pdf
|
|
|
|
def get_ikz_doc(doc):
    """Collect IKZ candidates from a document's text blocks and its tables.

    Two forms are recognised: a run of exactly 36 digits taken from a digit
    sequence, and a dash-separated form made of two digits followed by five
    "-<3 to 20 digits>" groups. Both patterns are applied to every text
    block returned by ``get_doc_blocks(doc)`` and to the text of every
    paragraph in every cell of ``doc.tables``.

    Args:
        doc: Document object exposing ``tables`` (each with ``rows`` ->
            ``cells`` -> ``paragraphs`` -> ``text``) and accepted by
            ``get_doc_blocks``. Presumably a python-docx ``Document`` --
            confirm against the caller.

    Returns:
        A set of the matched strings; empty when nothing matches.
    """
    # Raw strings: "\d" in a plain literal is an invalid escape sequence and
    # triggers a SyntaxWarning on recent Python versions.
    ikz_patterns = [
        re.compile(r"\d{36}"),
        re.compile(r"(?:\d{2})(?:-\d{3,20}){5}"),
    ]
    ikz_docx = set()

    def collect(text):
        # Neither pattern has a capturing group, so findall yields the
        # full matched strings.
        for pattern in ikz_patterns:
            ikz_docx.update(pattern.findall(text))

    # NOTE(review): assumes get_doc_blocks yields plain strings -- confirm.
    for docpara in get_doc_blocks(doc):
        collect(docpara)

    for table in doc.tables:
        for row in table.rows:
            for cell in row.cells:
                for para in cell.paragraphs:
                    collect(para.text)

    return ikz_docx
|
|