| | import pandas as pd |
| | import numpy as np |
| | import fitz |
| | import re |
| | import cv2 |
| |
|
| |
|
def paragraphs(folderpath):
    """Group the text lines listed in ``<folderpath>/data.csv`` into paragraphs.

    ``data.csv`` must have a ``Pg Ind`` column and a ``Lines`` column. Each
    ``Lines`` cell holds the literal text of a list of line records, where a
    record is indexed as ``(x1, y1, x2, y2, text, ...)``. Row ``k`` of the
    output is built from the page whose ``Pg Ind`` equals ``k``.

    A new paragraph is started in front of a line when the layout heuristics
    below fire; they compare the line's left edge and the previous line's
    left edge with the leftmost ``x1`` on their respective pages, and look at
    the previous line's text.

    Args:
        folderpath: Directory that contains ``data.csv``.

    Returns:
        A ``pandas.DataFrame`` with a single ``Lines`` column. Each cell is a
        list of ``(page index, line index within the page, text)`` tuples
        making up one paragraph. The frame is empty when there are no lines.
    """
    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})

    # Compiled once, as raw strings. The letter class is spelled A-Za-z
    # because the range A-z also covers the characters [ \ ] ^ _ and `.
    ends_sentence_re = re.compile(r'[:.][^A-Za-z]{0,2}$')
    section_header_re = re.compile(r'^[ABCDEIVX]+$')
    asterisk_re = re.compile(r'^[\s*]+$')
    date_re = re.compile(r'\[[A-Za-z\s0-9]*,\s[0-9]*]+$')

    # Indentation thresholds, in the same units as the x coordinates.
    tab_indent = 7
    supertab_indent = 18

    x1s, line_texts, line_inds, pg_inds = [], [], [], []
    baselines, rights = {}, {}
    for page_no in range(len(df)):
        # NOTE(security): eval() executes whatever expression the CSV cell
        # contains. Only run this on data.csv files you produced yourself;
        # if the cells are plain literals, ast.literal_eval is the safe
        # drop-in replacement.
        lines = eval(df[df['Pg Ind'] == page_no]['Lines'].tolist()[0])
        for line_no, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(page_no)
            line_inds.append(line_no)
        if lines:
            # Leftmost and rightmost extent of the text on this page.
            baselines[page_no] = min(line[0] for line in lines)
            rights[page_no] = max(line[2] for line in lines)

    paras = []
    para = []
    for j, line_text in enumerate(line_texts):
        # NOTE(review): the original indentation was not recoverable; this
        # assumes lines with empty text are left out of every paragraph.
        if not line_text:
            continue

        # The very first line has no predecessor to compare against, so it
        # simply opens the first paragraph.
        if j > 0:
            prior_text = line_texts[j - 1].strip()
            current_text = line_text.strip()
            prior_page, current_page = pg_inds[j - 1], pg_inds[j]

            prior_indent = x1s[j - 1] - baselines[prior_page]
            current_indent = x1s[j] - baselines[current_page]
            prior_midpoint = (baselines[prior_page] + rights[prior_page]) / 2

            prior_endswith_period = ends_sentence_re.search(prior_text) is not None
            prior_tabbed = prior_indent > tab_indent
            current_tabbed = current_indent > tab_indent
            prior_supertabbed = prior_indent > supertab_indent
            current_supertabbed = current_indent > supertab_indent
            prior_more_left = current_indent - prior_indent > tab_indent
            # Previous line starts to the right of its page's horizontal middle.
            prior_right_margin = x1s[j - 1] > prior_midpoint

            is_section_header = (
                section_header_re.search(prior_text) is not None
                or section_header_re.search(current_text) is not None
                or asterisk_re.search(prior_text) is not None
                or asterisk_re.search(current_text) is not None
                or date_re.search(prior_text) is not None
            )
            is_the_classic = (
                prior_endswith_period and current_tabbed
                and prior_more_left and not prior_supertabbed
            )
            is_start_blockquote = (
                prior_endswith_period and current_supertabbed and prior_more_left
            )
            is_after_blockquote = (
                prior_endswith_period and not current_supertabbed and prior_supertabbed
            )
            is_after_disposition = prior_right_margin and current_tabbed
            is_after_oneline_paragraph = (
                prior_tabbed and current_tabbed
                and not prior_supertabbed and not current_supertabbed
            )

            starts_new_paragraph = (
                is_section_header or is_the_classic or is_after_oneline_paragraph
                or is_start_blockquote or is_after_blockquote or is_after_disposition
            )
            # Never emit an empty paragraph: consumers index its first line.
            if starts_new_paragraph and para:
                paras.append(para)
                para = []

        para.append((pg_inds[j], line_inds[j], line_text))

    if para:
        paras.append(para)
    return pd.DataFrame({'Lines': paras})
| |
|
def process_file(folderpath):
    """Write the paragraph table and mark each paragraph start on the page images.

    Calls ``paragraphs(folderpath)``, saves the result to
    ``<folderpath>/paragraphs.csv``, then draws a small dot 15 units to the
    left of the vertical middle of every paragraph's first line on
    ``<folderpath>/<page index>-processed.png``, overwriting that image.

    Args:
        folderpath: Directory holding ``data.csv`` and the
            ``<page index>-processed.png`` images.

    Raises:
        FileNotFoundError: If a page image that needs a marker cannot be read.
    """
    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=True)
    data_df = pd.read_csv(folderpath + '/data.csv')

    lines_by_page = {}    # page index -> parsed line records (parsed once)
    centers_by_page = {}  # page index -> marker centres to draw on that page
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            # A paragraph without lines has no first line to mark.
            continue
        pg_ind, line_ind, _ = para_lines[0]
        if pg_ind not in lines_by_page:
            page_rows = data_df[data_df['Pg Ind'] == pg_ind]
            # NOTE(security): eval() executes whatever expression the CSV
            # cell contains. Only run this on data.csv files you produced
            # yourself; if the cells are plain literals, ast.literal_eval is
            # the safe drop-in replacement.
            lines_by_page[pg_ind] = eval(page_rows['Lines'].tolist()[0])
        x1, y1, _, y2 = lines_by_page[pg_ind][line_ind][:4]
        # OpenCV needs integer pixel coordinates for the centre point.
        center = (int(x1 - 15), int(0.5 * (y1 + y2)))
        centers_by_page.setdefault(pg_ind, []).append(center)

    # Load and save each page image once, however many markers it receives.
    for pg_ind, centers in centers_by_page.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable or missing file by returning None.
            raise FileNotFoundError('Could not read page image: ' + image_path)
        for center in centers:
            cv2.circle(image, center, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)
| |
|
# Run the pipeline on one hard-coded case folder whenever this module executes.
process_file("PDF Cases/462_122")