cools committed on
Commit
db9a867
·
1 Parent(s): d1048ed

Delete TextProcessor.py

Browse files
Files changed (1) hide show
  1. TextProcessor.py +0 -101
TextProcessor.py DELETED
@@ -1,101 +0,0 @@
1
- # This file converts the images into text
2
- import pandas as pd
3
- import numpy as np
4
- import fitz
5
- import re
6
- import cv2
7
-
8
-
9
# Patterns used by the paragraph-break heuristics. They are applied to
# already-stripped line text.
_ENDS_WITH_PERIOD_RE = re.compile(r'[:.]([^A-Za-z]{0,2})$')  # Include colon?
_SECTION_HEADER_RE = re.compile(r'^([ABCDEIVX]+)$')
_ASTERISK_RE = re.compile(r'^([\s*]+)$')
_DATE_RE = re.compile(r'(\[[A-Za-z\s0-9]*,\s[0-9]*]+)$')

# Horizontal offsets from the page's left-most line start, in the same units
# as the x coordinates stored in data.csv.
_TAB_OFFSET = 7
_SUPERTAB_OFFSET = 14

_DISPOSITION_KEYWORDS = ("affirm", "reverse", "vacate", "so ordered")


def _parse_page_lines(cell):
    """Parse one 'Lines' cell of data.csv into a list of line records.

    The cell is expected to hold the text of a Python literal (a list of
    ``(x1, y1, x2, y2, text)`` records). ``ast.literal_eval`` is used rather
    than ``eval`` so that file contents can never execute code. A missing
    cell (``None``) is treated as a page without lines.
    """
    import ast

    if cell is None:
        return []
    return ast.literal_eval(cell)


def paragraphs(folderpath):
    """Group the text lines listed in ``<folderpath>/data.csv`` into paragraphs.

    ``data.csv`` must have a 'Pg Ind' column and a 'Lines' column; pages are
    looked up by the positions ``0 .. number_of_rows - 1`` in 'Pg Ind'. Each
    line record is indexed as ``(x1, y1, x2, y2, text)``.

    A new paragraph is started whenever one of the layout/text heuristics
    below fires (section header, indented line after a sentence end, start or
    end of a block quote, disposition line).

    Args:
        folderpath: Directory containing ``data.csv``.

    Returns:
        A ``pandas.DataFrame`` with a single 'Lines' column. Each cell is a
        list of ``(page_index, line_index, is_inblock, text)`` tuples making
        up one paragraph; cells may be empty lists.

    Raises:
        IndexError: If 'Pg Ind' has no row for one of the expected positions.
        ValueError, SyntaxError: If a 'Lines' cell is not a valid literal.
    """
    df = pd.read_csv(folderpath + '/data.csv').replace({np.nan: None})
    page_count = len(df['Pg Ind'].tolist())

    # Flatten every page's lines into parallel lists, and remember each
    # page's left-most line start (baseline) and right-most line end.
    x1s, line_texts, line_inds, pg_inds = [], [], [], []
    baselines, rights = {}, {}
    for i in range(page_count):
        lines = _parse_page_lines(df[df['Pg Ind'] == i]['Lines'].tolist()[0])
        if not lines:
            # A page without lines contributes nothing; min()/max() on an
            # empty sequence would raise.
            continue
        for j, line in enumerate(lines):
            x1s.append(line[0])
            line_texts.append(line[4])
            pg_inds.append(i)
            line_inds.append(j)
        baselines[i] = min(line[0] for line in lines)
        rights[i] = max(line[2] for line in lines)

    paras = []
    para = []  # Initialised up front so an input without lines still works.
    is_inblock = False
    # NOTE(review): the very first line (index 0) is never added to any
    # paragraph; this is retained from the original logic - confirm intended.
    for j in range(1, len(line_texts)):
        line_text = line_texts[j]
        if not line_text:
            continue

        prior_text = line_texts[j - 1].strip()
        current_text = line_text.strip()
        prior_pg, current_pg = pg_inds[j - 1], pg_inds[j]
        prior_offset = x1s[j - 1] - baselines[prior_pg]
        current_offset = x1s[j] - baselines[current_pg]
        prior_median = (baselines[prior_pg] + rights[prior_pg]) / 2
        current_median = (baselines[current_pg] + rights[current_pg]) / 2

        prior_endswith_period = _ENDS_WITH_PERIOD_RE.search(prior_text) is not None
        prior_is_section_header = _SECTION_HEADER_RE.search(prior_text) is not None
        current_is_section_header = _SECTION_HEADER_RE.search(current_text) is not None
        prior_is_asterisk = _ASTERISK_RE.search(prior_text) is not None
        current_is_asterisk = _ASTERISK_RE.search(current_text) is not None
        prior_is_date = _DATE_RE.search(prior_text) is not None

        current_tabbed = current_offset > _TAB_OFFSET
        prior_supertabbed = prior_offset >= _SUPERTAB_OFFSET
        current_supertabbed = current_offset >= _SUPERTAB_OFFSET
        prior_more_left = current_offset - prior_offset > _TAB_OFFSET
        prior_right_margin = x1s[j - 1] > prior_median
        # Each line is compared against the median of its own page.
        current_right_margin = x1s[j] > current_median

        is_section_header = (
            prior_is_section_header
            or current_is_section_header
            or prior_is_asterisk
            or current_is_asterisk
            or prior_is_date
        )
        # Note: Supertabbing oofs stuff
        is_the_classic = (
            prior_endswith_period
            and current_tabbed
            and prior_more_left
            and not prior_supertabbed
        )
        is_start_blockquote = (
            prior_endswith_period
            and current_supertabbed
            and prior_more_left
            and not is_inblock
        )
        is_after_blockquote = (
            prior_endswith_period and not current_supertabbed and is_inblock
        )
        is_after_disposition = prior_right_margin and current_tabbed
        lowered = line_text.lower()
        is_disposition = current_right_margin and any(
            keyword in lowered for keyword in _DISPOSITION_KEYWORDS
        )

        # The block-quote state is updated only after all flags above have
        # been computed from the previous state.
        if is_start_blockquote and not is_section_header:
            is_inblock = True
        # This date hack helps ensure that slip opinion headers do not get caught
        if is_after_blockquote or prior_is_date:
            is_inblock = False

        if (
            is_section_header
            or is_the_classic
            or is_start_blockquote
            or is_after_blockquote
            or is_after_disposition
            or is_disposition
        ):
            paras.append(para)
            para = []
        para.append((pg_inds[j], line_inds[j], is_inblock, line_text))

    paras.append(para)
    return pd.DataFrame({'Lines': paras})
80
-
81
def process_file(folderpath):
    """Write paragraphs.csv and mark each paragraph start on the page images.

    Steps:
      1. Run ``paragraphs(folderpath)`` and save the result to
         ``<folderpath>/paragraphs.csv``.
      2. For the first line of every non-empty paragraph, look up that
         line's box in ``<folderpath>/data.csv``.
      3. Draw a small dot 15 units to the left of the box, vertically
         centred on it, on ``<folderpath>/<page index>-processed.png`` and
         save the image in place.

    Args:
        folderpath: Directory containing ``data.csv`` and the
            ``<page index>-processed.png`` images.

    Raises:
        FileNotFoundError: If a page image that needs marking cannot be read.
    """
    import ast

    paras_df = paragraphs(folderpath)
    paras_df.to_csv(folderpath + '/paragraphs.csv', index=False)
    data_df = pd.read_csv(folderpath + '/data.csv')

    parsed_pages = {}    # page index -> parsed 'Lines' list (parsed once per page)
    centers_by_page = {}  # page index -> circle centres to draw on that page
    for para_lines in paras_df['Lines'].tolist():
        if not para_lines:
            continue
        pg_ind, line_ind, _, _ = para_lines[0]
        if pg_ind not in parsed_pages:
            page_df = data_df[data_df['Pg Ind'] == pg_ind]
            # literal_eval instead of eval: file contents must never run code.
            parsed_pages[pg_ind] = ast.literal_eval(page_df['Lines'].tolist()[0])
        x1, y1, x2, y2, _ = parsed_pages[pg_ind][line_ind]
        # OpenCV needs integer pixel coordinates for the centre.
        center = (int(x1 - 15), int(0.5 * (y1 + y2)))
        centers_by_page.setdefault(pg_ind, []).append(center)

    # Read and write each page image once, however many paragraphs start on it.
    for pg_ind, centers in centers_by_page.items():
        image_path = folderpath + '/' + str(pg_ind) + '-processed.png'
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals an unreadable file by returning None.
            raise FileNotFoundError('Could not read page image: ' + image_path)
        for center in centers:
            cv2.circle(image, center, radius=1, color=(240, 32, 160), thickness=2)
        cv2.imwrite(image_path, image)