Daniel Castrillon committed on
Commit
66130ae
·
1 Parent(s): f707000

updated logic to crop pdfs

Browse files
Files changed (2) hide show
  1. .gitignore +2 -0
  2. app.py +46 -28
.gitignore CHANGED
@@ -159,3 +159,5 @@ cython_debug/
159
  # and can be added to the global gitignore or merged into this file. For a more nuclear
160
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
  #.idea/
 
 
 
159
  # and can be added to the global gitignore or merged into this file. For a more nuclear
160
  # option (not recommended) you can uncomment the following to ignore the entire idea folder.
161
  #.idea/
162
+
163
+ .DS_Store
app.py CHANGED
@@ -12,7 +12,7 @@ import pytesseract
12
  def save_pdf_file(file_name, page):
13
  pdf_writer = PyPDF2.PdfWriter()
14
  pdf_writer.add_page(page)
15
-
16
  #return pdf stream
17
  with open(f"files/{file_name}", "wb") as out:
18
  pdf_writer.write(out)
@@ -24,12 +24,13 @@ def process_estafeta_pdf(file_name, page):
24
  y = page['/MediaBox'][3]
25
 
26
  if x > y:
27
- page.cropbox.upper_left = (40, 60)
28
- page.cropbox.lower_right = (340,520)
29
  else:
30
- page.cropbox.upper_left = (30,40)
31
- page.cropbox.lower_right = (490,340)
32
 
 
33
  save_pdf_file(file_name, page)
34
 
35
  def process_estafeta_text(pdf_text):
@@ -37,21 +38,17 @@ def process_estafeta_text(pdf_text):
37
  match = re.search(r'CONFIRMACION (\d+-\d+\w+)', pdf_text)
38
  if match:
39
  extracted_text = match.group(1)
40
- index = -1
41
- for i, char in enumerate(reversed(extracted_text)):
42
- if char.isalpha():
43
- index = len(extracted_text) - i
44
- break
45
-
46
- extracted_text = extracted_text[0:index].replace("-", "")
47
  file_name = extracted_text + ".pdf"
48
  file_path = f"files/{file_name}"
49
  return file_name, file_path
50
 
51
-
52
  def process_dhl_pdf(file_name, page):
53
- page.cropbox.upper_left = (92,20)
54
- page.cropbox.lower_right = (360,560)
55
  save_pdf_file(file_name, page)
56
 
57
  def process_dhl_text(pdf_text):
@@ -63,7 +60,10 @@ def process_dhl_text(pdf_text):
63
  file_path = f"files/{file_name}"
64
  return file_name, file_path
65
 
66
- def process_ups_pdf(file_name, page):
 
 
 
67
  page.rotate(90)
68
  save_pdf_file(file_name, page)
69
  file_path = f"files/{file_name}"
@@ -71,25 +71,35 @@ def process_ups_pdf(file_name, page):
71
  images = convert_from_path(file_path)
72
  image = images[0]
73
  image.save(f"{file_path}.png", "PNG")
 
 
74
  loaded_image = Image.open(f"{file_path}.png")
75
  extracted_text = pytesseract.image_to_string(loaded_image)
 
76
  image.close()
77
- loaded_image.close()
78
  os.remove(f"{file_path}.png")
79
- match = re.search(r'TRACKING #:\s+([A-Z\d\s]+)', extracted_text)
80
- if match:
81
- extracted_text = match.group(1)
82
- extracted_text = extracted_text.replace(" ", "").replace("BILLING", "").replace("\n", "")
83
- file_name = extracted_text + ".pdf"
84
  else:
85
- print("Pattern not found in the text.")
 
 
 
 
 
 
 
 
 
 
86
 
87
  return file_name, file_path
88
 
89
-
90
  def process_coppel_pdf(file_name, page):
91
- page.cropbox.upper_left = (0,150)
92
- page.cropbox.lower_right = (400,520)
93
  save_pdf_file(file_name, page)
94
 
95
  def process_coppel_text(pdf_text):
@@ -103,6 +113,15 @@ def process_coppel_text(pdf_text):
103
  file_path = f"files/{file_name}"
104
  return file_name, file_path
105
 
 
 
 
 
 
 
 
 
 
106
  def process_pdf_file(file):
107
  """
108
  This function processes the PDF file and returns the file name, file path and transport company
@@ -115,7 +134,6 @@ def process_pdf_file(file):
115
  file_path (str): The path of the file
116
  transport_company (str): The transport company
117
  """
118
-
119
  pdf_stream = BytesIO(file.content)
120
  pdf = PyPDF2.PdfReader(pdf_stream)
121
  page = pdf.pages[0]
@@ -139,7 +157,7 @@ def process_pdf_file(file):
139
  process_coppel_pdf(file_name, page)
140
  else:
141
  transport_company = "ups"
142
- file_name, file_path = process_ups_pdf(file_name, page)
143
 
144
  pdf_stream.close()
145
  return file_name, file_path, transport_company
 
12
  def save_pdf_file(file_name, page):
13
  pdf_writer = PyPDF2.PdfWriter()
14
  pdf_writer.add_page(page)
15
+
16
  #return pdf stream
17
  with open(f"files/{file_name}", "wb") as out:
18
  pdf_writer.write(out)
 
24
  y = page['/MediaBox'][3]
25
 
26
  if x > y:
27
+ page.mediabox.lower_left = (40, 60)
28
+ page.mediabox.upper_right = (340,520)
29
  else:
30
+ page.mediabox.lower_left = (30,40)
31
+ page.mediabox.upper_right = (490,340)
32
 
33
+
34
  save_pdf_file(file_name, page)
35
 
36
  def process_estafeta_text(pdf_text):
 
38
  match = re.search(r'CONFIRMACION (\d+-\d+\w+)', pdf_text)
39
  if match:
40
  extracted_text = match.group(1)
41
+ extracted_text = extracted_text[0:23].replace("-", "")
42
+ else:
43
+ raise Exception("Pattern not found in the text.")
44
+
 
 
 
45
  file_name = extracted_text + ".pdf"
46
  file_path = f"files/{file_name}"
47
  return file_name, file_path
48
 
 
49
  def process_dhl_pdf(file_name, page):
50
+ page.mediabox.lower_left = (92,20)
51
+ page.mediabox.upper_right = (360,560)
52
  save_pdf_file(file_name, page)
53
 
54
  def process_dhl_text(pdf_text):
 
60
  file_path = f"files/{file_name}"
61
  return file_name, file_path
62
 
63
+ def process_ups_pdf(page):
64
+ page.rotate(90)
65
+
66
+ def process_pdf_from_image(file_name, page):
67
  page.rotate(90)
68
  save_pdf_file(file_name, page)
69
  file_path = f"files/{file_name}"
 
71
  images = convert_from_path(file_path)
72
  image = images[0]
73
  image.save(f"{file_path}.png", "PNG")
74
+
75
+ # open the image and extract the text
76
  loaded_image = Image.open(f"{file_path}.png")
77
  extracted_text = pytesseract.image_to_string(loaded_image)
78
+
79
  image.close()
 
80
  os.remove(f"{file_path}.png")
81
+
82
+ if re.search("Fed2x", extracted_text, re.IGNORECASE):
83
+ file_name, file_path = process_fedex_text(extracted_text)
84
+ process_fedex_pdf(file_name, page)
 
85
  else:
86
+ image_path = f"{file_path}-resized.pdf"
87
+ loaded_image.resize((400, 500)).save(image_path, "PDF")
88
+ loaded_image.close()
89
+ match = re.search(r'TRACKING #:\s+([A-Z\d\s]+)', extracted_text)
90
+ file_path = image_path
91
+ if match:
92
+ extracted_text = match.group(1)
93
+ extracted_text = extracted_text.replace(" ", "").replace("BILLING", "").replace("\n", "")
94
+ file_name = extracted_text + ".pdf"
95
+ else:
96
+ print("Pattern not found in the text.")
97
 
98
  return file_name, file_path
99
 
 
100
  def process_coppel_pdf(file_name, page):
101
+ page.mediabox.lower_left = (0,150)
102
+ page.mediabox.upper_right = (290,520)
103
  save_pdf_file(file_name, page)
104
 
105
  def process_coppel_text(pdf_text):
 
113
  file_path = f"files/{file_name}"
114
  return file_name, file_path
115
 
116
+ def process_fedex_pdf(file_name, page):
117
+ page.rotate(-90)
118
+ page.mediabox.lower_left = (0, 0)
119
+ page.mediabox.upper_right = (500,650)
120
+ save_pdf_file(file_name, page)
121
+
122
+ def process_fedex_text(pdf_text):
123
+ return "fedex.pdf", "files/fedex.pdf"
124
+
125
  def process_pdf_file(file):
126
  """
127
  This function processes the PDF file and returns the file name, file path and transport company
 
134
  file_path (str): The path of the file
135
  transport_company (str): The transport company
136
  """
 
137
  pdf_stream = BytesIO(file.content)
138
  pdf = PyPDF2.PdfReader(pdf_stream)
139
  page = pdf.pages[0]
 
157
  process_coppel_pdf(file_name, page)
158
  else:
159
  transport_company = "ups"
160
+ file_name, file_path = process_pdf_from_image(file_name, page)
161
 
162
  pdf_stream.close()
163
  return file_name, file_path, transport_company