Daniel Castrillon committed on
Commit
b321778
·
1 Parent(s): 66a2e50

added ocr processing

Browse files
Files changed (3) hide show
  1. Dockerfile +25 -2
  2. app.py +26 -5
  3. requirements.txt +78 -2
Dockerfile CHANGED
@@ -1,11 +1,34 @@
 
1
  FROM python:3.9
 
 
2
  RUN useradd -m -u 1000 user
3
  USER user
4
  ENV HOME=/home/user \
5
  PATH=/home/user/.local/bin:$PATH
 
 
6
  WORKDIR $HOME/app
 
 
7
  COPY --chown=user . .
8
- COPY ./requirements.txt ~/app/requirements.txt
 
 
 
 
 
 
 
 
 
 
 
 
9
  RUN pip install -r requirements.txt
10
- RUN mkdir -p files && chown -R user:user files
 
 
 
 
11
  CMD ["chainlit", "run", "app.py", "--port", "7860"]
 
# Use a Python 3.9 base image
FROM python:3.9

# Install the system binaries the OCR step shells out to: Tesseract (used by
# pytesseract) and Poppler (used by pdf2image). This runs while the build is
# still root, and the apt package lists are removed in the same layer so they
# do not end up in the image.
RUN apt-get update \
    && apt-get install -y tesseract-ocr poppler-utils \
    && rm -rf /var/lib/apt/lists/*

# Create an unprivileged user and set environment variables
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
    PATH=/home/user/.local/bin:$PATH

# Set the working directory
WORKDIR $HOME/app

# Install Python dependencies before copying the application code, so this
# layer is reused from cache until requirements.txt itself changes.
COPY --chown=user ./requirements.txt requirements.txt
RUN pip install --no-cache-dir -r requirements.txt

# Copy the application code
COPY --chown=user . .

# Create the directory the app writes PDFs into. It is created by `user`,
# so no separate chown is required.
RUN mkdir -p $HOME/app/files

# Specify the command to run the application
CMD ["chainlit", "run", "app.py", "--port", "7860"]
app.py CHANGED
@@ -3,6 +3,11 @@ import chainlit as cl
3
  from io import BytesIO
4
  import re
5
  import os
 
 
 
 
 
6
 
7
  def save_pdf_file(file_name, page):
8
  pdf_writer = PyPDF2.PdfWriter()
@@ -52,12 +57,28 @@ def process_dhl_text(pdf_text):
52
  return file_name, file_path
53
 
54
  def process_ups_pdf(file_name, page):
55
- # page.cropbox.upper_left = (0,0)
56
- # page.cropbox.lower_right = (500,400)
57
  page.rotate(90)
58
- pdf_text = page.extract_text()
59
- print(pdf_text)
60
  save_pdf_file(file_name, page)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  def process_coppel_pdf(file_name, page):
63
  page.cropbox.upper_left = (0,150)
@@ -111,7 +132,7 @@ def process_pdf_file(file):
111
  process_coppel_pdf(file_name, page)
112
  else:
113
  transport_company = "ups"
114
- process_ups_pdf(file_name, page)
115
 
116
  pdf_stream.close()
117
  return file_name, file_path, transport_company
 
3
  from io import BytesIO
4
  import re
5
  import os
6
+ from pdf2image import convert_from_path
7
+ from PIL import Image
8
+ import pytesseract
9
+
10
+
11
 
12
  def save_pdf_file(file_name, page):
13
  pdf_writer = PyPDF2.PdfWriter()
 
57
  return file_name, file_path
58
 
def process_ups_pdf(file_name, page):
    """Rotate and save a UPS label page, then name it after its tracking number.

    The page is rotated by 90 degrees and handed to ``save_pdf_file``. The
    saved PDF is then rasterised and run through Tesseract OCR to look for a
    ``TRACKING #:`` field. When a tracking number is found, the returned file
    name becomes ``<tracking number>.pdf``.

    Args:
        file_name: Name under which the page is saved.
        page: Page object exposing ``rotate()`` (presumably a PyPDF2 page;
            confirm against the caller).

    Returns:
        A ``(file_name, file_path)`` tuple. ``file_path`` is always the
        location the page was saved to (``files/<original file_name>``).
        ``file_name`` is the tracking-number-based name when OCR succeeds,
        otherwise the name that was passed in.
    """
    page.rotate(90)
    save_pdf_file(file_name, page)
    # Assumes save_pdf_file writes to files/<file_name>; verify against it.
    file_path = f"files/{file_name}"

    # Only the first page is inspected, so render just that page.
    images = convert_from_path(file_path, first_page=1, last_page=1)
    if not images:
        print("Pattern not found in the text.")
        return file_name, file_path

    # pytesseract accepts a PIL image directly, so no temporary PNG is
    # written to disk; the image is closed even if OCR raises.
    image = images[0]
    try:
        extracted_text = pytesseract.image_to_string(image)
    finally:
        image.close()

    tracking_number = ""
    match = re.search(r'TRACKING #:\s+([A-Z\d\s]+)', extracted_text)
    if match:
        # The capture group may run across line breaks into the following
        # "BILLING" label. Drop every whitespace character (spaces, newlines,
        # tabs, carriage returns) and then the label.
        tracking_number = "".join(match.group(1).split()).replace("BILLING", "")

    if tracking_number:
        # NOTE(review): only the returned name changes; the file on disk
        # stays at file_path under its original name. Confirm the caller
        # expects this.
        file_name = tracking_number + ".pdf"
    else:
        print("Pattern not found in the text.")

    return file_name, file_path
81
+
82
 
83
  def process_coppel_pdf(file_name, page):
84
  page.cropbox.upper_left = (0,150)
 
132
  process_coppel_pdf(file_name, page)
133
  else:
134
  transport_company = "ups"
135
+ file_name, file_path = process_ups_pdf(file_name, page)
136
 
137
  pdf_stream.close()
138
  return file_name, file_path, transport_company
requirements.txt CHANGED
@@ -1,2 +1,78 @@
1
- chainlit
2
- PyPDF2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ aiofiles==23.2.1
2
+ aiohttp==3.8.6
3
+ aiosignal==1.3.1
4
+ annotated-types==0.6.0
5
+ anyio==3.7.1
6
+ async-timeout==4.0.3
7
+ asyncer==0.0.2
8
+ attrs==23.1.0
9
+ backoff==2.2.1
10
+ bidict==0.22.1
11
+ certifi==2023.7.22
12
+ chainlit==0.7.301
13
+ charset-normalizer==3.3.1
14
+ click==8.1.7
15
+ dataclasses-json==0.5.14
16
+ Deprecated==1.2.14
17
+ exceptiongroup==1.1.3
18
+ fastapi==0.100.1
19
+ fastapi-socketio==0.0.10
20
+ filetype==1.2.0
21
+ frozenlist==1.4.0
22
+ googleapis-common-protos==1.61.0
23
+ grpcio==1.59.0
24
+ h11==0.14.0
25
+ httpcore==0.18.0
26
+ httpx==0.25.0
27
+ idna==3.4
28
+ importlib-metadata==6.8.0
29
+ Jinja2==3.1.2
30
+ Lazify==0.4.0
31
+ MarkupSafe==2.1.3
32
+ marshmallow==3.20.1
33
+ multidict==6.0.4
34
+ mypy-extensions==1.0.0
35
+ nest-asyncio==1.5.8
36
+ nodeenv==1.8.0
37
+ opentelemetry-api==1.20.0
38
+ opentelemetry-exporter-otlp==1.20.0
39
+ opentelemetry-exporter-otlp-proto-common==1.20.0
40
+ opentelemetry-exporter-otlp-proto-grpc==1.20.0
41
+ opentelemetry-exporter-otlp-proto-http==1.20.0
42
+ opentelemetry-instrumentation==0.41b0
43
+ opentelemetry-proto==1.20.0
44
+ opentelemetry-sdk==1.20.0
45
+ opentelemetry-semantic-conventions==0.41b0
46
+ packaging==23.2
47
+ pdf2image==1.16.3
48
+ Pillow==10.1.0
49
+ prisma==0.10.0
50
+ protobuf==4.24.4
51
+ pydantic==2.4.2
52
+ pydantic_core==2.10.1
53
+ PyJWT==2.8.0
54
+ PyPDF2==3.0.1
55
+ pytesseract==0.3.10
56
+ python-dotenv==1.0.0
57
+ python-engineio==4.8.0
58
+ python-graphql-client==0.4.3
59
+ python-multipart==0.0.6
60
+ python-socketio==5.10.0
61
+ requests==2.31.0
62
+ simple-websocket==1.0.0
63
+ sniffio==1.3.0
64
+ starlette==0.27.0
65
+ syncer==2.0.3
66
+ tomli==2.0.1
67
+ tomlkit==0.12.1
68
+ typing-inspect==0.9.0
69
+ typing_extensions==4.8.0
70
+ uptrace==1.20.2
71
+ urllib3==2.0.7
72
+ uvicorn==0.23.2
73
+ watchfiles==0.20.0
74
+ websockets==12.0
75
+ wrapt==1.15.0
76
+ wsproto==1.2.0
77
+ yarl==1.9.2
78
+ zipp==3.17.0