Spaces:
Sleeping
Sleeping
File size: 8,997 Bytes
7f2ede9 b321778 ea67231 b321778 7f2ede9 66130ae 7f2ede9 f707000 66130ae f707000 66130ae f707000 66130ae 7f2ede9 66a2e50 66130ae 66a2e50 7f2ede9 66130ae 7f2ede9 66a2e50 66130ae 98d21f2 7f2ede9 b321778 66130ae b321778 66130ae b321778 66130ae b321778 66130ae b321778 7f2ede9 66130ae 7f2ede9 66a2e50 66130ae 7f2ede9 ea67231 7f2ede9 ea67231 7f2ede9 ea67231 7f2ede9 ea67231 7f2ede9 ea67231 7f2ede9 ea67231 7f2ede9 f6719e9 7f2ede9 ea67231 7f2ede9 ea67231 66a2e50 ea67231 7f2ede9 ea67231 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 |
import os
import re
import tempfile
from io import BytesIO

import chainlit as cl
import PyPDF2
import pytesseract
from fpdf import FPDF
from pdf2image import convert_from_path
from PIL import Image
def save_pdf_file(file_name, page):
    """Write a single PDF page to ``files/<file_name>``.

    Parameters:
        file_name (str): Name of the output file inside the ``files`` folder.
        page: The PyPDF2 page object to write.
    """
    # The output folder may not exist yet (e.g. on a fresh deployment);
    # without it the open() below fails with FileNotFoundError.
    os.makedirs("files", exist_ok=True)
    pdf_writer = PyPDF2.PdfWriter()
    try:
        pdf_writer.add_page(page)
        with open(f"files/{file_name}", "wb") as out:
            pdf_writer.write(out)
    finally:
        # Release the writer even when adding or writing the page fails.
        pdf_writer.close()
def process_estafeta_pdf(file_name, page):
    """Crop an Estafeta label page and save it under ``files/``.

    The crop rectangle is chosen by comparing the upper-right x and y
    coordinates of the page's ``/MediaBox``.

    Parameters:
        file_name (str): Name of the output file.
        page: The PyPDF2 page to crop (modified in place).
    """
    media_box = page['/MediaBox']
    upper_x = media_box[2]
    upper_y = media_box[3]
    if upper_x > upper_y:
        lower_left, upper_right = (40, 60), (340, 520)
    else:
        lower_left, upper_right = (30, 40), (490, 340)
    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right
    save_pdf_file(file_name, page)
def process_estafeta_text(pdf_text):
    """Derive the output file name and path from an Estafeta label's text.

    Looks for ``CONFIRMACION <digits>-<digits><word chars>``, keeps the first
    23 characters of the captured code and strips the dashes.

    Parameters:
        pdf_text (str): Text extracted from the label.
    Returns:
        tuple[str, str]: ``(file_name, file_path)`` with the path under ``files/``.
    Raises:
        Exception: If the confirmation pattern is not present in the text.
    """
    found = re.search(r'CONFIRMACION (\d+-\d+\w+)', pdf_text)
    if found is None:
        raise Exception("Pattern not found in the text.")
    tracking_code = found.group(1)[0:23].replace("-", "")
    file_name = f"{tracking_code}.pdf"
    return file_name, f"files/{file_name}"
def process_dhl_pdf(file_name, page):
    """Crop a DHL label page to a fixed rectangle and save it under ``files/``.

    Parameters:
        file_name (str): Name of the output file.
        page: The PyPDF2 page to crop (modified in place).
    """
    lower_left, upper_right = (92, 20), (360, 560)
    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right
    save_pdf_file(file_name, page)
def process_dhl_text(pdf_text):
    """Derive the output file name and path from a DHL label's text.

    Searches for ``WAYBILL`` followed by space-separated digit groups and
    joins the groups into a single number. When the text contains several
    waybills, the last one is used.

    Parameters:
        pdf_text (str): Text extracted from the label.
    Returns:
        tuple[str, str]: ``(file_name, file_path)`` with the path under ``files/``.
    Raises:
        ValueError: If no waybill number is present in the text.
    """
    matches = re.findall(r'WAYBILL (\d+(?: \d+)*)', pdf_text)
    if not matches:
        # Fail with a readable message instead of an UnboundLocalError below.
        raise ValueError("Pattern not found in the text.")
    waybill = matches[-1].replace(" ", "")
    file_name = waybill + ".pdf"
    file_path = f"files/{file_name}"
    return file_name, file_path
def process_ups_pdf(page):
    """Rotate a UPS label page by 90 degrees.

    Parameters:
        page: The page object to rotate; its ``rotate`` method is called.
    """
    quarter_turn = 90
    page.rotate(quarter_turn)
def process_pdf_from_image(file_name, page):
    """Identify an image-only label through OCR and produce its output PDF.

    The page is rotated 90 degrees, saved, rendered to a PNG and run through
    Tesseract. If the OCR text matches ``Fed2x`` the page is handed to the
    FedEx handlers; otherwise the rendered image is resized to 400x500, saved
    as a PDF, and the file name is taken from the ``TRACKING #:`` field when
    it can be read.

    Parameters:
        file_name (str): Name used for the intermediate files and returned
            unchanged when no tracking number is found.
        page: The PyPDF2 page to process (rotated in place).
    Returns:
        tuple[str, str]: ``(file_name, file_path)`` of the generated PDF.
    """
    page.rotate(90)
    save_pdf_file(file_name, page)
    file_path = f"files/{file_name}"
    png_path = f"{file_path}.png"
    image_path = f"{file_path}-resized.pdf"

    rendered_pages = convert_from_path(file_path)
    try:
        try:
            rendered_pages[0].save(png_path, "PNG")
        finally:
            for rendered in rendered_pages:
                rendered.close()

        # The context manager closes the reopened image on every path, and
        # the PNG is only deleted once nothing holds it open any more.
        with Image.open(png_path) as loaded_image:
            extracted_text = pytesseract.image_to_string(loaded_image)
            # NOTE(review): "Fed2x" is presumably how OCR reads the FedEx
            # logo - confirm before changing it to "FedEx".
            is_fedex = re.search("Fed2x", extracted_text, re.IGNORECASE) is not None
            if not is_fedex:
                loaded_image.resize((400, 500)).save(image_path, "PDF")
    finally:
        # Remove the temporary PNG even when rendering or OCR fails.
        if os.path.exists(png_path):
            os.remove(png_path)

    if is_fedex:
        file_name, file_path = process_fedex_text(extracted_text)
        process_fedex_pdf(file_name, page)
        return file_name, file_path

    match = re.search(r'TRACKING #:\s+([A-Z\d\s]+)', extracted_text)
    if match:
        tracking = match.group(1)
        tracking = tracking.replace(" ", "").replace("BILLING", "").replace("\n", "")
        file_name = tracking + ".pdf"
    else:
        # Keep the caller-supplied name when the tracking number is unreadable.
        print("Pattern not found in the text.")
    return file_name, image_path
def process_coppel_pdf(file_name, page):
    """Crop a Coppel label page to a fixed rectangle and save it under ``files/``.

    Parameters:
        file_name (str): Name of the output file.
        page: The PyPDF2 page to crop (modified in place).
    """
    lower_left, upper_right = (0, 150), (290, 520)
    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right
    save_pdf_file(file_name, page)
def process_coppel_text(pdf_text):
    """Derive the output file name and path from a Coppel label's text.

    The tracking number is the word that follows ``TN: `` in the text.

    Parameters:
        pdf_text (str): Text extracted from the label.
    Returns:
        tuple[str, str]: ``(file_name, file_path)`` with the path under ``files/``.
    Raises:
        ValueError: If no ``TN:`` tracking number is present in the text.
    """
    match = re.search(r'TN: (\w+)', pdf_text)
    if match is None:
        # Fail with a readable message instead of an UnboundLocalError below.
        raise ValueError("Pattern not found in the text.")
    file_name = match.group(1) + ".pdf"
    file_path = f"files/{file_name}"
    return file_name, file_path
def process_fedex_pdf(file_name, page):
    """Rotate a FedEx label page by -90 degrees, crop it and save it under ``files/``.

    Parameters:
        file_name (str): Name of the output file.
        page: The PyPDF2 page to transform (modified in place).
    """
    page.rotate(-90)
    lower_left, upper_right = (0, 0), (500, 650)
    page.mediabox.lower_left = lower_left
    page.mediabox.upper_right = upper_right
    save_pdf_file(file_name, page)
def process_fedex_text(pdf_text):
    """Return the fixed file name and path used for FedEx labels.

    Parameters:
        pdf_text (str): Text extracted from the label; currently unused.
    Returns:
        tuple[str, str]: Always ``("fedex.pdf", "files/fedex.pdf")``.
    """
    file_name = "fedex.pdf"
    file_path = "files/fedex.pdf"
    return file_name, file_path
def process_pdf_file(file):
    """
    Process an uploaded PDF label and return its file name, path and carrier.

    The text of the first page is searched, case-insensitively and in order,
    for "estafeta", "dhl" and "coppel". The first keyword found selects the
    text parser and page cropper for that carrier. When none is found the
    page goes through the OCR-based fallback and is reported as "ups".

    Parameters:
        file (File): The uploaded PDF; its ``path`` and ``name`` are read.
    Returns:
        file_name (str): The name of the generated file
        file_path (str): The path of the generated file
        transport_company (str): The transport company
    Raises:
        ValueError: If the PDF contains no pages.
    """
    # (keyword / company name, text parser, page cropper), checked in order.
    carriers = (
        ("estafeta", process_estafeta_text, process_estafeta_pdf),
        ("dhl", process_dhl_text, process_dhl_pdf),
        ("coppel", process_coppel_text, process_coppel_pdf),
    )
    # All page work stays inside the `with`: the page is read from the open
    # stream, which the context manager closes on exit.
    with open(file.path, "rb") as pdf_stream:
        pdf = PyPDF2.PdfReader(pdf_stream)
        if len(pdf.pages) == 0:
            raise ValueError("The PDF has no pages.")
        page = pdf.pages[0]
        pdf_text = page.extract_text()
        for transport_company, parse_text, crop_page in carriers:
            if re.search(transport_company, pdf_text, re.IGNORECASE):
                file_name, file_path = parse_text(pdf_text)
                crop_page(file_name, page)
                return file_name, file_path, transport_company
        # NOTE(review): the fallback may detect a FedEx label, but the company
        # is still reported as "ups" - confirm whether that is intended.
        file_name, file_path = process_pdf_from_image(file.name, page)
    return file_name, file_path, "ups"
def gif2pdf(file):
    """Convert an uploaded GIF into a PDF rotated by 90 degrees.

    Every GIF frame becomes one PDF page whose size in points equals the
    frame size in pixels. The intermediate PDF is written to ``.files/`` and
    removed once the rotated copy has been saved.

    Parameters:
        file (File): The uploaded GIF; its ``path`` and ``name`` are read.
    Returns:
        tuple[str, str]: ``(file_name, rotated_pdf_path)`` where ``file_name``
        is the part of the upload name before the first dot.
    """
    file_name = file.name.split('.')[0]
    output_pdf_path = f".files/{file_name}.pdf"
    rotated_pdf_path = f".files/{file_name}_rotated.pdf"

    pdf = FPDF(unit="pt")
    # Frames go through JPEG files because FPDF embeds images from disk.
    # Each frame gets its own file in a private temporary directory so that
    # nothing is left behind in the working directory, concurrent sessions
    # cannot overwrite each other's frames, and FPDF (which appears to cache
    # images by file name) does not reuse the first frame for every page.
    with Image.open(file.path) as gif, tempfile.TemporaryDirectory() as frame_dir:
        for frame_index in range(gif.n_frames):
            gif.seek(frame_index)
            # Convert to RGB because JPEG cannot store palette images.
            rgb_frame = gif.convert("RGB")
            width, height = rgb_frame.size
            pdf.add_page(format=(width, height))
            frame_path = os.path.join(frame_dir, f"frame_{frame_index}.jpg")
            rgb_frame.save(frame_path)
            pdf.image(frame_path, x=0, y=0, w=width, h=height)
        pdf.output(output_pdf_path)

    # Rotate every page by 90 degrees into a second PDF. The source stream
    # stays open until the rotated copy has been written.
    writer = PyPDF2.PdfWriter()
    with open(output_pdf_path, "rb") as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        for page in reader.pages:
            page.rotate(90)
            writer.add_page(page)
        with open(rotated_pdf_path, "wb") as rotated_pdf_file:
            writer.write(rotated_pdf_file)
    os.remove(output_pdf_path)
    return file_name, rotated_pdf_path
def remove_all_files_in_folder(folder_path):
    """Delete every regular file directly inside ``folder_path``.

    Sub-directories and their contents are left untouched. A folder that
    does not exist is treated as already clean.

    Parameters:
        folder_path (str): Folder whose files should be removed.
    """
    if not os.path.isdir(folder_path):
        # Nothing to clean up; avoids FileNotFoundError from os.listdir.
        return
    for filename in os.listdir(folder_path):
        file_path = os.path.join(folder_path, filename)
        # Only remove files, never directories.
        if os.path.isfile(file_path):
            os.remove(file_path)
async def process_chat():
    """Ask the user for label files, convert them and send the results back.

    Accepts up to 20 PDFs or GIFs (not both in one upload). GIFs are converted
    to PDF; PDFs are cropped and renamed per carrier. One message is sent with
    all generated files attached and one summary line per processed guide.
    Any failure is reported to the user as an ``Error: ...`` message.
    """
    await cl.Message(content="Hola! Sube las guías en formato PDF.\nPuedes subir hasta 20 guías a la vez").send()
    files = None
    # Keep asking until files are provided (send() presumably yields None
    # when the 30 second timeout expires - confirm against Chainlit docs).
    while files is None:
        files = await cl.AskFileMessage(
            content="Sube las guías aquí!",
            accept=["application/pdf", "image/gif"],
            max_size_mb=5,
            max_files=20,
            timeout=30,
        ).send()
    try:
        try:
            uploaded_types = {item.type for item in files}
            if {"application/pdf", "image/gif"} <= uploaded_types:
                await cl.Message(content="Solo ingresa PDFs o Gifs").send()
                return
            elements = []
            # One line per guide, so uploading several files reports all of
            # them instead of only the last one processed.
            summary_lines = []
            gif_notice = "Archivos convertidos en PDF"
            for file in files:
                if file.type == 'image/gif':
                    file_name, file_path = gif2pdf(file)
                    elements.append(cl.File(name=file_name, display="inline", path=file_path))
                    if gif_notice not in summary_lines:
                        summary_lines.append(gif_notice)
                else:
                    file_name, file_path, transport_company = process_pdf_file(file)
                    elements.append(cl.File(name=file_name, display="inline", path=file_path))
                    file_name = file_name.replace(".pdf", "")
                    summary_lines.append(f"Guía de {transport_company.upper()}: **{file_name}**")
            await cl.Message(content="\n".join(summary_lines), elements=elements).send()
        finally:
            # Clean up uploads and generated files on failure as well as on
            # success, so a failed run does not leave stale files behind.
            for folder in (".files", "files"):
                if os.path.isdir(folder):
                    remove_all_files_in_folder(folder)
    except Exception as e:
        # Top-level boundary of the chat handler: report instead of crashing.
        await cl.Message(content=f"Error: {e}").send()
@cl.on_chat_start
async def start_chat():
    """Run the upload workflow; registered through ``cl.on_chat_start``."""
    await process_chat()
@cl.on_message
async def main(message: str):
    """Run the upload workflow again; registered through ``cl.on_message``.

    Parameters:
        message: The incoming chat message; its content is not used.
    """
    await process_chat()