Update utils.py
Browse files
utils.py
CHANGED
|
@@ -2,8 +2,11 @@ from bs4 import BeautifulSoup
|
|
| 2 |
import requests
|
| 3 |
import numpy as np
|
| 4 |
from datetime import datetime
|
| 5 |
-
from
|
|
|
|
|
|
|
| 6 |
import io
|
|
|
|
| 7 |
|
| 8 |
def gen_link():
|
| 9 |
if np.random.choice([True, False]):
|
|
@@ -95,17 +98,48 @@ def gen_html(num):
|
|
| 95 |
'''
|
| 96 |
return all_q
|
| 97 |
|
| 98 |
-
def
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
import requests
|
| 3 |
import numpy as np
|
| 4 |
from datetime import datetime
|
| 5 |
+
from reportlab.lib.pagesizes import letter
|
| 6 |
+
from reportlab.pdfgen import canvas
|
| 7 |
+
from reportlab.lib.utils import ImageReader
|
| 8 |
import io
|
| 9 |
+
from PIL import Image
|
| 10 |
|
| 11 |
def gen_link():
|
| 12 |
if np.random.choice([True, False]):
|
|
|
|
| 98 |
'''
|
| 99 |
return all_q
|
| 100 |
|
| 101 |
+
def generate_pdf_content(html_content, timeout=10):
    """Extract the printable elements of an HTML document.

    The document is parsed with BeautifulSoup and every ``<p>``, ``<a>`` and
    ``<img>`` tag is turned into a tuple, in the order ``find_all`` yields
    them:

    * ``('text', text)`` for a paragraph,
    * ``('link', href, text)`` for an anchor (``href`` is ``None`` when the
      attribute is missing),
    * ``('image', img)`` for an image, where ``img`` is the ``PIL.Image``
      opened from the downloaded bytes.

    Note that an anchor nested inside a paragraph is reported twice: once as
    part of the paragraph text and once as its own ``'link'`` entry.

    Args:
        html_content: HTML markup to parse.
        timeout: Seconds to wait for each image download before giving up.

    Returns:
        A list of element tuples as described above.

    Raises:
        requests.RequestException: If an image cannot be downloaded or the
            server answers with an error status.
        PIL.UnidentifiedImageError: If the downloaded bytes are not an image.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    elements = []
    for tag in soup.find_all(['p', 'a', 'img']):
        if tag.name == 'p':
            elements.append(('text', tag.get_text()))
        elif tag.name == 'a':
            elements.append(('link', tag.get('href'), tag.get_text()))
        elif tag.name == 'img':
            img_url = tag.get('src')
            if not img_url:
                # An <img> without a source has nothing to download; skip it
                # rather than aborting the whole conversion.
                continue
            # Without a timeout a stalled server would block forever.
            response = requests.get(img_url, timeout=timeout)
            # Fail with a clear HTTP error instead of feeding an error page
            # to the image decoder.
            response.raise_for_status()
            img = Image.open(io.BytesIO(response.content))
            elements.append(('image', img))

    return elements
|
| 119 |
+
|
| 120 |
+
def create_pdf(filename, content):
    """Render extracted elements into a letter-sized PDF file.

    Elements are laid out top to bottom starting 40 points below the top
    edge, 30 points from the left edge. Text and link entries take one
    20-point line each; images are followed by a 20-point gap. A new page is
    started whenever the next element would not fit above the 50-point
    bottom margin.

    Args:
        filename: Path of the PDF file to write.
        content: Iterable of element tuples: ``('text', text)``,
            ``('link', href, text)`` or ``('image', pil_image)``. Entries
            with any other tag are ignored.
    """
    left_margin = 30
    top_margin = 40
    bottom_margin = 50
    line_height = 20

    c = canvas.Canvas(filename, pagesize=letter)
    page_width, page_height = letter
    top_y = page_height - top_margin
    # Largest image box that still fits on an empty page.
    max_img_width = page_width - 2 * left_margin
    max_img_height = top_y - bottom_margin
    y = top_y

    for elem in content:
        kind = elem[0]
        if kind == 'text' or kind == 'link':
            line = elem[1] if kind == 'text' else f'{elem[2]}: {elem[1]}'
            # Break the page before drawing so nothing lands below the margin
            # and no blank page is emitted after the final element.
            if y < bottom_margin:
                c.showPage()
                y = top_y
            c.drawString(left_margin, y, line)
            y -= line_height
        elif kind == 'image':
            img = elem[1]
            img_width, img_height = img.size
            if img_width <= 0 or img_height <= 0:
                # Nothing to draw, and scaling would divide by zero.
                continue
            # Shrink (never enlarge) images that exceed the printable area,
            # keeping the aspect ratio.
            scale = min(1.0, max_img_width / img_width, max_img_height / img_height)
            draw_width = img_width * scale
            draw_height = img_height * scale
            # Move to a fresh page when the image does not fit in the space
            # left; on a fresh page the scaled image always fits.
            if y - draw_height < bottom_margin and y < top_y:
                c.showPage()
                y = top_y
            c.drawImage(
                ImageReader(img),
                left_margin,
                y - draw_height,
                width=draw_width,
                height=draw_height,
            )
            y -= draw_height + line_height

    c.save()
|
| 142 |
+
|
| 143 |
+
def convert_html_to_pdf(html_content, output_filename):
    """Convert HTML markup to a PDF file.

    The markup is first reduced to a list of elements by
    ``generate_pdf_content`` and that list is then written to
    ``output_filename`` by ``create_pdf``.

    Args:
        html_content: HTML markup to convert.
        output_filename: Path of the PDF file to create.
    """
    create_pdf(output_filename, generate_pdf_content(html_content))
|