ymcmy committed on
Commit
cfd35b8
·
verified ·
1 Parent(s): e2cac91

Update utils.py

Browse files
Files changed (1) hide show
  1. utils.py +49 -15
utils.py CHANGED
@@ -2,8 +2,11 @@ from bs4 import BeautifulSoup
2
  import requests
3
  import numpy as np
4
  from datetime import datetime
5
- from xhtml2pdf import pisa
 
 
6
  import io
 
7
 
8
  def gen_link():
9
  if np.random.choice([True, False]):
@@ -95,17 +98,48 @@ def gen_html(num):
95
  '''
96
  return all_q
97
 
98
- def convert_html_to_pdf(source_html, output_filename):
99
- try:
100
- result_file = open(output_filename, "w+b")
101
- pisa_status = pisa.CreatePDF(
102
- io.StringIO(source_html),
103
- dest=result_file
104
- )
105
- result_file.close()
106
- if pisa_status.err:
107
- raise Exception("Error converting HTML to PDF")
108
- except Exception as e:
109
- print(f"Error: {e}")
110
- raise
111
- return pisa_status.err
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import requests
3
  import numpy as np
4
  from datetime import datetime
5
+ from reportlab.lib.pagesizes import letter
6
+ from reportlab.pdfgen import canvas
7
+ from reportlab.lib.utils import ImageReader
8
  import io
9
+ from PIL import Image
10
 
11
  def gen_link():
12
  if np.random.choice([True, False]):
 
98
  '''
99
  return all_q
100
 
101
def generate_pdf_content(html_content):
    """Extract paragraphs, links and images from an HTML string.

    The HTML is parsed with BeautifulSoup and every ``<p>``, ``<a>`` and
    ``<img>`` tag is visited in the order ``find_all`` returns them.

    Args:
        html_content: HTML markup to scan.

    Returns:
        A list of tuples, one per extracted tag:
            ('text', paragraph_text)
            ('link', href, link_text)   # href is None when the attribute is absent
            ('image', pil_image)        # image downloaded from the tag's ``src``

    Note:
        An ``<a>`` nested inside a ``<p>`` is reported twice: once as part
        of the paragraph text and once as its own 'link' element.
        Images that cannot be fetched or decoded are skipped (a message is
        printed) instead of aborting the whole extraction.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    elements = []
    for tag in soup.find_all(['p', 'a', 'img']):
        if tag.name == 'p':
            elements.append(('text', tag.get_text()))
        elif tag.name == 'a':
            elements.append(('link', tag.get('href'), tag.get_text()))
        elif tag.name == 'img':
            img_url = tag.get('src')
            if not img_url:
                # <img> without a usable src: nothing to download.
                continue
            try:
                # A timeout keeps one stalled server from hanging the export.
                response = requests.get(img_url, timeout=10)
                # Do not try to decode an HTTP error page as an image.
                response.raise_for_status()
                img = Image.open(io.BytesIO(response.content))
                # Image.open is lazy; decode now so corrupt data fails here
                # rather than later while the PDF is being drawn.
                img.load()
            except (requests.RequestException, OSError) as exc:
                print(f"Error: skipping image {img_url}: {exc}")
                continue
            elements.append(('image', img))

    return elements
119
+
120
def create_pdf(filename, content):
    """Render extracted elements into a letter-sized PDF file.

    Elements are laid out top to bottom starting 40pt below the top edge,
    30pt from the left edge, with 20pt between lines/elements. A new page
    is started *before* drawing anything that would fall below the 50pt
    bottom limit.

    Args:
        filename: Path of the PDF to write.
        content: Iterable of tuples shaped like the output of
            ``generate_pdf_content``:
                ('text', text)
                ('link', href, text)  -> drawn as "text: href"
                ('image', pil_image)
            Elements with any other tag are ignored.

    Text wider than the printable width is word-wrapped. Images larger
    than the printable area are scaled down, keeping their aspect ratio;
    smaller images are drawn at one point per pixel, as before.
    """
    # Local import: the name is not among the file's top-level imports.
    from reportlab.lib.utils import simpleSplit

    left_margin = 30
    top_margin = 40
    bottom_limit = 50
    line_height = 20
    # Canvas defaults, set explicitly so wrapping measures the font that
    # is actually used for drawing.
    font_name, font_size = "Helvetica", 12

    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    top = height - top_margin
    max_width = width - 2 * left_margin
    max_height = top - bottom_limit

    c.setFont(font_name, font_size)
    y = top

    def new_page():
        """Finish the current page and return the starting y of the next."""
        c.showPage()
        # showPage resets the graphics state, so select the font again.
        c.setFont(font_name, font_size)
        return top

    def draw_text(text, y):
        """Draw ``text`` word-wrapped from ``y`` down; return the new y."""
        # simpleSplit yields no lines for empty text; still emit one blank
        # line so an empty paragraph keeps its vertical slot.
        for line in simpleSplit(text, font_name, font_size, max_width) or ['']:
            if y < bottom_limit:
                y = new_page()
            c.drawString(left_margin, y, line)
            y -= line_height
        return y

    for elem in content:
        kind = elem[0]
        if kind == 'text':
            y = draw_text(elem[1], y)
        elif kind == 'link':
            y = draw_text(f'{elem[2]}: {elem[1]}', y)
        elif kind == 'image':
            img = elem[1]
            img_w, img_h = img.size
            if img_w <= 0 or img_h <= 0:
                continue
            # Shrink (never enlarge) so the image fits the printable area.
            scale = min(1.0, max_width / img_w, max_height / img_h)
            draw_w = img_w * scale
            draw_h = img_h * scale
            # Break the page first if the image would cross the bottom
            # limit; at the top of a fresh page it fits by construction.
            if y - draw_h < bottom_limit and y < top:
                y = new_page()
            c.drawImage(ImageReader(img), left_margin, y - draw_h,
                        width=draw_w, height=draw_h)
            y -= draw_h + line_height

    c.save()
142
+
143
def convert_html_to_pdf(html_content, output_filename):
    """Convert an HTML string into a PDF written to ``output_filename``.

    The HTML is first reduced to a list of elements by
    ``generate_pdf_content``; that list is then rendered by ``create_pdf``.
    Nothing is returned.
    """
    create_pdf(output_filename, generate_pdf_content(html_content))