Spaces:

mobenta
/

HTML_Content_Processor

Sleeping

App Files Files Community

HTML_Content_Processor / app.py

mobenta

Update app.py

f4c7780 verified almost 2 years ago

raw

history blame contribute delete

2.3 kB


	import nltk
	from unstructured.documents.html import HTMLDocument
	import requests
	from bs4 import BeautifulSoup
	from reportlab.lib.pagesizes import letter
	from reportlab.pdfgen import canvas
	import gradio as gr

	# Download the NLTK data packages this app expects, once at startup
	for resource_name in ('punkt', 'averaged_perceptron_tagger'):
	    nltk.download(resource_name)

	# Function to process HTML content from a given URL
	def process_html_from_url(url, timeout=30):
	    """Download a page and save its text as a text file and a PDF.

	    The page is fetched with ``requests``, reduced to plain text with
	    BeautifulSoup's ``get_text()``, written to ``output.txt`` and rendered
	    to ``output.pdf`` through ``save_text_to_pdf``.

	    Args:
	        url: Address of the page to fetch.
	        timeout: Seconds to wait for the server before giving up, so a
	            stalled host cannot block the app forever.

	    Returns:
	        ``(text_filename, pdf_filename)`` on success, or ``(None, None)``
	        when the request fails or the response status is not 200.
	    """
	    try:
	        response = requests.get(url, timeout=timeout)
	    except requests.RequestException:
	        # Covers malformed URLs, DNS/connection failures and timeouts; the
	        # caller already treats (None, None) as "could not retrieve".
	        return None, None

	    # Check if the request was successful
	    if response.status_code != 200:
	        return None, None

	    # Extract text content from HTML using BeautifulSoup
	    page_content = BeautifulSoup(response.text, 'html.parser').get_text()

	    # Save the parsed content to a text file. The encoding is explicit
	    # because page text routinely contains characters that the platform
	    # default encoding cannot represent.
	    # NOTE(review): the output names are fixed, so concurrent requests
	    # overwrite each other's files.
	    text_filename = 'output.txt'
	    with open(text_filename, 'w', encoding='utf-8') as f:
	        f.write(page_content)

	    # Save the parsed content to a PDF file
	    pdf_filename = 'output.pdf'
	    save_text_to_pdf(page_content, pdf_filename)

	    return text_filename, pdf_filename

	def _split_to_width(pdf, line, max_width):
	    """Break *line* into pieces that each fit within *max_width* points.

	    Widths are measured with ``pdf.stringWidth`` in the canvas's current
	    font. A line that already fits is returned unchanged; otherwise it is
	    broken at spaces, and a single word wider than the limit is cut between
	    characters.
	    """
	    if pdf.stringWidth(line) <= max_width:
	        return [line]

	    pieces = []
	    current = ''
	    for word in line.split(' '):
	        candidate = current + ' ' + word if current else word
	        if pdf.stringWidth(candidate) <= max_width:
	            current = candidate
	            continue
	        if current:
	            pieces.append(current)
	        # Start a new piece with this word. If the word alone is too wide
	        # (e.g. a long URL), it is cut wherever the next character would
	        # overflow.
	        current = ''
	        for ch in word:
	            if current and pdf.stringWidth(current + ch) > max_width:
	                pieces.append(current)
	                current = ch
	            else:
	                current += ch
	    if current:
	        pieces.append(current)
	    # A whitespace-only line still occupies one (blank) output line.
	    return pieces or ['']


	def save_text_to_pdf(text, filename):
	    """Render plain text into a letter-sized PDF, one text line per row.

	    Text is split on newlines and drawn top to bottom with a 40-point
	    margin and 12-point line spacing, starting a new page whenever the
	    bottom margin is reached. Lines wider than the printable area are
	    wrapped instead of running off the right edge of the page.

	    Args:
	        text: The text to render.
	        filename: Path of the PDF file to create.
	    """
	    c = canvas.Canvas(filename, pagesize=letter)
	    width, height = letter

	    # Page geometry
	    margin = 40
	    line_height = 12
	    max_width = width - 2 * margin

	    # Add text to the canvas
	    y = height - margin
	    for line in text.split('\n'):
	        for piece in _split_to_width(c, line, max_width):
	            if y < margin:
	                c.showPage()
	                y = height - margin
	            c.drawString(margin, y, piece)
	            y -= line_height

	    # Save the PDF file
	    c.save()

	# Function to be used by Gradio interface
	def gradio_process(url):
	    """Gradio callback: convert the page at *url* into downloadable files.

	    Args:
	        url: The URL typed into the interface's textbox.

	    Returns:
	        ``(text_file, pdf_file)`` paths for the two ``gr.File`` outputs.

	    Raises:
	        gr.Error: If the page could not be retrieved. Both outputs are
	            file components, so a message string returned in their place
	            would be treated as a file path; raising ``gr.Error`` lets
	            Gradio display the message to the user instead.
	    """
	    text_file, pdf_file = process_html_from_url(url)
	    if not (text_file and pdf_file):
	        raise gr.Error("Failed to retrieve HTML content")
	    return text_file, pdf_file

	# Assemble the Gradio UI: one URL textbox in, two downloadable files out
	url_box = gr.Textbox(label="Enter the URL to process")
	download_slots = [gr.File(label="Text File"), gr.File(label="PDF File")]

	iface = gr.Interface(
	    fn=gradio_process,
	    inputs=url_box,
	    outputs=download_slots,
	    title="HTML Content Processor",
	    description=(
	        "Enter a URL to download and process its HTML content. "
	        "You can download the resulting text and PDF files."
	    ),
	)

	# Start serving the app
	iface.launch(debug=True)