Spaces:
Sleeping
Sleeping
File size: 2,303 Bytes
f4c7780 170126f | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 |
import nltk
from unstructured.documents.html import HTMLDocument
import requests
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import gradio as gr
# Fetch the NLTK resources at startup: the 'punkt' tokenizer models and the
# 'averaged_perceptron_tagger' part-of-speech tagger, in that order.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)
# Function to process HTML content from a given URL
def process_html_from_url(url, timeout=30):
    """Fetch *url*, extract its text, and save it as a text file and a PDF.

    The page text is written to ``output.txt`` and ``output.pdf`` in the
    current working directory, overwriting any previous output.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the caller indefinitely.

    Returns:
        ``(text_filename, pdf_filename)`` on success, or ``(None, None)``
        when the request fails (invalid URL, network error, timeout) or the
        server answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Bad URLs, connection failures and timeouts all land here; report
        # them through the same failure value as a non-200 response.
        return None, None

    # Check if the request was successful
    if response.status_code != 200:
        return None, None

    # Extract text content from HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    page_content = soup.get_text()

    # Save the parsed content to a text file. The encoding is explicit
    # because scraped pages routinely contain characters that the platform
    # default encoding cannot represent.
    text_filename = 'output.txt'
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(page_content)

    # Save the parsed content to a PDF file
    pdf_filename = 'output.pdf'
    save_text_to_pdf(page_content, pdf_filename)

    return text_filename, pdf_filename
def save_text_to_pdf(text, filename):
    """Write *text* to a letter-sized PDF at *filename*.

    Each line of *text* starts a new line in the PDF. Lines that are wider
    than the printable area are wrapped onto following lines instead of
    running off the right edge of the page. A new page is started whenever
    the next line would fall below the bottom margin.

    Args:
        text: The text to render; split on ``'\\n'``.
        filename: Path of the PDF file to create.
    """
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter

    # Page layout: 40pt margins on every side, 12pt between baselines
    margin = 40
    line_height = 12
    max_width = width - 2 * margin

    x = margin
    y = height - margin

    # Add text to the canvas
    for line in text.split('\n'):
        # c.stringWidth measures in the canvas's current font, so the
        # wrapping matches what drawString will actually render.
        for piece in _wrap_line(line, max_width, c.stringWidth):
            if y < margin:
                c.showPage()
                y = height - margin
            c.drawString(x, y, piece)
            y -= line_height

    # Save the PDF file
    c.save()


def _wrap_line(line, max_width, measure):
    """Split *line* into pieces that each measure at most *max_width*.

    *measure* is a callable returning the rendered width of a string.
    Breaks are made at single spaces; a word that is too wide on its own
    is split between characters. A line that already fits is returned
    unchanged as a one-element list.
    """
    if measure(line) <= max_width:
        return [line]

    pieces = []
    current = []  # words accumulated for the piece being built
    for word in line.split(' '):
        if measure(' '.join(current + [word])) <= max_width:
            current.append(word)
            continue
        if current:
            pieces.append(' '.join(current))
        # The word starts a new piece; if it is too wide by itself, emit
        # the leading chunks and carry the remainder forward.
        *full_chunks, remainder = _split_long_word(word, max_width, measure)
        pieces.extend(full_chunks)
        current = [remainder]
    pieces.append(' '.join(current))
    return pieces


def _split_long_word(word, max_width, measure):
    """Cut *word* into consecutive chunks no wider than *max_width*.

    Every chunk holds at least one character, so the loop always advances
    even if a single character is wider than *max_width*. Returns ``[word]``
    when the word is empty or already fits.
    """
    if not word or measure(word) <= max_width:
        return [word]

    chunks = []
    start = 0
    while start < len(word):
        end = start + 1
        # Grow the chunk one character at a time so each measurement is of
        # a short string, even when the word itself is very long.
        while end < len(word) and measure(word[start:end + 1]) <= max_width:
            end += 1
        chunks.append(word[start:end])
        start = end
    return chunks
# Function to be used by Gradio interface
def gradio_process(url):
    """Gradio callback: turn a URL into a downloadable text file and PDF.

    Args:
        url: The address typed into the interface's textbox.

    Returns:
        ``(text_file, pdf_file)``: paths of the generated files, one for
        each of the interface's two file outputs.

    Raises:
        gr.Error: If the page could not be retrieved. Both outputs are
            ``gr.File`` components, which expect file paths, so the failure
            is reported as a Gradio error (shown to the user as a message)
            rather than returned as text in place of a path.
    """
    # Tolerate an empty textbox and stray whitespace from copy-paste
    cleaned_url = (url or "").strip()

    text_file, pdf_file = process_html_from_url(cleaned_url)
    if not (text_file and pdf_file):
        raise gr.Error("Failed to retrieve HTML content")
    return text_file, pdf_file
# Build the UI pieces first: one textbox for the URL going in, and two file
# components for the generated text and PDF coming out.
_url_input = gr.Textbox(label="Enter the URL to process")
_file_outputs = [
    gr.File(label="Text File"),
    gr.File(label="PDF File"),
]

# Wire the components to the processing callback
iface = gr.Interface(
    fn=gradio_process,
    inputs=_url_input,
    outputs=_file_outputs,
    title="HTML Content Processor",
    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files.",
)

# Start the Gradio app with debug mode enabled
iface.launch(debug=True)
|