# Spaces: mobenta / HTML_Content_Processor (Sleeping)
# File size: 2,303 Bytes
# f4c7780 170126f


import nltk
from unstructured.documents.html import HTMLDocument
import requests
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
import gradio as gr

# Fetch the NLTK resources at start-up, one download call per resource
for resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)

# Function to process HTML content from a given URL
def process_html_from_url(url, timeout=30):
    """Download a web page and save its text as a text file and a PDF.

    The page's text is extracted with BeautifulSoup, written to
    ``output.txt`` and rendered to ``output.pdf`` via ``save_text_to_pdf``.
    Both files are created in the current working directory and are
    overwritten on every call.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A ``(text_filename, pdf_filename)`` tuple on success, or
        ``(None, None)`` when the page could not be retrieved (request
        error such as an invalid URL, connection failure or timeout, or a
        status code other than 200).
    """
    try:
        # Without a timeout an unresponsive server would block this call
        # indefinitely.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Report request failures through the same "nothing retrieved"
        # result that a non-200 response produces.
        return None, None

    if response.status_code != 200:
        return None, None

    # Extract text content from HTML using BeautifulSoup
    soup = BeautifulSoup(response.text, 'html.parser')
    page_content = soup.get_text()

    # Save the parsed content to a text file. UTF-8 is requested explicitly
    # because the platform default encoding may not be able to represent
    # every character found on the page.
    text_filename = 'output.txt'
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(page_content)

    # Save the parsed content to a PDF file
    pdf_filename = 'output.pdf'
    save_text_to_pdf(page_content, pdf_filename)

    return text_filename, pdf_filename

def _wrap_to_width(line, max_width, measure):
    """Split ``line`` into pieces whose measured width fits ``max_width``.

    ``measure`` is a callable returning the rendered width of a string.
    A line that already fits is returned unchanged as a single piece.
    """
    if measure(line) <= max_width:
        return [line]

    pieces = []
    current = ''
    for word in line.split():
        candidate = current + ' ' + word if current else word
        if measure(candidate) <= max_width:
            current = candidate
            continue
        if current:
            pieces.append(current)
        # A single word wider than the page (e.g. a long URL) cannot be
        # wrapped at a space, so it is broken in the middle instead.
        while measure(word) > max_width:
            cut = max(len(word) - 1, 1)
            while cut > 1 and measure(word[:cut]) > max_width:
                cut -= 1
            pieces.append(word[:cut])
            word = word[cut:]
        current = word
    if current:
        pieces.append(current)
    # An over-wide line made only of whitespace still occupies one row.
    return pieces or ['']


def save_text_to_pdf(text, filename):
    """Write ``text`` to a letter-sized PDF file, one text line per row.

    Lines are drawn top to bottom with a 40-point margin and a new page is
    started whenever the bottom margin is reached. Lines that are wider
    than the printable area are wrapped onto additional rows instead of
    running off the right edge of the page.

    Args:
        text: The text to render; rows are separated by ``'\\n'``.
        filename: Path of the PDF file to create.
    """
    font_name = 'Helvetica'
    font_size = 12
    margin = 40
    line_height = 12

    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    max_width = width - 2 * margin

    def measure(s):
        return c.stringWidth(s, font_name, font_size)

    # Set the font explicitly so the widths measured above match what is
    # actually drawn (Helvetica 12 should be the canvas default anyway).
    c.setFont(font_name, font_size)

    y = height - margin
    for line in text.split('\n'):
        for piece in _wrap_to_width(line, max_width, measure):
            if y < margin:
                c.showPage()
                # Re-apply the font in case the new page resets it.
                c.setFont(font_name, font_size)
                y = height - margin
            c.drawString(margin, y, piece)
            y -= line_height

    # Save the PDF file
    c.save()

# Function to be used by Gradio interface
def gradio_process(url):
    """Gradio callback: turn the page at ``url`` into a text file and a PDF.

    Args:
        url: Address entered by the user.

    Returns:
        A ``(text_file, pdf_file)`` tuple of file paths for the two file
        outputs of the interface.

    Raises:
        gr.Error: If ``process_html_from_url`` could not produce the files.
    """
    text_file, pdf_file = process_html_from_url(url)
    if not (text_file and pdf_file):
        # Both outputs are gr.File components, so returning a message
        # string here would be treated as a file path. Raising gr.Error
        # lets Gradio show the message to the user instead.
        raise gr.Error("Failed to retrieve HTML content")
    return text_file, pdf_file

# Build the Gradio UI: one URL text box in, two downloadable files out
url_input = gr.Textbox(label="Enter the URL to process")
file_outputs = [gr.File(label="Text File"), gr.File(label="PDF File")]

iface = gr.Interface(
    fn=gradio_process,
    inputs=url_input,
    outputs=file_outputs,
    title="HTML Content Processor",
    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files.",
)

# Start the app with debug mode enabled
iface.launch(debug=True)