Spaces:

mobenta
/

HTML_Content_Processor

Sleeping

App Files Files Community

mobenta committed on May 29, 2024

Commit

170126f

verified ·

1 Parent(s): 9b5f926

Create app.py

Browse files

Files changed (1) hide show

app.py +85 -0

app.py ADDED Viewed

	@@ -0,0 +1,85 @@

+!pip install gradio requests reportlab unstructured
+import nltk
+from unstructured.documents.html import HTMLDocument
+import requests
+from bs4 import BeautifulSoup
+from reportlab.lib.pagesizes import letter
+from reportlab.pdfgen import canvas
+import gradio as gr
+# Download and install NLTK data.
+# These calls run at import time, i.e. every time the app starts.
+# NOTE(review): nothing in the visible code calls NLTK directly; presumably
+# these resources are needed by `unstructured` — confirm before removing.
+nltk.download('punkt')
+nltk.download('averaged_perceptron_tagger')
+# Function to process HTML content from a given URL
+def process_html_from_url(url):
+    response = requests.get(url)
+    # Check if the request was successful
+    if response.status_code == 200:
+        # Get the HTML content of the page
+        html_content = response.text
+        # Extract text content from HTML using BeautifulSoup
+        soup = BeautifulSoup(html_content, 'html.parser')
+        page_content = soup.get_text()
+        # Save the parsed content to a text file
+        text_filename = 'output.txt'
+        with open(text_filename, 'w') as f:
+            f.write(page_content)
+        # Save the parsed content to a PDF file
+        pdf_filename = 'output.pdf'
+        save_text_to_pdf(page_content, pdf_filename)
+        return text_filename, pdf_filename
+    else:
+        return None, None
+def save_text_to_pdf(text, filename):
+    c = canvas.Canvas(filename, pagesize=letter)
+    width, height = letter
+    # Split the text into lines
+    lines = text.split('\n')
+    # Define the starting position
+    x = 40
+    y = height - 40
+    line_height = 12
+    # Add text to the canvas
+    for line in lines:
+        if y < 40:
+            c.showPage()
+            y = height - 40
+        c.drawString(x, y, line)
+        y -= line_height
+    # Save the PDF file
+    c.save()
+# Function to be used by Gradio interface
+def gradio_process(url):
+    text_file, pdf_file = process_html_from_url(url)
+    if text_file and pdf_file:
+        return text_file, pdf_file
+    else:
+        return "Failed to retrieve HTML content", ""
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=gradio_process,
+    inputs=gr.Textbox(label="Enter the URL to process"),
+    outputs=[
+        gr.File(label="Text File"),
+        gr.File(label="PDF File")
+    ],
+    title="HTML Content Processor",
+    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files."
+)
+# Launch the Gradio app
+iface.launch(debug=True)