"""Gradio app that downloads a web page and saves its text as .txt and .pdf.

The user enters a URL; the page is fetched, its visible text is extracted with
BeautifulSoup, and the text is written to ``output.txt`` and ``output.pdf`` in
the current working directory. Both files are offered for download.
"""

from typing import Optional, Tuple

import nltk
from unstructured.documents.html import HTMLDocument
import requests
from bs4 import BeautifulSoup
from reportlab.lib.pagesizes import letter
from reportlab.lib.utils import simpleSplit
from reportlab.pdfgen import canvas
import gradio as gr

# Download and install NLTK data.
# NOTE(review): neither nltk nor HTMLDocument is referenced by the code in this
# file; they are kept because other code may rely on these side effects.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

# Seconds to wait for the remote server before giving up.
REQUEST_TIMEOUT = 30

# PDF layout, in points.
PAGE_MARGIN = 40
LINE_HEIGHT = 12
FONT_NAME = "Helvetica"
FONT_SIZE = 12

TEXT_FILENAME = 'output.txt'
PDF_FILENAME = 'output.pdf'


def process_html_from_url(
    url: str, timeout: float = REQUEST_TIMEOUT
) -> Tuple[Optional[str], Optional[str]]:
    """Fetch *url*, extract its text, and save it as a text file and a PDF.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before the request is
            abandoned.

    Returns:
        ``(text_filename, pdf_filename)`` on success, or ``(None, None)`` when
        the request fails (network error, invalid URL, timeout) or the server
        answers with a status other than 200.
    """
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # Invalid URL, DNS failure, refused connection, timeout, ...: report
        # it the same way as a non-200 answer so callers have one failure path.
        return None, None

    if response.status_code != 200:
        return None, None

    # Extract text content from HTML using BeautifulSoup.
    soup = BeautifulSoup(response.text, 'html.parser')
    page_content = soup.get_text()

    # Explicit UTF-8: the platform default encoding may be unable to represent
    # characters that commonly appear in web pages.
    with open(TEXT_FILENAME, 'w', encoding='utf-8') as f:
        f.write(page_content)

    save_text_to_pdf(page_content, PDF_FILENAME)

    return TEXT_FILENAME, PDF_FILENAME


def save_text_to_pdf(text: str, filename: str) -> None:
    """Write *text* to a letter-sized PDF at *filename*.

    Each input line is drawn on its own row; lines wider than the printable
    area are wrapped at word boundaries, and a new page is started whenever
    the current one is full.

    Args:
        text: The text to render; lines are separated by ``"\\n"``.
        filename: Path of the PDF file to create.
    """
    pdf = canvas.Canvas(filename, pagesize=letter)
    page_width, page_height = letter
    max_width = page_width - 2 * PAGE_MARGIN

    pdf.setFont(FONT_NAME, FONT_SIZE)
    y = page_height - PAGE_MARGIN

    for raw_line in text.split('\n'):
        if pdf.stringWidth(raw_line, FONT_NAME, FONT_SIZE) > max_width:
            # drawString does not wrap, so over-wide lines would run off the
            # page. simpleSplit returns [] for whitespace-only input; keep a
            # blank row in that case.
            pieces = simpleSplit(raw_line, FONT_NAME, FONT_SIZE, max_width) or ['']
        else:
            pieces = [raw_line]

        for piece in pieces:
            if y < PAGE_MARGIN:
                pdf.showPage()
                # Re-apply the font explicitly after starting a new page.
                pdf.setFont(FONT_NAME, FONT_SIZE)
                y = page_height - PAGE_MARGIN
            pdf.drawString(PAGE_MARGIN, y, piece)
            y -= LINE_HEIGHT

    pdf.save()


def gradio_process(url: str) -> Tuple[str, str]:
    """Gradio callback: process *url* and return the two generated file paths.

    Raises:
        gr.Error: If the page could not be retrieved. Both outputs are
            ``gr.File`` components, so an error message cannot be returned as
            a value; raising lets Gradio display it to the user.
    """
    text_file, pdf_file = process_html_from_url(url)
    if text_file and pdf_file:
        return text_file, pdf_file
    raise gr.Error("Failed to retrieve HTML content")


# Create the Gradio interface
iface = gr.Interface(
    fn=gradio_process,
    inputs=gr.Textbox(label="Enter the URL to process"),
    outputs=[
        gr.File(label="Text File"),
        gr.File(label="PDF File"),
    ],
    title="HTML Content Processor",
    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files.",
)

# Launch the Gradio app only when run as a script, so importing this module
# does not start a server.
if __name__ == "__main__":
    iface.launch(debug=True)