Spaces:
Sleeping
Sleeping
| import nltk | |
| from unstructured.documents.html import HTMLDocument | |
| import requests | |
| from bs4 import BeautifulSoup | |
| from reportlab.lib.pagesizes import letter | |
| from reportlab.pdfgen import canvas | |
| import gradio as gr | |
# Fetch the 'punkt' and 'averaged_perceptron_tagger' NLTK data packages when
# the module is loaded.
# NOTE(review): nothing in the visible part of this file uses nltk -- confirm
# these downloads are still required.
for _resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_resource)
del _resource
def process_html_from_url(url):
    """Download a web page and save its text as a text file and a PDF.

    The page is fetched with ``requests``, its text is extracted with
    BeautifulSoup's ``get_text()``, and the result is written to
    ``output.txt`` and, through ``save_text_to_pdf``, to ``output.pdf``.
    Both paths are relative, so the files land in the current working
    directory and are overwritten on every call.

    Args:
        url: Address of the page to download.

    Returns:
        ``(text_filename, pdf_filename)`` on success, or ``(None, None)``
        when the request fails or the response status is not 200.
    """
    try:
        # A timeout keeps a stalled server from blocking the caller forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Malformed URLs, connection failures and timeouts are reported the
        # same way as a non-200 response instead of propagating.
        return None, None

    if response.status_code != 200:
        return None, None

    # Extract the text content of the page from its HTML.
    page_content = BeautifulSoup(response.text, 'html.parser').get_text()

    # Save the parsed content to a text file. The encoding is explicit so the
    # write does not depend on the platform's default encoding.
    text_filename = 'output.txt'
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(page_content)

    # Save the parsed content to a PDF file.
    pdf_filename = 'output.pdf'
    save_text_to_pdf(page_content, pdf_filename)

    return text_filename, pdf_filename
def _wrap_to_width(line, max_width, measure):
    """Split ``line`` into pieces that each fit within ``max_width``.

    ``measure`` is a callable returning the rendered width of a string.
    A line that already fits is returned unchanged as a single piece.
    Otherwise it is broken at spaces, and a single word that is still too
    wide is cut between characters.
    """
    if measure(line) <= max_width:
        return [line]

    pieces = []
    current = ''
    for word in line.split(' '):
        candidate = current + ' ' + word if current else word
        if measure(candidate) <= max_width:
            current = candidate
            continue
        if current:
            pieces.append(current)
        # The word alone does not fit: peel off the longest prefix that does
        # (always at least one character, so the loop makes progress).
        while len(word) > 1 and measure(word) > max_width:
            cut = len(word) - 1
            while cut > 1 and measure(word[:cut]) > max_width:
                cut -= 1
            pieces.append(word[:cut])
            word = word[cut:]
        current = word
    if current:
        pieces.append(current)
    # A too-wide line made only of spaces still occupies one (blank) line.
    return pieces or ['']


def save_text_to_pdf(text, filename):
    """Write ``text`` to a letter-size PDF at ``filename``.

    The text is split on newlines and drawn top to bottom with a 40-point
    margin and 12 points between baselines. A new page is started when the
    bottom margin is reached. Lines wider than the printable area are wrapped
    instead of running off the right edge of the page.

    Args:
        text: The text to render.
        filename: Path of the PDF file to create.
    """
    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter

    margin = 40
    line_height = 12
    max_width = width - 2 * margin

    # Define the starting position.
    x = margin
    y = height - margin

    for line in text.split('\n'):
        # c.stringWidth measures with the canvas's current font, so wrapping
        # matches what drawString will actually render.
        for piece in _wrap_to_width(line, max_width, c.stringWidth):
            if y < margin:
                # Out of vertical space: continue at the top of a new page.
                c.showPage()
                y = height - margin
            c.drawString(x, y, piece)
            y -= line_height

    # Save the PDF file.
    c.save()
def gradio_process(url):
    """Gradio callback: turn a URL into a text file and a PDF file.

    Delegates to ``process_html_from_url`` and passes its two file paths on
    to the interface's two file outputs.

    Args:
        url: The URL entered by the user.

    Returns:
        ``(text_file, pdf_file)``: paths of the generated files.

    Raises:
        gr.Error: If no files were produced. The outputs are file
            components, so an error message cannot be returned as a value;
            raising lets Gradio report the message to the user instead.
    """
    text_file, pdf_file = process_html_from_url(url)
    if not (text_file and pdf_file):
        raise gr.Error("Failed to retrieve HTML content")
    return text_file, pdf_file
# Assemble the web UI: a single URL textbox as input, and two file
# components (text and PDF) as outputs of gradio_process.
iface = gr.Interface(
    title="HTML Content Processor",
    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files.",
    fn=gradio_process,
    inputs=gr.Textbox(label="Enter the URL to process"),
    outputs=[gr.File(label="Text File"), gr.File(label="PDF File")],
)

# Start the app with debug mode enabled.
iface.launch(debug=True)