mobenta committed on
Commit
170126f
·
verified ·
1 Parent(s): 9b5f926

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +85 -0
app.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# NOTE: "!pip install ..." is IPython/notebook shell syntax and is a
# SyntaxError in a plain Python script, so the install step cannot live here.
# Install the dependencies outside this file (e.g. via requirements.txt):
#   gradio requests reportlab unstructured
# beautifulsoup4 (bs4) and nltk are imported below as well and must also be
# installed.
2
+ import nltk
3
+ from unstructured.documents.html import HTMLDocument
4
+ import requests
5
+ from bs4 import BeautifulSoup
6
+ from reportlab.lib.pagesizes import letter
7
+ from reportlab.pdfgen import canvas
8
+ import gradio as gr
9
+
10
# Download the NLTK data packages when the module is loaded.
for _nltk_resource in ('punkt', 'averaged_perceptron_tagger'):
    nltk.download(_nltk_resource)
13
+
14
def process_html_from_url(url):
    """Download a web page and save its text content as a text file and a PDF.

    Args:
        url: Address of the page to download.

    Returns:
        A ``(text_filename, pdf_filename)`` tuple naming the files that were
        written, or ``(None, None)`` when the page could not be retrieved
        (network failure, invalid URL, timeout, or a non-200 response).
    """
    try:
        # A timeout keeps an unresponsive host from hanging the app forever.
        response = requests.get(url, timeout=30)
    except requests.RequestException:
        # Covers connection errors, timeouts and malformed URLs alike; report
        # them the same way as an unsuccessful status code.
        return None, None

    # Only a plain 200 response is treated as success.
    if response.status_code != 200:
        return None, None

    # Extract text content from the HTML using BeautifulSoup.
    soup = BeautifulSoup(response.text, 'html.parser')
    page_content = soup.get_text()

    # Save the parsed content to a text file. The encoding is given
    # explicitly because the platform default may be unable to represent
    # every character found on the page.
    text_filename = 'output.txt'
    with open(text_filename, 'w', encoding='utf-8') as f:
        f.write(page_content)

    # Save the parsed content to a PDF file.
    pdf_filename = 'output.pdf'
    save_text_to_pdf(page_content, pdf_filename)

    return text_filename, pdf_filename
39
+
40
def save_text_to_pdf(text, filename):
    """Write plain text into a letter-sized PDF, top to bottom.

    The text is split on newlines and each line is drawn on its own row.
    Lines that fit inside the margins are drawn unchanged; lines that are
    too wide are word-wrapped so they no longer run off the right edge of
    the page. A new page is started whenever the bottom margin is reached.

    Args:
        text: The text to render.
        filename: Path of the PDF file to create.
    """
    # Imported locally so this function brings its own dependency with it.
    from reportlab.lib.utils import simpleSplit

    font_name = 'Helvetica'
    font_size = 12
    margin = 40
    line_height = 12

    c = canvas.Canvas(filename, pagesize=letter)
    width, height = letter
    max_width = width - 2 * margin

    # Select the font explicitly so the widths measured below always match
    # the font that is actually used for drawing.
    c.setFont(font_name, font_size)

    y = height - margin
    for line in text.split('\n'):
        if c.stringWidth(line, font_name, font_size) <= max_width:
            segments = [line]
        else:
            # simpleSplit breaks on whitespace; fall back to the raw line if
            # it yields nothing so no row is silently dropped.
            segments = simpleSplit(line, font_name, font_size, max_width) or [line]

        for segment in segments:
            if y < margin:
                c.showPage()
                # A new page starts with a fresh graphics state.
                c.setFont(font_name, font_size)
                y = height - margin
            c.drawString(margin, y, segment)
            y -= line_height

    # Save the PDF file.
    c.save()
62
+
63
def gradio_process(url):
    """Gradio callback: convert the page at ``url`` into downloadable files.

    Args:
        url: Address entered by the user in the interface.

    Returns:
        A ``(text_file, pdf_file)`` tuple of file paths for the two file
        outputs of the interface.

    Raises:
        gr.Error: If the page could not be retrieved. Gradio displays the
            message to the user.
    """
    text_file, pdf_file = process_html_from_url(url)
    if not (text_file and pdf_file):
        # Both outputs are gr.File components, which expect file paths.
        # Returning an error message string in their place would be treated
        # as a path, so report the failure through Gradio's error mechanism.
        raise gr.Error("Failed to retrieve HTML content")
    return text_file, pdf_file
70
+
71
# Create the Gradio interface: one URL text box in, two downloadable files out.
iface = gr.Interface(
    fn=gradio_process,
    inputs=gr.Textbox(label="Enter the URL to process"),
    outputs=[
        gr.File(label="Text File"),
        gr.File(label="PDF File"),
    ],
    title="HTML Content Processor",
    description="Enter a URL to download and process its HTML content. You can download the resulting text and PDF files.",
)

# Launch the Gradio app only when this file is executed as a script, so that
# importing the module does not start a blocking web server.
if __name__ == "__main__":
    iface.launch(debug=True)
85
+