bonrix committed on
Commit
32c0a08
·
1 Parent(s): af8088f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +15 -7
app.py CHANGED
@@ -97,10 +97,12 @@ def crawl_website(url):
97
  xml_str = "\n".join([line for line in xml_str.split("\n") if line.strip()])
98
 
99
  # Write the XML string to a file
100
- with open("sitemap.xml", "w") as file:
 
101
  file.write(xml_str)
102
 
103
- return "sitemap.xml"
 
104
 
105
  def extract_text_from_sitemap(sitemap_file):
106
  with open(sitemap_file, 'r') as file:
@@ -125,18 +127,24 @@ def extract_text_from_sitemap(sitemap_file):
125
 
126
  return extracted_text
127
 
 
128
  def gradio_interface(url):
129
  sitemap_file = crawl_website(url)
130
  extracted_text = extract_text_from_sitemap(sitemap_file)
131
- text_file_path = 'extracted_text.txt'
132
 
 
 
133
  with open(text_file_path, 'w', encoding='utf-8') as file:
134
  file.write(extracted_text)
135
 
136
  return "\n".join(crawl_website.progress_textbox), text_file_path
137
 
138
 
139
- with gr.Interface(fn=gradio_interface, inputs="text", outputs=["text", "file"],
140
- title="Website Crawler",
141
- description="Enter a website URL to crawl and extract text from web pages.") as iface:
142
- iface.launch()
 
 
 
 
 
97
  xml_str = "\n".join([line for line in xml_str.split("\n") if line.strip()])
98
 
99
  # Write the XML string to a file
100
+ sitemap_file = "sitemap.xml"
101
+ with open(sitemap_file, "w") as file:
102
  file.write(xml_str)
103
 
104
+ return sitemap_file
105
+
106
 
107
  def extract_text_from_sitemap(sitemap_file):
108
  with open(sitemap_file, 'r') as file:
 
127
 
128
  return extracted_text
129
 
130
+
131
  def gradio_interface(url):
132
  sitemap_file = crawl_website(url)
133
  extracted_text = extract_text_from_sitemap(sitemap_file)
 
134
 
135
+ # Save the extracted text to a file
136
+ text_file_path = 'extracted_text.txt'
137
  with open(text_file_path, 'w', encoding='utf-8') as file:
138
  file.write(extracted_text)
139
 
140
  return "\n".join(crawl_website.progress_textbox), text_file_path
141
 
142
 
143
+ iface = gr.Interface(
144
+ fn=gradio_interface,
145
+ inputs="text",
146
+ outputs=["text", "file"],
147
+ title="Website Crawler",
148
+ description="Enter a website URL to crawl and extract text from web pages."
149
+ )
150
+ iface.launch()