# Project / app.py
# JEET1812's picture
# Update app.py
# b6e77a8
import requests
import xml.etree.ElementTree as ET
from openpyxl import Workbook
import gradio as gr
import base64
def extract_links_from_xml(xml_content):
    """Return the URLs found in the ``<loc>`` elements of a sitemap document.

    Args:
        xml_content: Sitemap XML as ``str`` or ``bytes``.

    Returns:
        A list of URL strings in document order, with surrounding
        whitespace removed. ``<loc>`` elements that are empty or contain
        only whitespace are skipped.

    Raises:
        xml.etree.ElementTree.ParseError: If ``xml_content`` is not
            well-formed XML.
    """
    root = ET.fromstring(xml_content)
    # Only <loc> elements in the sitemaps.org 0.9 namespace are matched.
    loc_tag = '{http://www.sitemaps.org/schemas/sitemap/0.9}loc'
    urls = []
    for loc in root.iter(loc_tag):
        # ``loc.text`` is None for an empty element such as <loc/>; calling
        # .strip() on it directly would raise AttributeError.
        url = (loc.text or '').strip()
        if url:
            urls.append(url)
    return urls
def check_sitemap(sitemap_url, timeout=10):
    """Fetch a sitemap, request every URL it lists, and save the results.

    The results are written to ``sitemap_responses.xlsx`` in the current
    working directory, one row per URL: column A holds the URL and column B
    holds either the HTTP status code or, when the request itself failed,
    a short ``Error: <ExceptionName>`` marker.

    If the sitemap request does not return HTTP 200, an error is printed
    and no file is written.

    Args:
        sitemap_url: URL of the sitemap XML document.
        timeout: Seconds to wait for each HTTP request before giving up.

    Raises:
        requests.RequestException: If the sitemap itself cannot be
            requested (connection failure, invalid URL, timeout).
        xml.etree.ElementTree.ParseError: If the sitemap body is not
            well-formed XML.
    """
    # Without a timeout a single unresponsive host would block forever.
    response = requests.get(sitemap_url, timeout=timeout)
    if response.status_code != 200:
        print(f"Error retrieving sitemap: {response.status_code}")
        return

    urls = extract_links_from_xml(response.content)

    # Create a new Excel workbook and write the header row.
    workbook = Workbook()
    sheet = workbook.active
    sheet['A1'] = 'URL'
    sheet['B1'] = 'Response'

    # Data rows start at row 2, directly below the header.
    for row, url in enumerate(urls, start=2):
        try:
            result = requests.get(url, timeout=timeout).status_code
        except requests.RequestException as exc:
            # One unreachable or malformed URL must not abort the whole
            # check; record the failure in the sheet and continue.
            result = f"Error: {type(exc).__name__}"
        sheet.cell(row=row, column=1, value=url)
        sheet.cell(row=row, column=2, value=result)

    # Save the workbook as an Excel file.
    workbook.save('sitemap_responses.xlsx')
    print("Excel file generated successfully.")
def download_xml(url):
    """Run the sitemap check for ``url`` and return an HTML download link.

    Args:
        url: URL of the sitemap to check.

    Returns:
        An HTML string. On success it is an ``<a>`` element whose ``href``
        is a base64 ``data:`` URI embedding ``sitemap_responses.xlsx``.
        If the check produced no report file, it is a short ``<p>`` error
        message instead.
    """
    import os  # local import so this block does not rely on file-level imports

    report_path = "sitemap_responses.xlsx"

    # Remove any report left over from a previous run; otherwise a failed
    # check would silently serve stale results for a different sitemap.
    # NOTE(review): the fixed file name is shared by all callers, so
    # concurrent requests may still interfere with each other.
    try:
        os.remove(report_path)
    except FileNotFoundError:
        pass

    check_sitemap(url)

    # check_sitemap writes nothing when the sitemap request is not HTTP 200.
    if not os.path.exists(report_path):
        return "<p>Could not generate the report: the sitemap could not be retrieved.</p>"

    with open(report_path, "rb") as f:
        data = f.read()
    base64_data = base64.b64encode(data).decode("utf-8")
    # The link label names the actual payload (an Excel workbook, not XML).
    return (
        f'<a href="data:application/octet-stream;base64,{base64_data}" '
        f'download="sitemap_responses.xlsx">Download Excel report</a>'
    )
# Gradio interface: one text input (the sitemap URL); the HTML string
# returned by download_xml is rendered as the output.
# The "html" shortcut is used rather than gr.outputs.HTML() because the
# legacy gr.outputs namespace is deprecated in Gradio 3.x and no longer
# present in Gradio 4.x; it also mirrors the inputs="text" shortcut.
iface = gr.Interface(download_xml, inputs="text", outputs="html", title="Sitemap Checker")
iface.launch()