import requests
from gradio import Blocks, Button, Textbox, HTML
from urllib.parse import urljoin, urlparse
import re
# Function to get meta tags from a URL
def get_meta_tags(url, timeout=10):
    """Fetch *url* and collect basic SEO information about the page.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` waits indefinitely on a
            stalled server.

    Returns:
        On success, a dict with these keys:

        * ``"meta_tags"`` - result of ``parse_meta_tags`` on the page HTML.
        * ``"favicon"`` - result of ``extract_favicon`` for the page.
        * ``"robots_txt"`` / ``"sitemap_xml"`` - ``True`` when the
          corresponding file answered with HTTP 200.
        * ``"robots_txt_content"`` / ``"sitemap_xml_content"`` - body of
          that file, or ``""`` when it was not found.

        On any failure (network error, non-200 status for the page
        itself, parsing error), a dict ``{"error": <message>}``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Report the failure explicitly instead of falling through
            # without a usable result.
            return {
                "error": f"Request to {url} returned HTTP status "
                         f"{response.status_code}"
            }

        html = response.text
        data = {
            "meta_tags": parse_meta_tags(html),
            "favicon": extract_favicon(html, url),
            "robots_txt": False,
            "sitemap_xml": False,
            "robots_txt_content": "",
            "sitemap_xml_content": "",
        }

        # Both files live at the site root; the leading "/" makes urljoin
        # discard any path component of the inspected page.
        for key, path in (
            ("robots_txt", "/robots.txt"),
            ("sitemap_xml", "/sitemap.xml"),
        ):
            probe = requests.get(urljoin(url, path), timeout=timeout)
            if probe.status_code == 200:
                data[key] = True
                data[f"{key}_content"] = probe.text

        return data
    except Exception as e:
        # Top-level boundary: callers expect an error dict, never a raise.
        print(f"An error occurred: {e}")
        return {"error": str(e)}
# Function to parse meta tags from HTML
def parse_meta_tags(html):
    """Extract ``<meta>`` tag information from an HTML document.

    Each tag is reduced to one ``identifier -> content`` entry, where the
    identifier is the value of the tag's ``name``, ``property``,
    ``http-equiv`` or ``itemprop`` attribute (first one present, in that
    order). A ``<meta charset=...>`` tag is stored under the key
    ``"charset"``. Tags with neither form are ignored. If the same
    identifier occurs more than once, the last occurrence wins.

    Args:
        html: HTML source text.

    Returns:
        A dict mapping meta identifiers to their content strings; empty
        when the document has no usable ``<meta>`` tags.
    """
    meta_tags = {}
    # Tag names are case-insensitive in HTML; \b keeps e.g. <metadata>
    # from matching.
    for raw_attrs in re.findall(r'<meta\b([^>]*)>', html, flags=re.IGNORECASE):
        # Double- and single-quoted values are matched separately so that
        # an apostrophe inside a double-quoted value does not end it.
        attrs = {
            name.lower(): double_quoted or single_quoted
            for name, double_quoted, single_quoted in re.findall(
                r'([\w:.-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')', raw_attrs
            )
        }

        if "charset" in attrs:
            meta_tags["charset"] = attrs["charset"]
            continue

        identifier = next(
            (
                attrs[key]
                for key in ("name", "property", "http-equiv", "itemprop")
                if key in attrs
            ),
            None,
        )
        if identifier and "content" in attrs:
            meta_tags[identifier] = attrs["content"]
    return meta_tags
# Function to extract favicon URL from the HTML
def extract_favicon(html, base_url):
# Look for the favicon in the HTML
match = re.search(r'{key}: {value}
"
if result['favicon']:
output += f"
Missing
Found
{result['robots_txt_content']}Not found
Found
{result['sitemap_xml_content']}Not found