# Spaces: Sleeping
# File size: 4,192 Bytes
import requests
from gradio import Blocks, Button, Textbox, HTML
from urllib.parse import urljoin, urlparse
import re
# Function to get meta tags from a URL
def _fetch_optional_text(resource_url, timeout):
    """Return the body of *resource_url* on HTTP 200, otherwise ``None``.

    Network failures are treated as "resource not available" so that a flaky
    robots.txt or sitemap.xml cannot discard the data already collected from
    the main page.
    """
    try:
        response = requests.get(resource_url, timeout=timeout)
    except requests.RequestException:
        return None
    if response.status_code != 200:
        return None
    return response.text


def get_meta_tags(url, timeout=10):
    """Fetch a page and collect its meta tags, favicon, robots.txt and sitemap.xml.

    Args:
        url: Absolute URL of the page to inspect.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        On success, a dict with the keys ``meta_tags``, ``favicon``,
        ``robots_txt``, ``sitemap_xml``, ``robots_txt_content`` and
        ``sitemap_xml_content``. On failure (network error or a non-200
        response for the page itself), a dict ``{"error": message}``.
    """
    try:
        # Without a timeout a stalled server would block the caller forever.
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Always hand back a dict so callers never have to deal with None.
            return {
                "error": f"Request to {url} returned HTTP status {response.status_code}"
            }

        html = response.text
        data = {
            "meta_tags": parse_meta_tags(html),
            "favicon": extract_favicon(html, url),
            "robots_txt": False,
            "sitemap_xml": False,
            "robots_txt_content": "",
            "sitemap_xml_content": "",
        }

        # Both files are looked up at the root of the site that served `url`.
        robots_text = _fetch_optional_text(urljoin(url, "/robots.txt"), timeout)
        if robots_text is not None:
            data["robots_txt"] = True
            data["robots_txt_content"] = robots_text

        sitemap_text = _fetch_optional_text(urljoin(url, "/sitemap.xml"), timeout)
        if sitemap_text is not None:
            data["sitemap_xml"] = True
            data["sitemap_xml_content"] = sitemap_text

        return data
    except Exception as e:
        # Top-level boundary for the UI: report the failure instead of raising.
        print(f"An error occurred: {e}")
        return {"error": str(e)}
# Function to parse meta tags from HTML
def parse_meta_tags(html):
    """Extract ``<meta>`` tags from an HTML string.

    Each tag is reduced to one entry: the tag's ``name``, ``property``,
    ``http-equiv`` or ``itemprop`` value becomes the key and its ``content``
    becomes the value. A ``<meta charset="...">`` tag is stored under the key
    ``"charset"``. When the same key occurs more than once, the last one wins.

    Args:
        html: HTML source text.

    Returns:
        A dict mapping meta tag identifiers to their content strings; empty
        when no usable meta tag is found.
    """
    # Tag and attribute names are case-insensitive in HTML.
    tag_pattern = r'<meta\b([^>]*)>'
    # Double- and single-quoted values are matched separately so that a value
    # may contain the other kind of quote (e.g. content="It's here").
    attr_pattern = r'([\w:-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')'

    meta_tags = {}
    for attr_text in re.findall(tag_pattern, html, flags=re.IGNORECASE):
        attrs = {
            name.lower(): double_quoted or single_quoted
            for name, double_quoted, single_quoted in re.findall(attr_pattern, attr_text)
        }
        key = (
            attrs.get("name")
            or attrs.get("property")
            or attrs.get("http-equiv")
            or attrs.get("itemprop")
        )
        if key and "content" in attrs:
            meta_tags[key] = attrs["content"]
        elif "charset" in attrs:
            meta_tags["charset"] = attrs["charset"]
    return meta_tags
# Function to extract favicon URL from the HTML
def extract_favicon(html, base_url):
    """Return the absolute URL of the page's favicon, or ``None`` if absent.

    Scans every ``<link>`` tag and accepts the first one whose ``rel``
    attribute contains the token ``icon`` (so both ``rel="icon"`` and
    ``rel="shortcut icon"`` match), regardless of attribute order or case.

    Args:
        html: HTML source text of the page.
        base_url: URL the page was loaded from; used to resolve relative hrefs.

    Returns:
        The favicon URL resolved against *base_url*, or ``None`` when no
        matching ``<link>`` tag with an ``href`` exists.
    """
    tag_pattern = r'<link\b([^>]*)>'
    attr_pattern = r'([\w:-]+)\s*=\s*(?:"([^"]*)"|\'([^\']*)\')'

    for attr_text in re.findall(tag_pattern, html, flags=re.IGNORECASE):
        attrs = {
            name.lower(): double_quoted or single_quoted
            for name, double_quoted, single_quoted in re.findall(attr_pattern, attr_text)
        }
        rel_tokens = attrs.get("rel", "").lower().split()
        href = attrs.get("href")
        if "icon" in rel_tokens and href:
            # urljoin leaves absolute URLs untouched and resolves relative and
            # protocol-relative ones, so no scheme check is needed.
            return urljoin(base_url, href)
    return None
# Function to format the result output
def format_output(result):
    """Render the dict produced by ``get_meta_tags`` as an HTML fragment.

    All text that originates from the inspected site (meta tag names and
    values, the favicon URL, robots.txt and sitemap.xml bodies, error text) is
    HTML-escaped, so remote content is displayed literally instead of being
    interpreted as markup.

    Args:
        result: Either ``{"error": message}`` or a dict with the keys
            ``meta_tags``, ``favicon``, ``robots_txt``, ``sitemap_xml``,
            ``robots_txt_content`` and ``sitemap_xml_content``.

    Returns:
        An HTML string, or an ``"Error: ..."`` message when *result* is empty
        or describes a failure.
    """
    # Function-scope import keeps this block self-contained.
    from html import escape

    if not result:
        return "Error: No data was returned for this URL"
    if "error" in result:
        return f"Error: {escape(str(result['error']))}"

    parts = ["<h3>Meta Tags</h3>"]
    for key, value in result["meta_tags"].items():
        parts.append(f"<strong>{escape(str(key))}</strong>: {escape(str(value))}<br>")

    favicon = result["favicon"]
    if favicon:
        # The src attribute is single-quoted, so quotes must be escaped too
        # (escape() does this by default).
        parts.append(
            f"<h3>Favicon</h3><img src='{escape(str(favicon))}' alt='Favicon' "
            "style='width:50px;height:50px;'><br>"
        )
    else:
        parts.append("<h3>Favicon</h3><p>Missing</p><br>")

    if result["robots_txt"]:
        parts.append("<h3>robots.txt</h3><p>Found</p><br>")
        parts.append(f"<pre>{escape(str(result['robots_txt_content']))}</pre><br>")
    else:
        parts.append("<h3>robots.txt</h3><p>Not found</p><br>")

    if result["sitemap_xml"]:
        parts.append("<h3>sitemap.xml</h3><p>Found</p><br>")
        # Unescaped XML would be swallowed by the browser as unknown tags.
        parts.append(f"<pre>{escape(str(result['sitemap_xml_content']))}</pre><br>")
    else:
        parts.append("<h3>sitemap.xml</h3><p>Not found</p><br>")

    return "".join(parts)
# Gradio Interface
def get_meta_tags_ui():
    """Build the Gradio Blocks app: a URL textbox, a lookup button and an HTML result pane.

    Returns:
        The constructed ``Blocks`` object; the caller is responsible for
        launching it.
    """
    with Blocks() as app:
        # Components are created in display order: input, trigger, output.
        url_box = Textbox(label="Enter URL", placeholder="https://example.com")
        fetch_btn = Button("Get Meta Tags from URL")
        results_pane = HTML()

        def update(value):
            # An empty textbox produces no result.
            if not value:
                return None
            return format_output(get_meta_tags(value))

        # Clicking the button sends the textbox value through `update` and
        # shows the returned HTML in the result pane.
        fetch_btn.click(fn=update, inputs=url_box, outputs=results_pane)

    return app
# Run the interface
if __name__ == "__main__":
    # Runs only when this file is executed as a script: build the UI and launch it.
    interface = get_meta_tags_ui()
    interface.launch()
|