Spaces:
Sleeping
Sleeping
| import requests | |
| from gradio import Blocks, Button, Textbox, HTML | |
| from urllib.parse import urljoin, urlparse | |
| import re | |
| # Function to get meta tags from a URL | |
def get_meta_tags(url, timeout=10):
    """Fetch a page and collect its meta tags, favicon, robots.txt and sitemap.xml.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait on each HTTP request before giving up.
            Without a timeout ``requests`` waits indefinitely, which would
            freeze the UI on an unresponsive host.

    Returns:
        On success, a dict with the keys ``meta_tags``, ``favicon``,
        ``robots_txt``, ``sitemap_xml``, ``robots_txt_content`` and
        ``sitemap_xml_content``. On any failure (network error, invalid URL,
        non-200 response for the page itself) a dict ``{"error": message}``.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Report the failure explicitly so callers always receive a dict
            # they can render, never a missing result.
            return {
                "error": f"Request to {url} returned HTTP status {response.status_code}"
            }

        html = response.text
        data = {
            "meta_tags": parse_meta_tags(html),
            "favicon": extract_favicon(html, url),
            "robots_txt": False,
            "sitemap_xml": False,
            "robots_txt_content": "",
            "sitemap_xml_content": "",
        }

        # Both well-known files live at the site root, so resolve them with a
        # leading slash relative to the requested URL.
        well_known_files = (
            ("robots_txt", "/robots.txt"),
            ("sitemap_xml", "/sitemap.xml"),
        )
        for flag, path in well_known_files:
            file_response = requests.get(urljoin(url, path), timeout=timeout)
            if file_response.status_code == 200:
                data[flag] = True
                data[f"{flag}_content"] = file_response.text

        return data
    except Exception as e:
        # This function is the boundary between the network and the UI: every
        # failure is converted into an error dict rather than propagated.
        print(f"An error occurred: {e}")
        return {"error": str(e)}
| # Function to parse meta tags from HTML | |
def parse_meta_tags(html):
    """Return the document's ``<meta>`` tags as a ``{name: content}`` dict.

    Each tag is keyed by the first non-empty attribute among ``name``,
    ``property``, ``http-equiv`` and ``itemprop``, and mapped to its
    ``content`` value. A ``<meta charset=...>`` declaration is stored under
    the key ``"charset"``. When the same key occurs more than once the last
    occurrence wins.

    Args:
        html: HTML source of the page; ``None`` or an empty string yields an
            empty dict.

    Returns:
        A dict mapping meta tag identifiers to their values.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    from html.parser import HTMLParser

    key_attributes = ("name", "property", "http-equiv", "itemprop")

    class _MetaCollector(HTMLParser):
        """Collect identifier/content pairs from every <meta> start tag."""

        def __init__(self):
            super().__init__()
            self.found = {}

        def handle_starttag(self, tag, attrs):
            # HTMLParser lower-cases tag and attribute names, so uppercase
            # markup such as <META NAME=...> is handled as well.
            if tag != "meta":
                return
            attributes = dict(attrs)

            charset = attributes.get("charset")
            if charset:
                self.found["charset"] = charset

            key = next(
                (attributes[name] for name in key_attributes if attributes.get(name)),
                None,
            )
            content = attributes.get("content")
            if key and content is not None:
                self.found[key] = content

    if not html:
        return {}

    collector = _MetaCollector()
    collector.feed(html)
    collector.close()
    return collector.found
| # Function to extract favicon URL from the HTML | |
def extract_favicon(html, base_url):
    """Return the absolute URL of the page's favicon, or ``None`` if absent.

    The first ``<link>`` element whose ``rel`` attribute contains the token
    ``icon`` (for example ``rel="icon"`` or ``rel="shortcut icon"``) and that
    has a non-empty ``href`` is used, regardless of attribute order or letter
    case.

    Args:
        html: HTML source of the page.
        base_url: URL the page was fetched from; relative ``href`` values are
            resolved against it.

    Returns:
        The favicon URL as a string, or ``None`` when no matching link exists.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    from html.parser import HTMLParser

    class _IconFinder(HTMLParser):
        """Remember the href of the first <link> whose rel includes 'icon'."""

        def __init__(self):
            super().__init__()
            self.href = None

        def handle_starttag(self, tag, attrs):
            if self.href is not None or tag != "link":
                return
            attributes = dict(attrs)
            rel_tokens = (attributes.get("rel") or "").lower().split()
            href = (attributes.get("href") or "").strip()
            if "icon" in rel_tokens and href:
                self.href = href

    if not html:
        return None

    finder = _IconFinder()
    finder.feed(html)
    finder.close()
    if finder.href is None:
        return None

    # urljoin leaves absolute URLs untouched and resolves relative and
    # protocol-relative ones, so no manual scheme check is needed.
    return urljoin(base_url, finder.href)
| # Function to format the result output | |
def format_output(result):
    """Render the result of a page lookup as an HTML fragment.

    All text that originates from the inspected site (meta tag names and
    values, the favicon URL, robots.txt, sitemap.xml and error messages) is
    HTML-escaped before being embedded, so it is displayed literally instead
    of being interpreted as markup.

    Args:
        result: Either a dict with an ``error`` key, or a dict with the keys
            ``meta_tags``, ``favicon``, ``robots_txt``, ``sitemap_xml``,
            ``robots_txt_content`` and ``sitemap_xml_content``. ``None`` or an
            empty dict is reported as an error.

    Returns:
        A string of HTML, or a plain ``"Error: ..."`` message.
    """
    # Imported locally so this function does not depend on a module-level
    # import being present.
    from html import escape

    if not result:
        return "Error: no data was returned for this URL"
    if "error" in result:
        return f"Error: {escape(str(result['error']))}"

    parts = ["<h3>Meta Tags</h3>"]
    parts.extend(
        f"<strong>{escape(str(key))}</strong>: {escape(str(value))}<br>"
        for key, value in result["meta_tags"].items()
    )

    if result['favicon']:
        # quote=True also escapes the single quote that delimits the attribute.
        favicon_src = escape(str(result['favicon']), quote=True)
        parts.append(
            f"<h3>Favicon</h3><img src='{favicon_src}' alt='Favicon' style='width:50px;height:50px;'><br>"
        )
    else:
        parts.append("<h3>Favicon</h3><p>Missing</p><br>")

    if result['robots_txt']:
        parts.append("<h3>robots.txt</h3><p>Found</p><br>")
        parts.append(f"<pre>{escape(result['robots_txt_content'])}</pre><br>")
    else:
        parts.append("<h3>robots.txt</h3><p>Not found</p><br>")

    if result['sitemap_xml']:
        parts.append("<h3>sitemap.xml</h3><p>Found</p><br>")
        # Escaping is what makes the XML visible: unescaped, its tags would be
        # treated as (unknown) HTML elements and not shown.
        parts.append(f"<pre>{escape(result['sitemap_xml_content'])}</pre><br>")
    else:
        parts.append("<h3>sitemap.xml</h3><p>Not found</p><br>")

    return "".join(parts)
| # Gradio Interface | |
def get_meta_tags_ui():
    """Build the Gradio Blocks interface for the meta tag lookup.

    The interface consists of a URL text box, a button and an HTML output
    area; clicking the button fetches the URL and shows the formatted result.

    Returns:
        The constructed ``Blocks`` instance (not yet launched).
    """
    with Blocks() as interface:
        # Input element to enter the URL
        url_input = Textbox(label="Enter URL", placeholder="https://example.com")
        # Button to trigger the meta tags fetch
        lookup_button = Button("Get Meta Tags from URL")
        # HTML output area for the results
        html_output = HTML()

        def update(value):
            """Fetch the entered URL and return the HTML to display."""
            url = (value or "").strip()
            if not url:
                # Tell the user what is missing instead of silently returning
                # nothing for empty or whitespace-only input.
                return "<p>Please enter a URL.</p>"
            result = get_meta_tags(url)
            if result is None:
                # Guard against a missing result so format_output is never
                # handed None.
                return "Error: the page could not be fetched"
            return format_output(result)

        # Link button click with the update function
        lookup_button.click(fn=update, inputs=url_input, outputs=html_output)
    return interface
| # Run the interface | |
if __name__ == "__main__":
    # Executed only when the file is run as a script: build the Blocks app
    # and start the Gradio server.
    get_meta_tags_ui().launch()