File size: 4,192 Bytes
299b271
 
bb681cb
 
299b271
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bb681cb
 
 
299b271
bb681cb
 
 
 
 
 
299b271
bb681cb
 
 
 
 
 
299b271
bb681cb
299b271
bb681cb
 
 
299b271
bb681cb
 
299b271
bb681cb
 
 
 
 
 
299b271
 
bb681cb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299b271
bb681cb
 
 
299b271
bb681cb
 
299b271
bb681cb
 
299b271
bb681cb
 
 
 
 
299b271
bb681cb
 
299b271
 
 
bb681cb
299b271
bb681cb
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import requests
from gradio import Blocks, Button, Textbox, HTML
from urllib.parse import urljoin, urlparse
import re

# Function to get meta tags from a URL
def get_meta_tags(url, timeout=10):
    """Fetch *url* and collect basic SEO information about the page.

    The page itself is requested first; when it answers with HTTP 200 its
    HTML is passed to ``parse_meta_tags`` and ``extract_favicon``, and the
    site's ``/robots.txt`` and ``/sitemap.xml`` are probed as well.

    Args:
        url: Address of the page to inspect.
        timeout: Seconds to wait for each HTTP request before giving up.
            Without a timeout ``requests`` may block indefinitely on an
            unresponsive host.

    Returns:
        On success, a dict with the keys ``meta_tags``, ``favicon``,
        ``robots_txt``, ``sitemap_xml``, ``robots_txt_content`` and
        ``sitemap_xml_content``.  On any failure (exception or a non-200
        status for the main page), a dict with the single key ``error``
        holding a human-readable message.  The function never returns
        ``None`` and never raises.
    """
    try:
        response = requests.get(url, timeout=timeout)

        # A non-200 answer used to fall through and return None, which the
        # caller cannot format; report it explicitly instead.
        if response.status_code != 200:
            return {
                "error": f"Request to {url} returned HTTP status {response.status_code}"
            }

        page_html = response.text
        data = {
            "meta_tags": parse_meta_tags(page_html),
            "favicon": extract_favicon(page_html, url),
            "robots_txt": False,
            "sitemap_xml": False,
            "robots_txt_content": "",
            "sitemap_xml_content": "",
        }

        # Both auxiliary files live at the site root and are handled the same
        # way: a 200 response marks the file as present and stores its body.
        for key, path in (("robots_txt", "/robots.txt"), ("sitemap_xml", "/sitemap.xml")):
            extra = requests.get(urljoin(url, path), timeout=timeout)
            if extra.status_code == 200:
                data[key] = True
                data[f"{key}_content"] = extra.text

        return data

    except Exception as e:
        # Deliberately broad: this is the boundary to the UI, which expects an
        # error dict rather than an exception for any kind of failure.
        print(f"An error occurred: {e}")
        return {"error": str(e)}

# Function to parse meta tags from HTML
def parse_meta_tags(html):
    """Extract ``<meta>`` tag information from an HTML document.

    Every ``<meta>`` tag carrying a ``content`` attribute is stored under the
    value of its identifying attribute -- the first one present among
    ``name``, ``property``, ``http-equiv`` and ``itemprop``.  A ``charset``
    attribute is stored under the key ``"charset"``.  When the same
    identifier occurs more than once, the last occurrence wins.

    Keying by the identifier (rather than by the raw attribute name) keeps
    one entry per meta tag; keying by attribute name would collapse all tags
    into a single ``name``/``content`` pair.

    Args:
        html: HTML text to scan; ``None`` or an empty string yields ``{}``.

    Returns:
        A dict mapping meta identifiers to their content strings.
    """
    # Imported locally so the block is self-contained; the ``html`` parameter
    # shadows the stdlib package name only as a local variable.
    from html.parser import HTMLParser

    class _MetaCollector(HTMLParser):
        """Accumulate identifier -> content pairs from <meta> start tags."""

        def __init__(self):
            super().__init__()
            self.found = {}

        def handle_starttag(self, tag, attrs):
            # HTMLParser lower-cases tag and attribute names, so upper-case
            # markup such as <META NAME=...> is handled as well.
            if tag != "meta":
                return
            attr_map = {name: value for name, value in attrs if value is not None}
            if "charset" in attr_map:
                self.found["charset"] = attr_map["charset"]
            content = attr_map.get("content")
            if content is None:
                return
            for key_attr in ("name", "property", "http-equiv", "itemprop"):
                if key_attr in attr_map:
                    self.found[attr_map[key_attr]] = content
                    break

    if not html:
        return {}

    collector = _MetaCollector()
    collector.feed(html)
    collector.close()
    return collector.found

# Function to extract favicon URL from the HTML
def extract_favicon(html, base_url):
    """Return the absolute URL of the page's favicon, or ``None``.

    The first ``<link>`` tag whose ``rel`` attribute contains the token
    ``icon`` (case-insensitive, so ``rel="shortcut icon"`` also matches) and
    which has a non-empty ``href`` is used.  Attribute order and additional
    attributes on the tag do not matter.

    Args:
        html: HTML text of the page; ``None`` or empty yields ``None``.
        base_url: URL the page was loaded from, used to resolve a relative
            ``href``.

    Returns:
        The favicon URL resolved against *base_url*, or ``None`` when the
        document declares no icon link.
    """
    # Imported locally so the block is self-contained.
    from html.parser import HTMLParser

    class _IconFinder(HTMLParser):
        """Remember the href of the first <link rel="... icon ..."> tag."""

        def __init__(self):
            super().__init__()
            self.href = None

        def handle_starttag(self, tag, attrs):
            # Only the first matching link is kept; later ones are ignored.
            if self.href is not None or tag != "link":
                return
            attr_map = {name: value for name, value in attrs if value}
            rel_tokens = attr_map.get("rel", "").lower().split()
            href = attr_map.get("href", "").strip()
            if "icon" in rel_tokens and href:
                self.href = href

    if not html:
        return None

    finder = _IconFinder()
    finder.feed(html)
    finder.close()
    if finder.href is None:
        return None

    # urljoin leaves absolute URLs untouched and resolves relative and
    # protocol-relative ones, so no manual "starts with http" test is needed.
    return urljoin(base_url, finder.href)

# Function to format the result output
def format_output(result):
    """Render a lookup result as an HTML fragment.

    All text taken from *result* is HTML-escaped before being embedded: it
    originates from a remote site, and unescaped markup would either be
    interpreted by the browser or (for sitemap XML inside ``<pre>``) simply
    not be displayed.

    Args:
        result: Either a dict with an ``error`` key, or a dict with the keys
            ``meta_tags``, ``favicon``, ``robots_txt``, ``sitemap_xml``,
            ``robots_txt_content`` and ``sitemap_xml_content``.  ``None`` or
            an empty dict is reported as an error instead of raising.

    Returns:
        A string of HTML: an ``Error: ...`` line, or sections for the meta
        tags, favicon, robots.txt and sitemap.xml.
    """
    # Imported locally so the block is self-contained.
    from html import escape

    if not result:
        return "Error: No data was retrieved."
    if "error" in result:
        return f"Error: {escape(str(result['error']))}"

    # Collect fragments and join once instead of repeated string +=.
    parts = ["<h3>Meta Tags</h3>"]
    parts.extend(
        f"<strong>{escape(str(key))}</strong>: {escape(str(value))}<br>"
        for key, value in result["meta_tags"].items()
    )

    favicon = result["favicon"]
    if favicon:
        # escape() also encodes quotes, so the value cannot break out of the
        # single-quoted src attribute.
        favicon_src = escape(str(favicon))
        parts.append(
            f"<h3>Favicon</h3><img src='{favicon_src}' alt='Favicon' style='width:50px;height:50px;'><br>"
        )
    else:
        parts.append("<h3>Favicon</h3><p>Missing</p><br>")

    # robots.txt and sitemap.xml share the same found / not-found layout.
    for title, found_key, content_key in (
        ("robots.txt", "robots_txt", "robots_txt_content"),
        ("sitemap.xml", "sitemap_xml", "sitemap_xml_content"),
    ):
        if result[found_key]:
            body = escape(str(result[content_key]))
            parts.append(f"<h3>{title}</h3><p>Found</p><br>")
            parts.append(f"<pre>{body}</pre><br>")
        else:
            parts.append(f"<h3>{title}</h3><p>Not found</p><br>")

    return "".join(parts)

# Gradio Interface
def get_meta_tags_ui():
    """Assemble the Gradio Blocks app for the meta-tag lookup.

    The layout consists of a URL text box, a button and an HTML output area,
    created in that order.  Clicking the button passes the text box value to
    ``update`` and shows its return value in the HTML area.

    Returns:
        The constructed ``Blocks`` object (not yet launched).
    """
    with Blocks() as app:
        # Widgets, in display order: URL entry, trigger button, result pane.
        url_box = Textbox(label="Enter URL", placeholder="https://example.com")
        fetch_btn = Button("Get Meta Tags from URL")
        results_pane = HTML()

        def update(value):
            # An empty value yields None; otherwise look the URL up with
            # get_meta_tags and render the result with format_output.
            if not value:
                return None
            return format_output(get_meta_tags(value))

        # Wire the click event: text box -> update -> HTML pane.
        fetch_btn.click(fn=update, inputs=url_box, outputs=results_pane)

    return app

# Script entry point: build the Gradio interface and launch it, but only when
# this file is executed directly rather than imported as a module.
if __name__ == "__main__":
    interface = get_meta_tags_ui()
    # launch() is called without arguments here.
    interface.launch()