sudo-soldier committed on
Commit
bb681cb
·
verified ·
1 Parent(s): 299b271

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +86 -91
app.py CHANGED
@@ -1,5 +1,7 @@
1
  import requests
2
  from gradio import Blocks, Button, Textbox, HTML
 
 
3
 
4
  # Function to get meta tags from a URL
5
  def get_meta_tags(url):
@@ -21,111 +23,104 @@ def get_meta_tags(url):
21
  html = response.text
22
  data['meta_tags'] = parse_meta_tags(html)
23
 
24
- sitemap_xml_link_tag = response.json().get("robot", {}).get("sitemap")
25
-
26
- except Exception as e:
27
- print(f"An error occurred: {e}")
28
-
29
- except Exception as e:
30
- print(f"An error occurred while fetching the URL: {e}")
31
 
32
- return data
 
 
 
 
 
33
 
 
 
 
 
 
 
34
 
35
- # Function to parse meta tags from HTML
36
- def parse_meta_tags(html):
37
- import re
38
 
39
- # Find all <meta> tags in the HTML and extract their attributes
40
- patterns = [
41
- r'<meta\s+(\w+)\s*=\s*"([^"]*)"',
42
- r'<meta\s+(\S+)\s*\=(\S*)',
43
- r'^<meta[^>]*>$'
44
- ]
45
 
 
 
46
  meta_tags = {}
47
-
48
- for pattern in patterns:
49
- matches = re.findall(pattern, html)
50
-
51
- # Adding the matched attribute-value pairs to a dictionary
52
- if len(matches) > 1 and (len(matches[0]) == 2 or len(matches[1:]) == 3):
53
- key_value_pairs = [(matches[i][0], matches[i+1] if i + 1 < len(matches[0]) else "") for i in range(0, len(matches), 2)]
54
-
55
- elif len(pattern) == 4:
56
- # For the last pattern, there's no need to have any match of length >1
57
- key_value_pairs = [(matches[i], matches[j]) for i,j in enumerate(zip(range(len(matches)-3),range(len(matches[0]))))]
58
-
59
- else: continue
60
-
61
- meta_tags.update({key: value for (key, value) in key_value_pairs if value})
62
-
63
  return meta_tags
64
 
65
-
66
- # Function to join url with favicon link
67
- def urljoin(*parts):
68
- from urllib.parse importurlunparse, urlparse
69
-
70
- parsed = urlparse(url)
71
- path_parts = []
72
-
73
- for part in parts:
74
- if isinstance(part, tuple) and len(part) > 1:
75
- scheme, rest = part
76
-
77
- # Handle protocol-less URLs with 'http' as default
78
- if not scheme:
79
- scheme = 'http'
80
-
81
- parsed = urlparse(url)
82
-
83
- path_parts.append(path_parts[-1] + "/" + str(part))
84
-
85
- return urlunparse((parsed.scheme, "", "/".join(path_parts), None, "", ""))
86
-
87
- # Function for the main interface.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
88
  def get_meta_tags_ui():
89
- html_output = Blocks()
90
-
91
- with html_output:
92
- title_key = "Keys: "
93
-
94
- # Define a button
95
- buttons = Button(
96
- style="background-color: white",
97
- text="Get Meta Tags from URL"
98
- )
99
-
100
- # Add favicon to the interface
101
- Buttons.append(Button(style='favicon', icon='link-icon'))
102
-
103
- # Get meta tags
104
- input_element = Textbox()
105
 
 
 
106
 
107
- def update(value):
108
- if not value:
109
- return
110
 
111
- result= get_meta_tags(value)
 
 
 
 
112
 
113
- html_output.clear()
114
-
115
- title_key = "Keys: "
116
-
117
- output_html_string = str(format_output(result))
118
-
119
- buttons.launch(input_element, submit=update)
120
-
121
-
122
- interface = Blocks(
123
- style="background-color: white",
124
- elements=[html_output],
125
- )
126
 
127
  return interface
128
 
129
-
130
  if __name__ == "__main__":
131
- get_meta_tags_ui()
 
 
1
  import requests
2
  from gradio import Blocks, Button, Textbox, HTML
3
+ from urllib.parse import urljoin, urlparse
4
+ import re
5
 
6
  # Function to get meta tags from a URL
7
  def get_meta_tags(url):
 
23
  html = response.text
24
  data['meta_tags'] = parse_meta_tags(html)
25
 
26
+ # Find the favicon link (if exists)
27
+ favicon_url = extract_favicon(html, url)
28
+ data['favicon'] = favicon_url
 
 
 
 
29
 
30
+ # Check if robots.txt exists
31
+ robots_txt_url = urljoin(url, "/robots.txt")
32
+ robots_response = requests.get(robots_txt_url)
33
+ if robots_response.status_code == 200:
34
+ data['robots_txt'] = True
35
+ data['robots_txt_content'] = robots_response.text
36
 
37
+ # Check if sitemap.xml exists
38
+ sitemap_url = urljoin(url, "/sitemap.xml")
39
+ sitemap_response = requests.get(sitemap_url)
40
+ if sitemap_response.status_code == 200:
41
+ data['sitemap_xml'] = True
42
+ data['sitemap_xml_content'] = sitemap_response.text
43
 
44
+ return data
 
 
45
 
46
+ except Exception as e:
47
+ print(f"An error occurred: {e}")
48
+ return {"error": str(e)}
 
 
 
49
 
50
+ # Function to parse meta tags from HTML
51
def parse_meta_tags(html):
    """Extract ``<meta>`` tags from an HTML document.

    Each tag is keyed by its identifying attribute (``name``, ``property``,
    ``http-equiv`` or ``itemprop``, in that order of preference) and mapped
    to its ``content`` value. A ``<meta charset=...>`` tag is stored under
    the key ``"charset"``.

    Args:
        html: HTML source to scan. ``None`` or an empty string yields an
            empty result.

    Returns:
        A dict mapping meta tag identifiers to their values. When the same
        identifier appears more than once, the last occurrence wins.
    """
    meta_tags = {}
    if not html:
        return meta_tags

    for tag in re.finditer(r'<meta\b([^>]*)>', html, re.IGNORECASE):
        # The back-reference (\2) keeps the closing quote in step with the
        # opening one, so content="It's here" is not cut at the apostrophe.
        attrs = {
            name.lower(): value
            for name, _quote, value in re.findall(
                r'([\w:-]+)\s*=\s*(["\'])(.*?)\2', tag.group(1), re.DOTALL
            )
        }

        # Key by what identifies the tag, not by the raw attribute names;
        # otherwise every tag would overwrite the same "name"/"content" keys.
        key = next(
            (
                attrs[candidate]
                for candidate in ("name", "property", "http-equiv", "itemprop")
                if attrs.get(candidate)
            ),
            None,
        )
        if key and "content" in attrs:
            meta_tags[key] = attrs["content"]
        elif "charset" in attrs:
            meta_tags["charset"] = attrs["charset"]

    return meta_tags
60
 
61
+ # Function to extract favicon URL from the HTML
62
def extract_favicon(html, base_url):
    """Return the absolute URL of the page's favicon, if one is declared.

    Scans every ``<link>`` tag and picks the first whose ``rel`` attribute
    contains the token ``icon`` (so ``rel="icon"`` and ``rel="shortcut icon"``
    both qualify) and that carries a non-empty ``href``. Attribute order and
    letter case do not matter.

    Args:
        html: HTML source of the page. ``None`` or empty yields ``None``.
        base_url: URL the page was fetched from; used to resolve a relative
            ``href``.

    Returns:
        The favicon URL resolved against ``base_url``, or ``None`` when no
        matching ``<link>`` tag is found.
    """
    if not html:
        return None

    for tag in re.finditer(r'<link\b([^>]*)>', html, re.IGNORECASE):
        attrs = {
            name.lower(): value
            for name, _quote, value in re.findall(
                r'([\w:-]+)\s*=\s*(["\'])(.*?)\2', tag.group(1), re.DOTALL
            )
        }
        rel_tokens = attrs.get("rel", "").lower().split()
        href = attrs.get("href", "").strip()
        if "icon" in rel_tokens and href:
            # urljoin leaves absolute URLs untouched and resolves relative or
            # protocol-relative ones, so no manual scheme check is needed.
            return urljoin(base_url, href)

    return None
71
+
72
+ # Function to format the result output
73
def format_output(result):
    """Render a lookup result as an HTML fragment.

    Args:
        result: Dict describing the lookup. If it contains an ``"error"``
            key, only the error is rendered. Otherwise the optional keys
            ``"meta_tags"`` (dict), ``"favicon"`` (URL or ``None``),
            ``"robots_txt"`` / ``"robots_txt_content"`` and
            ``"sitemap_xml"`` / ``"sitemap_xml_content"`` are rendered; any
            missing key is treated as "not found" instead of raising.

    Returns:
        An HTML string. All values taken from ``result`` are HTML-escaped,
        because they come from a remote page and must be displayed as text
        (e.g. sitemap XML inside ``<pre>``) rather than interpreted as markup.
    """
    # Local import keeps this block self-contained without touching the
    # file-level import list.
    from html import escape

    if "error" in result:
        return f"Error: {escape(str(result['error']))}"

    parts = ["<h3>Meta Tags</h3>"]
    for key, value in result.get("meta_tags", {}).items():
        parts.append(
            f"<strong>{escape(str(key))}</strong>: {escape(str(value))}<br>"
        )

    favicon = result.get("favicon")
    if favicon:
        parts.append(
            f"<h3>Favicon</h3><img src='{escape(str(favicon), quote=True)}' "
            "alt='Favicon' style='width:50px;height:50px;'><br>"
        )
    else:
        parts.append("<h3>Favicon</h3><p>Missing</p><br>")

    # robots.txt and sitemap.xml share the same layout; the flag key is only
    # present when the file was actually found, hence .get().
    for title, key in (("robots.txt", "robots_txt"), ("sitemap.xml", "sitemap_xml")):
        parts.append(f"<h3>{title}</h3>")
        if result.get(key):
            content = escape(str(result.get(key + "_content", "")))
            parts.append("<p>Found</p><br>")
            parts.append(f"<pre>{content}</pre><br>")
        else:
            parts.append("<p>Not found</p><br>")

    return "".join(parts)
99
+
100
+ # Gradio Interface
101
def get_meta_tags_ui():
    """Build the Gradio interface for the meta tag lookup.

    The interface consists of a URL text box, a button, and an HTML output
    area. Clicking the button passes the text box value to ``get_meta_tags``
    and shows the result rendered by ``format_output``.

    Returns:
        The ``Blocks`` object; the caller is responsible for launching it.
    """
    with Blocks() as interface:
        # Input element to enter the URL
        url_input = Textbox(label="Enter URL", placeholder="https://example.com")

        # Button to trigger the meta tags fetch
        lookup_button = Button("Get Meta Tags from URL")

        # HTML output area for the results
        html_output = HTML()

        def update(value):
            """Fetch and render the result for the URL typed by the user."""
            # Tolerate a missing value and stray whitespace from copy/paste.
            url = (value or "").strip()
            if not url:
                # Give visible feedback instead of silently returning None.
                return "<p>Please enter a URL.</p>"
            return format_output(get_meta_tags(url))

        # Link button click with the update function
        lookup_button.click(fn=update, inputs=url_input, outputs=html_output)

    return interface
122
 
123
+ # Run the interface
124
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when run as a script.
    get_meta_tags_ui().launch()