princelv committed on
Commit
2b079e6
·
verified ·
1 Parent(s): 97b8e7f

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +93 -0
  2. requirements.txt +5 -0
  3. templates/index.html +93 -0
app.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import secrets
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request, flash
6
+
7
app = Flask(__name__)

# The secret key signs the session cookie that carries flash messages, so it
# must not be a literal committed to the repository. Read it from the
# SECRET_KEY environment variable; fall back to a random per-process key so
# the app still starts when nothing is configured.
# NOTE: with the random fallback each worker process gets its own key, so set
# SECRET_KEY explicitly when session data must survive a restart or be shared
# between several workers.
app.secret_key = os.environ.get("SECRET_KEY") or secrets.token_hex(32)

# Request headers passed to requests.get() in index(); the User-Agent is a
# desktop Chrome string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    )
}
17
+
18
+
19
def is_valid_url(url: str) -> bool:
    """Return True if *url* is an absolute HTTP(S) URL with a host part.

    Args:
        url: The candidate URL string, e.g. the value typed into the form.

    Returns:
        True when the parsed scheme is ``http`` or ``https`` and the network
        location (host) is non-empty; False otherwise.
    """
    parsed = urlparse(url)
    # bool() keeps the declared return type: a bare `and` would hand back the
    # netloc string itself (or "") instead of True/False.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
25
+
26
+
27
@app.route("/", methods=["GET", "POST"])
def index():
    """Render the scraper form and, on POST, the links found at the given URL.

    GET renders the empty form. POST reads the ``url`` form field, validates
    it with ``is_valid_url``, downloads the page, collects the ``href`` of
    every ``<a>`` tag, removes duplicates and renders them as an HTML table.
    Problems (invalid URL, no links, network/HTTP errors) are reported to the
    user through flashed messages instead of raising.

    Returns:
        The rendered ``index.html`` template; ``table`` is the HTML table
        string on success and ``None`` (or absent) otherwise.
    """
    table_html = None

    if request.method == "POST":
        url = request.form.get("url", "").strip()

        if not is_valid_url(url):
            flash("Invalid URL. Please enter a valid URL.", "danger")
            return render_template("index.html")

        try:
            # NOTE(review): the server fetches an arbitrary user-supplied URL;
            # if this app is reachable by untrusted users, consider blocking
            # private/loopback addresses (SSRF) -- confirm deployment context.
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Collect every non-empty href, stripped of surrounding whitespace.
            hrefs = (a["href"].strip() for a in soup.find_all("a", href=True))
            links = [{"Link": href} for href in hrefs if href]

            if not links:
                flash("No links found on the page.", "warning")
                return render_template("index.html")

            df = pd.DataFrame(links).drop_duplicates()

            # The hrefs come from a third-party page and the template renders
            # this table with `|safe`, so the cells MUST be HTML-escaped here;
            # otherwise a scraped value such as `"><script>...` would be
            # injected into our page.
            table_html = df.to_html(
                classes="table table-bordered table-striped",
                index=False,
                escape=True,
            )

        except requests.exceptions.Timeout:
            flash("Request timed out.", "danger")

        except requests.exceptions.ConnectionError:
            flash("Failed to connect to the website.", "danger")

        except requests.exceptions.HTTPError as e:
            flash(f"HTTP Error: {e}", "danger")

        except Exception as e:
            # Request-handler boundary: report anything unexpected to the user
            # rather than returning a 500 page.
            flash(f"Unexpected Error: {str(e)}", "danger")

    return render_template("index.html", table=table_html)
90
+
91
+
92
if __name__ == "__main__":
    # Local development entry point. Debug mode enables the Werkzeug
    # interactive debugger, which allows arbitrary code execution, so it is
    # opt-in (FLASK_DEBUG=1) instead of always on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in ("1", "true", "yes")
    app.run(debug=debug_enabled)
requirements.txt ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ flask
2
+ beautifulsoup4
3
+ requests
4
+ pandas
5
+ gunicorn
templates/index.html ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">

<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <title>Flask Web Scraper</title>

    <link rel="stylesheet"
          href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css">

    <link rel="stylesheet"
          href="{{ url_for('static', filename='style.css') }}">
</head>

<body>

    <div class="container py-5">

        <div class="card shadow-lg p-4">

            <h1 class="mb-4 text-center">
                Flask Web Scraper
            </h1>

            <!-- Flash Messages: one Bootstrap alert per (category, message) pair -->
            {% with messages = get_flashed_messages(with_categories=true) %}
            {% if messages %}
            {% for category, message in messages %}
            <div class="alert alert-{{ category }} alert-dismissible fade show" role="alert">

                {{ message }}

                <!-- aria-label gives the icon-only close button an accessible name -->
                <button type="button"
                        class="btn-close"
                        data-bs-dismiss="alert"
                        aria-label="Close">
                </button>

            </div>
            {% endfor %}
            {% endif %}
            {% endwith %}

            <!-- Form: posts the URL to scrape back to this same page -->
            <form method="POST">

                <div class="mb-3">
                    <label for="url" class="form-label">
                        Enter URL
                    </label>

                    <input
                        type="url"
                        class="form-control"
                        id="url"
                        name="url"
                        placeholder="https://example.com"
                        required>
                </div>

                <button type="submit" class="btn btn-primary w-100">
                    Scrape Website
                </button>

            </form>

        </div>

        <!-- Results: shown only when the view passes a non-empty `table` -->
        {% if table %}

        <div class="card shadow-lg mt-5 p-4">

            <h2 class="mb-4">
                Extracted Links
            </h2>

            <div class="table-responsive">
                <!-- `table` is rendered without autoescaping, so the view is
                     responsible for HTML-escaping the cell contents. -->
                {{ table|safe }}
            </div>

        </div>

        {% endif %}

    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>

</body>

</html>