Upload 3 files
Browse files- app.py +93 -0
- requirements.txt +5 -0
- templates/index.html +93 -0
app.py
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
import secrets
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request, flash
|
| 6 |
+
|
| 7 |
+
app = Flask(__name__)

# The secret key signs the session cookie (used here by flash()).  It must not
# live in source control, so it is read from the SECRET_KEY environment
# variable.  The random fallback keeps local runs working; sessions then do
# not survive a restart and are not shared between worker processes, so set
# SECRET_KEY in any real deployment.
app.secret_key = os.environ.get("SECRET_KEY") or secrets.token_hex(32)

# Headers sent with every outgoing scrape request: a desktop Chrome
# User-Agent string.
HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/125.0.0.0 Safari/537.36"
    )
}
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
def is_valid_url(url: str) -> bool:
    """Return whether *url* is an absolute HTTP(S) URL with a network location.

    Args:
        url: The candidate URL string.

    Returns:
        ``True`` when the scheme is ``http`` or ``https`` and the URL has a
        non-empty netloc; ``False`` otherwise, including when the string
        cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed netlocs such as an unclosed IPv6
        # bracket ("http://[::1"); for this check that is just an invalid URL.
        return False
    # bool() so callers receive a real boolean rather than the netloc string.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
@app.route("/", methods=["GET", "POST"])
def index():
    """Render the scraper form and, on POST, list the links found at a URL.

    GET renders the empty form.  POST reads the ``url`` form field, validates
    it with ``is_valid_url``, downloads the page, collects the ``href`` of
    every ``<a>`` tag, drops duplicates and renders the result as an HTML
    table.  Failures are reported to the user through flashed messages.

    Returns:
        The rendered ``index.html`` template.  ``table`` is the HTML table
        string on success and ``None`` (or absent) otherwise.
    """
    table_html = None

    if request.method == "POST":
        url = request.form.get("url", "").strip()

        if not is_valid_url(url):
            flash("Invalid URL. Please enter a valid URL.", "danger")
            return render_template("index.html")

        try:
            # NOTE(review): the server fetches an arbitrary user-supplied URL,
            # so internal addresses are reachable from here (SSRF) unless
            # that is blocked elsewhere -- confirm before exposing publicly.
            response = requests.get(url, headers=HEADERS, timeout=10)
            response.raise_for_status()

            soup = BeautifulSoup(response.text, "html.parser")

            # Collect every non-empty href, one row per link.
            hrefs = (a["href"].strip() for a in soup.find_all("a", href=True))
            links = [{"Link": href} for href in hrefs if href]

            if not links:
                flash("No links found on the page.", "warning")
                return render_template("index.html")

            df = pd.DataFrame(links).drop_duplicates()

            # The hrefs come from an untrusted page and the template renders
            # this table with ``|safe``, so pandas must HTML-escape the cell
            # contents; otherwise a crafted href could inject markup.
            table_html = df.to_html(
                classes="table table-bordered table-striped",
                index=False,
                escape=True,
            )

        except requests.exceptions.Timeout:
            flash("Request timed out.", "danger")

        except requests.exceptions.ConnectionError:
            flash("Failed to connect to the website.", "danger")

        except requests.exceptions.HTTPError as e:
            flash(f"HTTP Error: {e}", "danger")

        except Exception as e:
            # Top-level boundary for the request: keep the page usable, but
            # record the traceback so the failure is not silently lost.
            app.logger.exception("Unexpected error while scraping %s", url)
            flash(f"Unexpected Error: {str(e)}", "danger")

    return render_template("index.html", table=table_html)
|
| 90 |
+
|
| 91 |
+
|
| 92 |
+
if __name__ == "__main__":
    # Development entry point.  Debug mode enables the interactive Werkzeug
    # debugger, which allows arbitrary code execution, so it is opt-in via
    # FLASK_DEBUG=1 instead of being always on.
    app.run(debug=os.environ.get("FLASK_DEBUG") == "1")
|
requirements.txt
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask
|
| 2 |
+
beautifulsoup4
|
| 3 |
+
requests
|
| 4 |
+
pandas
|
| 5 |
+
gunicorn
|
templates/index.html
ADDED
|
@@ -0,0 +1,93 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">

<head>
    {# Single-page UI for the scraper: flash messages, URL form, results table. #}
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">

    <title>Flask Web Scraper</title>

    <link rel="stylesheet"
          href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css">

    <link rel="stylesheet"
          href="{{ url_for('static', filename='style.css') }}">
</head>

<body>

    <div class="container py-5">

        <div class="card shadow-lg p-4">

            <h1 class="mb-4 text-center">
                Flask Web Scraper
            </h1>

            <!-- Flash Messages -->
            {# The flash category doubles as the Bootstrap alert variant (danger, warning, ...). #}
            {% with messages = get_flashed_messages(with_categories=true) %}
            {% if messages %}
            {% for category, message in messages %}
            <div class="alert alert-{{ category }} alert-dismissible fade show" role="alert">

                {{ message }}

                {# Icon-only button: aria-label gives it an accessible name. #}
                <button type="button"
                        class="btn-close"
                        data-bs-dismiss="alert"
                        aria-label="Close">
                </button>

            </div>
            {% endfor %}
            {% endif %}
            {% endwith %}

            <!-- Form -->
            <form method="POST">

                <div class="mb-3">
                    <label for="url" class="form-label">
                        Enter URL
                    </label>

                    <input
                        type="url"
                        class="form-control"
                        id="url"
                        name="url"
                        placeholder="https://example.com"
                        required>
                </div>

                <button type="submit" class="btn btn-primary w-100">
                    Scrape Website
                </button>

            </form>

        </div>

        <!-- Results -->
        {% if table %}

        <div class="card shadow-lg mt-5 p-4">

            <h2 class="mb-4">
                Extracted Links
            </h2>

            <div class="table-responsive">
                {# "table" is an HTML string built by the view, so it is rendered unescaped. #}
                {{ table|safe }}
            </div>

        </div>

        {% endif %}

    </div>

    <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/js/bootstrap.bundle.min.js"></script>

</body>

</html>
|