webscrap / app.py
princelv's picture
Update app.py
f579252 verified
import os
from urllib.parse import urlparse

import pandas as pd
import requests
from bs4 import BeautifulSoup
from flask import Flask, render_template, request, flash
# WSGI application object; the route below registers against it.
app = Flask(__name__)
# The secret key signs the session cookie that carries flash() messages.
# Prefer a value supplied through the environment so a deployment does not
# rely on a key that is published in source control; the literal fallback
# only keeps local runs working without extra configuration.
app.secret_key = os.environ.get("FLASK_SECRET_KEY", "super_duper_secret_key")
# Headers sent with outbound page fetches. The User-Agent mimics desktop
# Chrome on Windows (presumably so sites that reject the default
# python-requests agent still respond -- confirm with the author).
HEADERS = {
    "User-Agent": " ".join(
        (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/125.0.0.0 Safari/537.36",
        )
    ),
}
def is_valid_url(url: str) -> bool:
    """
    Return True when *url* is an absolute HTTP(S) URL with a host part.

    Args:
        url: Candidate URL text, typically straight from user input.

    Returns:
        True only if the scheme is ``http`` or ``https`` and a network
        location is present; False otherwise, including for text that
        ``urlparse`` rejects outright.
    """
    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse raises on malformed authorities such as an unclosed
        # IPv6 bracket ("http://[::1"); treat that as simply invalid
        # instead of letting the exception escape to the caller.
        return False
    # bool() keeps the declared return type: a bare ``and`` would hand the
    # netloc string (or "") back to callers instead of True/False.
    return parsed.scheme in ("http", "https") and bool(parsed.netloc)
@app.route("/", methods=["GET", "POST"])
def index():
    """
    Render the scraper form and, on POST, the links found at the given URL.

    GET renders the form with no table. POST reads the ``url`` form field,
    validates it with ``is_valid_url``, downloads the page, collects the
    ``href`` of every anchor tag, drops duplicates and hands the result to
    the template as an HTML table. Failures are reported through ``flash``
    and the form is rendered without a table.

    Returns:
        The rendered ``index.html`` template. When a table was built it is
        passed as the ``table`` template variable.
    """
    table_html = None

    if request.method != "POST":
        return render_template("index.html", table=table_html)

    url = request.form.get("url", "").strip()
    if not is_valid_url(url):
        flash("Invalid URL. Please enter a valid URL.", "danger")
        return render_template("index.html")

    try:
        # NOTE(review): the URL is user-supplied and fetched from the server,
        # so internal/loopback hosts are reachable through this endpoint
        # (SSRF). Consider restricting targets before exposing this publicly.
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, "html.parser")

        # Collect every non-empty href, one row per link.
        hrefs = (a["href"].strip() for a in soup.find_all("a", href=True))
        links = [{"Link": href} for href in hrefs if href]

        if not links:
            flash("No links found on the page.", "warning")
            return render_template("index.html")

        # drop_duplicates keeps the first occurrence, so page order survives.
        df = pd.DataFrame(links).drop_duplicates()

        table_html = df.to_html(
            classes="table table-bordered table-striped",
            index=False,
            # The hrefs come from an arbitrary third-party page. Escaping
            # them prevents markup or script embedded in an href from being
            # injected into our own response (XSS).
            escape=True,
        )
    except requests.exceptions.Timeout:
        flash("Request timed out.", "danger")
    except requests.exceptions.ConnectionError:
        flash("Failed to connect to the website.", "danger")
    except requests.exceptions.HTTPError as e:
        flash(f"HTTP Error: {e}", "danger")
    except Exception as e:
        # Last-resort boundary: keep the page usable, but record the
        # traceback so the failure is not silently lost.
        app.logger.exception("Unexpected error while scraping %s", url)
        flash(f"Unexpected Error: {e}", "danger")

    return render_template("index.html", table=table_html)
if __name__ == "__main__":
    # The server listens on every interface (0.0.0.0), so the Werkzeug
    # interactive debugger must not be on by default: it lets anyone who can
    # reach the port execute arbitrary Python. Opt in with FLASK_DEBUG=1 for
    # local development only.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").lower() in (
        "1",
        "true",
        "yes",
        "on",
    )
    app.run(debug=debug_enabled, host="0.0.0.0", port=7860)