Spaces:
Running
Running
Upload 4 files
Browse files- app.py +26 -0
- requirements.txt +12 -0
- templates/index.html +36 -0
- templates/result.html +26 -0
app.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from flask import Flask, render_template, request
|
| 2 |
+
import requests
|
| 3 |
+
from bs4 import BeautifulSoup
|
| 4 |
+
|
| 5 |
+
app = Flask(__name__)
|
| 6 |
+
|
| 7 |
+
@app.route("/")
def index():
    """Serve the landing page that hosts the scrape form."""
    landing_page = render_template("index.html")
    return landing_page
|
| 10 |
+
|
| 11 |
+
@app.route("/scrape", methods=["POST"])
def scrape():
    """Fetch a user-supplied URL and list the text of every matching element.

    Reads the ``url`` and ``tag`` form fields, downloads the page, parses it
    with BeautifulSoup's ``html.parser`` and renders ``result.html`` with the
    page title and the text content of each element named ``tag``.

    Failures (missing fields, unreachable host, invalid URL, non-2xx status)
    are reported through the template's ``error`` variable rather than
    surfacing as an unhandled 500.

    Returns:
        The rendered ``result.html`` page.
    """
    # Strip so whitespace-only input is treated as missing, not sent upstream.
    url = (request.form.get("url") or "").strip()
    tag = (request.form.get("tag") or "").strip()
    if not url or not tag:
        return render_template("result.html", error="Both URL and Tag are required.")

    # NOTE(review): the URL comes straight from the client, so this endpoint can
    # be pointed at internal addresses (SSRF). Consider restricting allowed
    # schemes/hosts before exposing this publicly.
    try:
        # A timeout keeps a slow or unresponsive host from tying up the worker.
        response = requests.get(
            url,
            headers={"User-Agent": "Mozilla/5.0"},
            timeout=10,
        )
        response.raise_for_status()
    except requests.RequestException as exc:
        # Covers invalid/missing scheme, connection errors, timeouts and
        # HTTP error statuses raised by raise_for_status().
        return render_template("result.html", error=f"Failed to fetch the URL: {exc}")

    soup = BeautifulSoup(response.text, "html.parser")
    elements = [element.get_text() for element in soup.find_all(tag)]

    # soup.title is None when the document has no <title>; .string is None
    # when the title is empty or contains nested markup.
    title_tag = soup.title
    title = (title_tag.string if title_tag is not None else None) or "No Title"

    return render_template(
        "result.html",
        tag=tag,
        url=url,
        title=title,
        elements=elements,
    )
|
| 24 |
+
|
| 25 |
+
if __name__ == "__main__":
    import os

    # The Werkzeug interactive debugger lets anyone who can reach the server
    # execute arbitrary code, so debug mode is opt-in (FLASK_DEBUG=1) instead
    # of being hard-coded on.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1",
        "true",
        "yes",
        "on",
    }
    app.run(debug=debug_enabled)
|
requirements.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
flask
|
| 2 |
+
requests
|
| 3 |
+
beautifulsoup4
|
| 4 |
+
gunicorn
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
|
templates/index.html
ADDED
|
@@ -0,0 +1,36 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    {# Landing page: a single form that posts a URL and a tag name to /scrape. #}
    <meta charset="utf-8">
    <meta name="viewport" content="width=device-width, initial-scale=1">
    <link rel="icon" type="image/png"
          href="https://cdn.glitch.global/011875c1-2e8a-4ff4-806a-793934a0acda/android-chrome-512x512.png?v=1734461641548" />
    <title>Web Scraper</title>
    <style>
        body {
            font-family: Arial, sans-serif;
            margin: 0;
            padding: 0;
            background-color: #f9f9f9;
            text-align: center;
        }
        .logo {
            margin-top: 30px;
            width: 128px;
            height: 128px;
        }
    </style>
</head>
<body>
    <img src="https://cdn.glitch.global/011875c1-2e8a-4ff4-806a-793934a0acda/android-chrome-512x512.png?v=1734461641548"
         alt="Logo" class="logo">
    <h1>Customizable Web Scraper</h1>
    <form action="/scrape" method="POST">
        <label for="url">Enter URL:</label><br>
        {# type="url" makes the browser require an absolute URL (with scheme), which the server-side fetch also needs. #}
        <input type="url" id="url" name="url" placeholder="https://example.com" required><br><br>

        <label for="tag">Enter Tag to Scrape (e.g., p, h1, img):</label><br>
        <input type="text" id="tag" name="tag" placeholder="p" required><br><br>

        <button type="submit">Scrape</button>
    </form>

</body>
</html>
|
templates/result.html
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
<html lang="en">
<head>
    {# Result page: shows either an error message or the scraped title and element texts. #}
    <meta charset="utf-8">
    <link rel="icon" type="image/png"
          href="https://cdn.glitch.global/011875c1-2e8a-4ff4-806a-793934a0acda/android-chrome-512x512.png?v=1734461641548" />
    <title>Scraped Results</title>
</head>
<body>
    <h1>Scraped Results</h1>
    {% if error %}
    <p style="color: red;">{{ error }}</p>
    {% else %}
    <h2>Title: {{ title }}</h2>
    {# rel="noopener noreferrer" stops the opened page from reaching back via window.opener. #}
    <h3>Content from: <a href="{{ url }}" target="_blank" rel="noopener noreferrer">{{ url }}</a></h3>
    {# The angle brackets are entities so the tag name is displayed as text instead of being parsed as a real element. #}
    <h3>Scraped Elements for Tag: &lt;{{ tag }}&gt;</h3>
    <ul>
        {% for element in elements %}
        <li>{{ element }}</li>
        {% else %}
        <li>No content found for tag &lt;{{ tag }}&gt;.</li>
        {% endfor %}
    </ul>
    {% endif %}
    <br>
    <a href="/">Go Back</a>
</body>
</html>
|