simran40 committed on
Commit
0f22445
·
verified ·
1 Parent(s): a61d173

Upload 7 files

Browse files
Dockerfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
# Container image for the Flask scraper app; started with `python main.py`.
FROM python:3.9
WORKDIR /app
# Install dependencies before copying the sources so this layer stays cached
# until requirements.txt itself changes.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt
COPY . /app
CMD ["python", "main.py"]
__pycache__/scraper.cpython-310.pyc ADDED
Binary file (973 Bytes). View file
 
__pycache__/templates/index.html ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Smart Web Scraper</title>
6
+ <style>
7
+ @import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;600&display=swap');
8
+ * { margin: 0; padding: 0; box-sizing: border-box; }
9
+ body {
10
+ font-family: 'Poppins', sans-serif;
11
+ background: linear-gradient(135deg, #283c86, #45a247);
12
+ height: 100vh;
13
+ display: flex;
14
+ flex-direction: column;
15
+ justify-content: center;
16
+ align-items: center;
17
+ color: #fff;
18
+ }
19
+ .card {
20
+ background: rgba(255, 255, 255, 0.15);
21
+ padding: 40px 50px;
22
+ border-radius: 20px;
23
+ backdrop-filter: blur(12px);
24
+ box-shadow: 0 8px 25px rgba(0,0,0,0.3);
25
+ text-align: center;
26
+ animation: fadeIn 1.2s ease-in-out;
27
+ }
28
+ h1 {
29
+ font-size: 42px;
30
+ margin-bottom: 10px;
31
+ letter-spacing: 1px;
32
+ }
33
+ p {
34
+ font-size: 18px;
35
+ margin-bottom: 25px;
36
+ color: #f2f2f2;
37
+ }
38
+ input[type="text"] {
39
+ width: 380px;
40
+ padding: 12px;
41
+ border-radius: 8px;
42
+ border: none;
43
+ outline: none;
44
+ font-size: 16px;
45
+ margin-bottom: 20px;
46
+ }
47
+ .tag-buttons {
48
+ margin: 10px 0 25px 0;
49
+ }
50
+ .tag-buttons button {
51
+ background: rgba(255,255,255,0.2);
52
+ border: 1px solid #fff;
53
+ color: #fff;
54
+ border-radius: 8px;
55
+ margin: 5px;
56
+ padding: 8px 16px;
57
+ cursor: pointer;
58
+ font-size: 15px;
59
+ transition: all 0.3s;
60
+ }
61
+ .tag-buttons button:hover,
62
+ .tag-buttons button.active {
63
+ background-color: #00c9a7;
64
+ transform: scale(1.05);
65
+ color: #222;
66
+ }
67
+ input[type="submit"] {
68
+ padding: 12px 30px;
69
+ background: linear-gradient(90deg, #00c9ff, #92fe9d);
70
+ border: none;
71
+ border-radius: 30px;
72
+ color: #222;
73
+ font-size: 17px;
74
+ cursor: pointer;
75
+ transition: all 0.3s;
76
+ }
77
+ input[type="submit"]:hover {
78
+ transform: scale(1.07);
79
+ background: linear-gradient(90deg, #92fe9d, #00c9ff);
80
+ }
81
+ footer {
82
+ position: absolute;
83
+ bottom: 15px;
84
+ color: #e0e0e0;
85
+ font-size: 14px;
86
+ }
87
+ @keyframes fadeIn {
88
+ from {opacity: 0; transform: translateY(40px);}
89
+ to {opacity: 1; transform: translateY(0);}
90
+ }
91
+ </style>
92
+ <script>
93
/**
 * Store the chosen tag in the hidden #selectedTag form field and highlight
 * the matching button.
 *
 * @param {string} tagName Lower-case tag name, e.g. 'h2'.
 */
function setTag(tagName) {
  document.getElementById('selectedTag').value = tagName;
  // Pick the active button by its label instead of the implicit global
  // `event` (deprecated window.event), so this also works when the function
  // is called outside of a click handler.
  document.querySelectorAll('.tag-buttons button').forEach(btn => {
    btn.classList.toggle(
      'active',
      btn.textContent.trim().toLowerCase() === tagName
    );
  });
}
99
+ </script>
100
+ </head>
101
+ <body>
102
+ <div class="card">
103
+ <h1>🌐 Smart Web Scraper</h1>
104
+ <p>Enter a website and select which tag you want to scrape (H1, H2, etc.)</p>
105
+ <form method="POST" action="/">
106
+ <input type="text" name="url" placeholder="https://indianexpress.com/" required><br>
107
+ <div class="tag-buttons">
108
+ <button type="button" onclick="setTag('h1')">H1</button>
109
+ <button type="button" onclick="setTag('h2')" class="active">H2</button>
110
+ <button type="button" onclick="setTag('h3')">H3</button>
111
+ <button type="button" onclick="setTag('p')">P</button>
112
+ <button type="button" onclick="setTag('span')">SPAN</button>
113
+ </div>
114
+ <input type="hidden" name="tag" id="selectedTag" value="h2">
115
+ <input type="submit" value="Scrape Now 🚀">
116
+ </form>
117
+ </div>
118
+ <footer>Made with ❤️ using Flask & BeautifulSoup</footer>
119
+ </body>
120
+ </html>
__pycache__/templates/result.html ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <!DOCTYPE html>
2
+ <html lang="en">
3
+ <head>
4
+ <meta charset="UTF-8">
5
+ <title>Scraped Results</title>
6
+ <style>
7
+ @import url('https://fonts.googleapis.com/css2?family=Poppins:wght@400;600&display=swap');
8
+ body {
9
+ font-family: 'Poppins', sans-serif;
10
+ background: linear-gradient(135deg, #1a2a6c, #b21f1f, #fdbb2d);
11
+ color: #fff;
12
+ min-height: 100vh;
13
+ margin: 0;
14
+ text-align: center;
15
+ padding: 40px 0;
16
+ }
17
+ h1 {
18
+ margin-bottom: 15px;
19
+ font-size: 30px;
20
+ }
21
+ h2 {
22
+ font-size: 18px;
23
+ margin-bottom: 25px;
24
+ color: #ffecb3;
25
+ }
26
+ .results {
27
+ display: flex;
28
+ flex-direction: column;
29
+ align-items: center;
30
+ gap: 10px;
31
+ width: 85%;
32
+ margin: auto;
33
+ }
34
+ .item {
35
+ background: rgba(255,255,255,0.15);
36
+ backdrop-filter: blur(10px);
37
+ border-radius: 12px;
38
+ padding: 12px 20px;
39
+ text-align: left;
40
+ width: 80%;
41
+ transition: transform 0.3s, box-shadow 0.3s;
42
+ }
43
+ .item:hover {
44
+ transform: scale(1.03);
45
+ box-shadow: 0 4px 15px rgba(0,0,0,0.3);
46
+ }
47
+ .buttons {
48
+ margin-top: 30px;
49
+ }
50
+ a, button {
51
+ text-decoration: none;
52
+ border: none;
53
+ color: #fff;
54
+ background: linear-gradient(90deg, #00c6ff, #0072ff);
55
+ padding: 10px 25px;
56
+ border-radius: 25px;
57
+ margin: 10px;
58
+ font-size: 16px;
59
+ cursor: pointer;
60
+ transition: transform 0.2s, background 0.3s;
61
+ }
62
+ a:hover, button:hover {
63
+ transform: scale(1.05);
64
+ background: linear-gradient(90deg, #0072ff, #00c6ff);
65
+ }
66
+ </style>
67
+ </head>
68
+ <body>
69
+ <h1>📰 Scraped Data from {{ url }}</h1>
70
+ <h2>Selected Tag: &lt;{{ tag.upper() }}&gt;</h2>
71
+ <div class="results">
72
+ {% if titles %}
73
+ {% for title in titles %}
74
+ <div class="item">{{ title }}</div>
75
+ {% endfor %}
76
+ {% else %}
77
+ <p>No data found or error occurred.</p>
78
+ {% endif %}
79
+ </div>
80
+ <div class="buttons">
81
+ <a href="/">⬅ Home</a>
82
+ <form method="POST" action="/" style="display:inline;">
83
+ <input type="hidden" name="url" value="{{ url }}">
84
+ <input type="hidden" name="tag" value="{{ tag }}">
85
+ <button type="submit">🔄 Re-scrape</button>
86
+ </form>
87
+ </div>
88
+ </body>
89
+ </html>
main.py ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, request
2
+ from scraper import scrape_blog
3
+
4
# NOTE(review): this upload places index.html / result.html under
# __pycache__/templates/; Flask's default lookup is a `templates` folder next
# to this module -- confirm the deployed layout.
app = Flask(__name__)


@app.route('/', methods=['GET', 'POST'])
def home():
    """Serve the scrape form on GET and the scrape results on POST.

    On POST the ``url`` and ``tag`` form fields are read, trimmed, and handed
    to ``scrape_blog``; its returned list is rendered by ``result.html`` as
    ``titles``. A missing or blank ``tag`` falls back to ``'h2'``.
    """
    if request.method != 'POST':
        return render_template('index.html')

    # Default to '' rather than None so the template never renders "None".
    url = (request.form.get('url') or '').strip()
    tag = (request.form.get('tag') or 'h2').strip().lower() or 'h2'
    data = scrape_blog(url, tag)
    return render_template('result.html', titles=data, url=url, tag=tag)


if __name__ == '__main__':
    import os

    # Bind to all interfaces: the Dockerfile launches `python main.py`, and
    # Flask's default loopback bind is unreachable from outside a container.
    # Debug mode stays off unless FLASK_DEBUG=1 because the Werkzeug debugger
    # allows arbitrary code execution from the browser.
    app.run(
        host='0.0.0.0',
        port=int(os.environ.get('PORT', '5000')),
        debug=os.environ.get('FLASK_DEBUG') == '1',
    )
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ flask
2
+ requests
3
+ beautifulsoup4
scraper.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+
4
def scrape_blog(url="https://indianexpress.com/", tag="h2",
                min_length=20, max_results=25):
    """Fetch *url* and return the distinct text of its ``<tag>`` elements.

    Args:
        url: Page to download; must start with ``http://`` or ``https://``.
        tag: HTML tag name to collect (case-insensitive).
        min_length: Only text strictly longer than this many characters is
            kept.
        max_results: Maximum number of entries returned.

    Returns:
        A list of strings in document order with duplicates removed. Failures
        are reported in-band rather than raised: the list then holds a single
        ``"Error: ..."`` or ``"No <tag> content found. ..."`` message.
    """
    # Reject unusable input up front so the caller gets a clear message
    # instead of a low-level library error.
    if not isinstance(url, str) or not url.strip():
        return ["Error: Please provide a URL."]
    url = url.strip()
    if not url.lower().startswith(("http://", "https://")):
        return ["Error: URL must start with http:// or https://"]

    if isinstance(tag, str):
        # html.parser lower-cases tag names, so "H2" would never match.
        tag = tag.strip().lower()

    try:
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/91.0.4472.124 Safari/537.36"
            )
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Hand BeautifulSoup the raw bytes so it can honour the document's own
        # <meta charset>; response.text falls back to ISO-8859-1 whenever the
        # Content-Type header omits a charset, which garbles UTF-8 pages.
        content_type = response.headers.get("content-type", "").lower()
        declared = response.encoding if "charset" in content_type else None
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared
        )

        texts = (
            element.get_text(strip=True) for element in soup.find_all(tag)
        )
        # dict.fromkeys de-duplicates while preserving first-seen order.
        clean_titles = list(
            dict.fromkeys(t for t in texts if t and len(t) > min_length)
        )
        if not clean_titles:
            return [f"No <{tag}> content found. Try another tag."]
        return clean_titles[:max_results]
    except Exception as e:
        # Deliberate catch-all: callers render the returned list directly, so
        # any network or parsing failure is surfaced as a message.
        return [f"Error: {e}"]