saran14 commited on
Commit
796e8b5
·
0 Parent(s):

Initial commit: Pakistani News Aggregator Flask App

Browse files
.dockerignore ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ __pycache__
2
+ *.pyc
3
+ *.pyo
4
+ *.pyd
5
+ .Python
6
+ env
7
+ pip-log.txt
8
+ pip-delete-this-directory.txt
9
+ .tox
10
+ .coverage
11
+ .coverage.*
12
+ .cache
13
+ nosetests.xml
14
+ coverage.xml
15
+ *.cover
16
+ *.log
17
+ .git
18
+ .mypy_cache
19
+ .pytest_cache
20
+ .hypothesis
21
+
22
+ .DS_Store
23
+ .vscode
24
+ *.swp
25
+ *.swo
26
+
27
+ news.db
28
+ *.db
29
+
30
+ node_modules
31
+ npm-debug.log*
32
+
33
+ .dockerignore
34
+ Dockerfile
35
+ README.md
36
+ .gitignore
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ news.db
Dockerfile ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
+
4
+ FROM python:3.9
5
+
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
+
10
+ WORKDIR /app
11
+
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade -r requirements.txt
14
+
15
+ COPY --chown=user . /app
16
+
17
+ # Expose port 7860 for Hugging Face Spaces
18
+ EXPOSE 7860
19
+
20
+ # Set environment variables for Flask
21
+ ENV FLASK_APP=app.py
22
+ ENV FLASK_RUN_HOST=0.0.0.0
23
+ ENV FLASK_RUN_PORT=7860
24
+
25
+ # Run Flask app directly
26
+ CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Pakistani News Aggregator
3
+ emoji: 📰
4
+ colorFrom: blue
5
+ colorTo: red
6
+ sdk: docker
7
+ pinned: false
8
+ license: mit
9
+ ---
10
+
11
+ # Pakistani News Aggregator
12
+
13
+ A Flask-based news aggregator that fetches and filters Pakistani news articles based on India-related keywords. The application scrapes content from multiple Pakistani news sources and displays clean, readable articles.
14
+
15
+ ## Features
16
+
17
+ - **Multi-source RSS aggregation** from Pakistani news outlets
18
+ - **Keyword-based filtering** for India-related content
19
+ - **Content scraping** with clean text extraction
20
+ - **User authentication** system
21
+ - **Responsive web interface**
22
+
23
+ ## News Sources
24
+
25
+ - Dawn.com
26
+ - Express.pk
27
+ - BOL News
28
+ - UrduPoint
29
+ - The Pakistan
30
+
31
+ ## Keywords
32
+
33
+ The system filters news based on keywords related to:
34
+ - India and Indian affairs
35
+ - Kashmir and regional politics
36
+ - Bilateral relations
37
+ - Security and defense
38
+ - Economic and cultural news
39
+
40
+ ## Usage
41
+
42
+ The application automatically fetches and displays the latest news articles that match the configured keywords, providing a Pakistani perspective on India-related developments.
README_STREAMLIT.md ADDED
File without changes
__pycache__/config.cpython-39.pyc ADDED
Binary file (525 Bytes). View file
 
__pycache__/country_feeds_config.cpython-39.pyc ADDED
Binary file (217 Bytes). View file
 
__pycache__/db.cpython-39.pyc ADDED
Binary file (3.04 kB). View file
 
__pycache__/fetcher.cpython-39.pyc ADDED
Binary file (3.33 kB). View file
 
__pycache__/opml_util.cpython-39.pyc ADDED
Binary file (1.32 kB). View file
 
__pycache__/proxy_config.cpython-39.pyc ADDED
Binary file (845 Bytes). View file
 
app.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Flask, render_template, abort, jsonify, request, redirect, url_for, session, flash
2
+ from apscheduler.schedulers.background import BackgroundScheduler
3
+ from datetime import datetime
4
+ import traceback
5
+ import os
6
+
7
+ from config import RSS_URL, APP_TITLE, APP_SOURCE_NAME, FETCH_INTERVAL_MINUTES, DATABASE, TIMEZONE
8
+ from db import init_db, upsert_article, get_latest, get_article_by_guid
9
+ from fetcher import parse_rss, extract_article_content, iso_dt
10
+ import re
11
+
12
+ import json
13
+ import os
14
+
15
# List of RSS feeds to search.
# Used by the /search-news route; note that home() re-declares an
# identical local copy of this list, shadowing this one.
SEARCH_RSS_FEEDS = [
    "https://www.dawn.com/feeds/home",
    "https://www.express.pk/feed/",
    "https://www.bolnews.com/feed/",
    "https://www.urdupoint.com/en/sitemap/news.rs",
    "https://thepakistan.pk/feed/"
]

# Keyword file lives next to this module so the app works from any CWD.
KEYWORDS_FILE = os.path.join(os.path.dirname(__file__), 'keywords.json')
25
+
26
def load_keywords(path=None):
    """Load the keyword list used for filtering articles.

    keywords.json in this repo IS a valid JSON array, so we parse it with
    json first. The previous hand-rolled parser stripped braces but not
    the surrounding '[' / ']' brackets, corrupting the first and last
    keywords. The lenient splitter is kept only as a fallback for
    malformed files.

    Args:
        path: optional override of the keywords file (defaults to
              KEYWORDS_FILE); the parameter exists mainly for testing.

    Returns:
        A list of non-empty keyword strings.
    """
    if path is None:
        path = KEYWORDS_FILE
    with open(path, 'r', encoding='utf-8') as f:
        text = f.read()
    # Preferred path: the file is a proper JSON array of strings.
    try:
        data = json.loads(text)
        if isinstance(data, list):
            return [str(k).strip() for k in data if str(k).strip()]
    except ValueError:
        pass
    # Fallback: tolerate loose "{a, b; c}"-style lists.
    text = text.strip().replace('{', '').replace('}', '').replace('[', '').replace(']', '').replace(';', '')
    keywords = [k.strip(' "\n') for k in text.split(',') if k.strip()]
    return [k for k in keywords if k]
34
+
35
+
36
app = Flask(__name__)
# Prefer a real secret from the environment; the hard-coded fallback keeps
# local development working, but shipping it in production allows session
# forgery — set SECRET_KEY in the deployment environment.
app.secret_key = os.environ.get('SECRET_KEY', 'your_secret_key_here')

# Ensure DB is initialized for all environments (including production)
init_db(DATABASE)

# User store lives next to this module so the app works from any CWD.
USERS_FILE = os.path.join(os.path.dirname(__file__), 'users.json')
43
+
44
def load_users():
    """Read users.json and return its 'users' list (empty list if the key is absent)."""
    with open(USERS_FILE, 'r') as handle:
        return json.load(handle).get('users', [])
48
+
49
def check_user(username, password):
    """Validate login credentials against users.json.

    Improvements over the original '==' comparison:
      * the password check uses hmac.compare_digest, which runs in
        constant time and so does not leak match length via timing;
      * malformed user records (missing keys) are skipped instead of
        raising KeyError.

    Returns:
        True if some record matches both username and password, else False.
    """
    import hmac  # local stdlib import keeps the module's import block unchanged
    for user in load_users():
        if user.get('username') != username:
            continue
        stored = user.get('password')
        if isinstance(stored, str) and hmac.compare_digest(stored.encode('utf-8'), password.encode('utf-8')):
            return True
    return False
55
+
56
@app.route('/login', methods=['GET', 'POST'])
def login():
    """Show the login form; on POST, authenticate and open a session."""
    if request.method == 'POST':
        username = request.form['username']
        password = request.form['password']
        if not check_user(username, password):
            flash('Invalid username or password', 'danger')
        else:
            # Successful login: remember the user and go to the news feed.
            session['username'] = username
            flash('Login successful!', 'success')
            return redirect(url_for('home'))
    # GET request, or POST with bad credentials: (re-)render the form.
    return render_template('login.html', app_title=APP_TITLE)
68
+
69
@app.route('/logout')
def logout():
    """Log the current user out and send them back to the login page."""
    session.pop('username', None)  # no-op if nobody is logged in
    flash('Logged out successfully.', 'info')
    return redirect(url_for('login'))
74
+
75
def fetch_and_update():
    """Refresh the local article cache from the primary feed (RSS_URL).

    1) Parse RSS
    2) For each item, fetch full article page
    3) Upsert into SQLite keyed by GUID

    Any failure (network, parsing, DB) is logged and swallowed so a
    background scheduler calling this keeps running.
    """
    try:
        feed = parse_rss(RSS_URL)
        for entry in feed.entries:
            # A stable identifier is required for the upsert; fall back to the link.
            guid = entry.get("id") or entry.get("guid") or entry.get("link")
            if not guid:
                continue
            link = entry.get("link")
            title = entry.get("title", "").strip()
            summary = (entry.get("summary") or entry.get("description") or "").strip()
            published = iso_dt(entry.get("published"))

            # Get the full content (best effort: extractor returns Nones on failure)
            full = extract_article_content(link) if link else {"content_html": None, "top_image": None, "title": None}
            content_html = full["content_html"] or f"<p>{summary}</p>"
            top_image = full.get("top_image")

            # Prefer the page's own title unless it is much shorter (>8 chars) than the feed title
            if full.get("title") and len(full["title"]) > len(title) - 8:
                title = full["title"]

            art = {
                "guid": guid,
                "title": title,
                "link": link,
                "summary": summary,
                "published": published,
                "content_html": content_html,
                "top_image": top_image
            }
            upsert_article(art)

        print(f"[{datetime.now()}] Fetched & updated.")
    except Exception as e:
        # Broad catch on purpose: one bad feed/page must not kill the job.
        print("Fetch error:", e)
        traceback.print_exc()
116
+
117
+
118
+ ## Disabled initial fetch to avoid RSS_URL timeout errors
119
+
120
+
121
+
122
# Home page now fetches and displays news from 5 RSS feeds filtered by keywords
@app.route("/")
def home():
    """Render the home page: keyword-matching articles aggregated live from
    the Pakistani RSS feeds, with full content scraped per match.

    Requires a logged-in session; anonymous visitors are redirected to /login.
    NOTE(review): every request re-fetches and re-scrapes all five feeds, so
    this page is slow and network-bound — consider serving from the DB cache.
    """
    if 'username' not in session:
        return redirect(url_for('login'))
    keywords = load_keywords()
    import re  # NOTE(review): redundant — re is already imported at module level
    # More flexible pattern matching for better coverage (word-bounded, case-insensitive)
    pattern = re.compile(r"\b(" + "|".join(re.escape(k) for k in keywords) + r")\b", re.IGNORECASE)

    # Pakistani news RSS feeds as requested
    # NOTE(review): shadows the identical module-level SEARCH_RSS_FEEDS list
    SEARCH_RSS_FEEDS = [
        "https://www.dawn.com/feeds/home",
        "https://www.express.pk/feed/",
        "https://www.bolnews.com/feed/",
        "https://www.urdupoint.com/en/sitemap/news.rs",
        "https://thepakistan.pk/feed/"
    ]

    all_results = []
    for feed_url in SEARCH_RSS_FEEDS:
        try:
            print(f"Fetching from {feed_url}...")
            feed = parse_rss(feed_url)
            articles_found = 0
            for entry in feed.entries:
                title = entry.get("title", "")
                summary = entry.get("summary", entry.get("description", ""))
                # Check both title and summary for keywords, also check for general India-related content
                is_relevant = (pattern.search(title) or pattern.search(summary) or
                               "india" in title.lower() or "india" in summary.lower() or
                               "indian" in title.lower() or "indian" in summary.lower())

                if is_relevant:
                    # Extract full article content
                    article_url = entry.get("link")
                    if article_url:
                        try:
                            print(f"Extracting content from: {article_url}")
                            full_content = extract_article_content(article_url)
                            content_html = full_content.get("content_html", summary)
                            top_image = full_content.get("top_image")

                            # Clean HTML tags from content for better display
                            from bs4 import BeautifulSoup
                            if content_html:
                                soup = BeautifulSoup(content_html, 'html.parser')
                                clean_content = soup.get_text(separator=' ', strip=True)
                                # Limit content length for better display (teaser only)
                                if len(clean_content) > 500:
                                    clean_content = clean_content[:500] + "..."
                            else:
                                clean_content = summary

                            all_results.append({
                                "title": title,
                                "link": article_url,
                                "summary": clean_content,
                                "content_html": content_html,
                                "top_image": top_image,
                                "published": entry.get("published"),
                                "source": feed_url.split('/')[2],  # bare hostname of the feed
                                "guid": entry.get("id", article_url)
                            })
                            articles_found += 1
                        except Exception as e:
                            print(f"Error extracting content from {article_url}: {e}")
                            # Fallback to summary only
                            from bs4 import BeautifulSoup
                            clean_summary = BeautifulSoup(summary, 'html.parser').get_text(separator=' ', strip=True) if summary else "No summary available"

                            all_results.append({
                                "title": title,
                                "link": article_url,
                                "summary": clean_summary,
                                "content_html": f"<p>{summary}</p>",
                                "top_image": None,
                                "published": entry.get("published"),
                                "source": feed_url.split('/')[2],
                                "guid": entry.get("id", article_url)
                            })
                            articles_found += 1
            print(f"Found {articles_found} matching articles from {feed_url}")
        except Exception as e:
            print(f"Error fetching {feed_url}: {e}")
            continue
    # Sort by published date if available
    # NOTE(review): lexicographic sort on raw feed date strings — formats vary
    # across feeds, so ordering is only approximate; confirm before relying on it.
    all_results.sort(key=lambda x: x.get("published", ""), reverse=True)
    return render_template("index.html",
                           items=all_results,
                           app_title=APP_TITLE,
                           source_name="Aggregated News",
                           username=session.get('username'))
215
+
216
+ @app.route("/article/<path:guid>")
217
+ def article_page(guid):
218
+ item = get_article_by_guid(guid)
219
+ if not item:
220
+ # If not found in database, try to extract content from the URL directly
221
+ if guid.startswith('http'):
222
+ try:
223
+ full = extract_article_content(guid)
224
+ item = {
225
+ "guid": guid,
226
+ "title": full.get("title", "Article"),
227
+ "link": guid,
228
+ "summary": "",
229
+ "published": datetime.now().isoformat(),
230
+ "content_html": full.get("content_html", "<p>Content could not be extracted.</p>"),
231
+ "top_image": full.get("top_image")
232
+ }
233
+ except Exception as e:
234
+ print(f"Error extracting article content: {e}")
235
+ abort(404)
236
+ else:
237
+ abort(404)
238
+
239
+ # If content_html is missing, fetch and update it automatically
240
+ if not item.get("content_html") and item.get("link"):
241
+ full = extract_article_content(item["link"])
242
+ item["content_html"] = full["content_html"]
243
+ item["top_image"] = full.get("top_image")
244
+ # Optionally update the title if available
245
+ if full.get("title") and len(full["title"]) > len(item["title"]) - 8:
246
+ item["title"] = full["title"]
247
+ # Only upsert if we have a proper GUID (not a URL)
248
+ if not item["guid"].startswith('http'):
249
+ upsert_article(item)
250
+
251
+ return render_template("article.html",
252
+ item=item,
253
+ app_title=APP_TITLE,
254
+ source_name=APP_SOURCE_NAME)
255
+
256
+ @app.route("/api/article/<path:guid>")
257
+ def api_article(guid):
258
+ item = get_article_by_guid(guid)
259
+ if not item:
260
+ return jsonify({"error": "not found"}), 404
261
+ return jsonify(item)
262
+
263
+ def start_scheduler():
264
+ scheduler = BackgroundScheduler(timezone=TIMEZONE)
265
+
266
+ scheduler.start()
267
+ # Make an immediate first run
268
+
269
+
270
+
271
+ # --- New route for searching news by keywords ---
272
+ @app.route("/search-news")
273
+ def search_news():
274
+ keywords = load_keywords()
275
+ pattern = re.compile(r"(" + "|".join(re.escape(k) for k in keywords) + r")", re.IGNORECASE)
276
+ all_results = []
277
+ for feed_url in SEARCH_RSS_FEEDS:
278
+ try:
279
+ feed = parse_rss(feed_url)
280
+ for entry in feed.entries:
281
+ title = entry.get("title", "")
282
+ summary = entry.get("summary", entry.get("description", ""))
283
+ if pattern.search(title) or pattern.search(summary):
284
+ all_results.append({
285
+ "title": title,
286
+ "link": entry.get("link"),
287
+ "summary": summary,
288
+ "published": entry.get("published"),
289
+ "source": feed_url
290
+ })
291
+ except Exception as e:
292
+ print(f"Error fetching {feed_url}: {e}")
293
+ # Sort by published date if available
294
+ all_results.sort(key=lambda x: x.get("published", ""), reverse=True)
295
+ return render_template("search_results.html", results=all_results, app_title=APP_TITLE, source_name="Aggregated News")
296
+
297
+
298
+ if __name__ == "__main__":
299
+ port = int(os.environ.get("PORT", 7860))
300
+ app.run(host="0.0.0.0", port=port, debug=False)
config.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Primary feed consumed by fetch_and_update() in app.py.
RSS_URL = "https://www.geo.tv/rss/1/1" # Pakistani news RSS feed
APP_TITLE = "Pakistan News Aggregator"
APP_SOURCE_NAME = "Pakistani News Sources"
FETCH_INTERVAL_MINUTES = 60 # every 1 hour
# Browser-like UA sent by fetcher.http_get to avoid trivial bot blocking.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/124.0.0.0 Safari/537.36"
)
# Relative path; db.init_db resolves it next to the db module.
DATABASE = "news.db"
TIMEZONE = "UTC"
12
+
db.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import sqlite3
3
+ import os
4
+ from contextlib import contextmanager
5
+
6
+ DB_PATH = None # set by init_db
7
+
8
def init_db(db_path: str):
    """Create the articles table (if needed) and remember the DB location.

    Relative paths are resolved next to this module so that every process
    opens the same database file regardless of its working directory.
    Connecting creates the file when it does not yet exist.
    """
    global DB_PATH
    if not os.path.isabs(db_path):
        db_path = os.path.join(os.path.dirname(__file__), db_path)
    DB_PATH = db_path

    schema = """
        CREATE TABLE IF NOT EXISTS articles (
            guid TEXT PRIMARY KEY,
            title TEXT,
            link TEXT,
            summary TEXT,
            published TEXT,
            content_html TEXT,
            top_image TEXT,
            last_fetched TEXT
        )
    """
    with sqlite3.connect(DB_PATH) as con:
        con.execute(schema)
        con.execute("CREATE INDEX IF NOT EXISTS idx_published ON articles(published)")
        con.commit()
30
+
31
@contextmanager
def get_conn():
    """Yield a fresh SQLite connection to DB_PATH, guaranteeing it is closed."""
    connection = sqlite3.connect(DB_PATH)
    try:
        yield connection
    finally:
        connection.close()
38
+
39
def upsert_article(article):
    """Insert or refresh one article row, keyed by its GUID.

    Only "guid" is required in the dict; every other field defaults to
    NULL. last_fetched is always stamped with the current UTC time.
    """
    row = (
        article["guid"],
        article.get("title"),
        article.get("link"),
        article.get("summary"),
        article.get("published"),
        article.get("content_html"),
        article.get("top_image"),
    )
    with get_conn() as con:
        con.execute("""
            INSERT INTO articles (guid, title, link, summary, published, content_html, top_image, last_fetched)
            VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
            ON CONFLICT(guid) DO UPDATE SET
                title=excluded.title,
                link=excluded.link,
                summary=excluded.summary,
                published=excluded.published,
                content_html=excluded.content_html,
                top_image=excluded.top_image,
                last_fetched=datetime('now')
        """, row)
        con.commit()
62
+
63
def get_latest(limit=30):
    """Return up to `limit` newest articles as dicts (without their HTML bodies)."""
    cols = ["guid", "title", "link", "summary", "published", "top_image"]
    with get_conn() as con:
        cur = con.execute("""
            SELECT guid, title, link, summary, published, top_image
            FROM articles
            ORDER BY datetime(published) DESC, rowid DESC
            LIMIT ?
        """, (limit,))
        rows = cur.fetchall()
    return [dict(zip(cols, row)) for row in rows]
74
+
75
def get_article_by_guid(guid: str):
    """Fetch a single article row by GUID; returns None when absent."""
    cols = ["guid", "title", "link", "summary", "published",
            "content_html", "top_image", "last_fetched"]
    with get_conn() as con:
        row = con.execute("""
            SELECT guid, title, link, summary, published, content_html, top_image, last_fetched
            FROM articles WHERE guid=?
        """, (guid,)).fetchone()
    return dict(zip(cols, row)) if row else None
86
+
docker-compose.yml ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
version: '3.8'

services:
  pakistani-news-app:
    build: .
    ports:
      - "5000:5000"
    environment:
      - FLASK_ENV=production
      - FLASK_APP=app.py
      # app.py reads PORT and defaults to 7860; without this override the
      # 5000:5000 mapping and the healthcheck below never reach the app.
      - PORT=5000
    volumes:
      - ./data:/app/data # Persist database
    restart: unless-stopped
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:5000/"]
      interval: 30s
      timeout: 10s
      retries: 3
      start_period: 40s
fetcher.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import feedparser
3
+ from bs4 import BeautifulSoup
4
+ from readability import Document
5
+ from dateutil import parser as dateparser
6
+ from datetime import datetime, timezone
7
+ import time
8
+ from typing import Optional
9
+ from config import USER_AGENT
10
+ from urllib.parse import urljoin
11
+
12
def http_get(url, timeout=30):
    """GET `url` with browser-like headers, retrying once without TLS verification.

    The Referer/Accept headers make the request look like a normal feed
    reader, which some publishers require.
    """
    headers = {
        "User-Agent": USER_AGENT,
        "Accept-Language": "en-US,en;q=0.9",
        "Referer": "https://duckduckgo.com/",
        "Connection": "keep-alive",
        "Accept": "application/rss+xml, application/xml, text/xml",
    }
    try:
        return requests.get(url, headers=headers, timeout=timeout, verify=True)
    except requests.exceptions.SSLError:
        # Retry without SSL verification as fallback
        # SECURITY NOTE(review): verify=False disables certificate checking and
        # exposes this request to man-in-the-middle attacks — consider at least
        # logging when this path is taken, or allow-listing affected hosts.
        return requests.get(url, headers=headers, timeout=timeout, verify=False)
25
+
26
def parse_rss(rss_url: str):
    """Download a feed via http_get (custom headers/TLS fallback), then parse it.

    Raises requests.HTTPError on non-2xx responses.
    """
    response = http_get(rss_url)
    response.raise_for_status()
    return feedparser.parse(response.content)
31
+
32
def iso_dt(s: Optional[str]) -> str:
    """Normalize a feed date string to an ISO-8601 UTC timestamp.

    Naive datetimes are assumed to already be UTC. Missing or unparseable
    input falls back to the current UTC time.
    """
    if s:
        try:
            parsed = dateparser.parse(s)
            if not parsed.tzinfo:
                parsed = parsed.replace(tzinfo=timezone.utc)
            return parsed.astimezone(timezone.utc).isoformat()
        except Exception:
            pass
    return datetime.now(timezone.utc).isoformat()
42
+
43
def extract_article_content(url: str):
    """
    Fetch article page and extract readable HTML + a top image if possible.
    Uses readability + BeautifulSoup cleanup. Works for most publisher pages.

    Returns a dict with keys "content_html", "top_image", "title"; all three
    are None when the page cannot be fetched at all.
    """
    try:
        resp = http_get(url, timeout=25)
        resp.raise_for_status()
    except Exception:
        # Network/HTTP failure: signal "nothing extracted" to the caller.
        return {"content_html": None, "top_image": None, "title": None}

    html = resp.text
    content_html = None
    title = None
    # Try readability first
    try:
        doc = Document(html)
        content_html = doc.summary(html_partial=True)
        title = doc.short_title()
    except Exception:
        content_html = None
        title = None

    # Fallback: use BeautifulSoup to extract <article> or largest <div> with text
    if not content_html:
        soup = BeautifulSoup(html, "lxml")
        article_tag = soup.find("article")
        if article_tag:
            # Clean up unwanted elements
            for unwanted in article_tag.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
                unwanted.decompose()
            # Remove social media buttons and ads
            for unwanted in article_tag.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
                unwanted.decompose()
            # Remove unwanted text patterns
            # NOTE(review): bs4 treats text= as a deprecated alias of string= —
            # confirm the installed bs4 version before modernizing this kwarg.
            for unwanted in article_tag.find_all(text=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
                if unwanted.parent:
                    unwanted.parent.decompose()
            content_html = str(article_tag)
        else:
            # fallback: find the largest <div> with text
            divs = soup.find_all("div")
            largest = max(divs, key=lambda d: len(d.get_text(strip=True)), default=None)
            if largest and len(largest.get_text(strip=True)) > 200:
                # Clean up unwanted elements
                for unwanted in largest.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
                    unwanted.decompose()
                # Remove social media and navigation elements
                for unwanted in largest.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
                    unwanted.decompose()
                # Remove unwanted text patterns
                for unwanted in largest.find_all(text=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
                    if unwanted.parent:
                        unwanted.parent.decompose()
                content_html = str(largest)
            else:
                # fallback: just use the body
                body = soup.find("body")
                if body:
                    content_html = str(body)

    # Try to find a good image: prefer the og:image meta, then the first
    # <img> inside <article>, then the first <img> anywhere on the page.
    top_image = None
    try:
        soup = BeautifulSoup(html, "lxml")
        og = soup.find("meta", property="og:image") or soup.find("meta", attrs={"name":"og:image"})
        if og and og.get("content"):
            top_image = urljoin(url, og["content"])
        else:
            img = soup.find("article")
            img = img.find("img") if img else soup.find("img")
            if img and img.get("src"):
                top_image = urljoin(url, img["src"])
    except Exception:
        pass

    return {"content_html": content_html, "top_image": top_image, "title": title}
120
+
gradio_app.py ADDED
File without changes
keywords.json ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "india",
3
+ "indian",
4
+ "kashmir",
5
+ "jammu",
6
+ "Indian Army",
7
+ "Indian Politics",
8
+ "Modi",
9
+ "New Delhi",
10
+ "defense",
11
+ "security",
12
+ "indian development",
13
+ "infrastructure",
14
+ "education",
15
+ "indian economy",
16
+ "technology",
17
+ "science",
18
+ "Indian Border",
19
+ "India Air Force",
20
+ "Indian Navy",
21
+ "BJP",
22
+ "Congress",
23
+ "Lok Sabha",
24
+ "RSS",
25
+ "Hindu",
26
+ "Hindutva",
27
+ "Mumbai",
28
+ "Delhi",
29
+ "Gujarat",
30
+ "Punjab",
31
+ "Rajasthan",
32
+ "Uttar Pradesh",
33
+ "West Bengal",
34
+ "Tamil Nadu",
35
+ "Karnataka",
36
+ "Maharashtra",
37
+ "Bollywood",
38
+ "cricket",
39
+ "BCCI",
40
+ "IPL",
41
+ "Indo-Pak",
42
+ "bilateral",
43
+ "trade",
44
+ "diplomatic",
45
+ "foreign ministry",
46
+ "RAW",
47
+ "intelligence",
48
+ "terrorism",
49
+ "ceasefire",
50
+ "violation",
51
+ "infiltration",
52
+ "Siachen",
53
+ "Kargil",
54
+ "Azad Kashmir"
55
+ ]
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ Flask
2
+ APScheduler
3
+ requests
4
+ feedparser
5
+ beautifulsoup4
6
+ readability-lxml
7
+ python-dateutil
8
+ lxml
requirements_streamlit.txt ADDED
File without changes
run_streamlit.bat ADDED
File without changes
scraper.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import json
4
+ import datetime
5
+
6
+ # Replace with the correct GUID News URL
7
+ URL = "https://www.theguidenews.com/latest-news"
8
+
9
def scrape_guid_news():
    """Scrape the latest-news listing page and dump the results to news.json.

    NOTE(review): the selectors below ("news-card" divs, first h2/a/img/p)
    are guesses at the target site's markup — verify against the live page
    before relying on this scraper.
    """
    response = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.content, "lxml")

    news_list = []
    articles = soup.find_all("div", class_="news-card")  # Adjust class based on HTML structure

    for article in articles:
        try:
            title = article.find("h2").get_text(strip=True)
            link = article.find("a")["href"]
            image_tag = article.find("img")
            image = image_tag["src"] if image_tag else None
            summary = article.find("p").get_text(strip=True)
            # Timestamp of the scrape, not of publication.
            date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            news_list.append({
                "title": title,
                "link": link,
                "image": image,
                "summary": summary,
                "date": date
            })
        except Exception as e:
            # A card missing h2/a/p: skip it rather than abort the whole run.
            print(f"Skipping one article due to error: {e}")
            continue

    with open("news.json", "w", encoding="utf-8") as f:
        json.dump(news_list, f, indent=4, ensure_ascii=False)

    print(f"[+] Scraped {len(news_list)} news articles successfully!")
40
+
41
+ if __name__ == "__main__":
42
+ scrape_guid_news()
static/styles.css ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
/* Shared styles for the news aggregator UI. */

/* Uniform thumbnail height on the index-page cards. */
.img-cover {
  height: 180px;
  object-fit: cover;
}

/* Scraped article bodies (.article-content / .article-body): keep embedded
   media inside the card and give text comfortable spacing. */
.article-content {
  line-height: 1.6;
}

.article-content img {
  max-width: 100%;
  height: auto;
  margin: 15px 0;
}

.article-content figure {
  max-width: 100%;
  margin: 15px 0;
}

.article-content iframe {
  max-width: 100%;
}

.article-content p {
  margin-bottom: 15px;
}

.article-content h1, .article-content h2, .article-content h3 {
  margin-top: 20px;
  margin-bottom: 15px;
}

.article-body img {
  max-width: 100%;
  height: auto;
}
.article-body figure {
  max-width: 100%;
}
.article-body iframe {
  max-width: 100%;
}
44
+
streamlit_app.py ADDED
File without changes
templates/article.html ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Single-article view: optional hero image, title, published/original-link
   line, then the scraped HTML body (feed summary as fallback).
   NOTE(review): the replace() chain below strips the literal words
   "Read more"/"Email"/"Subscribe"/"Share"/"Tweet" ANYWHERE in the body,
   so legitimate occurrences inside article text are silently removed. #}
{% extends "base.html" %}
{% block content %}
<div class="container">
  <div class="row">
    <div class="col-12">
      <article class="card shadow-sm">
        {% if item.top_image %}
        <img
          src="{{ item.top_image }}"
          class="card-img-top"
          alt="Article Image"
          style="height: 300px; object-fit: cover;"
        />
        {% endif %}
        <div class="card-body">
          <h1 class="card-title">{{ item.title }}</h1>
          <div class="text-muted small mb-3">
            {% if item.published %}
            Published: {{ item.published }} |
            {% endif %}
            <a href="{{ item.link }}" target="_blank" rel="noopener">View Original</a>
          </div>

          {% if item.content_html %}
          <div class="article-content">
            {% set cleaned_content = item.content_html | replace("Read more", "") | replace("Email", "") | replace("Subscribe", "") | replace("Share", "") | replace("Tweet", "") %}
            {{ cleaned_content | safe }}
          </div>
          {% else %}
          <p>{{ item.summary }}</p>
          {% endif %}
        </div>
      </article>
    </div>
  </div>
</div>
{% endblock %}
38
+
templates/base.html ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Shared layout: Bootstrap shell with a dark navbar (logo, app title, feed
   source, login/logout controls) and a footer. Child templates fill the
   "content" block. #}
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>{{ app_title }}</title>
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css"
      rel="stylesheet"
    />
    <link
      rel="stylesheet"
      href="{{ url_for('static', filename='styles.css') }}"
    />
  </head>
  <body class="bg-light">
    <nav class="navbar navbar-expand-lg navbar-dark bg-dark">
      <div class="container d-flex align-items-center">
        <img
          src="{{ url_for('static', filename='image.png') }}"
          alt="Logo"
          style="height: 40px; margin-right: 12px"
        />
        <a class="navbar-brand fw-bold" href="/">{{ app_title }}</a>
        <span class="navbar-text text-secondary ms-2"
          >Source: {{ source_name }}</span
        >
        <div class="ms-auto">
          {% if session.get('username') %}
          <span class="text-light me-2">Hello, {{ session['username'] }}</span>
          <a href="{{ url_for('logout') }}" class="btn btn-outline-light btn-sm"
            >Logout</a
          >
          {% else %}
          <a href="{{ url_for('login') }}" class="btn btn-outline-light btn-sm"
            >Login</a
          >
          {% endif %}
        </div>
      </div>
    </nav>
    <main class="container my-4">{% block content %}{% endblock %}</main>
    <footer class="border-top py-3">
      <div class="container small text-muted">
        Updated hourly • {{ source_name }}
      </div>
    </footer>
  </body>
</html>
templates/index.html ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Home grid: one Bootstrap card per aggregated item, with thumbnail,
   teaser summary, internal "Read Article" link (by GUID) and a direct
   link to the original publisher page. #}
{% extends "base.html" %} {% block content %}
<h1 class="h3 mb-4">Latest News</h1>

<div class="row g-4">
  {% for it in items %}
  <div class="col-12 col-md-6 col-lg-4">
    <article class="card h-100 shadow-sm">
      {% if it.top_image %}
      <img
        src="{{ it.top_image }}"
        class="card-img-top img-cover"
        alt="cover"
      />
      {% endif %}
      <div class="card-body d-flex flex-column">
        <h2 class="h5 card-title">{{ it.title }}</h2>
        <p class="card-text text-muted small">{{ it.summary }}</p>
        <div class="mt-auto d-flex gap-2">
          <a
            class="btn btn-sm btn-primary"
            href="{{ url_for('article_page', guid=it.guid) }}"
            >Read Article</a
          >
          <a
            class="btn btn-sm btn-outline-secondary"
            href="{{ it.link }}"
            target="_blank"
            rel="noopener"
            >Original</a
          >
        </div>
        {% if it.published %}
        <div class="text-muted small mt-2">{{ it.published }}</div>
        {% endif %}
        <div class="text-muted small">Source: {{ it.source }}</div>
      </div>
    </article>
  </div>
  {% endfor %}
</div>
{% endblock %}
templates/login.html ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Standalone login page (does not extend base.html): logo, flashed
   messages, and a POST form handled by the /login route. #}
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>{{ app_title }} - Login</title>
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <link
      href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css"
      rel="stylesheet"
    />
    <link
      rel="stylesheet"
      href="{{ url_for('static', filename='styles.css') }}"
    />
  </head>
  <body class="bg-light">
    <div class="container mt-5" style="max-width: 400px">
      <div class="text-center mb-4">
        <img
          src="{{ url_for('static', filename='image.png') }}"
          alt="Header Logo"
          style="max-width: 120px"
        />
        <h2 class="mt-2">{{ app_title }}</h2>
      </div>
      {# Flash messages use their category as the Bootstrap alert flavor. #}
      {% with messages = get_flashed_messages(with_categories=true) %} {% if
      messages %} {% for category, message in messages %}
      <div class="alert alert-{{ category }}">{{ message }}</div>
      {% endfor %} {% endif %} {% endwith %}
      <form method="post" class="card card-body shadow-sm">
        <div class="mb-3">
          <label for="username" class="form-label">Username</label>
          <input
            type="text"
            class="form-control"
            id="username"
            name="username"
            required
            autofocus
          />
        </div>
        <div class="mb-3">
          <label for="password" class="form-label">Password</label>
          <input
            type="password"
            class="form-control"
            id="password"
            name="password"
            required
          />
        </div>
        <button type="submit" class="btn btn-primary w-100">Login</button>
      </form>
    </div>
  </body>
</html>
templates/search_results.html ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
{# Results of /search-news: summary-only cards linking straight to the
   publisher (no internal article page), plus an empty-state alert. #}
{% extends "base.html" %}
{% block content %}
<h1 class="h3 mb-4">Search Results (by Keywords)</h1>
<div class="row g-4">
  {% if results %}
  {% for it in results %}
  <div class="col-12 col-md-6 col-lg-4">
    <article class="card h-100 shadow-sm">
      <div class="card-body d-flex flex-column">
        <h2 class="h5 card-title">{{ it.title }}</h2>
        <p class="card-text text-muted small">{{ it.summary }}</p>
        <div class="mt-auto d-flex gap-2">
          <a class="btn btn-sm btn-primary" href="{{ it.link }}" target="_blank" rel="noopener">Read Original</a>
        </div>
        {% if it.published %}
        <div class="text-muted small mt-2">{{ it.published }}</div>
        {% endif %}
        <div class="text-muted small mt-2">Source: {{ it.source }}</div>
      </div>
    </article>
  </div>
  {% endfor %}
  {% else %}
  <div class="col-12">
    <div class="alert alert-info">No articles found for the given keywords.</div>
  </div>
  {% endif %}
</div>
{% endblock %}
users.json ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "users": [
3
+ {
4
+ "username": "test",
5
+ "password": "test"
6
+ }
7
+ ]
8
+ }