Commit · 796e8b5

Initial commit: Pakistani News Aggregator Flask App

Files changed:

- .dockerignore +36 -0
- .gitignore +1 -0
- Dockerfile +26 -0
- README.md +42 -0
- README_STREAMLIT.md +0 -0
- __pycache__/config.cpython-39.pyc +0 -0
- __pycache__/country_feeds_config.cpython-39.pyc +0 -0
- __pycache__/db.cpython-39.pyc +0 -0
- __pycache__/fetcher.cpython-39.pyc +0 -0
- __pycache__/opml_util.cpython-39.pyc +0 -0
- __pycache__/proxy_config.cpython-39.pyc +0 -0
- app.py +300 -0
- config.py +12 -0
- db.py +86 -0
- docker-compose.yml +19 -0
- fetcher.py +120 -0
- gradio_app.py +0 -0
- keywords.json +55 -0
- requirements.txt +8 -0
- requirements_streamlit.txt +0 -0
- run_streamlit.bat +0 -0
- scraper.py +42 -0
- static/styles.css +44 -0
- streamlit_app.py +0 -0
- templates/article.html +38 -0
- templates/base.html +49 -0
- templates/index.html +41 -0
- templates/login.html +56 -0
- templates/search_results.html +29 -0
- users.json +8 -0
.dockerignore ADDED
@@ -0,0 +1,36 @@
+__pycache__
+*.pyc
+*.pyo
+*.pyd
+.Python
+env
+pip-log.txt
+pip-delete-this-directory.txt
+.tox
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.log
+.git
+.mypy_cache
+.pytest_cache
+.hypothesis
+
+.DS_Store
+.vscode
+*.swp
+*.swo
+
+news.db
+*.db
+
+node_modules
+npm-debug.log*
+
+.dockerignore
+Dockerfile
+README.md
+.gitignore
.gitignore ADDED
@@ -0,0 +1 @@
+news.db
Dockerfile ADDED
@@ -0,0 +1,26 @@
+# Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
+# You will also find guides on how best to write your Dockerfile
+
+FROM python:3.9
+
+RUN useradd -m -u 1000 user
+USER user
+ENV PATH="/home/user/.local/bin:$PATH"
+
+WORKDIR /app
+
+COPY --chown=user ./requirements.txt requirements.txt
+RUN pip install --no-cache-dir --upgrade -r requirements.txt
+
+COPY --chown=user . /app
+
+# Expose port 7860 for Hugging Face Spaces
+EXPOSE 7860
+
+# Set environment variables for Flask
+ENV FLASK_APP=app.py
+ENV FLASK_RUN_HOST=0.0.0.0
+ENV FLASK_RUN_PORT=7860
+
+# Run Flask app directly
+CMD ["python", "app.py"]
README.md ADDED
@@ -0,0 +1,42 @@
+---
+title: Pakistani News Aggregator
+emoji: 📰
+colorFrom: blue
+colorTo: red
+sdk: docker
+pinned: false
+license: mit
+---
+
+# Pakistani News Aggregator
+
+A Flask-based news aggregator that fetches and filters Pakistani news articles based on India-related keywords. The application scrapes content from multiple Pakistani news sources and displays clean, readable articles.
+
+## Features
+
+- **Multi-source RSS aggregation** from Pakistani news outlets
+- **Keyword-based filtering** for India-related content
+- **Content scraping** with clean text extraction
+- **User authentication** system
+- **Responsive web interface**
+
+## News Sources
+
+- Dawn.com
+- Express.pk
+- BOL News
+- UrduPoint
+- The Pakistan
+
+## Keywords
+
+The system filters news based on keywords related to:
+- India and Indian affairs
+- Kashmir and regional politics
+- Bilateral relations
+- Security and defense
+- Economic and cultural news
+
+## Usage
+
+The application automatically fetches and displays the latest news articles that match the configured keywords, providing a Pakistani perspective on India-related developments.
README_STREAMLIT.md ADDED (file without changes)

__pycache__/config.cpython-39.pyc ADDED (binary file, 525 Bytes)
__pycache__/country_feeds_config.cpython-39.pyc ADDED (binary file, 217 Bytes)
__pycache__/db.cpython-39.pyc ADDED (binary file, 3.04 kB)
__pycache__/fetcher.cpython-39.pyc ADDED (binary file, 3.33 kB)
__pycache__/opml_util.cpython-39.pyc ADDED (binary file, 1.32 kB)
__pycache__/proxy_config.cpython-39.pyc ADDED (binary file, 845 Bytes)
app.py ADDED
@@ -0,0 +1,300 @@
+from flask import Flask, render_template, abort, jsonify, request, redirect, url_for, session, flash
+from apscheduler.schedulers.background import BackgroundScheduler
+from bs4 import BeautifulSoup
+from datetime import datetime
+import traceback
+import os
+import re
+import json
+
+from config import RSS_URL, APP_TITLE, APP_SOURCE_NAME, FETCH_INTERVAL_MINUTES, DATABASE, TIMEZONE
+from db import init_db, upsert_article, get_latest, get_article_by_guid
+from fetcher import parse_rss, extract_article_content, iso_dt
+
+# List of RSS feeds to search
+SEARCH_RSS_FEEDS = [
+    "https://www.dawn.com/feeds/home",
+    "https://www.express.pk/feed/",
+    "https://www.bolnews.com/feed/",
+    "https://www.urdupoint.com/en/sitemap/news.rs",
+    "https://thepakistan.pk/feed/"
+]
+
+KEYWORDS_FILE = os.path.join(os.path.dirname(__file__), 'keywords.json')
+
+def load_keywords():
+    # keywords.json is a plain JSON array of strings, so json.load is sufficient
+    with open(KEYWORDS_FILE, 'r', encoding='utf-8') as f:
+        keywords = json.load(f)
+    return [k.strip() for k in keywords if k and k.strip()]
+
+
+app = Flask(__name__)
+# Read the session secret from the environment; the fallback is for local development only
+app.secret_key = os.environ.get('SECRET_KEY', 'dev-secret-change-me')
+
+# Ensure DB is initialized for all environments (including production)
+init_db(DATABASE)
+
+USERS_FILE = os.path.join(os.path.dirname(__file__), 'users.json')
+
+def load_users():
+    with open(USERS_FILE, 'r') as f:
+        data = json.load(f)
+    return data.get('users', [])
+
+def check_user(username, password):
+    users = load_users()
+    for user in users:
+        if user['username'] == username and user['password'] == password:
+            return True
+    return False
+
+@app.route('/login', methods=['GET', 'POST'])
+def login():
+    if request.method == 'POST':
+        username = request.form['username']
+        password = request.form['password']
+        if check_user(username, password):
+            session['username'] = username
+            flash('Login successful!', 'success')
+            return redirect(url_for('home'))
+        else:
+            flash('Invalid username or password', 'danger')
+    return render_template('login.html', app_title=APP_TITLE)
+
+@app.route('/logout')
+def logout():
+    session.pop('username', None)
+    flash('Logged out successfully.', 'info')
+    return redirect(url_for('login'))
+
+def fetch_and_update():
+    """
+    1) Parse RSS
+    2) For each item, fetch the full article page
+    3) Upsert into SQLite keyed by GUID
+    """
+    try:
+        feed = parse_rss(RSS_URL)
+        for entry in feed.entries:
+            guid = entry.get("id") or entry.get("guid") or entry.get("link")
+            if not guid:
+                continue
+            link = entry.get("link")
+            title = entry.get("title", "").strip()
+            summary = (entry.get("summary") or entry.get("description") or "").strip()
+            published = iso_dt(entry.get("published"))
+
+            # Get the full content
+            full = extract_article_content(link) if link else {"content_html": None, "top_image": None, "title": None}
+            content_html = full["content_html"] or f"<p>{summary}</p>"
+            top_image = full.get("top_image")
+
+            # Prefer the page title when it is at least roughly as long as the feed title
+            if full.get("title") and len(full["title"]) > len(title) - 8:
+                title = full["title"]
+
+            art = {
+                "guid": guid,
+                "title": title,
+                "link": link,
+                "summary": summary,
+                "published": published,
+                "content_html": content_html,
+                "top_image": top_image
+            }
+            upsert_article(art)
+
+        print(f"[{datetime.now()}] Fetched & updated.")
+    except Exception as e:
+        print("Fetch error:", e)
+        traceback.print_exc()
+
+
+# The initial fetch on startup is disabled to avoid RSS_URL timeout errors
+
+
+# Home page fetches and displays news from the five RSS feeds, filtered by keywords
+@app.route("/")
+def home():
+    if 'username' not in session:
+        return redirect(url_for('login'))
+    keywords = load_keywords()
+    # Case-insensitive whole-word match over every keyword
+    pattern = re.compile(r"\b(" + "|".join(re.escape(k) for k in keywords) + r")\b", re.IGNORECASE)
+
+    all_results = []
+    for feed_url in SEARCH_RSS_FEEDS:
+        try:
+            print(f"Fetching from {feed_url}...")
+            feed = parse_rss(feed_url)
+            articles_found = 0
+            for entry in feed.entries:
+                title = entry.get("title", "")
+                summary = entry.get("summary", entry.get("description", ""))
+                # Check both title and summary for keywords, plus a broad "india" substring catch-all
+                is_relevant = (pattern.search(title) or pattern.search(summary) or
+                               "india" in title.lower() or "india" in summary.lower())
+
+                if is_relevant:
+                    # Extract full article content
+                    article_url = entry.get("link")
+                    if article_url:
+                        try:
+                            print(f"Extracting content from: {article_url}")
+                            full_content = extract_article_content(article_url)
+                            content_html = full_content.get("content_html") or summary
+                            top_image = full_content.get("top_image")
+
+                            # Strip HTML tags from the content for the list view
+                            if content_html:
+                                soup = BeautifulSoup(content_html, 'html.parser')
+                                clean_content = soup.get_text(separator=' ', strip=True)
+                                # Truncate for display
+                                if len(clean_content) > 500:
+                                    clean_content = clean_content[:500] + "..."
+                            else:
+                                clean_content = summary
+
+                            all_results.append({
+                                "title": title,
+                                "link": article_url,
+                                "summary": clean_content,
+                                "content_html": content_html,
+                                "top_image": top_image,
+                                "published": entry.get("published"),
+                                "source": feed_url.split('/')[2],
+                                "guid": entry.get("id", article_url)
+                            })
+                            articles_found += 1
+                        except Exception as e:
+                            print(f"Error extracting content from {article_url}: {e}")
+                            # Fall back to the feed summary only
+                            clean_summary = BeautifulSoup(summary, 'html.parser').get_text(separator=' ', strip=True) if summary else "No summary available"
+
+                            all_results.append({
+                                "title": title,
+                                "link": article_url,
+                                "summary": clean_summary,
+                                "content_html": f"<p>{summary}</p>",
+                                "top_image": None,
+                                "published": entry.get("published"),
+                                "source": feed_url.split('/')[2],
+                                "guid": entry.get("id", article_url)
+                            })
+                            articles_found += 1
+            print(f"Found {articles_found} matching articles from {feed_url}")
+        except Exception as e:
+            print(f"Error fetching {feed_url}: {e}")
+            continue
+    # Sort by published date if available
+    all_results.sort(key=lambda x: x.get("published") or "", reverse=True)
+    return render_template("index.html",
+                           items=all_results,
+                           app_title=APP_TITLE,
+                           source_name="Aggregated News",
+                           username=session.get('username'))
+
+@app.route("/article/<path:guid>")
+def article_page(guid):
+    item = get_article_by_guid(guid)
+    if not item:
+        # If not found in the database, try to extract content from the URL directly
+        if guid.startswith('http'):
+            try:
+                full = extract_article_content(guid)
+                item = {
+                    "guid": guid,
+                    "title": full.get("title") or "Article",
+                    "link": guid,
+                    "summary": "",
+                    "published": datetime.now().isoformat(),
+                    "content_html": full.get("content_html") or "<p>Content could not be extracted.</p>",
+                    "top_image": full.get("top_image")
+                }
+            except Exception as e:
+                print(f"Error extracting article content: {e}")
+                abort(404)
+        else:
+            abort(404)
+
+    # If content_html is missing, fetch and update it automatically
+    if not item.get("content_html") and item.get("link"):
+        full = extract_article_content(item["link"])
+        item["content_html"] = full["content_html"]
+        item["top_image"] = full.get("top_image")
+        # Optionally update the title if available
+        if full.get("title") and len(full["title"]) > len(item["title"]) - 8:
+            item["title"] = full["title"]
+        # Only upsert if we have a proper GUID (not a URL)
+        if not item["guid"].startswith('http'):
+            upsert_article(item)
+
+    return render_template("article.html",
+                           item=item,
+                           app_title=APP_TITLE,
+                           source_name=APP_SOURCE_NAME)
+
+@app.route("/api/article/<path:guid>")
+def api_article(guid):
+    item = get_article_by_guid(guid)
+    if not item:
+        return jsonify({"error": "not found"}), 404
+    return jsonify(item)
+
+def start_scheduler():
+    scheduler = BackgroundScheduler(timezone=TIMEZONE)
+    # Periodic refresh; the immediate first run stays disabled (see note above)
+    scheduler.add_job(fetch_and_update, "interval", minutes=FETCH_INTERVAL_MINUTES)
+    scheduler.start()
+
+
+# --- Route for searching news by keywords ---
+@app.route("/search-news")
+def search_news():
+    keywords = load_keywords()
+    pattern = re.compile(r"(" + "|".join(re.escape(k) for k in keywords) + r")", re.IGNORECASE)
+    all_results = []
+    for feed_url in SEARCH_RSS_FEEDS:
+        try:
+            feed = parse_rss(feed_url)
+            for entry in feed.entries:
+                title = entry.get("title", "")
+                summary = entry.get("summary", entry.get("description", ""))
+                if pattern.search(title) or pattern.search(summary):
+                    all_results.append({
+                        "title": title,
+                        "link": entry.get("link"),
+                        "summary": summary,
+                        "published": entry.get("published"),
+                        "source": feed_url
+                    })
+        except Exception as e:
+            print(f"Error fetching {feed_url}: {e}")
+    # Sort by published date if available
+    all_results.sort(key=lambda x: x.get("published") or "", reverse=True)
+    return render_template("search_results.html", results=all_results, app_title=APP_TITLE, source_name="Aggregated News")
+
+
+if __name__ == "__main__":
+    port = int(os.environ.get("PORT", 7860))
+    app.run(host="0.0.0.0", port=port, debug=False)
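The keyword filter in `home()` boils down to a single escaped, word-boundary regex over all keywords. A minimal stdlib sketch of the same construction, using a small hypothetical keyword sample rather than the full keywords.json list:

```python
import re

# Hypothetical sample of the keywords loaded from keywords.json
keywords = ["india", "kashmir", "Indo-Pak", "Lok Sabha"]

# Same construction as home(): escape each keyword, join with '|',
# and wrap in \b word boundaries for case-insensitive whole-word matching
pattern = re.compile(r"\b(" + "|".join(re.escape(k) for k in keywords) + r")\b",
                     re.IGNORECASE)

def is_relevant(title, summary):
    # Mirrors the check in home(): a match in either field is enough
    return bool(pattern.search(title) or pattern.search(summary))

print(is_relevant("Kashmir talks resume", ""))        # True: whole-word match
print(is_relevant("Indiana storm warning", ""))       # False: \b blocks partial matches
print(is_relevant("", "Indo-Pak trade figures out"))  # True: hyphen is escaped and matched literally
```

`re.escape` matters here because keywords like "Indo-Pak" contain regex metacharacters; without it the hyphen inside a character class context could change the pattern's meaning.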
config.py ADDED
@@ -0,0 +1,12 @@
+RSS_URL = "https://www.geo.tv/rss/1/1"  # Pakistani news RSS feed
+APP_TITLE = "Pakistan News Aggregator"
+APP_SOURCE_NAME = "Pakistani News Sources"
+FETCH_INTERVAL_MINUTES = 60  # every 1 hour
+USER_AGENT = (
+    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+    "AppleWebKit/537.36 (KHTML, like Gecko) "
+    "Chrome/124.0.0.0 Safari/537.36"
+)
+DATABASE = "news.db"
+TIMEZONE = "UTC"
db.py ADDED
@@ -0,0 +1,86 @@
+import sqlite3
+import os
+from contextlib import contextmanager
+
+DB_PATH = None  # set by init_db
+
+def init_db(db_path: str):
+    global DB_PATH
+    # Ensure the database is created in the correct location (relative to this file)
+    if not os.path.isabs(db_path):
+        db_path = os.path.join(os.path.dirname(__file__), db_path)
+    DB_PATH = db_path
+    # This will create the DB file if it does not exist
+    with sqlite3.connect(DB_PATH) as con:
+        con.execute("""
+            CREATE TABLE IF NOT EXISTS articles (
+                guid TEXT PRIMARY KEY,
+                title TEXT,
+                link TEXT,
+                summary TEXT,
+                published TEXT,
+                content_html TEXT,
+                top_image TEXT,
+                last_fetched TEXT
+            )
+        """)
+        con.execute("CREATE INDEX IF NOT EXISTS idx_published ON articles(published)")
+        con.commit()
+
+@contextmanager
+def get_conn():
+    con = sqlite3.connect(DB_PATH)
+    try:
+        yield con
+    finally:
+        con.close()
+
+def upsert_article(article):
+    with get_conn() as con:
+        con.execute("""
+            INSERT INTO articles (guid, title, link, summary, published, content_html, top_image, last_fetched)
+            VALUES (?, ?, ?, ?, ?, ?, ?, datetime('now'))
+            ON CONFLICT(guid) DO UPDATE SET
+                title=excluded.title,
+                link=excluded.link,
+                summary=excluded.summary,
+                published=excluded.published,
+                content_html=excluded.content_html,
+                top_image=excluded.top_image,
+                last_fetched=datetime('now')
+        """, (
+            article["guid"],
+            article.get("title"),
+            article.get("link"),
+            article.get("summary"),
+            article.get("published"),
+            article.get("content_html"),
+            article.get("top_image"),
+        ))
+        con.commit()
+
+def get_latest(limit=30):
+    with get_conn() as con:
+        cur = con.execute("""
+            SELECT guid, title, link, summary, published, top_image
+            FROM articles
+            ORDER BY datetime(published) DESC, rowid DESC
+            LIMIT ?
+        """, (limit,))
+        rows = cur.fetchall()
+        cols = ["guid", "title", "link", "summary", "published", "top_image"]
+        return [dict(zip(cols, r)) for r in rows]
+
+def get_article_by_guid(guid: str):
+    with get_conn() as con:
+        cur = con.execute("""
+            SELECT guid, title, link, summary, published, content_html, top_image, last_fetched
+            FROM articles WHERE guid=?
+        """, (guid,))
+        row = cur.fetchone()
+        if not row:
+            return None
+        cols = ["guid", "title", "link", "summary", "published", "content_html", "top_image", "last_fetched"]
+        return dict(zip(cols, row))
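The upsert in db.py relies on SQLite's `INSERT ... ON CONFLICT(guid) DO UPDATE` clause (available since SQLite 3.24), which inserts a new row or updates the existing one in a single statement. A self-contained in-memory sketch of the same pattern, with a reduced hypothetical schema:

```python
import sqlite3

# In-memory stand-in for the articles table, exercising the same
# INSERT ... ON CONFLICT upsert used by upsert_article()
con = sqlite3.connect(":memory:")
con.execute("CREATE TABLE articles (guid TEXT PRIMARY KEY, title TEXT, last_fetched TEXT)")

def upsert(guid, title):
    # Insert a new row, or update the row that already holds this GUID
    con.execute("""
        INSERT INTO articles (guid, title, last_fetched)
        VALUES (?, ?, datetime('now'))
        ON CONFLICT(guid) DO UPDATE SET
            title=excluded.title,
            last_fetched=datetime('now')
    """, (guid, title))
    con.commit()

upsert("g1", "First headline")
upsert("g1", "Updated headline")  # same GUID: the row is updated, not duplicated

rows = con.execute("SELECT guid, title FROM articles").fetchall()
print(rows)  # [('g1', 'Updated headline')]
```

The `excluded.` prefix refers to the values the failed INSERT tried to write, which is what lets the UPDATE branch reuse them without repeating the parameters.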
docker-compose.yml ADDED
@@ -0,0 +1,19 @@
+version: '3.8'
+
+services:
+  pakistani-news-app:
+    build: .
+    ports:
+      - "5000:5000"
+    environment:
+      - FLASK_ENV=production
+      - FLASK_APP=app.py
+      - PORT=5000  # app.py reads PORT; without it the app listens on 7860 and the healthcheck fails
+    volumes:
+      - ./data:/app/data  # Persist database
+    restart: unless-stopped
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:5000/"]
+      interval: 30s
+      timeout: 10s
+      retries: 3
+      start_period: 40s
fetcher.py ADDED
@@ -0,0 +1,120 @@
+import requests
+import feedparser
+from bs4 import BeautifulSoup
+from readability import Document
+from dateutil import parser as dateparser
+from datetime import datetime, timezone
+from typing import Optional
+from config import USER_AGENT
+from urllib.parse import urljoin
+
+def http_get(url, timeout=30):
+    headers = {
+        "User-Agent": USER_AGENT,
+        "Accept-Language": "en-US,en;q=0.9",
+        "Referer": "https://duckduckgo.com/",
+        "Connection": "keep-alive",
+        "Accept": "application/rss+xml, application/xml, text/xml",
+    }
+    try:
+        return requests.get(url, headers=headers, timeout=timeout, verify=True)
+    except requests.exceptions.SSLError:
+        # Retry without SSL verification as a fallback
+        return requests.get(url, headers=headers, timeout=timeout, verify=False)
+
+def parse_rss(rss_url: str):
+    # Use requests first for better headers + retries
+    r = http_get(rss_url)
+    r.raise_for_status()
+    return feedparser.parse(r.content)
+
+def iso_dt(s: Optional[str]) -> str:
+    if not s:
+        return datetime.now(timezone.utc).isoformat()
+    try:
+        dt = dateparser.parse(s)
+        if not dt.tzinfo:
+            dt = dt.replace(tzinfo=timezone.utc)
+        return dt.astimezone(timezone.utc).isoformat()
+    except Exception:
+        return datetime.now(timezone.utc).isoformat()
+
+def extract_article_content(url: str):
+    """
+    Fetch an article page and extract readable HTML plus a top image if possible.
+    Uses readability with a BeautifulSoup fallback; works for most publisher pages.
+    """
+    try:
+        resp = http_get(url, timeout=25)
+        resp.raise_for_status()
+    except Exception:
+        return {"content_html": None, "top_image": None, "title": None}
+
+    html = resp.text
+    content_html = None
+    title = None
+    # Try readability first
+    try:
+        doc = Document(html)
+        content_html = doc.summary(html_partial=True)
+        title = doc.short_title()
+    except Exception:
+        content_html = None
+        title = None
+
+    # Fallback: use BeautifulSoup to extract <article> or the largest <div> with text
+    if not content_html:
+        soup = BeautifulSoup(html, "lxml")
+        article_tag = soup.find("article")
+        if article_tag:
+            # Clean up unwanted elements
+            for unwanted in article_tag.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
+                unwanted.decompose()
+            # Remove social media buttons and ads
+            for unwanted in article_tag.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
+                unwanted.decompose()
+            # Remove unwanted text patterns
+            for unwanted in article_tag.find_all(text=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
+                if unwanted.parent:
+                    unwanted.parent.decompose()
+            content_html = str(article_tag)
+        else:
+            # Fallback: find the largest <div> with text
+            divs = soup.find_all("div")
+            largest = max(divs, key=lambda d: len(d.get_text(strip=True)), default=None)
+            if largest and len(largest.get_text(strip=True)) > 200:
+                # Clean up unwanted elements
+                for unwanted in largest.find_all(["script", "style", "nav", "header", "footer", "aside", "button", "form"]):
+                    unwanted.decompose()
+                # Remove social media and navigation elements
+                for unwanted in largest.find_all(class_=["share", "social", "btn", "button", "ad", "advertisement", "promo", "read-more", "email", "subscribe"]):
+                    unwanted.decompose()
+                # Remove unwanted text patterns
+                for unwanted in largest.find_all(text=["Read more", "Email", "Subscribe", "Share", "Tweet", "Like"]):
+                    if unwanted.parent:
+                        unwanted.parent.decompose()
+                content_html = str(largest)
+            else:
+                # Last resort: just use the body
+                body = soup.find("body")
+                if body:
+                    content_html = str(body)
+
+    # Try to find a good image
+    top_image = None
+    try:
+        soup = BeautifulSoup(html, "lxml")
+        og = soup.find("meta", property="og:image") or soup.find("meta", attrs={"name": "og:image"})
+        if og and og.get("content"):
+            top_image = urljoin(url, og["content"])
+        else:
+            img = soup.find("article")
+            img = img.find("img") if img else soup.find("img")
+            if img and img.get("src"):
+                top_image = urljoin(url, img["src"])
+    except Exception:
+        pass
+
+    return {"content_html": content_html, "top_image": top_image, "title": title}
gradio_app.py ADDED (file without changes)
keywords.json
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"india",
|
| 3 |
+
"indian",
|
| 4 |
+
"kashmir",
|
| 5 |
+
"jammu",
|
| 6 |
+
"Indian Army",
|
| 7 |
+
"Indian Politics",
|
| 8 |
+
"Modi",
|
| 9 |
+
"New Delhi",
|
| 10 |
+
"defense",
|
| 11 |
+
"security",
|
| 12 |
+
"indian development",
|
| 13 |
+
"infrastructure",
|
| 14 |
+
"education",
|
| 15 |
+
"indian economy",
|
| 16 |
+
"technology",
|
| 17 |
+
"science",
|
| 18 |
+
"Indian Border",
|
| 19 |
+
"India Air Force",
|
| 20 |
+
"Indian Navy",
|
| 21 |
+
"BJP",
|
| 22 |
+
"Congress",
|
| 23 |
+
"Lok Sabha",
|
| 24 |
+
"RSS",
|
| 25 |
+
"Hindu",
|
| 26 |
+
"Hindutva",
|
| 27 |
+
"Mumbai",
|
| 28 |
+
"Delhi",
|
| 29 |
+
"Gujarat",
|
| 30 |
+
"Punjab",
|
| 31 |
+
"Rajasthan",
|
| 32 |
+
"Uttar Pradesh",
|
| 33 |
+
"West Bengal",
|
| 34 |
+
"Tamil Nadu",
|
| 35 |
+
"Karnataka",
|
| 36 |
+
"Maharashtra",
|
| 37 |
+
"Bollywood",
|
| 38 |
+
"cricket",
|
| 39 |
+
"BCCI",
|
| 40 |
+
"IPL",
|
| 41 |
+
"Indo-Pak",
|
| 42 |
+
"bilateral",
|
| 43 |
+
"trade",
|
| 44 |
+
"diplomatic",
|
| 45 |
+
"foreign ministry",
|
| 46 |
+
"RAW",
|
| 47 |
+
"intelligence",
|
| 48 |
+
"terrorism",
|
| 49 |
+
"ceasefire",
|
| 50 |
+
"violation",
|
| 51 |
+
"infiltration",
|
| 52 |
+
"Siachen",
|
| 53 |
+
"Kargil",
|
| 54 |
+
"Azad Kashmir"
|
| 55 |
+
]
|
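keywords.json is a flat list of terms, which suggests a simple filter over fetched articles. A hypothetical sketch of how the app might use it — the actual matching logic lives in app.py, which is not shown in this chunk, so `matches` and its case-insensitive substring rule are assumptions:

```python
import json

# Hypothetical: in the real app this list would come from keywords.json;
# a small inline subset keeps the sketch self-contained.
keywords = json.loads('["kashmir", "Indian Navy", "ceasefire"]')

def matches(article: dict, keywords: list) -> bool:
    # Case-insensitive substring match over title + summary (assumed rule).
    haystack = f"{article.get('title', '')} {article.get('summary', '')}".lower()
    return any(kw.lower() in haystack for kw in keywords)

article = {"title": "Ceasefire violations reported", "summary": "..."}
print(matches(article, keywords))  # True
```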
requirements.txt
ADDED
@@ -0,0 +1,8 @@
Flask
APScheduler
requests
feedparser
beautifulsoup4
readability-lxml
python-dateutil
lxml
requirements_streamlit.txt
ADDED
File without changes

run_streamlit.bat
ADDED
File without changes
scraper.py
ADDED
@@ -0,0 +1,42 @@
import requests
from bs4 import BeautifulSoup
import json
import datetime

# Replace with the correct GUID News URL
URL = "https://www.theguidenews.com/latest-news"

def scrape_guid_news():
    response = requests.get(URL, headers={"User-Agent": "Mozilla/5.0"})
    soup = BeautifulSoup(response.content, "lxml")

    news_list = []
    articles = soup.find_all("div", class_="news-card")  # Adjust class based on HTML structure

    for article in articles:
        try:
            title = article.find("h2").get_text(strip=True)
            link = article.find("a")["href"]
            image_tag = article.find("img")
            image = image_tag["src"] if image_tag else None
            summary = article.find("p").get_text(strip=True)
            date = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")

            news_list.append({
                "title": title,
                "link": link,
                "image": image,
                "summary": summary,
                "date": date
            })
        except Exception as e:
            print(f"Skipping one article due to error: {e}")
            continue

    with open("news.json", "w", encoding="utf-8") as f:
        json.dump(news_list, f, indent=4, ensure_ascii=False)

    print(f"[+] Scraped {len(news_list)} news articles successfully!")

if __name__ == "__main__":
    scrape_guid_news()
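The records that scrape_guid_news() writes to news.json all share one flat shape. A sketch of that shape and the same `json.dump` settings the scraper uses — the field values here are placeholders, not real scraped data:

```python
import json
import datetime

# Placeholder record mirroring the keys scrape_guid_news() emits.
record = {
    "title": "Example headline",
    "link": "https://www.theguidenews.com/example",   # hypothetical URL
    "image": None,                                    # None when no <img> tag was found
    "summary": "One-line summary.",
    "date": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
}

# Same serialization options as the scraper: indent=4, ensure_ascii=False.
payload = json.dumps([record], indent=4, ensure_ascii=False)
print(sorted(json.loads(payload)[0].keys()))
# ['date', 'image', 'link', 'summary', 'title']
```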
static/styles.css
ADDED
@@ -0,0 +1,44 @@
.img-cover {
  height: 180px;
  object-fit: cover;
}

.article-content {
  line-height: 1.6;
}

.article-content img {
  max-width: 100%;
  height: auto;
  margin: 15px 0;
}

.article-content figure {
  max-width: 100%;
  margin: 15px 0;
}

.article-content iframe {
  max-width: 100%;
}

.article-content p {
  margin-bottom: 15px;
}

.article-content h1, .article-content h2, .article-content h3 {
  margin-top: 20px;
  margin-bottom: 15px;
}

.article-body img {
  max-width: 100%;
  height: auto;
}
.article-body figure {
  max-width: 100%;
}
.article-body iframe {
  max-width: 100%;
}
streamlit_app.py
ADDED
File without changes
templates/article.html
ADDED
@@ -0,0 +1,38 @@
{% extends "base.html" %}
{% block content %}
<div class="container">
  <div class="row">
    <div class="col-12">
      <article class="card shadow-sm">
        {% if item.top_image %}
        <img src="{{ item.top_image }}" class="card-img-top" alt="Article Image"
             style="height: 300px; object-fit: cover;" />
        {% endif %}
        <div class="card-body">
          <h1 class="card-title">{{ item.title }}</h1>
          <div class="text-muted small mb-3">
            {% if item.published %}
            Published: {{ item.published }} |
            {% endif %}
            <a href="{{ item.link }}" target="_blank" rel="noopener">View Original</a>
          </div>

          {% if item.content_html %}
          <div class="article-content">
            {% set cleaned_content = item.content_html | replace("Read more", "") | replace("Email", "") | replace("Subscribe", "") | replace("Share", "") | replace("Tweet", "") %}
            {{ cleaned_content | safe }}
          </div>
          {% else %}
          <p>{{ item.summary }}</p>
          {% endif %}
        </div>
      </article>
    </div>
  </div>
</div>
{% endblock %}
templates/base.html
ADDED
@@ -0,0 +1,49 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>{{ app_title }}</title>
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}" />
  </head>
  <body class="bg-light">
    <nav class="navbar navbar-expand-lg navbar-dark bg-dark">
      <div class="container d-flex align-items-center">
        <img src="{{ url_for('static', filename='image.png') }}" alt="Logo" style="height: 40px; margin-right: 12px" />
        <a class="navbar-brand fw-bold" href="/">{{ app_title }}</a>
        <span class="navbar-text text-secondary ms-2">Source: {{ source_name }}</span>
        <div class="ms-auto">
          {% if session.get('username') %}
          <span class="text-light me-2">Hello, {{ session['username'] }}</span>
          <a href="{{ url_for('logout') }}" class="btn btn-outline-light btn-sm">Logout</a>
          {% else %}
          <a href="{{ url_for('login') }}" class="btn btn-outline-light btn-sm">Login</a>
          {% endif %}
        </div>
      </div>
    </nav>
    <main class="container my-4">{% block content %}{% endblock %}</main>
    <footer class="border-top py-3">
      <div class="container small text-muted">
        Updated hourly • {{ source_name }}
      </div>
    </footer>
  </body>
</html>
templates/index.html
ADDED
@@ -0,0 +1,41 @@
{% extends "base.html" %}
{% block content %}
<h1 class="h3 mb-4">Latest News</h1>

<div class="row g-4">
  {% for it in items %}
  <div class="col-12 col-md-6 col-lg-4">
    <article class="card h-100 shadow-sm">
      {% if it.top_image %}
      <img src="{{ it.top_image }}" class="card-img-top img-cover" alt="cover" />
      {% endif %}
      <div class="card-body d-flex flex-column">
        <h2 class="h5 card-title">{{ it.title }}</h2>
        <p class="card-text text-muted small">{{ it.summary }}</p>
        <div class="mt-auto d-flex gap-2">
          <a class="btn btn-sm btn-primary" href="{{ url_for('article_page', guid=it.guid) }}">Read Article</a>
          <a class="btn btn-sm btn-outline-secondary" href="{{ it.link }}" target="_blank" rel="noopener">Original</a>
        </div>
        {% if it.published %}
        <div class="text-muted small mt-2">{{ it.published }}</div>
        {% endif %}
        <div class="text-muted small">Source: {{ it.source }}</div>
      </div>
    </article>
  </div>
  {% endfor %}
</div>
{% endblock %}
templates/login.html
ADDED
@@ -0,0 +1,56 @@
<!DOCTYPE html>
<html lang="en">
  <head>
    <meta charset="utf-8" />
    <title>{{ app_title }} - Login</title>
    <meta name="viewport" content="width=device-width, initial-scale=1" />
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.3/dist/css/bootstrap.min.css" rel="stylesheet" />
    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}" />
  </head>
  <body class="bg-light">
    <div class="container mt-5" style="max-width: 400px">
      <div class="text-center mb-4">
        <img src="{{ url_for('static', filename='image.png') }}" alt="Header Logo" style="max-width: 120px" />
        <h2 class="mt-2">{{ app_title }}</h2>
      </div>
      {% with messages = get_flashed_messages(with_categories=true) %}
      {% if messages %}
      {% for category, message in messages %}
      <div class="alert alert-{{ category }}">{{ message }}</div>
      {% endfor %}
      {% endif %}
      {% endwith %}
      <form method="post" class="card card-body shadow-sm">
        <div class="mb-3">
          <label for="username" class="form-label">Username</label>
          <input type="text" class="form-control" id="username" name="username" required autofocus />
        </div>
        <div class="mb-3">
          <label for="password" class="form-label">Password</label>
          <input type="password" class="form-control" id="password" name="password" required />
        </div>
        <button type="submit" class="btn btn-primary w-100">Login</button>
      </form>
    </div>
  </body>
</html>
templates/search_results.html
ADDED
@@ -0,0 +1,29 @@
{% extends "base.html" %}
{% block content %}
<h1 class="h3 mb-4">Search Results (by Keywords)</h1>
<div class="row g-4">
  {% if results %}
  {% for it in results %}
  <div class="col-12 col-md-6 col-lg-4">
    <article class="card h-100 shadow-sm">
      <div class="card-body d-flex flex-column">
        <h2 class="h5 card-title">{{ it.title }}</h2>
        <p class="card-text text-muted small">{{ it.summary }}</p>
        <div class="mt-auto d-flex gap-2">
          <a class="btn btn-sm btn-primary" href="{{ it.link }}" target="_blank" rel="noopener">Read Original</a>
        </div>
        {% if it.published %}
        <div class="text-muted small mt-2">{{ it.published }}</div>
        {% endif %}
        <div class="text-muted small mt-2">Source: {{ it.source }}</div>
      </div>
    </article>
  </div>
  {% endfor %}
  {% else %}
  <div class="col-12">
    <div class="alert alert-info">No articles found for the given keywords.</div>
  </div>
  {% endif %}
</div>
{% endblock %}
users.json
ADDED
@@ -0,0 +1,8 @@
{
  "users": [
    {
      "username": "test",
      "password": "test"
    }
  ]
}
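users.json holds the login accounts as a plain list of username/password pairs. A hypothetical sketch of the lookup a login route could run against it — the actual check is in app.py, which is not part of this chunk, so `check_login` and its exact-match rule are assumptions:

```python
import json

# Inline copy of users.json so the sketch is self-contained;
# the real app would load the file from disk instead.
USERS_JSON = '{"users": [{"username": "test", "password": "test"}]}'

def check_login(username: str, password: str) -> bool:
    # Assumed rule: exact match on both fields against any stored user.
    data = json.loads(USERS_JSON)
    return any(u["username"] == username and u["password"] == password
               for u in data["users"])

print(check_login("test", "test"))   # True
print(check_login("test", "wrong"))  # False
```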