Create app.py
app.py (ADDED)
@@ -0,0 +1,259 @@
import os
import threading
from flask import Flask, render_template, request, jsonify
from rss_processor import fetch_rss_feeds, process_and_store_articles, vector_db, download_from_hf_hub, upload_to_hf_hub
import logging
import time
from datetime import datetime

app = Flask(__name__)

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Global flags to track the background feed load
loading_complete = False
last_update_time = time.time()

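The rss_processor module imported above is not part of this commit. Purely as a sketch of the interface app.py relies on (every name below is inferred from the call sites in this file, not confirmed against the real module):

# rss_processor.py -- hypothetical stub matching how app.py uses the module.
# The real module presumably wraps feed fetching, a Chroma vector store, and
# huggingface_hub sync; the bodies here are placeholders only.

def fetch_rss_feeds():
    """Return a list of raw article entries pulled from the configured feeds."""
    return []

def process_and_store_articles(articles):
    """Embed the articles and upsert them into the vector store."""

def download_from_hf_hub():
    """Pull a persisted chroma_db/ directory down from the Hub."""

def upload_to_hf_hub():
    """Push the local chroma_db/ directory back to the Hub."""

class _VectorDB:
    """Stand-in for the store: app.py only calls .get() and .similarity_search()."""
    def get(self, include=None):
        return {"documents": [], "metadatas": []}

    def similarity_search(self, query, k=4):
        return []

vector_db = _VectorDB()
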
def load_feeds_in_background():
    global loading_complete, last_update_time
    try:
        logger.info("Starting background RSS feed fetch")
        articles = fetch_rss_feeds()
        logger.info(f"Fetched {len(articles)} articles")
        process_and_store_articles(articles)
        last_update_time = time.time()  # Update timestamp when new articles are added
        logger.info("Background feed processing complete")
        # Upload updated DB to Hugging Face Hub
        upload_to_hf_hub()
        loading_complete = True
    except Exception as e:
        logger.error(f"Error in background feed loading: {e}")
        loading_complete = True

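loading_complete and last_update_time are written from this daemon thread and read from the request handlers with no synchronization; CPython's GIL makes these single assignments safe in practice, but if the shared state ever grows beyond simple flags, a lock is the usual fix. A minimal sketch (the lock and helper are illustrative, not part of the app):

import threading

_state_lock = threading.Lock()

def _mark_complete():
    # Illustrative only: take the lock so the timestamp and flag
    # are always observed together by the request handlers.
    global loading_complete, last_update_time
    with _state_lock:
        last_update_time = time.time()
        loading_complete = True
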
@app.route('/')
def index():
    global loading_complete
    loading_complete = False  # Reset on each load

    # Ensure the Chroma DB is downloaded from the Hugging Face Hub on first load
    if not os.path.exists("chroma_db"):
        logger.info("Downloading Chroma DB from Hugging Face Hub...")
        download_from_hf_hub()

    # Start background feed loading
    threading.Thread(target=load_feeds_in_background, daemon=True).start()

    try:
        # Retrieve all articles from the Chroma DB
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            logger.info("No articles in DB yet")
            return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

        # Process and categorize articles, keeping only the 10 most recent per category, with strict deduplication
        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            # Use a robust key built from the trimmed fields to prevent duplicates
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                # Try to parse the published date; fall back to string sorting
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    # Fall back to a very old date for sorting if parsing fails
                    published = "1970-01-01T00:00:00"
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        # Sort by published date (handles both ISO strings and the fallback values)
        enriched_articles.sort(key=lambda x: x["published"], reverse=True)

        # Group by category and limit to the 10 most recent per category, with final deduplication
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            # Add only if not already in the category list (extra deduplication)
            key = f"{article['title']}|{article['link']}|{article['published']}"
            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                categorized_articles[cat].append(article)

        # Limit to the 10 most recent per category and sort again for safety
        for cat in categorized_articles:
            unique_articles = []
            seen_cat_keys = set()
            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
                key = f"{article['title']}|{article['link']}|{article['published']}"
                if key not in seen_cat_keys:
                    seen_cat_keys.add(key)
                    unique_articles.append(article)
            categorized_articles[cat] = unique_articles[:10]

        logger.info(f"Displaying articles: {sum(len(articles) for articles in categorized_articles.values())} total")
        return render_template("index.html",
                               categorized_articles=categorized_articles,
                               has_articles=True,
                               loading=True)
    except Exception as e:
        logger.error(f"Error retrieving articles: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=True)

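One caveat in the sort above: entries whose date could not be parsed keep the literal string "Unknown Date", which compares lexicographically greater than any ISO timestamp ('U' sorts after any digit), so with reverse=True they float to the top of every listing. A quick illustration:

dates = ["2025-01-15T08:00:00", "Unknown Date", "1970-01-01T00:00:00"]
print(sorted(dates, reverse=True))
# ['Unknown Date', '2025-01-15T08:00:00', '1970-01-01T00:00:00']
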
@app.route('/search', methods=['POST'])
def search():
    query = request.form.get('search')
    if not query:
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

    try:
        logger.info(f"Searching for: {query}")
        results = vector_db.similarity_search(query, k=10)
        enriched_articles = []
        seen_keys = set()
        for doc in results:
            meta = doc.metadata
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    # Note: unlike the other routes, the date is left as stored (no normalization)
                    "published": meta.get("published", "Unknown Date"),
                    "image": meta.get("image", "svg"),
                })

        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            categorized_articles.setdefault(cat, []).append(article)

        return render_template("index.html", categorized_articles=categorized_articles, has_articles=bool(enriched_articles), loading=False)
    except Exception as e:
        logger.error(f"Search error: {e}")
        return render_template("index.html", categorized_articles={}, has_articles=False, loading=False)

@app.route('/check_loading')
def check_loading():
    global loading_complete, last_update_time
    if loading_complete:
        return jsonify({"status": "complete", "last_update": last_update_time})
    return jsonify({"status": "loading"}), 202

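The two-status contract here (200 with "complete" vs. 202 with "loading") is easy to poll from any client. A sketch using requests, assuming the app is reachable on localhost:7860 (the URL, helper name, and timeout are all illustrative):

import time
import requests

def wait_for_feeds(base="http://localhost:7860", timeout=120):
    """Poll /check_loading until the background fetch reports complete."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        r = requests.get(f"{base}/check_loading")
        if r.status_code == 200 and r.json().get("status") == "complete":
            return r.json()["last_update"]
        time.sleep(2)  # 202 means the background thread is still working
    raise TimeoutError("feed load did not finish in time")
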
@app.route('/get_updates')
def get_updates():
    global last_update_time
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "last_update": last_update_time})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"  # Fallback to a very old date
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        categorized_articles = {}
        for article in enriched_articles:
            cat = article["category"]
            if cat not in categorized_articles:
                categorized_articles[cat] = []
            # Extra deduplication for category
            key = f"{article['title']}|{article['link']}|{article['published']}"
            if key not in [f"{a['title']}|{a['link']}|{a['published']}" for a in categorized_articles[cat]]:
                categorized_articles[cat].append(article)

        # Limit to the 10 most recent per category with final deduplication
        for cat in categorized_articles:
            unique_articles = []
            seen_cat_keys = set()
            for article in sorted(categorized_articles[cat], key=lambda x: x["published"], reverse=True):
                key = f"{article['title']}|{article['link']}|{article['published']}"
                if key not in seen_cat_keys:
                    seen_cat_keys.add(key)
                    unique_articles.append(article)
            categorized_articles[cat] = unique_articles[:10]

        return jsonify({"articles": categorized_articles, "last_update": last_update_time})
    except Exception as e:
        logger.error(f"Error fetching updates: {e}")
        return jsonify({"articles": {}, "last_update": last_update_time}), 500

@app.route('/get_all_articles/<category>')
def get_all_articles(category):
    try:
        all_docs = vector_db.get(include=['documents', 'metadatas'])
        if not all_docs.get('metadatas'):
            return jsonify({"articles": [], "category": category})

        enriched_articles = []
        seen_keys = set()
        for doc, meta in zip(all_docs['documents'], all_docs['metadatas']):
            if not meta or meta.get("category") != category:
                continue
            title = meta.get("title", "No Title").strip()
            link = meta.get("link", "").strip()
            published = meta.get("published", "Unknown Date").strip()
            key = f"{title}|{link}|{published}"
            if key not in seen_keys:
                seen_keys.add(key)
                try:
                    published = datetime.strptime(published, "%Y-%m-%d %H:%M:%S").isoformat() if "Unknown" not in published else published
                except (ValueError, TypeError):
                    published = "1970-01-01T00:00:00"  # Fallback to a very old date
                enriched_articles.append({
                    "title": title,
                    "link": link,
                    "description": meta.get("original_description", "No Description"),
                    "category": meta.get("category", "Uncategorized"),
                    "published": published,
                    "image": meta.get("image", "svg"),
                })

        enriched_articles.sort(key=lambda x: x["published"], reverse=True)
        return jsonify({"articles": enriched_articles, "category": category})
    except Exception as e:
        logger.error(f"Error fetching all articles for category {category}: {e}")
        return jsonify({"articles": [], "category": category}), 500

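The JSON routes can be smoke-tested without a browser using Flask's built-in test client. A sketch (assumes the file is saved as app.py and that rss_processor and its Chroma DB are importable; "Tech" is a hypothetical category name):

from app import app

with app.test_client() as client:
    print(client.get("/check_loading").status_code)  # 202 until the background load finishes
    print(client.get("/get_updates").get_json()["last_update"])
    print(client.get("/get_all_articles/Tech").get_json()["category"])  # -> "Tech"
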
if __name__ == "__main__":
    app.run(host="0.0.0.0", port=7860)  # 7860 is the port Hugging Face Spaces expects