Hugging Face Space (status: Sleeping) — commit "Update app.py"; file changed: app.py. Diff rendering follows.
|
@@ -42,6 +42,46 @@ app.add_middleware(
|
|
| 42 |
allow_headers=["*"],
|
| 43 |
)
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def get_site_map(url):
|
| 47 |
"""
|
|
@@ -55,7 +95,7 @@ def get_site_map(url):
|
|
| 55 |
return response.text
|
| 56 |
else:
|
| 57 |
return None
|
| 58 |
-
|
| 59 |
def parse_sitemap(sitemap):
|
| 60 |
"""
|
| 61 |
Parse the sitemap and extract the URLs.
|
|
@@ -78,7 +118,9 @@ def parse_sitemap(sitemap):
|
|
| 78 |
"news_image":news_image.text if news_image is not None else None,
|
| 79 |
"publisher":publisher.text if publisher is not None else None,
|
| 80 |
"last_mode":last_mode.text if last_mode is not None else None,
|
| 81 |
-
"loc":loc.text if loc is not None else None
|
|
|
|
|
|
|
| 82 |
}
|
| 83 |
print(url)
|
| 84 |
if loc is not None:
|
|
@@ -91,6 +133,11 @@ def parse_sitemap(sitemap):
|
|
| 91 |
def read_root():
|
| 92 |
return {"Me": "NewsHUB"}
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
@app.get("/news/thehindu")
|
| 95 |
def get_news():
|
| 96 |
get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
|
|
@@ -108,7 +155,34 @@ def get_news():
|
|
| 108 |
get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
|
| 109 |
parse_data = parse_sitemap(get_data)
|
| 110 |
return parse_data
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
class NewsUrl(BaseModel):
|
|
|
|
| 42 |
allow_headers=["*"],
|
| 43 |
)
|
| 44 |
|
| 45 |
+
# ----------------------
|
| 46 |
+
# Categories definitions
|
| 47 |
+
# ----------------------
|
| 48 |
+
# A simple, extensible set of categories and keyword heuristics to classify news titles.
|
| 49 |
+
CATEGORIES = [
|
| 50 |
+
"politics",
|
| 51 |
+
"business",
|
| 52 |
+
"technology",
|
| 53 |
+
"sports",
|
| 54 |
+
"entertainment",
|
| 55 |
+
"health",
|
| 56 |
+
"science",
|
| 57 |
+
"world",
|
| 58 |
+
"india",
|
| 59 |
+
"opinion",
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
CATEGORY_KEYWORDS = {
|
| 63 |
+
"politics": ["election", "minister", "parliament", "bjp", "congress", "policy", "senate", "government"],
|
| 64 |
+
"business": ["market", "stocks", "ipo", "economy", "trade", "merger", "acquisition", "startup", "funding"],
|
| 65 |
+
"technology": ["tech", "ai", "software", "app", "android", "ios", "google", "apple", "microsoft", "internet"],
|
| 66 |
+
"sports": ["match", "tournament", "cricket", "football", "soccer", "tennis", "olympic", "ipl", "world cup"],
|
| 67 |
+
"entertainment": ["movie", "film", "bollywood", "hollywood", "song", "music", "actor", "actress", "trailer"],
|
| 68 |
+
"health": ["health", "covid", "vaccine", "disease", "hospital", "fitness", "diet", "mental"],
|
| 69 |
+
"science": ["research", "study", "space", "nasa", "isro", "discovery", "quantum", "biology", "physics"],
|
| 70 |
+
"world": ["global", "world", "international", "united nations", "china", "us", "europe", "russia", "ukraine"],
|
| 71 |
+
"india": ["india", "indian", "delhi", "mumbai", "bengaluru", "karnataka", "maharashtra", "kerala"],
|
| 72 |
+
"opinion": ["opinion", "editorial", "op-ed", "analysis", "column"],
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
def categorize_title(title: str) -> str | None:
|
| 76 |
+
if not title:
|
| 77 |
+
return None
|
| 78 |
+
t = title.lower()
|
| 79 |
+
for cat, keywords in CATEGORY_KEYWORDS.items():
|
| 80 |
+
if any(k in t for k in keywords):
|
| 81 |
+
return cat
|
| 82 |
+
# Fallback: None (uncategorized)
|
| 83 |
+
return None
|
| 84 |
+
|
| 85 |
|
| 86 |
def get_site_map(url):
|
| 87 |
"""
|
|
|
|
| 95 |
return response.text
|
| 96 |
else:
|
| 97 |
return None
|
| 98 |
+
|
| 99 |
def parse_sitemap(sitemap):
|
| 100 |
"""
|
| 101 |
Parse the sitemap and extract the URLs.
|
|
|
|
| 118 |
"news_image":news_image.text if news_image is not None else None,
|
| 119 |
"publisher":publisher.text if publisher is not None else None,
|
| 120 |
"last_mode":last_mode.text if last_mode is not None else None,
|
| 121 |
+
"loc":loc.text if loc is not None else None,
|
| 122 |
+
# Attach a naive category based on title keywords
|
| 123 |
+
"category": categorize_title(news_title.text) if news_title is not None else None,
|
| 124 |
}
|
| 125 |
print(url)
|
| 126 |
if loc is not None:
|
|
|
|
| 133 |
def read_root():
|
| 134 |
return {"Me": "NewsHUB"}
|
| 135 |
|
| 136 |
+
@app.get("/categories")
def list_categories():
    """List every category name the keyword classifier can assign.

    Returns:
        dict: ``{"categories": [...]}`` built from the module-level
        CATEGORIES constant.
    """
    # Same payload shape as before; dict() keeps the CATEGORIES list shared.
    return dict(categories=CATEGORIES)
|
| 140 |
+
|
| 141 |
@app.get("/news/thehindu")
|
| 142 |
def get_news():
|
| 143 |
get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
|
|
|
|
| 155 |
get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
|
| 156 |
parse_data = parse_sitemap(get_data)
|
| 157 |
return parse_data
|
| 158 |
+
|
| 159 |
+
@app.get("/news/by-category/{category}")
def news_by_category(category: str):
    """
    Aggregate news from available sources and filter by category (keyword-based on title).
    Category must be one of /categories. Case-insensitive.
    """
    wanted = category.lower()
    # Guard clause: reject anything outside the supported category list.
    if wanted not in CATEGORIES:
        return {"error": "invalid_category", "message": f"Category must be one of {CATEGORIES}"}

    sitemap_urls = [
        'https://www.thehindu.com/sitemap/googlenews/all/all.xml',
        'https://economictimes.indiatimes.com/sitemap/today',
        'https://indianexpress.com/news-sitemap.xml',
    ]

    collected: list[dict] = []
    for sitemap_url in sitemap_urls:
        raw_xml = get_site_map(sitemap_url)
        # get_site_map returns None on fetch failure — best-effort: skip that source.
        if raw_xml:
            collected.extend(parse_sitemap(raw_xml))

    # Keep only items whose (possibly None) category matches, case-insensitively.
    matching = [
        entry for entry in collected
        if (entry.get("category") or "").lower() == wanted
    ]
    return {"category": wanted, "count": len(matching), "items": matching}
|
| 185 |
+
|
| 186 |
|
| 187 |
|
| 188 |
class NewsUrl(BaseModel):
|