Hugging Face Space (status: Sleeping) — commit "Update app.py"; file changed: app.py. Diff rendering follows.
|
@@ -42,6 +42,46 @@ app.add_middleware(
|
|
| 42 |
allow_headers=["*"],
|
| 43 |
)
|
| 44 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
|
| 46 |
def get_site_map(url):
|
| 47 |
"""
|
|
@@ -55,7 +95,7 @@ def get_site_map(url):
|
|
| 55 |
return response.text
|
| 56 |
else:
|
| 57 |
return None
|
| 58 |
-
|
| 59 |
def parse_sitemap(sitemap):
|
| 60 |
"""
|
| 61 |
Parse the sitemap and extract the URLs.
|
|
@@ -78,7 +118,9 @@ def parse_sitemap(sitemap):
|
|
| 78 |
"news_image":news_image.text if news_image is not None else None,
|
| 79 |
"publisher":publisher.text if publisher is not None else None,
|
| 80 |
"last_mode":last_mode.text if last_mode is not None else None,
|
| 81 |
-
"loc":loc.text if loc is not None else None
|
|
|
|
|
|
|
| 82 |
}
|
| 83 |
print(url)
|
| 84 |
if loc is not None:
|
|
@@ -91,6 +133,11 @@ def parse_sitemap(sitemap):
|
|
| 91 |
def read_root():
|
| 92 |
return {"Me": "NewsHUB"}
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
@app.get("/news/thehindu")
|
| 95 |
def get_news():
|
| 96 |
get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
|
|
@@ -108,7 +155,34 @@ def get_news():
|
|
| 108 |
get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
|
| 109 |
parse_data = parse_sitemap(get_data)
|
| 110 |
return parse_data
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
|
| 113 |
|
| 114 |
class NewsUrl(BaseModel):
|
|
|
|
| 42 |
allow_headers=["*"],
|
| 43 |
)
|
| 44 |
|
| 45 |
+
# ----------------------
|
| 46 |
+
# Categories definitions
|
| 47 |
+
# ----------------------
|
| 48 |
+
# A simple, extensible set of categories and keyword heuristics to classify news titles.
|
| 49 |
+
CATEGORIES = [
|
| 50 |
+
"politics",
|
| 51 |
+
"business",
|
| 52 |
+
"technology",
|
| 53 |
+
"sports",
|
| 54 |
+
"entertainment",
|
| 55 |
+
"health",
|
| 56 |
+
"science",
|
| 57 |
+
"world",
|
| 58 |
+
"india",
|
| 59 |
+
"opinion",
|
| 60 |
+
]
|
| 61 |
+
|
| 62 |
+
CATEGORY_KEYWORDS = {
|
| 63 |
+
"politics": ["election", "minister", "parliament", "bjp", "congress", "policy", "senate", "government"],
|
| 64 |
+
"business": ["market", "stocks", "ipo", "economy", "trade", "merger", "acquisition", "startup", "funding"],
|
| 65 |
+
"technology": ["tech", "ai", "software", "app", "android", "ios", "google", "apple", "microsoft", "internet"],
|
| 66 |
+
"sports": ["match", "tournament", "cricket", "football", "soccer", "tennis", "olympic", "ipl", "world cup"],
|
| 67 |
+
"entertainment": ["movie", "film", "bollywood", "hollywood", "song", "music", "actor", "actress", "trailer"],
|
| 68 |
+
"health": ["health", "covid", "vaccine", "disease", "hospital", "fitness", "diet", "mental"],
|
| 69 |
+
"science": ["research", "study", "space", "nasa", "isro", "discovery", "quantum", "biology", "physics"],
|
| 70 |
+
"world": ["global", "world", "international", "united nations", "china", "us", "europe", "russia", "ukraine"],
|
| 71 |
+
"india": ["india", "indian", "delhi", "mumbai", "bengaluru", "karnataka", "maharashtra", "kerala"],
|
| 72 |
+
"opinion": ["opinion", "editorial", "op-ed", "analysis", "column"],
|
| 73 |
+
}
|
| 74 |
+
|
| 75 |
+
def categorize_title(title: str) -> str | None:
|
| 76 |
+
if not title:
|
| 77 |
+
return None
|
| 78 |
+
t = title.lower()
|
| 79 |
+
for cat, keywords in CATEGORY_KEYWORDS.items():
|
| 80 |
+
if any(k in t for k in keywords):
|
| 81 |
+
return cat
|
| 82 |
+
# Fallback: None (uncategorized)
|
| 83 |
+
return None
|
| 84 |
+
|
| 85 |
|
| 86 |
def get_site_map(url):
|
| 87 |
"""
|
|
|
|
| 95 |
return response.text
|
| 96 |
else:
|
| 97 |
return None
|
| 98 |
+
|
| 99 |
def parse_sitemap(sitemap):
|
| 100 |
"""
|
| 101 |
Parse the sitemap and extract the URLs.
|
|
|
|
| 118 |
"news_image":news_image.text if news_image is not None else None,
|
| 119 |
"publisher":publisher.text if publisher is not None else None,
|
| 120 |
"last_mode":last_mode.text if last_mode is not None else None,
|
| 121 |
+
"loc":loc.text if loc is not None else None,
|
| 122 |
+
# Attach a naive category based on title keywords
|
| 123 |
+
"category": categorize_title(news_title.text) if news_title is not None else None,
|
| 124 |
}
|
| 125 |
print(url)
|
| 126 |
if loc is not None:
|
|
|
|
| 133 |
def read_root():
|
| 134 |
return {"Me": "NewsHUB"}
|
| 135 |
|
| 136 |
+
@app.get("/categories")
def list_categories():
    """List every category name the keyword classifier can assign.

    Returns:
        dict: ``{"categories": [...]}`` built from the module-level
        CATEGORIES constant.
    """
    # Same payload shape as before; dict() keeps the CATEGORIES list shared.
    return dict(categories=CATEGORIES)
|
| 140 |
+
|
| 141 |
@app.get("/news/thehindu")
|
| 142 |
def get_news():
|
| 143 |
get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
|
|
|
|
| 155 |
get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
|
| 156 |
parse_data = parse_sitemap(get_data)
|
| 157 |
return parse_data
|
| 158 |
+
|
| 159 |
+
@app.get("/news/by-category/{category}")
def news_by_category(category: str):
    """
    Aggregate news from available sources and filter by category (keyword-based on title).
    Category must be one of /categories. Case-insensitive.
    """
    wanted = category.lower()
    # Guard clause: reject anything outside the supported category list.
    if wanted not in CATEGORIES:
        return {"error": "invalid_category", "message": f"Category must be one of {CATEGORIES}"}

    sitemap_urls = [
        'https://www.thehindu.com/sitemap/googlenews/all/all.xml',
        'https://economictimes.indiatimes.com/sitemap/today',
        'https://indianexpress.com/news-sitemap.xml',
    ]

    collected: list[dict] = []
    for sitemap_url in sitemap_urls:
        raw_xml = get_site_map(sitemap_url)
        # get_site_map returns None on fetch failure — best-effort: skip that source.
        if raw_xml:
            collected.extend(parse_sitemap(raw_xml))

    # Keep only items whose (possibly None) category matches, case-insensitively.
    matching = [
        entry for entry in collected
        if (entry.get("category") or "").lower() == wanted
    ]
    return {"category": wanted, "count": len(matching), "items": matching}
|
| 185 |
+
|
| 186 |
|
| 187 |
|
| 188 |
class NewsUrl(BaseModel):
|