sharad31 committed on
Commit
dc2f44e
·
verified ·
1 Parent(s): c2082f1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -3
app.py CHANGED
@@ -42,6 +42,46 @@ app.add_middleware(
42
  allow_headers=["*"],
43
  )
44
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
 
46
  def get_site_map(url):
47
  """
@@ -55,7 +95,7 @@ def get_site_map(url):
55
  return response.text
56
  else:
57
  return None
58
-
59
  def parse_sitemap(sitemap):
60
  """
61
  Parse the sitemap and extract the URLs.
@@ -78,7 +118,9 @@ def parse_sitemap(sitemap):
78
  "news_image":news_image.text if news_image is not None else None,
79
  "publisher":publisher.text if publisher is not None else None,
80
  "last_mode":last_mode.text if last_mode is not None else None,
81
- "loc":loc.text if loc is not None else None
 
 
82
  }
83
  print(url)
84
  if loc is not None:
@@ -91,6 +133,11 @@ def parse_sitemap(sitemap):
91
  def read_root():
92
  return {"Me": "NewsHUB"}
93
 
 
 
 
 
 
94
  @app.get("/news/thehindu")
95
  def get_news():
96
  get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
@@ -108,7 +155,34 @@ def get_news():
108
  get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
109
  parse_data = parse_sitemap(get_data)
110
  return parse_data
111
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
 
113
 
114
  class NewsUrl(BaseModel):
 
42
  allow_headers=["*"],
43
  )
44
 
45
+ # ----------------------
46
+ # Categories definitions
47
+ # ----------------------
48
+ # A simple, extensible set of categories and keyword heuristics to classify news titles.
49
+ CATEGORIES = [
50
+ "politics",
51
+ "business",
52
+ "technology",
53
+ "sports",
54
+ "entertainment",
55
+ "health",
56
+ "science",
57
+ "world",
58
+ "india",
59
+ "opinion",
60
+ ]
61
+
62
+ CATEGORY_KEYWORDS = {
63
+ "politics": ["election", "minister", "parliament", "bjp", "congress", "policy", "senate", "government"],
64
+ "business": ["market", "stocks", "ipo", "economy", "trade", "merger", "acquisition", "startup", "funding"],
65
+ "technology": ["tech", "ai", "software", "app", "android", "ios", "google", "apple", "microsoft", "internet"],
66
+ "sports": ["match", "tournament", "cricket", "football", "soccer", "tennis", "olympic", "ipl", "world cup"],
67
+ "entertainment": ["movie", "film", "bollywood", "hollywood", "song", "music", "actor", "actress", "trailer"],
68
+ "health": ["health", "covid", "vaccine", "disease", "hospital", "fitness", "diet", "mental"],
69
+ "science": ["research", "study", "space", "nasa", "isro", "discovery", "quantum", "biology", "physics"],
70
+ "world": ["global", "world", "international", "united nations", "china", "us", "europe", "russia", "ukraine"],
71
+ "india": ["india", "indian", "delhi", "mumbai", "bengaluru", "karnataka", "maharashtra", "kerala"],
72
+ "opinion": ["opinion", "editorial", "op-ed", "analysis", "column"],
73
+ }
74
+
75
+ def categorize_title(title: str) -> str | None:
76
+ if not title:
77
+ return None
78
+ t = title.lower()
79
+ for cat, keywords in CATEGORY_KEYWORDS.items():
80
+ if any(k in t for k in keywords):
81
+ return cat
82
+ # Fallback: None (uncategorized)
83
+ return None
84
+
85
 
86
  def get_site_map(url):
87
  """
 
95
  return response.text
96
  else:
97
  return None
98
+
99
  def parse_sitemap(sitemap):
100
  """
101
  Parse the sitemap and extract the URLs.
 
118
  "news_image":news_image.text if news_image is not None else None,
119
  "publisher":publisher.text if publisher is not None else None,
120
  "last_mode":last_mode.text if last_mode is not None else None,
121
+ "loc":loc.text if loc is not None else None,
122
+ # Attach a naive category based on title keywords
123
+ "category": categorize_title(news_title.text) if news_title is not None else None,
124
  }
125
  print(url)
126
  if loc is not None:
 
133
  def read_root():
134
  return {"Me": "NewsHUB"}
135
 
136
@app.get("/categories")
def list_categories():
    """List every category name that news items can be filed under."""
    payload = {"categories": CATEGORIES}
    return payload
140
+
141
  @app.get("/news/thehindu")
142
  def get_news():
143
  get_data = get_site_map('https://www.thehindu.com/sitemap/googlenews/all/all.xml')
 
155
  get_data = get_site_map('https://indianexpress.com/news-sitemap.xml')
156
  parse_data = parse_sitemap(get_data)
157
  return parse_data
158
+
159
@app.get("/news/by-category/{category}")
def news_by_category(category: str):
    """
    Collect news from every configured sitemap and keep only one category.

    The requested category is lower-cased before comparison, so the lookup is
    case-insensitive. It must be one of the names in CATEGORIES; otherwise an
    error payload is returned instead of results. Sources whose sitemap
    fetch yields nothing are skipped.
    """
    wanted = category.lower()
    if wanted not in CATEGORIES:
        return {"error": "invalid_category", "message": f"Category must be one of {CATEGORIES}"}

    feed_urls = (
        'https://www.thehindu.com/sitemap/googlenews/all/all.xml',
        'https://economictimes.indiatimes.com/sitemap/today',
        'https://indianexpress.com/news-sitemap.xml',
    )

    # Gather everything first, then filter, so fetching and parsing happen
    # for all sources before any item is inspected.
    collected: list[dict] = []
    for feed_url in feed_urls:
        raw_xml = get_site_map(feed_url)
        if raw_xml:
            collected += parse_sitemap(raw_xml)

    matching = []
    for entry in collected:
        label = entry.get("category") or ""
        if label.lower() == wanted:
            matching.append(entry)

    return {"category": wanted, "count": len(matching), "items": matching}
185
+
186
 
187
 
188
  class NewsUrl(BaseModel):