nivakaran committed · ff3017c · verified · parent: e78fedb

Deploy from GitHub Actions

frontend/app/components/dashboard/TrendingTopics.tsx CHANGED
@@ -5,12 +5,20 @@
 
 import React, { useEffect, useState } from 'react';
 
+interface RelatedFeed {
+  summary: string;
+  domain: string;
+  timestamp: string;
+  source: string;
+}
+
 interface TrendingTopic {
   topic: string;
   momentum: number;
   is_spike: boolean;
   count_current_hour?: number;
   avg_count?: number;
+  related_feeds?: RelatedFeed[];
 }
 
 interface TrendingData {
@@ -44,7 +52,6 @@ export const TrendingTopics: React.FC = () => {
     };
 
     fetchTrending();
-    // Refresh every 30 seconds
    const interval = setInterval(fetchTrending, 30000);
    return () => clearInterval(interval);
  }, []);
@@ -159,23 +166,37 @@ export const TrendingTopics: React.FC = () => {
           data.trending_topics.slice(0, 8).map((topic, idx) => (
             <div
               key={idx}
-              className={`flex items-center justify-between p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
+              className={`flex flex-col p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
             >
-              <div className="flex items-center gap-3">
-                <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
-                <div>
-                  <p className="font-semibold text-white capitalize">{topic.topic}</p>
-                  <p className="text-xs text-gray-400">
-                    {topic.is_spike ? '🔥 Spiking' : 'Trending'}
-                  </p>
-                </div>
-              </div>
-              <div className="text-right">
-                <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
-                  {topic.momentum.toFixed(0)}x
-                </p>
-                <p className="text-xs text-gray-500">momentum</p>
-              </div>
+              <div className="flex items-center justify-between w-full">
+                <div className="flex items-center gap-3">
+                  <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
+                  <div>
+                    <p className="font-semibold text-white capitalize">{topic.topic}</p>
+                    <p className="text-xs text-gray-400">
+                      {topic.is_spike ? '🔥 Spiking' : 'Trending'}
+                    </p>
+                  </div>
+                </div>
+                <div className="text-right">
+                  <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
+                    {topic.momentum.toFixed(0)}x
+                  </p>
+                  <p className="text-xs text-gray-500">momentum</p>
+                </div>
+              </div>
+
+              {/* Related feeds context */}
+              {topic.related_feeds && topic.related_feeds.length > 0 && (
+                <div className="mt-3 pl-3 border-l-2 border-gray-600/30 space-y-2">
+                  {topic.related_feeds.map((feed, fIdx) => (
+                    <div key={fIdx} className="text-xs text-gray-300/80 leading-relaxed">
+                      <span className="text-gray-500 font-medium text-[10px] uppercase tracking-wider mr-2">[{feed.domain}]</span>
+                      {feed.summary.length > 100 ? feed.summary.substring(0, 100) + '...' : feed.summary}
+                    </div>
+                  ))}
+                </div>
+              )}
             </div>
           ))
         )}
inspect_chroma.py ADDED
@@ -0,0 +1,61 @@
+
+import logging
+import sys
+from pathlib import Path
+from collections import Counter
+
+# Setup path
+sys.path.append(str(Path.cwd()))
+
+from src.storage.config import config
+from src.storage.chromadb_store import ChromaDBStore
+
+# Mute logging
+logging.basicConfig(level=logging.ERROR)
+
+def inspect():
+    print("Connecting to ChromaDB...")
+    store = ChromaDBStore()
+
+    if not store.collection:
+        print("Could not connect to collection.")
+        return
+
+    count = store.collection.count()
+    print(f"Total documents: {count}")
+
+    if count == 0:
+        return
+
+    # Fetch all metadata in one call: ChromaDB's get() without ids
+    # returns everything. At the ~2,000 documents this store holds,
+    # that is fine; larger collections should page with limit/offset.
+
+    print("Fetching metadata...")
+    data = store.collection.get(include=["metadatas"])
+    metadatas = data["metadatas"]
+
+    domains = Counter()
+    sources = Counter()
+    impacts = Counter()
+
+    for meta in metadatas:
+        if not meta: continue
+        domains[meta.get("domain", "unknown")] += 1
+        sources[meta.get("platform", "unknown")] += 1
+        impacts[meta.get("impact_type", "unknown")] += 1
+
+    print("\n--- Domain Distribution ---")
+    for d, c in domains.most_common():
+        print(f"{d}: {c}")
+
+    print("\n--- Source/Platform Distribution ---")
+    for s, c in sources.most_common():
+        print(f"{s}: {c}")
+
+    print("\n--- Impact Type Distribution ---")
+    for i, c in impacts.most_common():
+        print(f"{i}: {c}")
+
+if __name__ == "__main__":
+    inspect()
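
Note: the script pulls every metadata record in a single get() call, which is fine at the ~2,000-document scale it targets. For larger collections a paged variant is safer; a minimal sketch, assuming Chroma's standard limit/offset parameters on collection.get():

def iter_metadatas(collection, batch_size: int = 500):
    # Hypothetical paged variant (not part of this commit):
    # fetch metadata in fixed-size batches instead of one call.
    offset = 0
    while True:
        batch = collection.get(include=["metadatas"], limit=batch_size, offset=offset)
        metadatas = batch["metadatas"]
        if not metadatas:
            break
        yield from metadatas
        offset += len(metadatas)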
main.py CHANGED
@@ -1113,10 +1113,25 @@ def get_trending_topics(limit: int = 10):
     """
     try:
         from src.utils.trending_detector import get_trending_now, get_spikes
-
+        # Use the global storage_manager instance defined earlier in main.py;
+        # route functions in this module can see it directly, so no local
+        # import is needed. This assumes storage_manager was initialized at
+        # startup, before this route is first called.
+
         trending = get_trending_now(limit=limit)
         spikes = get_spikes()
 
+        # Enrich the top 5 trending topics with related feeds
+        for topic in trending[:5]:
+            keyword = topic["topic"]
+            # Limit to 2 feeds per topic to keep the payload small
+            try:
+                related = storage_manager.search_feeds(keyword, limit=2)
+                topic["related_feeds"] = related
+            except Exception as e:
+                logger.warning(f"Error searching feeds for topic {keyword}: {e}")
+                topic["related_feeds"] = []
+
         return {
             "status": "success",
             "trending_topics": trending,
src/rag.py CHANGED
@@ -375,10 +375,43 @@ class RogerRAG:
         search_question = self._reformulate_question(question)
 
         # ChromaDB semantic search
-        docs = self.retriever.search(
-            search_question, n_results=5, domain_filter=domain_filter
+        # Fetch a larger candidate pool (20 results) so there is room to
+        # filter for domain diversity before building the final context.
+        raw_docs = self.retriever.search(
+            search_question, n_results=20, domain_filter=domain_filter
         )
 
+        # DIVERSITY RERANKING
+        # Ensure the context is not dominated by one source (e.g. 5 gazettes);
+        # aim for a mix of domains where possible.
+        unique_domains = {}
+        diverse_docs = []
+
+        # Priority domains for situational awareness
+        priority_domains = {'intelligence', 'social', 'economical', 'meteorological'}
+
+        for doc in raw_docs:
+            domain = doc.get("domain", "unknown")
+            platform = doc.get("metadata", {}).get("platform", "unknown")
+
+            # Key to track redundancy: domain + platform
+            key = f"{domain}_{platform}"
+
+            # Allow at most 2 docs per domain/platform combo,
+            # unless it's a priority domain with high similarity (>0.4)
+            limit = 2
+            if domain in priority_domains and doc['similarity'] > 0.4:
+                limit = 3
+
+            if unique_domains.get(key, 0) < limit:
+                diverse_docs.append(doc)
+                unique_domains[key] = unique_domains.get(key, 0) + 1
+
+            if len(diverse_docs) >= 7:  # Stop after collecting 7 diverse docs
+                break
+
+        docs = diverse_docs
+
         if not docs:
             return {
                 "answer": "I couldn't find any relevant intelligence data to answer your question.",
src/storage/sqlite_cache.py CHANGED
@@ -151,6 +151,37 @@ class SQLiteCache:
         conn.close()
         return results
 
+    def search_entries(self, query: str, limit: int = 10) -> list:
+        """
+        Search for entries containing specific text.
+        Args:
+            query: Text to search for (case-insensitive LIKE)
+            limit: Max results
+        """
+        if not query or len(query) < 2:
+            return []
+
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.execute(
+            "SELECT content_hash, first_seen, last_seen, event_id, summary_preview FROM seen_hashes WHERE summary_preview LIKE ? ORDER BY last_seen DESC LIMIT ?",
+            (f"%{query}%", limit),
+        )
+
+        results = []
+        for row in cursor.fetchall():
+            results.append(
+                {
+                    "content_hash": row[0],
+                    "first_seen": row[1],
+                    "last_seen": row[2],
+                    "event_id": row[3],
+                    "summary_preview": row[4],
+                }
+            )
+
+        conn.close()
+        return results
+
     def get_entries_since(self, timestamp: str) -> list:
         """
         Get entries added/updated after timestamp.
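
One caveat: % and _ inside the user's query act as LIKE wildcards, so a query like "100%" matches more than intended. If literal matching is wanted, a hedged sketch of wildcard escaping (not part of this commit):

def _escape_like(term: str) -> str:
    # Escape LIKE metacharacters so user input matches literally.
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

# The SELECT would then add an ESCAPE clause:
#   ... WHERE summary_preview LIKE ? ESCAPE '\'
# with the parameter built as (f"%{_escape_like(query)}%", limit)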
src/storage/storage_manager.py CHANGED
@@ -393,6 +393,53 @@ class StorageManager:
             logger.error(f"[FEED_RETRIEVAL] Error: {e}")
             return []
 
+        return feeds
+
+    def search_feeds(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search feeds by keyword and return enriched results.
+        """
+        try:
+            entries = self.sqlite_cache.search_entries(query, limit=limit)
+            feeds = []
+
+            for entry in entries:
+                event_id = entry.get("event_id")
+                if not event_id:
+                    continue
+
+                try:
+                    # Try to get metadata from Chroma (optional enrichment)
+                    chroma_data = self.chromadb.collection.get(ids=[event_id])
+                    metadata = {}
+                    if chroma_data and chroma_data["metadatas"]:
+                        metadata = chroma_data["metadatas"][0]
+
+                    feeds.append({
+                        "event_id": event_id,
+                        "summary": entry.get("summary_preview", ""),
+                        "domain": metadata.get("domain", "unknown"),
+                        "severity": metadata.get("severity", "medium"),
+                        "timestamp": metadata.get("timestamp", entry.get("last_seen")),
+                        "source": metadata.get("source", "feed")
+                    })
+                except Exception:
+                    # Fallback if the Chroma lookup fails
+                    feeds.append({
+                        "event_id": event_id,
+                        "summary": entry.get("summary_preview", ""),
+                        "domain": "unknown",
+                        "severity": "medium",
+                        "timestamp": entry.get("last_seen"),
+                        "source": "feed"
+                    })
+
+            return feeds
+
+        except Exception as e:
+            logger.error(f"[FEED_SEARCH] Error searching for '{query}': {e}")
+            return []
+
     def get_feeds_since(self, timestamp: datetime) -> List[Dict[str, Any]]:
         """
         Get all feeds added after given timestamp.
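
A quick smoke test of the new method, assuming a no-argument StorageManager constructor and already-populated caches:

# Hypothetical usage (not part of this commit):
manager = StorageManager()
for feed in manager.search_feeds("dengue", limit=2):
    print(feed["domain"], "-", feed["summary"][:80])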
src/utils/trending_detector.py CHANGED
@@ -70,12 +70,53 @@ TRENDING_STOPWORDS = {
     "week",
     "month",
     "year",
-    # Generic actions
+    "hour",
+    "minute",
+    "second",
+    "time",
+    "date",
+    # Days
+    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
+    # Months
+    "january", "february", "march", "april", "may", "june",
+    "july", "august", "september", "october", "november", "december",
+    # Generic actions/descriptions
     "said",
     "says",
     "told",
     "according",
     "sources",
+    "media",
+    "press",
+    "release",
+    "statement",
+    "general",
+    "public",
+    "national",
+    "international",
+    "local",
+    "central",
+    "department",
+    "division",
+    "authority",
+    "board",
+    "committee",
+    "director",
+    "secretary",
+    "commission",
+    "report",
+    "reports",
+    "reported",
+    # Location generic
+    "district",
+    "province",
+    "area",
+    "region",
+    "island",
+    "nation",
+    "country",
+    "state",
+    "western", "eastern", "southern", "northern",
 }
 
 
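
The expanded stopword set only takes effect if candidate tokens are lowercased before the membership test, since every entry is lowercase. An illustrative filtering step (the detector's actual tokenizer is not shown in this diff):

def candidate_terms(text: str):
    # Assumes simple whitespace tokenization; punctuation is stripped.
    for token in text.lower().split():
        word = token.strip(".,;:!?\"'()[]")
        if len(word) > 2 and word not in TRENDING_STOPWORDS:
            yield word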
src/utils/utils.py CHANGED
@@ -1472,6 +1472,7 @@ def tool_health_alerts() -> Dict[str, Any]:
     Get health alerts and disease outbreak information for Sri Lanka.
 
     Includes dengue case counts, epidemic alerts, and health advisories.
+    Filters out navigation text (circulars, menus) for cleaner alerts.
 
     Returns:
         Dict with health alerts, disease data, and notifications
@@ -1507,59 +1508,92 @@ def tool_health_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.health.gov.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-            page_text = soup.get_text(separator="\n", strip=True).lower()
-
-            # Check for outbreak keywords
-            outbreak_keywords = [
-                "outbreak",
-                "epidemic",
-                "alert",
-                "warning",
-                "emergency",
-            ]
-            for kw in outbreak_keywords:
-                if kw in page_text:
-                    # Try to extract the context
-                    idx = page_text.find(kw)
-                    context = page_text[max(0, idx - 50) : idx + 100]
-                    if len(context) > 20:
-                        result["alerts"].append(
-                            {
-                                "type": "health_notice",
-                                "text": context.strip()[:150],
-                                "severity": (
-                                    "medium" if kw in ["alert", "warning"] else "low"
-                                ),
-                            }
-                        )
-                        break
+
+            # 1. Clean up the DOM - drop navigation, footers, and scripts that add keyword noise
+            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe"]):
+                trash.decompose()
+
+            # Also remove specific menu containers if identifiable
+            for menu in soup.select(".menu, .navigation, #main-menu, .top-bar"):
+                menu.decompose()
+
+            # 2. Look for explicit alerts first (marquees, alert banners)
+            explicit_alerts = []
+
+            # Check marquees (common on govt sites)
+            for marquee in soup.find_all("marquee"):
+                text = marquee.get_text(strip=True)
+                if text and len(text) > 20 and "welcome" not in text.lower():
+                    explicit_alerts.append(text)
+
+            # Check alert divs
+            for alert_div in soup.select(".alert, .notice, .warning, .news-ticker"):
+                text = alert_div.get_text(strip=True)
+                if text and len(text) > 20:
+                    explicit_alerts.append(text)
+
+            # Add explicit alerts, filtering "circular" noise
+            # (document listings, not public health alerts)
+            for alert_text in explicit_alerts[:3]:  # Limit to 3
+                if "circular" not in alert_text.lower():
+                    result["alerts"].append({
+                        "type": "health_notice",
+                        "text": alert_text[:200],  # Truncate clean text
+                        "severity": "medium"
+                    })
+
+            # Extract text from the main content area only
+            # (also needed by the dengue check below)
+            main_content = soup.select_one("main, #content, .container, body") or soup.body
+            page_text = main_content.get_text(separator=" ", strip=True).lower()
+
+            # 3. If no explicit alerts were found, fall back to a keyword search
+            if not result["alerts"]:
+                outbreak_keywords = [
+                    "dengue outbreak",
+                    "epidemic alert",
+                    "health emergency",
+                    "spread of disease",
+                    "influenza warning"
+                ]
+
+                for kw in outbreak_keywords:
+                    if kw in page_text:
+                        idx = page_text.find(kw)
+                        # Extract sentence-like context and collapse whitespace
+                        context = page_text[max(0, idx - 20) : idx + 150]
+                        context = " ".join(context.split())
+
+                        if len(context) > 20 and "circular" not in context:
+                            result["alerts"].append({
+                                "type": "health_notice",
+                                "text": f"...{context}...",
+                                "severity": "medium"
+                            })
+                        break
 
-            # Check for dengue data
+            # 4. Check for dengue stats specifically
             dengue_match = re.search(r"dengue[:\s]*(\d{1,5})\s*(?:cases?)?", page_text)
             if dengue_match:
                 try:
                     result["dengue"]["weekly_cases"] = int(dengue_match.group(1))
+                    logger.info(f"[HEALTH] Found dengue cases: {result['dengue']['weekly_cases']}")
                 except ValueError:
                     pass
 
-            logger.info(
-                f"[HEALTH] Fetched - Dengue cases: {result['dengue']['weekly_cases']}"
-            )
-
-            # Add seasonal health advisory
-            current_month = utc_now().month
-            if current_month in [5, 6, 10, 11]:  # Monsoon = mosquito season
-                result["advisories"].append(
-                    {
-                        "type": "seasonal",
-                        "text": "Monsoon season: Increased dengue risk. Remove stagnant water around homes.",
-                        "severity": "medium",
-                    }
-                )
-
     except Exception as e:
         logger.warning(f"[HEALTH] Scraping error: {e}")
-        result["error"] = str(e)
+        # Don't fail completely; fall through and return the baseline result
+
+    # Fallback: if still no alerts, add a seasonal advisory
+    if not result["alerts"]:
+        current_month = utc_now().month
+        if current_month in [5, 6, 10, 11, 12]:  # Monsoon = mosquito season
+            result["advisories"].append({
+                "type": "seasonal",
+                "text": "Mosquito Control: Remove stagnant water to prevent dengue breeding.",
+                "severity": "medium",
+            })
 
     # Update cache
     _health_cache = result
@@ -1834,47 +1869,80 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.waterboard.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-            page_text = soup.get_text(separator="\n", strip=True).lower()
-
-            # Check for disruption keywords
-            disruption_keywords = [
-                "disruption",
-                "interruption",
-                "cut off",
-                "maintenance",
-                "repair",
-            ]
-            for kw in disruption_keywords:
-                if kw in page_text:
-                    result["status"] = "disruptions_reported"
-                    idx = page_text.find(kw)
-                    context = page_text[max(0, idx - 30) : idx + 120]
-
-                    # Try to extract area name
-                    area_patterns = [
-                        r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura)",
-                        r"(nugegoda|dehiwala|mount lavinia|moratuwa|maharagama)",
-                    ]
-                    area = "Multiple areas"
-                    for pattern in area_patterns:
-                        match = re.search(pattern, context, re.I)
-                        if match:
-                            area = match.group(1).title()
-                            break
-
-                    result["active_disruptions"].append(
-                        {
-                            "area": area,
-                            "type": kw,
-                            "details": context.strip()[:150],
-                            "severity": "medium",
-                        }
-                    )
-                    break
-
-            logger.info(
-                f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}"
-            )
+
+            # 1. Clean the DOM - remove typically noisy elements
+            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe", "form"]):
+                trash.decompose()
+
+            # Remove menu containers explicitly
+            for menu in soup.select(".menu, .navigation, #main-menu, .top-bar, .service-block"):
+                menu.decompose()
+
+            # 2. Look for explicit alerts (marquees are common on SL govt sites)
+            alerts_found = []
+
+            # Check marquees
+            for marquee in soup.find_all("marquee"):
+                text = marquee.get_text(separator=" ", strip=True)
+                if len(text) > 10:
+                    alerts_found.append({"text": text, "source": "ticker"})
+
+            # Check alert classes
+            for alert in soup.select(".alert, .notice, .warning, .news-ticker"):
+                text = alert.get_text(separator=" ", strip=True)
+                if len(text) > 10:
+                    alerts_found.append({"text": text, "source": "alert_box"})
+
+            # 3. If no explicit alerts, search body text with stricter validation
+            if not alerts_found:
+                main_content = soup.select_one("main, #content, .container, body") or soup.body
+                if main_content:
+                    # Scan mainly paragraph-sized text blocks
+                    for p in main_content.find_all(["p", "div", "span"]):
+                        text = p.get_text(strip=True)
+                        if len(text) < 20 or len(text) > 300:  # Ignore too-short/too-long blocks
+                            continue
+
+                        text_lower = text.lower()
+
+                        # Must have explicit "water" context AND a disruption keyword
+                        has_water = any(w in text_lower for w in ["water supply", "water cut", "nwsdb", "water board"])
+                        has_issue = any(w in text_lower for w in ["interruption", "disruption", "suspended", "stopped", "low pressure"])
+
+                        # Stopwords that indicate this is NOT an alert (slogans, payment info, etc.)
+                        is_garbage = any(w in text_lower for w in ["benefits", "payment", "service without", "bill", "vision", "mission"])
+
+                        if has_water and has_issue and not is_garbage:
+                            alerts_found.append({"text": text, "source": "content_match"})
+
+            # Process found alerts
+            for item in alerts_found:
+                text = item["text"]
+                text_lower = text.lower()
+
+                # Double-check garbage filtering
+                if any(w in text_lower for w in ["benefits", "payment", "check out", "click here"]):
+                    continue
+
+                result["status"] = "disruptions_reported"
+
+                # Extract area
+                area = "Multiple areas"
+                # Common major areas regex
+                area_match = re.search(r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)", text_lower, re.I)
+                if area_match:
+                    area = area_match.group(1).title()
+
+                # Deduplicate on the details text
+                if not any(d["details"] == text for d in result["active_disruptions"]):
+                    result["active_disruptions"].append({
+                        "area": area,
+                        "type": "Water Disruption",
+                        "details": text[:200] + ("..." if len(text) > 200 else ""),
+                        "severity": "medium"
+                    })
+
+            logger.info(f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}")
 
         # If no disruptions found via scraping, report normal
         if not result["active_disruptions"]:
@@ -1883,8 +1951,8 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
 
     except Exception as e:
         logger.warning(f"[WATER] Scraping error: {e}")
+        # Don't overwrite the default (valid) return structure; just attach the error
         result["error"] = str(e)
-        result["status"] = "unknown"
 
     # Update cache
     _water_cache = result
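
Both scrapers now follow the same shape: strip structural noise from the DOM, prefer explicit alert elements, and only then fall back to keyword search. The pattern in isolation, runnable against static HTML with BeautifulSoup alone:

from bs4 import BeautifulSoup

html = (
    "<html><body>"
    "<nav>Home | Circulars | Contact</nav>"
    "<marquee>Water supply interruption in Colombo due to maintenance.</marquee>"
    "<p>Welcome to the National Water Supply and Drainage Board.</p>"
    "</body></html>"
)

soup = BeautifulSoup(html, "html.parser")

# 1. Drop elements that only contribute keyword noise.
for trash in soup.find_all(["nav", "header", "footer", "script", "style"]):
    trash.decompose()

# 2. Prefer explicit alert carriers such as marquees.
alerts = [m.get_text(strip=True) for m in soup.find_all("marquee")]
print(alerts)  # ['Water supply interruption in Colombo due to maintenance.']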