Deploy from GitHub Actions
- src/rag.py +11 -11
- src/storage/storage_manager.py +24 -18
- src/utils/trending_detector.py +24 -4
- src/utils/utils.py +128 -59
src/rag.py
CHANGED

@@ -382,34 +382,34 @@ class RogerRAG:
         )
 
         # DIVERSITY RERANKING
+        # Ensure we don't just show 5 gazettes.
         # We want a mix of domains if possible.
         unique_domains = {}
         diverse_docs = []
+
         # Priority domains for situational awareness
+        priority_domains = {"intelligence", "social", "economical", "meteorological"}
+
         for doc in raw_docs:
             domain = doc.get("domain", "unknown")
             platform = doc.get("metadata", {}).get("platform", "unknown")
+
             # Key to track redundancy: domain + platform
             key = f"{domain}_{platform}"
+
+            # Allow max 2 docs per domain/platform combo,
             # UNLESS it's a priority domain with high similarity (>0.4)
             limit = 2
+            if domain in priority_domains and doc["similarity"] > 0.4:
                 limit = 3
+
             if unique_domains.get(key, 0) < limit:
                 diverse_docs.append(doc)
                 unique_domains[key] = unique_domains.get(key, 0) + 1
+
             if len(diverse_docs) >= 7:  # Stop after getting 7 diverse docs
                 break
+
         docs = diverse_docs
 
         if not docs:
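The reranking above implicitly assumes raw_docs arrives sorted by similarity, best first, so the per-key caps keep the strongest documents. A minimal standalone sketch of the same capping logic; the diversify name and its parameters are illustrative, not from the commit:

# Sketch only: mirrors the diversity-reranking loop in the hunk above.
def diversify(raw_docs, priority_domains, max_results=7):
    """Keep at most 2 docs per domain/platform pair (3 for high-similarity priority docs)."""
    seen_counts = {}
    diverse_docs = []
    for doc in raw_docs:  # assumed sorted by similarity, descending
        domain = doc.get("domain", "unknown")
        platform = doc.get("metadata", {}).get("platform", "unknown")
        key = f"{domain}_{platform}"
        limit = 3 if domain in priority_domains and doc["similarity"] > 0.4 else 2
        if seen_counts.get(key, 0) < limit:
            diverse_docs.append(doc)
            seen_counts[key] = seen_counts.get(key, 0) + 1
        if len(diverse_docs) >= max_results:
            break
    return diverse_docs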
src/storage/storage_manager.py
CHANGED

@@ -402,7 +402,7 @@ class StorageManager:
         try:
             entries = self.sqlite_cache.search_entries(query, limit=limit)
             feeds = []
+
             for entry in entries:
                 event_id = entry.get("event_id")
                 if not event_id:

@@ -414,25 +414,31 @@ class StorageManager:
                     metadata = {}
                     if chroma_data and chroma_data["metadatas"]:
                         metadata = chroma_data["metadatas"][0]
+
+                    feeds.append(
+                        {
+                            "event_id": event_id,
+                            "summary": entry.get("summary_preview", ""),
+                            "domain": metadata.get("domain", "unknown"),
+                            "severity": metadata.get("severity", "medium"),
+                            "timestamp": metadata.get(
+                                "timestamp", entry.get("last_seen")
+                            ),
+                            "source": metadata.get("source", "feed"),
+                        }
+                    )
                 except Exception:
                     # Fallback if chroma fails
+                    feeds.append(
+                        {
+                            "event_id": event_id,
+                            "summary": entry.get("summary_preview", ""),
+                            "domain": "unknown",
+                            "severity": "medium",
+                            "timestamp": entry.get("last_seen"),
+                        }
+                    )
+
             return feeds
 
         except Exception as e:
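The change wraps the Chroma lookup in a per-entry try/except, so one failed metadata fetch degrades that feed to SQLite-only defaults instead of aborting the whole search. A reduced sketch of the pattern; build_feed and the fetch_metadata callable are stand-ins, not the codebase's API:

# Sketch, assuming fetch_metadata raises on vector-store failure.
def build_feed(entry, fetch_metadata):
    feed = {
        "event_id": entry["event_id"],
        "summary": entry.get("summary_preview", ""),
    }
    try:
        metadata = fetch_metadata(entry["event_id"]) or {}
        feed["domain"] = metadata.get("domain", "unknown")
        feed["severity"] = metadata.get("severity", "medium")
        feed["timestamp"] = metadata.get("timestamp", entry.get("last_seen"))
        feed["source"] = metadata.get("source", "feed")
    except Exception:
        # Same dict shape with conservative defaults, so callers never see a partial feed.
        feed["domain"] = "unknown"
        feed["severity"] = "medium"
        feed["timestamp"] = entry.get("last_seen")
    return feed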
src/utils/trending_detector.py
CHANGED

@@ -76,10 +76,26 @@ TRENDING_STOPWORDS = {
     "time",
     "date",
     # Days
+    "monday",
+    "tuesday",
+    "wednesday",
+    "thursday",
+    "friday",
+    "saturday",
+    "sunday",
     # Months
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
     # Generic actions/descriptions
     "said",
     "says",

@@ -116,7 +132,11 @@ TRENDING_STOPWORDS = {
     "nation",
     "country",
     "state",
+    "western",
+    "eastern",
+    "southern",
+    "northern",
+    "central",
 }
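The new entries extend TRENDING_STOPWORDS with calendar words and compass directions, which otherwise dominate term counts in news text. Roughly how such a set gets consumed; the tokenizer and length cutoff here are assumptions, not taken from trending_detector.py:

import re
from collections import Counter

# Illustrative consumer of a stopword set like TRENDING_STOPWORDS.
def trending_terms(headlines, stopwords, top_n=10):
    counts = Counter()
    for headline in headlines:
        for token in re.findall(r"[a-z]+", headline.lower()):
            # Drop short tokens plus calendar/direction noise words.
            if len(token) > 3 and token not in stopwords:
                counts[token] += 1
    return counts.most_common(top_n)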
src/utils/utils.py
CHANGED

@@ -1508,24 +1508,26 @@ def tool_health_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.health.gov.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
+
             # 1. Clean up DOM - Remove navigation, footers, scripts that contain keyword noise
+            for trash in soup.find_all(
+                ["nav", "header", "footer", "script", "style", "noscript", "iframe"]
+            ):
                 trash.decompose()
+
             # Also remove specific menu containers if identifiable
             for menu in soup.select(".menu, .navigation, #main-menu, .top-bar"):
                 menu.decompose()
 
             # 2. Look for explicit alerts first (Marquees, Alert Banners)
             explicit_alerts = []
+
             # Check marquees (common on govt sites)
             for marquee in soup.find_all("marquee"):
                 text = marquee.get_text(strip=True)
                 if text and len(text) > 20 and "welcome" not in text.lower():
                     explicit_alerts.append(text)
+
             # Check alert divs
             for alert_div in soup.select(".alert, .notice, .warning, .news-ticker"):
                 text = alert_div.get_text(strip=True)

@@ -1533,44 +1535,50 @@ def tool_health_alerts() -> Dict[str, Any]:
                     explicit_alerts.append(text)
 
             # Add explicit alerts found
+            for alert_text in explicit_alerts[:3]:  # Limit to 3
                 # Filter out "Circular" noise which is document listing, not public health alert
                 if "circular" not in alert_text.lower():
+                    result["alerts"].append(
+                        {
+                            "type": "health_notice",
+                            "text": alert_text[:200],  # Truncate clean text
+                            "severity": "medium",
+                        }
+                    )
 
             # 3. If no explicit alerts, do a safer text search on remaining body content
             if not result["alerts"]:
                 # Get text only from main content area if possible
+                main_content = (
+                    soup.select_one("main, #content, .container, body") or soup.body
+                )
                 page_text = main_content.get_text(separator=" ", strip=True).lower()
+
                 # Check for outbreak keywords in context
                 outbreak_keywords = [
                     "dengue outbreak",
                     "epidemic alert",
                     "health emergency",
                     "spread of disease",
+                    "influenza warning",
                 ]
+
                 for kw in outbreak_keywords:
                     if kw in page_text:
                         idx = page_text.find(kw)
                         # Extract sentence-like context
                         context = page_text[max(0, idx - 20) : idx + 150]
                         # Clean up
+                        context = " ".join(context.split())
+
                         if len(context) > 20 and "circular" not in context:
+                            result["alerts"].append(
+                                {
+                                    "type": "health_notice",
+                                    "text": f"...{context}...",
+                                    "severity": "medium",
+                                }
+                            )
                             break
 
             # 4. Check for Dengue stats specifically

@@ -1578,23 +1586,27 @@ def tool_health_alerts() -> Dict[str, Any]:
             if dengue_match:
                 try:
                     result["dengue"]["weekly_cases"] = int(dengue_match.group(1))
+                    logger.info(
+                        f"[HEALTH] Found Dengue cases: {result['dengue']['weekly_cases']}"
+                    )
                 except ValueError:
                     pass
 
     except Exception as e:
         logger.warning(f"[HEALTH] Scraping error: {e}")
         # Don't fail completely, return baseline
+
     # fallback: If still no alerts, maybe add seasonal one
     if not result["alerts"]:
         current_month = utc_now().month
         if current_month in [5, 6, 10, 11, 12]:  # Monsoon = mosquito season
+            result["advisories"].append(
+                {
+                    "type": "seasonal",
+                    "text": "Mosquito Control: Remove stagnant water to prevent Dengue breeding.",
+                    "severity": "medium",
+                }
+            )
 
     # Update cache
    _health_cache = result
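The keyword scan above keeps a window of roughly 20 characters before the match and 150 after, then collapses whitespace. The same step isolated as a helper; the extract_context name and defaults are ours, for illustration:

# Sketch of the context-window extraction used in the hunk above.
def extract_context(page_text, keyword, before=20, after=150):
    idx = page_text.find(keyword)
    if idx == -1:
        return None
    window = page_text[max(0, idx - before) : idx + after]
    # Collapse whitespace runs left over from HTML text extraction.
    return " ".join(window.split())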
@@ -1869,24 +1881,37 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.waterboard.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
+
             # 1. Clean DOM - Remove typically noisy elements
+            for trash in soup.find_all(
+                [
+                    "nav",
+                    "header",
+                    "footer",
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "form",
+                ]
+            ):
                 trash.decompose()
+
             # Remove menu containers explicitly
+            for menu in soup.select(
+                ".menu, .navigation, #main-menu, .top-bar, .service-block"
+            ):
                 menu.decompose()
 
             # 2. Look for explicit alerts (Marquee is common on SL govt sites)
             alerts_found = []
+
             # Check marquees
             for marquee in soup.find_all("marquee"):
                 text = marquee.get_text(separator=" ", strip=True)
                 if len(text) > 10:
                     alerts_found.append({"text": text, "source": "ticker"})
+
             # Check alert classes
             for alert in soup.select(".alert, .notice, .warning, .news-ticker"):
                 text = alert.get_text(separator=" ", strip=True)

@@ -1895,54 +1920,98 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
 
             # 3. If no explicit alerts, search body text with STRICTER validation
             if not alerts_found:
+                main_content = (
+                    soup.select_one("main, #content, .container, body") or soup.body
+                )
+                if main_content:
                     # Get paragraph texts mainly
                     for p in main_content.find_all(["p", "div", "span"]):
                         text = p.get_text(strip=True)
+                        if (
+                            len(text) < 20 or len(text) > 300
+                        ):  # Ignore too short/long blocks
                             continue
+
                         text_lower = text.lower()
+
                         # Must have explicit "water" context AND disruption keyword
+                        has_water = any(
+                            w in text_lower
+                            for w in [
+                                "water supply",
+                                "water cut",
+                                "nwsdb",
+                                "water board",
+                            ]
+                        )
+                        has_issue = any(
+                            w in text_lower
+                            for w in [
+                                "interruption",
+                                "disruption",
+                                "suspended",
+                                "stopped",
+                                "low pressure",
+                            ]
+                        )
+
                         # Stopwords that indicate this is NOT an alert (slogans, payment info, etc)
+                        is_garbage = any(
+                            w in text_lower
+                            for w in [
+                                "benefits",
+                                "payment",
+                                "service without",
+                                "bill",
+                                "vision",
+                                "mission",
+                            ]
+                        )
+
                         if has_water and has_issue and not is_garbage:
+                            alerts_found.append(
+                                {"text": text, "source": "content_match"}
+                            )
 
             # Process found alerts
             for item in alerts_found:
                 text = item["text"]
                 text_lower = text.lower()
+
                 # Double check garbage filtering
+                if any(
+                    w in text_lower
+                    for w in ["benefits", "payment", "check out", "click here"]
+                ):
                     continue
 
                 result["status"] = "disruptions_reported"
+
                 # Extract Area
                 area = "Multiple areas"
                 # Common major areas regex
+                area_match = re.search(
+                    r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)",
+                    text_lower,
+                    re.I,
+                )
                 if area_match:
                     area = area_match.group(1).title()
+
                 # Deduplicate
                 if not any(d["details"] == text for d in result["active_disruptions"]):
+                    result["active_disruptions"].append(
+                        {
+                            "area": area,
+                            "type": "Water Disruption",
+                            "details": text[:200] + ("..." if len(text) > 200 else ""),
+                            "severity": "medium",
+                        }
+                    )
+
+            logger.info(
+                f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}"
+            )
 
         # If no disruptions found via scraping, report normal
         if not result["active_disruptions"]:
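The stricter validation accepts a paragraph only when it names water infrastructure, names a disruption term, and trips none of the slogan/billing stopwords. The same three-way test as a small predicate; the keyword lists are copied from the diff, while the function itself is an illustrative sketch:

# Sketch of the three-way content validation above.
WATER_WORDS = ["water supply", "water cut", "nwsdb", "water board"]
ISSUE_WORDS = ["interruption", "disruption", "suspended", "stopped", "low pressure"]
GARBAGE_WORDS = ["benefits", "payment", "service without", "bill", "vision", "mission"]

def looks_like_disruption(text: str) -> bool:
    t = text.lower()
    has_water = any(w in t for w in WATER_WORDS)
    has_issue = any(w in t for w in ISSUE_WORDS)
    is_garbage = any(w in t for w in GARBAGE_WORDS)
    return has_water and has_issue and not is_garbage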