nivakaran committed · Commit 52329fa · verified · 1 Parent(s): ff3017c

Deploy from GitHub Actions

src/rag.py CHANGED
@@ -382,34 +382,34 @@ class RogerRAG:
         )
 
         # DIVERSITY RERANKING
-        # Ensure we don't just show 5 gazettes.
+        # Ensure we don't just show 5 gazettes.
         # We want a mix of domains if possible.
         unique_domains = {}
         diverse_docs = []
-
+
         # Priority domains for situational awareness
-        priority_domains = {'intelligence', 'social', 'economical', 'meteorological'}
-
+        priority_domains = {"intelligence", "social", "economical", "meteorological"}
+
         for doc in raw_docs:
             domain = doc.get("domain", "unknown")
             platform = doc.get("metadata", {}).get("platform", "unknown")
-
+
             # Key to track redundancy: domain + platform
             key = f"{domain}_{platform}"
-
-            # Allow max 2 docs per domain/platform combo,
+
+            # Allow max 2 docs per domain/platform combo,
             # UNLESS it's a priority domain with high similarity (>0.4)
             limit = 2
-            if domain in priority_domains and doc['similarity'] > 0.4:
+            if domain in priority_domains and doc["similarity"] > 0.4:
                 limit = 3
-
+
             if unique_domains.get(key, 0) < limit:
                 diverse_docs.append(doc)
                 unique_domains[key] = unique_domains.get(key, 0) + 1
-
+
             if len(diverse_docs) >= 7:  # Stop after getting 7 diverse docs
                 break
-
+
         docs = diverse_docs
 
         if not docs:
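
Aside from the switch to double quotes, this hunk is whitespace-only reformatting; the reranking logic is unchanged. A minimal, self-contained sketch of that logic (the raw_docs sample below is hypothetical; only the names and caps come from the diff):

priority_domains = {"intelligence", "social", "economical", "meteorological"}

# Hypothetical sample input, not taken from the repository
raw_docs = [
    {"domain": "legal", "metadata": {"platform": "gazette"}, "similarity": 0.61},
    {"domain": "legal", "metadata": {"platform": "gazette"}, "similarity": 0.58},
    {"domain": "legal", "metadata": {"platform": "gazette"}, "similarity": 0.55},
    {"domain": "social", "metadata": {"platform": "twitter"}, "similarity": 0.47},
]

unique_domains = {}
diverse_docs = []
for doc in raw_docs:
    domain = doc.get("domain", "unknown")
    platform = doc.get("metadata", {}).get("platform", "unknown")
    key = f"{domain}_{platform}"
    # Max 2 docs per domain/platform pair, 3 for high-similarity priority domains
    limit = 2
    if domain in priority_domains and doc["similarity"] > 0.4:
        limit = 3
    if unique_domains.get(key, 0) < limit:
        diverse_docs.append(doc)
        unique_domains[key] = unique_domains.get(key, 0) + 1
    if len(diverse_docs) >= 7:
        break

print([d["domain"] for d in diverse_docs])  # ['legal', 'legal', 'social']

Capping each domain/platform pair at two (three for confident priority hits) keeps a single noisy source, such as a run of gazettes, from filling the whole context.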
src/storage/storage_manager.py CHANGED
@@ -402,7 +402,7 @@ class StorageManager:
         try:
             entries = self.sqlite_cache.search_entries(query, limit=limit)
             feeds = []
-
+
             for entry in entries:
                 event_id = entry.get("event_id")
                 if not event_id:
@@ -414,25 +414,31 @@
                     metadata = {}
                     if chroma_data and chroma_data["metadatas"]:
                         metadata = chroma_data["metadatas"][0]
-
-                    feeds.append({
-                        "event_id": event_id,
-                        "summary": entry.get("summary_preview", ""),
-                        "domain": metadata.get("domain", "unknown"),
-                        "severity": metadata.get("severity", "medium"),
-                        "timestamp": metadata.get("timestamp", entry.get("last_seen")),
-                        "source": metadata.get("source", "feed")
-                    })
+
+                    feeds.append(
+                        {
+                            "event_id": event_id,
+                            "summary": entry.get("summary_preview", ""),
+                            "domain": metadata.get("domain", "unknown"),
+                            "severity": metadata.get("severity", "medium"),
+                            "timestamp": metadata.get(
+                                "timestamp", entry.get("last_seen")
+                            ),
+                            "source": metadata.get("source", "feed"),
+                        }
+                    )
                 except Exception:
                     # Fallback if chroma fails
-                    feeds.append({
-                        "event_id": event_id,
-                        "summary": entry.get("summary_preview", ""),
-                        "domain": "unknown",
-                        "severity": "medium",
-                        "timestamp": entry.get("last_seen")
-                    })
-
+                    feeds.append(
+                        {
+                            "event_id": event_id,
+                            "summary": entry.get("summary_preview", ""),
+                            "domain": "unknown",
+                            "severity": "medium",
+                            "timestamp": entry.get("last_seen"),
+                        }
+                    )
+
             return feeds
 
         except Exception as e:
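
The two rewrapped feeds.append(...) calls implement an enrich-then-fallback pattern: take metadata from Chroma when the lookup succeeds, otherwise emit the entry with defaults. A runnable sketch under stated assumptions (fake_chroma_get and the sample entries are stand-ins, not the real StorageManager internals):

def fake_chroma_get(event_id):
    # Stand-in for the Chroma lookup; fails for anything but evt-1
    if event_id == "evt-1":
        return {"metadatas": [{"domain": "social", "severity": "high",
                               "timestamp": "2024-05-01T10:00:00Z", "source": "twitter"}]}
    raise RuntimeError("chroma unavailable")

entries = [
    {"event_id": "evt-1", "summary_preview": "Flooding reported", "last_seen": "2024-05-01"},
    {"event_id": "evt-2", "summary_preview": "Road closure", "last_seen": "2024-05-02"},
]

feeds = []
for entry in entries:
    event_id = entry.get("event_id")
    if not event_id:
        continue
    try:
        chroma_data = fake_chroma_get(event_id)
        metadata = chroma_data["metadatas"][0] if chroma_data["metadatas"] else {}
        feeds.append(
            {
                "event_id": event_id,
                "summary": entry.get("summary_preview", ""),
                "domain": metadata.get("domain", "unknown"),
                "severity": metadata.get("severity", "medium"),
                "timestamp": metadata.get("timestamp", entry.get("last_seen")),
                "source": metadata.get("source", "feed"),
            }
        )
    except Exception:
        # Fallback if chroma fails: keep the entry with defaults
        feeds.append(
            {
                "event_id": event_id,
                "summary": entry.get("summary_preview", ""),
                "domain": "unknown",
                "severity": "medium",
                "timestamp": entry.get("last_seen"),
            }
        )

print(len(feeds))  # 2 -- evt-2 survives via the fallback branch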
src/utils/trending_detector.py CHANGED
@@ -76,10 +76,26 @@ TRENDING_STOPWORDS = {
     "time",
     "date",
     # Days
-    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
+    "monday",
+    "tuesday",
+    "wednesday",
+    "thursday",
+    "friday",
+    "saturday",
+    "sunday",
     # Months
-    "january", "february", "march", "april", "may", "june",
-    "july", "august", "september", "october", "november", "december",
+    "january",
+    "february",
+    "march",
+    "april",
+    "may",
+    "june",
+    "july",
+    "august",
+    "september",
+    "october",
+    "november",
+    "december",
     # Generic actions/descriptions
     "said",
     "says",
@@ -116,7 +132,11 @@ TRENDING_STOPWORDS = {
     "nation",
     "country",
     "state",
-    "western", "eastern", "southern", "northern", "central",
+    "western",
+    "eastern",
+    "southern",
+    "northern",
+    "central",
 }
 
 
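For context on why this set is worth keeping tidy: TRENDING_STOPWORDS is the kind of set consumed as a token filter before counting trending terms. A minimal sketch, with a deliberately simplified tokenizer (an assumption, not the detector's actual implementation):

import re

# Abbreviated copy of the stopword set for the sketch
TRENDING_STOPWORDS = {"monday", "january", "western", "said", "says", "nation"}

def candidate_terms(text):
    # Lowercase word tokens, minus stopwords and very short words
    tokens = re.findall(r"[a-z]+", text.lower())
    return [t for t in tokens if t not in TRENDING_STOPWORDS and len(t) > 3]

print(candidate_terms("Western province floods said to worsen Monday"))
# ['province', 'floods', 'worsen']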
src/utils/utils.py CHANGED
@@ -1508,24 +1508,26 @@ def tool_health_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.health.gov.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-
+
             # 1. Clean up DOM - Remove navigation, footers, scripts that contain keyword noise
-            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe"]):
+            for trash in soup.find_all(
+                ["nav", "header", "footer", "script", "style", "noscript", "iframe"]
+            ):
                 trash.decompose()
-
+
             # Also remove specific menu containers if identifiable
             for menu in soup.select(".menu, .navigation, #main-menu, .top-bar"):
                 menu.decompose()
 
             # 2. Look for explicit alerts first (Marquees, Alert Banners)
             explicit_alerts = []
-
+
             # Check marquees (common on govt sites)
             for marquee in soup.find_all("marquee"):
                 text = marquee.get_text(strip=True)
                 if text and len(text) > 20 and "welcome" not in text.lower():
                     explicit_alerts.append(text)
-
+
             # Check alert divs
             for alert_div in soup.select(".alert, .notice, .warning, .news-ticker"):
                 text = alert_div.get_text(strip=True)
@@ -1533,44 +1535,50 @@ def tool_health_alerts() -> Dict[str, Any]:
                     explicit_alerts.append(text)
 
             # Add explicit alerts found
-            for alert_text in explicit_alerts[:3]: # Limit to 3
+            for alert_text in explicit_alerts[:3]:  # Limit to 3
                 # Filter out "Circular" noise which is document listing, not public health alert
                 if "circular" not in alert_text.lower():
-                    result["alerts"].append({
-                        "type": "health_notice",
-                        "text": alert_text[:200], # Truncate clean text
-                        "severity": "medium"
-                    })
+                    result["alerts"].append(
+                        {
+                            "type": "health_notice",
+                            "text": alert_text[:200],  # Truncate clean text
+                            "severity": "medium",
+                        }
+                    )
 
             # 3. If no explicit alerts, do a safer text search on remaining body content
             if not result["alerts"]:
                 # Get text only from main content area if possible
-                main_content = soup.select_one("main, #content, .container, body") or soup.body
+                main_content = (
+                    soup.select_one("main, #content, .container, body") or soup.body
+                )
                 page_text = main_content.get_text(separator=" ", strip=True).lower()
-
+
                 # Check for outbreak keywords in context
                 outbreak_keywords = [
                     "dengue outbreak",
                     "epidemic alert",
                     "health emergency",
                     "spread of disease",
-                    "influenza warning"
+                    "influenza warning",
                 ]
-
+
                 for kw in outbreak_keywords:
                     if kw in page_text:
                         idx = page_text.find(kw)
                         # Extract sentence-like context
                         context = page_text[max(0, idx - 20) : idx + 150]
                         # Clean up
-                        context = " ".join(context.split())
-
+                        context = " ".join(context.split())
+
                         if len(context) > 20 and "circular" not in context:
-                            result["alerts"].append({
-                                "type": "health_notice",
-                                "text": f"...{context}...",
-                                "severity": "medium"
-                            })
+                            result["alerts"].append(
+                                {
+                                    "type": "health_notice",
+                                    "text": f"...{context}...",
+                                    "severity": "medium",
+                                }
+                            )
                         break
 
             # 4. Check for Dengue stats specifically
@@ -1578,23 +1586,27 @@ def tool_health_alerts() -> Dict[str, Any]:
             if dengue_match:
                 try:
                     result["dengue"]["weekly_cases"] = int(dengue_match.group(1))
-                    logger.info(f"[HEALTH] Found Dengue cases: {result['dengue']['weekly_cases']}")
+                    logger.info(
+                        f"[HEALTH] Found Dengue cases: {result['dengue']['weekly_cases']}"
+                    )
                 except ValueError:
                     pass
 
     except Exception as e:
         logger.warning(f"[HEALTH] Scraping error: {e}")
         # Don't fail completely, return baseline
-
+
     # fallback: If still no alerts, maybe add seasonal one
     if not result["alerts"]:
         current_month = utc_now().month
        if current_month in [5, 6, 10, 11, 12]:  # Monsoon = mosquito season
-            result["advisories"].append({
-                "type": "seasonal",
-                "text": "Mosquito Control: Remove stagnant water to prevent Dengue breeding.",
-                "severity": "medium",
-            })
+            result["advisories"].append(
+                {
+                    "type": "seasonal",
+                    "text": "Mosquito Control: Remove stagnant water to prevent Dengue breeding.",
+                    "severity": "medium",
+                }
+            )
 
     # Update cache
     _health_cache = result
@@ -1869,24 +1881,37 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.waterboard.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-
+
             # 1. Clean DOM - Remove typically noisy elements
-            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe", "form"]):
+            for trash in soup.find_all(
+                [
+                    "nav",
+                    "header",
+                    "footer",
+                    "script",
+                    "style",
+                    "noscript",
+                    "iframe",
+                    "form",
+                ]
+            ):
                 trash.decompose()
-
+
             # Remove menu containers explicitly
-            for menu in soup.select(".menu, .navigation, #main-menu, .top-bar, .service-block"):
+            for menu in soup.select(
+                ".menu, .navigation, #main-menu, .top-bar, .service-block"
+            ):
                 menu.decompose()
 
             # 2. Look for explicit alerts (Marquee is common on SL govt sites)
             alerts_found = []
-
+
             # Check marquees
             for marquee in soup.find_all("marquee"):
                 text = marquee.get_text(separator=" ", strip=True)
                 if len(text) > 10:
                     alerts_found.append({"text": text, "source": "ticker"})
-
+
             # Check alert classes
             for alert in soup.select(".alert, .notice, .warning, .news-ticker"):
                 text = alert.get_text(separator=" ", strip=True)
@@ -1895,54 +1920,98 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
 
             # 3. If no explicit alerts, search body text with STRICTER validation
             if not alerts_found:
-                main_content = soup.select_one("main, #content, .container, body") or soup.body
-                if main_content:
+                main_content = (
+                    soup.select_one("main, #content, .container, body") or soup.body
+                )
+                if main_content:
                     # Get paragraph texts mainly
                     for p in main_content.find_all(["p", "div", "span"]):
                         text = p.get_text(strip=True)
-                        if len(text) < 20 or len(text) > 300: # Ignore too short/long blocks
+                        if (
+                            len(text) < 20 or len(text) > 300
+                        ):  # Ignore too short/long blocks
                             continue
-
+
                         text_lower = text.lower()
-
+
                         # Must have explicit "water" context AND disruption keyword
-                        has_water = any(w in text_lower for w in ["water supply", "water cut", "nwsdb", "water board"])
-                        has_issue = any(w in text_lower for w in ["interruption", "disruption", "suspended", "stopped", "low pressure"])
-
+                        has_water = any(
+                            w in text_lower
+                            for w in [
+                                "water supply",
+                                "water cut",
+                                "nwsdb",
+                                "water board",
+                            ]
+                        )
+                        has_issue = any(
+                            w in text_lower
+                            for w in [
+                                "interruption",
+                                "disruption",
+                                "suspended",
+                                "stopped",
+                                "low pressure",
+                            ]
+                        )
+
                         # Stopwords that indicate this is NOT an alert (slogans, payment info, etc)
-                        is_garbage = any(w in text_lower for w in ["benefits", "payment", "service without", "bill", "vision", "mission"])
-
+                        is_garbage = any(
+                            w in text_lower
+                            for w in [
+                                "benefits",
+                                "payment",
+                                "service without",
+                                "bill",
+                                "vision",
+                                "mission",
+                            ]
+                        )
+
                         if has_water and has_issue and not is_garbage:
-                            alerts_found.append({"text": text, "source": "content_match"})
+                            alerts_found.append(
+                                {"text": text, "source": "content_match"}
+                            )
 
             # Process found alerts
             for item in alerts_found:
                 text = item["text"]
                 text_lower = text.lower()
-
+
                 # Double check garbage filtering
-                if any(w in text_lower for w in ["benefits", "payment", "check out", "click here"]):
+                if any(
+                    w in text_lower
+                    for w in ["benefits", "payment", "check out", "click here"]
+                ):
                     continue
 
                 result["status"] = "disruptions_reported"
-
+
                 # Extract Area
                 area = "Multiple areas"
                 # Common major areas regex
-                area_match = re.search(r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)", text_lower, re.I)
+                area_match = re.search(
+                    r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)",
+                    text_lower,
+                    re.I,
+                )
                 if area_match:
                     area = area_match.group(1).title()
-
+
                 # Deduplicate
                 if not any(d["details"] == text for d in result["active_disruptions"]):
-                    result["active_disruptions"].append({
-                        "area": area,
-                        "type": "Water Disruption",
-                        "details": text[:200] + ("..." if len(text) > 200 else ""),
-                        "severity": "medium"
-                    })
-
-            logger.info(f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}")
+                    result["active_disruptions"].append(
+                        {
+                            "area": area,
+                            "type": "Water Disruption",
+                            "details": text[:200] + ("..." if len(text) > 200 else ""),
+                            "severity": "medium",
+                        }
+                    )
+
+            logger.info(
+                f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}"
+            )
 
         # If no disruptions found via scraping, report normal
         if not result["active_disruptions"]:
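
Both tool_health_alerts and tool_water_supply_alerts follow the same strip-the-chrome-then-read-alert-surfaces pattern that this commit rewraps. A self-contained sketch of that pattern against static HTML (the sample markup is invented; only the BeautifulSoup calls mirror the diff):

from bs4 import BeautifulSoup

# Invented sample page standing in for a live fetch
html = """
<html><body>
  <nav>Home | About | Circulars</nav>
  <marquee>Dengue outbreak reported in several districts - take precautions</marquee>
  <div class="alert">Water supply interruption in Colombo due to maintenance</div>
</body></html>
"""

soup = BeautifulSoup(html, "html.parser")

# 1. Strip noisy chrome before any keyword matching
for trash in soup.find_all(["nav", "header", "footer", "script", "style"]):
    trash.decompose()

# 2. Collect explicit alert surfaces (marquees and alert divs)
alerts = []
for marquee in soup.find_all("marquee"):
    text = marquee.get_text(strip=True)
    if text and len(text) > 20 and "welcome" not in text.lower():
        alerts.append(text)
for div in soup.select(".alert, .notice, .warning, .news-ticker"):
    alerts.append(div.get_text(strip=True))

print(alerts)
# ['Dengue outbreak reported in several districts - take precautions',
#  'Water supply interruption in Colombo due to maintenance']

Decomposing navigation and menus first matters because government portals repeat alert-like keywords in their chrome; matching before the cleanup step would flag menu labels as alerts.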