nivakaran committed · ff3017c · verified · parent: e78fedb

Deploy from GitHub Actions

frontend/app/components/dashboard/TrendingTopics.tsx CHANGED
@@ -5,12 +5,20 @@
 
 import React, { useEffect, useState } from 'react';
 
+interface RelatedFeed {
+  summary: string;
+  domain: string;
+  timestamp: string;
+  source: string;
+}
+
 interface TrendingTopic {
   topic: string;
   momentum: number;
   is_spike: boolean;
   count_current_hour?: number;
   avg_count?: number;
+  related_feeds?: RelatedFeed[];
 }
 
 interface TrendingData {
@@ -44,7 +52,6 @@ export const TrendingTopics: React.FC = () => {
     };
 
     fetchTrending();
-    // Refresh every 30 seconds
    const interval = setInterval(fetchTrending, 30000);
    return () => clearInterval(interval);
  }, []);
@@ -159,23 +166,37 @@ export const TrendingTopics: React.FC = () => {
           data.trending_topics.slice(0, 8).map((topic, idx) => (
             <div
               key={idx}
-              className={`flex items-center justify-between p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
+              className={`flex flex-col p-3 rounded-xl ${getMomentumBg(topic.momentum)} border border-gray-700/30 transition-all hover:scale-[1.02]`}
             >
-              <div className="flex items-center gap-3">
-                <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
-                <div>
-                  <p className="font-semibold text-white capitalize">{topic.topic}</p>
-                  <p className="text-xs text-gray-400">
-                    {topic.is_spike ? '🔥 Spiking' : 'Trending'}
-                  </p>
-                </div>
-              </div>
-              <div className="text-right">
-                <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
-                  {topic.momentum.toFixed(0)}x
-                </p>
-                <p className="text-xs text-gray-500">momentum</p>
-              </div>
+              <div className="flex items-center justify-between w-full">
+                <div className="flex items-center gap-3">
+                  <span className="text-lg font-bold text-gray-500">#{idx + 1}</span>
+                  <div>
+                    <p className="font-semibold text-white capitalize">{topic.topic}</p>
+                    <p className="text-xs text-gray-400">
+                      {topic.is_spike ? '🔥 Spiking' : 'Trending'}
+                    </p>
+                  </div>
+                </div>
+                <div className="text-right">
+                  <p className={`text-lg font-bold ${getMomentumColor(topic.momentum)}`}>
+                    {topic.momentum.toFixed(0)}x
+                  </p>
+                  <p className="text-xs text-gray-500">momentum</p>
+                </div>
+              </div>
+
+              {/* Related feeds context */}
+              {topic.related_feeds && topic.related_feeds.length > 0 && (
+                <div className="mt-3 pl-3 border-l-2 border-gray-600/30 space-y-2">
+                  {topic.related_feeds.map((feed, fIdx) => (
+                    <div key={fIdx} className="text-xs text-gray-300/80 leading-relaxed">
+                      <span className="text-gray-500 font-medium text-[10px] uppercase tracking-wider mr-2">[{feed.domain}]</span>
+                      {feed.summary.length > 100 ? feed.summary.substring(0, 100) + '...' : feed.summary}
+                    </div>
+                  ))}
+                </div>
+              )}
             </div>
           ))
         )}
inspect_chroma.py ADDED
@@ -0,0 +1,61 @@
+
+import logging
+import sys
+from pathlib import Path
+from collections import Counter
+
+# Setup path
+sys.path.append(str(Path.cwd()))
+
+from src.storage.config import config
+from src.storage.chromadb_store import ChromaDBStore
+
+# Mute logging
+logging.basicConfig(level=logging.ERROR)
+
+def inspect():
+    print("Connecting to ChromaDB...")
+    store = ChromaDBStore()
+
+    if not store.collection:
+        print("Could not connect to collection.")
+        return
+
+    count = store.collection.count()
+    print(f"Total documents: {count}")
+
+    if count == 0:
+        return
+
+    # Fetch all metadata in one call: ChromaDB's get() without ids
+    # returns everything. At the ~2,000 documents this store holds,
+    # that is fine; larger collections should page with limit/offset.
+
+    print("Fetching metadata...")
+    data = store.collection.get(include=["metadatas"])
+    metadatas = data["metadatas"]
+
+    domains = Counter()
+    sources = Counter()
+    impacts = Counter()
+
+    for meta in metadatas:
+        if not meta: continue
+        domains[meta.get("domain", "unknown")] += 1
+        sources[meta.get("platform", "unknown")] += 1
+        impacts[meta.get("impact_type", "unknown")] += 1
+
+    print("\n--- Domain Distribution ---")
+    for d, c in domains.most_common():
+        print(f"{d}: {c}")
+
+    print("\n--- Source/Platform Distribution ---")
+    for s, c in sources.most_common():
+        print(f"{s}: {c}")
+
+    print("\n--- Impact Type Distribution ---")
+    for i, c in impacts.most_common():
+        print(f"{i}: {c}")
+
+if __name__ == "__main__":
+    inspect()
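
Note: the script pulls every metadata record in a single get() call, which is fine at the ~2,000-document scale it targets. For larger collections a paged variant is safer; a minimal sketch, assuming Chroma's standard limit/offset parameters on collection.get():

def iter_metadatas(collection, batch_size: int = 500):
    # Hypothetical paged variant (not part of this commit):
    # fetch metadata in fixed-size batches instead of one call.
    offset = 0
    while True:
        batch = collection.get(include=["metadatas"], limit=batch_size, offset=offset)
        metadatas = batch["metadatas"]
        if not metadatas:
            break
        yield from metadatas
        offset += len(metadatas)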
main.py CHANGED
@@ -1113,10 +1113,25 @@ def get_trending_topics(limit: int = 10):
     """
     try:
         from src.utils.trending_detector import get_trending_now, get_spikes
-
+        # Use the global storage_manager instance defined earlier in main.py;
+        # route functions in this module can see it directly, so no local
+        # import is needed. This assumes storage_manager was initialized at
+        # startup, before this route is first called.
+
         trending = get_trending_now(limit=limit)
         spikes = get_spikes()
 
+        # Enrich the top 5 trending topics with related feeds
+        for topic in trending[:5]:
+            keyword = topic["topic"]
+            # Limit to 2 feeds per topic to keep the payload small
+            try:
+                related = storage_manager.search_feeds(keyword, limit=2)
+                topic["related_feeds"] = related
+            except Exception as e:
+                logger.warning(f"Error searching feeds for topic {keyword}: {e}")
+                topic["related_feeds"] = []
+
         return {
             "status": "success",
             "trending_topics": trending,
src/rag.py CHANGED
@@ -375,10 +375,43 @@ class RogerRAG:
         search_question = self._reformulate_question(question)
 
         # ChromaDB semantic search
-        docs = self.retriever.search(
-            search_question, n_results=5, domain_filter=domain_filter
+        # Fetch a larger candidate pool (20 results) so there is room to
+        # filter for domain diversity before building the final context.
+        raw_docs = self.retriever.search(
+            search_question, n_results=20, domain_filter=domain_filter
         )
 
+        # DIVERSITY RERANKING
+        # Ensure the context is not dominated by one source (e.g. 5 gazettes);
+        # aim for a mix of domains where possible.
+        unique_domains = {}
+        diverse_docs = []
+
+        # Priority domains for situational awareness
+        priority_domains = {'intelligence', 'social', 'economical', 'meteorological'}
+
+        for doc in raw_docs:
+            domain = doc.get("domain", "unknown")
+            platform = doc.get("metadata", {}).get("platform", "unknown")
+
+            # Key to track redundancy: domain + platform
+            key = f"{domain}_{platform}"
+
+            # Allow at most 2 docs per domain/platform combo,
+            # unless it's a priority domain with high similarity (>0.4)
+            limit = 2
+            if domain in priority_domains and doc['similarity'] > 0.4:
+                limit = 3
+
+            if unique_domains.get(key, 0) < limit:
+                diverse_docs.append(doc)
+                unique_domains[key] = unique_domains.get(key, 0) + 1
+
+            if len(diverse_docs) >= 7:  # Stop after collecting 7 diverse docs
+                break
+
+        docs = diverse_docs
+
         if not docs:
             return {
                 "answer": "I couldn't find any relevant intelligence data to answer your question.",
src/storage/sqlite_cache.py CHANGED
@@ -151,6 +151,37 @@ class SQLiteCache:
         conn.close()
         return results
 
+    def search_entries(self, query: str, limit: int = 10) -> list:
+        """
+        Search for entries containing specific text.
+        Args:
+            query: Text to search for (case-insensitive LIKE)
+            limit: Max results
+        """
+        if not query or len(query) < 2:
+            return []
+
+        conn = sqlite3.connect(self.db_path)
+        cursor = conn.execute(
+            "SELECT content_hash, first_seen, last_seen, event_id, summary_preview FROM seen_hashes WHERE summary_preview LIKE ? ORDER BY last_seen DESC LIMIT ?",
+            (f"%{query}%", limit),
+        )
+
+        results = []
+        for row in cursor.fetchall():
+            results.append(
+                {
+                    "content_hash": row[0],
+                    "first_seen": row[1],
+                    "last_seen": row[2],
+                    "event_id": row[3],
+                    "summary_preview": row[4],
+                }
+            )
+
+        conn.close()
+        return results
+
     def get_entries_since(self, timestamp: str) -> list:
         """
         Get entries added/updated after timestamp.
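
One caveat: % and _ inside the user's query act as LIKE wildcards, so a query like "100%" matches more than intended. If literal matching is wanted, a hedged sketch of wildcard escaping (not part of this commit):

def _escape_like(term: str) -> str:
    # Escape LIKE metacharacters so user input matches literally.
    return term.replace("\\", "\\\\").replace("%", "\\%").replace("_", "\\_")

# The SELECT would then add an ESCAPE clause:
#   ... WHERE summary_preview LIKE ? ESCAPE '\'
# with the parameter built as (f"%{_escape_like(query)}%", limit)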
src/storage/storage_manager.py CHANGED
@@ -393,6 +393,53 @@ class StorageManager:
             logger.error(f"[FEED_RETRIEVAL] Error: {e}")
             return []
 
+        return feeds
+
+    def search_feeds(self, query: str, limit: int = 5) -> List[Dict[str, Any]]:
+        """
+        Search feeds by keyword and return enriched results.
+        """
+        try:
+            entries = self.sqlite_cache.search_entries(query, limit=limit)
+            feeds = []
+
+            for entry in entries:
+                event_id = entry.get("event_id")
+                if not event_id:
+                    continue
+
+                try:
+                    # Try to get metadata from Chroma (optional enrichment)
+                    chroma_data = self.chromadb.collection.get(ids=[event_id])
+                    metadata = {}
+                    if chroma_data and chroma_data["metadatas"]:
+                        metadata = chroma_data["metadatas"][0]
+
+                    feeds.append({
+                        "event_id": event_id,
+                        "summary": entry.get("summary_preview", ""),
+                        "domain": metadata.get("domain", "unknown"),
+                        "severity": metadata.get("severity", "medium"),
+                        "timestamp": metadata.get("timestamp", entry.get("last_seen")),
+                        "source": metadata.get("source", "feed")
+                    })
+                except Exception:
+                    # Fallback if the Chroma lookup fails
+                    feeds.append({
+                        "event_id": event_id,
+                        "summary": entry.get("summary_preview", ""),
+                        "domain": "unknown",
+                        "severity": "medium",
+                        "timestamp": entry.get("last_seen"),
+                        "source": "feed"
+                    })
+
+            return feeds
+
+        except Exception as e:
+            logger.error(f"[FEED_SEARCH] Error searching for '{query}': {e}")
+            return []
+
     def get_feeds_since(self, timestamp: datetime) -> List[Dict[str, Any]]:
         """
         Get all feeds added after given timestamp.
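
A quick smoke test of the new method, assuming a no-argument StorageManager constructor and already-populated caches:

# Hypothetical usage (not part of this commit):
manager = StorageManager()
for feed in manager.search_feeds("dengue", limit=2):
    print(feed["domain"], "-", feed["summary"][:80])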
src/utils/trending_detector.py CHANGED
@@ -70,12 +70,53 @@ TRENDING_STOPWORDS = {
     "week",
     "month",
     "year",
-    # Generic actions
+    "hour",
+    "minute",
+    "second",
+    "time",
+    "date",
+    # Days
+    "monday", "tuesday", "wednesday", "thursday", "friday", "saturday", "sunday",
+    # Months
+    "january", "february", "march", "april", "may", "june",
+    "july", "august", "september", "october", "november", "december",
+    # Generic actions/descriptions
     "said",
     "says",
     "told",
     "according",
     "sources",
+    "media",
+    "press",
+    "release",
+    "statement",
+    "general",
+    "public",
+    "national",
+    "international",
+    "local",
+    "central",
+    "department",
+    "division",
+    "authority",
+    "board",
+    "committee",
+    "director",
+    "secretary",
+    "commission",
+    "report",
+    "reports",
+    "reported",
+    # Location generic
+    "district",
+    "province",
+    "area",
+    "region",
+    "island",
+    "nation",
+    "country",
+    "state",
+    "western", "eastern", "southern", "northern",
 }
 
 
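
The expanded stopword set only takes effect if candidate tokens are lowercased before the membership test, since every entry is lowercase. An illustrative filtering step (the detector's actual tokenizer is not shown in this diff):

def candidate_terms(text: str):
    # Assumes simple whitespace tokenization; punctuation is stripped.
    for token in text.lower().split():
        word = token.strip(".,;:!?\"'()[]")
        if len(word) > 2 and word not in TRENDING_STOPWORDS:
            yield word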
src/utils/utils.py CHANGED
@@ -1472,6 +1472,7 @@ def tool_health_alerts() -> Dict[str, Any]:
     Get health alerts and disease outbreak information for Sri Lanka.
 
     Includes dengue case counts, epidemic alerts, and health advisories.
+    Filters out navigation text (circulars, menus) for cleaner alerts.
 
     Returns:
         Dict with health alerts, disease data, and notifications
@@ -1507,59 +1508,92 @@ def tool_health_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.health.gov.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-            page_text = soup.get_text(separator="\n", strip=True).lower()
-
-            # Check for outbreak keywords
-            outbreak_keywords = [
-                "outbreak",
-                "epidemic",
-                "alert",
-                "warning",
-                "emergency",
-            ]
-            for kw in outbreak_keywords:
-                if kw in page_text:
-                    # Try to extract the context
-                    idx = page_text.find(kw)
-                    context = page_text[max(0, idx - 50) : idx + 100]
-                    if len(context) > 20:
-                        result["alerts"].append(
-                            {
-                                "type": "health_notice",
-                                "text": context.strip()[:150],
-                                "severity": (
-                                    "medium" if kw in ["alert", "warning"] else "low"
-                                ),
-                            }
-                        )
-                        break
+
+            # 1. Clean up the DOM - drop navigation, footers, and scripts that add keyword noise
+            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe"]):
+                trash.decompose()
+
+            # Also remove specific menu containers if identifiable
+            for menu in soup.select(".menu, .navigation, #main-menu, .top-bar"):
+                menu.decompose()
+
+            # 2. Look for explicit alerts first (marquees, alert banners)
+            explicit_alerts = []
+
+            # Check marquees (common on govt sites)
+            for marquee in soup.find_all("marquee"):
+                text = marquee.get_text(strip=True)
+                if text and len(text) > 20 and "welcome" not in text.lower():
+                    explicit_alerts.append(text)
+
+            # Check alert divs
+            for alert_div in soup.select(".alert, .notice, .warning, .news-ticker"):
+                text = alert_div.get_text(strip=True)
+                if text and len(text) > 20:
+                    explicit_alerts.append(text)
+
+            # Add explicit alerts, filtering "circular" noise
+            # (document listings, not public health alerts)
+            for alert_text in explicit_alerts[:3]:  # Limit to 3
+                if "circular" not in alert_text.lower():
+                    result["alerts"].append({
+                        "type": "health_notice",
+                        "text": alert_text[:200],  # Truncate clean text
+                        "severity": "medium"
+                    })
+
+            # Extract text from the main content area only
+            # (also needed by the dengue check below)
+            main_content = soup.select_one("main, #content, .container, body") or soup.body
+            page_text = main_content.get_text(separator=" ", strip=True).lower()
+
+            # 3. If no explicit alerts were found, fall back to a keyword search
+            if not result["alerts"]:
+                outbreak_keywords = [
+                    "dengue outbreak",
+                    "epidemic alert",
+                    "health emergency",
+                    "spread of disease",
+                    "influenza warning"
+                ]
+
+                for kw in outbreak_keywords:
+                    if kw in page_text:
+                        idx = page_text.find(kw)
+                        # Extract sentence-like context and collapse whitespace
+                        context = page_text[max(0, idx - 20) : idx + 150]
+                        context = " ".join(context.split())
+
+                        if len(context) > 20 and "circular" not in context:
+                            result["alerts"].append({
+                                "type": "health_notice",
+                                "text": f"...{context}...",
+                                "severity": "medium"
+                            })
+                        break
 
-            # Check for dengue data
+            # 4. Check for dengue stats specifically
             dengue_match = re.search(r"dengue[:\s]*(\d{1,5})\s*(?:cases?)?", page_text)
             if dengue_match:
                 try:
                     result["dengue"]["weekly_cases"] = int(dengue_match.group(1))
+                    logger.info(f"[HEALTH] Found dengue cases: {result['dengue']['weekly_cases']}")
                 except ValueError:
                     pass
 
-            logger.info(
-                f"[HEALTH] Fetched - Dengue cases: {result['dengue']['weekly_cases']}"
-            )
-
-            # Add seasonal health advisory
-            current_month = utc_now().month
-            if current_month in [5, 6, 10, 11]:  # Monsoon = mosquito season
-                result["advisories"].append(
-                    {
-                        "type": "seasonal",
-                        "text": "Monsoon season: Increased dengue risk. Remove stagnant water around homes.",
-                        "severity": "medium",
-                    }
-                )
-
     except Exception as e:
         logger.warning(f"[HEALTH] Scraping error: {e}")
-        result["error"] = str(e)
+        # Don't fail completely; fall through and return the baseline result
+
+    # Fallback: if still no alerts, add a seasonal advisory
+    if not result["alerts"]:
+        current_month = utc_now().month
+        if current_month in [5, 6, 10, 11, 12]:  # Monsoon = mosquito season
+            result["advisories"].append({
+                "type": "seasonal",
+                "text": "Mosquito Control: Remove stagnant water to prevent dengue breeding.",
+                "severity": "medium",
+            })
 
     # Update cache
     _health_cache = result
@@ -1834,47 +1869,80 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
         resp = _safe_get("https://www.waterboard.lk/", timeout=30)
         if resp:
             soup = BeautifulSoup(resp.text, "html.parser")
-            page_text = soup.get_text(separator="\n", strip=True).lower()
-
-            # Check for disruption keywords
-            disruption_keywords = [
-                "disruption",
-                "interruption",
-                "cut off",
-                "maintenance",
-                "repair",
-            ]
-            for kw in disruption_keywords:
-                if kw in page_text:
-                    result["status"] = "disruptions_reported"
-                    idx = page_text.find(kw)
-                    context = page_text[max(0, idx - 30) : idx + 120]
-
-                    # Try to extract area name
-                    area_patterns = [
-                        r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura)",
-                        r"(nugegoda|dehiwala|mount lavinia|moratuwa|maharagama)",
-                    ]
-                    area = "Multiple areas"
-                    for pattern in area_patterns:
-                        match = re.search(pattern, context, re.I)
-                        if match:
-                            area = match.group(1).title()
-                            break
-
-                    result["active_disruptions"].append(
-                        {
-                            "area": area,
-                            "type": kw,
-                            "details": context.strip()[:150],
-                            "severity": "medium",
-                        }
-                    )
-                    break
-
-            logger.info(
-                f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}"
-            )
+
+            # 1. Clean the DOM - remove typically noisy elements
+            for trash in soup.find_all(["nav", "header", "footer", "script", "style", "noscript", "iframe", "form"]):
+                trash.decompose()
+
+            # Remove menu containers explicitly
+            for menu in soup.select(".menu, .navigation, #main-menu, .top-bar, .service-block"):
+                menu.decompose()
+
+            # 2. Look for explicit alerts (marquees are common on SL govt sites)
+            alerts_found = []
+
+            # Check marquees
+            for marquee in soup.find_all("marquee"):
+                text = marquee.get_text(separator=" ", strip=True)
+                if len(text) > 10:
+                    alerts_found.append({"text": text, "source": "ticker"})
+
+            # Check alert classes
+            for alert in soup.select(".alert, .notice, .warning, .news-ticker"):
+                text = alert.get_text(separator=" ", strip=True)
+                if len(text) > 10:
+                    alerts_found.append({"text": text, "source": "alert_box"})
+
+            # 3. If no explicit alerts, search body text with stricter validation
+            if not alerts_found:
+                main_content = soup.select_one("main, #content, .container, body") or soup.body
+                if main_content:
+                    # Scan mainly paragraph-sized text blocks
+                    for p in main_content.find_all(["p", "div", "span"]):
+                        text = p.get_text(strip=True)
+                        if len(text) < 20 or len(text) > 300:  # Ignore too-short/too-long blocks
+                            continue
+
+                        text_lower = text.lower()
+
+                        # Must have explicit "water" context AND a disruption keyword
+                        has_water = any(w in text_lower for w in ["water supply", "water cut", "nwsdb", "water board"])
+                        has_issue = any(w in text_lower for w in ["interruption", "disruption", "suspended", "stopped", "low pressure"])
+
+                        # Stopwords that indicate this is NOT an alert (slogans, payment info, etc.)
+                        is_garbage = any(w in text_lower for w in ["benefits", "payment", "service without", "bill", "vision", "mission"])
+
+                        if has_water and has_issue and not is_garbage:
+                            alerts_found.append({"text": text, "source": "content_match"})
+
+            # Process found alerts
+            for item in alerts_found:
+                text = item["text"]
+                text_lower = text.lower()
+
+                # Double-check garbage filtering
+                if any(w in text_lower for w in ["benefits", "payment", "check out", "click here"]):
+                    continue
+
+                result["status"] = "disruptions_reported"
+
+                # Extract area
+                area = "Multiple areas"
+                # Common major areas regex
+                area_match = re.search(r"(colombo|gampaha|kandy|galle|matara|jaffna|kurunegala|ratnapura|kalutara|negombo)", text_lower, re.I)
+                if area_match:
+                    area = area_match.group(1).title()
+
+                # Deduplicate on the details text
+                if not any(d["details"] == text for d in result["active_disruptions"]):
+                    result["active_disruptions"].append({
+                        "area": area,
+                        "type": "Water Disruption",
+                        "details": text[:200] + ("..." if len(text) > 200 else ""),
+                        "severity": "medium"
+                    })
+
+            logger.info(f"[WATER] Fetched - Disruptions: {len(result['active_disruptions'])}")
 
         # If no disruptions found via scraping, report normal
         if not result["active_disruptions"]:
@@ -1883,8 +1951,8 @@ def tool_water_supply_alerts() -> Dict[str, Any]:
 
     except Exception as e:
         logger.warning(f"[WATER] Scraping error: {e}")
+        # Don't overwrite the default (valid) return structure; just attach the error
         result["error"] = str(e)
-        result["status"] = "unknown"
 
     # Update cache
     _water_cache = result
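
Both scrapers now follow the same shape: strip structural noise from the DOM, prefer explicit alert elements, and only then fall back to keyword search. The pattern in isolation, runnable against static HTML with BeautifulSoup alone:

from bs4 import BeautifulSoup

html = (
    "<html><body>"
    "<nav>Home | Circulars | Contact</nav>"
    "<marquee>Water supply interruption in Colombo due to maintenance.</marquee>"
    "<p>Welcome to the National Water Supply and Drainage Board.</p>"
    "</body></html>"
)

soup = BeautifulSoup(html, "html.parser")

# 1. Drop elements that only contribute keyword noise.
for trash in soup.find_all(["nav", "header", "footer", "script", "style"]):
    trash.decompose()

# 2. Prefer explicit alert carriers such as marquees.
alerts = [m.get_text(strip=True) for m in soup.find_all("marquee")]
print(alerts)  # ['Water supply interruption in Colombo due to maintenance.']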