Dmitry Beresnev committed on
Commit
c571607
·
1 Parent(s): 4bd052e

Added a shared SQLite store and buffer.

Browse files

Added a background worker process with retries and batching.
Updated dashboard to enqueue items and read summaries from the shared cache.

app/pages/05_Dashboard.py CHANGED
@@ -26,7 +26,8 @@ from components.news import (
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
29
- from utils.ai_summary_cache import ai_summary_cache
 
30
 
31
  # Import news scrapers
32
  try:
@@ -265,6 +266,10 @@ with st.sidebar:
265
  # Check for forced refresh (don't clear yet - wait until after fetching)
266
  force_refresh = st.session_state.get('force_refresh', False)
267
 
 
 
 
 
268
  # Fetch news from all sources IN PARALLEL for maximum performance
269
  import pandas as pd
270
  from concurrent.futures import ThreadPoolExecutor, as_completed
@@ -556,12 +561,10 @@ all_items = []
556
  for df in ai_summary_dfs:
557
  if df.empty:
558
  continue
559
- records = df.to_dict("records")
560
- all_items.extend(records)
561
 
562
  if all_items:
563
- ai_summary_cache.buffer_items(all_items)
564
- ai_summary_cache.maybe_flush()
565
 
566
  # Clear force refresh flag after fetching is complete
567
  if force_refresh:
@@ -875,9 +878,9 @@ ai_summary_pct = (ai_summarized / total_items * 100) if total_items else 0.0
875
  st.markdown("---")
876
  @st.fragment(run_every=60)
877
  def render_ai_summary_section():
878
- summaries, last_update = ai_summary_cache.get_summaries()
879
- status = ai_summary_cache.get_status()
880
- last_update_text = last_update.strftime("%Y-%m-%d %H:%M:%S") if last_update else "N/A"
881
  buffer_remaining = status.get("buffer_remaining_seconds")
882
  buffer_text = "N/A"
883
  if buffer_remaining is not None:
@@ -901,7 +904,7 @@ def render_ai_summary_section():
901
  )
902
 
903
  if summaries:
904
- for item in summaries[:50]:
905
  source = item.get("source", "")
906
  summary = item.get("summary", "")
907
  title = item.get("title", "")
 
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
29
+ from utils.ai_summary_store import init_db, enqueue_items, fetch_summaries, get_status
30
+ from utils.ai_summary_worker import start_worker_if_needed
31
 
32
  # Import news scrapers
33
  try:
 
266
  # Check for forced refresh (don't clear yet - wait until after fetching)
267
  force_refresh = st.session_state.get('force_refresh', False)
268
 
269
+ # Initialize AI summary store/worker (shared across sessions/processes)
270
+ init_db()
271
+ start_worker_if_needed()
272
+
273
  # Fetch news from all sources IN PARALLEL for maximum performance
274
  import pandas as pd
275
  from concurrent.futures import ThreadPoolExecutor, as_completed
 
561
  for df in ai_summary_dfs:
562
  if df.empty:
563
  continue
564
+ all_items.extend(df.to_dict("records"))
 
565
 
566
  if all_items:
567
+ enqueue_items(all_items)
 
568
 
569
  # Clear force refresh flag after fetching is complete
570
  if force_refresh:
 
878
  st.markdown("---")
879
  @st.fragment(run_every=60)
880
  def render_ai_summary_section():
881
+ summaries = fetch_summaries(limit=50)
882
+ status = get_status()
883
+ last_update_text = status.get("last_update") or "N/A"
884
  buffer_remaining = status.get("buffer_remaining_seconds")
885
  buffer_text = "N/A"
886
  if buffer_remaining is not None:
 
904
  )
905
 
906
  if summaries:
907
+ for item in summaries:
908
  source = item.get("source", "")
909
  summary = item.get("summary", "")
910
  title = item.get("title", "")
app/utils/ai_summary_store.py ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """SQLite-backed AI summary buffer and cache (shared across processes)."""
2
+
3
+ import os
4
+ import sqlite3
5
+ import time
6
+ from contextlib import contextmanager
7
+ from datetime import datetime
8
+ from typing import Dict, Iterable, List, Optional, Tuple
9
+
10
+ DB_PATH = os.getenv("AI_SUMMARY_DB_PATH", "/tmp/ai_summary_cache.sqlite")
11
+ BUFFER_SECONDS = int(os.getenv("LLM_SUMMARY_BUFFER_SECONDS", "120"))
12
+ BATCH_MAX_CHARS = int(os.getenv("LLM_SUMMARY_BATCH_MAX_CHARS", "2400"))
13
+
14
+
15
def _connect() -> sqlite3.Connection:
    """Open a connection to the shared summary database.

    WAL journaling plus NORMAL synchronous allows several processes to
    read and write concurrently with reasonable durability;
    ``isolation_level=None`` puts sqlite3 in autocommit mode.
    """
    connection = sqlite3.connect(DB_PATH, timeout=30, isolation_level=None)
    for pragma in ("PRAGMA journal_mode=WAL;", "PRAGMA synchronous=NORMAL;"):
        connection.execute(pragma)
    return connection
20
+
21
+
22
@contextmanager
def _db():
    """Yield a fresh database connection, guaranteeing it is closed."""
    connection = _connect()
    try:
        yield connection
    finally:
        connection.close()
29
+
30
+
31
def init_db():
    """Create the work-buffer and summary tables if they are missing.

    Idempotent: safe to call repeatedly and from multiple processes.
    """
    ddl_statements = (
        """
        CREATE TABLE IF NOT EXISTS summary_buffer (
            item_key TEXT PRIMARY KEY,
            title TEXT NOT NULL,
            source TEXT,
            created_at REAL NOT NULL
        );
        """,
        """
        CREATE TABLE IF NOT EXISTS summaries (
            item_key TEXT PRIMARY KEY,
            title TEXT NOT NULL,
            source TEXT,
            summary TEXT NOT NULL,
            updated_at REAL NOT NULL
        );
        """,
    )
    with _db() as conn:
        for ddl in ddl_statements:
            conn.execute(ddl)
54
+
55
+
56
+ def _item_key(item: Dict) -> str:
57
+ if item.get("id") is not None:
58
+ return str(item.get("id"))
59
+ title = str(item.get("title", "")).strip()
60
+ source = str(item.get("source", "")).strip()
61
+ if not title:
62
+ return ""
63
+ return f"{source}|{title}".lower()
64
+
65
+
66
def enqueue_items(items: Iterable[Dict]):
    """Queue news items for background summarization.

    Items lacking a usable key or title are skipped; keys already in the
    buffer are left untouched (INSERT OR IGNORE keeps their timestamp).
    """
    timestamp = time.time()
    pending = []
    for entry in items:
        key = _item_key(entry)
        title = str(entry.get("title", "")).strip()
        if not (key and title):
            continue
        pending.append(
            (key, title, str(entry.get("source", "")).strip(), timestamp)
        )

    if not pending:
        return

    with _db() as conn:
        conn.executemany(
            """
            INSERT OR IGNORE INTO summary_buffer (item_key, title, source, created_at)
            VALUES (?, ?, ?, ?)
            """,
            pending,
        )
88
+
89
+
90
def get_status() -> Dict:
    """Return queue depth, summary count and timing info for the UI."""
    with _db() as conn:
        buffered = conn.execute("SELECT COUNT(*) FROM summary_buffer").fetchone()[0]
        summarized = conn.execute("SELECT COUNT(*) FROM summaries").fetchone()[0]
        newest_row = conn.execute("SELECT MAX(updated_at) FROM summaries").fetchone()
        oldest_row = conn.execute("SELECT MIN(created_at) FROM summary_buffer").fetchone()

    last_update = None
    if newest_row and newest_row[0]:
        last_update = datetime.fromtimestamp(newest_row[0]).strftime("%Y-%m-%d %H:%M:%S")

    # Countdown until the oldest buffered item becomes eligible for a batch.
    remaining = None
    oldest_ts = oldest_row[0] if oldest_row else None
    if oldest_ts:
        remaining = max(BUFFER_SECONDS - (time.time() - oldest_ts), 0)

    return {
        "buffer_size": buffered,
        "total_summaries": summarized,
        "last_update": last_update,
        "buffer_remaining_seconds": remaining,
        "batch_max_chars": BATCH_MAX_CHARS,
        "buffer_window_seconds": BUFFER_SECONDS,
    }
116
+
117
+
118
def fetch_summaries(limit: int = 50) -> List[Dict]:
    """Return up to *limit* summaries, most recently updated first."""
    query = """
            SELECT title, source, summary, updated_at
            FROM summaries
            ORDER BY updated_at DESC
            LIMIT ?
            """
    with _db() as conn:
        rows = conn.execute(query, (limit,)).fetchall()

    return [
        {
            "title": title,
            "source": source or "",
            "summary": summary,
            "timestamp": datetime.fromtimestamp(updated_at),
        }
        for title, source, summary, updated_at in rows
    ]
141
+
142
+
143
+ def _build_input_text(title: str, source: str) -> str:
144
+ if source:
145
+ return f"Source: {source}\nTitle: {title}"
146
+ return f"Title: {title}"
147
+
148
+
149
def fetch_ready_batches(max_chars_total: int, buffer_seconds: int) -> List[List[Tuple[str, str, str]]]:
    """Group buffered items older than *buffer_seconds* into batches.

    Each batch's combined prompt text stays within *max_chars_total*;
    a single oversized item still forms a batch of its own. Items are
    processed oldest-first.
    """
    cutoff = time.time() - buffer_seconds
    with _db() as conn:
        rows = conn.execute(
            """
            SELECT item_key, title, source
            FROM summary_buffer
            WHERE created_at <= ?
            ORDER BY created_at ASC
            """,
            (cutoff,),
        ).fetchall()

    batches: List[List[Tuple[str, str, str]]] = []
    current: List[Tuple[str, str, str]] = []
    used_chars = 0

    for key, title, source in rows:
        size = len(_build_input_text(title, source or ""))
        # Close the current batch when adding this item would overflow it.
        if current and used_chars + size > max_chars_total:
            batches.append(current)
            current = []
            used_chars = 0
        current.append((key, title, source or ""))
        used_chars += size

    if current:
        batches.append(current)
    return batches
180
+
181
+
182
def store_summaries(items: List[Tuple[str, str, str, str]]):
    """Persist finished summaries and remove them from the work buffer.

    Args:
        items: ``(item_key, title, source, summary)`` tuples.

    The INSERT and the buffer DELETE run inside one explicit transaction
    (the connection is in autocommit mode, so without BEGIN each
    statement would commit separately and a crash in between could leave
    an item both summarized and still queued).
    """
    if not items:
        return

    now = time.time()
    with _db() as conn:
        conn.execute("BEGIN IMMEDIATE;")
        try:
            conn.executemany(
                """
                INSERT OR REPLACE INTO summaries (item_key, title, source, summary, updated_at)
                VALUES (?, ?, ?, ?, ?)
                """,
                [(k, t, s, summary, now) for k, t, s, summary in items],
            )
            conn.executemany(
                "DELETE FROM summary_buffer WHERE item_key = ?",
                [(k,) for k, _, _, _ in items],
            )
            conn.execute("COMMIT;")
        except Exception:
            conn.execute("ROLLBACK;")
            raise
app/utils/ai_summary_worker.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Background worker process for AI summarization."""
2
+
3
+ import os
4
+ import time
5
+ import logging
6
+ import signal
7
+ import sqlite3
8
+ from typing import List, Tuple
9
+
10
+ from utils.llm_summarizer import OpenAICompatSummarizer
11
+ from utils.ai_summary_store import (
12
+ init_db,
13
+ fetch_ready_batches,
14
+ store_summaries,
15
+ BATCH_MAX_CHARS,
16
+ BUFFER_SECONDS,
17
+ )
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+ PID_FILE = os.getenv("AI_SUMMARY_WORKER_PID", "/tmp/ai_summary_worker.pid")
22
+ POLL_SECONDS = int(os.getenv("AI_SUMMARY_POLL_SECONDS", "5"))
23
+ MAX_RETRIES = int(os.getenv("LLM_SUMMARY_RETRIES", "3"))
24
+
25
+
26
class Worker:
    """Polls the shared buffer and summarizes ready batches via the LLM."""

    def __init__(self):
        self._stop = False
        self.summarizer = OpenAICompatSummarizer()

    def stop(self, *_args):
        """Request a graceful shutdown (installed as a signal handler)."""
        self._stop = True

    def run(self):
        """Main loop: drain all ready batches, then sleep POLL_SECONDS."""
        init_db()
        signal.signal(signal.SIGTERM, self.stop)
        signal.signal(signal.SIGINT, self.stop)

        while not self._stop:
            try:
                for batch in fetch_ready_batches(BATCH_MAX_CHARS, BUFFER_SECONDS):
                    self._process_batch(batch)
            except sqlite3.Error as exc:
                logger.warning("AI worker DB error: %s", exc)
            except Exception as exc:
                logger.warning("AI worker error: %s", exc)

            time.sleep(POLL_SECONDS)

    def _process_batch(self, batch: List[Tuple[str, str, str]]):
        """Summarize one batch with exponential-backoff retries and persist it."""
        if not batch or not self.summarizer.enabled:
            return

        texts = [
            f"Source: {source}\nTitle: {title}" if source else f"Title: {title}"
            for _, title, source in batch
        ]

        # Guard against LLM_SUMMARY_RETRIES being configured <= 0: the
        # original range() would then never run, leaving `summaries`
        # unbound and raising NameError below.
        attempts = max(MAX_RETRIES, 1)
        summaries = None
        for attempt in range(1, attempts + 1):
            # NOTE(review): relies on the summarizer's private
            # _summarize_chunk — confirm this is a supported entry point.
            summaries = self.summarizer._summarize_chunk(texts, source="dashboard")
            if summaries and len(summaries) == len(batch):
                break
            if attempt < attempts:
                time.sleep(2 ** attempt)
        else:
            logger.warning("AI worker failed to summarize batch after retries")
            return

        to_store = [
            (item_key, title, source, summary)
            for (item_key, title, source), summary in zip(batch, summaries)
            if summary
        ]
        if to_store:
            store_summaries(to_store)
80
+
81
+
82
+ def _pid_running(pid: int) -> bool:
83
+ try:
84
+ os.kill(pid, 0)
85
+ return True
86
+ except Exception:
87
+ return False
88
+
89
+
90
def start_worker_if_needed():
    """Fork a background summarization worker unless one is already running.

    A pid file prevents duplicate workers across Streamlit sessions.
    No-op on platforms without ``os.fork`` (e.g. Windows), where the
    original would have raised AttributeError.
    """
    if not hasattr(os, "fork"):
        logger.warning("os.fork unavailable; AI summary worker not started")
        return

    if os.path.exists(PID_FILE):
        try:
            with open(PID_FILE, "r", encoding="utf-8") as f:
                pid = int(f.read().strip() or 0)
            if pid and _pid_running(pid):
                return  # a worker is already alive
        except Exception:
            pass  # stale or corrupt pid file: fall through and restart

    pid = os.fork()
    if pid != 0:
        return  # parent: the child carries on as the worker

    # Child: detach from the parent's session and run the loop.
    os.setsid()
    with open(PID_FILE, "w", encoding="utf-8") as f:
        f.write(str(os.getpid()))

    worker = Worker()
    try:
        worker.run()
    finally:
        # Critical: without _exit the child would return from this
        # function and keep executing the caller's (Streamlit) script,
        # duplicating the dashboard in a headless process.
        os._exit(0)
110
+
app/utils/news_cache.py CHANGED
@@ -49,6 +49,7 @@ class NewsCacheManager:
49
  source: str,
50
  fetcher_func: Callable,
51
  force_refresh: bool = False,
 
52
  **kwargs
53
  ) -> List[Dict]:
54
  """
@@ -58,6 +59,7 @@ class NewsCacheManager:
58
  source: News source ('twitter', 'reddit', 'rss', 'ai_tech')
59
  fetcher_func: Function to fetch fresh news
60
  force_refresh: If True, bypass cache and fetch fresh
 
61
  **kwargs: Arguments to pass to fetcher_func
62
 
63
  Returns:
@@ -93,12 +95,13 @@ class NewsCacheManager:
93
  # Update cache
94
  self._update_cache(source, new_items)
95
 
96
- # Deduplicate across sources
97
- deduplicated = self._deduplicate(new_items, source)
 
 
98
 
99
- logger.info(f"✅ Fetched {len(new_items)} items for {source}, {len(deduplicated)} unique after dedup")
100
-
101
- return deduplicated
102
 
103
  except Exception as e:
104
  logger.error(f"Error fetching news for {source}: {e}")
 
49
  source: str,
50
  fetcher_func: Callable,
51
  force_refresh: bool = False,
52
+ deduplicate: bool = False,
53
  **kwargs
54
  ) -> List[Dict]:
55
  """
 
59
  source: News source ('twitter', 'reddit', 'rss', 'ai_tech')
60
  fetcher_func: Function to fetch fresh news
61
  force_refresh: If True, bypass cache and fetch fresh
62
+ deduplicate: If True, remove duplicates across sources using global index
63
  **kwargs: Arguments to pass to fetcher_func
64
 
65
  Returns:
 
95
  # Update cache
96
  self._update_cache(source, new_items)
97
 
98
+ if deduplicate:
99
+ deduplicated = self._deduplicate(new_items, source)
100
+ logger.info(f"✅ Fetched {len(new_items)} items for {source}, {len(deduplicated)} unique after dedup")
101
+ return deduplicated
102
 
103
+ logger.info(f"✅ Fetched {len(new_items)} items for {source} (dedup disabled)")
104
+ return new_items
 
105
 
106
  except Exception as e:
107
  logger.error(f"Error fetching news for {source}: {e}")