Dmitry Beresnev committed on
Commit
451e369
·
1 Parent(s): c571607

fix ai summaries cache

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -26,7 +26,7 @@ from components.news import (
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
29
- from utils.ai_summary_store import init_db, enqueue_items, fetch_summaries, get_status
30
  from utils.ai_summary_worker import start_worker_if_needed
31
 
32
  # Import news scrapers
@@ -267,7 +267,7 @@ with st.sidebar:
267
  force_refresh = st.session_state.get('force_refresh', False)
268
 
269
  # Initialize AI summary store/worker (shared across sessions/processes)
270
- init_db()
271
  start_worker_if_needed()
272
 
273
  # Fetch news from all sources IN PARALLEL for maximum performance
 
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
29
+ from utils.ai_summary_store import init_storage, enqueue_items, fetch_summaries, get_status
30
  from utils.ai_summary_worker import start_worker_if_needed
31
 
32
  # Import news scrapers
 
267
  force_refresh = st.session_state.get('force_refresh', False)
268
 
269
  # Initialize AI summary store/worker (shared across sessions/processes)
270
+ init_storage()
271
  start_worker_if_needed()
272
 
273
  # Fetch news from all sources IN PARALLEL for maximum performance
app/utils/ai_summary_store.py CHANGED
@@ -1,114 +1,104 @@
1
- """SQLite-backed AI summary buffer and cache (shared across processes)."""
2
 
 
3
  import os
4
- import sqlite3
5
  import time
6
  from contextlib import contextmanager
7
  from datetime import datetime
8
  from typing import Dict, Iterable, List, Optional, Tuple
9
 
10
- DB_PATH = os.getenv("AI_SUMMARY_DB_PATH", "/tmp/ai_summary_cache.sqlite")
 
 
 
 
 
 
 
 
 
 
 
11
  BUFFER_SECONDS = int(os.getenv("LLM_SUMMARY_BUFFER_SECONDS", "120"))
12
  BATCH_MAX_CHARS = int(os.getenv("LLM_SUMMARY_BATCH_MAX_CHARS", "2400"))
 
 
13
 
 
 
 
 
14
 
15
- def _connect() -> sqlite3.Connection:
16
- conn = sqlite3.connect(DB_PATH, timeout=30, isolation_level=None)
17
- conn.execute("PRAGMA journal_mode=WAL;")
18
- conn.execute("PRAGMA synchronous=NORMAL;")
19
- return conn
20
 
 
 
 
 
 
21
 
22
- @contextmanager
23
- def _db():
24
- conn = _connect()
25
- try:
26
- yield conn
27
- finally:
28
- conn.close()
29
-
30
-
31
- def init_db():
32
- with _db() as conn:
33
- conn.execute(
34
- """
35
- CREATE TABLE IF NOT EXISTS summary_buffer (
36
- item_key TEXT PRIMARY KEY,
37
- title TEXT NOT NULL,
38
- source TEXT,
39
- created_at REAL NOT NULL
40
- );
41
- """
42
- )
43
- conn.execute(
44
- """
45
- CREATE TABLE IF NOT EXISTS summaries (
46
- item_key TEXT PRIMARY KEY,
47
- title TEXT NOT NULL,
48
- source TEXT,
49
- summary TEXT NOT NULL,
50
- updated_at REAL NOT NULL
51
- );
52
- """
53
- )
54
 
 
 
 
55
 
56
- def _item_key(item: Dict) -> str:
57
- if item.get("id") is not None:
58
- return str(item.get("id"))
59
- title = str(item.get("title", "")).strip()
60
- source = str(item.get("source", "")).strip()
61
- if not title:
62
- return ""
63
- return f"{source}|{title}".lower()
64
 
 
 
65
 
66
- def enqueue_items(items: Iterable[Dict]):
67
- now = time.time()
68
- rows = []
69
- for item in items:
70
- key = _item_key(item)
71
- title = str(item.get("title", "")).strip()
72
- if not key or not title:
73
- continue
74
- source = str(item.get("source", "")).strip()
75
- rows.append((key, title, source, now))
76
-
77
- if not rows:
78
- return
 
 
 
 
79
 
80
- with _db() as conn:
81
- conn.executemany(
82
- """
83
- INSERT OR IGNORE INTO summary_buffer (item_key, title, source, created_at)
84
- VALUES (?, ?, ?, ?)
85
- """,
86
- rows,
87
- )
88
 
89
 
90
  def get_status() -> Dict:
91
- with _db() as conn:
92
- buffer_count = conn.execute("SELECT COUNT(*) FROM summary_buffer").fetchone()[0]
93
- summaries_count = conn.execute("SELECT COUNT(*) FROM summaries").fetchone()[0]
94
- last_update_row = conn.execute("SELECT MAX(updated_at) FROM summaries").fetchone()
95
- buffer_oldest_row = conn.execute("SELECT MIN(created_at) FROM summary_buffer").fetchone()
96
-
97
- last_update = (
98
- datetime.fromtimestamp(last_update_row[0]).strftime("%Y-%m-%d %H:%M:%S")
99
- if last_update_row and last_update_row[0]
100
- else None
101
- )
102
- buffer_oldest = buffer_oldest_row[0] if buffer_oldest_row else None
 
 
 
103
  buffer_remaining = None
104
  if buffer_oldest:
105
  age = time.time() - buffer_oldest
106
  buffer_remaining = max(BUFFER_SECONDS - age, 0)
107
 
 
 
 
 
108
  return {
109
  "buffer_size": buffer_count,
110
  "total_summaries": summaries_count,
111
- "last_update": last_update,
112
  "buffer_remaining_seconds": buffer_remaining,
113
  "batch_max_chars": BATCH_MAX_CHARS,
114
  "buffer_window_seconds": BUFFER_SECONDS,
@@ -116,61 +106,48 @@ def get_status() -> Dict:
116
 
117
 
118
  def fetch_summaries(limit: int = 50) -> List[Dict]:
119
- with _db() as conn:
120
- rows = conn.execute(
121
- """
122
- SELECT title, source, summary, updated_at
123
- FROM summaries
124
- ORDER BY updated_at DESC
125
- LIMIT ?
126
- """,
127
- (limit,),
128
- ).fetchall()
129
 
 
130
  results = []
131
- for title, source, summary, updated_at in rows:
132
  results.append(
133
  {
134
- "title": title,
135
- "source": source or "",
136
- "summary": summary,
137
- "timestamp": datetime.fromtimestamp(updated_at),
138
  }
139
  )
140
  return results
141
 
142
 
143
- def _build_input_text(title: str, source: str) -> str:
144
- if source:
145
- return f"Source: {source}\nTitle: {title}"
146
- return f"Title: {title}"
147
-
148
-
149
  def fetch_ready_batches(max_chars_total: int, buffer_seconds: int) -> List[List[Tuple[str, str, str]]]:
 
150
  cutoff = time.time() - buffer_seconds
151
- with _db() as conn:
152
- rows = conn.execute(
153
- """
154
- SELECT item_key, title, source
155
- FROM summary_buffer
156
- WHERE created_at <= ?
157
- ORDER BY created_at ASC
158
- """,
159
- (cutoff,),
160
- ).fetchall()
161
 
162
  batches: List[List[Tuple[str, str, str]]] = []
163
  current: List[Tuple[str, str, str]] = []
164
  current_chars = 0
165
 
166
- for item_key, title, source in rows:
167
- text = _build_input_text(title, source or "")
 
 
168
  text_len = len(text)
169
  if current and current_chars + text_len > max_chars_total:
170
  batches.append(current)
171
  current = []
172
  current_chars = 0
173
- current.append((item_key, title, source or ""))
174
  current_chars += text_len
175
 
176
  if current:
@@ -180,16 +157,131 @@ def fetch_ready_batches(max_chars_total: int, buffer_seconds: int) -> List[List[
180
 
181
 
182
  def store_summaries(items: List[Tuple[str, str, str, str]]):
 
 
 
 
183
  now = time.time()
184
- with _db() as conn:
185
- conn.executemany(
186
- """
187
- INSERT OR REPLACE INTO summaries (item_key, title, source, summary, updated_at)
188
- VALUES (?, ?, ?, ?, ?)
189
- """,
190
- [(k, t, s, summary, now) for k, t, s, summary in items],
191
- )
192
- conn.executemany(
193
- "DELETE FROM summary_buffer WHERE item_key = ?",
194
- [(k,) for k, _, _, _ in items],
195
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """File-backed AI summary buffer and cache with optional HF dataset sync."""
2
 
3
+ import json
4
  import os
 
5
  import time
6
  from contextlib import contextmanager
7
  from datetime import datetime
8
  from typing import Dict, Iterable, List, Optional, Tuple
9
 
10
+ try:
11
+ import fcntl
12
+ except Exception: # pragma: no cover
13
+ fcntl = None
14
+
15
+ try:
16
+ from huggingface_hub import HfApi, snapshot_download
17
+ except Exception: # pragma: no cover
18
+ HfApi = None
19
+ snapshot_download = None
20
+
21
+ CACHE_DIR = os.getenv("AI_SUMMARY_CACHE_DIR", "./ai-summary-cache")
22
  BUFFER_SECONDS = int(os.getenv("LLM_SUMMARY_BUFFER_SECONDS", "120"))
23
  BATCH_MAX_CHARS = int(os.getenv("LLM_SUMMARY_BATCH_MAX_CHARS", "2400"))
24
+ HF_REPO_ID = os.getenv("AI_SUMMARY_HF_REPO", "ResearchEngineering/ai_news_summaries")
25
+ HF_REPO_TYPE = os.getenv("AI_SUMMARY_HF_REPO_TYPE", "dataset")
26
 
27
+ BUFFER_FILE = "buffer.jsonl"
28
+ SUMMARIES_FILE = "summaries.jsonl"
29
+ META_FILE = "meta.json"
30
+ LOCK_FILE = ".lock"
31
 
 
 
 
 
 
32
 
33
def init_storage():
    """Prepare the on-disk cache.

    Creates the cache directory, seeds it from the configured Hugging Face
    dataset when the hub client is importable, and makes sure the JSONL
    buffer/summary files exist.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    hf_available = snapshot_download is not None and bool(HF_REPO_ID)
    if hf_available:
        _maybe_restore_from_hf()
    _ensure_files()
38
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
def enqueue_items(items: Iterable[Dict]):
    """Add not-yet-known news items to the pending summary buffer.

    Items lacking a usable key/title, or whose key already exists in the
    buffer or the stored summaries, are skipped. The buffer file is only
    rewritten when something new was actually added.
    """
    init_storage()
    enqueued_at = time.time()

    with _file_lock():
        pending = _read_jsonl(BUFFER_FILE)
        done = _read_jsonl(SUMMARIES_FILE)

        # Keys already known anywhere — pending or already summarized.
        seen = {row.get("item_key") for row in pending if row.get("item_key")}
        seen |= {row.get("item_key") for row in done if row.get("item_key")}

        fresh = []
        for candidate in items:
            key = _item_key(candidate)
            title = str(candidate.get("title", "")).strip()
            if not key or not title or key in seen:
                continue
            fresh.append(
                {
                    "item_key": key,
                    "title": title,
                    "source": str(candidate.get("source", "")).strip(),
                    "created_at": enqueued_at,
                }
            )
            seen.add(key)

        if fresh:
            _write_jsonl(BUFFER_FILE, pending + fresh)
 
 
 
 
 
 
71
 
72
 
73
  def get_status() -> Dict:
74
+ init_storage()
75
+ with _file_lock():
76
+ buffer_items = _read_jsonl(BUFFER_FILE)
77
+ summaries = _read_jsonl(SUMMARIES_FILE)
78
+
79
+ buffer_count = len(buffer_items)
80
+ summaries_count = len(summaries)
81
+ last_update = None
82
+ if summaries:
83
+ last_update = max(item.get("updated_at", 0) for item in summaries)
84
+
85
+ buffer_oldest = None
86
+ if buffer_items:
87
+ buffer_oldest = min(item.get("created_at", 0) for item in buffer_items)
88
+
89
  buffer_remaining = None
90
  if buffer_oldest:
91
  age = time.time() - buffer_oldest
92
  buffer_remaining = max(BUFFER_SECONDS - age, 0)
93
 
94
+ last_update_text = (
95
+ datetime.fromtimestamp(last_update).strftime("%Y-%m-%d %H:%M:%S") if last_update else None
96
+ )
97
+
98
  return {
99
  "buffer_size": buffer_count,
100
  "total_summaries": summaries_count,
101
+ "last_update": last_update_text,
102
  "buffer_remaining_seconds": buffer_remaining,
103
  "batch_max_chars": BATCH_MAX_CHARS,
104
  "buffer_window_seconds": BUFFER_SECONDS,
 
106
 
107
 
108
def fetch_summaries(limit: int = 50) -> List[Dict]:
    """Return up to ``limit`` stored summaries, newest first.

    Each result dict carries ``title``, ``source``, ``summary`` and a
    ``timestamp`` datetime derived from the record's ``updated_at``.
    """
    init_storage()
    with _file_lock():
        stored = _read_jsonl(SUMMARIES_FILE)

    newest_first = sorted(stored, key=lambda row: row.get("updated_at", 0), reverse=True)
    return [
        {
            "title": row.get("title", ""),
            "source": row.get("source", ""),
            "summary": row.get("summary", ""),
            # Records written by store_summaries always have updated_at;
            # "now" is only a fallback for malformed rows.
            "timestamp": datetime.fromtimestamp(row.get("updated_at", time.time())),
        }
        for row in newest_first[:limit]
    ]
125
 
126
 
 
 
 
 
 
 
127
  def fetch_ready_batches(max_chars_total: int, buffer_seconds: int) -> List[List[Tuple[str, str, str]]]:
128
+ init_storage()
129
  cutoff = time.time() - buffer_seconds
130
+
131
+ with _file_lock():
132
+ buffer_items = _read_jsonl(BUFFER_FILE)
133
+
134
+ eligible = [item for item in buffer_items if item.get("created_at", 0) <= cutoff]
135
+ eligible.sort(key=lambda x: x.get("created_at", 0))
 
 
 
 
136
 
137
  batches: List[List[Tuple[str, str, str]]] = []
138
  current: List[Tuple[str, str, str]] = []
139
  current_chars = 0
140
 
141
+ for item in eligible:
142
+ title = item.get("title", "")
143
+ source = item.get("source", "")
144
+ text = _build_input_text(title, source)
145
  text_len = len(text)
146
  if current and current_chars + text_len > max_chars_total:
147
  batches.append(current)
148
  current = []
149
  current_chars = 0
150
+ current.append((item.get("item_key"), title, source))
151
  current_chars += text_len
152
 
153
  if current:
 
157
 
158
 
159
def store_summaries(items: List[Tuple[str, str, str, str]]):
    """Persist finished summaries and drop their buffer entries.

    ``items`` is a list of ``(item_key, title, source, summary)`` tuples.
    Existing summaries with the same key are overwritten, matching buffer
    rows are removed, metadata is refreshed, and the cache is pushed to
    the HF dataset when configured.
    """
    if not items:
        return

    init_storage()
    written_at = time.time()

    with _file_lock():
        by_key = {
            row.get("item_key"): row
            for row in _read_jsonl(SUMMARIES_FILE)
            if row.get("item_key")
        }
        pending = {
            row.get("item_key"): row
            for row in _read_jsonl(BUFFER_FILE)
            if row.get("item_key")
        }

        for key, title, source, summary in items:
            by_key[key] = {
                "item_key": key,
                "title": title,
                "source": source,
                "summary": summary,
                "updated_at": written_at,
            }
            pending.pop(key, None)

        _write_jsonl(SUMMARIES_FILE, list(by_key.values()))
        _write_jsonl(BUFFER_FILE, list(pending.values()))

        _write_meta({"last_sync": None, "last_update": written_at})
        # NOTE(review): the HF upload runs while the file lock is held —
        # confirm this matches the intended lock scope.
        _sync_to_hf_if_configured()
189
+
190
+
191
+ def _item_key(item: Dict) -> str:
192
+ if item.get("id") is not None:
193
+ return str(item.get("id"))
194
+ title = str(item.get("title", "")).strip()
195
+ source = str(item.get("source", "")).strip()
196
+ if not title:
197
+ return ""
198
+ return f"{source}|{title}".lower()
199
+
200
+
201
+ def _build_input_text(title: str, source: str) -> str:
202
+ if source:
203
+ return f"Source: {source}\nTitle: {title}"
204
+ return f"Title: {title}"
205
+
206
+
207
def _ensure_files():
    """Create empty buffer/summaries files when they are missing."""
    for filename in (BUFFER_FILE, SUMMARIES_FILE):
        target = os.path.join(CACHE_DIR, filename)
        if not os.path.exists(target):
            # Touch the file so later readers see an empty JSONL file.
            with open(target, "w", encoding="utf-8"):
                pass
213
+
214
+
215
def _read_jsonl(filename: str) -> List[Dict]:
    """Load one record per line from a JSONL file in the cache dir.

    Missing files yield an empty list; blank or unparsable lines are
    skipped (best-effort — a corrupt line must not poison the cache).
    """
    path = os.path.join(CACHE_DIR, filename)
    if not os.path.exists(path):
        return []

    records: List[Dict] = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw in handle:
            stripped = raw.strip()
            if not stripped:
                continue
            try:
                records.append(json.loads(stripped))
            except Exception:
                continue
    return records
230
+
231
+
232
def _write_jsonl(filename: str, items: List[Dict]):
    """Atomically rewrite a JSONL file (write to ``.tmp``, then rename)."""
    destination = os.path.join(CACHE_DIR, filename)
    staging = destination + ".tmp"
    with open(staging, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(item, ensure_ascii=True) + "\n" for item in items
        )
    # os.replace is atomic on POSIX, so readers never see a partial file.
    os.replace(staging, destination)
239
+
240
+
241
def _write_meta(data: Dict):
    """Atomically persist the metadata JSON document."""
    destination = os.path.join(CACHE_DIR, META_FILE)
    staging = destination + ".tmp"
    with open(staging, "w", encoding="utf-8") as handle:
        json.dump(data, handle)
    os.replace(staging, destination)
247
+
248
+
249
@contextmanager
def _file_lock():
    """Serialize cache access across processes with an fcntl lock file.

    Degrades to a no-op on platforms without ``fcntl`` (e.g. Windows),
    where callers proceed unsynchronized.
    """
    os.makedirs(CACHE_DIR, exist_ok=True)
    if fcntl is None:
        yield
        return

    lock_path = os.path.join(CACHE_DIR, LOCK_FILE)
    with open(lock_path, "w", encoding="utf-8") as handle:
        fcntl.flock(handle, fcntl.LOCK_EX)
        try:
            yield
        finally:
            fcntl.flock(handle, fcntl.LOCK_UN)
262
+
263
+
264
def _maybe_restore_from_hf():
    """Seed the local cache from the HF dataset repo on first run.

    Skipped when huggingface_hub is unavailable, no repo is configured,
    or a local summaries file already exists. Download failures (missing
    repo, no network, bad credentials) are non-fatal: the app starts with
    an empty local cache instead of crashing init_storage().
    """
    if not snapshot_download:
        return
    if not HF_REPO_ID:
        return
    if os.path.exists(os.path.join(CACHE_DIR, SUMMARIES_FILE)):
        return
    try:
        snapshot_download(
            repo_id=HF_REPO_ID,
            repo_type=HF_REPO_TYPE,
            local_dir=CACHE_DIR,
            # Real files (no symlinks) so locking and os.replace behave.
            local_dir_use_symlinks=False,
        )
    except Exception:
        # Best-effort restore only — never block startup on the hub.
        pass
277
+
278
+
279
def _sync_to_hf_if_configured():
    """Best-effort upload of the cache directory to the HF dataset repo.

    Failures (offline, missing token, deleted repo) are swallowed so a
    sync problem never crashes the worker or loses the summaries already
    written locally. Lock and temp files are excluded — they are local
    plumbing, not data.
    """
    if not HfApi or not HF_REPO_ID:
        return
    try:
        api = HfApi()
        api.upload_folder(
            folder_path=CACHE_DIR,
            repo_id=HF_REPO_ID,
            repo_type=HF_REPO_TYPE,
            ignore_patterns=[LOCK_FILE, "*.tmp"],
        )
    except Exception:
        pass
app/utils/ai_summary_worker.py CHANGED
@@ -9,7 +9,7 @@ from typing import List, Tuple
9
 
10
  from utils.llm_summarizer import OpenAICompatSummarizer
11
  from utils.ai_summary_store import (
12
- init_db,
13
  fetch_ready_batches,
14
  store_summaries,
15
  BATCH_MAX_CHARS,
@@ -32,7 +32,7 @@ class Worker:
32
  self._stop = True
33
 
34
  def run(self):
35
- init_db()
36
  signal.signal(signal.SIGTERM, self.stop)
37
  signal.signal(signal.SIGINT, self.stop)
38
 
@@ -107,4 +107,3 @@ def start_worker_if_needed():
107
 
108
  worker = Worker()
109
  worker.run()
110
-
 
9
 
10
  from utils.llm_summarizer import OpenAICompatSummarizer
11
  from utils.ai_summary_store import (
12
+ init_storage,
13
  fetch_ready_batches,
14
  store_summaries,
15
  BATCH_MAX_CHARS,
 
32
  self._stop = True
33
 
34
  def run(self):
35
+ init_storage()
36
  signal.signal(signal.SIGTERM, self.stop)
37
  signal.signal(signal.SIGINT, self.stop)
38
 
 
107
 
108
  worker = Worker()
109
  worker.run()
 
requirements.txt CHANGED
@@ -10,3 +10,4 @@ beautifulsoup4>=4.12.0
10
  lxml>=5.0.0
11
  ntscraper
12
  playwright>=1.40.0
 
 
10
  lxml>=5.0.0
11
  ntscraper
12
  playwright>=1.40.0
13
+ huggingface_hub>=0.22.2