Dmitry Beresnev committed on
Commit ·
f00d814
1
Parent(s): bcf73e3
fix summarization process
Browse files
- app/pages/05_Dashboard.py +42 -11
- app/utils/llm_summarizer.py +9 -2
- app/utils/news_cache.py +0 -15
app/pages/05_Dashboard.py
CHANGED
|
@@ -26,6 +26,7 @@ from components.news import (
|
|
| 26 |
display_economic_calendar_widget
|
| 27 |
)
|
| 28 |
from utils.breaking_news_scorer import get_breaking_news_scorer
|
|
|
|
| 29 |
|
| 30 |
# Import news scrapers
|
| 31 |
try:
|
|
@@ -539,6 +540,47 @@ if fetch_errors:
|
|
| 539 |
for err in fetch_errors:
|
| 540 |
logger.warning(f"Fetch error: {err}")
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
# Clear force refresh flag after fetching is complete
|
| 543 |
if force_refresh:
|
| 544 |
st.session_state.force_refresh = False
|
|
@@ -839,17 +881,6 @@ if 'fetch_errors' in locals() and fetch_errors:
|
|
| 839 |
st.caption(f"• {error}")
|
| 840 |
|
| 841 |
# ---- AI SUMMARY METRICS ----
|
| 842 |
-
ai_summary_dfs = [
|
| 843 |
-
twitter_df,
|
| 844 |
-
reddit_df,
|
| 845 |
-
rss_all_df,
|
| 846 |
-
ai_tech_df,
|
| 847 |
-
sectoral_news_df,
|
| 848 |
-
market_events_df,
|
| 849 |
-
economic_calendar_df,
|
| 850 |
-
predictions_df,
|
| 851 |
-
]
|
| 852 |
-
|
| 853 |
total_items = sum(len(df) for df in ai_summary_dfs if not df.empty)
|
| 854 |
ai_summarized = 0
|
| 855 |
for df in ai_summary_dfs:
|
|
|
|
| 26 |
display_economic_calendar_widget
|
| 27 |
)
|
| 28 |
from utils.breaking_news_scorer import get_breaking_news_scorer
|
| 29 |
+
from utils.llm_summarizer import OpenAICompatSummarizer
|
| 30 |
|
| 31 |
# Import news scrapers
|
| 32 |
try:
|
|
|
|
| 540 |
for err in fetch_errors:
|
| 541 |
logger.warning(f"Fetch error: {err}")
|
| 542 |
|
| 543 |
+
# Batch AI summarization after all sources are collected
|
| 544 |
+
ai_summary_dfs = [
|
| 545 |
+
twitter_df,
|
| 546 |
+
reddit_df,
|
| 547 |
+
rss_all_df,
|
| 548 |
+
ai_tech_df,
|
| 549 |
+
sectoral_news_df,
|
| 550 |
+
market_events_df,
|
| 551 |
+
economic_calendar_df,
|
| 552 |
+
predictions_df,
|
| 553 |
+
]
|
| 554 |
+
|
| 555 |
+
summarizer = OpenAICompatSummarizer()
|
| 556 |
+
if summarizer.enabled:
|
| 557 |
+
all_items = []
|
| 558 |
+
for df in ai_summary_dfs:
|
| 559 |
+
if df.empty:
|
| 560 |
+
continue
|
| 561 |
+
if "summary_raw" not in df.columns:
|
| 562 |
+
df["summary_raw"] = df.get("summary", "")
|
| 563 |
+
records = df.to_dict("records")
|
| 564 |
+
for record in records:
|
| 565 |
+
if "summary_raw" not in record:
|
| 566 |
+
record["summary_raw"] = record.get("summary", "")
|
| 567 |
+
all_items.extend(records)
|
| 568 |
+
|
| 569 |
+
if all_items:
|
| 570 |
+
with st.spinner("Summarizing news with AI..."):
|
| 571 |
+
summarizer.summarize_items(all_items, source="dashboard")
|
| 572 |
+
|
| 573 |
+
ai_map = {
|
| 574 |
+
item.get("id"): item.get("summary_ai")
|
| 575 |
+
for item in all_items
|
| 576 |
+
if item.get("id") is not None
|
| 577 |
+
}
|
| 578 |
+
for df in ai_summary_dfs:
|
| 579 |
+
if df.empty or "id" not in df.columns:
|
| 580 |
+
continue
|
| 581 |
+
df["summary_ai"] = df["id"].map(ai_map)
|
| 582 |
+
df["summary"] = df["summary_ai"].fillna(df["summary"])
|
| 583 |
+
|
| 584 |
# Clear force refresh flag after fetching is complete
|
| 585 |
if force_refresh:
|
| 586 |
st.session_state.force_refresh = False
|
|
|
|
| 881 |
st.caption(f"• {error}")
|
| 882 |
|
| 883 |
# ---- AI SUMMARY METRICS ----
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
total_items = sum(len(df) for df in ai_summary_dfs if not df.empty)
|
| 885 |
ai_summarized = 0
|
| 886 |
for df in ai_summary_dfs:
|
app/utils/llm_summarizer.py
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
import json
|
| 4 |
import logging
|
| 5 |
import os
|
|
|
|
| 6 |
from typing import Dict, List, Optional, Tuple
|
| 7 |
|
| 8 |
import requests
|
|
@@ -27,10 +28,11 @@ class OpenAICompatSummarizer:
|
|
| 27 |
self.api_base = (api_base or os.getenv("LLM_API_BASE") or "https://researchengineering-agi.hf.space").rstrip("/")
|
| 28 |
self.api_key = api_key if api_key is not None else os.getenv("LLM_API_KEY", "")
|
| 29 |
self.model = model or os.getenv("LLM_MODEL", "gpt-4o-mini")
|
| 30 |
-
self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "
|
| 31 |
self.max_items_per_request = max_items_per_request or int(os.getenv("LLM_SUMMARY_BATCH", "8"))
|
| 32 |
self.max_chars_per_item = max_chars_per_item or int(os.getenv("LLM_SUMMARY_MAX_CHARS", "1200"))
|
| 33 |
self.enabled = os.getenv("ENABLE_AI_SUMMARIZATION", "true").lower() in {"1", "true", "yes"}
|
|
|
|
| 34 |
|
| 35 |
self._chat_url = f"{self.api_base}/v1/chat/completions"
|
| 36 |
|
|
@@ -40,6 +42,8 @@ class OpenAICompatSummarizer:
|
|
| 40 |
|
| 41 |
candidates: List[Tuple[Dict, str]] = []
|
| 42 |
for item in items:
|
|
|
|
|
|
|
| 43 |
text = self._build_input_text(item)
|
| 44 |
if text:
|
| 45 |
candidates.append((item, text))
|
|
@@ -47,7 +51,8 @@ class OpenAICompatSummarizer:
|
|
| 47 |
if not candidates:
|
| 48 |
return items
|
| 49 |
|
| 50 |
-
|
|
|
|
| 51 |
texts = [text for _, text in chunk]
|
| 52 |
summaries = self._summarize_chunk(texts, source=source)
|
| 53 |
if not summaries:
|
|
@@ -56,6 +61,8 @@ class OpenAICompatSummarizer:
|
|
| 56 |
if summary:
|
| 57 |
item["summary_ai"] = summary
|
| 58 |
item["summary"] = summary
|
|
|
|
|
|
|
| 59 |
|
| 60 |
return items
|
| 61 |
|
|
|
|
| 3 |
import json
|
| 4 |
import logging
|
| 5 |
import os
|
| 6 |
+
import time
|
| 7 |
from typing import Dict, List, Optional, Tuple
|
| 8 |
|
| 9 |
import requests
|
|
|
|
| 28 |
self.api_base = (api_base or os.getenv("LLM_API_BASE") or "https://researchengineering-agi.hf.space").rstrip("/")
|
| 29 |
self.api_key = api_key if api_key is not None else os.getenv("LLM_API_KEY", "")
|
| 30 |
self.model = model or os.getenv("LLM_MODEL", "gpt-4o-mini")
|
| 31 |
+
self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "120"))
|
| 32 |
self.max_items_per_request = max_items_per_request or int(os.getenv("LLM_SUMMARY_BATCH", "8"))
|
| 33 |
self.max_chars_per_item = max_chars_per_item or int(os.getenv("LLM_SUMMARY_MAX_CHARS", "1200"))
|
| 34 |
self.enabled = os.getenv("ENABLE_AI_SUMMARIZATION", "true").lower() in {"1", "true", "yes"}
|
| 35 |
+
self.sleep_seconds = float(os.getenv("LLM_SUMMARY_SLEEP_SECONDS", "0"))
|
| 36 |
|
| 37 |
self._chat_url = f"{self.api_base}/v1/chat/completions"
|
| 38 |
|
|
|
|
| 42 |
|
| 43 |
candidates: List[Tuple[Dict, str]] = []
|
| 44 |
for item in items:
|
| 45 |
+
if str(item.get("summary_ai", "")).strip():
|
| 46 |
+
continue
|
| 47 |
text = self._build_input_text(item)
|
| 48 |
if text:
|
| 49 |
candidates.append((item, text))
|
|
|
|
| 51 |
if not candidates:
|
| 52 |
return items
|
| 53 |
|
| 54 |
+
chunks = self._chunked(candidates, self.max_items_per_request)
|
| 55 |
+
for idx, chunk in enumerate(chunks, start=1):
|
| 56 |
texts = [text for _, text in chunk]
|
| 57 |
summaries = self._summarize_chunk(texts, source=source)
|
| 58 |
if not summaries:
|
|
|
|
| 61 |
if summary:
|
| 62 |
item["summary_ai"] = summary
|
| 63 |
item["summary"] = summary
|
| 64 |
+
if self.sleep_seconds > 0 and idx < len(chunks):
|
| 65 |
+
time.sleep(self.sleep_seconds)
|
| 66 |
|
| 67 |
return items
|
| 68 |
|
app/utils/news_cache.py
CHANGED
|
@@ -11,11 +11,6 @@ from typing import List, Dict, Optional, Callable
|
|
| 11 |
|
| 12 |
import pandas as pd
|
| 13 |
|
| 14 |
-
try:
|
| 15 |
-
from utils.llm_summarizer import OpenAICompatSummarizer
|
| 16 |
-
except Exception: # pragma: no cover - optional dependency
|
| 17 |
-
OpenAICompatSummarizer = None
|
| 18 |
-
|
| 19 |
logger = logging.getLogger(__name__)
|
| 20 |
|
| 21 |
|
|
@@ -48,7 +43,6 @@ class NewsCacheManager:
|
|
| 48 |
'filtered_cache': {} # Cached filtered results
|
| 49 |
}
|
| 50 |
logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
|
| 51 |
-
self.summarizer = OpenAICompatSummarizer() if OpenAICompatSummarizer else None
|
| 52 |
|
| 53 |
def get_news(
|
| 54 |
self,
|
|
@@ -95,7 +89,6 @@ class NewsCacheManager:
|
|
| 95 |
return self.cache[source]['raw_news']
|
| 96 |
|
| 97 |
self._prepare_summaries(new_items)
|
| 98 |
-
self._apply_ai_summaries(new_items, source=source)
|
| 99 |
|
| 100 |
# Update cache
|
| 101 |
self._update_cache(source, new_items)
|
|
@@ -244,14 +237,6 @@ class NewsCacheManager:
|
|
| 244 |
if 'summary_raw' not in item:
|
| 245 |
item['summary_raw'] = item.get('summary', '')
|
| 246 |
|
| 247 |
-
def _apply_ai_summaries(self, items: List[Dict], source: Optional[str] = None):
|
| 248 |
-
if not items or not self.summarizer or not getattr(self.summarizer, 'enabled', False):
|
| 249 |
-
return
|
| 250 |
-
try:
|
| 251 |
-
self.summarizer.summarize_items(items, source=source)
|
| 252 |
-
except Exception as exc:
|
| 253 |
-
logger.warning(f"AI summarization skipped due to error: {exc}")
|
| 254 |
-
|
| 255 |
def get_filtered_news(
|
| 256 |
self,
|
| 257 |
source_df: pd.DataFrame,
|
|
|
|
| 11 |
|
| 12 |
import pandas as pd
|
| 13 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 14 |
logger = logging.getLogger(__name__)
|
| 15 |
|
| 16 |
|
|
|
|
| 43 |
'filtered_cache': {} # Cached filtered results
|
| 44 |
}
|
| 45 |
logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
|
|
|
|
| 46 |
|
| 47 |
def get_news(
|
| 48 |
self,
|
|
|
|
| 89 |
return self.cache[source]['raw_news']
|
| 90 |
|
| 91 |
self._prepare_summaries(new_items)
|
|
|
|
| 92 |
|
| 93 |
# Update cache
|
| 94 |
self._update_cache(source, new_items)
|
|
|
|
| 237 |
if 'summary_raw' not in item:
|
| 238 |
item['summary_raw'] = item.get('summary', '')
|
| 239 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
def get_filtered_news(
|
| 241 |
self,
|
| 242 |
source_df: pd.DataFrame,
|