Dmitry Beresnev committed on
Commit
f00d814
·
1 Parent(s): bcf73e3

fix summarization process

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -26,6 +26,7 @@ from components.news import (
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
 
29
 
30
  # Import news scrapers
31
  try:
@@ -539,6 +540,47 @@ if fetch_errors:
539
  for err in fetch_errors:
540
  logger.warning(f"Fetch error: {err}")
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  # Clear force refresh flag after fetching is complete
543
  if force_refresh:
544
  st.session_state.force_refresh = False
@@ -839,17 +881,6 @@ if 'fetch_errors' in locals() and fetch_errors:
839
  st.caption(f"• {error}")
840
 
841
  # ---- AI SUMMARY METRICS ----
842
- ai_summary_dfs = [
843
- twitter_df,
844
- reddit_df,
845
- rss_all_df,
846
- ai_tech_df,
847
- sectoral_news_df,
848
- market_events_df,
849
- economic_calendar_df,
850
- predictions_df,
851
- ]
852
-
853
  total_items = sum(len(df) for df in ai_summary_dfs if not df.empty)
854
  ai_summarized = 0
855
  for df in ai_summary_dfs:
 
26
  display_economic_calendar_widget
27
  )
28
  from utils.breaking_news_scorer import get_breaking_news_scorer
29
+ from utils.llm_summarizer import OpenAICompatSummarizer
30
 
31
  # Import news scrapers
32
  try:
 
540
  for err in fetch_errors:
541
  logger.warning(f"Fetch error: {err}")
542
 
543
+ # Batch AI summarization after all sources are collected
544
+ ai_summary_dfs = [
545
+ twitter_df,
546
+ reddit_df,
547
+ rss_all_df,
548
+ ai_tech_df,
549
+ sectoral_news_df,
550
+ market_events_df,
551
+ economic_calendar_df,
552
+ predictions_df,
553
+ ]
554
+
555
+ summarizer = OpenAICompatSummarizer()
556
+ if summarizer.enabled:
557
+ all_items = []
558
+ for df in ai_summary_dfs:
559
+ if df.empty:
560
+ continue
561
+ if "summary_raw" not in df.columns:
562
+ df["summary_raw"] = df.get("summary", "")
563
+ records = df.to_dict("records")
564
+ for record in records:
565
+ if "summary_raw" not in record:
566
+ record["summary_raw"] = record.get("summary", "")
567
+ all_items.extend(records)
568
+
569
+ if all_items:
570
+ with st.spinner("Summarizing news with AI..."):
571
+ summarizer.summarize_items(all_items, source="dashboard")
572
+
573
+ ai_map = {
574
+ item.get("id"): item.get("summary_ai")
575
+ for item in all_items
576
+ if item.get("id") is not None
577
+ }
578
+ for df in ai_summary_dfs:
579
+ if df.empty or "id" not in df.columns:
580
+ continue
581
+ df["summary_ai"] = df["id"].map(ai_map)
582
+ df["summary"] = df["summary_ai"].fillna(df["summary"])
583
+
584
  # Clear force refresh flag after fetching is complete
585
  if force_refresh:
586
  st.session_state.force_refresh = False
 
881
  st.caption(f"• {error}")
882
 
883
  # ---- AI SUMMARY METRICS ----
 
 
 
 
 
 
 
 
 
 
 
884
  total_items = sum(len(df) for df in ai_summary_dfs if not df.empty)
885
  ai_summarized = 0
886
  for df in ai_summary_dfs:
app/utils/llm_summarizer.py CHANGED
@@ -3,6 +3,7 @@
3
  import json
4
  import logging
5
  import os
 
6
  from typing import Dict, List, Optional, Tuple
7
 
8
  import requests
@@ -27,10 +28,11 @@ class OpenAICompatSummarizer:
27
  self.api_base = (api_base or os.getenv("LLM_API_BASE") or "https://researchengineering-agi.hf.space").rstrip("/")
28
  self.api_key = api_key if api_key is not None else os.getenv("LLM_API_KEY", "")
29
  self.model = model or os.getenv("LLM_MODEL", "gpt-4o-mini")
30
- self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "20"))
31
  self.max_items_per_request = max_items_per_request or int(os.getenv("LLM_SUMMARY_BATCH", "8"))
32
  self.max_chars_per_item = max_chars_per_item or int(os.getenv("LLM_SUMMARY_MAX_CHARS", "1200"))
33
  self.enabled = os.getenv("ENABLE_AI_SUMMARIZATION", "true").lower() in {"1", "true", "yes"}
 
34
 
35
  self._chat_url = f"{self.api_base}/v1/chat/completions"
36
 
@@ -40,6 +42,8 @@ class OpenAICompatSummarizer:
40
 
41
  candidates: List[Tuple[Dict, str]] = []
42
  for item in items:
 
 
43
  text = self._build_input_text(item)
44
  if text:
45
  candidates.append((item, text))
@@ -47,7 +51,8 @@ class OpenAICompatSummarizer:
47
  if not candidates:
48
  return items
49
 
50
- for chunk in self._chunked(candidates, self.max_items_per_request):
 
51
  texts = [text for _, text in chunk]
52
  summaries = self._summarize_chunk(texts, source=source)
53
  if not summaries:
@@ -56,6 +61,8 @@ class OpenAICompatSummarizer:
56
  if summary:
57
  item["summary_ai"] = summary
58
  item["summary"] = summary
 
 
59
 
60
  return items
61
 
 
3
  import json
4
  import logging
5
  import os
6
+ import time
7
  from typing import Dict, List, Optional, Tuple
8
 
9
  import requests
 
28
  self.api_base = (api_base or os.getenv("LLM_API_BASE") or "https://researchengineering-agi.hf.space").rstrip("/")
29
  self.api_key = api_key if api_key is not None else os.getenv("LLM_API_KEY", "")
30
  self.model = model or os.getenv("LLM_MODEL", "gpt-4o-mini")
31
+ self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "120"))
32
  self.max_items_per_request = max_items_per_request or int(os.getenv("LLM_SUMMARY_BATCH", "8"))
33
  self.max_chars_per_item = max_chars_per_item or int(os.getenv("LLM_SUMMARY_MAX_CHARS", "1200"))
34
  self.enabled = os.getenv("ENABLE_AI_SUMMARIZATION", "true").lower() in {"1", "true", "yes"}
35
+ self.sleep_seconds = float(os.getenv("LLM_SUMMARY_SLEEP_SECONDS", "0"))
36
 
37
  self._chat_url = f"{self.api_base}/v1/chat/completions"
38
 
 
42
 
43
  candidates: List[Tuple[Dict, str]] = []
44
  for item in items:
45
+ if str(item.get("summary_ai", "")).strip():
46
+ continue
47
  text = self._build_input_text(item)
48
  if text:
49
  candidates.append((item, text))
 
51
  if not candidates:
52
  return items
53
 
54
+ chunks = self._chunked(candidates, self.max_items_per_request)
55
+ for idx, chunk in enumerate(chunks, start=1):
56
  texts = [text for _, text in chunk]
57
  summaries = self._summarize_chunk(texts, source=source)
58
  if not summaries:
 
61
  if summary:
62
  item["summary_ai"] = summary
63
  item["summary"] = summary
64
+ if self.sleep_seconds > 0 and idx < len(chunks):
65
+ time.sleep(self.sleep_seconds)
66
 
67
  return items
68
 
app/utils/news_cache.py CHANGED
@@ -11,11 +11,6 @@ from typing import List, Dict, Optional, Callable
11
 
12
  import pandas as pd
13
 
14
- try:
15
- from utils.llm_summarizer import OpenAICompatSummarizer
16
- except Exception: # pragma: no cover - optional dependency
17
- OpenAICompatSummarizer = None
18
-
19
  logger = logging.getLogger(__name__)
20
 
21
 
@@ -48,7 +43,6 @@ class NewsCacheManager:
48
  'filtered_cache': {} # Cached filtered results
49
  }
50
  logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
51
- self.summarizer = OpenAICompatSummarizer() if OpenAICompatSummarizer else None
52
 
53
  def get_news(
54
  self,
@@ -95,7 +89,6 @@ class NewsCacheManager:
95
  return self.cache[source]['raw_news']
96
 
97
  self._prepare_summaries(new_items)
98
- self._apply_ai_summaries(new_items, source=source)
99
 
100
  # Update cache
101
  self._update_cache(source, new_items)
@@ -244,14 +237,6 @@ class NewsCacheManager:
244
  if 'summary_raw' not in item:
245
  item['summary_raw'] = item.get('summary', '')
246
 
247
- def _apply_ai_summaries(self, items: List[Dict], source: Optional[str] = None):
248
- if not items or not self.summarizer or not getattr(self.summarizer, 'enabled', False):
249
- return
250
- try:
251
- self.summarizer.summarize_items(items, source=source)
252
- except Exception as exc:
253
- logger.warning(f"AI summarization skipped due to error: {exc}")
254
-
255
  def get_filtered_news(
256
  self,
257
  source_df: pd.DataFrame,
 
11
 
12
  import pandas as pd
13
 
 
 
 
 
 
14
  logger = logging.getLogger(__name__)
15
 
16
 
 
43
  'filtered_cache': {} # Cached filtered results
44
  }
45
  logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
 
46
 
47
  def get_news(
48
  self,
 
89
  return self.cache[source]['raw_news']
90
 
91
  self._prepare_summaries(new_items)
 
92
 
93
  # Update cache
94
  self._update_cache(source, new_items)
 
237
  if 'summary_raw' not in item:
238
  item['summary_raw'] = item.get('summary', '')
239
 
 
 
 
 
 
 
 
 
240
  def get_filtered_news(
241
  self,
242
  source_df: pd.DataFrame,