Dmitry Beresnev committed on
Commit
bcf73e3
·
1 Parent(s): ab86fc1

add news summarization by ai

Browse files
app/pages/05_Dashboard.py CHANGED
@@ -838,6 +838,42 @@ if 'fetch_errors' in locals() and fetch_errors:
838
  for error in fetch_errors:
839
  st.caption(f"• {error}")
840
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
841
  # Auto-refresh logic
842
  if auto_refresh:
843
  import time
 
838
  for error in fetch_errors:
839
  st.caption(f"• {error}")
840
 
841
# ---- AI SUMMARY METRICS ----
# Every feed DataFrame that may carry an AI-generated "summary_ai" column.
ai_summary_dfs = [
    twitter_df,
    reddit_df,
    rss_all_df,
    ai_tech_df,
    sectoral_news_df,
    market_events_df,
    economic_calendar_df,
    predictions_df,
]


def _count_ai_summaries(frame):
    """Number of rows in *frame* whose 'summary_ai' is a non-blank string."""
    if frame.empty or "summary_ai" not in frame.columns:
        return 0
    cleaned = frame["summary_ai"].fillna("").astype(str).str.strip()
    return cleaned.ne("").sum()


total_items = sum(len(df) for df in ai_summary_dfs if not df.empty)
ai_summarized = sum(_count_ai_summaries(df) for df in ai_summary_dfs)
ai_summary_pct = (ai_summarized / total_items * 100) if total_items else 0.0

st.markdown("---")
st.markdown("## 🤖 AI Summary")
st.markdown(
    f"""
    <div style="background: linear-gradient(135deg, #1E222D 0%, #131722 100%); border: 1px solid #2A2E39; border-radius: 8px; padding: 20px; margin-bottom: 12px;">
        <div style="color: #E0E3EB; font-size: 16px; font-weight: 600; margin-bottom: 6px;">Current AI Summarizations</div>
        <div style="color: #D1D4DC; font-size: 14px; line-height: 1.6;">
            {ai_summarized} / {total_items} items summarized
            <span style="color: #787B86; font-size: 12px; margin-left: 8px;">({ai_summary_pct:.1f}% coverage)</span>
        </div>
    </div>
    """,
    unsafe_allow_html=True,
)
876
+
877
  # Auto-refresh logic
878
  if auto_refresh:
879
  import time
app/utils/llm_summarizer.py ADDED
@@ -0,0 +1,149 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """OpenAI-compatible LLM summarizer for news items."""
2
+
3
+ import json
4
+ import logging
5
+ import os
6
+ from typing import Dict, List, Optional, Tuple
7
+
8
+ import requests
9
+
10
+ logger = logging.getLogger(__name__)
11
+
12
+
13
+ class OpenAICompatSummarizer:
14
+ """
15
+ Summarize news items using an OpenAI-compatible chat completions API.
16
+ """
17
+
18
+ def __init__(
19
+ self,
20
+ api_base: Optional[str] = None,
21
+ api_key: Optional[str] = None,
22
+ model: Optional[str] = None,
23
+ timeout: Optional[int] = None,
24
+ max_items_per_request: Optional[int] = None,
25
+ max_chars_per_item: Optional[int] = None,
26
+ ):
27
+ self.api_base = (api_base or os.getenv("LLM_API_BASE") or "https://researchengineering-agi.hf.space").rstrip("/")
28
+ self.api_key = api_key if api_key is not None else os.getenv("LLM_API_KEY", "")
29
+ self.model = model or os.getenv("LLM_MODEL", "gpt-4o-mini")
30
+ self.timeout = timeout or int(os.getenv("LLM_TIMEOUT", "20"))
31
+ self.max_items_per_request = max_items_per_request or int(os.getenv("LLM_SUMMARY_BATCH", "8"))
32
+ self.max_chars_per_item = max_chars_per_item or int(os.getenv("LLM_SUMMARY_MAX_CHARS", "1200"))
33
+ self.enabled = os.getenv("ENABLE_AI_SUMMARIZATION", "true").lower() in {"1", "true", "yes"}
34
+
35
+ self._chat_url = f"{self.api_base}/v1/chat/completions"
36
+
37
+ def summarize_items(self, items: List[Dict], source: Optional[str] = None) -> List[Dict]:
38
+ if not self.enabled or not items:
39
+ return items
40
+
41
+ candidates: List[Tuple[Dict, str]] = []
42
+ for item in items:
43
+ text = self._build_input_text(item)
44
+ if text:
45
+ candidates.append((item, text))
46
+
47
+ if not candidates:
48
+ return items
49
+
50
+ for chunk in self._chunked(candidates, self.max_items_per_request):
51
+ texts = [text for _, text in chunk]
52
+ summaries = self._summarize_chunk(texts, source=source)
53
+ if not summaries:
54
+ continue
55
+ for (item, _), summary in zip(chunk, summaries):
56
+ if summary:
57
+ item["summary_ai"] = summary
58
+ item["summary"] = summary
59
+
60
+ return items
61
+
62
+ def _build_input_text(self, item: Dict) -> str:
63
+ title = str(item.get("title", "")).strip()
64
+ summary = str(item.get("summary_raw", item.get("summary", ""))).strip()
65
+ extra = str(item.get("content", item.get("text", item.get("description", "")))).strip()
66
+
67
+ parts = []
68
+ if title:
69
+ parts.append(f"Title: {title}")
70
+ if summary and summary != title:
71
+ parts.append(f"Summary: {summary}")
72
+ if extra and extra not in summary and extra not in title:
73
+ parts.append(f"Details: {extra}")
74
+
75
+ combined = "\n".join(parts).strip()
76
+ if not combined:
77
+ return ""
78
+
79
+ if len(combined) > self.max_chars_per_item:
80
+ combined = combined[: self.max_chars_per_item].rstrip()
81
+
82
+ return combined
83
+
84
+ def _summarize_chunk(self, texts: List[str], source: Optional[str] = None) -> List[str]:
85
+ system_prompt = (
86
+ "You are a financial news summarizer. "
87
+ "Return concise, factual summaries in 1-2 sentences, <=240 characters each. "
88
+ "Do not add speculation or new facts."
89
+ )
90
+ source_line = f"Source: {source}" if source else ""
91
+
92
+ items_text = []
93
+ for idx, text in enumerate(texts, start=1):
94
+ items_text.append(f"{idx}. {text}")
95
+
96
+ user_prompt = (
97
+ "Summarize each item below. "
98
+ "Return a JSON array of strings in the same order. "
99
+ "No extra text.\n"
100
+ f"{source_line}\n\n" + "\n\n".join(items_text)
101
+ )
102
+
103
+ payload = {
104
+ "model": self.model,
105
+ "messages": [
106
+ {"role": "system", "content": system_prompt},
107
+ {"role": "user", "content": user_prompt},
108
+ ],
109
+ "temperature": 0.2,
110
+ }
111
+
112
+ headers = {"Content-Type": "application/json"}
113
+ if self.api_key:
114
+ headers["Authorization"] = f"Bearer {self.api_key}"
115
+
116
+ try:
117
+ response = requests.post(self._chat_url, json=payload, headers=headers, timeout=self.timeout)
118
+ response.raise_for_status()
119
+ data = response.json()
120
+ content = (
121
+ data.get("choices", [{}])[0]
122
+ .get("message", {})
123
+ .get("content", "")
124
+ .strip()
125
+ )
126
+ summaries = self._parse_json_array(content)
127
+ if summaries and len(summaries) == len(texts):
128
+ return summaries
129
+ logger.warning("LLM summarizer returned unexpected format or length")
130
+ return []
131
+ except Exception as exc:
132
+ logger.warning(f"LLM summarization failed: {exc}")
133
+ return []
134
+
135
+ def _parse_json_array(self, content: str) -> List[str]:
136
+ if not content:
137
+ return []
138
+ try:
139
+ parsed = json.loads(content)
140
+ if isinstance(parsed, list):
141
+ return [str(x).strip() for x in parsed]
142
+ return []
143
+ except Exception:
144
+ return []
145
+
146
+ def _chunked(self, items: List[Tuple[Dict, str]], size: int) -> List[List[Tuple[Dict, str]]]:
147
+ if size <= 0:
148
+ return [items]
149
+ return [items[i : i + size] for i in range(0, len(items), size)]
app/utils/news_cache.py CHANGED
@@ -6,10 +6,16 @@ Centralized cache manager for Twitter, Reddit, RSS, and AI/Tech news feeds
6
  import hashlib
7
  import logging
8
  import re
9
- import pandas as pd
10
  from datetime import datetime, timedelta
11
  from typing import List, Dict, Optional, Callable
12
 
 
 
 
 
 
 
 
13
  logger = logging.getLogger(__name__)
14
 
15
 
@@ -42,6 +48,7 @@ class NewsCacheManager:
42
  'filtered_cache': {} # Cached filtered results
43
  }
44
  logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
 
45
 
46
  def get_news(
47
  self,
@@ -87,6 +94,9 @@ class NewsCacheManager:
87
  # Return cached data if available, even if expired
88
  return self.cache[source]['raw_news']
89
 
 
 
 
90
  # Update cache
91
  self._update_cache(source, new_items)
92
 
@@ -172,7 +182,8 @@ class NewsCacheManager:
172
  MD5 hash string
173
  """
174
  title = self._normalize_text(item.get('title', ''))
175
- summary = self._normalize_text(item.get('summary', '')[:200]) # First 200 chars
 
176
 
177
  # Combine title and summary
178
  combined = f"{title}|{summary}"
@@ -228,6 +239,19 @@ class NewsCacheManager:
228
  self.cache[source]['last_fetch'] = datetime.now()
229
  logger.info(f"📦 Updated cache for {source} with {len(items)} items")
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def get_filtered_news(
232
  self,
233
  source_df: pd.DataFrame,
 
6
  import hashlib
7
  import logging
8
  import re
 
9
  from datetime import datetime, timedelta
10
  from typing import List, Dict, Optional, Callable
11
 
12
+ import pandas as pd
13
+
14
+ try:
15
+ from utils.llm_summarizer import OpenAICompatSummarizer
16
+ except Exception: # pragma: no cover - optional dependency
17
+ OpenAICompatSummarizer = None
18
+
19
  logger = logging.getLogger(__name__)
20
 
21
 
 
48
  'filtered_cache': {} # Cached filtered results
49
  }
50
  logger.info(f"NewsCacheManager initialized with {default_ttl}s TTL")
51
+ self.summarizer = OpenAICompatSummarizer() if OpenAICompatSummarizer else None
52
 
53
  def get_news(
54
  self,
 
94
  # Return cached data if available, even if expired
95
  return self.cache[source]['raw_news']
96
 
97
+ self._prepare_summaries(new_items)
98
+ self._apply_ai_summaries(new_items, source=source)
99
+
100
  # Update cache
101
  self._update_cache(source, new_items)
102
 
 
182
  MD5 hash string
183
  """
184
  title = self._normalize_text(item.get('title', ''))
185
+ summary_source = item.get('summary_raw', item.get('summary', ''))
186
+ summary = self._normalize_text(str(summary_source)[:200]) # First 200 chars
187
 
188
  # Combine title and summary
189
  combined = f"{title}|{summary}"
 
239
  self.cache[source]['last_fetch'] = datetime.now()
240
  logger.info(f"📦 Updated cache for {source} with {len(items)} items")
241
 
242
+ def _prepare_summaries(self, items: List[Dict]):
243
+ for item in items:
244
+ if 'summary_raw' not in item:
245
+ item['summary_raw'] = item.get('summary', '')
246
+
247
+ def _apply_ai_summaries(self, items: List[Dict], source: Optional[str] = None):
248
+ if not items or not self.summarizer or not getattr(self.summarizer, 'enabled', False):
249
+ return
250
+ try:
251
+ self.summarizer.summarize_items(items, source=source)
252
+ except Exception as exc:
253
+ logger.warning(f"AI summarization skipped due to error: {exc}")
254
+
255
  def get_filtered_news(
256
  self,
257
  source_df: pd.DataFrame,