DivYonko committed on
Commit
e7565ac
·
1 Parent(s): 405872f

Fix PDF: strip emoji/unicode chars that Helvetica can't render

Browse files
Files changed (1) hide show
  1. ml/report_generator.py +286 -0
ml/report_generator.py CHANGED
@@ -7,12 +7,298 @@ Uses fpdf2 — no system dependencies required.
7
  from __future__ import annotations
8
 
9
  import io
 
10
  from datetime import datetime
11
  from collections import Counter
12
 
13
  from fpdf import FPDF
14
 
15
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
  # ── Colour palette (matches dashboard theme) ──────────────────────────────────
17
  _C_BG = (7, 7, 15) # dark background
18
  _C_CARD = (15, 15, 30)
 
7
  from __future__ import annotations
8
 
9
  import io
10
+ import re
11
  from datetime import datetime
12
  from collections import Counter
13
 
14
  from fpdf import FPDF
15
 
16
 
17
+ # ── Colour palette (matches dashboard theme) ──────────────────────────────────
18
+ _C_BG = (7, 7, 15)
19
+ _C_CARD = (15, 15, 30)
20
+ _C_ACCENT = (124, 58, 237)
21
+ _C_POS = (34, 197, 94)
22
+ _C_NEU = (234, 179, 8)
23
+ _C_NEG = (239, 68, 68)
24
+ _C_TEXT1 = (241, 245, 249)
25
+ _C_TEXT2 = (148, 163, 184)
26
+ _C_WHITE = (255, 255, 255)
27
+ _C_DIVIDER = (30, 30, 50)
28
+
29
+ TOPIC_COLORS = {
30
+ "Appreciation": (245, 158, 11),
31
+ "Question": ( 59, 130, 246),
32
+ "Request/Feedback":(139, 92, 246),
33
+ "Promo": (236, 72, 153),
34
+ "Spam": (239, 68, 68),
35
+ "General": (107, 114, 128),
36
+ "MCQ Answer": ( 16, 185, 129),
37
+ }
38
+
39
+
40
+ def _safe(text: str, max_len: int = 80) -> str:
41
+ """Strip emoji and non-Latin-1 characters so Helvetica can render them."""
42
+ if not text:
43
+ return ""
44
+ # Remove emoji and symbols outside Latin-1
45
+ cleaned = re.sub(r'[^\x00-\xFF]', '', str(text))
46
+ # Collapse multiple spaces
47
+ cleaned = re.sub(r'\s+', ' ', cleaned).strip()
48
+ return cleaned[:max_len]
49
+
50
+
51
class LivePulsePDF(FPDF):
    """A4 portrait FPDF subclass with the LivePulse header/footer and widgets.

    All measurements are millimetres. Margins are 15 mm on the left, top and
    right, and automatic page breaks fire 15 mm above the bottom edge. Every
    piece of caller-supplied text is passed through ``_safe`` before it is
    written, because the built-in Helvetica font only covers Latin-1.
    """

    def __init__(self):
        """Create an empty A4 portrait document with 15 mm margins."""
        super().__init__(orientation="P", unit="mm", format="A4")
        self.set_auto_page_break(auto=True, margin=15)
        self.set_margins(15, 15, 15)

    def _ensure_space(self, height: float) -> None:
        """Start a new page now if ``height`` mm no longer fits on this one.

        Widgets that mix ``rect()`` with ``cell()`` call this first: only
        ``cell()`` triggers the automatic page break, so without the check a
        rectangle could be drawn at a y position belonging to the old page.
        """
        if self.will_page_break(height):
            self.add_page()

    def header(self):
        """Draw the dark title band; fpdf invokes this for every new page."""
        self.set_fill_color(*_C_BG)
        self.rect(0, 0, 210, 20, "F")
        self.set_font("Helvetica", "B", 11)
        self.set_text_color(*_C_ACCENT)
        self.set_y(6)
        self.cell(0, 8, "LivePulse | YouTube Live Chat Analytics", align="L")
        self.set_font("Helvetica", "", 8)
        self.set_text_color(*_C_TEXT2)
        self.cell(0, 8, f"Generated {datetime.now().strftime('%d %b %Y %H:%M')}", align="R")
        # Leave the cursor 4 mm below the 20 mm band. The previous `ln(4)`
        # moved it from y=6 to y=10, i.e. still inside the band, so content on
        # continuation pages was painted over the header.
        self.set_y(24)

    def footer(self):
        """Draw the centred page number 12 mm above the bottom edge."""
        self.set_y(-12)
        self.set_font("Helvetica", "", 8)
        self.set_text_color(*_C_TEXT2)
        self.cell(0, 8, f"Page {self.page_no()}", align="C")

    def section_title(self, title: str, pill: str = "") -> None:
        """Draw a full-width section heading bar at the current y position.

        Args:
            title: Heading text, drawn bold on the left.
            pill: Optional short tag drawn right-aligned in the accent colour.
        """
        # Bar (9 mm) plus gap (2 mm) must fit, otherwise the frame would stay
        # on this page while the text jumps to the next one.
        self._ensure_space(11)
        self.set_fill_color(*_C_CARD)
        self.set_draw_color(*_C_ACCENT)
        self.set_line_width(0.5)
        self.rect(15, self.get_y(), 180, 9, "FD")
        self.set_font("Helvetica", "B", 10)
        self.set_text_color(*_C_TEXT1)
        self.set_x(17)
        self.cell(140, 9, _safe(title), ln=0)
        if pill:
            self.set_font("Helvetica", "", 8)
            self.set_text_color(*_C_ACCENT)
            self.cell(0, 9, _safe(pill), align="R")
        self.ln(11)

    def stat_box(self, label: str, value: str, color: tuple, x: float, y: float, w: float = 42, h: float = 18) -> None:
        """Draw a bordered stat card with a coloured top strip.

        The card is placed at the absolute position ``(x, y)`` chosen by the
        caller; the caller is responsible for making sure it fits on the page.

        Args:
            label: Caption under the value (upper-cased, max 20 characters).
            value: Headline figure (max 12 characters).
            color: RGB tuple for the border, top strip and value text.
            x: Left edge of the card.
            y: Top edge of the card.
            w: Card width.
            h: Card height.
        """
        self.set_fill_color(*_C_CARD)
        self.set_draw_color(*color)
        self.set_line_width(0.4)
        self.rect(x, y, w, h, "FD")
        # 1.5 mm accent strip along the top edge.
        self.set_fill_color(*color)
        self.rect(x, y, w, 1.5, "F")
        self.set_font("Helvetica", "B", 14)
        self.set_text_color(*color)
        self.set_xy(x, y + 3)
        self.cell(w, 7, _safe(value, 12), align="C")
        self.set_font("Helvetica", "", 7)
        self.set_text_color(*_C_TEXT2)
        self.set_xy(x, y + 10)
        self.cell(w, 5, _safe(label.upper(), 20), align="C")

    def h_bar(self, label: str, value: int, max_val: int, color: tuple, bar_w: float = 100) -> None:
        """Draw one labelled horizontal bar row and advance 7 mm.

        Args:
            label: Row label (max 35 characters).
            value: Quantity represented by the bar; also printed on the right.
            max_val: Quantity that corresponds to a full-width bar.
            color: RGB tuple for the filled part of the bar.
            bar_w: Width of the bar track.
        """
        # Break the page *before* reading y: the label cell below could
        # otherwise trigger the break and leave `y` pointing at the old page.
        self._ensure_space(6)
        y = self.get_y()
        self.set_font("Helvetica", "", 8)
        # NOTE(review): the label is drawn in the light _C_TEXT1 colour with no
        # fill behind it, and nothing visible here paints a page background -
        # confirm it is legible in the rendered PDF.
        self.set_text_color(*_C_TEXT1)
        self.set_x(17)
        self.cell(55, 6, _safe(label, 35), ln=0)
        self.set_fill_color(*_C_DIVIDER)
        self.rect(73, y + 1, bar_w, 4, "F")
        # Clamp so an out-of-range value can never draw outside the track.
        ratio = min(max(value / max(max_val, 1), 0.0), 1.0)
        self.set_fill_color(*color)
        self.rect(73, y + 1, ratio * bar_w, 4, "F")
        self.set_font("Helvetica", "B", 8)
        self.set_text_color(*_C_TEXT2)
        self.set_xy(175, y)
        self.cell(20, 6, str(value), align="R")
        self.ln(7)

    def table_header(self, cols: list[tuple[str, float]]) -> None:
        """Draw a table header row followed by an accent rule.

        Args:
            cols: ``(label, width)`` pairs, one per column, left to right.
        """
        self.set_fill_color(*_C_CARD)
        self.set_font("Helvetica", "B", 8)
        self.set_text_color(*_C_ACCENT)
        for label, w in cols:
            self.cell(w, 7, _safe(label), border=0, fill=True, align="L")
        self.ln(7)
        self.set_draw_color(*_C_ACCENT)
        self.set_line_width(0.3)
        self.line(15, self.get_y(), 195, self.get_y())
        self.ln(1)

    def table_row(self, values: list[tuple[str, float]], alt: bool = False) -> None:
        """Draw one table body row.

        Args:
            values: ``(text, width)`` pairs, one per column; each text is
                truncated to 40 characters.
            alt: Use the slightly lighter fill, for zebra striping.
        """
        if alt:
            self.set_fill_color(20, 20, 35)
        else:
            self.set_fill_color(*_C_BG)
        self.set_font("Helvetica", "", 8)
        self.set_text_color(*_C_TEXT1)
        for val, w in values:
            self.cell(w, 6, _safe(str(val), 40), border=0, fill=True, align="L")
        self.ln(6)
147
+
148
def _engagement_metrics(recent: list[dict]) -> tuple[int, float, float, float]:
    """Return ``(score, msgs_per_min, positive_ratio, question_density)``.

    The score is a 0-100 blend of 40% message rate (capped at 60 msgs/min),
    40% share of "Positive" messages and 20% share of "Question" messages.
    All four values are zero when the first/last ``"time"`` entries are
    missing or are not ISO-8601 strings that can be subtracted.
    """
    n = len(recent)
    try:
        t0 = datetime.fromisoformat(recent[0]["time"])
        t1 = datetime.fromisoformat(recent[-1]["time"])
        # Floor at 0.1 min so identical or out-of-order stamps cannot
        # produce a zero or negative divisor.
        elapsed = max((t1 - t0).total_seconds() / 60, 0.1)
    except (IndexError, KeyError, TypeError, ValueError):
        return 0, 0.0, 0.0, 0.0
    rate = round(n / elapsed, 1)
    pos_ratio = sum(1 for m in recent if m.get("sentiment") == "Positive") / max(n, 1)
    q_density = sum(1 for m in recent if m.get("topic") == "Question") / max(n, 1)
    rate_norm = min(rate / 60, 1.0)
    score = round((rate_norm * 0.4 + pos_ratio * 0.4 + q_density * 0.2) * 100)
    return score, rate, pos_ratio, q_density


def generate_report(
    all_data: list[dict],
    stream_title: str = "LivePulse Stream",
    msg_limit: int = 100,
) -> bytes:
    """Render the chat analytics report and return it as PDF bytes.

    Args:
        all_data: Messages in chronological order. The keys read are
            ``"author"``, ``"text"``, ``"sentiment"``, ``"topic"``,
            ``"action_type"`` and ``"time"``; all are optional.
        stream_title: Subtitle printed on the cover.
        msg_limit: How many of the newest messages to list under
            "Recent Comments". Zero or a negative number lists none.

    Returns:
        The finished PDF document. With empty ``all_data`` the document holds
        only the cover and a "No data available." notice.
    """
    pdf = LivePulsePDF()
    pdf.add_page()

    # Cover
    pdf.set_fill_color(*_C_BG)
    pdf.rect(0, 20, 210, 40, "F")
    pdf.set_font("Helvetica", "B", 20)
    pdf.set_text_color(*_C_TEXT1)
    pdf.set_y(28)
    pdf.cell(0, 10, "Dashboard Report", align="C", ln=True)
    pdf.set_font("Helvetica", "", 11)
    pdf.set_text_color(*_C_ACCENT)
    pdf.cell(0, 8, _safe(stream_title, 80), align="C", ln=True)
    pdf.set_font("Helvetica", "", 9)
    pdf.set_text_color(*_C_TEXT2)
    pdf.cell(0, 6, f"Total messages analysed: {len(all_data)}", align="C", ln=True)
    pdf.ln(8)

    if not all_data:
        pdf.set_font("Helvetica", "", 11)
        pdf.set_text_color(*_C_NEG)
        pdf.cell(0, 10, "No data available.", align="C")
        return bytes(pdf.output())

    # Pre-compute
    c_total = len(all_data)  # non-zero: the empty case returned above
    sentiment_counts = Counter(m.get("sentiment", "Neutral") for m in all_data)
    c_pos = sentiment_counts["Positive"]
    c_neu = sentiment_counts["Neutral"]
    c_neg = sentiment_counts["Negative"]

    topic_counts = Counter(m.get("topic", "General") for m in all_data)
    action_counts = Counter(
        a for a in (m.get("action_type", "N/A") for m in all_data)
        if a not in ("N/A", "", None)
    )

    # One pass for per-author totals and sentiment splits. Both use the same
    # defaults as the global counts, so messages without an author are really
    # attributed to "Unknown" (previously that row always showed 0%).
    author_counts: Counter = Counter()
    author_sentiments: Counter = Counter()
    for m in all_data:
        who = m.get("author", "Unknown")
        author_counts[who] += 1
        author_sentiments[who, m.get("sentiment", "Neutral")] += 1

    eng_score, rate, pos_ratio, q_density = _engagement_metrics(all_data[-50:])

    row_plain, row_alt = _C_BG, (20, 20, 35)

    # Section 1: Engagement Summary
    pdf.section_title("Engagement Summary", "Live")
    y0 = pdf.get_y()
    pdf.stat_box("Engagement", str(eng_score), _C_ACCENT, 15, y0)
    pdf.stat_box("Positive", f"{c_pos} ({c_pos/c_total*100:.0f}%)", _C_POS, 59, y0)
    pdf.stat_box("Neutral", f"{c_neu} ({c_neu/c_total*100:.0f}%)", _C_NEU, 103, y0)
    pdf.stat_box("Negative", f"{c_neg} ({c_neg/c_total*100:.0f}%)", _C_NEG, 147, y0)
    pdf.set_y(y0 + 22)
    y1 = pdf.get_y()
    pdf.stat_box("Total Msgs", str(c_total), _C_TEXT2, 15, y1)
    pdf.stat_box("Msgs/min", f"{rate:.1f}", _C_ACCENT, 59, y1)
    pdf.stat_box("Pos ratio", f"{pos_ratio*100:.0f}%", _C_POS, 103, y1)
    pdf.stat_box("Q density", f"{q_density*100:.0f}%", _C_NEU, 147, y1)
    pdf.set_y(y1 + 22)
    pdf.ln(4)

    # Section 2: Topic Distribution
    pdf.section_title("Topic Distribution", "All Time")
    max_topic = max(topic_counts.values(), default=1)
    for topic in ["Appreciation", "Question", "Request/Feedback", "Promo", "Spam", "General", "MCQ Answer"]:
        pdf.h_bar(topic, topic_counts.get(topic, 0), max_topic, TOPIC_COLORS.get(topic, _C_TEXT2))
    pdf.ln(4)

    # Section 3: Action Types
    if action_counts:
        pdf.section_title("Top Action Types", "Questions & Requests")
        max_action = max(action_counts.values(), default=1)
        for action, count in action_counts.most_common(15):
            # str() first: an action type is not guaranteed to be a string.
            pdf.h_bar(str(action)[:40], count, max_action, _C_ACCENT)
        pdf.ln(4)

    # Section 4: Top Contributors
    pdf.section_title("Top Contributors", "All Time")
    cols = [("Author", 60), ("Messages", 25), ("Positive%", 30), ("Neutral%", 30), ("Negative%", 30)]
    pdf.table_header(cols)
    for i, (author, count) in enumerate(author_counts.most_common(15)):
        pos_p = round(author_sentiments[author, "Positive"] / count * 100)
        neu_p = round(author_sentiments[author, "Neutral"] / count * 100)
        neg_p = round(author_sentiments[author, "Negative"] / count * 100)
        pdf.table_row([
            (_safe(author, 28), 60), (str(count), 25),
            (f"{pos_p}%", 30), (f"{neu_p}%", 30), (f"{neg_p}%", 30),
        ], alt=(i % 2 == 1))
    pdf.ln(4)

    # Section 5: Recent Comments
    pdf.add_page()
    # Guard the slice: all_data[-0:] would be the *whole* list, not nothing.
    recent_msgs = all_data[-msg_limit:] if msg_limit > 0 else []
    pdf.section_title("Recent Comments", f"Last {len(recent_msgs)} messages")
    cols_c = [("Author", 40), ("Message", 90), ("Sentiment", 22), ("Topic", 28)]
    pdf.table_header(cols_c)
    sent_colors = {"Positive": _C_POS, "Negative": _C_NEG, "Neutral": _C_NEU}
    for i, msg in enumerate(reversed(recent_msgs)):
        author = _safe(msg.get("author", ""), 18)
        text = _safe(msg.get("text", ""), 55)
        # Sanitised like every other cell, so a stray label cannot break Helvetica.
        sent = _safe(msg.get("sentiment", "Neutral"), 12)
        raw_topic = msg.get("topic", "General")
        topic = _safe(raw_topic, 14)
        # Colour is looked up by the full label: the 14-character display form
        # of "Request/Feedback" is not a key of TOPIC_COLORS.
        topic_color = TOPIC_COLORS.get(_safe(raw_topic), _C_TEXT2)
        pdf.set_fill_color(*(row_alt if i % 2 == 1 else row_plain))
        pdf.set_font("Helvetica", "", 7.5)
        pdf.set_text_color(*_C_TEXT1)
        pdf.cell(40, 5.5, author, border=0, fill=True)
        pdf.cell(90, 5.5, text, border=0, fill=True)
        pdf.set_text_color(*sent_colors.get(sent, _C_TEXT2))
        pdf.cell(22, 5.5, sent, border=0, fill=True)
        pdf.set_text_color(*topic_color)
        pdf.cell(28, 5.5, topic, border=0, fill=True)
        pdf.ln(5.5)

    # Section 6: Questions Log
    questions = [m for m in all_data if m.get("topic") == "Question"]
    if questions:
        pdf.add_page()
        pdf.section_title("Questions Asked", f"{len(questions)} total")
        # Widths sum to the 180 mm printable width (they used to sum to 195 mm
        # and ran past the right margin to the page edge).
        w_author, w_text, w_action = 40, 100, 40
        cols_q = [("Author", w_author), ("Question", w_text), ("Action Type", w_action)]
        pdf.table_header(cols_q)
        for i, msg in enumerate(reversed(questions[-100:])):
            author = _safe(msg.get("author", ""), 18)
            text = _safe(msg.get("text", ""), 65)
            action = _safe(msg.get("action_type", "N/A"), 22)
            pdf.set_fill_color(*(row_alt if i % 2 == 1 else row_plain))
            pdf.set_font("Helvetica", "", 7.5)
            pdf.set_text_color(*_C_TEXT1)
            pdf.cell(w_author, 5.5, author, border=0, fill=True)
            pdf.cell(w_text, 5.5, text, border=0, fill=True)
            pdf.set_text_color(*_C_ACCENT)
            pdf.cell(w_action, 5.5, action, border=0, fill=True)
            pdf.ln(5.5)

    return bytes(pdf.output())
300
+
301
+
302
  # ── Colour palette (matches dashboard theme) ──────────────────────────────────
303
  _C_BG = (7, 7, 15) # dark background
304
  _C_CARD = (15, 15, 30)