rairo committed on
Commit
e8d80ac
·
verified ·
1 Parent(s): f2b1c99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +483 -299
app.py CHANGED
@@ -1,16 +1,18 @@
1
  ##############################################################################
2
- # Sozo Business Studio · 10-Jul-2025 (full drop-in)
3
  # • Restores PDF branch alongside fixed Video branch
4
  # • Shared chart-tag grammar across both paths
5
  # • Narrator text cleans scene labels + chart talk
6
  # • Matplotlib animation starts from blank; artists returned (blit=True)
7
  # • Gemini Flash-preview image gen with placeholder fallback
8
  # • Silent-audio fallback keeps mux lengths equal
 
9
  ##############################################################################
10
 
11
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
12
  from pathlib import Path
13
  from typing import Tuple, Dict, List
 
14
 
15
  import streamlit as st
16
  import pandas as pd
@@ -19,6 +21,7 @@ import matplotlib
19
  matplotlib.use("Agg")
20
  import matplotlib.pyplot as plt
21
  from matplotlib.animation import FuncAnimation, FFMpegWriter
 
22
  from fpdf import FPDF, HTMLMixin
23
  from markdown_it import MarkdownIt
24
  from PIL import Image
@@ -27,15 +30,14 @@ import cv2
27
  from langchain_experimental.agents import create_pandas_dataframe_agent
28
  from langchain_google_genai import ChatGoogleGenerativeAI
29
  from google import genai
30
- from google.genai import types # for GenerateContentConfig
31
 
32
  # ─── CONFIG ────────────────────────────────────────────────────────────────
33
-
34
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
35
  st.title("📊 Sozo Business Studio")
36
  st.caption("AI transforms business data into compelling narratives.")
37
 
38
- FPS, WIDTH, HEIGHT = 24, 1280, 720
39
  MAX_CHARTS, VIDEO_SCENES = 5, 5
40
 
41
  API_KEY = os.getenv("GEMINI_API_KEY")
@@ -43,22 +45,17 @@ if not API_KEY:
43
  st.error("⚠️ GEMINI_API_KEY is not set."); st.stop()
44
  GEM = genai.Client(api_key=API_KEY)
45
 
46
- DG_KEY = os.getenv("DEEPGRAM_API_KEY") # optional for narration
47
-
48
- # --- IMPROVED: State management for an interactive, non-freezing UI ---
49
- st.session_state.setdefault("bundle", None)
50
- st.session_state.setdefault("report_md", None)
51
- st.session_state.setdefault("chart_descs", [])
52
- st.session_state.setdefault("generated_charts", {}) # Dict[desc, base64_string]
53
- st.session_state.setdefault("pdf_bytes", None)
54
- st.session_state.setdefault("df", None)
55
- st.session_state.setdefault("current_file_key", None)
56
-
57
 
58
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
59
 
60
- # ─── HELPERS ───────────────────────────────────────────────────────────────
 
 
 
 
61
 
 
62
  def load_dataframe_safely(buf: bytes, name: str) -> Tuple[pd.DataFrame, str]:
63
  """Load CSV/Excel, return (df, err)."""
64
  try:
@@ -90,8 +87,13 @@ def deepgram_tts(txt: str) -> Tuple[bytes, str]:
90
  r = requests.post(
91
  "https://api.deepgram.com/v1/speak",
92
  params={"model": "aura-2-andromeda-en"},
93
- headers={"Authorization": f"Token {DG_KEY}", "Content-Type": "application/json"},
94
- json={"text": txt}, timeout=30)
 
 
 
 
 
95
  r.raise_for_status()
96
  return r.content, r.headers.get("Content-Type", "audio/mpeg")
97
  except Exception:
@@ -99,78 +101,96 @@ def deepgram_tts(txt: str) -> Tuple[bytes, str]:
99
 
100
  def generate_silence_mp3(duration: float, out: Path):
101
  subprocess.run(
102
- ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
103
- "-t", f"{duration:.3f}", "-q:a", "9", str(out)],
104
- check=True, capture_output=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
105
 
106
  def audio_duration(path: str) -> float:
107
  try:
108
  res = subprocess.run(
109
- ["ffprobe", "-v", "error", "-show_entries", "format=duration",
110
- "-of", "default=nw=1:nk=1", path],
111
- text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
 
 
 
 
 
 
 
 
 
 
 
 
112
  return float(res.stdout.strip())
113
  except Exception:
114
  return 5.0
115
 
116
  TAG_RE = re.compile(
117
- r'[<[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>"\'\]]+?)["\']?\s*[>\]]',
118
- re.I)
119
- extract_chart_tags = lambda t: list(dict.fromkeys(m.group("d").strip()
120
- for m in TAG_RE.finditer(t or "")))
121
-
122
- # --- FIXED: Escaped the hyphen to treat it as a literal character ---
123
- re_scene = re.compile(r"^\s*scene\s*\d+[:\.- ]*", re.I)
124
-
125
  def clean_narration(txt: str) -> str:
126
  txt = re_scene.sub("", txt)
127
  txt = TAG_RE.sub("", txt)
128
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
129
- txt = re.sub(r"\s{2,}", " ", txt).strip()
130
- return txt
131
 
132
  # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
133
-
134
  def placeholder_img() -> Image.Image:
135
  return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
136
 
137
- @st.cache_data(show_spinner="Generating image...")
138
  def generate_image_from_prompt(prompt: str) -> Image.Image:
139
  model_main = "gemini-2.0-flash-exp-image-generation"
140
  model_fallback = "gemini-2.0-flash-preview-image-generation"
141
  full_prompt = "A clean business-presentation illustration: " + prompt
142
 
143
  def fetch(model_name):
144
- try:
145
- res = GEM.models.generate_content(
146
- model=model_name,
147
- contents=full_prompt,
148
- generation_config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
149
- )
150
- for part in res.candidates[0].content.parts:
151
- if getattr(part, "inline_data", None):
152
- return Image.open(io.BytesIO(part.inline_data.data)).convert("RGB")
153
- return None
154
- except Exception:
155
- return None
156
-
157
- img = fetch(model_main) or fetch(model_fallback)
158
- return img if img else placeholder_img()
159
 
160
- # ─── PDF & REPORT GENERATION (REFACTORED) ──────────────────────────────────
 
 
 
 
161
 
 
162
  class PDF(FPDF, HTMLMixin):
163
  pass
164
 
165
  def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
166
- """Builds a PDF from markdown text and a dictionary of chart descriptions to base64 image strings."""
167
- def replacer(match):
168
- desc = match.group("d").strip()
169
- if desc in charts and charts[desc]:
170
- return f'<img src="data:image/png;base64,{charts[desc]}">'
171
- return ""
172
-
173
- html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(TAG_RE.sub(replacer, md))
174
  pdf = PDF()
175
  pdf.set_auto_page_break(True, margin=15)
176
  pdf.add_page()
@@ -179,251 +199,404 @@ def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
179
  pdf.ln(3)
180
  pdf.set_font("Arial", "", 11)
181
  pdf.write_html(html)
182
- return bytes(pdf.output(dest="S"))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
183
 
184
- def generate_report_text(df: pd.DataFrame, ctx: str) -> Tuple[str, List[str]]:
185
- """Generates only the text part of the report. This is the fast, first step."""
186
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
187
-
188
  ctx_dict = {
189
- "shape": df.shape, "columns": list(df.columns), "user_ctx": ctx or "General business analysis",
190
- "data_sample": df.head().to_dict('records'),
 
 
191
  "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
192
- "missing_values": {col: int(count) for col, count in df.isnull().sum().to_dict().items() if count > 0},
193
- "numeric_summary": df.describe().to_dict() if not df.select_dtypes(include=np.number).empty else {}
 
 
 
 
 
 
 
194
  }
195
- cols = ", ".join(ctx_dict["columns"][:8])
 
196
  report_prompt = f"""
197
  You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
 
198
  **Dataset Analysis Context:**
199
- {json.dumps(ctx_dict, indent=2, default=str)}
 
200
  **Instructions:**
201
- 1. **Identify Data Domain**: First, determine what type of data this represents.
202
  2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
203
- 3. **Data Quality Assessment**: Comment on data completeness and reliability.
204
- 4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain.
205
- 5. **Strategic Recommendations**: Offer concrete, actionable recommendations.
206
- 6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like:
207
- `<generate_chart: "chart_type | specific description">`
 
 
 
 
208
  Valid chart types: bar, pie, line, scatter, hist
209
  Base every chart on actual columns: {cols}
210
- 7. **Format Requirements**: Use professional business language and clear headers (## Executive Summary, etc.).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  """
 
 
212
  md = llm.invoke(report_prompt).content
213
  chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
214
- return md, chart_descs
215
-
216
- def generate_single_chart(description: str, df: pd.DataFrame) -> str:
217
- """Generates one chart using the agent and returns it as a base64 string. More reliable."""
218
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
219
- agent = create_pandas_dataframe_agent(
220
- llm=llm, df=df, verbose=False, allow_dangerous_code=True,
221
- agent_type="openai-functions", handle_parsing_errors=True
222
- )
223
- chart_prompt = f"""
224
- Your task is to generate Python code to create a single, static, professional chart using matplotlib based on the provided dataframe `df`.
225
- The user's request is: '{description}'.
226
-
227
- Follow these rules strictly:
228
- 1. The dataframe is already loaded and available as a variable named `df`.
229
- 2. Generate only the Python code to produce the plot. Do not add any explanation or surrounding text.
230
- 3. Use `plt.figure()` to create a new figure for the plot.
231
- 4. Add a clear title and labels to the axes.
232
- 5. DO NOT use `st.pyplot()` or `plt.show()`. The code will be executed to save the figure.
233
- 6. Ensure the final code block is pure Python.
234
- """
235
- for _ in range(2): # Retry once on failure
 
 
 
 
 
 
 
 
 
 
236
  try:
237
- response = agent.invoke({"input": chart_prompt})
238
- code_to_execute = response['output'].strip().replace("```python", "").replace("```", "")
239
-
240
- fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
241
- exec_globals = {'df': df, 'pd': pd, 'np': np, 'plt': plt, 'fig': fig, 'ax': ax}
242
- exec(code_to_execute, exec_globals)
243
-
244
- if fig.axes and any(ax.get_children() for ax in fig.axes):
245
- buf = io.BytesIO()
246
- fig.savefig(buf, format="png", dpi=150, bbox_inches="tight", facecolor="white")
247
- plt.close(fig)
248
- return base64.b64encode(buf.getvalue()).decode()
249
- plt.close(fig)
250
- except Exception as e:
251
- st.warning(f"Chart generation attempt failed: {e}")
252
- plt.close("all")
253
- return None
254
-
255
- # ─── ANIMATION HELPERS (YOUR ORIGINAL CODE) ────────────────────────────────
256
-
257
- def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path, fps: int = FPS) -> str:
 
 
258
  frames = max(int(dur * fps), fps)
259
- vid = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"), fps, (WIDTH, HEIGHT))
 
260
  blank = np.full_like(img_cv2, 255)
261
  for i in range(frames):
262
- a = i / (frames - 1) if frames > 1 else 1.0
263
  vid.write(cv2.addWeighted(blank, 1 - a, img_cv2, a, 0))
264
  vid.release()
265
  return str(out)
266
 
267
- def animate_chart(desc: str, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
268
- """Render an animated chart whose clip length equals the audio length `dur`."""
 
269
  ctype, *rest = [s.strip().lower() for s in desc.split("|", 1)]
270
  ctype = ctype or "bar"
271
  title = rest[0] if rest else desc
272
 
 
273
  if ctype == "pie":
274
- cat_cols = df.select_dtypes(exclude="number").columns
275
- num_cols = df.select_dtypes(include="number").columns
276
- if not cat_cols.any() or not num_cols.any(): raise ValueError("Pie chart requires one categorical and one numeric column.")
277
- cat, num = cat_cols[0], num_cols[0]
278
  plot_df = df.groupby(cat)[num].sum().sort_values(ascending=False).head(8)
279
  elif ctype in ("bar", "hist"):
280
- num_cols = df.select_dtypes(include="number").columns
281
- if not num_cols.any(): raise ValueError(f"{ctype} chart requires a numeric column.")
282
- num = num_cols[0]
283
  plot_df = df[num]
284
- else: # line / scatter
285
- num_cols = df.select_dtypes(include="number").columns
286
- if len(num_cols) < 2: raise ValueError("Line/scatter chart requires at least two numeric columns.")
287
- plot_df = df[list(num_cols[:2])].sort_index()
288
 
289
  frames = max(10, int(dur * fps))
290
  fig, ax = plt.subplots(figsize=(WIDTH / 100, HEIGHT / 100), dpi=100)
291
 
292
- artists = []
293
  if ctype == "pie":
294
- wedges, _ = ax.pie(np.zeros_like(plot_df.values), labels=plot_df.index, startangle=90)
295
- ax.set_title(title); artists.extend(wedges)
296
- def init(): [w.set_alpha(0) for w in wedges]; return artists
 
297
  def update(i):
298
  a = i / (frames - 1)
299
- wedges, _ = ax.pie(plot_df.values * a, labels=plot_df.index, startangle=90)
300
  for w in wedges: w.set_alpha(a)
301
  return wedges
 
302
  elif ctype == "bar":
303
  bars = ax.bar(plot_df.index, np.zeros_like(plot_df.values), color="#1f77b4")
304
- ax.set_ylim(0, plot_df.max() * 1.1); ax.set_title(title); artists.extend(bars)
305
- def init(): return artists
 
306
  def update(i):
307
  a = i / (frames - 1)
308
- for b, h in zip(bars, plot_df.values): b.set_height(h * a)
309
- return artists
 
 
310
  elif ctype == "hist":
311
  _, _, patches = ax.hist(plot_df, bins=20, color="#1f77b4", alpha=0)
312
- ax.set_title(title); artists.extend(patches)
313
- def init(): [p.set_alpha(0) for p in patches]; return artists
 
314
  def update(i):
315
  a = i / (frames - 1)
316
  for p in patches: p.set_alpha(a)
317
- return artists
 
318
  elif ctype == "scatter":
319
- pts = ax.scatter(plot_df.iloc[:, 0], plot_df.iloc[:, 1], s=10, alpha=0)
320
- ax.set_title(title); ax.grid(alpha=.3); artists.append(pts)
321
- def init(): pts.set_alpha(0); return artists
322
- def update(i): pts.set_alpha(i / (frames - 1)); return artists
 
 
 
 
323
  else: # line
324
  line, = ax.plot([], [], lw=2)
325
- x_full = plot_df.iloc[:, 0]
326
- y_full = plot_df.iloc[:, 1]
327
- ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min(), y_full.max())
328
- ax.set_title(title); ax.grid(alpha=.3); artists.append(line)
329
- def init(): line.set_data([], []); return artists
 
 
 
 
330
  def update(i):
331
  k = max(2, int(len(x_full) * i / (frames - 1)))
332
  line.set_data(x_full[:k], y_full.iloc[:k])
333
- return artists
334
-
335
- anim = FuncAnimation(fig, update, init_func=init, frames=frames, blit=True, interval=1000 / fps)
336
- anim.save(str(out), writer=FFMpegWriter(fps=fps, metadata={'artist': 'Sozo'}), dpi=144)
 
 
 
 
337
  plt.close(fig)
338
  return str(out)
339
 
340
  def safe_chart(desc, df, dur, out):
341
  try:
342
  return animate_chart(desc, df, dur, out)
343
- except Exception as e:
344
- st.warning(f"Animated chart failed ('{desc}'): {e}. Using static fallback.")
345
  with plt.ioff():
346
- fig, ax = plt.subplots()
347
- try:
348
- df.select_dtypes(include=np.number).plot(ax=ax)
349
- ax.set_title(desc)
350
- except Exception:
351
- ax.text(0.5, 0.5, 'Could not render chart', ha='center', va='center')
352
-
353
  p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
354
- fig.savefig(p, bbox_inches="tight"); plt.close(fig)
355
- img_path = str(p)
356
- img = cv2.imread(img_path)
357
- if img is None: # Handle case where image read fails
358
- img = np.full((HEIGHT, WIDTH, 3), 230, dtype=np.uint8) # Fallback gray image
359
- img_resized = cv2.resize(img, (WIDTH, HEIGHT))
360
- return animate_image_fade(img_resized, dur, out)
361
 
362
  def concat_media(paths: List[str], out: Path, kind="video"):
363
- if not paths: return
364
- lst_path = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.txt"
365
- with lst_path.open("w", encoding="utf-8") as f:
366
- for p in paths:
367
- if Path(p).exists() and Path(p).stat().st_size > 0:
368
- f.write(f"file '{Path(p).resolve().as_posix()}'\n")
369
- if not lst_path.is_file() or lst_path.stat().st_size == 0:
370
- if lst_path.is_file(): lst_path.unlink()
371
  return
372
-
373
- cmd = ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", str(lst_path), "-c", "copy", str(out)]
374
- subprocess.run(cmd, check=True, capture_output=True)
375
- lst_path.unlink(missing_ok=True)
376
-
377
- # ─── VIDEO GENERATION (YOUR ORIGINAL CODE) ─────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
378
 
 
379
  def build_story_prompt(ctx_dict):
380
  cols = ", ".join(ctx_dict["columns"][:6])
381
  return f"""
382
  You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.
 
383
  **Complete Dataset Context:**
384
- {json.dumps(ctx_dict, indent=2, default=str)}
 
385
  **Task Requirements:**
386
  1. **Identify the Data Story**: Determine what business domain this data represents and what story it tells
387
  2. **Create {VIDEO_SCENES} distinct scenes** that build a logical narrative arc
388
  3. **Each scene must contain:**
389
  - 1-2 sentences of clear, professional narration (plain English, no jargon)
390
  - Exactly one chart tag: `<generate_chart: "chart_type | specific description">`
 
391
  **Chart Guidelines:**
392
- - Valid types: bar, pie, line, scatter, hist
393
- - Base all charts on actual columns: {cols}
 
 
 
 
 
 
 
394
  **Narrative Structure:**
395
- - Scene 1: Set the context and introduce the main story
396
- - Middle scenes: Develop key insights and supporting evidence
397
- - Final scene: Conclude with actionable takeaways or future outlook
398
- **Output Format:**
399
- Separate each scene with exactly [SCENE_BREAK]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
400
  """
401
 
402
  def generate_video(buf: bytes, name: str, ctx: str, key: str):
403
  try:
404
  subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
405
  except Exception:
406
- st.error("🔴 FFmpeg not available — cannot render video."); return None
 
407
 
408
  df, err = load_dataframe_safely(buf, name)
409
  if err:
410
- st.error(err); return None
 
 
 
 
 
411
 
412
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.2)
413
  ctx_dict = {
414
- "shape": df.shape, "columns": list(df.columns), "user_ctx": ctx or "General business analysis",
415
- "data_sample": df.head().to_dict('records'),
416
- "numeric_summary": df.describe().to_dict() if not df.select_dtypes(include=np.number).empty else {}
 
 
 
 
 
 
 
 
417
  }
 
418
  script = llm.invoke(build_story_prompt(ctx_dict)).content
419
  scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]
420
 
421
  video_parts, audio_parts, temps = [], [], []
422
  for idx, sc in enumerate(scenes[:VIDEO_SCENES]):
423
- st.progress((idx + 1) / VIDEO_SCENES, text=f"Rendering Scene {idx + 1}/{VIDEO_SCENES}")
 
 
 
 
424
  descs = extract_chart_tags(sc)
425
  narrative = clean_narration(sc)
426
 
 
427
  audio_bytes, _ = deepgram_tts(narrative)
428
  mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
429
  if audio_bytes:
@@ -432,127 +605,138 @@ def generate_video(buf: bytes, name: str, ctx: str, key: str):
432
  else:
433
  dur = 5.0
434
  generate_silence_mp3(dur, mp3)
435
- audio_parts.append(str(mp3)); temps.append(mp3)
 
436
 
 
437
  mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
438
  if descs:
439
  safe_chart(descs[0], df, dur, mp4)
440
  else:
441
  img = generate_image_from_prompt(narrative)
442
- img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
 
 
443
  animate_image_fade(img_cv, dur, mp4)
444
- video_parts.append(str(mp4)); temps.append(mp4)
 
445
 
 
446
  silent_vid = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
447
  concat_media(video_parts, silent_vid, "video")
448
  audio_mix = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
449
  concat_media(audio_parts, audio_mix, "audio")
450
 
451
  final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
452
- if silent_vid.exists() and silent_vid.stat().st_size > 0 and audio_mix.exists() and audio_mix.stat().st_size > 0:
453
- subprocess.run(
454
- ["ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix),
455
- "-c:v", "copy", "-c:a", "aac", "-shortest", str(final_vid)],
456
- check=True, capture_output=True)
457
- else:
458
- st.error("Failed to generate video or audio components.")
459
- return None
 
 
 
 
 
 
 
 
 
 
460
 
461
  for p in temps + [silent_vid, audio_mix]:
462
  p.unlink(missing_ok=True)
 
463
  return str(final_vid)
464
 
465
- # ─── UI & WORKFLOW (RESTRUCTURED FOR RESPONSIVENESS) ───────────────────────
 
 
 
466
 
467
- mode = st.radio("Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True)
468
  upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
469
-
470
  if upl:
471
- file_key = sha1_bytes(upl.getvalue())
472
- if file_key != st.session_state.current_file_key:
473
- st.session_state.report_md = None
474
- st.session_state.chart_descs = []
475
- st.session_state.generated_charts = {}
476
- st.session_state.pdf_bytes = None
477
- st.session_state.bundle = None
478
- st.session_state.current_file_key = file_key
479
- df, err = load_dataframe_safely(upl.getvalue(), upl.name)
480
- if err:
481
- st.error(f"Error loading data: {err}")
482
- st.session_state.df = None
483
- else:
484
- st.session_state.df = df
485
- st.rerun()
486
 
487
- if st.session_state.get("df") is not None:
488
- with st.expander("📊 Data Preview", expanded=True):
489
- st.dataframe(arrow_df(st.session_state.df.head()))
490
- ctx = st.text_area("Business context or specific instructions (optional)")
 
491
 
492
  if mode == "Report (PDF)":
493
- if st.button("🚀 Generate Report", type="primary", disabled=(st.session_state.report_md is not None)):
494
- with st.spinner("Analyzing data and drafting report..."):
495
- md, descs = generate_report_text(st.session_state.df, ctx)
496
- st.session_state.report_md = md
497
- st.session_state.chart_descs = descs
498
- st.rerun()
499
- else: # Video Mode
500
- if st.button("🎬 Generate Video", type="primary"):
501
- st.warning("Video generation is a long process and will lock the UI.")
502
- with st.spinner("Generating video... This may take several minutes."):
503
- key = st.session_state.current_file_key
504
- path = generate_video(upl.getvalue(), upl.name, ctx, key)
505
- if path:
506
- st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
507
- st.rerun()
508
-
509
- # ─── OUTPUT DISPLAY ────────────────────────────────────────────────────────
510
-
511
- if st.session_state.get("report_md"):
 
 
 
 
 
 
 
512
  st.subheader("📄 Generated Report")
513
-
514
- preview_md = st.session_state.report_md
515
- for desc, b64_data in st.session_state.generated_charts.items():
516
- if b64_data:
517
- img_tag = f'<img src="data:image/png;base64,{b64_data}" width="600">'
518
- preview_md = TAG_RE.sub(lambda m: img_tag if m.group("d").strip() == desc else m.group(0), preview_md, count=1)
519
-
520
- preview_md = TAG_RE.sub("[Chart will be generated here]", preview_md)
521
-
522
  with st.expander("View Report", expanded=True):
523
- st.markdown(preview_md, unsafe_allow_html=True)
524
-
525
- pending_charts = [d for d in st.session_state.chart_descs if d not in st.session_state.generated_charts]
526
- if pending_charts:
527
- if st.button("📊 Generate Visualizations", use_container_width=True, type="primary"):
528
- for desc in pending_charts:
529
- with st.spinner(f"Generating chart: {desc}"):
530
- b64_image = generate_single_chart(desc, st.session_state.df)
531
- st.session_state.generated_charts[desc] = b64_image
532
- st.rerun()
533
-
534
- all_charts_processed = st.session_state.chart_descs and len(st.session_state.generated_charts) == len(st.session_state.chart_descs)
535
- if all_charts_processed:
536
  c1, c2 = st.columns(2)
537
  with c1:
538
- if st.session_state.pdf_bytes is None:
539
- with st.spinner("Building PDF..."):
540
- st.session_state.pdf_bytes = build_pdf(st.session_state.report_md, st.session_state.generated_charts)
541
- st.download_button("Download PDF", st.session_state.pdf_bytes, "business_report.pdf", "application/pdf", use_container_width=True)
 
 
 
542
  with c2:
543
- if DG_KEY and st.button("🔊 Narrate Summary", use_container_width=True):
544
- txt = clean_narration(st.session_state.report_md)
545
  audio, mime = deepgram_tts(txt)
546
- st.audio(audio, format=mime) if audio else st.error("Narration failed.")
547
-
548
- elif bundle := st.session_state.get("bundle"):
549
- if bundle["type"] == "video":
550
- st.subheader("🎬 Generated Video Narrative")
551
- vp = bundle["video_path"]
552
- if Path(vp).exists():
553
- with open(vp, "rb") as f:
554
- st.video(f.read())
555
- with open(vp, "rb") as f:
556
- st.download_button("Download Video", f, f"sozo_narrative_{bundle['key'][:8]}.mp4", "video/mp4")
557
- else:
558
- st.error("Video file missing – generation may have failed.")
 
 
 
 
 
 
 
 
 
 
 
1
  ##############################################################################
2
+ # Sozo Business Studio · 10-Jul-2025
3
  # • Restores PDF branch alongside fixed Video branch
4
  # • Shared chart-tag grammar across both paths
5
  # • Narrator text cleans scene labels + chart talk
6
  # • Matplotlib animation starts from blank; artists returned (blit=True)
7
  # • Gemini Flash-preview image gen with placeholder fallback
8
  # • Silent-audio fallback keeps mux lengths equal
9
+ # • NEW (2025-07-06): Lazy-loading of PDF charts + st.rerun()
10
  ##############################################################################
11
 
12
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
13
  from pathlib import Path
14
  from typing import Tuple, Dict, List
15
+ from concurrent.futures import ThreadPoolExecutor
16
 
17
  import streamlit as st
18
  import pandas as pd
 
21
  matplotlib.use("Agg")
22
  import matplotlib.pyplot as plt
23
  from matplotlib.animation import FuncAnimation, FFMpegWriter
24
+
25
  from fpdf import FPDF, HTMLMixin
26
  from markdown_it import MarkdownIt
27
  from PIL import Image
 
30
  from langchain_experimental.agents import create_pandas_dataframe_agent
31
  from langchain_google_genai import ChatGoogleGenerativeAI
32
  from google import genai
33
+ from google.genai import types # GenerateContentConfig
34
 
35
  # ─── CONFIG ────────────────────────────────────────────────────────────────
 
36
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
37
  st.title("📊 Sozo Business Studio")
38
  st.caption("AI transforms business data into compelling narratives.")
39
 
40
+ FPS, WIDTH, HEIGHT = 24, 1280, 720
41
  MAX_CHARTS, VIDEO_SCENES = 5, 5
42
 
43
  API_KEY = os.getenv("GEMINI_API_KEY")
 
45
  st.error("⚠️ GEMINI_API_KEY is not set."); st.stop()
46
  GEM = genai.Client(api_key=API_KEY)
47
 
48
+ DG_KEY = os.getenv("DEEPGRAM_API_KEY") # optional narration
 
 
 
 
 
 
 
 
 
 
49
 
50
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
51
 
52
# ─── LAZY-LOADING SCAFFOLDING ──────────────────────────────────────────────
# Worker pool for chart generation ("parallel chart threads" per original
# comment); the call sites are outside this chunk — confirm before relying on it.
EXEC = ThreadPoolExecutor(max_workers=4)
# Report cache keyed per upload: file key → report dict.
if "lazy_reports" not in st.session_state:
    st.session_state.lazy_reports = {}
# Result holder for the video branch (None until a video is generated).
st.session_state.setdefault("bundle", None)
57
 
58
+ # ─── HELPERS ───────────────────────────────────────────────────────────────
59
  def load_dataframe_safely(buf: bytes, name: str) -> Tuple[pd.DataFrame, str]:
60
  """Load CSV/Excel, return (df, err)."""
61
  try:
 
87
  r = requests.post(
88
  "https://api.deepgram.com/v1/speak",
89
  params={"model": "aura-2-andromeda-en"},
90
+ headers={
91
+ "Authorization": f"Token {DG_KEY}",
92
+ "Content-Type": "application/json",
93
+ },
94
+ json={"text": txt},
95
+ timeout=30,
96
+ )
97
  r.raise_for_status()
98
  return r.content, r.headers.get("Content-Type", "audio/mpeg")
99
  except Exception:
 
101
 
102
def generate_silence_mp3(duration: float, out: Path):
    """Write `duration` seconds of silent mono MP3 audio to `out` via ffmpeg.

    Used as the fallback audio track so the muxed video and audio lengths
    stay equal when TTS produces nothing. Raises CalledProcessError if
    ffmpeg exits non-zero (check=True).
    """
    cmd = [
        "ffmpeg", "-y",
        "-f", "lavfi",
        "-i", "anullsrc=r=44100:cl=mono",   # silent source, 44.1 kHz mono
        "-t", f"{duration:.3f}",
        "-q:a", "9",                        # lowest-bitrate VBR — content is silence
        str(out),
    ]
    subprocess.run(cmd, check=True, capture_output=True)
120
 
121
def audio_duration(path: str) -> float:
    """Return the media duration of `path` in seconds, probed with ffprobe.

    Any failure (ffprobe missing, file unreadable, unparsable output)
    falls back to a default of 5.0 seconds.
    """
    probe_cmd = [
        "ffprobe", "-v", "error",
        "-show_entries", "format=duration",
        "-of", "default=nw=1:nk=1",   # bare value, no key or wrapper
        path,
    ]
    try:
        probe = subprocess.run(
            probe_cmd,
            text=True,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )
        return float(probe.stdout.strip())
    except Exception:
        return 5.0
142
 
143
# Grammar for chart tags like <generate_chart: "bar | revenue by region">;
# accepts <...> or [...] delimiters, optional :/=, and straight or smart quotes.
TAG_RE = re.compile(
    r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]',
    re.I,
)

def extract_chart_tags(t: str) -> List[str]:
    """Return the unique chart descriptions found in `t`, in first-seen order.

    `None` or empty input yields an empty list.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")))

# Leading "Scene 3:" style labels; the hyphen is escaped so the class is literal.
re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)

def clean_narration(txt: str) -> str:
    """Strip scene labels, chart tags and parentheticals; collapse whitespace."""
    txt = re_scene.sub("", txt)
    txt = TAG_RE.sub("", txt)
    txt = re.sub(r"\s*\([^)]*\)", "", txt)  # drop stage directions like "(pause)"
    return re.sub(r"\s{2,}", " ", txt).strip()
 
157
 
158
  # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
 
159
def placeholder_img() -> Image.Image:
    """Uniform light-grey frame shown when image generation yields nothing."""
    light_grey = (230, 230, 230)
    return Image.new("RGB", (WIDTH, HEIGHT), light_grey)
161
 
 
162
def generate_image_from_prompt(prompt: str) -> Image.Image:
    """Generate an illustration for `prompt` with Gemini image models.

    Tries the experimental model first, then the preview model; any API
    failure or an empty response falls back to the grey placeholder frame.
    """
    model_main = "gemini-2.0-flash-exp-image-generation"
    model_fallback = "gemini-2.0-flash-preview-image-generation"
    full_prompt = "A clean business-presentation illustration: " + prompt

    def _ask(model_name):
        # Request image-only output; return the first inline image part, if any.
        res = GEM.models.generate_content(
            model=model_name,
            contents=full_prompt,
            config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
        )
        part = next(
            (p for p in res.candidates[0].content.parts
             if getattr(p, "inline_data", None)),
            None,
        )
        if part is None:
            return None
        return Image.open(io.BytesIO(part.inline_data.data)).convert("RGB")

    try:
        picture = _ask(model_main) or _ask(model_fallback)
    except Exception:
        picture = None
    return picture if picture else placeholder_img()
183
 
184
+ # ─── PDF GENERATION ────────────────────────────────────────────────────────
185
class PDF(FPDF, HTMLMixin):
    """FPDF subclass mixing in HTMLMixin so `write_html()` can render the
    Markdown-derived report HTML."""
    pass
187
 
188
  def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
189
+ html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(
190
+ TAG_RE.sub(
191
+ lambda m: f'<img src="{charts.get(m.group("d").strip(), "")}">', md
192
+ )
193
+ )
 
 
 
194
  pdf = PDF()
195
  pdf.set_auto_page_break(True, margin=15)
196
  pdf.add_page()
 
199
  pdf.ln(3)
200
  pdf.set_font("Arial", "", 11)
201
  pdf.write_html(html)
202
+ return pdf.output(dest="S").encode("latin-1")
203
+
204
+ # ─── QUICK STATIC CHART (fallback if LLM code fails) ───────────────────────
205
def quick_chart(desc: str, df: pd.DataFrame, out: Path):
    """Render a simple static chart for `desc` ("type | title") to `out` as PNG.

    Fallback renderer used when LLM-generated chart code fails. Supported
    types: pie, line, scatter, hist; anything else becomes a bar chart of
    value counts. FIX: the original bar fallback indexed `num_cols[0]`
    unconditionally, so a frame with no numeric columns (or an empty frame)
    raised IndexError — now it bars the first categorical column, or draws
    a "No data available" notice.
    """
    ctype, *rest = [s.strip().lower() for s in desc.split("|", 1)]
    ctype = ctype or "bar"
    title = rest[0] if rest else desc
    num_cols = df.select_dtypes("number").columns
    cat_cols = df.select_dtypes(exclude="number").columns

    with plt.ioff():
        fig, ax = plt.subplots(figsize=(6, 3.4), dpi=150)
        if ctype == "pie" and len(cat_cols) >= 1 and len(num_cols) >= 1:
            plot = df.groupby(cat_cols[0])[num_cols[0]].sum().head(8)
            ax.pie(plot, labels=plot.index, autopct="%1.1f%%", startangle=90)
        elif ctype == "line" and len(num_cols) >= 1:
            df[num_cols[0]].plot(kind="line", ax=ax)
        elif ctype == "scatter" and len(num_cols) >= 2:
            ax.scatter(df[num_cols[0]], df[num_cols[1]], s=10, alpha=0.7)
        elif ctype == "hist" and len(num_cols) >= 1:
            ax.hist(df[num_cols[0]], bins=20, alpha=0.7)
        elif len(num_cols) >= 1:  # bar fallback over first numeric column
            plot = df[num_cols[0]].value_counts().head(10)
            plot.plot(kind="bar", ax=ax)
        elif len(cat_cols) >= 1:  # no numeric data: bar the first categorical column
            plot = df[cat_cols[0]].value_counts().head(10)
            plot.plot(kind="bar", ax=ax)
        else:  # completely empty frame: render a notice instead of crashing
            ax.text(0.5, 0.5, "No data available", ha="center", va="center")
        ax.set_title(title)
        fig.tight_layout()
        fig.savefig(out, bbox_inches="tight", facecolor="white")
        plt.close(fig)
230
+
231
+ # ─── REPORT (STEP 1) — prepare markdown instantly ────────────────────────
232
def prepare_report(buf: bytes, name: str, ctx: str):
    """Step 1 of report generation: produce the Markdown report immediately.

    Args:
        buf:  raw uploaded file bytes.
        name: uploaded filename (used by load_dataframe_safely to pick a parser).
        ctx:  optional user-supplied business context.

    Returns:
        (df, md, chart_descs) — the parsed DataFrame, the LLM-written Markdown
        report, and up to MAX_CHARTS chart-tag descriptions extracted from it;
        or (None, None, None) after surfacing a load error via st.error.

    Charts are NOT rendered here; render_chart_worker fills them in later.
    """
    df, err = load_dataframe_safely(buf, name)
    if err:
        st.error(err)
        return None, None, None

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1
    )

    # ─── original enhanced context & prompt (UNTOUCHED) ───────────────────
    # NOTE(review): "full_dataframe" serializes EVERY row into the prompt —
    # presumably fine for small uploads, but could blow the token budget on
    # large files; confirm an upstream size cap exists.
    ctx_dict = {
        "shape": df.shape,
        "columns": list(df.columns),
        "user_ctx": ctx or "General business analysis",
        "full_dataframe": df.to_dict("records"),
        "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
        "missing_values": {
            col: int(count) for col, count in df.isnull().sum().to_dict().items()
        },
        # describe() only emitted when at least one numeric column exists
        "numeric_summary": {
            col: {stat: float(val) for stat, val in stats.items()}
            for col, stats in df.describe().to_dict().items()
        }
        if len(df.select_dtypes(include=["number"]).columns) > 0
        else {},
    }
    cols = ", ".join(ctx_dict["columns"][:6])

    report_prompt = f"""
You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.

**Dataset Analysis Context:**
{json.dumps(ctx_dict, indent=2)}

**Instructions:**
1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain:
   - Trends and patterns
   - Outliers or anomalies
   - Performance indicators
   - Risk factors or opportunities
5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`

Valid chart types: bar, pie, line, scatter, hist
Base every chart on actual columns: {cols}
Choose chart types strategically:
- bar: for categorical comparisons
- pie: for proportional breakdowns (when categories < 7)
- line: for time series or trends
- scatter: for correlation analysis
- hist: for distribution analysis

7. **Format Requirements**:
   - Use professional business language
   - Include relevant metrics and percentages
   - Structure with clear headers (## Executive Summary, ## Key Insights, etc.)
   - End with ## Next Steps section

**Domain-Specific Focus Areas:**
- If sales data: focus on revenue trends, customer segments, product performance
- If HR data: focus on workforce analytics, retention, performance metrics
- If financial data: focus on profitability, cost analysis, financial health
- If operational data: focus on efficiency, bottlenecks, process optimization
- If customer data: focus on behavior patterns, satisfaction, churn analysis

Generate insights that would be valuable to C-level executives and department heads.
"""
    # ─── end original prompt ───────────────────────────────────────────────

    md = llm.invoke(report_prompt).content
    chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
    return df, md, chart_descs
308
+
309
+ # ─── REPORT (STEP 2) background worker per chart ───────────────────────
310
def render_chart_worker(rep_key: str, desc: str):
    """Generate one chart (LLM + fallback) for a lazily-rendered report.

    Runs on a background executor thread. Looks up the report entry in
    st.session_state.lazy_reports[rep_key], renders the chart described by
    `desc`, records the PNG path (or "" on total failure) in rep["charts"],
    and once the last pending chart finishes, builds the final PDF bytes and
    triggers a Streamlit rerun so the UI picks them up.
    """
    rep = st.session_state.lazy_reports[rep_key]
    df = rep["df"]

    img_path = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
    try:
        agent = create_pandas_dataframe_agent(
            llm=ChatGoogleGenerativeAI(
                model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1
            ),
            df=df,
            verbose=False,
            allow_dangerous_code=True,
        )
        # BUG FIX: the original prompt never told the agent WHERE to save the
        # figure, so img_path.exists() was always False and every chart
        # silently fell through to quick_chart(). Spell out the target path.
        chart_prompt = f"""
Create a professional {desc} chart using matplotlib with these requirements:
1. Use a clean, business-appropriate style
2. Include proper title, axis labels, and legends
3. Apply appropriate color schemes (avoid rainbow colors)
4. Ensure text is readable (font size 10+)
5. Format numbers appropriately (e.g., currency, percentages)
6. Save the figure with high quality to this exact file path: {img_path}
7. Handle any missing or null values appropriately
"""
        agent.run(chart_prompt)
        if not img_path.exists():
            raise RuntimeError("LLM did not save figure")
    except Exception:
        # Best-effort fallback: deterministic static chart; on double failure
        # record "" so the report shows a grey placeholder instead of crashing.
        try:
            quick_chart(desc, df, img_path)
        except Exception:
            img_path = None

    rep["charts"][desc] = str(img_path) if img_path and img_path.exists() else ""
    rep["pending"].discard(desc)

    if not rep["pending"]:
        rep["pdf"] = build_pdf(rep["md"], rep["charts"])
        rep["finished"] = True
        st.rerun()
351
+
352
+ # ─── Helper: inline image or grey placeholder ─────────────────────────────
353
+ def _inline_image_or_placeholder(rep, desc):
354
+ p = rep["charts"].get(desc)
355
+ if p and Path(p).exists():
356
+ b64 = base64.b64encode(Path(p).read_bytes()).decode()
357
+ return f'<img src="data:image/png;base64,{b64}">'
358
+ return '<img height="250" width="400" style="background:#ddd;">'
359
+
360
+ # ─── ANIMATION HELPERS (unchanged) ────────────────────────────────────────
361
def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path,
                       fps: int = FPS) -> str:
    """Write an mp4 clip that fades from white into `img_cv2` over `dur` seconds.

    img_cv2: BGR frame already sized WIDTH x HEIGHT.
    Returns the output path as a string.
    """
    total_frames = max(int(dur * fps), fps)  # never shorter than one second
    writer = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"),
                             fps, (WIDTH, HEIGHT))
    white = np.full_like(img_cv2, 255)
    for frame_idx in range(total_frames):
        alpha = frame_idx / total_frames
        writer.write(cv2.addWeighted(white, 1 - alpha, img_cv2, alpha, 0))
    writer.release()
    return str(out)
372
 
373
def animate_chart(desc: str, df: pd.DataFrame, dur: float, out: Path,
                  fps: int = FPS) -> str:
    """Render an animated chart whose clip length equals `dur`.

    desc format: "<chart_type> | <title>" (bar/pie/line/scatter/hist;
    anything unrecognized falls through to the line branch). Each branch
    defines init/update closures that return their artists, as required by
    FuncAnimation with blit=True, and animates from blank to the full chart.

    NOTE(review): assumes df has at least one numeric column (and one
    categorical column for pie) — otherwise the column indexing raises
    IndexError; safe_chart() catches that and falls back to a static fade.
    """
    ctype, *rest = [s.strip().lower() for s in desc.split("|", 1)]
    ctype = ctype or "bar"
    title = rest[0] if rest else desc

    # prepare data
    if ctype == "pie":
        cat = df.select_dtypes(exclude="number").columns[0]
        num = df.select_dtypes(include="number").columns[0]
        # top 8 categories by summed value
        plot_df = df.groupby(cat)[num].sum().sort_values(ascending=False).head(8)
    elif ctype in ("bar", "hist"):
        num = df.select_dtypes(include="number").columns[0]
        plot_df = df[num]
    else:
        # line/scatter: first two numeric columns (or one, for a simple line)
        cols = df.select_dtypes(include="number").columns[:2]
        plot_df = df[list(cols)].sort_index()

    frames = max(10, int(dur * fps))  # floor so very short narration still animates
    fig, ax = plt.subplots(figsize=(WIDTH / 100, HEIGHT / 100), dpi=100)

    # branches
    if ctype == "pie":
        wedges, _ = ax.pie(plot_df, labels=plot_df.index, startangle=90)
        ax.set_title(title)

        # fade all wedges in together
        def init(): [w.set_alpha(0) for w in wedges]; return wedges
        def update(i):
            a = i / (frames - 1)
            for w in wedges: w.set_alpha(a)
            return wedges

    elif ctype == "bar":
        # bars start at height 0 and grow to their final values
        bars = ax.bar(plot_df.index, np.zeros_like(plot_df.values), color="#1f77b4")
        ax.set_ylim(0, plot_df.max() * 1.1); ax.set_title(title)

        def init(): return bars
        def update(i):
            a = i / (frames - 1)
            for b, h in zip(bars, plot_df.values):
                b.set_height(h * a)
            return bars

    elif ctype == "hist":
        # histogram drawn fully transparent, then faded in
        _, _, patches = ax.hist(plot_df, bins=20, color="#1f77b4", alpha=0)
        ax.set_title(title)

        def init(): [p.set_alpha(0) for p in patches]; return patches
        def update(i):
            a = i / (frames - 1)
            for p in patches: p.set_alpha(a)
            return patches

    elif ctype == "scatter":
        pts = ax.scatter(plot_df.iloc[:, 0], plot_df.iloc[:, 1],
                         s=10, alpha=0)
        ax.set_title(title); ax.grid(alpha=.3)

        def init(): pts.set_alpha(0); return [pts]
        def update(i):
            pts.set_alpha(i / (frames - 1)); return [pts]

    else:  # line
        line, = ax.plot([], [], lw=2)
        # two numeric columns → x/y pair; one column → index as x
        x_full = (plot_df.iloc[:, 0] if plot_df.shape[1] > 1
                  else np.arange(len(plot_df)))
        y_full = (plot_df.iloc[:, 1] if plot_df.shape[1] > 1
                  else plot_df.iloc[:, 0])
        ax.set_xlim(x_full.min(), x_full.max())
        ax.set_ylim(y_full.min(), y_full.max())
        ax.set_title(title); ax.grid(alpha=.3)

        def init(): line.set_data([], []); return [line]
        def update(i):
            # progressively reveal the line, always at least 2 points
            k = max(2, int(len(x_full) * i / (frames - 1)))
            line.set_data(x_full[:k], y_full.iloc[:k])
            return [line]

    anim = FuncAnimation(fig, update, init_func=init,
                         frames=frames, blit=True,
                         interval=1000 / fps)
    anim.save(str(out),
              writer=FFMpegWriter(fps=fps, metadata={'artist':'Sozo'}),
              dpi=144)
    plt.close(fig)
    return str(out)
460
 
461
def safe_chart(desc, df, dur, out):
    """Animate a chart; on any failure fall back to fading in a static plot.

    animate_chart raises for many data shapes (no numeric columns, empty
    frames, etc.); rather than abort video generation, render whatever
    static plot pandas can manage and fade it in so the clip still lasts
    `dur` seconds. Returns the output path string.
    """
    try:
        return animate_chart(desc, df, dur, out)
    except Exception:
        p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
        with plt.ioff():
            # FIX: the original drew on plt.gca(), which could be the
            # half-built figure left behind by the crashed animate_chart;
            # use a fresh figure so the fallback image is clean.
            fig, ax = plt.subplots(figsize=(WIDTH / 100, HEIGHT / 100), dpi=100)
            try:
                numeric = df.select_dtypes("number")
                # df.plot raises on all-categorical data; prefer numeric cols
                (numeric if not numeric.empty else df).plot(ax=ax)
            except Exception:
                ax.axis("off")  # last resort: blank slide keeps timing intact
            fig.savefig(p, bbox_inches="tight")
            plt.close(fig)
        img = cv2.resize(cv2.imread(str(p)), (WIDTH, HEIGHT))
        p.unlink(missing_ok=True)  # original leaked this temp PNG
        return animate_image_fade(img, dur, out)
 
 
 
 
471
 
472
def concat_media(paths: List[str], out: Path, kind="video"):
    """Losslessly concatenate media files with ffmpeg's concat demuxer.

    Args:
        paths: source files; entries that no longer exist are skipped.
        out:   target file.
        kind:  "video" stream-copies video ("-c:v copy"); anything else
               stream-copies audio ("-c:a copy").

    Returns None. Does nothing when no usable input exists (the original
    only guarded the empty-list case, so an all-missing list produced an
    empty concat file and made ffmpeg fail under check=True). The list file
    is now removed even when ffmpeg raises.

    Raises:
        subprocess.CalledProcessError: if ffmpeg exits non-zero.
    """
    existing = [Path(p).resolve() for p in paths if Path(p).exists()]
    if not existing:
        return
    lst = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.txt"
    try:
        lst.write_text("".join(f"file '{p}'\n" for p in existing))
        subprocess.run(
            [
                "ffmpeg",
                "-y",
                "-f",
                "concat",
                "-safe",
                "0",
                "-i",
                str(lst),
                "-c:v" if kind == "video" else "-c:a",
                "copy",
                str(out),
            ],
            check=True,
            capture_output=True,
        )
    finally:
        lst.unlink(missing_ok=True)
498
 
499
# ─── VIDEO GENERATION (original prompt & logic) ────────────────────────────
500
def build_story_prompt(ctx_dict):
    """Build the LLM prompt for a VIDEO_SCENES-scene narrated video script.

    ctx_dict: dataset context dict (shape, columns, user_ctx, full_dataframe,
    data_types, numeric_summary) as assembled in generate_video. Returns the
    prompt string; the LLM's reply delimits scenes with [SCENE_BREAK] and
    embeds one <generate_chart: ...> tag per scene.
    """
    cols = ", ".join(ctx_dict["columns"][:6])
    return f"""
You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.

**Complete Dataset Context:**
{json.dumps(ctx_dict, indent=2)}

**Task Requirements:**
1. **Identify the Data Story**: Determine what business domain this data represents and what story it tells
2. **Create {VIDEO_SCENES} distinct scenes** that build a logical narrative arc
3. **Each scene must contain:**
   - 1-2 sentences of clear, professional narration (plain English, no jargon)
   - Exactly one chart tag: `<generate_chart: "chart_type | specific description">`

**Chart Guidelines:**
- Valid types: bar, pie, line, scatter, hist
- Base all charts on actual columns: {cols}
- Choose chart types that best tell the story:
  * bar: categorical comparisons, rankings
  * pie: proportional breakdowns (≤6 categories)
  * line: trends over time, progression
  * scatter: relationships, correlations
  * hist: distributions, frequency analysis

**Narrative Structure:**
- Scene 1: Set the context and introduce the main story
- Middle scenes: Develop key insights and supporting evidence
- Final scene: Conclude with actionable takeaways or future outlook

**Content Standards:**
- Use conversational, executive-level language
- Include specific data insights (trends, percentages, comparisons)
- Avoid chart descriptions in narration ("as shown in the chart")
- Make each scene self-contained but connected to the overall story
- Focus on business impact and actionable insights

**Domain-Specific Approaches:**
- Sales data: Customer journey, revenue trends, market performance
- HR data: Workforce insights, talent analytics, organizational health
- Financial data: Performance indicators, cost analysis, profitability
- Operational data: Process efficiency, bottlenecks, optimization opportunities
- Customer data: Behavior patterns, satisfaction trends, retention analysis

**Output Format:** Separate each scene with exactly [SCENE_BREAK]

**Example Structure:**
Our company's data reveals fascinating insights about market performance over the past year. Let's explore what the numbers tell us about our growth trajectory.
<generate_chart: "line | monthly revenue growth over 12 months">
[SCENE_BREAK]
Customer acquisition has shown remarkable patterns, with certain segments driving significantly more value than others. The data shows a clear preference emerging in our target markets.
<generate_chart: "bar | customer acquisition by segment">

Create a compelling, data-driven story that executives would find engaging and actionable.
"""
555
 
556
  def generate_video(buf: bytes, name: str, ctx: str, key: str):
557
  try:
558
  subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
559
  except Exception:
560
+ st.error("🔴 FFmpeg not available — cannot render video.")
561
+ return None
562
 
563
  df, err = load_dataframe_safely(buf, name)
564
  if err:
565
+ st.error(err)
566
+ return None
567
+
568
+ llm = ChatGoogleGenerativeAI(
569
+ model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.2
570
+ )
571
 
 
572
  ctx_dict = {
573
+ "shape": df.shape,
574
+ "columns": list(df.columns),
575
+ "user_ctx": ctx or "General business analysis",
576
+ "full_dataframe": df.to_dict("records"),
577
+ "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
578
+ "numeric_summary": {
579
+ col: {stat: float(val) for stat, val in stats.items()}
580
+ for col, stats in df.describe().to_dict().items()
581
+ }
582
+ if len(df.select_dtypes(include=["number"]).columns) > 0
583
+ else {},
584
  }
585
+
586
  script = llm.invoke(build_story_prompt(ctx_dict)).content
587
  scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]
588
 
589
  video_parts, audio_parts, temps = [], [], []
590
  for idx, sc in enumerate(scenes[:VIDEO_SCENES]):
591
+ st.progress(
592
+ (idx + 1) / VIDEO_SCENES,
593
+ text=f"Rendering Scene {idx + 1}/{VIDEO_SCENES}",
594
+ )
595
+
596
  descs = extract_chart_tags(sc)
597
  narrative = clean_narration(sc)
598
 
599
+ # audio
600
  audio_bytes, _ = deepgram_tts(narrative)
601
  mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
602
  if audio_bytes:
 
605
  else:
606
  dur = 5.0
607
  generate_silence_mp3(dur, mp3)
608
+ audio_parts.append(str(mp3))
609
+ temps.append(mp3)
610
 
611
+ # visual
612
  mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
613
  if descs:
614
  safe_chart(descs[0], df, dur, mp4)
615
  else:
616
  img = generate_image_from_prompt(narrative)
617
+ img_cv = cv2.cvtColor(
618
+ np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR
619
+ )
620
  animate_image_fade(img_cv, dur, mp4)
621
+ video_parts.append(str(mp4))
622
+ temps.append(mp4)
623
 
624
+ # concat
625
  silent_vid = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
626
  concat_media(video_parts, silent_vid, "video")
627
  audio_mix = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
628
  concat_media(audio_parts, audio_mix, "audio")
629
 
630
  final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
631
+ subprocess.run(
632
+ [
633
+ "ffmpeg",
634
+ "-y",
635
+ "-i",
636
+ str(silent_vid),
637
+ "-i",
638
+ str(audio_mix),
639
+ "-c:v",
640
+ "copy",
641
+ "-c:a",
642
+ "aac",
643
+ "-shortest",
644
+ str(final_vid),
645
+ ],
646
+ check=True,
647
+ capture_output=True,
648
+ )
649
 
650
  for p in temps + [silent_vid, audio_mix]:
651
  p.unlink(missing_ok=True)
652
+
653
  return str(final_vid)
654
 
655
# ─── UI ────────────────────────────────────────────────────────────────────
mode = st.radio(
    "Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True
)

upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
if upl:
    df_prev, prev_err = load_dataframe_safely(upl.getvalue(), upl.name)
    if df_prev is not None:
        with st.expander("📊 Data Preview"):
            st.dataframe(arrow_df(df_prev.head()))
    elif prev_err:
        # FIX: the original discarded the load error and then crashed with
        # AttributeError on df_prev.head() for unreadable files; surface the
        # parser's message instead.
        st.warning(prev_err)

ctx = st.text_area("Business context or specific instructions (optional)")
667
+
668
# ─── Generate button ──────────────────────────────────────────────────────
if st.button("🚀 Generate", type="primary", disabled=not upl):
    # cache key: hash of file bytes + output mode + user context, so the
    # same inputs map to the same report/video entry across reruns
    key = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))

    if mode == "Report (PDF)":
        df, md, chart_descs = prepare_report(upl.getvalue(), upl.name, ctx)
        if df is None:
            st.stop()  # prepare_report already showed the error

        # register the report, then render its charts on background threads;
        # the output loop below shows placeholders until each one lands
        st.session_state.lazy_reports[key] = {
            "df": df,
            "md": md,
            "charts": {},
            "pending": set(chart_descs),
            "finished": False,
        }
        for d in chart_descs:
            EXEC.submit(render_chart_worker, key, d)

        st.rerun()

    else:  # video branch — rendered synchronously (progress bar inside)
        st.session_state.bundle = None
        path = generate_video(upl.getvalue(), upl.name, ctx, key)
        if path:
            st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
            # NOTE(review): diff mangling makes it ambiguous whether this
            # rerun sits inside `if path:`; rerunning only on success is the
            # behavior preserved here — confirm against the original file.
            st.rerun()
695
+
696
# ─── OUTPUT ───────────────────────────────────────────────────────────────
# 1) live PDF reports (may be multiple)
for rep_key, rep in st.session_state.lazy_reports.items():
    st.subheader("📄 Generated Report")
    # swap each <generate_chart: ...> tag for its rendered PNG (inlined as
    # base64) or a grey placeholder while the worker is still running; the
    # lambda's late binding of `rep` is safe because sub() runs immediately
    md_with_imgs = TAG_RE.sub(
        lambda m: _inline_image_or_placeholder(rep, m.group("d").strip()), rep["md"]
    )
    with st.expander("View Report", expanded=True):
        st.markdown(md_with_imgs, unsafe_allow_html=True)

    if rep["finished"]:
        # all charts rendered → PDF bytes exist; offer download + narration
        c1, c2 = st.columns(2)
        with c1:
            st.download_button(
                "Download PDF",
                rep["pdf"],
                f"business_report_{rep_key[:8]}.pdf",
                "application/pdf",
                use_container_width=True,
            )
        with c2:
            # narration available only when a Deepgram key is configured
            if DG_KEY and st.button("🔊 Narrate Summary", key=f"aud_{rep_key}"):
                # strip HTML/chart tags so TTS reads clean prose
                txt = re.sub(r"<[^>]+>", "", rep["md"])
                audio, mime = deepgram_tts(txt)
                if audio:
                    st.audio(audio, format=mime)
                else:
                    st.error("Narration failed.")
    else:
        st.info("Charts are still rendering… feel free to keep browsing.")
726
+
727
# 2) video branch output
bundle = st.session_state.get("bundle")
if bundle and bundle.get("type") == "video":
    st.subheader("🎬 Generated Video Narrative")
    video_path = Path(bundle["video_path"])
    if not video_path.exists():
        st.error("Video file missing – generation failed.")
    else:
        # inline player, then a download button backed by a file handle
        st.video(video_path.read_bytes())
        with video_path.open("rb") as fh:
            st.download_button(
                "Download Video",
                fh,
                f"sozo_narrative_{bundle['key'][:8]}.mp4",
                "video/mp4",
            )