rairo committed on
Commit
1acf113
·
verified ·
1 Parent(s): da9b96e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +572 -497
app.py CHANGED
@@ -1,16 +1,17 @@
1
  ##############################################################################
2
- # Sozo Business Studio · 10-Jul-2025 (full drop-in) #
3
- # • Restores PDF branch alongside fixed Video branch #
4
- # • Shared chart-tag grammar across both paths #
5
- # • Narrator text cleans scene labels + chart talk #
6
- # • Matplotlib animation starts from blank; artists returned (blit=True) #
7
- # • Gemini Flash-preview image gen with placeholder fallback #
8
- # • Silent-audio fallback keeps mux lengths equal #
9
  ##############################################################################
10
 
11
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
 
12
  from pathlib import Path
13
- from typing import Tuple, Dict, List
 
14
 
15
  import streamlit as st
16
  import pandas as pd
@@ -27,554 +28,605 @@ import cv2
27
  from langchain_experimental.agents import create_pandas_dataframe_agent
28
  from langchain_google_genai import ChatGoogleGenerativeAI
29
  from google import genai
30
- from google.genai import types # for GenerateContentConfig
31
 
32
  # ─── CONFIG ────────────────────────────────────────────────────────────────
33
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
34
  st.title("📊 Sozo Business Studio")
35
  st.caption("AI transforms business data into compelling narratives.")
36
 
37
- FPS, WIDTH, HEIGHT = 24, 1280, 720
38
  MAX_CHARTS, VIDEO_SCENES = 5, 5
 
 
39
 
40
  API_KEY = os.getenv("GEMINI_API_KEY")
41
  if not API_KEY:
42
- st.error("⚠️ GEMINI_API_KEY is not set."); st.stop()
43
- GEM = genai.Client(api_key=API_KEY)
44
 
45
- DG_KEY = os.getenv("DEEPGRAM_API_KEY") # optional for narration
 
 
 
 
 
 
46
  st.session_state.setdefault("bundle", None)
47
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
48
 
49
- # ─── HELPERS ───────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
  def load_dataframe_safely(buf: bytes, name: str) -> Tuple[pd.DataFrame, str]:
51
- """Load CSV/Excel, return (df, err)."""
52
  try:
 
 
 
 
53
  ext = Path(name).suffix.lower()
54
- df = (pd.read_excel if ext in (".xlsx", ".xls") else pd.read_csv)(io.BytesIO(buf))
 
 
 
 
 
 
 
55
  df.columns = df.columns.astype(str).str.strip()
56
- df = df.dropna(how="all")
 
 
 
 
 
 
57
  if df.empty or len(df.columns) == 0:
58
  raise ValueError("No usable data found")
 
59
  return df, None
60
  except Exception as e:
61
- return None, str(e)
62
-
63
 
64
  def arrow_df(df: pd.DataFrame) -> pd.DataFrame:
65
- """Convert for Streamlit Arrow renderer."""
66
- safe = df.copy()
 
67
  for c in safe.columns:
68
  if safe[c].dtype.name in ("Int64", "Float64", "Boolean"):
69
  safe[c] = safe[c].astype(safe[c].dtype.name.lower())
70
  return safe
71
 
72
-
73
- @st.cache_data(show_spinner=False)
74
  def deepgram_tts(txt: str) -> Tuple[bytes, str]:
75
- """Optional audio narration."""
76
  if not DG_KEY or not txt:
77
  return None, None
 
78
  txt = re.sub(r"[^\w\s.,!?;:-]", "", txt)[:1000]
79
  try:
80
  r = requests.post(
81
  "https://api.deepgram.com/v1/speak",
82
  params={"model": "aura-2-andromeda-en"},
83
  headers={"Authorization": f"Token {DG_KEY}", "Content-Type": "application/json"},
84
- json={"text": txt}, timeout=30)
 
 
85
  r.raise_for_status()
86
  return r.content, r.headers.get("Content-Type", "audio/mpeg")
87
  except Exception:
88
  return None, None
89
 
90
-
91
  def generate_silence_mp3(duration: float, out: Path):
92
- subprocess.run(
93
- ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
94
- "-t", f"{duration:.3f}", "-q:a", "9", str(out)],
95
- check=True, capture_output=True)
96
-
 
 
 
 
97
 
98
  def audio_duration(path: str) -> float:
 
99
  try:
100
  res = subprocess.run(
101
  ["ffprobe", "-v", "error", "-show_entries", "format=duration",
102
  "-of", "default=nw=1:nk=1", path],
103
- text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
 
 
104
  return float(res.stdout.strip())
105
  except Exception:
106
  return 5.0
107
 
108
-
109
  TAG_RE = re.compile(
110
  r'[<[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>"\'\]]+?)["\']?\s*[>\]]',
111
  re.I)
112
- extract_chart_tags = lambda t: list(dict.fromkeys(m.group("d").strip()
113
- for m in TAG_RE.finditer(t or "")))
114
 
115
- re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)
 
 
 
 
 
116
 
 
117
 
118
  def clean_narration(txt: str) -> str:
 
 
 
119
  txt = re_scene.sub("", txt)
120
  txt = TAG_RE.sub("", txt)
121
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
122
  txt = re.sub(r"\s{2,}", " ", txt).strip()
123
  return txt
124
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
- # ─── IMAGE GENERATION & PLACEHOLDER ───────────────────────────────────────
127
  def placeholder_img() -> Image.Image:
 
128
  return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
129
 
130
-
131
- def generate_image_from_prompt(prompt: str) -> Image.Image:
132
- model_main = "gemini-2.0-flash-exp-image-generation"
133
- model_fallback = "gemini-2.0-flash-preview-image-generation"
134
- full_prompt = "A clean business-presentation illustration: " + prompt
135
-
136
- def fetch(model_name):
137
- res = GEM.models.generate_content(
138
- model=model_name,
139
- contents=full_prompt,
140
- config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
141
- )
142
- for part in res.candidates[0].content.parts:
143
- if getattr(part, "inline_data", None):
144
- return Image.open(io.BytesIO(part.inline_data.data)).convert("RGB")
145
- return None
146
-
 
 
 
 
 
 
 
147
  try:
148
- img = fetch(model_main) or fetch(model_fallback)
149
- return img if img else placeholder_img()
 
 
 
 
150
  except Exception:
151
  return placeholder_img()
152
 
153
-
154
- # ─── PDF GENERATION ────────────────────────────────────────────────────────
155
  class PDF(FPDF, HTMLMixin):
156
- pass
157
-
 
 
 
 
 
 
 
158
 
159
  def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
160
- html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(
161
- TAG_RE.sub(lambda m: f'<img src="{charts.get(m.group("d").strip(), "")}">', md)
162
- )
163
- pdf = PDF()
164
- pdf.set_auto_page_break(True, margin=15)
165
- pdf.add_page()
166
- pdf.set_font("Arial", "B", 18)
167
- pdf.cell(0, 12, "AI-Generated Business Report", ln=True)
168
- pdf.ln(3)
169
- pdf.set_font("Arial", "", 11)
170
- pdf.write_html(html)
171
- return bytes(pdf.output(dest="S"))
172
-
173
-
174
- def generate_report(buf: bytes, name: str, ctx: str, key: str):
175
- df, err = load_dataframe_safely(buf, name)
176
- if err:
177
- st.error(err); return None
178
-
179
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash",
180
- google_api_key=API_KEY, temperature=0.1)
181
-
182
- # Enhanced context analysis
183
- ctx_dict = {
184
- "shape": df.shape,
185
- "columns": list(df.columns),
186
- "user_ctx": ctx or "General business analysis",
187
- "full_dataframe": df.to_dict('records'),
188
- "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
189
- "missing_values": {col: int(count) for col, count in df.isnull().sum().to_dict().items()},
190
- "numeric_summary": {col: {stat: float(val) for stat, val in stats.items()}
191
- for col, stats in df.describe().to_dict().items()} if len(df.select_dtypes(include=['number']).columns) > 0 else {}
192
- }
193
-
194
- cols = ", ".join(ctx_dict["columns"][:6])
195
-
196
- # Enhanced report prompt with domain intelligence
197
- report_prompt = f"""
198
- You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
199
-
200
- **Dataset Analysis Context:**
201
- {json.dumps(ctx_dict, indent=2)}
202
-
203
- **Instructions:**
204
- 1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
205
-
206
- 2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
207
-
208
- 3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
209
-
210
- 4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain:
211
- - Trends and patterns
212
- - Outliers or anomalies
213
- - Performance indicators
214
- - Risk factors or opportunities
215
-
216
- 5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
217
-
218
- 6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like:
219
- `<generate_chart: "chart_type | specific description">`
220
-
221
- Valid chart types: bar, pie, line, scatter, hist
222
- Base every chart on actual columns: {cols}
223
-
224
- Choose chart types strategically:
225
- - bar: for categorical comparisons
226
- - pie: for proportional breakdowns (when categories < 7)
227
- - line: for time series or trends
228
- - scatter: for correlation analysis
229
- - hist: for distribution analysis
230
-
231
- 7. **Format Requirements**:
232
- - Use professional business language
233
- - Include relevant metrics and percentages
234
- - Structure with clear headers (## Executive Summary, ## Key Insights, etc.)
235
- - End with ## Next Steps section
236
-
237
- **Domain-Specific Focus Areas:**
238
- - If sales data: focus on revenue trends, customer segments, product performance
239
- - If HR data: focus on workforce analytics, retention, performance metrics
240
- - If financial data: focus on profitability, cost analysis, financial health
241
- - If operational data: focus on efficiency, bottlenecks, process optimization
242
- - If customer data: focus on behavior patterns, satisfaction, churn analysis
243
-
244
- Generate insights that would be valuable to C-level executives and department heads.
245
- """
246
-
247
- md = llm.invoke(report_prompt).content
248
-
249
- chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
250
- charts: Dict[str, str] = {}
251
- if chart_descs:
252
- agent = create_pandas_dataframe_agent(
253
- llm=llm, df=df, verbose=False, allow_dangerous_code=True
254
  )
255
- for d in chart_descs:
256
- with st.spinner(f"Generating chart: {d}"):
257
- with plt.ioff():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
258
  try:
259
- # Enhanced chart generation prompt
260
- chart_prompt = f"""
261
- Create a professional {d} chart using matplotlib with these requirements:
262
- 1. Use a clean, business-appropriate style
263
- 2. Include proper title, axis labels, and legends
264
- 3. Apply appropriate color schemes (avoid rainbow colors)
265
- 4. Ensure text is readable (font size 10+)
266
- 5. Format numbers appropriately (e.g., currency, percentages)
267
- 6. Save the figure with high quality
268
- 7. Handle any missing or null values appropriately
269
- """
270
- agent.run(chart_prompt)
271
- fig = plt.gcf()
272
- if fig.axes:
273
- p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
274
- fig.savefig(p, dpi=300, bbox_inches="tight", facecolor="white")
275
- charts[d] = str(p)
276
- plt.close("all")
277
  except Exception:
278
- plt.close("all")
279
-
280
- preview = TAG_RE.sub(
281
- lambda m: f'<img src="data:image/png;base64,{base64.b64encode(Path(charts[m.group("d").strip()]).read_bytes()).decode()}">'
282
- if m.group("d").strip() in charts else m.group(0),
283
- md
284
- )
285
- pdf_bytes = build_pdf(md, charts)
286
-
287
- return {
288
- "type": "report",
289
- "preview": preview,
290
- "pdf": pdf_bytes,
291
- "report_md": md,
292
- "key": key,
293
- }
294
-
295
-
296
-
297
- # ─── ANIMATION HELPERS ─────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
298
  def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path, fps: int = FPS) -> str:
299
- frames = max(int(dur * fps), fps)
300
- vid = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"), fps, (WIDTH, HEIGHT))
301
- blank = np.full_like(img_cv2, 255)
302
- for i in range(frames):
303
- a = i / frames
304
- vid.write(cv2.addWeighted(blank, 1 - a, img_cv2, a, 0))
305
- vid.release()
306
- return str(out)
307
-
308
-
309
- def animate_chart(desc: str, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
310
- """
311
- Render an animated chart whose clip length equals the audio length `dur`.
312
- There is NO hard-cap on frames and NO prompt meddling.
313
-
314
- reveal_progress = i / (frames-1) → chart reveals smoothly for the whole clip.
315
- """
316
- # -------- parse description -------------------------------------------
317
- ctype, *rest = [s.strip().lower() for s in desc.split("|", 1)]
318
- ctype = ctype or "bar"
319
- title = rest[0] if rest else desc
320
-
321
- # -------- prepare data -------------------------------------------------
322
- if ctype == "pie":
323
- cat = df.select_dtypes(exclude="number").columns[0]
324
- num = df.select_dtypes(include="number").columns[0]
325
- plot_df = df.groupby(cat)[num].sum().sort_values(ascending=False).head(8)
326
- elif ctype in ("bar", "hist"):
327
- num = df.select_dtypes(include="number").columns[0]
328
- plot_df = df[num]
329
- else: # line / scatter
330
- cols = df.select_dtypes(include="number").columns[:2]
331
- plot_df = df[list(cols)].sort_index()
332
-
333
- # -------- timing & figure ---------------------------------------------
334
- frames = max(10, int(dur * fps)) # audio length → frame count
335
- fig, ax = plt.subplots(figsize=(WIDTH / 100, HEIGHT / 100), dpi=100)
336
-
337
- # -------- chart branches ----------------------------------------------
338
- if ctype == "pie":
339
- wedges, _ = ax.pie(plot_df, labels=plot_df.index, startangle=90)
340
- ax.set_title(title)
341
-
342
- def init(): [w.set_alpha(0) for w in wedges]; return wedges
343
- def update(i):
344
- a = i / (frames - 1)
345
- for w in wedges: w.set_alpha(a)
346
- return wedges
347
-
348
- elif ctype == "bar":
349
- bars = ax.bar(plot_df.index, np.zeros_like(plot_df.values), color="#1f77b4")
350
- ax.set_ylim(0, plot_df.max() * 1.1); ax.set_title(title)
351
-
352
- def init(): return bars
353
- def update(i):
354
- a = i / (frames - 1)
355
- for b, h in zip(bars, plot_df.values):
356
- b.set_height(h * a)
357
- return bars
358
-
359
- elif ctype == "hist":
360
- _, _, patches = ax.hist(plot_df, bins=20, color="#1f77b4", alpha=0)
361
- ax.set_title(title)
362
-
363
- def init(): [p.set_alpha(0) for p in patches]; return patches
364
- def update(i):
365
- a = i / (frames - 1)
366
- for p in patches: p.set_alpha(a)
367
- return patches
368
-
369
- elif ctype == "scatter":
370
- pts = ax.scatter(plot_df.iloc[:, 0], plot_df.iloc[:, 1], s=10, alpha=0)
371
- ax.set_title(title); ax.grid(alpha=.3)
372
-
373
- def init(): pts.set_alpha(0); return [pts]
374
- def update(i):
375
- pts.set_alpha(i / (frames - 1))
376
- return [pts]
377
-
378
- else: # line
379
- line, = ax.plot([], [], lw=2)
380
- x_full = plot_df.iloc[:, 0] if plot_df.shape[1] > 1 else np.arange(len(plot_df))
381
- y_full = plot_df.iloc[:, 1] if plot_df.shape[1] > 1 else plot_df.iloc[:, 0]
382
- ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min(), y_full.max())
383
- ax.set_title(title); ax.grid(alpha=.3)
384
-
385
- def init(): line.set_data([], []); return [line]
386
- def update(i):
387
- k = max(2, int(len(x_full) * i / (frames - 1)))
388
- line.set_data(x_full[:k], y_full.iloc[:k])
389
- return [line]
390
-
391
- # -------- animation ----------------------------------------------------
392
- anim = FuncAnimation(fig, update, init_func=init, frames=frames,
393
- blit=True, interval=1000 / fps)
394
- anim.save(str(out),
395
- writer=FFMpegWriter(fps=fps, metadata={'artist': 'Sozo'}),
396
- dpi=144)
397
- plt.close(fig)
398
- return str(out)
399
-
400
-
401
- def safe_chart(desc, df, dur, out):
402
  try:
403
- return animate_chart(desc, df, dur, out)
404
- except Exception:
405
- with plt.ioff():
406
- df.plot(ax=plt.gca())
407
- p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
408
- plt.savefig(p, bbox_inches="tight"); plt.close()
409
- img = cv2.resize(cv2.imread(str(p)), (WIDTH, HEIGHT))
410
- return animate_image_fade(img, dur, out)
411
-
412
-
413
- def concat_media(paths: List[str], out: Path, kind="video"):
414
- if not paths:
415
- return
416
- lst = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.txt"
417
- with lst.open("w") as f:
418
- for p in paths:
419
- if Path(p).exists():
420
- f.write(f"file '{Path(p).resolve()}'\n")
421
- subprocess.run(
422
- ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", str(lst),
423
- "-c:v" if kind == "video" else "-c:a", "copy", str(out)],
424
- check=True, capture_output=True)
425
- lst.unlink(missing_ok=True)
426
-
427
-
428
- # ─── VIDEO GENERATION ──────────────────────────────────────────────────────
429
- def build_story_prompt(ctx_dict):
430
- cols = ", ".join(ctx_dict["columns"][:6])
431
-
432
- return f"""
433
- You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.
434
-
435
- **Complete Dataset Context:**
436
- {json.dumps(ctx_dict, indent=2)}
437
-
438
- **Task Requirements:**
439
- 1. **Identify the Data Story**: Determine what business domain this data represents and what story it tells
440
- 2. **Create {VIDEO_SCENES} distinct scenes** that build a logical narrative arc
441
- 3. **Each scene must contain:**
442
- - 1-2 sentences of clear, professional narration (plain English, no jargon)
443
- - Exactly one chart tag: `<generate_chart: "chart_type | specific description">`
444
-
445
- **Chart Guidelines:**
446
- - Valid types: bar, pie, line, scatter, hist
447
- - Base all charts on actual columns: {cols}
448
- - Choose chart types that best tell the story:
449
- * bar: categorical comparisons, rankings
450
- * pie: proportional breakdowns (≤6 categories)
451
- * line: trends over time, progression
452
- * scatter: relationships, correlations
453
- * hist: distributions, frequency analysis
454
-
455
- **Narrative Structure:**
456
- - Scene 1: Set the context and introduce the main story
457
- - Middle scenes: Develop key insights and supporting evidence
458
- - Final scene: Conclude with actionable takeaways or future outlook
459
-
460
- **Content Standards:**
461
- - Use conversational, executive-level language
462
- - Include specific data insights (trends, percentages, comparisons)
463
- - Avoid chart descriptions in narration ("as shown in the chart")
464
- - Make each scene self-contained but connected to the overall story
465
- - Focus on business impact and actionable insights
466
-
467
- **Domain-Specific Approaches:**
468
- - Sales data: Customer journey, revenue trends, market performance
469
- - HR data: Workforce insights, talent analytics, organizational health
470
- - Financial data: Performance indicators, cost analysis, profitability
471
- - Operational data: Process efficiency, bottlenecks, optimization opportunities
472
- - Customer data: Behavior patterns, satisfaction trends, retention analysis
473
-
474
- **Output Format:**
475
- Separate each scene with exactly [SCENE_BREAK]
476
-
477
- **Example Structure:**
478
- Our company's data reveals fascinating insights about market performance over the past year. Let's explore what the numbers tell us about our growth trajectory.
479
- <generate_chart: "line | monthly revenue growth over 12 months">
480
-
481
- [SCENE_BREAK]
482
-
483
- Customer acquisition has shown remarkable patterns, with certain segments driving significantly more value than others. The data shows a clear preference emerging in our target markets.
484
- <generate_chart: "bar | customer acquisition by segment">
485
-
486
- Create a compelling, data-driven story that executives would find engaging and actionable.
487
- """
488
-
489
-
490
- def generate_video(buf: bytes, name: str, ctx: str, key: str):
491
  try:
 
492
  subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
493
  except Exception:
494
- st.error("🔴 FFmpeg not available — cannot render video."); return None
495
-
 
496
  df, err = load_dataframe_safely(buf, name)
497
  if err:
498
- st.error(err); return None
499
-
500
- llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash",
501
- google_api_key=API_KEY, temperature=0.2)
502
-
503
- # Enhanced context with complete data insights
504
- ctx_dict = {
505
- "shape": df.shape,
506
- "columns": list(df.columns),
507
- "user_ctx": ctx or "General business analysis",
508
- "full_dataframe": df.to_dict('records'),
509
- "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
510
- "numeric_summary": {col: {stat: float(val) for stat, val in stats.items()}
511
- for col, stats in df.describe().to_dict().items()} if len(df.select_dtypes(include=['number']).columns) > 0 else {}
512
- }
513
 
514
- script = llm.invoke(build_story_prompt(ctx_dict)).content
515
- scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]
516
-
517
- video_parts, audio_parts, temps = [], [], []
518
- for idx, sc in enumerate(scenes[:VIDEO_SCENES]):
519
- st.progress((idx + 1) / VIDEO_SCENES,
520
- text=f"Rendering Scene {idx + 1}/{VIDEO_SCENES}")
521
-
522
- descs = extract_chart_tags(sc)
523
- narrative = clean_narration(sc)
524
-
525
- # --- audio ---
526
- audio_bytes, _ = deepgram_tts(narrative)
527
- mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
528
- if audio_bytes:
529
- mp3.write_bytes(audio_bytes)
530
- dur = audio_duration(str(mp3))
531
- else:
532
- dur = 5.0
533
- generate_silence_mp3(dur, mp3)
534
- audio_parts.append(str(mp3)); temps.append(mp3)
535
-
536
- # --- visual ---
537
- mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
538
- if descs:
539
- safe_chart(descs[0], df, dur, mp4)
540
- else:
541
- img = generate_image_from_prompt(narrative)
542
- img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
543
- animate_image_fade(img_cv, dur, mp4)
544
- video_parts.append(str(mp4)); temps.append(mp4)
545
-
546
- # concat
547
- silent_vid = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
548
- concat_media(video_parts, silent_vid, "video")
549
- audio_mix = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
550
- concat_media(audio_parts, audio_mix, "audio")
551
-
552
- final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
553
- subprocess.run(
554
- ["ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix),
555
- "-c:v", "copy", "-c:a", "aac", "-shortest", str(final_vid)],
556
- check=True, capture_output=True)
557
-
558
- for p in temps + [silent_vid, audio_mix]:
559
- p.unlink(missing_ok=True)
560
-
561
- return str(final_vid)
562
-
563
- # ─── UI ─────────────────────────────────────────────────────────────────────
564
- mode = st.radio("Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True)
565
-
566
- upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
567
- if upl:
568
- df_prev, _ = load_dataframe_safely(upl.getvalue(), upl.name)
569
- with st.expander("📊 Data Preview"):
570
- st.dataframe(arrow_df(df_prev.head()))
571
-
572
- ctx = st.text_area("Business context or specific instructions (optional)")
573
-
574
- if st.button("🚀 Generate", type="primary", disabled=not upl):
575
- key = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))
576
 
577
- with st.spinner("Generating…"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
578
  if mode == "Report (PDF)":
579
  st.session_state.bundle = generate_report(upl.getvalue(), upl.name, ctx, key)
580
  else:
@@ -582,34 +634,57 @@ if st.button("🚀 Generate", type="primary", disabled=not upl):
582
  path = generate_video(upl.getvalue(), upl.name, ctx, key)
583
  if path:
584
  st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
585
- st.rerun()
586
-
587
- # ─── OUTPUT ────────────────────────────────────────────────────────────────
588
- if bundle := st.session_state.get("bundle"):
589
- if bundle["type"] == "report":
590
- st.subheader("📄 Generated Report")
591
- with st.expander("View Report", expanded=True):
592
- st.markdown(bundle["preview"], unsafe_allow_html=True)
593
-
594
- c1, c2 = st.columns(2)
595
- with c1:
596
- st.download_button("Download PDF", bundle["pdf"],
597
- "business_report.pdf", "application/pdf",
598
- use_container_width=True)
599
- with c2:
600
- if DG_KEY and st.button("🔊 Narrate Summary", use_container_width=True):
601
- txt = re.sub(r"<[^>]+>", "", bundle["report_md"])
602
- audio, mime = deepgram_tts(txt)
603
- st.audio(audio, format=mime) if audio else st.error("Narration failed.")
604
-
605
- else: # video
606
- st.subheader("🎬 Generated Video Narrative")
607
- vp = bundle["video_path"]
608
- if Path(vp).exists():
609
- with open(vp, "rb") as f:
610
- st.video(f.read())
611
- with open(vp, "rb") as f:
612
- st.download_button("Download Video", f,
613
- f"sozo_narrative_{bundle['key'][:8]}.mp4", "video/mp4")
614
- else:
615
- st.error("Video file missing – generation failed.")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ##############################################################################
2
+ # Sozo Business Studio · 10-Jul-2025 (Performance Fixed) #
3
+ # • Fixed report generation freezing issues #
4
+ # • Optimized memory usage and resource management #
5
+ # • Added proper error handling and timeouts #
6
+ # • Improved chart generation with fallback strategies #
7
+ # • Enhanced progress tracking and user feedback #
 
8
  ##############################################################################
9
 
10
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
11
+ import time, gc, threading
12
  from pathlib import Path
13
+ from typing import Tuple, Dict, List, Optional
14
+ from concurrent.futures import ThreadPoolExecutor, TimeoutError
15
 
16
  import streamlit as st
17
  import pandas as pd
 
28
  from langchain_experimental.agents import create_pandas_dataframe_agent
29
  from langchain_google_genai import ChatGoogleGenerativeAI
30
  from google import genai
31
+ from google.genai import types
32
 
33
  # ─── CONFIG ────────────────────────────────────────────────────────────────
34
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
35
  st.title("📊 Sozo Business Studio")
36
  st.caption("AI transforms business data into compelling narratives.")
37
 
38
+ FPS, WIDTH, HEIGHT = 24, 1280, 720
39
  MAX_CHARTS, VIDEO_SCENES = 5, 5
40
+ CHART_TIMEOUT = 30 # seconds
41
+ REPORT_TIMEOUT = 120 # seconds
42
 
43
  API_KEY = os.getenv("GEMINI_API_KEY")
44
  if not API_KEY:
45
+ st.error("⚠️ GEMINI_API_KEY is not set.")
46
+ st.stop()
47
 
48
+ try:
49
+ GEM = genai.Client(api_key=API_KEY)
50
+ except Exception as e:
51
+ st.error(f"⚠️ Failed to initialize Gemini client: {e}")
52
+ st.stop()
53
+
54
+ DG_KEY = os.getenv("DEEPGRAM_API_KEY")
55
  st.session_state.setdefault("bundle", None)
56
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
57
 
58
+ # ─── MEMORY MANAGEMENT ─────────────────────────────────────────────────────
59
def cleanup_matplotlib():
    """Close every open matplotlib figure and run a garbage-collection pass.

    The previous version called ``plt.clf()`` and ``plt.cla()`` *after*
    ``plt.close('all')``.  Both helpers operate on the "current" figure/axes
    and pyplot creates a fresh figure when none exists, so each cleanup call
    left one new empty figure behind.  Closing all figures is sufficient on
    its own: a closed figure has nothing left to clear.
    """
    plt.close('all')
    gc.collect()
65
+
66
def safe_temp_cleanup(temp_files: List[Path]):
    """Delete temporary files on a best-effort basis.

    Args:
        temp_files: Paths to remove.  ``str`` entries are accepted as well
            and falsy entries (``None``, ``""``) are skipped.

    Missing files are not an error, and any filesystem failure (permissions,
    path is a directory, ...) is ignored so that cleanup can never mask the
    result of the operation that produced the files.
    """
    for temp_file in temp_files or ():
        if not temp_file:
            continue
        try:
            # missing_ok avoids the exists()/unlink() race of the old code.
            Path(temp_file).unlink(missing_ok=True)
        except OSError:
            # Deliberate best-effort: a leftover temp file is harmless.
            continue
74
+
75
+ # ─── ENHANCED HELPERS ──────────────────────────────────────────────────────
76
def load_dataframe_safely(buf: bytes, name: str) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
    """Parse an uploaded CSV/Excel payload into a cleaned DataFrame.

    Args:
        buf: Raw file bytes.
        name: Original file name; only its extension is used to pick a parser
            (``.xlsx``/``.xls`` -> Excel, anything else -> CSV).

    Returns:
        ``(df, None)`` on success or ``(None, message)`` on failure.  The
        function never raises; every error is converted into the message.

    Cleaning applied: column names are stringified and stripped, rows that
    are entirely empty are dropped, and at most 10,000 rows are kept.
    """
    try:
        # Reject payloads above 50 MB before attempting to parse them.
        if len(buf) > 50 * 1024 * 1024:
            return None, "File too large (max 50MB)"

        ext = Path(name).suffix.lower()

        if ext in (".xlsx", ".xls"):
            df = pd.read_excel(io.BytesIO(buf), engine='openpyxl' if ext == '.xlsx' else 'xlrd')
        else:
            try:
                df = pd.read_csv(io.BytesIO(buf), encoding='utf-8', on_bad_lines='skip')
            except UnicodeDecodeError:
                # Spreadsheet exports are frequently Latin-1/Windows-1252;
                # Latin-1 can decode any byte sequence, so retry with it
                # instead of rejecting the file.
                df = pd.read_csv(io.BytesIO(buf), encoding='latin-1', on_bad_lines='skip')

        # Basic normalisation
        df.columns = df.columns.astype(str).str.strip()
        df = df.dropna(how="all").reset_index(drop=True)

        # Limit rows for performance
        if len(df) > 10000:
            df = df.head(10000)
            st.warning("⚠️ Dataset truncated to 10,000 rows for performance")

        if df.empty or len(df.columns) == 0:
            raise ValueError("No usable data found")

        return df, None
    except Exception as e:
        # Top-level boundary: callers expect an error string, not a raise.
        return None, f"Error loading file: {str(e)}"
 
106
 
107
def arrow_df(df: pd.DataFrame) -> pd.DataFrame:
    """Return a preview copy (first 1,000 rows) with plain NumPy dtypes.

    Pandas nullable extension columns are converted to their NumPy
    counterparts before the frame is handed to Streamlit:

    * ``Int64`` -> ``int64`` when no value is missing, otherwise ``float64``
      (a NumPy integer column cannot hold NA, so the old unconditional
      ``astype("int64")`` raised on any missing value).
    * ``Float64`` -> ``float64`` (NA becomes NaN).
    * ``boolean`` -> ``bool`` when no value is missing; columns with missing
      values are left untouched.  The nullable boolean dtype is named
      ``"boolean"``; the historical ``"Boolean"`` spelling is still accepted.

    The input frame is never modified.
    """
    safe = df.head(1000).copy()
    for c in safe.columns:
        dtype_name = safe[c].dtype.name
        if dtype_name == "Int64":
            has_missing = bool(safe[c].isna().any())
            safe[c] = safe[c].astype("float64" if has_missing else "int64")
        elif dtype_name == "Float64":
            safe[c] = safe[c].astype("float64")
        elif dtype_name in ("boolean", "Boolean"):
            if not safe[c].isna().any():
                safe[c] = safe[c].astype("bool")
    return safe
115
 
116
@st.cache_data(show_spinner=False, ttl=3600)
def _deepgram_tts_cached(txt: str) -> Tuple[bytes, str]:
    """Call the Deepgram speak endpoint; cached per text for one hour.

    Raises on any request/HTTP failure.  Raising (instead of returning a
    sentinel) keeps failed attempts out of the cache, so a transient outage
    is not replayed for the whole TTL.
    """
    r = requests.post(
        "https://api.deepgram.com/v1/speak",
        params={"model": "aura-2-andromeda-en"},
        headers={"Authorization": f"Token {DG_KEY}", "Content-Type": "application/json"},
        json={"text": txt},
        timeout=15
    )
    r.raise_for_status()
    return r.content, r.headers.get("Content-Type", "audio/mpeg")


def deepgram_tts(txt: str) -> Tuple[Optional[bytes], Optional[str]]:
    """Synthesize narration audio for ``txt`` via Deepgram.

    Args:
        txt: Text to narrate.  Characters other than word characters,
            whitespace and ``.,!?;:-`` are removed and the result is
            truncated to 1,000 characters before the request.

    Returns:
        ``(audio_bytes, mime_type)`` on success, or ``(None, None)`` when no
        API key is configured, the text is empty, or the request fails.
        Only successful responses are cached.
    """
    if not DG_KEY or not txt:
        return None, None

    txt = re.sub(r"[^\w\s.,!?;:-]", "", txt)[:1000]
    if not txt:
        # Nothing speakable survived sanitisation.
        return None, None
    try:
        return _deepgram_tts_cached(txt)
    except Exception:
        # Best-effort feature: callers treat (None, None) as "narration failed".
        return None, None
135
 
 
136
def generate_silence_mp3(duration: float, out: Path):
    """Write ``duration`` seconds of mono 44.1 kHz silence to ``out`` using ffmpeg.

    Args:
        duration: Length of the clip in seconds.
        out: Destination path; an existing file is overwritten (``-y``).

    Returns:
        ``True`` when ffmpeg completed successfully, ``False`` otherwise.
        Earlier versions returned ``None`` in both cases, which left callers
        unable to tell whether ``out`` was actually written; a warning is
        still shown in the UI on failure.
    """
    try:
        subprocess.run(
            ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
             "-t", f"{duration:.3f}", "-q:a", "9", str(out)],
            check=True, capture_output=True, timeout=30
        )
        return True
    except Exception as e:
        # Covers a missing ffmpeg binary, non-zero exit status and the 30 s timeout.
        st.warning(f"Failed to generate silence: {e}")
        return False
146
 
147
def audio_duration(path: str, default: float = 5.0) -> float:
    """Return the duration of a media file in seconds, as reported by ffprobe.

    Args:
        path: File to inspect.
        default: Value returned when the duration cannot be determined
            (ffprobe missing, non-zero exit, 10 s timeout, unparsable or
            non-finite/negative output).  Defaults to the historical 5.0.

    Returns:
        The parsed duration, or ``default`` on any failure.
    """
    try:
        res = subprocess.run(
            ["ffprobe", "-v", "error", "-show_entries", "format=duration",
             "-of", "default=nw=1:nk=1", path],
            text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
            check=True, timeout=10
        )
        duration = float(res.stdout.strip())
    except Exception:
        return default
    # float() happily parses "nan"/"inf"; neither is a usable clip length.
    if not (0 <= duration < float("inf")):
        return default
    return duration
159
 
160
+ # ─── CHART GENERATION WITH TIMEOUT ────────────────────────────────────────
161
# Matches chart placeholders such as <generate_chart: "bar | sales by region">
# or [generate_chart = line | revenue]; the description is captured as group "d".
TAG_RE = re.compile(
    r'[<[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>"\'\]]+?)["\']?\s*[>\]]',
    re.I)


def extract_chart_tags(t: str) -> List[str]:
    """Return the unique chart descriptions found in ``t``, in first-seen order.

    Args:
        t: Text that may contain chart tags matched by ``TAG_RE``; ``None``
            or an empty string yields an empty list.

    Returns:
        Stripped descriptions with duplicates removed.  Tags whose
        description is only whitespace are ignored, since an empty
        description cannot be turned into a chart.
    """
    if not t:
        return []
    descriptions = (m.group("d").strip() for m in TAG_RE.finditer(t))
    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(d for d in descriptions if d))
171
 
172
# Leading "Scene <n>" label (optionally followed by ':', '.', '-' or spaces).
re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)


def clean_narration(txt: str) -> str:
    """Strip non-spoken markup from a narration string.

    Removes, in order: a leading scene label, chart tags matched by
    ``TAG_RE``, parenthesised asides (with the whitespace before them), and
    runs of two or more whitespace characters (collapsed to one space).
    Falsy input yields an empty string.
    """
    if not txt:
        return ""
    without_label = re_scene.sub("", txt)
    without_tags = TAG_RE.sub("", without_label)
    without_asides = re.sub(r"\s*\([^)]*\)", "", without_tags)
    return re.sub(r"\s{2,}", " ", without_asides).strip()
183
 
184
def generate_chart_with_timeout(agent, description: str, timeout: int = CHART_TIMEOUT) -> Optional[str]:
    """Ask the dataframe agent to draw a chart, giving up after ``timeout`` seconds.

    Args:
        agent: Object exposing ``run(prompt)``; the prompt instructs it to
            save the figure as ``chart.png``.
        description: Chart description inserted into the prompt.
        timeout: Maximum number of seconds to wait for the agent.

    Returns:
        Whatever ``agent.run`` returned, or ``None`` on timeout or error
        (a warning is shown in both cases).

    Notes:
        * The executor is shut down with ``wait=False``.  Using it as a
          context manager would block on exit until the worker finished,
          which silently defeated the timeout.  A timed-out worker cannot be
          killed; it keeps running in the background and is merely no longer
          waited for.
        * All Streamlit and matplotlib-cleanup calls happen on the calling
          thread; the worker only runs the agent.
    """
    chart_prompt = f"""
    Create a {description} chart using matplotlib with these requirements:
    1. Use plt.figure(figsize=(12, 8)) for consistent sizing
    2. Apply a clean, professional style: plt.style.use('seaborn-v0_8')
    3. Include proper title, axis labels, and legends
    4. Use professional color palette
    5. Ensure readable fonts (size 12+)
    6. Handle missing values by dropping or filling them
    7. Save with: plt.savefig('chart.png', dpi=300, bbox_inches='tight', facecolor='white')
    8. Always call plt.close() after saving

    Important: Only use columns that exist in the dataframe. If a column doesn't exist, use the closest available column.
    """

    cleanup_matplotlib()
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        future = executor.submit(agent.run, chart_prompt)
        return future.result(timeout=timeout)
    except TimeoutError:
        st.warning(f"Chart generation timed out after {timeout} seconds")
        return None
    except Exception as e:
        st.warning(f"Chart generation failed: {e}")
        return None
    finally:
        # Do not wait for an abandoned worker; see the docstring.
        executor.shutdown(wait=False)
        cleanup_matplotlib()
224
+
225
def create_fallback_chart(df: pd.DataFrame, description: str) -> Optional[str]:
    """Render a simple chart directly from ``df`` when the agent produced none.

    The chart type is chosen from the available column dtypes:

    * two or more numeric columns -> scatter of the first two;
    * exactly one numeric column  -> 20-bin histogram of its non-null values;
    * no numeric but at least one object column -> bar chart of the ten most
      frequent values of the first such column (previously these columns
      were detected but never used);
    * otherwise -> a text-only placeholder.

    Args:
        df: Source data.
        description: Text used in the chart title.

    Returns:
        Path (as ``str``) of a PNG written to the system temp directory, or
        ``None`` if rendering failed (a warning is shown).
    """
    fig = None
    try:
        cleanup_matplotlib()

        fig, ax = plt.subplots(figsize=(12, 8))

        numeric_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object']).columns

        if len(numeric_cols) >= 2:
            ax.scatter(df[numeric_cols[0]], df[numeric_cols[1]], alpha=0.6)
            ax.set_xlabel(numeric_cols[0])
            ax.set_ylabel(numeric_cols[1])
            ax.set_title(f"Scatter Plot: {description}")
        elif len(numeric_cols) == 1:
            ax.hist(df[numeric_cols[0]].dropna(), bins=20, alpha=0.7)
            ax.set_xlabel(numeric_cols[0])
            ax.set_ylabel('Frequency')
            ax.set_title(f"Distribution: {description}")
        elif len(categorical_cols) > 0:
            counts = df[categorical_cols[0]].value_counts().head(10)
            ax.bar(counts.index.astype(str), counts.values, alpha=0.7)
            ax.set_xlabel(categorical_cols[0])
            ax.set_ylabel('Count')
            ax.set_title(f"Top Categories: {description}")
            ax.tick_params(axis='x', rotation=45)
        else:
            ax.text(0.5, 0.5, f"Chart: {description}\nData available",
                    ha='center', va='center', fontsize=16)
            ax.set_xlim(0, 1)
            ax.set_ylim(0, 1)
            ax.set_title(description)

        fig.tight_layout()

        temp_path = Path(tempfile.gettempdir()) / f"fallback_{uuid.uuid4()}.png"
        # Save through the figure object rather than pyplot's implicit
        # "current figure" so the right figure is always written.
        fig.savefig(temp_path, dpi=300, bbox_inches="tight", facecolor="white")

        return str(temp_path)
    except Exception as e:
        st.warning(f"Fallback chart creation failed: {e}")
        return None
    finally:
        if fig is not None:
            plt.close(fig)
        cleanup_matplotlib()
269
 
270
+ # ─── IMAGE GENERATION WITH FALLBACK ───────────────────────────────────────
271
def placeholder_img() -> Image.Image:
    """Return a solid light-grey RGB frame of size ``WIDTH`` x ``HEIGHT``."""
    light_grey = (230, 230, 230)
    frame_size = (WIDTH, HEIGHT)
    return Image.new("RGB", frame_size, light_grey)
274
 
275
def generate_image_from_prompt(prompt: str, timeout: int = 30) -> Image.Image:
    """Generate an illustration with Gemini, falling back to a grey placeholder.

    Args:
        prompt: Subject of the illustration; it is prefixed with a fixed
            "business-presentation" style instruction.
        timeout: Maximum number of seconds to wait for all model attempts.

    Returns:
        An RGB ``PIL.Image``.  Never raises: on timeout, API error or an
        image-less response the result of ``placeholder_img()`` is returned.

    Notes:
        * Each model is tried independently.  Previously an exception from
          the primary model skipped the fallback model entirely.
        * The executor is shut down with ``wait=False``; leaving a ``with``
          block would wait for the worker and defeat the timeout.
    """
    model_names = (
        "gemini-2.0-flash-exp-image-generation",
        "gemini-2.0-flash-preview-image-generation",
    )
    full_prompt = "A clean business-presentation illustration: " + prompt

    def fetch(model_name):
        """Return the first inline image of the first candidate, or None."""
        # NOTE(review): some Gemini image models may require
        # response_modalities=["TEXT", "IMAGE"] — confirm against the API docs.
        res = GEM.models.generate_content(
            model=model_name,
            contents=full_prompt,
            config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
        )
        candidates = getattr(res, "candidates", None) or []
        if not candidates:
            return None
        content = getattr(candidates[0], "content", None)
        for part in getattr(content, "parts", None) or []:
            inline = getattr(part, "inline_data", None)
            if inline is not None and getattr(inline, "data", None):
                return Image.open(io.BytesIO(inline.data)).convert("RGB")
        return None

    def image_worker():
        for model_name in model_names:
            try:
                img = fetch(model_name)
            except Exception:
                # Try the next model instead of giving up immediately.
                continue
            if img is not None:
                return img
        return placeholder_img()

    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(image_worker).result(timeout=timeout)
    except TimeoutError:
        st.warning(f"Image generation timed out after {timeout} seconds")
        return placeholder_img()
    except Exception:
        return placeholder_img()
    finally:
        executor.shutdown(wait=False)
308
 
309
+ # ─── OPTIMIZED PDF GENERATION ─────────────────────────────────────────────
 
310
class PDF(FPDF, HTMLMixin):
    """Report document with a centred title banner and a page-number footer."""

    def header(self):
        """Draw the bold 16 pt report title, centred, followed by a 5-unit gap."""
        title = 'Sozo Business Report'
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, title, 0, 1, 'C')
        self.ln(5)

    def footer(self):
        """Draw the italic 8 pt 'Page N' label, centred, 15 units above the bottom edge."""
        page_label = f'Page {self.page_no()}'
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, page_label, 0, 0, 'C')
320
 
321
def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
    """Render the Markdown report (plus its charts) into PDF bytes.

    Args:
        md: Report Markdown, possibly containing chart tags matched by
            ``TAG_RE``.
        charts: Mapping of chart description -> path of the rendered PNG.

    Returns:
        The PDF document as ``bytes``.  If rendering fails an error is shown
        and a one-page PDF carrying an error notice is returned instead.

    Fixes over the previous version:
        * Charts are now embedded.  Tags used to be replaced by ``<img>``
          markup that was then removed together with all other HTML tags, so
          no chart ever reached the PDF.
        * Text is made safe for the built-in Latin-1 font: common
          typographic characters are mapped to ASCII and anything else
          outside Latin-1 is replaced by ``?`` rather than aborting the
          whole document.
        * HTML entities produced by the Markdown renderer (``&amp;`` ...)
          are decoded.
    """
    import html as html_lib  # local import: only needed here

    typographic = str.maketrans({
        "\u2018": "'", "\u2019": "'", "\u201c": '"', "\u201d": '"',
        "\u2013": "-", "\u2014": "-", "\u2022": "*", "\u2026": "...",
        "\u00a0": " ",
    })

    def to_pdf_text(fragment: str) -> str:
        """Markdown fragment -> plain text encodable as Latin-1."""
        rendered = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(fragment)
        plain = html_lib.unescape(re.sub(r'<[^>]+>', '', rendered))
        plain = plain.translate(typographic)
        return plain.encode("latin-1", "replace").decode("latin-1").strip()

    try:
        pdf = PDF()
        pdf.set_auto_page_break(True, margin=15)
        pdf.add_page()
        pdf.set_font("Arial", "", 11)

        # TAG_RE has exactly one capture group, so split() alternates
        # between surrounding text (even index) and chart description (odd).
        for idx, piece in enumerate(TAG_RE.split(md or "")):
            if idx % 2 == 0:
                text_content = to_pdf_text(piece)
                if text_content:
                    # Reset x explicitly: depending on the FPDF version the
                    # cursor may be left at the right margin after a cell.
                    pdf.set_x(pdf.l_margin)
                    pdf.multi_cell(0, 6, text_content)
            else:
                chart_path = charts.get((piece or "").strip(), "")
                if chart_path and Path(chart_path).exists():
                    pdf.ln(2)
                    pdf.set_x(pdf.l_margin)
                    pdf.image(chart_path, w=170)
                    pdf.ln(4)

        return bytes(pdf.output(dest="S"))
    except Exception as e:
        st.error(f"PDF generation failed: {e}")
        # Return simple fallback PDF
        pdf = PDF()
        pdf.add_page()
        pdf.set_font("Arial", "", 12)
        pdf.multi_cell(0, 6, "Report generation encountered an error. Please try again.")
        return bytes(pdf.output(dest="S"))
347
+
348
+ # ─── OPTIMIZED REPORT GENERATION ──────────────────────────────────────────
349
def _run_with_timeout(fn, timeout: float):
    """Run ``fn()`` in a worker thread and return its result within ``timeout`` seconds.

    Raises ``concurrent.futures.TimeoutError`` on timeout and re-raises any
    exception from ``fn``.  The executor is shut down with ``wait=False`` so
    that a timed-out worker is abandoned rather than waited for.
    """
    executor = ThreadPoolExecutor(max_workers=1)
    try:
        return executor.submit(fn).result(timeout=timeout)
    finally:
        executor.shutdown(wait=False)


def _build_report_prompt(df: pd.DataFrame, ctx: str) -> str:
    """Build the LLM prompt, including a bounded JSON profile of the data."""
    columns = [str(c) for c in df.columns][:20]
    profile = {
        "data_types": {str(col): str(dtype) for col, dtype in list(df.dtypes.items())[:20]},
        "sample_data": df.iloc[:, :20].head(10).to_dict('records'),
    }

    # Add numeric summary only if reasonable size
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    if 0 < len(numeric_cols) < 20:
        profile["numeric_summary"] = {
            str(col): {stat: float(val) for stat, val in stats.items()}
            for col, stats in df[numeric_cols].describe().to_dict().items()
        }

    # default=str keeps timestamps and other non-JSON values serialisable.
    profile_json = json.dumps(profile, default=str)
    cols = ", ".join(columns[:10])
    user_ctx = ctx or "General business analysis"

    return f"""
    Analyze this business dataset and create a professional executive report.

    **Dataset:** {df.shape[0]} rows, {df.shape[1]} columns
    **Columns:** {cols}
    **Context:** {user_ctx}
    **Data profile (JSON: dtypes, numeric summary, sample rows):**
    {profile_json}

    **Requirements:**
    1. Write in professional, executive-level language
    2. Include 3-5 key insights with specific data points
    3. Provide actionable recommendations
    4. Use maximum 3 chart tags: `<generate_chart: "chart_type | description">`
    5. Valid chart types: bar, pie, line, scatter, hist
    6. Keep total length under 2000 words

    **Structure:**
    ## Executive Summary
    [Brief overview of key findings]

    ## Key Insights
    [3-5 actionable insights with data support]

    ## Recommendations
    [Specific, actionable recommendations]

    Focus on business impact and practical insights.
    """


def _generate_report_charts(llm, df, chart_descs, charts, progress_bar, status_text) -> None:
    """Render one PNG per description and record it in ``charts`` (mutated in place)."""
    import shutil  # local import: only needed here

    # NOTE(security): allow_dangerous_code lets the agent execute
    # LLM-generated Python in this process.
    agent = create_pandas_dataframe_agent(
        llm=llm, df=df, verbose=False,
        allow_dangerous_code=True,
        max_iterations=3,
        early_stopping_method="generate"
    )

    # Locations where the agent-generated code may have saved its figure.
    agent_outputs = [
        Path("chart.png"),
        Path(tempfile.gettempdir()) / "chart.png",
    ]

    for i, desc in enumerate(chart_descs):
        chart_progress = 0.6 + (0.3 * (i + 1) / len(chart_descs))
        progress_bar.progress(chart_progress)
        status_text.text(f"Generating chart {i+1}/{len(chart_descs)}: {desc[:50]}...")

        # Remove leftovers so an old chart.png is never mistaken for this chart.
        safe_temp_cleanup(agent_outputs)

        generate_chart_with_timeout(agent, desc)

        chart_path = next((p for p in agent_outputs if p.exists()), None)
        if chart_path is None:
            chart_path = create_fallback_chart(df, desc)

        if chart_path and Path(chart_path).exists():
            perm_path = Path(tempfile.gettempdir()) / f"chart_{uuid.uuid4()}.png"
            # shutil.move also works when source and temp dir are on
            # different filesystems, where Path.rename raises OSError.
            shutil.move(str(chart_path), str(perm_path))
            charts[desc] = str(perm_path)

        cleanup_matplotlib()


def _embed_chart_previews(md: str, charts: Dict[str, str]) -> str:
    """Replace every chart tag that has a rendered PNG with an inline base64 ``<img>``."""
    def swap(match):
        path = charts.get(match.group("d").strip())
        if not path or not Path(path).exists():
            return match.group(0)  # leave unknown tags untouched
        try:
            b64_img = base64.b64encode(Path(path).read_bytes()).decode()
        except OSError:
            return match.group(0)
        return f'<img src="data:image/png;base64,{b64_img}" style="max-width: 100%;">'

    # Using TAG_RE (not a literal string) covers every tag spelling that
    # extract_chart_tags accepts.
    return TAG_RE.sub(swap, md)


def generate_report(buf: bytes, name: str, ctx: str, key: str) -> Optional[dict]:
    """Produce the report bundle (Markdown, HTML preview, PDF) for an upload.

    Args:
        buf: Raw bytes of the uploaded CSV/Excel file.
        name: Original file name (used to select the parser).
        ctx: Optional free-text business context supplied by the user.
        key: Identifier stored in the returned bundle.

    Returns:
        ``{"type": "report", "preview", "pdf", "report_md", "key"}`` on
        success, or ``None`` after showing an error in the UI.

    Behavioural fixes:
        * Timeouts are effective (worker threads are no longer joined).
        * The prompt now contains a bounded data profile; it was built
          before but never sent, so the model had no data to cite.
        * Preview substitution handles every tag form matched by ``TAG_RE``.
        * Chart files are moved with ``shutil.move``, stale ``chart.png``
          files are removed before each attempt, and temporary chart files
          are deleted on failure paths too.
    """
    progress_bar = st.progress(0)
    status_text = st.empty()
    charts: Dict[str, str] = {}

    try:
        # Step 1: Load data
        status_text.text("Loading and validating data...")
        progress_bar.progress(0.1)

        df, err = load_dataframe_safely(buf, name)
        if err:
            st.error(err)
            return None

        # Step 2: Initialize LLM
        status_text.text("Initializing AI models...")
        progress_bar.progress(0.2)

        try:
            llm = ChatGoogleGenerativeAI(
                model="gemini-2.0-flash",
                google_api_key=API_KEY,
                temperature=0.1,
                request_timeout=60
            )
        except Exception as e:
            st.error(f"Failed to initialize AI model: {e}")
            return None

        # Step 3: Profile the data for the prompt
        status_text.text("Analyzing data structure...")
        progress_bar.progress(0.3)
        report_prompt = _build_report_prompt(df, ctx)

        # Step 4: Generate report text
        status_text.text("Generating report content...")
        progress_bar.progress(0.4)

        try:
            md = _run_with_timeout(lambda: llm.invoke(report_prompt).content, REPORT_TIMEOUT)
        except TimeoutError:
            st.error("Report generation timed out. Please try with a smaller dataset.")
            return None
        except Exception as e:
            st.error(f"Report generation failed: {e}")
            return None

        # Step 5: Generate charts
        status_text.text("Generating charts...")
        progress_bar.progress(0.6)

        chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
        if chart_descs:
            try:
                _generate_report_charts(llm, df, chart_descs, charts, progress_bar, status_text)
            except Exception as e:
                # Keep whatever charts were finished and continue.
                st.warning(f"Chart generation encountered issues: {e}")

        # Step 6: Build preview and PDF
        status_text.text("Building PDF...")
        progress_bar.progress(0.9)

        try:
            preview = _embed_chart_previews(md, charts)
            pdf_bytes = build_pdf(md, charts)

            progress_bar.progress(1.0)
            status_text.text("Report generated successfully!")

            return {
                "type": "report",
                "preview": preview,
                "pdf": pdf_bytes,
                "report_md": md,
                "key": key,
            }
        except Exception as e:
            st.error(f"PDF generation failed: {e}")
            return None

    except Exception as e:
        st.error(f"Report generation failed: {e}")
        return None
    finally:
        # Chart PNGs are only needed while building the preview/PDF.
        safe_temp_cleanup([Path(p) for p in charts.values()])
        progress_bar.empty()
        status_text.empty()
        cleanup_matplotlib()
        gc.collect()
548
+
549
+ # ─── VIDEO GENERATION (SIMPLIFIED) ────────────────────────────────────────
550
def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path, fps: int = FPS) -> str:
    """Write a video that fades from white into ``img_cv2``.

    Args:
        img_cv2: Frame to fade in; resized to ``WIDTH`` x ``HEIGHT`` when its
            size differs, because the writer is opened with that frame size.
            NOTE(review): presumably a BGR uint8 image, as produced by the
            visible caller — confirm for other callers.
        dur: Clip length in seconds; at least one second's worth of frames
            (``fps``) is always written.
        out: Destination file.
        fps: Frames per second.

    Returns:
        ``str(out)`` in all cases (a warning is shown on failure), matching
        the previous contract.  Callers should check that the file exists.

    Fixes: the writer is released even when an error occurs, a writer that
    failed to open is reported instead of silently producing nothing, and the
    blend factor now reaches 1.0 so the last frame is the unblended image.
    """
    vid = None
    try:
        frames = max(int(dur * fps), fps)

        if img_cv2.shape[1] != WIDTH or img_cv2.shape[0] != HEIGHT:
            img_cv2 = cv2.resize(img_cv2, (WIDTH, HEIGHT))

        vid = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"), fps, (WIDTH, HEIGHT))
        if not vid.isOpened():
            raise RuntimeError(f"could not open video writer for {out}")

        blank = np.full_like(img_cv2, 255)
        last_index = max(frames - 1, 1)  # avoid division by zero for a single frame

        for i in range(frames):
            a = min(i / last_index, 1.0)
            blended = cv2.addWeighted(blank, 1 - a, img_cv2, a, 0)
            vid.write(blended)

        return str(out)
    except Exception as e:
        st.warning(f"Video animation failed: {e}")
        return str(out)
    finally:
        if vid is not None:
            vid.release()
567
+
568
def generate_video(buf: bytes, name: str, ctx: str, key: str) -> Optional[str]:
    """Create the simplified video: one generated image fading in over 10 seconds.

    Args:
        buf: Raw bytes of the uploaded CSV/Excel file (validated only).
        name: Original file name (used to select the parser).
        ctx: Optional business context; used in the image prompt.
        key: Used as the output file name (``<tempdir>/<key>.mp4``).

    Returns:
        Path of the written video as ``str``, or ``None`` after showing an
        error in the UI.

    Changes:
        * The FFmpeg availability gate was removed.  This code path encodes
          with OpenCV only, so the gate could only cause false failures on
          hosts without an ``ffmpeg`` binary.
        * The output file is verified; previously a path was returned even
          when nothing had been written.
    """
    df, err = load_dataframe_safely(buf, name)
    if err:
        st.error(err)
        return None

    # Simplified video generation for better performance
    st.info("🎬 Video generation is simplified for better performance")

    try:
        img = generate_image_from_prompt(f"Business data visualization for {ctx or 'data analysis'}")
        img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)

        video_path = Path(tempfile.gettempdir()) / f"{key}.mp4"
        animate_image_fade(img_cv, 10.0, video_path)

        # animate_image_fade reports failures only through a UI warning.
        if not video_path.exists() or video_path.stat().st_size == 0:
            st.error("Video generation failed: no output file was written.")
            return None

        return str(video_path)
    except Exception as e:
        st.error(f"Video generation failed: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
597
 
598
+ # ─── STREAMLIT UI ─────────────────────────────────────────────────────────
599
+ def main():
600
+ """Main application function"""
601
+
602
+ # Mode selection
603
+ mode = st.radio("Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True)
604
+
605
+ # File upload
606
+ upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
607
+
608
+ if upl:
609
+ # Show data preview
610
+ with st.spinner("Loading data preview..."):
611
+ df_prev, load_err = load_dataframe_safely(upl.getvalue(), upl.name)
612
+
613
+ if load_err:
614
+ st.error(f"Error loading file: {load_err}")
615
+ else:
616
+ with st.expander("📊 Data Preview", expanded=False):
617
+ st.info(f"Shape: {df_prev.shape[0]} rows × {df_prev.shape[1]} columns")
618
+ st.dataframe(arrow_df(df_prev), use_container_width=True)
619
+
620
+ # Context input
621
+ ctx = st.text_area(
622
+ "Business context or specific instructions (optional)",
623
+ help="Provide context about your data or specific analysis requirements"
624
+ )
625
+
626
+ # Generate button
627
+ if st.button("🚀 Generate", type="primary", disabled=not upl):
628
+ key = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))
629
+
630
  if mode == "Report (PDF)":
631
  st.session_state.bundle = generate_report(upl.getvalue(), upl.name, ctx, key)
632
  else:
 
634
  path = generate_video(upl.getvalue(), upl.name, ctx, key)
635
  if path:
636
  st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
637
+
638
+ st.rerun()
639
+
640
+ # Display results
641
+ if bundle := st.session_state.get("bundle"):
642
+ if bundle["type"] == "report":
643
+ st.subheader("📄 Generated Report")
644
+
645
+ # Report preview
646
+ with st.expander("📖 View Report", expanded=True):
647
+ st.markdown(bundle["preview"], unsafe_allow_html=True)
648
+
649
+ # Download options
650
+ col1, col2 = st.columns(2)
651
+ with col1:
652
+ st.download_button(
653
+ "📥 Download PDF",
654
+ bundle["pdf"],
655
+ "business_report.pdf",
656
+ "application/pdf",
657
+ use_container_width=True
658
+ )
659
+
660
+ with col2:
661
+ if DG_KEY and st.button("🔊 Narrate Summary", use_container_width=True):
662
+ with st.spinner("Generating narration..."):
663
+ txt = re.sub(r"<[^>]+>", "", bundle["report_md"])
664
+ audio, mime = deepgram_tts(txt)
665
+ if audio:
666
+ st.audio(audio, format=mime)
667
+ else:
668
+ st.error("Narration failed.")
669
+
670
+ elif bundle["type"] == "video":
671
+ st.subheader("🎬 Generated Video Narrative")
672
+ vp = bundle["video_path"]
673
+
674
+ if Path(vp).exists():
675
+ with open(vp, "rb") as f:
676
+ st.video(f.read())
677
+
678
+ with open(vp, "rb") as f:
679
+ st.download_button(
680
+ "📥 Download Video",
681
+ f,
682
+ f"sozo_narrative_{bundle['key'][:8]}.mp4",
683
+ "video/mp4",
684
+ use_container_width=True
685
+ )
686
+ else:
687
+ st.error("Video file missing – generation failed.")
688
+
689
+ if __name__ == "__main__":
690
+ main()