rairo commited on
Commit
4a65224
·
verified ·
1 Parent(s): 1acf113

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +436 -567
app.py CHANGED
@@ -1,17 +1,16 @@
1
  ##############################################################################
2
- # Sozo Business Studio · 10-Jul-2025 (Performance Fixed) #
3
- # Fixed report generation freezing issues #
4
- # Optimized memory usage and resource management #
5
- # Added proper error handling and timeouts #
6
- # Improved chart generation with fallback strategies #
7
- # Enhanced progress tracking and user feedback #
 
8
  ##############################################################################
9
 
10
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
11
- import time, gc, threading
12
  from pathlib import Path
13
- from typing import Tuple, Dict, List, Optional
14
- from concurrent.futures import ThreadPoolExecutor, TimeoutError
15
 
16
  import streamlit as st
17
  import pandas as pd
@@ -28,663 +27,533 @@ import cv2
28
  from langchain_experimental.agents import create_pandas_dataframe_agent
29
  from langchain_google_genai import ChatGoogleGenerativeAI
30
  from google import genai
31
- from google.genai import types
32
 
33
  # ─── CONFIG ────────────────────────────────────────────────────────────────
 
34
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
35
  st.title("📊 Sozo Business Studio")
36
  st.caption("AI transforms business data into compelling narratives.")
37
 
38
- FPS, WIDTH, HEIGHT = 24, 1280, 720
39
  MAX_CHARTS, VIDEO_SCENES = 5, 5
40
- CHART_TIMEOUT = 30 # seconds
41
- REPORT_TIMEOUT = 120 # seconds
42
 
43
  API_KEY = os.getenv("GEMINI_API_KEY")
44
  if not API_KEY:
45
- st.error("⚠️ GEMINI_API_KEY is not set.")
46
- st.stop()
47
 
48
- try:
49
- GEM = genai.Client(api_key=API_KEY)
50
- except Exception as e:
51
- st.error(f"⚠️ Failed to initialize Gemini client: {e}")
52
- st.stop()
53
 
54
- DG_KEY = os.getenv("DEEPGRAM_API_KEY")
55
  st.session_state.setdefault("bundle", None)
 
 
 
 
 
 
 
 
56
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
57
 
58
- # ─── MEMORY MANAGEMENT ─────────────────────────────────────────────────────
59
- def cleanup_matplotlib():
60
- """Clean up matplotlib resources to prevent memory leaks"""
61
- plt.close('all')
62
- plt.clf()
63
- plt.cla()
64
- gc.collect()
65
-
66
- def safe_temp_cleanup(temp_files: List[Path]):
67
- """Safely clean up temporary files"""
68
- for temp_file in temp_files:
69
- try:
70
- if temp_file.exists():
71
- temp_file.unlink()
72
- except Exception:
73
- pass
74
 
75
- # ─── ENHANCED HELPERS ──────────────────────────────────────────────────────
76
  def load_dataframe_safely(buf: bytes, name: str) -> Tuple[pd.DataFrame, str]:
77
- """Load CSV/Excel with enhanced error handling and size limits"""
78
  try:
79
- # Check file size (limit to 50MB)
80
- if len(buf) > 50 * 1024 * 1024:
81
- return None, "File too large (max 50MB)"
82
-
83
  ext = Path(name).suffix.lower()
84
-
85
- # Use smaller chunk size for large files
86
- if ext in (".xlsx", ".xls"):
87
- df = pd.read_excel(io.BytesIO(buf), engine='openpyxl' if ext == '.xlsx' else 'xlrd')
88
- else:
89
- df = pd.read_csv(io.BytesIO(buf), encoding='utf-8', on_bad_lines='skip')
90
-
91
- # Basic data validation
92
  df.columns = df.columns.astype(str).str.strip()
93
- df = df.dropna(how="all").reset_index(drop=True)
94
-
95
- # Limit rows for performance
96
- if len(df) > 10000:
97
- df = df.head(10000)
98
- st.warning("⚠️ Dataset truncated to 10,000 rows for performance")
99
-
100
  if df.empty or len(df.columns) == 0:
101
  raise ValueError("No usable data found")
102
-
103
  return df, None
104
  except Exception as e:
105
- return None, f"Error loading file: {str(e)}"
106
 
107
  def arrow_df(df: pd.DataFrame) -> pd.DataFrame:
108
- """Convert for Streamlit Arrow renderer with memory optimization"""
109
- # Create a copy with limited rows for preview
110
- safe = df.head(1000).copy()
111
  for c in safe.columns:
112
  if safe[c].dtype.name in ("Int64", "Float64", "Boolean"):
113
  safe[c] = safe[c].astype(safe[c].dtype.name.lower())
114
  return safe
115
 
116
- @st.cache_data(show_spinner=False, ttl=3600)
117
  def deepgram_tts(txt: str) -> Tuple[bytes, str]:
118
- """Cached audio narration with timeout"""
119
  if not DG_KEY or not txt:
120
  return None, None
121
-
122
  txt = re.sub(r"[^\w\s.,!?;:-]", "", txt)[:1000]
123
  try:
124
  r = requests.post(
125
  "https://api.deepgram.com/v1/speak",
126
  params={"model": "aura-2-andromeda-en"},
127
  headers={"Authorization": f"Token {DG_KEY}", "Content-Type": "application/json"},
128
- json={"text": txt},
129
- timeout=15 # Reduced timeout
130
- )
131
  r.raise_for_status()
132
  return r.content, r.headers.get("Content-Type", "audio/mpeg")
133
  except Exception:
134
  return None, None
135
 
136
  def generate_silence_mp3(duration: float, out: Path):
137
- """Generate silence with error handling"""
138
- try:
139
- subprocess.run(
140
- ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
141
- "-t", f"{duration:.3f}", "-q:a", "9", str(out)],
142
- check=True, capture_output=True, timeout=30
143
- )
144
- except Exception as e:
145
- st.warning(f"Failed to generate silence: {e}")
146
 
147
  def audio_duration(path: str) -> float:
148
- """Get audio duration with fallback"""
149
  try:
150
  res = subprocess.run(
151
  ["ffprobe", "-v", "error", "-show_entries", "format=duration",
152
  "-of", "default=nw=1:nk=1", path],
153
- text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE,
154
- check=True, timeout=10
155
- )
156
  return float(res.stdout.strip())
157
  except Exception:
158
  return 5.0
159
 
160
- # ─── CHART GENERATION WITH TIMEOUT ────────────────────────────────────────
161
  TAG_RE = re.compile(
162
  r'[<[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>"\'\]]+?)["\']?\s*[>\]]',
163
  re.I)
 
 
164
 
165
- def extract_chart_tags(t: str) -> List[str]:
166
- """Extract chart tags with deduplication"""
167
- if not t:
168
- return []
169
- tags = [m.group("d").strip() for m in TAG_RE.finditer(t)]
170
- return list(dict.fromkeys(tags)) # Remove duplicates while preserving order
171
-
172
- re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)
173
 
174
  def clean_narration(txt: str) -> str:
175
- """Clean narration text"""
176
- if not txt:
177
- return ""
178
  txt = re_scene.sub("", txt)
179
  txt = TAG_RE.sub("", txt)
180
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
181
  txt = re.sub(r"\s{2,}", " ", txt).strip()
182
  return txt
183
 
184
- def generate_chart_with_timeout(agent, description: str, timeout: int = CHART_TIMEOUT) -> Optional[str]:
185
- """Generate chart with timeout and fallback"""
186
- def chart_worker():
187
- try:
188
- cleanup_matplotlib()
189
-
190
- # Enhanced chart generation prompt
191
- chart_prompt = f"""
192
- Create a {description} chart using matplotlib with these requirements:
193
- 1. Use plt.figure(figsize=(12, 8)) for consistent sizing
194
- 2. Apply a clean, professional style: plt.style.use('seaborn-v0_8')
195
- 3. Include proper title, axis labels, and legends
196
- 4. Use professional color palette
197
- 5. Ensure readable fonts (size 12+)
198
- 6. Handle missing values by dropping or filling them
199
- 7. Save with: plt.savefig('chart.png', dpi=300, bbox_inches='tight', facecolor='white')
200
- 8. Always call plt.close() after saving
201
-
202
- Important: Only use columns that exist in the dataframe. If a column doesn't exist, use the closest available column.
203
- """
204
-
205
- result = agent.run(chart_prompt)
206
- return result
207
- except Exception as e:
208
- st.warning(f"Chart generation failed: {e}")
209
- return None
210
-
211
- try:
212
- with ThreadPoolExecutor(max_workers=1) as executor:
213
- future = executor.submit(chart_worker)
214
- result = future.result(timeout=timeout)
215
- return result
216
- except TimeoutError:
217
- st.warning(f"Chart generation timed out after {timeout} seconds")
218
- return None
219
- except Exception as e:
220
- st.warning(f"Chart generation error: {e}")
221
- return None
222
- finally:
223
- cleanup_matplotlib()
224
 
225
- def create_fallback_chart(df: pd.DataFrame, description: str) -> Optional[str]:
226
- """Create a simple fallback chart"""
227
- try:
228
- cleanup_matplotlib()
229
-
230
- fig, ax = plt.subplots(figsize=(12, 8))
231
-
232
- # Simple fallback based on data types
233
- numeric_cols = df.select_dtypes(include=[np.number]).columns
234
- categorical_cols = df.select_dtypes(include=['object']).columns
235
-
236
- if len(numeric_cols) >= 2:
237
- # Scatter plot
238
- ax.scatter(df[numeric_cols[0]], df[numeric_cols[1]], alpha=0.6)
239
- ax.set_xlabel(numeric_cols[0])
240
- ax.set_ylabel(numeric_cols[1])
241
- ax.set_title(f"Scatter Plot: {description}")
242
- elif len(numeric_cols) == 1:
243
- # Histogram
244
- ax.hist(df[numeric_cols[0]].dropna(), bins=20, alpha=0.7)
245
- ax.set_xlabel(numeric_cols[0])
246
- ax.set_ylabel('Frequency')
247
- ax.set_title(f"Distribution: {description}")
248
- else:
249
- # Simple text chart
250
- ax.text(0.5, 0.5, f"Chart: {description}\nData available",
251
- ha='center', va='center', fontsize=16)
252
- ax.set_xlim(0, 1)
253
- ax.set_ylim(0, 1)
254
- ax.set_title(description)
255
-
256
- plt.tight_layout()
257
-
258
- # Save to temporary file
259
- temp_path = Path(tempfile.gettempdir()) / f"fallback_{uuid.uuid4()}.png"
260
- plt.savefig(temp_path, dpi=300, bbox_inches="tight", facecolor="white")
261
- plt.close(fig)
262
-
263
- return str(temp_path)
264
- except Exception as e:
265
- st.warning(f"Fallback chart creation failed: {e}")
266
- return None
267
- finally:
268
- cleanup_matplotlib()
269
-
270
- # ─── IMAGE GENERATION WITH FALLBACK ───────────────────────────────────────
271
  def placeholder_img() -> Image.Image:
272
- """Create placeholder image"""
273
  return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
274
 
275
- def generate_image_from_prompt(prompt: str, timeout: int = 30) -> Image.Image:
276
- """Generate image with timeout and fallback"""
277
- def image_worker():
278
- model_main = "gemini-2.0-flash-exp-image-generation"
279
- model_fallback = "gemini-2.0-flash-preview-image-generation"
280
- full_prompt = "A clean business-presentation illustration: " + prompt
281
 
282
- def fetch(model_name):
 
283
  res = GEM.models.generate_content(
284
  model=model_name,
285
  contents=full_prompt,
286
- config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
287
  )
288
  for part in res.candidates[0].content.parts:
289
  if getattr(part, "inline_data", None):
290
  return Image.open(io.BytesIO(part.inline_data.data)).convert("RGB")
291
  return None
292
-
293
- try:
294
- img = fetch(model_main) or fetch(model_fallback)
295
- return img if img else placeholder_img()
296
  except Exception:
297
- return placeholder_img()
298
-
299
- try:
300
- with ThreadPoolExecutor(max_workers=1) as executor:
301
- future = executor.submit(image_worker)
302
- return future.result(timeout=timeout)
303
- except TimeoutError:
304
- st.warning(f"Image generation timed out after {timeout} seconds")
305
- return placeholder_img()
306
- except Exception:
307
- return placeholder_img()
308
 
309
- # ─── OPTIMIZED PDF GENERATION ─────────────────────────────────────────────
310
  class PDF(FPDF, HTMLMixin):
311
- def header(self):
312
- self.set_font('Arial', 'B', 16)
313
- self.cell(0, 10, 'Sozo Business Report', 0, 1, 'C')
314
- self.ln(5)
315
-
316
- def footer(self):
317
- self.set_y(-15)
318
- self.set_font('Arial', 'I', 8)
319
- self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
320
 
321
  def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
322
- """Build PDF with error handling"""
323
- try:
324
- # Convert markdown to HTML with chart substitution
325
- html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(
326
- TAG_RE.sub(lambda m: f'<img src="{charts.get(m.group("d").strip(), "")}" width="400">', md)
327
- )
328
-
329
- pdf = PDF()
330
- pdf.set_auto_page_break(True, margin=15)
331
- pdf.add_page()
332
- pdf.set_font("Arial", "", 11)
333
-
334
- # Simple text conversion (avoid complex HTML)
335
- text_content = re.sub(r'<[^>]+>', '', html)
336
- pdf.multi_cell(0, 6, text_content)
337
-
338
- return bytes(pdf.output(dest="S"))
339
- except Exception as e:
340
- st.error(f"PDF generation failed: {e}")
341
- # Return simple fallback PDF
342
- pdf = PDF()
343
- pdf.add_page()
344
- pdf.set_font("Arial", "", 12)
345
- pdf.multi_cell(0, 6, "Report generation encountered an error. Please try again.")
346
- return bytes(pdf.output(dest="S"))
347
-
348
- # ─── OPTIMIZED REPORT GENERATION ──────────────────────────────────────────
349
- def generate_report(buf: bytes, name: str, ctx: str, key: str) -> Optional[dict]:
350
- """Generate report with improved error handling and timeouts"""
351
- progress_bar = st.progress(0)
352
- status_text = st.empty()
353
 
354
- try:
355
- # Step 1: Load data
356
- status_text.text("Loading and validating data...")
357
- progress_bar.progress(0.1)
358
-
359
- df, err = load_dataframe_safely(buf, name)
360
- if err:
361
- st.error(err)
362
- return None
363
-
364
- # Step 2: Initialize LLM
365
- status_text.text("Initializing AI models...")
366
- progress_bar.progress(0.2)
367
-
368
- try:
369
- llm = ChatGoogleGenerativeAI(
370
- model="gemini-2.0-flash",
371
- google_api_key=API_KEY,
372
- temperature=0.1,
373
- request_timeout=60
374
- )
375
- except Exception as e:
376
- st.error(f"Failed to initialize AI model: {e}")
377
- return None
378
-
379
- # Step 3: Create context (limit size)
380
- status_text.text("Analyzing data structure...")
381
- progress_bar.progress(0.3)
382
-
383
- # Limit context size to prevent memory issues
384
- sample_size = min(100, len(df))
385
- ctx_dict = {
386
- "shape": df.shape,
387
- "columns": list(df.columns)[:20], # Limit columns
388
- "user_ctx": ctx or "General business analysis",
389
- "sample_data": df.head(sample_size).to_dict('records')[:10], # Small sample
390
- "data_types": {col: str(dtype) for col, dtype in df.dtypes.to_dict().items()},
391
- }
392
-
393
- # Add numeric summary only if reasonable size
394
- numeric_cols = df.select_dtypes(include=[np.number]).columns
395
- if len(numeric_cols) > 0 and len(numeric_cols) < 20:
396
- ctx_dict["numeric_summary"] = {
397
- col: {stat: float(val) for stat, val in stats.items()}
398
- for col, stats in df[numeric_cols].describe().to_dict().items()
399
- }
400
-
401
- # Step 4: Generate report
402
- status_text.text("Generating report content...")
403
- progress_bar.progress(0.4)
404
-
405
- cols = ", ".join(ctx_dict["columns"][:10])
406
-
407
- report_prompt = f"""
408
- Analyze this business dataset and create a professional executive report.
409
-
410
- **Dataset:** {ctx_dict["shape"][0]} rows, {ctx_dict["shape"][1]} columns
411
- **Columns:** {cols}
412
- **Context:** {ctx_dict["user_ctx"]}
413
-
414
- **Requirements:**
415
- 1. Write in professional, executive-level language
416
- 2. Include 3-5 key insights with specific data points
417
- 3. Provide actionable recommendations
418
- 4. Use maximum 3 chart tags: `<generate_chart: "chart_type | description">`
419
- 5. Valid chart types: bar, pie, line, scatter, hist
420
- 6. Keep total length under 2000 words
421
-
422
- **Structure:**
423
- ## Executive Summary
424
- [Brief overview of key findings]
425
-
426
- ## Key Insights
427
- [3-5 actionable insights with data support]
428
-
429
- ## Recommendations
430
- [Specific, actionable recommendations]
431
-
432
- Focus on business impact and practical insights.
433
- """
434
-
435
- try:
436
- with ThreadPoolExecutor(max_workers=1) as executor:
437
- future = executor.submit(lambda: llm.invoke(report_prompt).content)
438
- md = future.result(timeout=REPORT_TIMEOUT)
439
- except TimeoutError:
440
- st.error("Report generation timed out. Please try with a smaller dataset.")
441
- return None
442
- except Exception as e:
443
- st.error(f"Report generation failed: {e}")
444
- return None
445
-
446
- # Step 5: Generate charts
447
- status_text.text("Generating charts...")
448
- progress_bar.progress(0.6)
449
-
450
- chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
451
- charts: Dict[str, str] = {}
452
- temp_files: List[Path] = []
453
-
454
- if chart_descs:
455
- try:
456
- agent = create_pandas_dataframe_agent(
457
- llm=llm, df=df, verbose=False,
458
- allow_dangerous_code=True,
459
- max_iterations=3,
460
- early_stopping_method="generate"
461
- )
462
-
463
- for i, desc in enumerate(chart_descs):
464
- chart_progress = 0.6 + (0.3 * (i + 1) / len(chart_descs))
465
- progress_bar.progress(chart_progress)
466
- status_text.text(f"Generating chart {i+1}/{len(chart_descs)}: {desc[:50]}...")
467
-
468
- # Try agent-based chart generation
469
- result = generate_chart_with_timeout(agent, desc)
470
-
471
- # Check if matplotlib saved a file
472
- chart_path = None
473
- potential_paths = [
474
- Path("chart.png"),
475
- Path(tempfile.gettempdir()) / "chart.png",
476
- ]
477
-
478
- for path in potential_paths:
479
- if path.exists():
480
- chart_path = path
481
- break
482
-
483
- # If no chart was generated, create fallback
484
- if not chart_path:
485
- chart_path = create_fallback_chart(df, desc)
486
-
487
- if chart_path and Path(chart_path).exists():
488
- # Move to permanent temp location
489
- perm_path = Path(tempfile.gettempdir()) / f"chart_{uuid.uuid4()}.png"
490
- Path(chart_path).rename(perm_path)
491
- charts[desc] = str(perm_path)
492
- temp_files.append(perm_path)
493
-
494
- cleanup_matplotlib()
495
-
496
- except Exception as e:
497
- st.warning(f"Chart generation encountered issues: {e}")
498
- # Continue without charts
499
-
500
- # Step 6: Build PDF
501
- status_text.text("Building PDF...")
502
- progress_bar.progress(0.9)
503
-
504
  try:
505
- # Create preview with base64 encoded images
506
- preview = md
507
- for desc, path in charts.items():
508
- if Path(path).exists():
509
- try:
510
- img_bytes = Path(path).read_bytes()
511
- b64_img = base64.b64encode(img_bytes).decode()
512
- preview = preview.replace(
513
- f'<generate_chart: "{desc}">',
514
- f'<img src="data:image/png;base64,{b64_img}" style="max-width: 100%;">'
515
- )
516
- except Exception:
517
- pass
518
-
519
- pdf_bytes = build_pdf(md, charts)
520
-
521
- # Clean up temporary files
522
- safe_temp_cleanup(temp_files)
523
-
524
- progress_bar.progress(1.0)
525
- status_text.text("Report generated successfully!")
526
-
527
- return {
528
- "type": "report",
529
- "preview": preview,
530
- "pdf": pdf_bytes,
531
- "report_md": md,
532
- "key": key,
533
- }
534
 
 
 
 
 
 
 
 
 
 
 
535
  except Exception as e:
536
- st.error(f"PDF generation failed: {e}")
537
- return None
538
-
539
- except Exception as e:
540
- st.error(f"Report generation failed: {e}")
541
- return None
542
- finally:
543
- # Clean up UI elements
544
- progress_bar.empty()
545
- status_text.empty()
546
- cleanup_matplotlib()
547
- gc.collect()
548
-
549
- # ─── VIDEO GENERATION (SIMPLIFIED) ────────────────────────────────────────
550
  def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path, fps: int = FPS) -> str:
551
- """Animate image with fade effect"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
552
  try:
553
- frames = max(int(dur * fps), fps)
554
- vid = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"), fps, (WIDTH, HEIGHT))
555
- blank = np.full_like(img_cv2, 255)
556
-
557
- for i in range(frames):
558
- a = i / frames
559
- blended = cv2.addWeighted(blank, 1 - a, img_cv2, a, 0)
560
- vid.write(blended)
561
-
562
- vid.release()
563
- return str(out)
564
  except Exception as e:
565
- st.warning(f"Video animation failed: {e}")
566
- return str(out)
567
-
568
- def generate_video(buf: bytes, name: str, ctx: str, key: str) -> Optional[str]:
569
- """Generate video with simplified approach"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
570
  try:
571
- # Check FFmpeg availability
572
  subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
573
  except Exception:
574
- st.error("🔴 FFmpeg not available — cannot render video.")
575
- return None
576
-
577
  df, err = load_dataframe_safely(buf, name)
578
  if err:
579
- st.error(err)
580
- return None
581
-
582
- # Simplified video generation for better performance
583
- st.info("🎬 Video generation is simplified for better performance")
584
-
585
- try:
586
- # Create a simple video with data visualization
587
- img = generate_image_from_prompt(f"Business data visualization for {ctx or 'data analysis'}")
588
- img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
589
-
590
- video_path = Path(tempfile.gettempdir()) / f"{key}.mp4"
591
- animate_image_fade(img_cv, 10.0, video_path)
592
-
593
- return str(video_path)
594
- except Exception as e:
595
- st.error(f"Video generation failed: {e}")
596
- return None
 
 
 
 
 
 
 
 
597
 
598
- # ─── STREAMLIT UI ─────────────────────────────────────────────────────────
599
- def main():
600
- """Main application function"""
601
-
602
- # Mode selection
603
- mode = st.radio("Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True)
604
-
605
- # File upload
606
- upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])
607
-
608
- if upl:
609
- # Show data preview
610
- with st.spinner("Loading data preview..."):
611
- df_prev, load_err = load_dataframe_safely(upl.getvalue(), upl.name)
612
-
613
- if load_err:
614
- st.error(f"Error loading file: {load_err}")
615
  else:
616
- with st.expander("📊 Data Preview", expanded=False):
617
- st.info(f"Shape: {df_prev.shape[0]} rows × {df_prev.shape[1]} columns")
618
- st.dataframe(arrow_df(df_prev), use_container_width=True)
619
-
620
- # Context input
621
- ctx = st.text_area(
622
- "Business context or specific instructions (optional)",
623
- help="Provide context about your data or specific analysis requirements"
624
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
625
 
626
- # Generate button
627
- if st.button("🚀 Generate", type="primary", disabled=not upl):
628
- key = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))
629
-
630
- if mode == "Report (PDF)":
631
- st.session_state.bundle = generate_report(upl.getvalue(), upl.name, ctx, key)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
  else:
633
- st.session_state.bundle = None
634
- path = generate_video(upl.getvalue(), upl.name, ctx, key)
635
- if path:
636
- st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
637
-
638
- st.rerun()
639
-
640
- # Display results
641
- if bundle := st.session_state.get("bundle"):
642
- if bundle["type"] == "report":
643
- st.subheader("📄 Generated Report")
644
-
645
- # Report preview
646
- with st.expander("📖 View Report", expanded=True):
647
- st.markdown(bundle["preview"], unsafe_allow_html=True)
648
-
649
- # Download options
650
- col1, col2 = st.columns(2)
651
- with col1:
652
- st.download_button(
653
- "📥 Download PDF",
654
- bundle["pdf"],
655
- "business_report.pdf",
656
- "application/pdf",
657
- use_container_width=True
658
- )
659
-
660
- with col2:
661
- if DG_KEY and st.button("🔊 Narrate Summary", use_container_width=True):
662
- with st.spinner("Generating narration..."):
663
- txt = re.sub(r"<[^>]+>", "", bundle["report_md"])
664
- audio, mime = deepgram_tts(txt)
665
- if audio:
666
- st.audio(audio, format=mime)
667
- else:
668
- st.error("Narration failed.")
669
-
670
- elif bundle["type"] == "video":
671
- st.subheader("🎬 Generated Video Narrative")
672
- vp = bundle["video_path"]
673
-
674
- if Path(vp).exists():
675
- with open(vp, "rb") as f:
676
- st.video(f.read())
677
-
678
- with open(vp, "rb") as f:
679
- st.download_button(
680
- "📥 Download Video",
681
- f,
682
- f"sozo_narrative_{bundle['key'][:8]}.mp4",
683
- "video/mp4",
684
- use_container_width=True
685
- )
686
- else:
687
- st.error("Video file missing – generation failed.")
688
-
689
- if __name__ == "__main__":
690
- main()
 
1
  ##############################################################################
2
+ # Sozo Business Studio · 10-Jul-2025 (full drop-in)
3
+ # Restores PDF branch alongside fixed Video branch
4
+ # Shared chart-tag grammar across both paths
5
+ # Narrator text cleans scene labels + chart talk
6
+ # Matplotlib animation starts from blank; artists returned (blit=True)
7
+ # Gemini Flash-preview image gen with placeholder fallback
8
+ # • Silent-audio fallback keeps mux lengths equal
9
  ##############################################################################
10
 
11
  import os, re, json, hashlib, uuid, base64, io, tempfile, requests, subprocess
 
12
  from pathlib import Path
13
+ from typing import Tuple, Dict, List
 
14
 
15
  import streamlit as st
16
  import pandas as pd
 
27
  from langchain_experimental.agents import create_pandas_dataframe_agent
28
  from langchain_google_genai import ChatGoogleGenerativeAI
29
  from google import genai
30
+ from google.genai import types # for GenerateContentConfig
31
 
32
  # ─── CONFIG ────────────────────────────────────────────────────────────────
33
+
34
  st.set_page_config(page_title="Sozo Business Studio", layout="wide")
35
  st.title("📊 Sozo Business Studio")
36
  st.caption("AI transforms business data into compelling narratives.")
37
 
38
+ FPS, WIDTH, HEIGHT = 24, 1280, 720
39
  MAX_CHARTS, VIDEO_SCENES = 5, 5
 
 
40
 
41
  API_KEY = os.getenv("GEMINI_API_KEY")
42
  if not API_KEY:
43
+ st.error("⚠️ GEMINI_API_KEY is not set."); st.stop()
44
+ GEM = genai.Client(api_key=API_KEY)
45
 
46
+ DG_KEY = os.getenv("DEEPGRAM_API_KEY") # optional for narration
 
 
 
 
47
 
48
+ # --- IMPROVED: State management for an interactive, non-freezing UI ---
49
  st.session_state.setdefault("bundle", None)
50
+ st.session_state.setdefault("report_md", None)
51
+ st.session_state.setdefault("chart_descs", [])
52
+ st.session_state.setdefault("generated_charts", {}) # Dict[desc, base64_string]
53
+ st.session_state.setdefault("pdf_bytes", None)
54
+ st.session_state.setdefault("df", None)
55
+ st.session_state.setdefault("current_file_key", None)
56
+
57
+
58
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
59
 
60
+ # ─── HELPERS ───────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
 
62
  def load_dataframe_safely(buf: bytes, name: str) -> Tuple[pd.DataFrame, str]:
63
+ """Load CSV/Excel, return (df, err)."""
64
  try:
 
 
 
 
65
  ext = Path(name).suffix.lower()
66
+ df = (pd.read_excel if ext in (".xlsx", ".xls") else pd.read_csv)(io.BytesIO(buf))
 
 
 
 
 
 
 
67
  df.columns = df.columns.astype(str).str.strip()
68
+ df = df.dropna(how="all")
 
 
 
 
 
 
69
  if df.empty or len(df.columns) == 0:
70
  raise ValueError("No usable data found")
 
71
  return df, None
72
  except Exception as e:
73
+ return None, str(e)
74
 
75
  def arrow_df(df: pd.DataFrame) -> pd.DataFrame:
76
+ """Convert for Streamlit Arrow renderer."""
77
+ safe = df.copy()
 
78
  for c in safe.columns:
79
  if safe[c].dtype.name in ("Int64", "Float64", "Boolean"):
80
  safe[c] = safe[c].astype(safe[c].dtype.name.lower())
81
  return safe
82
 
83
+ @st.cache_data(show_spinner=False)
84
  def deepgram_tts(txt: str) -> Tuple[bytes, str]:
85
+ """Optional audio narration."""
86
  if not DG_KEY or not txt:
87
  return None, None
 
88
  txt = re.sub(r"[^\w\s.,!?;:-]", "", txt)[:1000]
89
  try:
90
  r = requests.post(
91
  "https://api.deepgram.com/v1/speak",
92
  params={"model": "aura-2-andromeda-en"},
93
  headers={"Authorization": f"Token {DG_KEY}", "Content-Type": "application/json"},
94
+ json={"text": txt}, timeout=30)
 
 
95
  r.raise_for_status()
96
  return r.content, r.headers.get("Content-Type", "audio/mpeg")
97
  except Exception:
98
  return None, None
99
 
100
  def generate_silence_mp3(duration: float, out: Path):
101
+ subprocess.run(
102
+ ["ffmpeg", "-y", "-f", "lavfi", "-i", "anullsrc=r=44100:cl=mono",
103
+ "-t", f"{duration:.3f}", "-q:a", "9", str(out)],
104
+ check=True, capture_output=True)
 
 
 
 
 
105
 
106
  def audio_duration(path: str) -> float:
 
107
  try:
108
  res = subprocess.run(
109
  ["ffprobe", "-v", "error", "-show_entries", "format=duration",
110
  "-of", "default=nw=1:nk=1", path],
111
+ text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True)
 
 
112
  return float(res.stdout.strip())
113
  except Exception:
114
  return 5.0
115
 
 
116
  TAG_RE = re.compile(
117
  r'[<[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>"\'\]]+?)["\']?\s*[>\]]',
118
  re.I)
119
+ extract_chart_tags = lambda t: list(dict.fromkeys(m.group("d").strip()
120
+ for m in TAG_RE.finditer(t or "")))
121
 
122
+ re_scene = re.compile(r"^\s*scene\s*\d+[:.- ]*", re.I)
 
 
 
 
 
 
 
123
 
124
  def clean_narration(txt: str) -> str:
 
 
 
125
  txt = re_scene.sub("", txt)
126
  txt = TAG_RE.sub("", txt)
127
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
128
  txt = re.sub(r"\s{2,}", " ", txt).strip()
129
  return txt
130
 
131
+ # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
132
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  def placeholder_img() -> Image.Image:
 
134
  return Image.new("RGB", (WIDTH, HEIGHT), (230, 230, 230))
135
 
136
+ @st.cache_data(show_spinner="Generating image...")
137
+ def generate_image_from_prompt(prompt: str) -> Image.Image:
138
+ # IMPROVED: Using your original model names for consistency with your environment.
139
+ model_main = "gemini-2.0-flash-exp-image-generation"
140
+ model_fallback = "gemini-2.0-flash-preview-image-generation"
141
+ full_prompt = "A clean business-presentation illustration: " + prompt
142
 
143
+ def fetch(model_name):
144
+ try:
145
  res = GEM.models.generate_content(
146
  model=model_name,
147
  contents=full_prompt,
148
+ generation_config=types.GenerateContentConfig(response_modalities=["IMAGE"]),
149
  )
150
  for part in res.candidates[0].content.parts:
151
  if getattr(part, "inline_data", None):
152
  return Image.open(io.BytesIO(part.inline_data.data)).convert("RGB")
153
  return None
 
 
 
 
154
  except Exception:
155
+ # Silently fail to allow fallback
156
+ return None
157
+
158
+ img = fetch(model_main) or fetch(model_fallback)
159
+ return img if img else placeholder_img()
160
+
161
+ # ─── PDF & REPORT GENERATION (REFACTORED) ──────────────────────────────────
 
 
 
 
162
 
 
163
  class PDF(FPDF, HTMLMixin):
164
+ pass
 
 
 
 
 
 
 
 
165
 
166
  def build_pdf(md: str, charts: Dict[str, str]) -> bytes:
167
+ """Builds a PDF from markdown text and a dictionary of chart descriptions to base64 image strings."""
168
+ def replacer(match):
169
+ desc = match.group("d").strip()
170
+ if desc in charts and charts[desc]:
171
+ return f'<img src="data:image/png;base64,{charts[desc]}">'
172
+ return ""
173
+
174
+ html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(TAG_RE.sub(replacer, md))
175
+ pdf = PDF()
176
+ pdf.set_auto_page_break(True, margin=15)
177
+ pdf.add_page()
178
+ pdf.set_font("Arial", "B", 18)
179
+ pdf.cell(0, 12, "AI-Generated Business Report", ln=True)
180
+ pdf.ln(3)
181
+ pdf.set_font("Arial", "", 11)
182
+ pdf.write_html(html)
183
+ return bytes(pdf.output(dest="S"))
184
+
185
def generate_report_text(df: pd.DataFrame, ctx: str) -> Tuple[str, List[str]]:
    """Generate the Markdown body of the report plus the chart descriptions it requests.

    This is the fast first step: only text is produced here; charts are rendered later.
    Returns (markdown, chart_descriptions) with at most MAX_CHARTS descriptions.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)

    # Ship a compact summary of the dataframe to the model rather than every row.
    null_counts = df.isnull().sum().to_dict()
    ctx_dict = {
        "shape": df.shape,
        "columns": list(df.columns),
        "user_ctx": ctx or "General business analysis",
        "data_sample": df.head().to_dict('records'),
        "data_types": {name: str(dtype) for name, dtype in df.dtypes.to_dict().items()},
        "missing_values": {name: int(n) for name, n in null_counts.items() if n > 0},
        "numeric_summary": {} if df.select_dtypes(include=np.number).empty else df.describe().to_dict(),
    }
    col_list = ", ".join(ctx_dict["columns"][:8])
    ctx_json = json.dumps(ctx_dict, indent=2, default=str)
    report_prompt = f"""
    You are a senior data analyst and business intelligence expert. Analyze the provided dataset and write a comprehensive executive-level Markdown report.
    **Dataset Analysis Context:**
    {ctx_json}
    **Instructions:**
    1. **Identify Data Domain**: First, determine what type of data this represents.
    2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
    3. **Data Quality Assessment**: Comment on data completeness and reliability.
    4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain.
    5. **Strategic Recommendations**: Offer concrete, actionable recommendations.
    6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like:
    `<generate_chart: "chart_type | specific description">`
    Valid chart types: bar, pie, line, scatter, hist
    Base every chart on actual columns: {col_list}
    7. **Format Requirements**: Use professional business language and clear headers (## Executive Summary, etc.).
    """
    report_md = llm.invoke(report_prompt).content
    requested_charts = extract_chart_tags(report_md)[:MAX_CHARTS]
    return report_md, requested_charts
218
+
219
def generate_single_chart(description: str, df: pd.DataFrame) -> str:
    """Generate one chart via the pandas-dataframe agent; return it as a base64 PNG string.

    Args:
        description: Chart request text, e.g. "bar | revenue by region".
        df: Source dataframe for the plot.

    Returns:
        Base64-encoded PNG of the chart, or None if both attempts fail.
    """
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.1)
    agent = create_pandas_dataframe_agent(
        llm=llm, df=df, verbose=False, allow_dangerous_code=True,
        agent_type="openai-functions", handle_parsing_errors=True
    )
    chart_prompt = f"""
    Your task is to generate Python code to create a single, static, professional chart using matplotlib based on the provided dataframe `df`.
    The user's request is: '{description}'.

    Follow these rules strictly:
    1. The dataframe is already loaded and available as a variable named `df`.
    2. Generate only the Python code to produce the plot. Do not add any explanation or surrounding text.
    3. Use `plt.figure()` to create a new figure for the plot.
    4. Add a clear title and labels to the axes.
    5. DO NOT use `st.pyplot()` or `plt.show()`. The code will be executed to save the figure.
    6. Ensure the final code block is pure Python.
    """
    for _ in range(2):  # Retry once on failure
        try:
            response = agent.invoke({"input": chart_prompt})
            code_to_execute = response['output'].strip().replace("```python", "").replace("```", "")

            fig, ax = plt.subplots(figsize=(10, 6), dpi=150)
            # SECURITY NOTE: exec() runs LLM-generated code. This is only tolerable
            # because the agent was created with allow_dangerous_code=True; do not
            # expose this path to untrusted multi-tenant input without sandboxing.
            exec_globals = {'df': df, 'pd': pd, 'np': np, 'plt': plt, 'fig': fig, 'ax': ax}
            exec(code_to_execute, exec_globals)

            # BUG FIX: the prompt instructs the agent to call plt.figure(), which
            # draws on a brand-new figure rather than the pre-created `fig`, so the
            # old code often saved a blank canvas. Save whichever figure actually
            # received data. Axes.has_data() is used because get_children() is never
            # empty (spines/ticks always exist), so the old emptiness check was vacuous.
            target = next(
                (f for f in (fig, plt.gcf())
                 if f.axes and any(a.has_data() for a in f.axes)),
                None,
            )
            if target is not None:
                buf = io.BytesIO()
                target.savefig(buf, format="png", dpi=150, bbox_inches="tight", facecolor="white")
                plt.close("all")
                return base64.b64encode(buf.getvalue()).decode()
            plt.close("all")
        except Exception as e:
            st.warning(f"Chart generation attempt failed: {e}")
            plt.close("all")
    return None  # Return None if all attempts fail
259
+
260
+ # ─── ANIMATION HELPERS (YOUR ORIGINAL CODE) ────────────────────────────────
261
+
 
 
 
 
 
 
 
 
262
def animate_image_fade(img_cv2: np.ndarray, dur: float, out: Path, fps: int = FPS) -> str:
    """Write a fade-in video clip of `img_cv2` to `out` and return the output path.

    The clip lasts `dur` seconds (never fewer frames than one second's worth).
    """
    total = max(int(dur * fps), fps)
    writer = cv2.VideoWriter(str(out), cv2.VideoWriter_fourcc(*"mp4v"), fps, (WIDTH, HEIGHT))
    white = np.full_like(img_cv2, 255)
    for idx in range(total):
        alpha = idx / (total - 1)  # ramps 0 -> 1 across the clip
        frame = cv2.addWeighted(white, 1 - alpha, img_cv2, alpha, 0)
        writer.write(frame)
    writer.release()
    return str(out)
271
+
272
def animate_chart(desc: str, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
    """Render an animated chart whose clip length equals the audio length `dur`."""
    # `desc` has the form "chart_type | title"; default to a bar chart if type is blank.
    ctype, *rest = [s.strip().lower() for s in desc.split("|", 1)]
    ctype = ctype or "bar"
    title = rest[0] if rest else desc

    # Select and shape the data for the requested chart type; raise early with a
    # clear message when the dataframe lacks the required column kinds.
    if ctype == "pie":
        cat_cols = df.select_dtypes(exclude="number").columns
        num_cols = df.select_dtypes(include="number").columns
        if not cat_cols.any() or not num_cols.any(): raise ValueError("Pie chart requires one categorical and one numeric column.")
        cat, num = cat_cols[0], num_cols[0]
        # Top 8 categories by summed value, largest first.
        plot_df = df.groupby(cat)[num].sum().sort_values(ascending=False).head(8)
    elif ctype in ("bar", "hist"):
        num_cols = df.select_dtypes(include="number").columns
        if not num_cols.any(): raise ValueError(f"{ctype} chart requires a numeric column.")
        num = num_cols[0]
        plot_df = df[num]
    else:  # line / scatter
        num_cols = df.select_dtypes(include="number").columns
        if len(num_cols) < 2: raise ValueError("Line/scatter chart requires at least two numeric columns.")
        plot_df = df[list(num_cols[:2])].sort_index()

    # Frame count tracks the narration duration, with a floor of 10 frames.
    frames = max(10, int(dur * fps))
    fig, ax = plt.subplots(figsize=(WIDTH / 100, HEIGHT / 100), dpi=100)

    # Each branch defines init()/update() closures for FuncAnimation below.
    artists = []
    if ctype == "pie":
        wedges, _ = ax.pie(np.zeros_like(plot_df.values), labels=plot_df.index, startangle=90)
        ax.set_title(title); artists.extend(wedges)
        def init(): [w.set_alpha(0) for w in wedges]; return artists
        def update(i):
            # NOTE(review): this re-calls ax.pie() every frame without clearing the
            # axes, which stacks new wedge artists on top of old ones — presumably
            # visually acceptable since later wedges cover earlier ones, but worth
            # confirming; it also shadows the outer `wedges`.
            a = i / (frames - 1)
            wedges, _ = ax.pie(plot_df.values * a, labels=plot_df.index, startangle=90)
            for w in wedges: w.set_alpha(a)
            return wedges
    elif ctype == "bar":
        bars = ax.bar(plot_df.index, np.zeros_like(plot_df.values), color="#1f77b4")
        ax.set_ylim(0, plot_df.max() * 1.1); ax.set_title(title); artists.extend(bars)
        def init(): return artists
        def update(i):
            # Grow every bar toward its final height over the clip.
            a = i / (frames - 1)
            for b, h in zip(bars, plot_df.values): b.set_height(h * a)
            return artists
    elif ctype == "hist":
        _, _, patches = ax.hist(plot_df, bins=20, color="#1f77b4", alpha=0)
        ax.set_title(title); artists.extend(patches)
        def init(): [p.set_alpha(0) for p in patches]; return artists
        def update(i):
            # Fade the full histogram in rather than animating bin heights.
            a = i / (frames - 1)
            for p in patches: p.set_alpha(a)
            return artists
    elif ctype == "scatter":
        pts = ax.scatter(plot_df.iloc[:, 0], plot_df.iloc[:, 1], s=10, alpha=0)
        ax.set_title(title); ax.grid(alpha=.3); artists.append(pts)
        def init(): pts.set_alpha(0); return artists
        def update(i): pts.set_alpha(i / (frames - 1)); return artists
    else:  # line
        line, = ax.plot([], [], lw=2)
        x_full = plot_df.iloc[:, 0]
        y_full = plot_df.iloc[:, 1]
        ax.set_xlim(x_full.min(), x_full.max()); ax.set_ylim(y_full.min(), y_full.max())
        ax.set_title(title); ax.grid(alpha=.3); artists.append(line)
        def init(): line.set_data([], []); return artists
        def update(i):
            # Reveal the line progressively: draw the first k points each frame.
            k = max(2, int(len(x_full) * i / (frames - 1)))
            line.set_data(x_full[:k], y_full.iloc[:k])
            return artists

    # blit=True redraws only the returned artists per frame for speed.
    anim = FuncAnimation(fig, update, init_func=init, frames=frames, blit=True, interval=1000 / fps)
    anim.save(str(out), writer=FFMpegWriter(fps=fps, metadata={'artist': 'Sozo'}), dpi=144)
    plt.close(fig)
    return str(out)
344
+
345
def safe_chart(desc, df, dur, out):
    """Render an animated chart; on any failure, fall back to a static image faded in.

    Args mirror animate_chart (description, dataframe, duration in seconds, output
    path). Returns the path of the clip that was written.
    """
    try:
        return animate_chart(desc, df, dur, out)
    except Exception as e:
        st.warning(f"Animated chart failed ('{desc}'): {e}. Using static fallback.")
        with plt.ioff():
            fig, ax = plt.subplots()
            try:
                # Attempt a simple plot of the numeric columns
                df.select_dtypes(include=np.number).plot(ax=ax)
                ax.set_title(desc)
            # FIX: was a bare `except:` — must not swallow SystemExit/KeyboardInterrupt.
            except Exception:
                # If that fails, just show a text error on the image
                ax.text(0.5, 0.5, 'Could not render chart', ha='center', va='center')

            p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
            try:
                fig.savefig(p, bbox_inches="tight")
                plt.close(fig)
                img = cv2.resize(cv2.imread(str(p)), (WIDTH, HEIGHT))
                return animate_image_fade(img, dur, out)
            finally:
                # FIX: the intermediate PNG was previously never deleted (temp leak).
                p.unlink(missing_ok=True)
364
+
365
def concat_media(paths: List[str], out: Path, kind="video"):
    """Concatenate media files into `out` with ffmpeg's concat demuxer (stream copy).

    Paths that do not exist or are empty are skipped; if nothing remains to join,
    the function returns without invoking ffmpeg.
    """
    if not paths:
        return

    # Build the demuxer manifest, keeping only real, non-empty inputs.
    entries = [
        f"file '{Path(p).resolve().as_posix()}'\n"
        for p in paths
        if Path(p).exists() and Path(p).stat().st_size > 0
    ]
    manifest = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.txt"
    manifest.write_text("".join(entries), encoding="utf-8")

    if manifest.stat().st_size == 0:
        manifest.unlink()
        return

    subprocess.run(
        ["ffmpeg", "-y", "-f", "concat", "-safe", "0", "-i", str(manifest), "-c", "copy", str(out)],
        check=True, capture_output=True,
    )
    manifest.unlink(missing_ok=True)
379
+
380
+ # ─── VIDEO GENERATION (YOUR ORIGINAL CODE) ─────────────────────────────────
381
+
382
def build_story_prompt(ctx_dict):
    """Compose the LLM prompt that scripts the multi-scene business video narrative."""
    column_list = ", ".join(ctx_dict["columns"][:6])
    context_json = json.dumps(ctx_dict, indent=2, default=str)
    return f"""
    You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.
    **Complete Dataset Context:**
    {context_json}
    **Task Requirements:**
    1. **Identify the Data Story**: Determine what business domain this data represents and what story it tells
    2. **Create {VIDEO_SCENES} distinct scenes** that build a logical narrative arc
    3. **Each scene must contain:**
    - 1-2 sentences of clear, professional narration (plain English, no jargon)
    - Exactly one chart tag: `<generate_chart: "chart_type | specific description">`
    **Chart Guidelines:**
    - Valid types: bar, pie, line, scatter, hist
    - Base all charts on actual columns: {column_list}
    **Narrative Structure:**
    - Scene 1: Set the context and introduce the main story
    - Middle scenes: Develop key insights and supporting evidence
    - Final scene: Conclude with actionable takeaways or future outlook
    **Output Format:**
    Separate each scene with exactly [SCENE_BREAK]
    """
404
+
405
def generate_video(buf: bytes, name: str, ctx: str, key: str):
    """Build a narrated multi-scene video from an uploaded dataset.

    Pipeline: verify ffmpeg -> load dataframe -> LLM writes a scene script ->
    per scene, synthesize narration audio and render a chart or illustration
    clip of matching length -> concatenate clips and audio -> mux into one mp4.

    Args:
        buf: Raw bytes of the uploaded CSV/Excel file.
        name: Original filename (used to pick the parser).
        ctx: Optional user-supplied business context for the script.
        key: Stable identifier used to name the final output file.

    Returns:
        Path string of the final video, or None on any failure.
    """
    # Fail fast if ffmpeg is not on PATH — everything downstream needs it.
    try:
        subprocess.run(["ffmpeg", "-version"], check=True, capture_output=True)
    except Exception:
        st.error("🔴 FFmpeg not available — cannot render video."); return None

    df, err = load_dataframe_safely(buf, name)
    if err:
        st.error(err); return None

    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=API_KEY, temperature=0.2)
    # Compact dataframe summary handed to the story-script prompt.
    ctx_dict = {
        "shape": df.shape, "columns": list(df.columns), "user_ctx": ctx or "General business analysis",
        "data_sample": df.head().to_dict('records'),
        "numeric_summary": df.describe().to_dict() if not df.select_dtypes(include=np.number).empty else {}
    }
    script = llm.invoke(build_story_prompt(ctx_dict)).content
    # The prompt asks the model to separate scenes with the literal [SCENE_BREAK].
    scenes = [s.strip() for s in script.split("[SCENE_BREAK]") if s.strip()]

    video_parts, audio_parts, temps = [], [], []
    for idx, sc in enumerate(scenes[:VIDEO_SCENES]):
        st.progress((idx + 1) / VIDEO_SCENES, text=f"Rendering Scene {idx + 1}/{VIDEO_SCENES}")
        descs = extract_chart_tags(sc)
        narrative = clean_narration(sc)

        # Narration first: the audio length determines the scene's clip length.
        audio_bytes, _ = deepgram_tts(narrative)
        mp3 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
        if audio_bytes:
            mp3.write_bytes(audio_bytes)
            dur = audio_duration(str(mp3))
        else:
            # TTS failed: pad the scene with 5 s of silence so A/V stay in sync.
            dur = 5.0
            generate_silence_mp3(dur, mp3)
        audio_parts.append(str(mp3)); temps.append(mp3)

        mp4 = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
        if descs:
            # Scene contains a chart tag: render the (first) requested chart.
            safe_chart(descs[0], df, dur, mp4)
        else:
            # No chart requested: generate an illustration and fade it in.
            img = generate_image_from_prompt(narrative)
            img_cv = cv2.cvtColor(np.array(img.resize((WIDTH, HEIGHT))), cv2.COLOR_RGB2BGR)
            animate_image_fade(img_cv, dur, mp4)
        video_parts.append(str(mp4)); temps.append(mp4)

    # Join all scene clips (video) and narration segments (audio) separately.
    silent_vid = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp4"
    concat_media(video_parts, silent_vid, "video")
    audio_mix = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.mp3"
    concat_media(audio_parts, audio_mix, "audio")

    final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
    if silent_vid.exists() and silent_vid.stat().st_size > 0 and audio_mix.exists() and audio_mix.stat().st_size > 0:
        # Mux: copy video stream, encode audio to AAC, stop at the shorter track.
        subprocess.run(
            ["ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix),
             "-c:v", "copy", "-c:a", "aac", "-shortest", str(final_vid)],
            check=True, capture_output=True)
    else:
        # NOTE(review): this error path returns without deleting `temps`, so the
        # intermediate scene files are leaked on failure — consider cleaning here too.
        st.error("Failed to generate video or audio components.")
        return None

    for p in temps + [silent_vid, audio_mix]:
        p.unlink(missing_ok=True)
    return str(final_vid)
467
+
468
+ # ─── UI & WORKFLOW (RESTRUCTURED FOR RESPONSIVENESS) ───────────────────────
469
+
470
# Top-level UI: output-format selector and file upload, plus session-state reset
# when a new file arrives (detected by SHA-1 of the uploaded bytes).
mode = st.radio("Select Output Format:", ["Report (PDF)", "Video Narrative"], horizontal=True)
upl = st.file_uploader("Upload CSV or Excel", type=["csv", "xlsx", "xls"])

# Reset state if a new file is uploaded
# NOTE(review): assumes st.session_state.current_file_key was initialized earlier
# in the file (not visible in this chunk) — confirm, otherwise first access raises.
if upl and sha1_bytes(upl.getvalue()) != st.session_state.current_file_key:
    # A different file: discard all artifacts derived from the previous one.
    st.session_state.report_md = None
    st.session_state.chart_descs = []
    st.session_state.generated_charts = {}
    st.session_state.pdf_bytes = None
    st.session_state.bundle = None
    st.session_state.current_file_key = sha1_bytes(upl.getvalue())
    df, err = load_dataframe_safely(upl.getvalue(), upl.name)
    if err:
        st.error(f"Error loading data: {err}")
        st.session_state.df = None  # Ensure df is cleared on error
    else:
        st.session_state.df = df
487
+
488
# Top-level UI: once a dataframe is loaded, show a preview, collect optional
# context, and expose the generate button for the selected output format.
if st.session_state.get("df") is not None:
    with st.expander("📊 Data Preview"):
        st.dataframe(arrow_df(st.session_state.df.head()))
    ctx = st.text_area("Business context or specific instructions (optional)")

    if mode == "Report (PDF)":
        # Disabled once a report exists, so re-clicks don't regenerate the text.
        if st.button("🚀 Generate Report", type="primary", disabled=(st.session_state.report_md is not None)):
            with st.spinner("Analyzing data and drafting report..."):
                md, descs = generate_report_text(st.session_state.df, ctx)
                st.session_state.report_md = md
                st.session_state.chart_descs = descs
            # Rerun so the display section below picks up the new report.
            st.rerun()
    else:
        if st.button("🎬 Generate Video", type="primary"):
            st.warning("Video generation is a long process and will lock the UI.")
            with st.spinner("Generating video... This may take several minutes."):
                # NOTE(review): reads upl here — assumes the uploader widget still
                # holds the file on this rerun; confirm upl cannot be None while
                # session_state.df is set.
                key = st.session_state.current_file_key
                path = generate_video(upl.getvalue(), upl.name, ctx, key)
                if path:
                    st.session_state.bundle = {"type": "video", "video_path": path, "key": key}
            st.rerun()
509
+
510
+ # ─── OUTPUT DISPLAY ────────────────────────────────────────────────────────
511
+
512
# Top-level UI: render whichever artifact exists — the markdown report (with
# on-demand chart generation, PDF export, and optional narration) or the video.
if st.session_state.get("report_md"):
    st.subheader("📄 Generated Report")

    # Build the preview: substitute each generated chart into its tag, one at a
    # time (count=1 replaces only the first tag per pass).
    preview_md = st.session_state.report_md
    for desc, b64_data in st.session_state.generated_charts.items():
        if b64_data:
            img_tag = f'<img src="data:image/png;base64,{b64_data}" width="600">'
            preview_md = TAG_RE.sub(lambda m: img_tag if m.group("d").strip() == desc else m.group(0), preview_md, count=1)

    # Any tags still present have no chart yet; show a placeholder instead.
    preview_md = TAG_RE.sub("[Chart will be generated here]", preview_md)

    with st.expander("View Report", expanded=True):
        st.markdown(preview_md, unsafe_allow_html=True)

    # Charts the report requested but which have not been generated yet.
    pending_charts = [d for d in st.session_state.chart_descs if d not in st.session_state.generated_charts]
    if pending_charts:
        if st.button("📊 Generate Visualizations", use_container_width=True, type="primary"):
            for desc in pending_charts:
                with st.spinner(f"Generating chart: {desc}"):
                    b64_image = generate_single_chart(desc, st.session_state.df)
                    # Stored even when None, so a failed chart is not retried forever.
                    st.session_state.generated_charts[desc] = b64_image
            st.rerun()

    # PDF/narration become available only after every requested chart was attempted.
    all_charts_processed = st.session_state.chart_descs and len(st.session_state.generated_charts) == len(st.session_state.chart_descs)
    if all_charts_processed:
        c1, c2 = st.columns(2)
        with c1:
            # Cache the PDF bytes in session state so reruns don't rebuild it.
            if st.session_state.pdf_bytes is None:
                with st.spinner("Building PDF..."):
                    st.session_state.pdf_bytes = build_pdf(st.session_state.report_md, st.session_state.generated_charts)
            st.download_button("Download PDF", st.session_state.pdf_bytes, "business_report.pdf", "application/pdf", use_container_width=True)
        with c2:
            # Narration is offered only when a Deepgram key is configured.
            if DG_KEY and st.button("🔊 Narrate Summary", use_container_width=True):
                txt = clean_narration(st.session_state.report_md)
                audio, mime = deepgram_tts(txt)
                st.audio(audio, format=mime) if audio else st.error("Narration failed.")

elif bundle := st.session_state.get("bundle"):
    if bundle["type"] == "video":
        st.subheader("🎬 Generated Video Narrative")
        vp = bundle["video_path"]
        if Path(vp).exists():
            with open(vp, "rb") as f:
                st.video(f.read())
            with open(vp, "rb") as f:
                st.download_button("Download Video", f, f"sozo_narrative_{bundle['key'][:8]}.mp4", "video/mp4")
        else:
            st.error("Video file missing – generation may have failed.")