rairo committed on
Commit
db955c4
·
verified ·
1 Parent(s): e9a17b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +261 -372
app.py CHANGED
@@ -1,7 +1,5 @@
1
  ###############################################################################
2
  # Sozo Business Studio · AI transforms business data into compelling narratives
3
- # FULL, DROP-IN SCRIPT – capped at 5 charts & 5 slides, Arrow-safe previews,
4
- # DeepGram TTS, robust chart tags, fixed slide navigation / progress
5
  ###############################################################################
6
  import os, re, json, hashlib, uuid, base64, io, tempfile, wave, requests
7
  from pathlib import Path
@@ -20,434 +18,325 @@ from pptx.util import Inches, Pt
20
  from langchain_experimental.agents import create_pandas_dataframe_agent
21
  from langchain_google_genai import ChatGoogleGenerativeAI
22
  from google import genai
23
- from google.genai import types
24
 
25
  # ─────────────────────────────────────────────────────────────────────────────
26
- # PAGE CONFIG
27
- # ─────────────────────────────────────────────────────────────────────────────
28
- st.set_page_config(page_title="Sozo Business Studio", layout="wide")
29
- st.title("📊 Sozo Business Studio")
30
- st.caption("AI transforms business data into compelling narratives.")
31
-
32
- # ─────────────────────────────────────────────────────────────────────────────
33
- # CONSTANTS & FONT SETUP
34
  # ─────────────────────────────────────────────────────────────────────────────
35
  FONT_DIR = Path(__file__).parent if "__file__" in globals() else Path(".")
36
  FONT_REG = FONT_DIR / "NotoSans-Regular.ttf"
37
  FONT_BLD = FONT_DIR / "NotoSans-Bold.ttf"
38
  FONT_FAM = "NotoSans"
39
 
40
- SLIDES = 5 # 🔒 maximum number of slides
41
- MAX_CHARTS = 5 # 🔒 maximum number of charts
42
 
43
- API_KEY = os.getenv("GEMINI_API_KEY") # Gemini (LLM)
44
  if not API_KEY:
45
- st.error("⚠️ GEMINI_API_KEY environment variable is not set."); st.stop()
46
- try:
47
- GEM = genai.Client(api_key=API_KEY)
48
- except Exception as e:
49
- st.error(f"❌ Failed to initialise Google GenAI Client: {e}"); st.stop()
50
 
51
- DG_KEY = os.getenv("DEEPGRAM_API_KEY") # DeepGram (TTS)
52
 
53
  # ─────────────────────────────────────────────────────────────────────────────
54
- # SESSION STATE INITIALISATION
55
  # ─────────────────────────────────────────────────────────────────────────────
56
- for k, v in {"bundles": {}, "slide_idx": 0,
57
- "active_bundle_key": None, "upload_errors": []}.items():
58
- st.session_state.setdefault(k, v)
 
 
 
 
59
 
60
  # ─────────────────────────────────────────────────────────────────────────────
61
  # HELPERS
62
  # ─────────────────────────────────────────────────────────────────────────────
63
  sha1_bytes = lambda b: hashlib.sha1(b).hexdigest()
64
 
65
- def validate_file_upload(uploaded_file):
66
- errs = []
67
- if uploaded_file is None:
68
- errs.append("No file uploaded"); return errs
69
- if uploaded_file.size > 50 * 1024 * 1024:
70
- errs.append("File size exceeds 50 MB limit")
71
- if Path(uploaded_file.name).suffix.lower() not in (".csv", ".xlsx", ".xls"):
72
- errs.append("Unsupported file type – upload CSV or Excel")
73
- if uploaded_file.size == 0:
74
- errs.append("File is empty")
75
- return errs
76
-
77
- def load_dataframe_safely(file_bytes: bytes, filename: str):
78
  try:
79
- ext = Path(filename).suffix.lower()
80
- if ext == ".csv":
81
- for enc in ("utf-8", "latin-1", "cp1252"):
82
- try:
83
- df = pd.read_csv(io.BytesIO(file_bytes), encoding=enc); break
84
  except UnicodeDecodeError: continue
85
  else:
86
- df = pd.read_csv(io.BytesIO(file_bytes), encoding="utf-8", errors="replace")
87
- elif ext in (".xlsx", ".xls"):
88
- df = pd.read_excel(io.BytesIO(file_bytes))
89
  else:
90
- raise ValueError(f"Unsupported file format: {ext}")
91
- if df.empty or len(df.columns) == 0:
92
- raise ValueError("File contains no usable data")
93
- df.columns = df.columns.astype(str).str.strip()
94
- df = df.dropna(how="all")
95
- if df.empty: raise ValueError("All rows are empty")
96
- return df, None
97
  except Exception as exc:
98
- return None, str(exc)
99
-
100
- def fix_bullet(text: str) -> str:
101
- if not isinstance(text, str): return ""
102
- repl = {"\x95":"•","\x96":"-","\x97":"—","\x91":"'","\x92":"'","\x93":'"',"\x94":'"'}
103
- for b,g in repl.items(): text = text.replace(b,g)
104
- return re.sub(r"[\x80-\x9f]", "", text)
105
-
106
- # ─── Arrow-compatibility helpers ────────────────────────────────────────────
107
- def make_arrow_compatible(df: pd.DataFrame) -> pd.DataFrame:
108
- safe = df.copy()
109
- for col in safe.columns:
110
- dt = safe[col].dtype.name
111
- if dt in ("Int64","Float64","Boolean"):
112
- safe[col] = safe[col].astype(dt.lower(), copy=False)
113
- if safe[col].apply(lambda x: isinstance(x, (np.dtype,))).any():
114
- safe[col] = safe[col].astype(str)
115
  return safe
116
 
117
- def arrow_safe_info(df: pd.DataFrame) -> pd.DataFrame:
118
- info = pd.DataFrame({
119
- "Column": df.columns,
120
- "Type": [str(t) for t in df.dtypes],
121
- "Non-Null": df.notna().sum().values,
122
- "Null": df.isna().sum().values,
123
- })
124
- return make_arrow_compatible(info)
125
-
126
- # ─── PCM→WAV helper ─────────────────────────────────────────────────────────
127
- def convert_pcm_to_wav(pcm, sr=24000, channels=1, width=2):
128
- buf = io.BytesIO()
129
- with wave.open(buf,'wb') as wf:
130
- wf.setnchannels(channels); wf.setsampwidth(width); wf.setframerate(sr)
131
- wf.writeframes(pcm)
132
- buf.seek(0); return buf.getvalue()
133
-
134
- # ─── DeepGram TTS ───────────────────────────────────────────────────────────
135
  @st.cache_data(show_spinner=False)
136
- def generate_tts_audio(text: str):
137
- if not DG_KEY: return None, None
138
- text = re.sub(r"[^\w\s.,!?;:-]", "", text)[:500]
139
- if not text: return None, None
140
  try:
141
- r = requests.post(
142
- "https://api.deepgram.com/v1/speak",
143
  params={"model":"aura-asteria-en"},
144
- headers={"Authorization": f"Token {DG_KEY}",
145
  "Content-Type":"application/json"},
146
- json={"text": text},
147
- timeout=30,
148
- )
149
  r.raise_for_status()
150
- return r.content, r.headers.get("Content-Type","audio/mpeg")
151
- except Exception:
152
- return None, None
153
-
154
- # ─── Chart-tag regex (one source of truth) ──────────────────────────────────
155
- TAG_RE = re.compile(r"""
156
- [<\[]\s*generate_?chart\s*[:=]?\s*["']?
157
- (?P<desc>[^>\]'"’”]+?)["']?\s*[>\]]
158
- """, re.VERBOSE | re.IGNORECASE)
159
-
160
- extract_chart_tags = lambda txt: list(dict.fromkeys(
161
- m.group("desc").strip() for m in TAG_RE.finditer(txt or "")
162
- ))
163
-
164
- def replace_chart_tags(txt, chart_map, repl):
165
- if not isinstance(txt, str): return ""
166
- def _s(m):
167
- d = m.group("desc").strip()
168
- return repl(chart_map[d]) if d in chart_map else m.group(0)
169
- return TAG_RE.sub(_s, txt)
170
-
171
- # ─── PDF & PPTX builders ────────────────────────────────────────────────────
172
- class PDF(FPDF, HTMLMixin): pass
173
-
174
- def build_pdf(md_src, chart_map):
175
- try:
176
- md_src = fix_bullet(md_src).replace("","*")
177
- md_src = replace_chart_tags(md_src, chart_map,
178
- lambda p: f'<img src="{p}">')
179
- html = MarkdownIt("commonmark", {"breaks":True}).enable("table").render(md_src)
180
- pdf = PDF(); pdf.set_auto_page_break(True, margin=15)
181
- added=False
182
- for style,ttf in [("",FONT_REG),("B",FONT_BLD)]:
183
- if ttf.exists():
184
- try: pdf.add_font(FONT_FAM,style,str(ttf),uni=True); added=True
 
 
 
 
 
 
 
 
 
185
  except: pass
186
- if added: pdf.set_fallback_fonts([FONT_FAM])
187
- pdf.add_page()
188
- pdf.set_font(FONT_FAM if added else "Arial","B",18)
189
- pdf.cell(0,12,"AI-Generated Business Report",ln=True); pdf.ln(3)
190
- pdf.set_font(FONT_FAM if added else "Arial","",11)
191
- pdf.write_html(html)
192
- return bytes(pdf.output(dest="S"))
193
- except Exception as e:
194
- st.error(f"PDF generation failed: {e}"); return b""
195
-
196
- def build_pptx(slides, chart_map):
197
- try:
198
- prs, layout = Presentation(), Presentation().slide_layouts[1]
199
- for raw in slides:
200
- rc = fix_bullet(raw)
201
- tags = extract_chart_tags(rc)
202
- lines = [ln.strip(" •-") for ln in rc.splitlines() if ln.strip()]
203
- title = lines[0] if lines else "Slide"
204
- bullets = [ln for ln in lines[1:] if not TAG_RE.search(ln)]
205
- slide = prs.slides.add_slide(layout)
206
- slide.shapes.title.text = title
207
- tf = slide.shapes.placeholders[1].text_frame; tf.clear(); tf.word_wrap=True
208
- for ln in bullets:
209
- p = tf.add_paragraph(); p.text = ln; p.font.size = Pt(20)
210
- for t in tags:
211
- if t in chart_map:
212
- try:
213
- slide.shapes.add_picture(chart_map[t],
214
- Inches(1), Inches(3.5),
215
- width=Inches(8))
216
- except: pass
217
- break
218
- bio = io.BytesIO(); prs.save(bio); return bio.getvalue()
219
- except Exception as e:
220
- st.error(f"PPTX generation failed: {e}"); return b""
221
-
222
- # ─── ASSET GENERATOR ────────────────────────────────────────────────────────
223
  @st.cache_data(show_spinner=False)
224
- def generate_assets(_key, file_bytes, filename, mode, ctx):
225
- df, err = load_dataframe_safely(file_bytes, filename)
226
  if err: st.error(err); return None
227
- try:
228
- llm = ChatGoogleGenerativeAI(
229
- model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.1)
230
- except Exception as e:
231
- st.error(f"LLM init failed: {e}"); return None
232
- data_ctx = {
233
- "shape": df.shape,
234
- "columns": list(df.columns),
235
- "dtypes": df.dtypes.astype(str).to_dict(),
236
- "sample": df.head(3).fillna("N/A").to_dict(),
237
- "numeric_columns": df.select_dtypes("number").columns.tolist(),
238
- "user_ctx": ctx or "General business analysis",
239
- }
240
- outputs = {}
241
  if mode in ("Report","Both"):
242
- try:
243
- outputs["ReportAgent"] = llm.invoke(f"""
244
- You are a senior business analyst. Write an executive-level Markdown report
245
- with descriptive statistics, insights, and recommendations.
246
 
247
- Data context: {json.dumps(data_ctx, indent=2)}
248
-
249
- Insert chart placeholders exactly like:
250
- <generate_chart: "description of chart">
251
  """).content
252
- except Exception as e: st.error(f"Report gen failed: {e}")
253
  if mode in ("Presentation","Both"):
254
- try:
255
- outputs["PresentationAgent"] = llm.invoke(f"""
256
- Create exactly {SLIDES} concise slide scripts.
257
-
258
- Data context: {json.dumps(data_ctx, indent=2)}
259
-
260
  Each slide:
261
  Slide X - Title
262
  • bullet 1 (≤15 words)
263
  • bullet 2
264
  <generate_chart: "description">
265
  """).content
266
- except Exception as e: st.error(f"Presentation gen failed: {e}")
267
  if not outputs: return None
268
 
269
- chart_descs = extract_chart_tags("\n".join(outputs.values()))[:MAX_CHARTS]
270
- chart_paths = {}
271
  if chart_descs:
272
- try:
273
- agent = create_pandas_dataframe_agent(
274
- llm=llm, df=df, verbose=False, allow_dangerous_code=True)
275
- for d in chart_descs:
276
- with plt.ioff():
277
- try:
278
- agent.run(f"Create a {d} with matplotlib and save.")
279
- fig = plt.gcf()
280
- if fig.axes:
281
- p = Path(tempfile.gettempdir())/f"{uuid.uuid4()}.png"
282
- fig.savefig(p, dpi=300, bbox_inches="tight",
283
- facecolor="white")
284
- chart_paths[d]=str(p)
285
- plt.close("all")
286
- except: plt.close("all")
287
- except Exception as e: st.warning(f"Chart agent failed: {e}")
288
-
289
- pdf = pptx = preview = None; slides=[]
290
- try:
291
- if "ReportAgent" in outputs:
292
- md = fix_bullet(outputs["ReportAgent"])
293
- pdf = build_pdf(md, chart_paths)
294
- preview = replace_chart_tags(
295
- md, chart_paths,
296
- lambda p: f'<img src="data:image/png;base64,{base64.b64encode(Path(p).read_bytes()).decode()}" style="max-width:100%;">')
297
- if "PresentationAgent" in outputs:
298
- rs = fix_bullet(outputs["PresentationAgent"])
299
- parts = re.split(r"(?im)^\s*slide\s+\d+\s*-?\s*", rs)[1:]
300
- slides = [p.strip() for p in parts if p.strip()][:SLIDES]
301
- if slides: pptx = build_pptx(tuple(slides), chart_paths)
302
- except Exception as e: st.error(f"Assembly failed: {e}")
303
-
304
- return {
305
- "preview_md": preview, "pdf": pdf, "slides": slides, "pptx": pptx,
306
- "key": _key, "chart_count": len(chart_paths),
307
- "chart_descriptions": list(chart_paths.keys())
308
- }
309
 
310
  # ─────────────────────────────────────────────────────────────────────────────
311
  # UI
312
  # ─────────────────────────────────────────────────────────────────────────────
313
- mode = st.radio("Choose output format:",
314
- ["Report","Presentation","Both"], horizontal=True, index=2)
315
-
316
- st.subheader("📁 Upload Your Business Data")
317
- upl = st.file_uploader("Choose a CSV or Excel file",
318
- type=["csv","xlsx","xls"],
319
- help="Max 50 MB")
320
 
321
  if upl:
322
- errs = validate_file_upload(upl)
323
- if errs:
324
- for e in errs: st.error(f"❌ {e}"); st.stop()
325
- st.success(f" File '{upl.name}' uploaded ({upl.size:,} bytes)")
326
- try:
327
- df_prev, prev_err = load_dataframe_safely(upl.getvalue(), upl.name)
328
- if prev_err: st.error(prev_err); st.stop()
329
- with st.expander("📊 Data Preview", expanded=False):
330
- st.write(f"**Shape:** {df_prev.shape[0]} × {df_prev.shape[1]}")
331
- st.dataframe(make_arrow_compatible(df_prev.head()))
332
- st.write("**Column Information:**")
333
- st.dataframe(arrow_safe_info(df_prev))
334
- except Exception as e:
335
- st.error(f"Preview error: {e}"); st.stop()
336
-
337
- ctx = st.text_area(
338
- "Business context (optional)",
339
- placeholder="e.g., Q4 2024 regional sales …",
340
- help="Provide context for more relevant insights"
341
- )
342
-
343
- if not st.button("🚀 Generate Narrative", type="primary"): st.stop()
344
- if not upl: st.warning("Upload a file."); st.stop()
345
-
346
- bundle_key = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))
347
- if bundle_key in st.session_state.bundles:
348
- bundle = st.session_state.bundles[bundle_key]
349
- st.info("🔄 Using cached results.")
350
- else:
351
- with st.spinner("🤖 Analyzing data & generating assets …"):
352
- bundle = generate_assets(bundle_key, upl.getvalue(), upl.name, mode, ctx)
353
- if bundle: st.session_state.bundles[bundle_key] = bundle
354
- if not bundle: st.error("Generation failed."); st.stop()
355
-
356
- if bundle["chart_count"]:
357
- st.success(f"✅ Generated {bundle['chart_count']} chart(s).")
358
-
359
- if st.session_state.active_bundle_key != bundle["key"]:
360
- st.session_state.slide_idx = 0
361
- st.session_state.active_bundle_key = bundle["key"]
362
-
363
- # ─── Tabs
364
- if mode == "Both":
365
- tab_rep, tab_pre = st.tabs(["📄 Report","📑 Slides"])
366
- elif mode == "Report":
367
- tab_rep, tab_pre = st.container(), None
368
  else:
369
- tab_rep, tab_pre = None, st.container()
370
-
371
- # ─── Report tab
372
- if tab_rep:
373
- with tab_rep:
374
- st.subheader("📄 Generated Business Report")
375
- if bundle["preview_md"]:
376
- st.markdown(bundle["preview_md"], unsafe_allow_html=True)
377
- col1,col2 = st.columns(2)
378
- with col1:
379
- if bundle["pdf"]:
380
- st.download_button("⬇️ Download PDF", bundle["pdf"],
381
- "business_report.pdf","application/pdf")
382
- with col2:
383
- st.metric("Charts", bundle["chart_count"])
384
- else:
385
- st.warning("No report content generated.")
386
-
387
- # ─── Slides tab
388
- if tab_pre:
389
- with tab_pre:
390
- slides = bundle.get("slides", [])
391
  if not slides:
392
- st.warning("No slides generated.")
393
  else:
394
- st.session_state.slide_idx = max(
395
- 0, min(st.session_state.slide_idx, len(slides)-1))
396
- idx = st.session_state.slide_idx
397
- total = len(slides)
398
  st.progress((idx+1)/total)
399
- col1,col2,col3,col4,col5 = st.columns([1,1,2,1,1])
400
- with col1:
401
- if st.button("⬅️ Prev", disabled=idx==0):
402
- st.session_state.slide_idx -= 1; st.rerun()
403
- with col2: st.write(f"**{idx+1} / {total}**")
404
- with col3:
405
- sel = st.selectbox("Jump", range(total), idx,
406
- format_func=lambda x:f"Slide {x+1}")
407
- if sel != idx:
408
- st.session_state.slide_idx = sel; st.rerun()
409
- with col4:
410
- if st.button("Next ➡️", disabled=idx>=total-1):
411
- st.session_state.slide_idx += 1; st.rerun()
412
- with col5:
413
- if st.button("🔊 Narrate"):
414
- txt = replace_chart_tags(slides[idx], {}, lambda _:"")
415
- txt = re.sub(r'^Slide \d+\s*[-:]?\s*','',txt, flags=re.I)
416
- narr = '. '.join(l.strip('•*- ') for l in txt.split('\n') if l.strip())
417
- with st.spinner("Generating narration…"):
418
- audio, mime = generate_tts_audio(narr)
419
- if audio:
420
- if "pcm" in mime.lower() or "l16" in mime.lower():
421
- audio = convert_pcm_to_wav(audio); mime="audio/wav"
422
- st.audio(audio, format=mime)
423
- else: st.error("Narration failed.")
424
  st.divider()
425
- title_line,*body = slides[idx].split('\n')
426
- title = re.sub(r'^Slide \d+\s*[-:]?\s*','',title_line, flags=re.I).strip() or f"Slide {idx+1}"
427
- st.markdown(f"### {title}")
428
- for ln in body:
429
- ln = ln.strip()
430
- if not ln or TAG_RE.search(ln): continue
431
- if ln[:1] in "•-*": st.markdown(ln)
432
- else: st.markdown(f"• {ln}")
 
 
 
 
 
 
 
 
433
  st.divider()
434
  if bundle["pptx"]:
435
- st.download_button("⬇️ Download PPTX", bundle["pptx"],
436
- "business_presentation.pptx",
437
- "application/vnd.openxmlformats-officedocument.presentationml.presentation")
438
- st.metric("Slides", total)
439
-
440
- # ─── Footer & cache clear
441
- with st.expander("💡 Tips for Better Results", False):
442
- st.markdown("""
443
- *Use clean, well-structured data & provide business context for best insights.*
444
- """)
445
-
446
- if st.button("🧹 Clear Cache"):
447
- st.session_state.bundles.clear()
448
- st.session_state.slide_idx = 0
449
- st.session_state.active_bundle_key = None
450
- for f in Path(tempfile.gettempdir()).glob("*.png"):
451
- try: f.unlink()
452
- except: pass
453
- st.success("Cache cleared."); st.rerun()
 
1
  ###############################################################################
2
  # Sozo Business Studio · AI transforms business data into compelling narratives
 
 
3
  ###############################################################################
4
  import os, re, json, hashlib, uuid, base64, io, tempfile, wave, requests
5
  from pathlib import Path
 
18
  from langchain_experimental.agents import create_pandas_dataframe_agent
19
  from langchain_google_genai import ChatGoogleGenerativeAI
20
  from google import genai
 
21
 
22
  # ─────────────────────────────────────────────────────────────────────────────
23
+ # CONSTANTS & API KEYS
 
 
 
 
 
 
 
24
  # ─────────────────────────────────────────────────────────────────────────────
25
# Directory holding the bundled NotoSans font files: the script's own folder
# when __file__ is defined, otherwise the current working directory.
if "__file__" in globals():
    FONT_DIR = Path(__file__).parent
else:
    FONT_DIR = Path(".")
FONT_REG = FONT_DIR / "NotoSans-Regular.ttf"
FONT_BLD = FONT_DIR / "NotoSans-Bold.ttf"
FONT_FAM = "NotoSans"

# Hard caps applied to the generated output.
SLIDES = 5
MAX_CHARTS = 5

# Gemini key is mandatory: without it the app shows an error and halts.
API_KEY = os.getenv("GEMINI_API_KEY")
if not API_KEY:
    st.error("Set GEMINI_API_KEY")
    st.stop()
GEM = genai.Client(api_key=API_KEY)

# Deepgram key is optional; it is only read by the narration helper.
DG_KEY = os.getenv("DEEPGRAM_API_KEY")
39
 
40
  # ─────────────────────────────────────────────────────────────────────────────
41
+ # STATE
42
  # ─────────────────────────────────────────────────────────────────────────────
43
st.set_page_config(page_title="Sozo Business Studio", layout="wide")
st.title("📊 Sozo Business Studio")
st.caption("AI transforms business data into compelling narratives.")

# Seed the per-session slots once; setdefault keeps any value that a previous
# run of the script already stored.
for state_key, initial in (
    ("bundles", {}),
    ("slide_idx", 0),
    ("active_bundle_key", None),
):
    st.session_state.setdefault(state_key, initial)
50
 
51
  # ─────────────────────────────────────────────────────────────────────────────
52
  # HELPERS
53
  # ─────────────────────────────────────────────────────────────────────────────
54
def sha1_bytes(b):
    """Return the hexadecimal SHA-1 digest of the bytes-like object *b*.

    Used to derive a lookup key from content, not for anything
    security-sensitive.
    """
    return hashlib.sha1(b).hexdigest()
55
 
56
def validate_file_upload(f):
    """Return a list of human-readable problems with the uploaded file *f*.

    *f* is read through its ``size`` and ``name`` attributes only. An empty
    list means the upload passed every check. ``None`` short-circuits to a
    single "No file uploaded" message.
    """
    if f is None:
        return ["No file uploaded"]

    allowed_suffixes = (".csv", ".xlsx", ".xls")
    size_limit = 50 * 1024 * 1024
    # (failed?, message) pairs, in the order the messages are reported.
    checks = (
        (f.size == 0, "File is empty"),
        (f.size > size_limit, "File >50 MB"),
        (Path(f.name).suffix.lower() not in allowed_suffixes, "Unsupported type"),
    )
    return [message for failed, message in checks if failed]
64
+
65
def load_dataframe_safely(bytes_, name):
    """Parse uploaded file content into a DataFrame without raising.

    Args:
        bytes_: Raw file content.
        name: Original file name; only its suffix is used to pick a parser.

    Returns:
        ``(df, None)`` on success, ``(None, message)`` on any failure. On
        success the column labels are stripped strings and rows that are
        entirely empty have been dropped.
    """
    try:
        ext = Path(name).suffix.lower()
        if ext == ".csv":
            # latin-1 maps every possible byte, so it can never raise
            # UnicodeDecodeError and must be tried last; putting it before
            # cp1252 would make the cp1252 attempt unreachable.
            for enc in ("utf-8", "cp1252", "latin-1"):
                try:
                    df = pd.read_csv(io.BytesIO(bytes_), encoding=enc)
                    break
                except UnicodeDecodeError:
                    continue
        elif ext in (".xlsx", ".xls"):
            df = pd.read_excel(io.BytesIO(bytes_))
        else:
            # Do not hand arbitrary content to the Excel reader.
            raise ValueError(f"Unsupported file format: {ext}")

        if df.empty or len(df.columns) == 0:
            raise ValueError("No data")
        df.columns = df.columns.astype(str).str.strip()
        df = df.dropna(how="all")
        if df.empty:
            raise ValueError("Rows all empty")
        return df, None
    except Exception as exc:
        # Boundary helper: callers expect an error string, never an exception.
        return None, str(exc)
84
+
85
def fix_bullet(txt):
    """Replace stray C1 control characters with readable punctuation.

    Characters in the range U+0080..U+009F that have an entry in the table
    below are swapped for their printable counterpart; every other character
    in that range is removed. Anything that is not a ``str`` yields ``""``.
    """
    if not isinstance(txt, str):
        return ""

    printable = {
        "\x95": "•",
        "\x96": "-",
        "\x97": "—",
        "\x91": "'",
        "\x92": "'",
        "\x93": '"',
        "\x94": '"',
    }

    def swap(match):
        # Unmapped control characters fall back to the empty string.
        return printable.get(match.group(0), "")

    return re.sub(r"[\x80-\x9f]", swap, txt)
91
+
92
def arrow_df(df):
    """Return a copy of *df* with dtypes simplified for display.

    * Nullable ``Int64`` / ``Float64`` columns are converted to plain NumPy
      dtypes. A column that contains missing values becomes ``float64``
      (missing -> NaN), because casting ``pd.NA`` to ``int64`` raises.
    * Columns whose cells are ``numpy.dtype`` objects are converted to
      strings.

    The input frame is not modified.
    """
    safe = df.copy()
    for col in safe.columns:
        dtype_name = safe[col].dtype.name
        if dtype_name in ("Int64", "Float64"):
            has_missing = safe[col].isna().any()
            target = "float64" if has_missing else dtype_name.lower()
            safe[col] = safe[col].astype(target)
        if safe[col].apply(lambda cell: isinstance(cell, np.dtype)).any():
            safe[col] = safe[col].astype(str)
    return safe
100
 
101
def arrow_info(df):
    """Build a per-column summary table for *df*.

    The result has one row per column of *df* with the fields ``Column``,
    ``Type`` (dtype as text), ``Non-Null`` and ``Null`` (cell counts), and is
    passed through ``arrow_df`` before being returned.
    """
    summary = pd.DataFrame({
        "Column": df.columns,
        "Type": [str(t) for t in df.dtypes],
        # Plain arrays rather than label-indexed Series: otherwise the summary
        # inherits the column labels as its index and shows them twice.
        "Non-Null": df.notna().sum().to_numpy(),
        "Null": df.isna().sum().to_numpy(),
    })
    return arrow_df(summary)
108
+
109
def pcm_to_wav(pcm, sr=24000, ch=1, w=2):
    """Wrap raw PCM frames in a WAV container and return the file bytes.

    Args:
        pcm: Raw audio frames.
        sr: Frame rate written to the header.
        ch: Number of channels written to the header.
        w: Sample width in bytes written to the header.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as wav_out:
        wav_out.setnchannels(ch)
        wav_out.setsampwidth(w)
        wav_out.setframerate(sr)
        wav_out.writeframes(pcm)
    return buffer.getvalue()
114
+
 
 
 
 
115
@st.cache_data(show_spinner=False)
def _deepgram_request(text):
    """POST *text* to the Deepgram speak endpoint and return (audio, mime).

    Raises ``requests.RequestException`` on transport or HTTP errors. Letting
    the error propagate (instead of returning a sentinel) keeps failed calls
    out of the ``st.cache_data`` cache, so the same text can be retried.
    """
    r = requests.post(
        "https://api.deepgram.com/v1/speak",
        params={"model": "aura-asteria-en"},
        headers={
            "Authorization": f"Token {DG_KEY}",
            "Content-Type": "application/json",
        },
        json={"text": text},
        timeout=30,
    )
    r.raise_for_status()
    return r.content, r.headers.get("Content-Type", "audio/mpeg")


def deepgram_tts(text):
    """Synthesize speech for *text*; return ``(audio_bytes, mime_type)``.

    Returns ``(None, None)`` when no Deepgram key is configured, when nothing
    is left of *text* after sanitising, or when the request fails. The text is
    reduced to word characters, whitespace and basic punctuation and cut to
    500 characters before it is sent. Successful responses are cached per
    sanitised text.
    """
    if not DG_KEY:
        return None, None
    text = re.sub(r"[^\w\s.,!?;:-]", "", text or "")[:500]
    if not text.strip():
        # Nothing to narrate; do not spend a request on empty input.
        return None, None
    try:
        return _deepgram_request(text)
    except requests.RequestException:
        return None, None
128
+
129
# Chart placeholder emitted by the LLM, e.g. <generate_chart: "description">
# or [generate_chart: description]. The description is captured as group "d".
TAG_RE = re.compile(
    r'[<\[]\s*generate_?chart\s*[:=]?\s*["\']?(?P<d>[^>\]\'"”’]+?)["\']?\s*[>\]]',
    re.I,
)


def extract_chart_tags(t):
    """Return the distinct chart descriptions found in *t*, in first-seen order.

    A falsy *t* (``None`` or ``""``) yields an empty list.
    """
    found = (m.group("d").strip() for m in TAG_RE.finditer(t or ""))
    # dict.fromkeys de-duplicates while preserving order.
    return list(dict.fromkeys(found))


def repl_tags(txt, map_, f):
    """Replace chart tags in *txt* whose description is a key of *map_*.

    Each matching tag is replaced by ``f(map_[description])``; tags with an
    unknown description are left exactly as written. A non-string *txt*
    yields ``""``.
    """
    if not isinstance(txt, str):
        return ""

    def sub(m):
        d = m.group("d").strip()
        return f(map_[d]) if d in map_ else m.group(0)

    return TAG_RE.sub(sub, txt)
136
+
137
class PDF(FPDF, HTMLMixin):
    """FPDF document combined with HTMLMixin; defines no members of its own."""
138
+
139
def build_pdf(md, charts):
    """Render a Markdown report to PDF and return the document bytes.

    Args:
        md: Markdown source; chart tags it contains are replaced by ``<img>``
            elements pointing at the matching file in *charts*.
        charts: Mapping of chart description -> image file path.

    The bundled NotoSans family is used for the text only when both its
    regular and bold faces registered successfully; otherwise the document
    falls back to "Arial" so that a later ``set_font`` call never asks for a
    style that was not registered.
    """
    md = fix_bullet(md).replace("•", "*")
    md = repl_tags(md, charts, lambda p: f'<img src="{p}">')
    html = MarkdownIt("commonmark", {"breaks": True}).enable("table").render(md)

    pdf = PDF()
    pdf.set_auto_page_break(True, margin=15)

    registered = set()
    for style, ttf in (("", FONT_REG), ("B", FONT_BLD)):
        if not ttf.exists():
            continue
        try:
            pdf.add_font(FONT_FAM, style, str(ttf), uni=True)
        except Exception:
            # Best effort: an unreadable font file must not abort the report.
            continue
        registered.add(style)

    if registered:
        pdf.set_fallback_fonts([FONT_FAM])
    family = FONT_FAM if registered >= {"", "B"} else "Arial"

    pdf.add_page()
    pdf.set_font(family, "B", 18)
    pdf.cell(0, 12, "AI-Generated Business Report", ln=True)
    pdf.ln(3)
    pdf.set_font(family, "", 11)
    pdf.write_html(html)
    return bytes(pdf.output(dest="S"))
155
+
156
def build_pptx(slides, charts):
    """Build a PowerPoint deck from slide scripts and return the file bytes.

    Args:
        slides: Iterable of slide texts. The first non-empty line of each is
            the title; the remaining lines become bullets, except lines that
            contain a chart tag.
        charts: Mapping of chart description -> image file path. The first
            tag of a slide that has an entry here is placed as a picture.
    """
    prs = Presentation()
    layout = prs.slide_layouts[1]
    for raw in slides:
        rc = fix_bullet(raw)
        tags = extract_chart_tags(rc)
        # Strip bullet markers ("*" included, as Markdown lists use it) and
        # drop lines that consisted of nothing but markers.
        stripped = (line.strip(" •*-") for line in rc.splitlines())
        lines = [line for line in stripped if line]
        title = lines[0] if lines else "Slide"
        bullets = [line for line in lines[1:] if not TAG_RE.search(line)]

        slide = prs.slides.add_slide(layout)
        slide.shapes.title.text = title
        tf = slide.shapes.placeholders[1].text_frame
        tf.clear()
        tf.word_wrap = True
        for i, bullet in enumerate(bullets):
            # clear() leaves one empty paragraph behind; reuse it for the
            # first bullet so the body does not start with a blank entry.
            p = tf.paragraphs[0] if i == 0 else tf.add_paragraph()
            p.text = bullet
            p.font.size = Pt(20)

        chart = next((charts[t] for t in tags if t in charts), None)
        if chart is not None:
            try:
                slide.shapes.add_picture(
                    chart, Inches(1), Inches(3.5), width=Inches(8)
                )
            except Exception:
                # Best effort: a missing or unreadable image leaves the slide
                # text-only instead of failing the whole deck.
                pass

    bio = io.BytesIO()
    prs.save(bio)
    return bio.getvalue()
173
+
174
+ # ─────────────────────────────────────────────────────────────────────────────
175
+ # GENERATOR
176
+ # ─────────────────────────────────────────────────────────────────────────────
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
@st.cache_data(show_spinner=False)
def make_assets(key, bytes_, name, mode, ctx):
    """Generate the report, slides and charts for one uploaded file.

    Args:
        key: Identifier echoed back under ``"key"`` in the result.
        bytes_: Raw content of the uploaded file.
        name: Original file name (selects the parser).
        mode: ``"Report"``, ``"Presentation"`` or ``"Both"``.
        ctx: Optional free-text business context from the user.

    Returns:
        ``None`` when the file cannot be loaded or *mode* selects nothing,
        otherwise a dict with ``preview`` (report Markdown with inline
        images), ``pdf`` (bytes), ``slides`` (list of slide texts), ``pptx``
        (bytes), ``charts`` (description -> PNG path) and ``key``.
    """
    df, err = load_dataframe_safely(bytes_, name)
    if err:
        st.error(err)
        return None

    llm = ChatGoogleGenerativeAI(
        model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.1
    )
    ctx_data = {
        "shape": df.shape,
        "columns": list(df.columns),
        "dtypes": df.dtypes.astype(str).to_dict(),
        "sample": df.head(3).fillna("N/A").to_dict(),
        "numeric_columns": df.select_dtypes("number").columns.tolist(),
        "user_ctx": ctx or "General business analysis",
    }
    # default=str: sample cells may hold values json cannot encode natively
    # (e.g. timestamps read from Excel); for a prompt their text form is fine.
    data_json = json.dumps(ctx_data, indent=2, default=str)

    outputs = {}
    if mode in ("Report", "Both"):
        report_prompt = (
            "Write an executive-level Markdown report with insights & recommendations.\n"
            'Use chart tags <generate_chart: "description"> where helpful.\n'
            "\n"
            f"Data: {data_json}\n"
        )
        outputs["rep"] = llm.invoke(report_prompt).content
    if mode in ("Presentation", "Both"):
        # The data context is included here too; without it the model has
        # nothing to base the slides on.
        slides_prompt = (
            f"Create exactly {SLIDES} slides.\n"
            "\n"
            f"Data: {data_json}\n"
            "\n"
            "Each slide:\n"
            "Slide X - Title\n"
            "• bullet 1 (≤15 words)\n"
            "• bullet 2\n"
            '<generate_chart: "description">\n'
        )
        outputs["pres"] = llm.invoke(slides_prompt).content
    if not outputs:
        return None

    chart_descs = extract_chart_tags("\n".join(outputs.values()))[:MAX_CHARTS]
    chart_paths = {}
    if chart_descs:
        # SECURITY NOTE(review): allow_dangerous_code=True lets the agent
        # execute model-written Python in this process, driven by the uploaded
        # data and user context. Confirm the deployment is sandboxed.
        agent = create_pandas_dataframe_agent(
            llm=llm, df=df, verbose=False, allow_dangerous_code=True
        )
        for d in chart_descs:
            with plt.ioff():
                try:
                    agent.run(f"Create a {d} with matplotlib and save.")
                    fig = plt.gcf()
                    if fig.axes:
                        p = Path(tempfile.gettempdir()) / f"{uuid.uuid4()}.png"
                        fig.savefig(
                            p, dpi=300, bbox_inches="tight", facecolor="white"
                        )
                        chart_paths[d] = str(p)
                except Exception as exc:
                    # One failed chart must not abort the whole run.
                    st.warning(f"Chart '{d}' could not be generated: {exc}")
                finally:
                    plt.close("all")

    pdf = pptx = preview = None
    slides = []
    if "rep" in outputs:
        md = fix_bullet(outputs["rep"])
        pdf = build_pdf(md, chart_paths)
        preview = repl_tags(
            md,
            chart_paths,
            lambda p: f'<img src="data:image/png;base64,{base64.b64encode(Path(p).read_bytes()).decode()}" style="max-width:100%;">',
        )
    if "pres" in outputs:
        txt = fix_bullet(outputs["pres"])
        parts = re.split(r"(?im)^\s*slide\s+\d+\s*-?\s*", txt)[1:]
        # Drop empty parts first, then cap, so blanks do not use up the quota.
        slides = [part.strip() for part in parts if part.strip()][:SLIDES]
        if slides:
            pptx = build_pptx(tuple(slides), chart_paths)

    return {
        "preview": preview,
        "pdf": pdf,
        "slides": slides,
        "pptx": pptx,
        "charts": chart_paths,
        "key": key,
    }
 
 
 
 
 
 
 
 
241
 
242
  # ─────────────────────────────────────────────────────────────────────────────
243
  # UI
244
  # ─────────────────────────────────────────────────────────────────────────────
245
# ── Inputs ───────────────────────────────────────────────────────────────────
mode = st.radio("Output:", ["Report", "Presentation", "Both"], horizontal=True, index=2)
st.subheader("📁 Upload Business Data")
upl = st.file_uploader("CSV or Excel file", type=["csv", "xlsx", "xls"])

if upl:
    upload_errors = validate_file_upload(upl)
    for e in upload_errors:
        st.error(e)
    if upload_errors:
        st.stop()
    df_prev, prev_err = load_dataframe_safely(upl.getvalue(), upl.name)
    if prev_err:
        # Without this check df_prev is None and the preview below crashes.
        st.error(prev_err)
        st.stop()
    with st.expander("Data Preview", False):
        st.write(f"Shape: {df_prev.shape[0]} × {df_prev.shape[1]}")
        st.dataframe(arrow_df(df_prev.head()))
        st.dataframe(arrow_info(df_prev))

ctx = st.text_area("Context (optional)")

clicked = st.button("🚀 Generate", type="primary")
if not upl:
    if clicked:
        st.warning("Upload a file")
    st.stop()

# ── Generate, or reuse the result already stored for these exact inputs ─────
# A button only reports True on the run triggered by its own click. Every
# later rerun (Prev/Next/Jump/Narrate) sees False, so the stored bundle must
# be enough to keep rendering; stopping here would make navigation impossible.
bkey = sha1_bytes(b"".join([upl.getvalue(), mode.encode(), ctx.encode()]))
if bkey in st.session_state.bundles:
    bundle = st.session_state.bundles[bkey]
    if clicked:
        st.info("Using cached result")
elif clicked:
    with st.spinner("Generating assets …"):
        bundle = make_assets(bkey, upl.getvalue(), upl.name, mode, ctx)
    if bundle:
        st.session_state.bundles[bkey] = bundle
else:
    st.stop()
if not bundle:
    st.error("Generation failed")
    st.stop()

# Showing a different bundle than last time: restart at the first slide.
if st.session_state.active_bundle_key != bundle["key"]:
    st.session_state.slide_idx = 0
    st.session_state.active_bundle_key = bundle["key"]

# ── Report ───────────────────────────────────────────────────────────────────
if mode in ("Report", "Both"):
    with st.expander("📄 Report", True):
        if bundle["preview"]:
            st.markdown(bundle["preview"], unsafe_allow_html=True)
            if bundle["pdf"]:
                st.download_button(
                    "Download PDF", bundle["pdf"],
                    "business_report.pdf", "application/pdf",
                )
        else:
            st.warning("No report.")

# ── Slides ───────────────────────────────────────────────────────────────────
if mode in ("Presentation", "Both"):
    with st.expander("📑 Slides", True):
        slides = bundle["slides"]
        if not slides:
            st.warning("No slides.")
        else:
            total = len(slides)
            # Clamp in case the stored index is out of range for this deck.
            idx = max(0, min(st.session_state.slide_idx, total - 1))
            st.session_state.slide_idx = idx
            st.progress((idx + 1) / total)

            cols = st.columns([1, 1, 2, 1, 1])
            with cols[0]:
                if st.button("⬅️", disabled=idx == 0):
                    st.session_state.slide_idx -= 1
                    st.rerun()
            with cols[1]:
                st.write(f"{idx+1}/{total}")
            with cols[2]:
                j = st.selectbox(
                    "Jump", range(total), idx,
                    format_func=lambda x: f"Slide {x+1}",
                )
                if j != idx:
                    st.session_state.slide_idx = j
                    st.rerun()
            with cols[3]:
                if st.button("➡️", disabled=idx >= total - 1):
                    st.session_state.slide_idx += 1
                    st.rerun()
            with cols[4]:
                if st.button("🔊"):
                    # Narration text: the slide without chart tags or the
                    # "Slide N -" prefix, lines joined into sentences.
                    txt = repl_tags(slides[idx], {}, lambda _: "")
                    txt = re.sub(r'^Slide \d+\s*[-:]?\s*', '', txt, flags=re.I)
                    narr = '. '.join(
                        part.strip('•*- ') for part in txt.split('\n') if part.strip()
                    )
                    audio, mime = deepgram_tts(narr)
                    if audio:
                        if "pcm" in mime.lower() or "l16" in mime.lower():
                            audio = pcm_to_wav(audio)
                            mime = "audio/wav"
                        st.audio(audio, format=mime)
                    else:
                        st.error("Narration failed")
            st.divider()

            # Current slide: title, bullets, then the first available chart.
            raw = slides[idx]
            title_line, *body = raw.split('\n')
            title = re.sub(r'^Slide \d+\s*[-:]?\s*', '', title_line, flags=re.I).strip()
            st.markdown(f"### {title or f'Slide {idx+1}'}")
            for line in body:
                line = line.strip()
                if not line or TAG_RE.search(line):
                    continue
                line = line.lstrip('•*- ')  # drop the leading bullet marker
                st.markdown(f"- {line}")
            tg = [t for t in extract_chart_tags(raw) if t in bundle["charts"]]
            # The PNG lives in the temp directory and may be gone by now.
            if tg and Path(bundle["charts"][tg[0]]).exists():
                st.image(bundle["charts"][tg[0]], use_container_width=True)
            st.divider()
            if bundle["pptx"]:
                st.download_button(
                    "Download PPTX", bundle["pptx"],
                    "business_presentation.pptx",
                    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
                )