rairo committed on
Commit
d15492d
·
verified ·
1 Parent(s): 0096675

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -15
app.py CHANGED
@@ -111,11 +111,36 @@ def audio_duration(path: str) -> float:
111
  TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'β€œβ€]?(?P<d>[^>\"\'”\]]+?)[\"\'β€œβ€]?\s*[>\]]', re.I, )
112
  extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
113
 
114
- re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)
115
  def clean_narration(txt: str) -> str:
116
- txt = re_scene.sub("", txt)
 
 
 
 
117
  txt = TAG_RE.sub("", txt)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
118
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
 
 
 
 
 
119
  return re.sub(r"\s{2,}", " ", txt).strip()
120
 
121
  # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
@@ -923,11 +948,7 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
923
  1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
924
  2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
925
  3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
926
- 4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain:
927
- - Trends and patterns
928
- - Outliers or anomalies
929
- - Performance indicators
930
- - Risk factors or opportunities
931
  5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
932
  6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
933
 
@@ -992,8 +1013,12 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
992
  print(f"Failed to generate chart: {desc}")
993
  continue
994
 
995
- # 3. Assemble the final report bundle (UNCHANGED)
996
- pdf_bytes = build_pdf(md, chart_paths)
 
 
 
 
997
 
998
  return {
999
  "type": "report",
@@ -1009,7 +1034,7 @@ def build_story_prompt(ctx_dict):
1009
  cols = ", ".join(enhanced_ctx["columns"][:6])
1010
 
1011
  return f"""
1012
- You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.
1013
 
1014
  **Enhanced Dataset Context:**
1015
  {json.dumps(enhanced_ctx, indent=2)}
@@ -1121,7 +1146,7 @@ def generate_video(buf: bytes, name: str, ctx: str, key: str):
1121
  concat_media(audio_parts, audio_mix, "audio")
1122
  final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
1123
  subprocess.run(
1124
- [ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac", "-shortest", str(final_vid), ],
1125
  check=True, capture_output=True,
1126
  )
1127
  for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
@@ -1184,10 +1209,11 @@ if (bundle := st.session_state.get("bundle")):
1184
 
1185
  c1, c2 = st.columns(2)
1186
  with c1:
1187
- st.download_button(
1188
- "Download PDF", bundle["pdf"], f"business_report_{bundle['key'][:8]}.pdf",
1189
- "application/pdf", use_container_width=True,
1190
- )
 
1191
  with c2:
1192
  if DG_KEY and st.button("πŸ”Š Narrate Summary", key=f"aud_{bundle['key']}"):
1193
  txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])
 
111
  TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'β€œβ€]?(?P<d>[^>\"\'”\]]+?)[\"\'β€œβ€]?\s*[>\]]', re.I, )
112
  extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
113
 
114
+ re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I | re.M)
115
  def clean_narration(txt: str) -> str:
116
+ """
117
+ Aggressively cleans text for text-to-speech by removing artifacts.
118
+ This function no longer relies on the LLM to format correctly.
119
+ """
120
+ # 1. Remove chart tags
121
  txt = TAG_RE.sub("", txt)
122
+
123
+ # 2. Remove scene numbers (e.g., "Scene 1:", "SCENE 2.", etc.)
124
+ txt = re_scene.sub("", txt)
125
+
126
+ # 3. Remove common descriptive phrases about the visuals
127
+ phrases_to_remove = [
128
+ r"as you can see in the chart",
129
+ r"this chart shows",
130
+ r"the chart illustrates",
131
+ r"in this visual",
132
+ r"this graph displays",
133
+ ]
134
+ for phrase in phrases_to_remove:
135
+ txt = re.sub(phrase, "", txt, flags=re.IGNORECASE)
136
+
137
+ # 4. Remove text within parentheses, which often contains notes
138
  txt = re.sub(r"\s*\([^)]*\)", "", txt)
139
+
140
+ # 5. Remove any remaining markdown or formatting artifacts
141
+ txt = re.sub(r"[\*#_]", "", txt)
142
+
143
+ # 6. Normalize whitespace to a single space
144
  return re.sub(r"\s{2,}", " ", txt).strip()
145
 
146
  # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
 
948
  1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
949
  2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
950
  3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
951
+ 4. **Key Insights**: You must provide exactly 5 key insights, each with its own chart tag.
 
 
 
 
952
  5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
953
  6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
954
 
 
1013
  print(f"Failed to generate chart: {desc}")
1014
  continue
1015
 
1016
+ # 3. Assemble the final report bundle
1017
+ try:
1018
+ pdf_bytes = build_pdf(md, chart_paths)
1019
+ except Exception as e:
1020
+ st.warning(f"⚠️ PDF generation failed and will be skipped. Error: {e}")
1021
+ pdf_bytes = None
1022
 
1023
  return {
1024
  "type": "report",
 
1034
  cols = ", ".join(enhanced_ctx["columns"][:6])
1035
 
1036
  return f"""
1037
+ You are a professional business storyteller and data analyst. You must create a script with exactly {VIDEO_SCENES} scenes, each separated by '[SCENE_BREAK]'.
1038
 
1039
  **Enhanced Dataset Context:**
1040
  {json.dumps(enhanced_ctx, indent=2)}
 
1146
  concat_media(audio_parts, audio_mix, "audio")
1147
  final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
1148
  subprocess.run(
1149
+ [ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac", str(final_vid), ],
1150
  check=True, capture_output=True,
1151
  )
1152
  for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
 
1209
 
1210
  c1, c2 = st.columns(2)
1211
  with c1:
1212
+ if bundle.get("pdf"):
1213
+ st.download_button(
1214
+ "Download PDF", bundle["pdf"], f"business_report_{bundle['key'][:8]}.pdf",
1215
+ "application/pdf", use_container_width=True,
1216
+ )
1217
  with c2:
1218
  if DG_KEY and st.button("πŸ”Š Narrate Summary", key=f"aud_{bundle['key']}"):
1219
  txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])