Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -111,11 +111,36 @@ def audio_duration(path: str) -> float:
|
|
| 111 |
TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'ββ]?(?P<d>[^>\"\'β\]]+?)[\"\'ββ]?\s*[>\]]', re.I, )
|
| 112 |
extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
|
| 113 |
|
| 114 |
-
re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)
|
| 115 |
def clean_narration(txt: str) -> str:
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
txt = TAG_RE.sub("", txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
txt = re.sub(r"\s*\([^)]*\)", "", txt)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 119 |
return re.sub(r"\s{2,}", " ", txt).strip()
|
| 120 |
|
| 121 |
# ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
|
|
@@ -923,11 +948,7 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
|
|
| 923 |
1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
|
| 924 |
2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
|
| 925 |
3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
|
| 926 |
-
4. **Key Insights**:
|
| 927 |
-
- Trends and patterns
|
| 928 |
-
- Outliers or anomalies
|
| 929 |
-
- Performance indicators
|
| 930 |
-
- Risk factors or opportunities
|
| 931 |
5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
|
| 932 |
6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
|
| 933 |
|
|
@@ -992,8 +1013,12 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
|
|
| 992 |
print(f"Failed to generate chart: {desc}")
|
| 993 |
continue
|
| 994 |
|
| 995 |
-
# 3. Assemble the final report bundle
|
| 996 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 997 |
|
| 998 |
return {
|
| 999 |
"type": "report",
|
|
@@ -1009,7 +1034,7 @@ def build_story_prompt(ctx_dict):
|
|
| 1009 |
cols = ", ".join(enhanced_ctx["columns"][:6])
|
| 1010 |
|
| 1011 |
return f"""
|
| 1012 |
-
You are a professional business storyteller and data analyst.
|
| 1013 |
|
| 1014 |
**Enhanced Dataset Context:**
|
| 1015 |
{json.dumps(enhanced_ctx, indent=2)}
|
|
@@ -1121,7 +1146,7 @@ def generate_video(buf: bytes, name: str, ctx: str, key: str):
|
|
| 1121 |
concat_media(audio_parts, audio_mix, "audio")
|
| 1122 |
final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
|
| 1123 |
subprocess.run(
|
| 1124 |
-
[ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac",
|
| 1125 |
check=True, capture_output=True,
|
| 1126 |
)
|
| 1127 |
for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
|
|
@@ -1184,10 +1209,11 @@ if (bundle := st.session_state.get("bundle")):
|
|
| 1184 |
|
| 1185 |
c1, c2 = st.columns(2)
|
| 1186 |
with c1:
|
| 1187 |
-
|
| 1188 |
-
|
| 1189 |
-
|
| 1190 |
-
|
|
|
|
| 1191 |
with c2:
|
| 1192 |
if DG_KEY and st.button("π Narrate Summary", key=f"aud_{bundle['key']}"):
|
| 1193 |
txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])
|
|
|
|
| 111 |
TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'ββ]?(?P<d>[^>\"\'β\]]+?)[\"\'ββ]?\s*[>\]]', re.I, )
|
| 112 |
extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
|
| 113 |
|
| 114 |
+
re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I | re.M)
|
| 115 |
def clean_narration(txt: str) -> str:
|
| 116 |
+
"""
|
| 117 |
+
Aggressively cleans text for text-to-speech by removing artifacts.
|
| 118 |
+
This function no longer relies on the LLM to format correctly.
|
| 119 |
+
"""
|
| 120 |
+
# 1. Remove chart tags
|
| 121 |
txt = TAG_RE.sub("", txt)
|
| 122 |
+
|
| 123 |
+
# 2. Remove scene numbers (e.g., "Scene 1:", "SCENE 2.", etc.)
|
| 124 |
+
txt = re_scene.sub("", txt)
|
| 125 |
+
|
| 126 |
+
# 3. Remove common descriptive phrases about the visuals
|
| 127 |
+
phrases_to_remove = [
|
| 128 |
+
r"as you can see in the chart",
|
| 129 |
+
r"this chart shows",
|
| 130 |
+
r"the chart illustrates",
|
| 131 |
+
r"in this visual",
|
| 132 |
+
r"this graph displays",
|
| 133 |
+
]
|
| 134 |
+
for phrase in phrases_to_remove:
|
| 135 |
+
txt = re.sub(phrase, "", txt, flags=re.IGNORECASE)
|
| 136 |
+
|
| 137 |
+
# 4. Remove text within parentheses, which often contains notes
|
| 138 |
txt = re.sub(r"\s*\([^)]*\)", "", txt)
|
| 139 |
+
|
| 140 |
+
# 5. Remove any remaining markdown or formatting artifacts
|
| 141 |
+
txt = re.sub(r"[\*#_]", "", txt)
|
| 142 |
+
|
| 143 |
+
# 6. Normalize whitespace to a single space
|
| 144 |
return re.sub(r"\s{2,}", " ", txt).strip()
|
| 145 |
|
| 146 |
# ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
|
|
|
|
| 948 |
1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
|
| 949 |
2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
|
| 950 |
3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
|
| 951 |
+
4. **Key Insights**: You must provide exactly 5 key insights, each with its own chart tag.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 952 |
5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
|
| 953 |
6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
|
| 954 |
|
|
|
|
| 1013 |
print(f"Failed to generate chart: {desc}")
|
| 1014 |
continue
|
| 1015 |
|
| 1016 |
+
# 3. Assemble the final report bundle
|
| 1017 |
+
try:
|
| 1018 |
+
pdf_bytes = build_pdf(md, chart_paths)
|
| 1019 |
+
except Exception as e:
|
| 1020 |
+
st.warning(f"β οΈ PDF generation failed and will be skipped. Error: {e}")
|
| 1021 |
+
pdf_bytes = None
|
| 1022 |
|
| 1023 |
return {
|
| 1024 |
"type": "report",
|
|
|
|
| 1034 |
cols = ", ".join(enhanced_ctx["columns"][:6])
|
| 1035 |
|
| 1036 |
return f"""
|
| 1037 |
+
You are a professional business storyteller and data analyst. You must create a script with exactly {VIDEO_SCENES} scenes, each separated by '[SCENE_BREAK]'.
|
| 1038 |
|
| 1039 |
**Enhanced Dataset Context:**
|
| 1040 |
{json.dumps(enhanced_ctx, indent=2)}
|
|
|
|
| 1146 |
concat_media(audio_parts, audio_mix, "audio")
|
| 1147 |
final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
|
| 1148 |
subprocess.run(
|
| 1149 |
+
[ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac", str(final_vid), ],
|
| 1150 |
check=True, capture_output=True,
|
| 1151 |
)
|
| 1152 |
for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
|
|
|
|
| 1209 |
|
| 1210 |
c1, c2 = st.columns(2)
|
| 1211 |
with c1:
|
| 1212 |
+
if bundle.get("pdf"):
|
| 1213 |
+
st.download_button(
|
| 1214 |
+
"Download PDF", bundle["pdf"], f"business_report_{bundle['key'][:8]}.pdf",
|
| 1215 |
+
"application/pdf", use_container_width=True,
|
| 1216 |
+
)
|
| 1217 |
with c2:
|
| 1218 |
if DG_KEY and st.button("π Narrate Summary", key=f"aud_{bundle['key']}"):
|
| 1219 |
txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])
|