Adk-Analyst2

Sleeping

App Files Community

rairo committed on Jul 6, 2025

Commit

d15492d

verified ·

1 Parent(s): 0096675

Update app.py

Browse files

Files changed (1) hide show

app.py +41 -15

app.py CHANGED Viewed

@@ -111,11 +111,36 @@ def audio_duration(path: str) -> float:
 TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
-re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I)
 def clean_narration(txt: str) -> str:
-    txt = re_scene.sub("", txt)
     txt = TAG_RE.sub("", txt)
     txt = re.sub(r"\s*\([^)]*\)", "", txt)
     return re.sub(r"\s{2,}", " ", txt).strip()
 # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
@@ -923,11 +948,7 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
     1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
     2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
     3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
-    4. **Key Insights**: Provide 4-6 actionable insights specific to the identified domain:
-       - Trends and patterns
-       - Outliers or anomalies
-       - Performance indicators
-       - Risk factors or opportunities
     5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
     6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
@@ -992,8 +1013,12 @@ def generate_report_bundle(buf: bytes, name: str, ctx: str, key: str):
                     print(f"Failed to generate chart: {desc}")
                     continue
-    # 3. Assemble the final report bundle (UNCHANGED)
-    pdf_bytes = build_pdf(md, chart_paths)
     return {
         "type": "report",
@@ -1009,7 +1034,7 @@ def build_story_prompt(ctx_dict):
     cols = ", ".join(enhanced_ctx["columns"][:6])
     return f"""
-    You are a professional business storyteller and data analyst. Create a compelling script for a {VIDEO_SCENES}-scene business video presentation.
     **Enhanced Dataset Context:**
     {json.dumps(enhanced_ctx, indent=2)}
@@ -1121,7 +1146,7 @@ def generate_video(buf: bytes, name: str, ctx: str, key: str):
     concat_media(audio_parts, audio_mix, "audio")
     final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
     subprocess.run(
-        [ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac", "-shortest", str(final_vid), ],
         check=True, capture_output=True,
     )
     for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
@@ -1184,10 +1209,11 @@ if (bundle := st.session_state.get("bundle")):
         c1, c2 = st.columns(2)
         with c1:
-            st.download_button(
-                "Download PDF", bundle["pdf"], f"business_report_{bundle['key'][:8]}.pdf",
-                "application/pdf", use_container_width=True,
-            )
         with c2:
             if DG_KEY and st.button("🔊 Narrate Summary", key=f"aud_{bundle['key']}"):
                 txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])

 TAG_RE = re.compile( r'[<[]\s*generate_?chart\s*[:=]?\s*[\"\'“”]?(?P<d>[^>\"\'”\]]+?)[\"\'“”]?\s*[>\]]', re.I, )
 extract_chart_tags = lambda t: list( dict.fromkeys(m.group("d").strip() for m in TAG_RE.finditer(t or "")) )
+re_scene = re.compile(r"^\s*scene\s*\d+[:.\- ]*", re.I | re.M)
 def clean_narration(txt: str) -> str:
+    """
+    Aggressively cleans text for text-to-speech by removing artifacts.
+    This function no longer relies on the LLM to format correctly.
+    """
+    # 1. Remove chart tags
     txt = TAG_RE.sub("", txt)
+    # 2. Remove scene numbers (e.g., "Scene 1:", "SCENE 2.", etc.)
+    txt = re_scene.sub("", txt)
+    # 3. Remove common descriptive phrases about the visuals
+    phrases_to_remove = [
+        r"as you can see in the chart",
+        r"this chart shows",
+        r"the chart illustrates",
+        r"in this visual",
+        r"this graph displays",
+    ]
+    for phrase in phrases_to_remove:
+        txt = re.sub(phrase, "", txt, flags=re.IGNORECASE)
+    # 4. Remove text within parentheses, which often contains notes
     txt = re.sub(r"\s*\([^)]*\)", "", txt)
+    # 5. Remove any remaining markdown or formatting artifacts
+    txt = re.sub(r"[\*#_]", "", txt)
+    # 6. Normalize whitespace to a single space
     return re.sub(r"\s{2,}", " ", txt).strip()
 # ─── IMAGE GENERATION & PLACEHOLDER ────────────────────────────────────────
     1. **Identify Data Domain**: First, determine what type of data this represents (e.g., sales/revenue, healthcare/medical, HR/employee, financial, operational, customer, research, etc.) based on column names and sample data.
     2. **Executive Summary**: Start with a high-level summary of key findings and business impact.
     3. **Data Quality Assessment**: Comment on data completeness, any notable missing values, and data reliability.
+    4. **Key Insights**: You must provide exactly 5 key insights, each with its own chart tag.
     5. **Strategic Recommendations**: Offer concrete, actionable recommendations based on the data.
     6. **Visual Support**: When a visualization would enhance understanding, insert chart tags like: `<generate_chart: "chart_type | specific description">`
                     print(f"Failed to generate chart: {desc}")
                     continue
+    # 3. Assemble the final report bundle
+    try:
+        pdf_bytes = build_pdf(md, chart_paths)
+    except Exception as e:
+        st.warning(f"⚠️ PDF generation failed and will be skipped. Error: {e}")
+        pdf_bytes = None
     return {
         "type": "report",
     cols = ", ".join(enhanced_ctx["columns"][:6])
     return f"""
+    You are a professional business storyteller and data analyst. You must create a script with exactly {VIDEO_SCENES} scenes, each separated by '[SCENE_BREAK]'.
     **Enhanced Dataset Context:**
     {json.dumps(enhanced_ctx, indent=2)}
     concat_media(audio_parts, audio_mix, "audio")
     final_vid = Path(tempfile.gettempdir()) / f"{key}.mp4"
     subprocess.run(
+        [ "ffmpeg", "-y", "-i", str(silent_vid), "-i", str(audio_mix), "-c:v", "copy", "-c:a", "aac", str(final_vid), ],
         check=True, capture_output=True,
     )
     for p in temps + [silent_vid, audio_mix]: p.unlink(missing_ok=True)
         c1, c2 = st.columns(2)
         with c1:
+            if bundle.get("pdf"):
+                st.download_button(
+                    "Download PDF", bundle["pdf"], f"business_report_{bundle['key'][:8]}.pdf",
+                    "application/pdf", use_container_width=True,
+                )
         with c2:
             if DG_KEY and st.button("🔊 Narrate Summary", key=f"aud_{bundle['key']}"):
                 txt = re.sub(r"<[^>]+>", "", bundle["raw_md"])