sbs-API

Build error

App Files Community

rairo committed on Jul 19, 2025

Commit

ec8c9c1

verified ·

1 Parent(s): d04b508

Update sozo_gen.py

Browse files

Files changed (1) hide show

sozo_gen.py +58 -28

sozo_gen.py CHANGED Viewed

@@ -500,7 +500,7 @@ def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
     return json.loads(json.dumps(context, default=str))
 def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
-    logging.info(f"Generating persona-driven report draft for project {project_id}")
     df = load_dataframe_safely(buf, name)
     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
@@ -509,44 +509,74 @@ def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, b
         df_json = df.to_json(orient='records')
         estimated_tokens = len(df_json) / 4
         if estimated_tokens < MAX_CONTEXT_TOKENS:
-            logging.info(f"Using full JSON context.")
             data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
             context_for_charts = get_augmented_context(df, ctx)
         else:
-            raise ValueError("Dataset too large.")
     except Exception as e:
-        logging.warning(f"Falling back to augmented summary context: {e}")
         augmented_context = get_augmented_context(df, ctx)
         data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
         context_for_charts = augmented_context
-    intelligence = analyze_data_intelligence(df)
-    viz_strategy = generate_visualization_strategy(intelligence)
-    report_prompt = f"""
-    You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
-    **Data Context:**
-    {data_context_str}
-    **Intelligence Analysis:**
-    - The most interesting parts of this story may lie in the following areas: {', '.join(intelligence['insight_opportunities'])}.
-    - Weave these threads into your core narrative.
-    **Visualization Strategy:**
-    - {viz_strategy}
-    - Available Chart Types: `bar, pie, line, scatter, hist, heatmap, area, bubble`.
-    **Your Grounding Rules (Most Important):**
-    1.  **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section. Do not invent, modify, or assume any column names that are not on this list.
-    2.  **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
-    3.  **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
-    Now, begin your report. Let the data's story unfold naturally.
-    """
-    md = llm.invoke(report_prompt).content
     chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
     chart_urls = {}
     chart_generator = ChartGenerator(llm, df)

     return json.loads(json.dumps(context, default=str))
 def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
+    logging.info(f"Generating two-pass report draft for project {project_id}")
     df = load_dataframe_safely(buf, name)
     llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
         df_json = df.to_json(orient='records')
         estimated_tokens = len(df_json) / 4
         if estimated_tokens < MAX_CONTEXT_TOKENS:
+            logging.info(f"Using full JSON context for report generation.")
             data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
             context_for_charts = get_augmented_context(df, ctx)
         else:
+            raise ValueError("Dataset too large for full context.")
     except Exception as e:
+        logging.warning(f"Falling back to augmented summary context for report generation: {e}")
         augmented_context = get_augmented_context(df, ctx)
         data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
         context_for_charts = augmented_context
+    md = ""
+    try:
+        # --- Pass 1: The Analyst ---
+        analyst_prompt = f"""
+        You are a senior data analyst. Your task is to analyze the provided data and create a structured report plan.
+        Identify 4-5 of the most important, non-overlapping insights from the data.
+        For each insight, provide a brief summary and the single best chart description tag to visualize it.
+        **Data Context:**
+        {data_context_str}
+        **Output Format:**
+        Return ONLY a valid JSON array of objects. Each object must have two keys: "insight_summary" and "chart_description".
+        Ensure each chart_description is unique and directly relates to its insight_summary.
+        Example:
+        [
+          {{ "insight_summary": "Smokers incur significantly higher medical charges.", "chart_description": "bar | Average Charges by Smoker Status" }},
+          {{ "insight_summary": "Medical charges show a clear positive correlation with Body Mass Index.", "chart_description": "scatter | Charges vs. BMI" }}
+        ]
+        """
+        logging.info("Executing Analyst Pass...")
+        analyst_response = llm.invoke(analyst_prompt).content.strip()
+        if analyst_response.startswith("```json"):
+            analyst_response = analyst_response[7:-3]
+        report_plan = json.loads(analyst_response)
+        logging.info(f"Analyst Pass successful. Plan has {len(report_plan)} insights.")
+        # --- Pass 2: The Writer ---
+        writer_prompt = f"""
+        You are an expert business writer. Your task is to write a flowing, narrative-style report based on the following plan from our data analyst.
+        For each point in the plan, write a clear, insightful paragraph that explains the finding.
+        After your explanation, you **must** insert the `chart_description` tag provided in the plan, exactly as it is written in the format `<generate_chart: "the_description">`.
+        Start with a brief, engaging introduction and end with a short conclusion.
+        **Analyst's Plan:**
+        {json.dumps(report_plan, indent=2)}
+        Now, write the complete Markdown report.
+        """
+        logging.info("Executing Writer Pass...")
+        md = llm.invoke(writer_prompt).content.strip()
+        logging.info("Writer Pass successful.")
+    except Exception as e:
+        logging.error(f"Two-pass system failed: {e}. Reverting to single-pass fallback.")
+        fallback_prompt = f"""
+        You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
+        **Data Context:** {data_context_str}
+        **Your Grounding Rules (Most Important):**
+        1.  **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section.
+        2.  **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
+        3.  **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
+        Now, begin your report. Let the data's story unfold naturally.
+        """
+        md = llm.invoke(fallback_prompt).content.strip()
     chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
     chart_urls = {}
     chart_generator = ChartGenerator(llm, df)