Update sozo_gen.py
Browse files
- sozo_gen.py +58 -28
sozo_gen.py
CHANGED
|
@@ -500,7 +500,7 @@ def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
|
|
| 500 |
return json.loads(json.dumps(context, default=str))
|
| 501 |
|
| 502 |
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
|
| 503 |
-
logging.info(f"Generating
|
| 504 |
df = load_dataframe_safely(buf, name)
|
| 505 |
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
|
| 506 |
|
|
@@ -509,44 +509,74 @@ def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, b
|
|
| 509 |
df_json = df.to_json(orient='records')
|
| 510 |
estimated_tokens = len(df_json) / 4
|
| 511 |
if estimated_tokens < MAX_CONTEXT_TOKENS:
|
| 512 |
-
logging.info(f"Using full JSON context.")
|
| 513 |
data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
|
| 514 |
context_for_charts = get_augmented_context(df, ctx)
|
| 515 |
else:
|
| 516 |
-
raise ValueError("Dataset too large.")
|
| 517 |
except Exception as e:
|
| 518 |
-
logging.warning(f"Falling back to augmented summary context: {e}")
|
| 519 |
augmented_context = get_augmented_context(df, ctx)
|
| 520 |
data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
|
| 521 |
context_for_charts = augmented_context
|
| 522 |
|
| 523 |
-
|
| 524 |
-
|
| 525 |
-
|
| 526 |
-
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
- The most interesting parts of this story may lie in the following areas: {', '.join(intelligence['insight_opportunities'])}.
|
| 534 |
-
- Weave these threads into your core narrative.
|
| 535 |
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 539 |
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 544 |
|
| 545 |
-
Now, begin your report. Let the data's story unfold naturally.
|
| 546 |
-
"""
|
| 547 |
-
|
| 548 |
-
md = llm.invoke(report_prompt).content
|
| 549 |
-
|
| 550 |
chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
|
| 551 |
chart_urls = {}
|
| 552 |
chart_generator = ChartGenerator(llm, df)
|
|
|
|
| 500 |
return json.loads(json.dumps(context, default=str))
|
| 501 |
|
| 502 |
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
|
| 503 |
+
logging.info(f"Generating two-pass report draft for project {project_id}")
|
| 504 |
df = load_dataframe_safely(buf, name)
|
| 505 |
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
|
| 506 |
|
|
|
|
| 509 |
df_json = df.to_json(orient='records')
|
| 510 |
estimated_tokens = len(df_json) / 4
|
| 511 |
if estimated_tokens < MAX_CONTEXT_TOKENS:
|
| 512 |
+
logging.info(f"Using full JSON context for report generation.")
|
| 513 |
data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
|
| 514 |
context_for_charts = get_augmented_context(df, ctx)
|
| 515 |
else:
|
| 516 |
+
raise ValueError("Dataset too large for full context.")
|
| 517 |
except Exception as e:
|
| 518 |
+
logging.warning(f"Falling back to augmented summary context for report generation: {e}")
|
| 519 |
augmented_context = get_augmented_context(df, ctx)
|
| 520 |
data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
|
| 521 |
context_for_charts = augmented_context
|
| 522 |
|
| 523 |
+
md = ""
|
| 524 |
+
try:
|
| 525 |
+
# --- Pass 1: The Analyst ---
|
| 526 |
+
analyst_prompt = f"""
|
| 527 |
+
You are a senior data analyst. Your task is to analyze the provided data and create a structured report plan.
|
| 528 |
+
Identify 4-5 of the most important, non-overlapping insights from the data.
|
| 529 |
+
For each insight, provide a brief summary and the single best chart description tag to visualize it.
|
| 530 |
+
|
| 531 |
+
**Data Context:**
|
| 532 |
+
{data_context_str}
|
|
|
|
|
|
|
| 533 |
|
| 534 |
+
**Output Format:**
|
| 535 |
+
Return ONLY a valid JSON array of objects. Each object must have two keys: "insight_summary" and "chart_description".
|
| 536 |
+
Ensure each chart_description is unique and directly relates to its insight_summary.
|
| 537 |
+
|
| 538 |
+
Example:
|
| 539 |
+
[
|
| 540 |
+
{{ "insight_summary": "Smokers incur significantly higher medical charges.", "chart_description": "bar | Average Charges by Smoker Status" }},
|
| 541 |
+
{{ "insight_summary": "Medical charges show a clear positive correlation with Body Mass Index.", "chart_description": "scatter | Charges vs. BMI" }}
|
| 542 |
+
]
|
| 543 |
+
"""
|
| 544 |
+
logging.info("Executing Analyst Pass...")
|
| 545 |
+
analyst_response = llm.invoke(analyst_prompt).content.strip()
|
| 546 |
+
if analyst_response.startswith("```json"):
|
| 547 |
+
analyst_response = analyst_response[7:-3]
|
| 548 |
+
report_plan = json.loads(analyst_response)
|
| 549 |
+
logging.info(f"Analyst Pass successful. Plan has {len(report_plan)} insights.")
|
| 550 |
+
|
| 551 |
+
# --- Pass 2: The Writer ---
|
| 552 |
+
writer_prompt = f"""
|
| 553 |
+
You are an expert business writer. Your task is to write a flowing, narrative-style report based on the following plan from our data analyst.
|
| 554 |
+
For each point in the plan, write a clear, insightful paragraph that explains the finding.
|
| 555 |
+
After your explanation, you **must** insert the `chart_description` tag provided in the plan, exactly as it is written in the format `<generate_chart: "the_description">`.
|
| 556 |
+
Start with a brief, engaging introduction and end with a short conclusion.
|
| 557 |
+
|
| 558 |
+
**Analyst's Plan:**
|
| 559 |
+
{json.dumps(report_plan, indent=2)}
|
| 560 |
+
|
| 561 |
+
Now, write the complete Markdown report.
|
| 562 |
+
"""
|
| 563 |
+
logging.info("Executing Writer Pass...")
|
| 564 |
+
md = llm.invoke(writer_prompt).content.strip()
|
| 565 |
+
logging.info("Writer Pass successful.")
|
| 566 |
|
| 567 |
+
except Exception as e:
|
| 568 |
+
logging.error(f"Two-pass system failed: {e}. Reverting to single-pass fallback.")
|
| 569 |
+
fallback_prompt = f"""
|
| 570 |
+
You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
|
| 571 |
+
**Data Context:** {data_context_str}
|
| 572 |
+
**Your Grounding Rules (Most Important):**
|
| 573 |
+
1. **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section.
|
| 574 |
+
2. **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
|
| 575 |
+
3. **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
|
| 576 |
+
Now, begin your report. Let the data's story unfold naturally.
|
| 577 |
+
"""
|
| 578 |
+
md = llm.invoke(fallback_prompt).content.strip()
|
| 579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
|
| 581 |
chart_urls = {}
|
| 582 |
chart_generator = ChartGenerator(llm, df)
|