rairo committed on
Commit
ec8c9c1
·
verified ·
1 Parent(s): d04b508

Update sozo_gen.py

Browse files
Files changed (1) hide show
  1. sozo_gen.py +58 -28
sozo_gen.py CHANGED
@@ -500,7 +500,7 @@ def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
500
  return json.loads(json.dumps(context, default=str))
501
 
502
  def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
503
- logging.info(f"Generating persona-driven report draft for project {project_id}")
504
  df = load_dataframe_safely(buf, name)
505
  llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
506
 
@@ -509,44 +509,74 @@ def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, b
509
  df_json = df.to_json(orient='records')
510
  estimated_tokens = len(df_json) / 4
511
  if estimated_tokens < MAX_CONTEXT_TOKENS:
512
- logging.info(f"Using full JSON context.")
513
  data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
514
  context_for_charts = get_augmented_context(df, ctx)
515
  else:
516
- raise ValueError("Dataset too large.")
517
  except Exception as e:
518
- logging.warning(f"Falling back to augmented summary context: {e}")
519
  augmented_context = get_augmented_context(df, ctx)
520
  data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
521
  context_for_charts = augmented_context
522
 
523
- intelligence = analyze_data_intelligence(df)
524
- viz_strategy = generate_visualization_strategy(intelligence)
525
-
526
- report_prompt = f"""
527
- You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
528
-
529
- **Data Context:**
530
- {data_context_str}
531
-
532
- **Intelligence Analysis:**
533
- - The most interesting parts of this story may lie in the following areas: {', '.join(intelligence['insight_opportunities'])}.
534
- - Weave these threads into your core narrative.
535
 
536
- **Visualization Strategy:**
537
- - {viz_strategy}
538
- - Available Chart Types: `bar, pie, line, scatter, hist, heatmap, area, bubble`.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
539
 
540
- **Your Grounding Rules (Most Important):**
541
- 1. **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section. Do not invent, modify, or assume any column names that are not on this list.
542
- 2. **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
543
- 3. **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
 
 
 
 
 
 
 
 
544
 
545
- Now, begin your report. Let the data's story unfold naturally.
546
- """
547
-
548
- md = llm.invoke(report_prompt).content
549
-
550
  chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
551
  chart_urls = {}
552
  chart_generator = ChartGenerator(llm, df)
 
500
  return json.loads(json.dumps(context, default=str))
501
 
502
  def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
503
+ logging.info(f"Generating two-pass report draft for project {project_id}")
504
  df = load_dataframe_safely(buf, name)
505
  llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
506
 
 
509
  df_json = df.to_json(orient='records')
510
  estimated_tokens = len(df_json) / 4
511
  if estimated_tokens < MAX_CONTEXT_TOKENS:
512
+ logging.info(f"Using full JSON context for report generation.")
513
  data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
514
  context_for_charts = get_augmented_context(df, ctx)
515
  else:
516
+ raise ValueError("Dataset too large for full context.")
517
  except Exception as e:
518
+ logging.warning(f"Falling back to augmented summary context for report generation: {e}")
519
  augmented_context = get_augmented_context(df, ctx)
520
  data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
521
  context_for_charts = augmented_context
522
 
523
+ md = ""
524
+ try:
525
+ # --- Pass 1: The Analyst ---
526
+ analyst_prompt = f"""
527
+ You are a senior data analyst. Your task is to analyze the provided data and create a structured report plan.
528
+ Identify 4-5 of the most important, non-overlapping insights from the data.
529
+ For each insight, provide a brief summary and the single best chart description tag to visualize it.
530
+
531
+ **Data Context:**
532
+ {data_context_str}
 
 
533
 
534
+ **Output Format:**
535
+ Return ONLY a valid JSON array of objects. Each object must have two keys: "insight_summary" and "chart_description".
536
+ Ensure each chart_description is unique and directly relates to its insight_summary.
537
+
538
+ Example:
539
+ [
540
+ {{ "insight_summary": "Smokers incur significantly higher medical charges.", "chart_description": "bar | Average Charges by Smoker Status" }},
541
+ {{ "insight_summary": "Medical charges show a clear positive correlation with Body Mass Index.", "chart_description": "scatter | Charges vs. BMI" }}
542
+ ]
543
+ """
544
+ logging.info("Executing Analyst Pass...")
545
+ analyst_response = llm.invoke(analyst_prompt).content.strip()
546
+ if analyst_response.startswith("```json"):
547
+ analyst_response = analyst_response[7:-3]
548
+ report_plan = json.loads(analyst_response)
549
+ logging.info(f"Analyst Pass successful. Plan has {len(report_plan)} insights.")
550
+
551
+ # --- Pass 2: The Writer ---
552
+ writer_prompt = f"""
553
+ You are an expert business writer. Your task is to write a flowing, narrative-style report based on the following plan from our data analyst.
554
+ For each point in the plan, write a clear, insightful paragraph that explains the finding.
555
+ After your explanation, you **must** insert the `chart_description` tag provided in the plan, exactly as it is written in the format `<generate_chart: "the_description">`.
556
+ Start with a brief, engaging introduction and end with a short conclusion.
557
+
558
+ **Analyst's Plan:**
559
+ {json.dumps(report_plan, indent=2)}
560
+
561
+ Now, write the complete Markdown report.
562
+ """
563
+ logging.info("Executing Writer Pass...")
564
+ md = llm.invoke(writer_prompt).content.strip()
565
+ logging.info("Writer Pass successful.")
566
 
567
+ except Exception as e:
568
+ logging.error(f"Two-pass system failed: {e}. Reverting to single-pass fallback.")
569
+ fallback_prompt = f"""
570
+ You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
571
+ **Data Context:** {data_context_str}
572
+ **Your Grounding Rules (Most Important):**
573
+ 1. **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section.
574
+ 2. **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
575
+ 3. **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
576
+ Now, begin your report. Let the data's story unfold naturally.
577
+ """
578
+ md = llm.invoke(fallback_prompt).content.strip()
579
 
 
 
 
 
 
580
  chart_descs = extract_chart_tags(md)[:MAX_CHARTS]
581
  chart_urls = {}
582
  chart_generator = ChartGenerator(llm, df)