Update sozo_gen.py
Browse files- sozo_gen.py +64 -17
sozo_gen.py
CHANGED
|
@@ -29,7 +29,7 @@ import requests
|
|
| 29 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
|
| 30 |
FPS, WIDTH, HEIGHT = 24, 1280, 720
|
| 31 |
MAX_CHARTS, VIDEO_SCENES = 5, 5
|
| 32 |
-
MAX_CONTEXT_TOKENS =
|
| 33 |
|
| 34 |
# --- API Initialization ---
|
| 35 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
@@ -38,7 +38,7 @@ if not API_KEY:
|
|
| 38 |
|
| 39 |
PEXELS_API_KEY = os.getenv("PEXELS_API_KEY")
|
| 40 |
|
| 41 |
-
# --- Helper Functions ---
|
| 42 |
def load_dataframe_safely(buf, name: str):
|
| 43 |
ext = Path(name).suffix.lower()
|
| 44 |
df = (pd.read_excel if ext in (".xlsx", ".xls") else pd.read_csv)(buf)
|
|
@@ -147,7 +147,7 @@ def search_and_download_pexels_video(query: str, duration: float, out_path: Path
|
|
| 147 |
temp_dl_path.unlink()
|
| 148 |
return None
|
| 149 |
|
| 150 |
-
# --- Chart Generation System ---
|
| 151 |
class ChartSpecification:
|
| 152 |
def __init__(self, chart_type: str, title: str, x_col: str, y_col: str = None, size_col: str = None, agg_method: str = None, filter_condition: str = None, top_n: int = None, color_scheme: str = "professional"):
|
| 153 |
self.chart_type = chart_type; self.title = title; self.x_col = x_col; self.y_col = y_col; self.size_col = size_col
|
|
@@ -233,7 +233,7 @@ def prepare_plot_data(spec: ChartSpecification, df: pd.DataFrame):
|
|
| 233 |
return df[numeric_cols].corr()
|
| 234 |
return df[spec.x_col]
|
| 235 |
|
| 236 |
-
# --- Animation & Video Generation ---
|
| 237 |
def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
|
| 238 |
plot_data = prepare_plot_data(spec, df)
|
| 239 |
frames = max(10, int(dur * fps))
|
|
@@ -368,6 +368,42 @@ def sanitize_for_firebase_key(text: str) -> str:
|
|
| 368 |
text = text.replace(char, '_')
|
| 369 |
return text
|
| 370 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 371 |
def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
|
| 372 |
"""Creates a detailed summary of the dataframe for the AI."""
|
| 373 |
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
|
|
@@ -400,41 +436,52 @@ def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
|
|
| 400 |
return json.loads(json.dumps(context, default=str))
|
| 401 |
|
| 402 |
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
|
| 403 |
-
logging.info(f"Generating report draft for project {project_id}")
|
| 404 |
df = load_dataframe_safely(buf, name)
|
| 405 |
-
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.
|
| 406 |
|
|
|
|
| 407 |
data_context_str = ""
|
| 408 |
context_for_charts = {}
|
| 409 |
try:
|
| 410 |
df_json = df.to_json(orient='records')
|
| 411 |
estimated_tokens = len(df_json) / 4
|
| 412 |
if estimated_tokens < MAX_CONTEXT_TOKENS:
|
| 413 |
-
logging.info(f"
|
| 414 |
data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
|
| 415 |
context_for_charts = get_augmented_context(df, ctx)
|
| 416 |
else:
|
| 417 |
-
raise ValueError("Dataset too large
|
| 418 |
except Exception as e:
|
| 419 |
-
logging.warning(f"
|
| 420 |
augmented_context = get_augmented_context(df, ctx)
|
| 421 |
data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
|
| 422 |
context_for_charts = augmented_context
|
| 423 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 424 |
report_prompt = f"""
|
| 425 |
-
You are an
|
| 426 |
|
| 427 |
**Data Context:**
|
| 428 |
{data_context_str}
|
| 429 |
|
| 430 |
-
**
|
| 431 |
-
|
| 432 |
-
|
| 433 |
-
|
| 434 |
-
|
| 435 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 436 |
|
| 437 |
-
Now,
|
| 438 |
"""
|
| 439 |
|
| 440 |
md = llm.invoke(report_prompt).content
|
|
|
|
| 29 |
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - [%(funcName)s] - %(message)s')
|
| 30 |
FPS, WIDTH, HEIGHT = 24, 1280, 720
|
| 31 |
MAX_CHARTS, VIDEO_SCENES = 5, 5
|
| 32 |
+
MAX_CONTEXT_TOKENS = 500000
|
| 33 |
|
| 34 |
# --- API Initialization ---
|
| 35 |
API_KEY = os.getenv("GOOGLE_API_KEY")
|
|
|
|
| 38 |
|
| 39 |
PEXELS_API_KEY = os.getenv("PEXELS_API_KEY")
|
| 40 |
|
| 41 |
+
# --- Helper Functions (Stable) ---
|
| 42 |
def load_dataframe_safely(buf, name: str):
|
| 43 |
ext = Path(name).suffix.lower()
|
| 44 |
df = (pd.read_excel if ext in (".xlsx", ".xls") else pd.read_csv)(buf)
|
|
|
|
| 147 |
temp_dl_path.unlink()
|
| 148 |
return None
|
| 149 |
|
| 150 |
+
# --- Chart Generation System (Stable) ---
|
| 151 |
class ChartSpecification:
|
| 152 |
def __init__(self, chart_type: str, title: str, x_col: str, y_col: str = None, size_col: str = None, agg_method: str = None, filter_condition: str = None, top_n: int = None, color_scheme: str = "professional"):
|
| 153 |
self.chart_type = chart_type; self.title = title; self.x_col = x_col; self.y_col = y_col; self.size_col = size_col
|
|
|
|
| 233 |
return df[numeric_cols].corr()
|
| 234 |
return df[spec.x_col]
|
| 235 |
|
| 236 |
+
# --- Animation & Video Generation (Stable) ---
|
| 237 |
def animate_chart(spec: ChartSpecification, df: pd.DataFrame, dur: float, out: Path, fps: int = FPS) -> str:
|
| 238 |
plot_data = prepare_plot_data(spec, df)
|
| 239 |
frames = max(10, int(dur * fps))
|
|
|
|
| 368 |
text = text.replace(char, '_')
|
| 369 |
return text
|
| 370 |
|
| 371 |
+
# NEW: Intelligence functions to guide the storyteller AI
|
| 372 |
+
def analyze_data_intelligence(df: pd.DataFrame) -> Dict:
    """Analyzes the dataset to find key characteristics and opportunities for storytelling.

    Args:
        df: The dataset to profile.

    Returns:
        Dict with keys:
            insight_opportunities: list of narrative angles worth exploring.
            is_timeseries: True if a column name contains 'date'/'time', or
                a column carries a datetime dtype.
            has_correlations: True if at least two numeric columns exist.
            has_segments: True if both non-numeric and numeric columns exist.
    """
    numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
    categorical_cols = df.select_dtypes(exclude=['number']).columns.tolist()

    # str(col) guards against non-string column labels (e.g. integer headers
    # from a headerless CSV), which would crash .lower(). Also treat genuine
    # datetime dtypes as time series even when the name gives no hint.
    is_timeseries = any(
        'date' in str(col).lower() or 'time' in str(col).lower()
        for col in df.columns
    ) or any(pd.api.types.is_datetime64_any_dtype(dt) for dt in df.dtypes)

    opportunities = []
    if is_timeseries:
        opportunities.append("temporal trends")
    if len(numeric_cols) > 1:
        opportunities.append("correlations between metrics")
    if len(categorical_cols) > 0 and len(numeric_cols) > 0:
        opportunities.append("segmentation by category")
    if df.isnull().sum().sum() > 0:
        opportunities.append("impact of missing data")

    return {
        "insight_opportunities": opportunities,
        "is_timeseries": is_timeseries,
        "has_correlations": len(numeric_cols) > 1,
        "has_segments": len(categorical_cols) > 0 and len(numeric_cols) > 0,
    }
|
| 395 |
+
|
| 396 |
+
def generate_visualization_strategy(intelligence: Dict) -> str:
    """Generates dynamic advice on which charts to use.

    Args:
        intelligence: Output of analyze_data_intelligence; the boolean flags
            'is_timeseries', 'has_correlations' and 'has_segments' select
            which chart hints are included.

    Returns:
        A single advice string for the report prompt.
    """
    # Each hint ends with a trailing space, so plain concatenation via join
    # reproduces the exact sentence spacing of the final string.
    hints = ["Vary your visualizations to keep the report engaging. "]
    if intelligence["is_timeseries"]:
        hints.append("Use 'line' or 'area' charts to explore temporal trends. ")
    if intelligence["has_correlations"]:
        hints.append("Use 'scatter' or 'heatmap' charts to reveal correlations. ")
    if intelligence["has_segments"]:
        hints.append("Use 'bar' or 'pie' charts to compare segments. ")
    return "".join(hints)
|
| 406 |
+
|
| 407 |
def get_augmented_context(df: pd.DataFrame, user_ctx: str) -> Dict:
|
| 408 |
"""Creates a detailed summary of the dataframe for the AI."""
|
| 409 |
numeric_cols = df.select_dtypes(include=['number']).columns.tolist()
|
|
|
|
| 436 |
return json.loads(json.dumps(context, default=str))
|
| 437 |
|
| 438 |
def generate_report_draft(buf, name: str, ctx: str, uid: str, project_id: str, bucket):
|
| 439 |
+
logging.info(f"Generating persona-driven report draft for project {project_id}")
|
| 440 |
df = load_dataframe_safely(buf, name)
|
| 441 |
+
llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", google_api_key=API_KEY, temperature=0.2)
|
| 442 |
|
| 443 |
+
# --- Try/Fallback Context Strategy ---
|
| 444 |
data_context_str = ""
|
| 445 |
context_for_charts = {}
|
| 446 |
try:
|
| 447 |
df_json = df.to_json(orient='records')
|
| 448 |
estimated_tokens = len(df_json) / 4
|
| 449 |
if estimated_tokens < MAX_CONTEXT_TOKENS:
|
| 450 |
+
logging.info(f"Using full JSON context.")
|
| 451 |
data_context_str = f"Here is the full dataset in JSON format:\n{df_json}"
|
| 452 |
context_for_charts = get_augmented_context(df, ctx)
|
| 453 |
else:
|
| 454 |
+
raise ValueError("Dataset too large.")
|
| 455 |
except Exception as e:
|
| 456 |
+
logging.warning(f"Falling back to augmented summary context: {e}")
|
| 457 |
augmented_context = get_augmented_context(df, ctx)
|
| 458 |
data_context_str = f"The full dataset is too large to display. Here is a detailed summary:\n{json.dumps(augmented_context, indent=2)}"
|
| 459 |
context_for_charts = augmented_context
|
| 460 |
|
| 461 |
+
# --- Persona-Driven Prompting ---
|
| 462 |
+
intelligence = analyze_data_intelligence(df)
|
| 463 |
+
viz_strategy = generate_visualization_strategy(intelligence)
|
| 464 |
+
|
| 465 |
report_prompt = f"""
|
| 466 |
+
You are an elite data storyteller and business intelligence expert. Your mission is to uncover the compelling, hidden narrative in this dataset and present it as a captivating story in Markdown format that drives action.
|
| 467 |
|
| 468 |
**Data Context:**
|
| 469 |
{data_context_str}
|
| 470 |
|
| 471 |
+
**Intelligence Analysis:**
|
| 472 |
+
- The most interesting parts of this story may lie in the following areas: {', '.join(intelligence['insight_opportunities'])}.
|
| 473 |
+
- Weave these threads into your core narrative.
|
| 474 |
+
|
| 475 |
+
**Visualization Strategy:**
|
| 476 |
+
- {viz_strategy}
|
| 477 |
+
- Available Chart Types: `bar, pie, line, scatter, hist, heatmap, area, bubble`.
|
| 478 |
+
|
| 479 |
+
**Your Grounding Rules (Most Important):**
|
| 480 |
+
1. **Strict Accuracy:** Your entire analysis and narrative **must strictly** use the column names provided in the 'Data Context' section. Do not invent, modify, or assume any column names that are not on this list.
|
| 481 |
+
2. **Chart Support:** Wherever a key finding is made, you **must** support it with a chart tag: `<generate_chart: "chart_type | a specific, compelling description">`.
|
| 482 |
+
3. **Chart Accuracy:** The column names used in your chart descriptions **must** also be an exact match from the provided data context.
|
| 483 |
|
| 484 |
+
Now, begin your report. Let the data's story unfold naturally.
|
| 485 |
"""
|
| 486 |
|
| 487 |
md = llm.invoke(report_prompt).content
|