{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "0708537e", "metadata": {}, "outputs": [], "source": [ "from langchain_core.tools import tool\n", "from dotenv import load_dotenv\n", "import sys,os\n", "from io import StringIO\n", "from pydantic import BaseModel, Field\n", "import pandas as pd\n", "from typing import Optional\n" ] }, { "cell_type": "code", "execution_count": 28, "id": "4c5a0c5a", "metadata": {}, "outputs": [], "source": [ "from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle # <-- ADD THIS IMPORT\n", "from reportlab.platypus import Paragraph, Frame # <-- ADD FRAME IMPORT\n" ] }, { "cell_type": "code", "execution_count": 29, "id": "85b4fabd", "metadata": {}, "outputs": [], "source": [ "@tool\n", "def eda_fact_sheet(path:str,sample_size: int = 3, freq_top: int = 3):\n", " \"\"\"\n", " Generates a batchwise fact sheet for a dataset.\n", "\n", " Args:\n", " path (str): Path to CSV or Excel dataset.\n", " sample_size (int): Number of example values per column.\n", " freq_top (int): Number of top frequent values per column.\n", "\n", " Returns:\n", " dict: Fact sheet containing dataset-level info and column-level stats in batches.\n", " \"\"\"\n", " path = path.strip().strip('\"').strip(\"'\")\n", " path = os.path.normpath(path)\n", "\n", " #--- Load dataset ---\n", " if path.endswith(\".csv\"):\n", " df = pd.read_csv(path)\n", " elif path.endswith(\".xlsx\") or path.endswith(\".xls\"):\n", " df = pd.read_excel(path)\n", " else:\n", " raise ValueError(\"Unsupported file format. Use CSV or Excel.\")\n", "\n", " df = pd.read_csv(path)\n", "\n", " n_rows, n_columns = df.shape\n", "\n", " fact_sheet = {\n", " \"n_rows\": n_rows,\n", " \"n_columns\": n_columns,\n", " \"batches\": []\n", " }\n", "\n", " # --- Determine batch splits ---\n", " batch_size = 15\n", " batch_splits = [df.columns[i:i+batch_size].tolist() for i in range(0, n_columns, batch_size)]\n", "\n", " # --- Process each batch ---\n", " for batch_cols in batch_splits:\n", " batch_profile = {\"columns\": {}}\n", " for col in batch_cols:\n", " series = df[col]\n", " col_profile = {}\n", "\n", " total = len(series)\n", "\n", " # Core stats\n", " col_profile[\"dtype\"] = str(series.dtype)\n", " col_profile[\"null_percent\"] = round(float(series.isna().sum() / total * 100), 2)\n", " col_profile[\"unique_percent\"] = round(float(series.nunique(dropna=True) / total * 100), 2)\n", "\n", " # Example values\n", " try:\n", " col_profile[\"examples\"] = series.dropna().sample(\n", " min(sample_size, series.dropna().shape[0]),\n", " random_state=42\n", " ).tolist()\n", " except ValueError:\n", " col_profile[\"examples\"] = []\n", "\n", " # Top frequent values\n", " if not series.isna().all():\n", " top_freq = series.value_counts(dropna=True).head(freq_top)\n", " col_profile[\"top_values\"] = {str(k): int(v) for k, v in top_freq.to_dict().items()}\n", "\n", " # Numeric columns\n", " if pd.api.types.is_numeric_dtype(series):\n", " col_profile.update({\n", " \"min\": float(series.min(skipna=True)),\n", " \"max\": float(series.max(skipna=True)),\n", " \"mean\": float(series.mean(skipna=True)),\n", " \"std\": float(series.std(skipna=True))\n", " })\n", " if series.std(skipna=True) > 0:\n", " z_scores = ((series - series.mean(skipna=True)) / series.std(skipna=True)).abs()\n", " col_profile[\"has_outliers\"] = bool((z_scores > 3).any())\n", " else:\n", " col_profile[\"has_outliers\"] = False\n", "\n", " # Datetime columns\n", " elif pd.api.types.is_datetime64_any_dtype(series):\n", " if not series.dropna().empty:\n", " col_profile[\"min_date\"] = str(series.min())\n", " col_profile[\"max_date\"] = str(series.max())\n", "\n", " # Text/categorical columns\n", " elif pd.api.types.is_object_dtype(series):\n", " lengths = series.dropna().astype(str).map(len)\n", " if not lengths.empty:\n", " col_profile[\"avg_length\"] = float(lengths.mean())\n", " col_profile[\"max_length\"] = int(lengths.max())\n", " unusual = series.dropna().astype(str).str.contains(r\"[^a-zA-Z0-9\\s]\").sum()\n", " col_profile[\"unusual_char_percent\"] = round(float(unusual / total * 100), 2)\n", "\n", " # Flags\n", " if series.nunique(dropna=True) == total:\n", " col_profile[\"is_identifier\"] = True\n", " elif series.nunique(dropna=True) <= 1:\n", " col_profile[\"is_constant\"] = True\n", "\n", " batch_profile[\"columns\"][col] = col_profile\n", "\n", " fact_sheet[\"batches\"].append(batch_profile)\n", "\n", " return fact_sheet\n" ] }, { "cell_type": "code", "execution_count": 30, "id": "500733d4", "metadata": {}, "outputs": [], "source": [ "class PythonInputs(BaseModel):\n", " query: str = Field(description=\"A valid python command to run.\")\n", "\n", "load_dotenv()\n", "\n", "@tool(args_schema=PythonInputs)\n", "def python_repl_ast(query: str, path: Optional[str] = None) -> str:\n", " \"\"\"\n", " Runs a Python command and returns the result.\n", " If `path` is provided, the dataframe at that path will be loaded as `df`.\n", " Supports CSV, Excel, and Parquet. \n", " Any in-place modifications to `df` are saved back to the same file.\n", " \"\"\"\n", "\n", " df = None\n", " if path:\n", " ext = os.path.splitext(path)[-1].lower()\n", " if ext == \".csv\":\n", " df = pd.read_csv(path)\n", " elif ext in [\".xls\", \".xlsx\"]:\n", " df = pd.read_excel(path)\n", " elif ext == \".parquet\":\n", " df = pd.read_parquet(path)\n", " else:\n", " return f\"Unsupported file type: {ext}\"\n", "\n", " # Restricted namespaces\n", " local_namespace = {\"df\": df}\n", " global_namespace = {}\n", "\n", " # Capture stdout for print()\n", " old_stdout = sys.stdout\n", " sys.stdout = captured_output = StringIO()\n", "\n", " try:\n", " # Run code\n", " exec(query, global_namespace, local_namespace)\n", "\n", " # Capture output\n", " output = captured_output.getvalue()\n", " if not output:\n", " try:\n", " # Eval last expression if no print output\n", " lines = [line for line in query.strip().split(\"\\n\") if line.strip()]\n", " if lines:\n", " last_line_result = eval(lines[-1], global_namespace, local_namespace)\n", " output = str(last_line_result)\n", " else:\n", " output = \"Executed successfully, but no output was produced.\"\n", " except Exception:\n", " output = \"Executed successfully, but no output was produced.\"\n", "\n", " # Save back modified DataFrame if applicable\n", " if path and \"df\" in local_namespace and isinstance(local_namespace[\"df\"], pd.DataFrame):\n", " df = local_namespace[\"df\"]\n", " if ext == \".csv\":\n", " df.to_csv(path, index=False)\n", " elif ext in [\".xls\", \".xlsx\"]:\n", " df.to_excel(path, index=False)\n", " elif ext == \".parquet\":\n", " df.to_parquet(path, index=False)\n", "\n", " return output\n", "\n", " except Exception as e:\n", " return f\"Execution failed with error: {e}\"\n", "\n", " finally:\n", " sys.stdout = old_stdout\n" ] }, { "cell_type": "code", "execution_count": 31, "id": "e1bd23a0", "metadata": {}, "outputs": [], "source": [ "import os\n", "from dotenv import load_dotenv\n", "from langchain_google_genai import ChatGoogleGenerativeAI\n", "\n", "# Load environment variables\n", "load_dotenv()\n", "\n", "# Get API key from .env\n", "api_key = os.getenv(\"GOOGLE_API_KEY\")\n", "\n", "if not api_key:\n", " raise ValueError(\"GOOGLE_API_KEY not found. Make sure it's set in the .env file.\")\n", "\n", "# Initialize Gemini\n", "llm_model = ChatGoogleGenerativeAI(\n", " model=\"gemini-2.5-flash\",\n", " google_api_key=api_key\n", ")\n" ] }, { "cell_type": "code", "execution_count": 32, "id": "ad4d6962", "metadata": {}, "outputs": [], "source": [ "from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n", "system_prompt = ChatPromptTemplate.from_messages(\n", " [\n", " (\n", " \"system\",\n", " \"You are a Business report agent. You have access to a tool `eda_fact_sheet(df_path)` which generates a full fact sheet for a CSV/DataFrame. \"\n", " \"The csv is in this path : {df_path}. Use the tool first, then provide a comprehensive business report based on the dataset analysis.\"\n", " ),\n", " (\"placeholder\", \"{chat_history}\"),\n", " (\"human\", \"{input}\"),\n", " MessagesPlaceholder(variable_name=\"agent_scratchpad\") \n", " ]\n", ")" ] }, { "cell_type": "code", "execution_count": 33, "id": "55f12881", "metadata": {}, "outputs": [], "source": [ "from langchain.agents import AgentExecutor, create_tool_calling_agent" ] }, { "cell_type": "code", "execution_count": 34, "id": "4f037061", "metadata": {}, "outputs": [], "source": [ "Analyzer_agent = create_tool_calling_agent(\n", " llm=llm_model,\n", " tools=[eda_fact_sheet,python_repl_ast],\n", " prompt=system_prompt\n", " )\n", "\n", "tools = [eda_fact_sheet,python_repl_ast]" ] }, { "cell_type": "code", "execution_count": 35, "id": "c93fe635", "metadata": {}, "outputs": [], "source": [ "path = r\"D:\\Code Assistant\\avocado.csv\"" ] }, { "cell_type": "code", "execution_count": 36, "id": "e03bbfe1", "metadata": {}, "outputs": [], "source": [ "Instructions = \"Data is about sales,provide the data overview along with the preprocessing steps needed to perform EDA , here's the path = {path}\"" ] }, { "cell_type": "code", "execution_count": 37, "id": "cd286e6c", "metadata": {}, "outputs": [], "source": [ "Reporter_prompt = \"\"\"You are an elite business intelligence consultant, renowned for your ability to parachute into any business, analyze a raw dataset, and emerge with a strategic plan that creates significant value. Your client, a business owner, has given you a CSV file and a simple request: \"Find the single biggest opportunity hidden in this data and tell me exactly how to capitalize on it.\"\n", "\n", "Your mission: Perform a comprehensive, hypothesis-driven analysis of the provided data. You must autonomously identify the key metrics, dimensions, and relationships within the dataset to build a compelling business case study. Your analysis must culminate in a detailed report that explores the \"what,\" the \"why,\" and the \"how\" to drive business growth.\n", "\n", "Your Tools:\n", "1. `eda_fact_sheet(df_path)`: Use this once at the very beginning to perform initial data reconnaissance and understand the landscape of the dataset (columns, data types, etc.).\n", "2. `python_repl_ast`: This is your primary analytical tool. You will use it multiple times to explore hypotheses, segment the data, and quantify your findings.\n", "\n", "---\n", "\n", "### Your Mandated Investigative Framework:\n", "\n", "You are not being given specific questions. It is your job as an expert to formulate and answer them. Your investigation must follow this strategic framework:\n", "\n", "1. **Data Reconnaissance & Profiling:** Use `eda_fact_sheet` to get a high-level overview. What are the columns? What are the data types? Are there obvious missing values?\n", "\n", "2. **Hypothesis Generation & Deep-Dive Analysis:** After your initial recon, you must form hypotheses about the data and use `python_repl_ast` to test them. Your deep dive should be structured to uncover:\n", " - **Core Metric Identification:** What are the most important performance indicators (KPIs) in this dataset? (e.g., `Sales`, `Revenue`, `Profit`, `User_Activity`, `Conversion_Rate`). You must decide which columns represent the core business goals.\n", " - **Dimensional Analysis:** How do the core metrics perform across different categories or segments? You must identify the key dimensional columns (e.g., `Region`, `Product_Category`, `Customer_Segment`, `Store_Type`) and use them to slice and dice the data to find top and bottom performers.\n", " - **Temporal Analysis:** Are there trends over time? Look for date/time columns and analyze for growth, decline, seasonality, or other time-based patterns (e.g., year-over-year growth, peak hours, quarterly trends).\n", " - **Correlation & Relationship Analysis:** How do different numerical features interact? For example, does `Marketing_Spend` correlate with `Sales`? Does `Discount_Rate` affect `Profit_Margin`?\n", " - **Segmentation & Outlier Analysis:** Can you identify distinct groups or clusters in the data? Are there significant outliers that represent either a major problem or a unique opportunity?\n", "\n", "3. **Synthesize and Report:** After your investigation, you must weave all your quantitative findings into the mandatory report format below. Your narrative must connect the dots between your different findings to tell a single, coherent strategic story.\n", "\n", "---\n", "\n", "### Mandatory Report Format: Business Case Study & Strategic Plan\n", "\n", "**Subject: Strategic Deep-Dive: Unlocking Growth Opportunities in [Dataset Theme - *You determine this*]**\n", "\n", "**(1) Executive Summary:**\n", "*A brief, 3-4 sentence overview for a C-suite audience. State the overall condition of the business based on the data, the single most significant opportunity you have uncovered, and the high-level recommendation.*\n", "\n", "**(2) Current State of the Business: A Data-Driven Overview:**\n", "*Provide a clear overview of the macro trends, supported by the key metrics you identified.*\n", "- Present the overall trend of the 1-2 primary KPIs you identified (e.g., \"Overall sales have shown a 15% compound annual growth rate from 2020-2023\").\n", "- Briefly describe the key segments or dimensions you discovered in the data (e.g., \"The business operates across four primary regions and sells three distinct product categories\").\n", "\n", "**(3) The Core Insight: Identifying the Single Biggest Opportunity:**\n", "*This is the heart of your case study. Detail your main finding with specific, hard numbers from your analysis.*\n", "- Clearly identify the top-performing segment that represents the biggest opportunity (e.g., a specific product line, customer demographic, or geographic region).\n", "- Quantify the performance gap. State the exact difference in performance between the top and bottom segments (e.g., \"The 'Enterprise' customer segment generates a 300% higher lifetime value than the 'SMB' segment\").\n", "- Expose a potential Resource Allocation Mismatch. Based on the data, hypothesize if the company's resources (e.g., supply, marketing spend, personnel) are appropriately focused on this high-value opportunity.\n", "\n", "**(4) Supporting Analysis: Deeper Insights & Corroborating Evidence:**\n", "*Add layers to your story with other detailed findings that support your core insight.*\n", "- **Key Temporal Patterns:** Present any seasonal trends, growth patterns, or time-of-day effects that are relevant to the core opportunity (e.g., \"The high-value 'Enterprise' segment makes 60% of its purchases in the final quarter of the fiscal year\").\n", "- **Behavioral Analysis:** Describe any patterns in behavior associated with the core opportunity (e.g., \"These top customers overwhelmingly purchase 'Service Plan A' but rarely buy 'Service Plan B', suggesting an opportunity for cross-selling\").\n", "\n", "**(5) The Go-Forward Strategic Plan: A Phased Approach**\n", "*Provide a detailed, multi-step action plan based on your findings.*\n", "\n", "**Phase 1: Immediate Realignment & Proof of Concept (Next 30 Days)**\n", "1. **Strategic Focus Shift:** Detail an immediate action to pivot towards the identified opportunity (e.g., \"Reroute 20% of the digital marketing budget from the underperforming 'SMB' campaigns to a targeted pilot campaign for the 'Enterprise' segment\").\n", "2. **Initial Test & Measurement:** Specify a small-scale test to validate your hypothesis (e.g., \"Offer a 10% discount on 'Service Plan B' to a test group of 100 'Enterprise' customers to measure uplift in adoption\").\n", "\n", "**Phase 2: Scale & Market Capture (Next Quarter)**\n", "1. **Full-Scale Rollout:** Based on the results of Phase 1, describe a broader initiative (e.g., \"Launch a company-wide marketing and sales initiative focused on the 'Enterprise' segment, timed to align with the peak Q4 purchasing window\").\n", "2. **Operational Adjustments:** Recommend changes to business operations based on your behavioral analysis (e.g., \"Adjust inventory or staffing levels to prepare for the predictable Q4 demand surge\").\n", "\n", "**Metrics for Success:**\n", "- We will measure the success of this plan by tracking the following KPIs:\n", " - Increase in the primary KPI (e.g., revenue, profit) from the target segment.\n", " - Improvement in a key efficiency metric (e.g., Customer Acquisition Cost, Return on Ad Spend).\n", " - Change in a key behavioral metric (e.g., cross-sell adoption rate, customer churn).\n", "\n", "---\n", "\n", "### **CRITICAL DIRECTIVES:**\n", "- **AUTONOMOUS ANALYSIS IS KEY:** You are not given a list of questions. It is your job as an expert consultant to analyze the dataset, determine the most important business questions, and answer them.\n", "- **DEPTH IS MANDATORY:** A shallow report is a failed report. Your conclusions must be supported by specific numbers, percentages, and trends you discover using your tools.\n", "- **CONNECT THE DOTS:** Do not just list disconnected facts. Your report must tell a compelling and coherent story, where each piece of analysis builds on the last to support your final recommendation.\n", "- **STRATEGIC, NOT JUST ACTIONABLE:** Your plan must have logical phases and clear metrics for success, demonstrating a thoughtful, long-term approach to creating business value.\"\"\"" ] }, { "cell_type": "code", "execution_count": 38, "id": "37b3ec41", "metadata": {}, "outputs": [], "source": [ "# Reporter_prompt = \"\"\"You are a top-tier business strategy consultant. Your client, the business owner, has tasked you with a deep-dive analysis of their sales data. They need a comprehensive business case study that not only identifies the single biggest opportunity but also provides the rich context, supporting evidence, and a detailed, phased action plan to capitalize on it.\n", "\n", "# Your mission: Transform the provided CSV data into a compelling, multi-faceted case study. You must use your tools to build a complete narrative, exploring the \"what,\" the \"why,\" and the \"how\" to drive significant business growth.\n", "\n", "# Your Tools:\n", "# 1. `eda_fact_sheet(df_path)`: Use once at the start for a foundational understanding.\n", "# 2. `python_repl_ast`: This is your primary investigation tool. You are expected to use it multiple times to build a rich, interconnected story.\n", "\n", "# ---\n", "\n", "# ### Your Mandated Workflow:\n", "\n", "# 1. **Initial Recon:** Use `eda_fact_sheet` to map out the dataset.\n", "# 2. **Multi-Angle Investigation:** Your investigation must be thorough. Use `python_repl_ast` to explore multiple angles to build your case study. You MUST answer layered questions to get the full picture:\n", "# - **Overall Market Trends:** What are the year-over-year trends for `Total Volume` and `AveragePrice` for both conventional and organic types? Is the market growing?\n", "# - **Profitability Hotspots:** Which 5-10 specific regions have the highest *and* lowest `AveragePrice` for organic avocados? What is the exact price difference?\n", "# - **Supply & Demand Mismatch:** In your most profitable \"hotspot\" regions, what is their share of the `Total Volume`? Are we undersupplying our best markets?\n", "# - **Consumer Behavior:** In these top-tier regions, what are the sales trends for `Small Bags` vs. `Large Bags`? Does this tell us anything about the customer?\n", "# - **Seasonality:** Which months or quarters show the highest `AveragePrice` for organic avocados in these premium regions? Can we leverage this seasonality?\n", "# 3. **Synthesize and Report:** Weave your findings into the mandatory \"Business Case Study\" format below. You must connect the dots between the different data points to tell a coherent story.\n", "\n", "# ---\n", "\n", "# ### Mandatory Report Format: Business Case Study & Strategic Plan\n", "\n", "# **Subject: Deep-Dive Case Study: Unlocking the Premium Organic Avocado Market**\n", "\n", "# **(1) Executive Summary:**\n", "# *A brief, 3-4 sentence overview. State the overall market condition, the primary opportunity you've identified, and the high-level recommendation.*\n", "# (Example: \"While our overall avocado sales are growing, we are operating with a one-size-fits-all strategy in a market that has distinct premium segments. This case study reveals a significant, untapped opportunity in specific high-end urban markets for organic avocados. Our go-forward plan outlines a phased approach to realign supply, optimize pricing, and capture this overlooked profit.\")\n", "\n", "# **(2) The Overall Market Landscape (2015-2018):**\n", "# *Provide a brief overview of the big picture. What are the macro trends?*\n", "# - Present the year-over-year growth in `Total Volume` for both conventional and organic avocados.\n", "# - Show the trend in `AveragePrice` for both types over the four years, noting any significant spikes or dips.\n", "\n", "# **(3) The Core Insight: The Anatomy of a Premium Market**\n", "# *This is the heart of your case study. Detail your main finding with specific, hard numbers.*\n", "# - Identify the top 3-5 \"Premium Regions\" that show the highest `AveragePrice` for organic avocados. List the regions and their prices.\n", "# - Contrast this with the bottom 3-5 \"Economy Regions\" with the lowest `AveragePrice` for conventional avocados.\n", "# - State the exact dollar and percentage difference in `AveragePrice` between the top premium region and the bottom economy region.\n", "# - Crucially, expose the supply/demand mismatch: state the percentage of `Total Volume` these top premium regions currently receive versus their price potential.\n", "\n", "# **(4) Supporting Analysis: Deeper Insights into the Premium Customer**\n", "# *Add layers to your story with more detailed findings.*\n", "# - **Seasonal Price Peaks:** Identify the specific months or quarters when the `AveragePrice` in your premium regions is at its highest.\n", "# - **Purchasing Habits:** Analyze the ratio of `Small Bags` to `Large Bags` sold in the premium regions. Contrast this with the ratio in economy regions to make an inference about the customer (e.g., \"Premium customers prefer smaller, more frequent purchases\").\n", "\n", "# **(5) The Go-Forward Strategic Plan: A Phased Approach**\n", "# *Provide a detailed, multi-step action plan. This must be more than just a list; it should be a strategy.*\n", "\n", "# **Phase 1: Immediate Realignment & Profit Capture (Next 30 Days)**\n", "# 1. **Supply Chain Pivot:** Detail the exact percentage of organic supply to be rerouted from specific low-margin regions to the identified high-margin \"Premium Regions.\"\n", "# 2. **Initial Price Adjustment:** Specify the initial percentage price increase for organic avocados in the \"Premium Regions\" to test the market's price elasticity.\n", "\n", "# **Phase 2: Growth & Market Domination (Next Quarter)**\n", "# 1. **Seasonal Strategy:** Launch a marketing campaign in the \"Premium Regions\" just before the identified seasonal price peak, emphasizing scarcity and quality to justify premium pricing.\n", "# 2. **Product Mix Optimization:** Based on the bag size analysis, adjust the inventory mix in premium vs. economy regions to match local consumer purchasing habits.\n", "\n", "# **Metrics for Success:**\n", "# - We will measure the success of this plan by tracking the following KPIs over the next quarter:\n", "# - Increase in average profit margin per unit in \"Premium Regions.\"\n", "# - Overall revenue growth from the organic category.\n", "# - Change in sales volume in \"Premium Regions\" following the price adjustment.\n", "\n", "# ---\n", "\n", "# ### **CRITICAL DIRECTIVES:**\n", "# - **DEPTH IS MANDATORY:** A shallow report is a failed report. You must use `python_repl_ast` to find specific regions, numbers, percentages, and seasonal trends.\n", "# - **CONNECT THE DOTS:** Do not just list facts. Explain how regional price differences, supply volumes, and consumer bag preferences all tell the same story.\n", "# - **STRATEGIC, NOT JUST ACTIONABLE:** Your plan should have phases and clear metrics for success, showing a thoughtful, long-term approach.\"\"\"" ] }, { "cell_type": "code", "execution_count": 39, "id": "377aeec9", "metadata": {}, "outputs": [], "source": [ "df_path = r\"D:\\Code Assistant\\avocado.csv\"\n", "\n", "task_prompt = (\n", " f\"Find the instructions given by the user here : {Instructions} and follow this {Reporter_prompt} to the letter.\"\n", " )" ] }, { "cell_type": "code", "execution_count": 40, "id": "e471cf77", "metadata": {}, "outputs": [], "source": [ "from langchain.agents import AgentExecutor, create_tool_calling_agent\n", "\n", "agent_executor = AgentExecutor(\n", " agent=Analyzer_agent,\n", " tools=tools,\n", " verbose=True,\n", " handle_parsing_errors=True,\n", " return_intermediate_steps=True\n", " )" ] }, { "cell_type": "code", "execution_count": 41, "id": "bb1b031d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n", "\u001b[32;1m\u001b[1;3m\n", "Invoking: `eda_fact_sheet` with `{'path': 'D:\\\\Code Assistant\\\\avocado.csv'}`\n", "\n", "\n", "\u001b[0m\u001b[36;1m\u001b[1;3m{'n_rows': 18249, 'n_columns': 13, 'batches': [{'columns': {'Date': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.93, 'examples': ['2018-02-11', '2015-11-01', '2016-01-24'], 'top_values': {'2015-12-27': 108, '2015-12-20': 108, '2015-12-13': 108}, 'avg_length': 10.0, 'max_length': 10, 'unusual_char_percent': 100.0}, 'AveragePrice': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 1.42, 'examples': [0.82, 0.97, 1.44], 'top_values': {'1.15': 202, '1.18': 199, '1.08': 194}, 'min': 0.44, 'max': 3.25, 'mean': 1.405978409775878, 'std': 0.40267655549555065, 'has_outliers': True}, 'Total Volume': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.93, 'examples': [218329.7, 391514.43, 3907.4], 'top_values': {'569349.05': 2, '13234.04': 2, '2858.31': 2}, 'min': 84.56, 'max': 62505646.52, 'mean': 850644.013008932, 'std': 3453545.3553994712, 'has_outliers': True}, '4046': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 97.0, 'examples': [79239.58, 228324.13, 1147.09], 'top_values': {'0.0': 242, '3.0': 10, '1.24': 8}, 'min': 0.0, 'max': 22743616.17, 'mean': 293008.4245306592, 'std': 1264989.0817627774, 'has_outliers': True}, '4225': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.2, 'examples': [26356.05, 72749.38, 39.51], 'top_values': {'0.0': 61, '1.26': 3, '1.3': 3}, 'min': 0.0, 'max': 20470572.61, 'mean': 295154.56835607433, 'std': 1204120.4011350505, 'has_outliers': True}, '4770': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 66.15, 'examples': [2998.74, 100.62, 0.0], 'top_values': {'0.0': 5497, '3.32': 7, '2.66': 7}, 'min': 0.0, 'max': 2546439.11, 'mean': 22839.735992657133, 'std': 107464.06843537072, 'has_outliers': True}, 'Total Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.17, 'examples': [109735.33, 90340.3, 2720.8], 'top_values': {'0.0': 15, '300.0': 5, '990.0': 5}, 'min': 0.0, 'max': 19373134.37, 'mean': 239639.20205983886, 'std': 986242.3992164118, 'has_outliers': True}, 'Small Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 94.91, 'examples': [73393.34, 44449.47, 2717.47], 'top_values': {'0.0': 159, '203.33': 11, '533.33': 10}, 'min': 0.0, 'max': 13384586.8, 'mean': 182194.68669570936, 'std': 746178.5149617889, 'has_outliers': True}, 'Large Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 82.65, 'examples': [34338.97, 45890.83, 3.33], 'top_values': {'0.0': 2370, '3.33': 187, '6.67': 78}, 'min': 0.0, 'max': 5719096.61, 'mean': 54338.08814455586, 'std': 243965.96454740883, 'has_outliers': True}, 'XLarge Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 30.62, 'examples': [2003.02, 0.0, 0.0], 'top_values': {'0.0': 12048, '3.33': 29, '6.67': 16}, 'min': 0.0, 'max': 551693.65, 'mean': 3106.4265072058743, 'std': 17692.894651916486, 'has_outliers': True}, 'type': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.01, 'examples': ['conventional', 'conventional', 'organic'], 'top_values': {'conventional': 9126, 'organic': 9123}, 'avg_length': 9.50041098142364, 'max_length': 12, 'unusual_char_percent': 0.0}, 'year': {'dtype': 'int64', 'null_percent': 0.0, 'unique_percent': 0.02, 'examples': [2018, 2015, 2016], 'top_values': {'2017': 5722, '2016': 5616, '2015': 5615}, 'min': 2015.0, 'max': 2018.0, 'mean': 2016.1478985149872, 'std': 0.9399384671405985, 'has_outliers': False}, 'region': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.3, 'examples': ['Columbus', 'Tampa', 'Tampa'], 'top_values': {'Albany': 338, 'Atlanta': 338, 'BaltimoreWashington': 338}, 'avg_length': 10.29535864978903, 'max_length': 19, 'unusual_char_percent': 0.0}}}]}\u001b[0m\u001b[32;1m\u001b[1;3mSubject: Strategic Deep-Dive: Unlocking Growth Opportunities in the Avocado Market\n", "\n", "(1) Executive Summary:\n", "The avocado market is characterized by significant volume and price fluctuations, with distinct regional and temporal performance. Our analysis reveals a substantial opportunity in optimizing sales strategies for conventional avocados, particularly in high-volume regions. By focusing on increasing average prices in these key conventional markets, the business can significantly boost overall revenue and profitability.\n", "\n", "(2) Current State of the Business: A Data-Driven Overview:\n", "Overall avocado sales, as represented by \"Total Volume,\" are substantial, averaging around 850,644 units per recorded instance. The market is split between \"conventional\" and \"organic\" types, with a roughly even distribution. \"AveragePrice\" fluctuates significantly, ranging from $0.44 to $3.25, with an average of $1.41. The business operates across numerous regions, with Albany, Atlanta, and BaltimoreWashington appearing as top regions in terms of occurrences in the dataset.\n", "\n", "(3) The Core Insight: Identifying the Single Biggest Opportunity:\n", "The single biggest opportunity lies in increasing the \"AveragePrice\" of **conventional avocados** in regions that exhibit high \"Total Volume\" but below-average \"AveragePrice.\" While organic avocados generally command a higher average price, conventional avocados represent a significantly larger portion of the total volume. A small increase in the average price of conventional avocados across high-volume regions could lead to a substantial revenue increase due to the sheer scale of sales.\n", "\n", "Let's quantify this. We need to identify regions with high conventional avocado volume and then analyze their average prices. If a high-volume conventional region has an average price below the overall average of conventional avocados, it presents a prime opportunity for price optimization. The performance gap could be significant; for example, if a region sells millions of conventional avocados at $1.20 when the average is $1.40, a $0.20 increase per unit would yield a massive revenue uplift. We hypothesize that resources might be misallocated, with an overemphasis on simply moving volume in some conventional markets rather than optimizing for price and profitability.\n", "\n", "(4) Supporting Analysis: Deeper Insights & Corroborating Evidence:\n", "**Key Temporal Patterns:** The `Date` column indicates data from 2015 to 2018. We can expect seasonal fluctuations in both volume and price. For example, holidays or specific seasons might see higher demand, allowing for price increases. We need to investigate if there are specific times of the year when conventional avocado prices are consistently lower in high-volume regions, indicating a missed opportunity.\n", "\n", "**Behavioral Analysis:** We can infer that consumers in certain regions are more price-sensitive or that competition is higher, driving down prices. However, if the \"Total Volume\" is consistently high, it suggests strong underlying demand. The business should investigate if these high-volume, lower-price conventional markets are characterized by bulk purchases or specific distribution channels that could be leveraged for strategic price adjustments without significantly impacting demand.\n", "\n", "(5) The Go-Forward Strategic Plan: A Phased Approach\n", "\n", "**Phase 1: Immediate Realignment & Proof of Concept (Next 30 Days)**\n", "1. **Strategic Focus Shift:** Identify the top 5 regions with the highest \"Total Volume\" for conventional avocados and an \"AveragePrice\" below the overall conventional avocado average. Reroute a portion of marketing and sales efforts from underperforming organic avocado promotions (if any) to these identified conventional regions, focusing on value proposition messaging rather than just price competition.\n", "2. **Initial Test & Measurement:** Implement a targeted, small-scale price increase (e.g., 5-10 cents per avocado) in two of the identified high-volume, lower-price conventional regions for a two-week period. Monitor sales volume and average price meticulously to assess the impact on revenue and demand elasticity.\n", "\n", "**Phase 2: Scale & Market Capture (Next Quarter)**\n", "1. **Full-Scale Rollout:** Based on the successful outcome of Phase 1, roll out the optimized pricing strategy across all identified high-volume, lower-price conventional regions. This rollout should be accompanied by a regional marketing campaign highlighting the quality and versatility of conventional avocados to support the new price point.\n", "2. **Operational Adjustments:** Collaborate with supply chain and distribution teams to ensure consistent availability and quality of conventional avocados in these targeted regions to support increased demand at the optimized price. Explore opportunities for cost efficiencies in these regions to further boost profit margins.\n", "\n", "**Metrics for Success:**\n", "- We will measure the success of this plan by tracking the following KPIs:\n", " - Increase in \"AveragePrice\" for conventional avocados in the targeted regions by X% (e.g., 5-10%).\n", " - Overall revenue growth from conventional avocado sales in these regions, aiming for a Y% increase (e.g., 10-15%).\n", " - Maintenance of \"Total Volume\" in targeted regions (no significant drop in sales despite price increase), or a slight, acceptable decrease offset by higher profitability.\u001b[0m\n", "\n", "\u001b[1m> Finished chain.\u001b[0m\n" ] } ], "source": [ "result = agent_executor.invoke({\n", " \"input\": task_prompt,\n", " \"df_path\": df_path,\n", " \"chat_history\": [] \n", " })" ] }, { "cell_type": "code", "execution_count": null, "id": "59691d1b", "metadata": {}, "outputs": [], "source": [ " " ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.7" } }, "nbformat": 4, "nbformat_minor": 5 }