Spaces:
Configuration error
Configuration error
File size: 39,254 Bytes
8437d61 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 | {
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "0708537e",
"metadata": {},
"outputs": [],
"source": [
"from langchain_core.tools import tool\n",
"from dotenv import load_dotenv\n",
"import sys,os\n",
"from io import StringIO\n",
"from pydantic import BaseModel, Field\n",
"import pandas as pd\n",
"from typing import Optional\n"
]
},
{
"cell_type": "code",
"execution_count": 28,
"id": "4c5a0c5a",
"metadata": {},
"outputs": [],
"source": [
"from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle # <-- ADD THIS IMPORT\n",
"from reportlab.platypus import Paragraph, Frame # <-- ADD FRAME IMPORT\n"
]
},
{
"cell_type": "code",
"execution_count": 29,
"id": "85b4fabd",
"metadata": {},
"outputs": [],
"source": [
"@tool\n",
"def eda_fact_sheet(path:str,sample_size: int = 3, freq_top: int = 3):\n",
" \"\"\"\n",
" Generates a batchwise fact sheet for a dataset.\n",
"\n",
" Args:\n",
" path (str): Path to CSV or Excel dataset.\n",
" sample_size (int): Number of example values per column.\n",
" freq_top (int): Number of top frequent values per column.\n",
"\n",
" Returns:\n",
" dict: Fact sheet containing dataset-level info and column-level stats in batches.\n",
" \"\"\"\n",
" path = path.strip().strip('\"').strip(\"'\")\n",
" path = os.path.normpath(path)\n",
"\n",
" #--- Load dataset ---\n",
" if path.endswith(\".csv\"):\n",
" df = pd.read_csv(path)\n",
" elif path.endswith(\".xlsx\") or path.endswith(\".xls\"):\n",
" df = pd.read_excel(path)\n",
" else:\n",
" raise ValueError(\"Unsupported file format. Use CSV or Excel.\")\n",
"\n",
" df = pd.read_csv(path)\n",
"\n",
" n_rows, n_columns = df.shape\n",
"\n",
" fact_sheet = {\n",
" \"n_rows\": n_rows,\n",
" \"n_columns\": n_columns,\n",
" \"batches\": []\n",
" }\n",
"\n",
" # --- Determine batch splits ---\n",
" batch_size = 15\n",
" batch_splits = [df.columns[i:i+batch_size].tolist() for i in range(0, n_columns, batch_size)]\n",
"\n",
" # --- Process each batch ---\n",
" for batch_cols in batch_splits:\n",
" batch_profile = {\"columns\": {}}\n",
" for col in batch_cols:\n",
" series = df[col]\n",
" col_profile = {}\n",
"\n",
" total = len(series)\n",
"\n",
" # Core stats\n",
" col_profile[\"dtype\"] = str(series.dtype)\n",
" col_profile[\"null_percent\"] = round(float(series.isna().sum() / total * 100), 2)\n",
" col_profile[\"unique_percent\"] = round(float(series.nunique(dropna=True) / total * 100), 2)\n",
"\n",
" # Example values\n",
" try:\n",
" col_profile[\"examples\"] = series.dropna().sample(\n",
" min(sample_size, series.dropna().shape[0]),\n",
" random_state=42\n",
" ).tolist()\n",
" except ValueError:\n",
" col_profile[\"examples\"] = []\n",
"\n",
" # Top frequent values\n",
" if not series.isna().all():\n",
" top_freq = series.value_counts(dropna=True).head(freq_top)\n",
" col_profile[\"top_values\"] = {str(k): int(v) for k, v in top_freq.to_dict().items()}\n",
"\n",
" # Numeric columns\n",
" if pd.api.types.is_numeric_dtype(series):\n",
" col_profile.update({\n",
" \"min\": float(series.min(skipna=True)),\n",
" \"max\": float(series.max(skipna=True)),\n",
" \"mean\": float(series.mean(skipna=True)),\n",
" \"std\": float(series.std(skipna=True))\n",
" })\n",
" if series.std(skipna=True) > 0:\n",
" z_scores = ((series - series.mean(skipna=True)) / series.std(skipna=True)).abs()\n",
" col_profile[\"has_outliers\"] = bool((z_scores > 3).any())\n",
" else:\n",
" col_profile[\"has_outliers\"] = False\n",
"\n",
" # Datetime columns\n",
" elif pd.api.types.is_datetime64_any_dtype(series):\n",
" if not series.dropna().empty:\n",
" col_profile[\"min_date\"] = str(series.min())\n",
" col_profile[\"max_date\"] = str(series.max())\n",
"\n",
" # Text/categorical columns\n",
" elif pd.api.types.is_object_dtype(series):\n",
" lengths = series.dropna().astype(str).map(len)\n",
" if not lengths.empty:\n",
" col_profile[\"avg_length\"] = float(lengths.mean())\n",
" col_profile[\"max_length\"] = int(lengths.max())\n",
" unusual = series.dropna().astype(str).str.contains(r\"[^a-zA-Z0-9\\s]\").sum()\n",
" col_profile[\"unusual_char_percent\"] = round(float(unusual / total * 100), 2)\n",
"\n",
" # Flags\n",
" if series.nunique(dropna=True) == total:\n",
" col_profile[\"is_identifier\"] = True\n",
" elif series.nunique(dropna=True) <= 1:\n",
" col_profile[\"is_constant\"] = True\n",
"\n",
" batch_profile[\"columns\"][col] = col_profile\n",
"\n",
" fact_sheet[\"batches\"].append(batch_profile)\n",
"\n",
" return fact_sheet\n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"id": "500733d4",
"metadata": {},
"outputs": [],
"source": [
"class PythonInputs(BaseModel):\n",
" query: str = Field(description=\"A valid python command to run.\")\n",
"\n",
"load_dotenv()\n",
"\n",
"@tool(args_schema=PythonInputs)\n",
"def python_repl_ast(query: str, path: Optional[str] = None) -> str:\n",
" \"\"\"\n",
" Runs a Python command and returns the result.\n",
" If `path` is provided, the dataframe at that path will be loaded as `df`.\n",
" Supports CSV, Excel, and Parquet. \n",
" Any in-place modifications to `df` are saved back to the same file.\n",
" \"\"\"\n",
"\n",
" df = None\n",
" if path:\n",
" ext = os.path.splitext(path)[-1].lower()\n",
" if ext == \".csv\":\n",
" df = pd.read_csv(path)\n",
" elif ext in [\".xls\", \".xlsx\"]:\n",
" df = pd.read_excel(path)\n",
" elif ext == \".parquet\":\n",
" df = pd.read_parquet(path)\n",
" else:\n",
" return f\"Unsupported file type: {ext}\"\n",
"\n",
" # Restricted namespaces\n",
" local_namespace = {\"df\": df}\n",
" global_namespace = {}\n",
"\n",
" # Capture stdout for print()\n",
" old_stdout = sys.stdout\n",
" sys.stdout = captured_output = StringIO()\n",
"\n",
" try:\n",
" # Run code\n",
" exec(query, global_namespace, local_namespace)\n",
"\n",
" # Capture output\n",
" output = captured_output.getvalue()\n",
" if not output:\n",
" try:\n",
" # Eval last expression if no print output\n",
" lines = [line for line in query.strip().split(\"\\n\") if line.strip()]\n",
" if lines:\n",
" last_line_result = eval(lines[-1], global_namespace, local_namespace)\n",
" output = str(last_line_result)\n",
" else:\n",
" output = \"Executed successfully, but no output was produced.\"\n",
" except Exception:\n",
" output = \"Executed successfully, but no output was produced.\"\n",
"\n",
" # Save back modified DataFrame if applicable\n",
" if path and \"df\" in local_namespace and isinstance(local_namespace[\"df\"], pd.DataFrame):\n",
" df = local_namespace[\"df\"]\n",
" if ext == \".csv\":\n",
" df.to_csv(path, index=False)\n",
" elif ext in [\".xls\", \".xlsx\"]:\n",
" df.to_excel(path, index=False)\n",
" elif ext == \".parquet\":\n",
" df.to_parquet(path, index=False)\n",
"\n",
" return output\n",
"\n",
" except Exception as e:\n",
" return f\"Execution failed with error: {e}\"\n",
"\n",
" finally:\n",
" sys.stdout = old_stdout\n"
]
},
{
"cell_type": "code",
"execution_count": 31,
"id": "e1bd23a0",
"metadata": {},
"outputs": [],
"source": [
"import os\n",
"from dotenv import load_dotenv\n",
"from langchain_google_genai import ChatGoogleGenerativeAI\n",
"\n",
"# Load environment variables\n",
"load_dotenv()\n",
"\n",
"# Get API key from .env\n",
"api_key = os.getenv(\"GOOGLE_API_KEY\")\n",
"\n",
"if not api_key:\n",
" raise ValueError(\"GOOGLE_API_KEY not found. Make sure it's set in the .env file.\")\n",
"\n",
"# Initialize Gemini\n",
"llm_model = ChatGoogleGenerativeAI(\n",
" model=\"gemini-2.5-flash\",\n",
" google_api_key=api_key\n",
")\n"
]
},
{
"cell_type": "code",
"execution_count": 32,
"id": "ad4d6962",
"metadata": {},
"outputs": [],
"source": [
"from langchain.prompts import ChatPromptTemplate, MessagesPlaceholder\n",
"system_prompt = ChatPromptTemplate.from_messages(\n",
" [\n",
" (\n",
" \"system\",\n",
" \"You are a Business report agent. You have access to a tool `eda_fact_sheet(df_path)` which generates a full fact sheet for a CSV/DataFrame. \"\n",
" \"The csv is in this path : {df_path}. Use the tool first, then provide a comprehensive business report based on the dataset analysis.\"\n",
" ),\n",
" (\"placeholder\", \"{chat_history}\"),\n",
" (\"human\", \"{input}\"),\n",
" MessagesPlaceholder(variable_name=\"agent_scratchpad\") \n",
" ]\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 33,
"id": "55f12881",
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import AgentExecutor, create_tool_calling_agent"
]
},
{
"cell_type": "code",
"execution_count": 34,
"id": "4f037061",
"metadata": {},
"outputs": [],
"source": [
"Analyzer_agent = create_tool_calling_agent(\n",
" llm=llm_model,\n",
" tools=[eda_fact_sheet,python_repl_ast],\n",
" prompt=system_prompt\n",
" )\n",
"\n",
"tools = [eda_fact_sheet,python_repl_ast]"
]
},
{
"cell_type": "code",
"execution_count": 35,
"id": "c93fe635",
"metadata": {},
"outputs": [],
"source": [
"path = r\"D:\\Code Assistant\\avocado.csv\""
]
},
{
"cell_type": "code",
"execution_count": 36,
"id": "e03bbfe1",
"metadata": {},
"outputs": [],
"source": [
"Instructions = \"Data is about sales,provide the data overview along with the preprocessing steps needed to perform EDA , here's the path = {path}\""
]
},
{
"cell_type": "code",
"execution_count": 37,
"id": "cd286e6c",
"metadata": {},
"outputs": [],
"source": [
"Reporter_prompt = \"\"\"You are an elite business intelligence consultant, renowned for your ability to parachute into any business, analyze a raw dataset, and emerge with a strategic plan that creates significant value. Your client, a business owner, has given you a CSV file and a simple request: \"Find the single biggest opportunity hidden in this data and tell me exactly how to capitalize on it.\"\n",
"\n",
"Your mission: Perform a comprehensive, hypothesis-driven analysis of the provided data. You must autonomously identify the key metrics, dimensions, and relationships within the dataset to build a compelling business case study. Your analysis must culminate in a detailed report that explores the \"what,\" the \"why,\" and the \"how\" to drive business growth.\n",
"\n",
"Your Tools:\n",
"1. `eda_fact_sheet(df_path)`: Use this once at the very beginning to perform initial data reconnaissance and understand the landscape of the dataset (columns, data types, etc.).\n",
"2. `python_repl_ast`: This is your primary analytical tool. You will use it multiple times to explore hypotheses, segment the data, and quantify your findings.\n",
"\n",
"---\n",
"\n",
"### Your Mandated Investigative Framework:\n",
"\n",
"You are not being given specific questions. It is your job as an expert to formulate and answer them. Your investigation must follow this strategic framework:\n",
"\n",
"1. **Data Reconnaissance & Profiling:** Use `eda_fact_sheet` to get a high-level overview. What are the columns? What are the data types? Are there obvious missing values?\n",
"\n",
"2. **Hypothesis Generation & Deep-Dive Analysis:** After your initial recon, you must form hypotheses about the data and use `python_repl_ast` to test them. Your deep dive should be structured to uncover:\n",
" - **Core Metric Identification:** What are the most important performance indicators (KPIs) in this dataset? (e.g., `Sales`, `Revenue`, `Profit`, `User_Activity`, `Conversion_Rate`). You must decide which columns represent the core business goals.\n",
" - **Dimensional Analysis:** How do the core metrics perform across different categories or segments? You must identify the key dimensional columns (e.g., `Region`, `Product_Category`, `Customer_Segment`, `Store_Type`) and use them to slice and dice the data to find top and bottom performers.\n",
" - **Temporal Analysis:** Are there trends over time? Look for date/time columns and analyze for growth, decline, seasonality, or other time-based patterns (e.g., year-over-year growth, peak hours, quarterly trends).\n",
" - **Correlation & Relationship Analysis:** How do different numerical features interact? For example, does `Marketing_Spend` correlate with `Sales`? Does `Discount_Rate` affect `Profit_Margin`?\n",
" - **Segmentation & Outlier Analysis:** Can you identify distinct groups or clusters in the data? Are there significant outliers that represent either a major problem or a unique opportunity?\n",
"\n",
"3. **Synthesize and Report:** After your investigation, you must weave all your quantitative findings into the mandatory report format below. Your narrative must connect the dots between your different findings to tell a single, coherent strategic story.\n",
"\n",
"---\n",
"\n",
"### Mandatory Report Format: Business Case Study & Strategic Plan\n",
"\n",
"**Subject: Strategic Deep-Dive: Unlocking Growth Opportunities in [Dataset Theme - *You determine this*]**\n",
"\n",
"**(1) Executive Summary:**\n",
"*A brief, 3-4 sentence overview for a C-suite audience. State the overall condition of the business based on the data, the single most significant opportunity you have uncovered, and the high-level recommendation.*\n",
"\n",
"**(2) Current State of the Business: A Data-Driven Overview:**\n",
"*Provide a clear overview of the macro trends, supported by the key metrics you identified.*\n",
"- Present the overall trend of the 1-2 primary KPIs you identified (e.g., \"Overall sales have shown a 15% compound annual growth rate from 2020-2023\").\n",
"- Briefly describe the key segments or dimensions you discovered in the data (e.g., \"The business operates across four primary regions and sells three distinct product categories\").\n",
"\n",
"**(3) The Core Insight: Identifying the Single Biggest Opportunity:**\n",
"*This is the heart of your case study. Detail your main finding with specific, hard numbers from your analysis.*\n",
"- Clearly identify the top-performing segment that represents the biggest opportunity (e.g., a specific product line, customer demographic, or geographic region).\n",
"- Quantify the performance gap. State the exact difference in performance between the top and bottom segments (e.g., \"The 'Enterprise' customer segment generates a 300% higher lifetime value than the 'SMB' segment\").\n",
"- Expose a potential Resource Allocation Mismatch. Based on the data, hypothesize if the company's resources (e.g., supply, marketing spend, personnel) are appropriately focused on this high-value opportunity.\n",
"\n",
"**(4) Supporting Analysis: Deeper Insights & Corroborating Evidence:**\n",
"*Add layers to your story with other detailed findings that support your core insight.*\n",
"- **Key Temporal Patterns:** Present any seasonal trends, growth patterns, or time-of-day effects that are relevant to the core opportunity (e.g., \"The high-value 'Enterprise' segment makes 60% of its purchases in the final quarter of the fiscal year\").\n",
"- **Behavioral Analysis:** Describe any patterns in behavior associated with the core opportunity (e.g., \"These top customers overwhelmingly purchase 'Service Plan A' but rarely buy 'Service Plan B', suggesting an opportunity for cross-selling\").\n",
"\n",
"**(5) The Go-Forward Strategic Plan: A Phased Approach**\n",
"*Provide a detailed, multi-step action plan based on your findings.*\n",
"\n",
"**Phase 1: Immediate Realignment & Proof of Concept (Next 30 Days)**\n",
"1. **Strategic Focus Shift:** Detail an immediate action to pivot towards the identified opportunity (e.g., \"Reroute 20% of the digital marketing budget from the underperforming 'SMB' campaigns to a targeted pilot campaign for the 'Enterprise' segment\").\n",
"2. **Initial Test & Measurement:** Specify a small-scale test to validate your hypothesis (e.g., \"Offer a 10% discount on 'Service Plan B' to a test group of 100 'Enterprise' customers to measure uplift in adoption\").\n",
"\n",
"**Phase 2: Scale & Market Capture (Next Quarter)**\n",
"1. **Full-Scale Rollout:** Based on the results of Phase 1, describe a broader initiative (e.g., \"Launch a company-wide marketing and sales initiative focused on the 'Enterprise' segment, timed to align with the peak Q4 purchasing window\").\n",
"2. **Operational Adjustments:** Recommend changes to business operations based on your behavioral analysis (e.g., \"Adjust inventory or staffing levels to prepare for the predictable Q4 demand surge\").\n",
"\n",
"**Metrics for Success:**\n",
"- We will measure the success of this plan by tracking the following KPIs:\n",
" - Increase in the primary KPI (e.g., revenue, profit) from the target segment.\n",
" - Improvement in a key efficiency metric (e.g., Customer Acquisition Cost, Return on Ad Spend).\n",
" - Change in a key behavioral metric (e.g., cross-sell adoption rate, customer churn).\n",
"\n",
"---\n",
"\n",
"### **CRITICAL DIRECTIVES:**\n",
"- **AUTONOMOUS ANALYSIS IS KEY:** You are not given a list of questions. It is your job as an expert consultant to analyze the dataset, determine the most important business questions, and answer them.\n",
"- **DEPTH IS MANDATORY:** A shallow report is a failed report. Your conclusions must be supported by specific numbers, percentages, and trends you discover using your tools.\n",
"- **CONNECT THE DOTS:** Do not just list disconnected facts. Your report must tell a compelling and coherent story, where each piece of analysis builds on the last to support your final recommendation.\n",
"- **STRATEGIC, NOT JUST ACTIONABLE:** Your plan must have logical phases and clear metrics for success, demonstrating a thoughtful, long-term approach to creating business value.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 38,
"id": "37b3ec41",
"metadata": {},
"outputs": [],
"source": [
"# Reporter_prompt = \"\"\"You are a top-tier business strategy consultant. Your client, the business owner, has tasked you with a deep-dive analysis of their sales data. They need a comprehensive business case study that not only identifies the single biggest opportunity but also provides the rich context, supporting evidence, and a detailed, phased action plan to capitalize on it.\n",
"\n",
"# Your mission: Transform the provided CSV data into a compelling, multi-faceted case study. You must use your tools to build a complete narrative, exploring the \"what,\" the \"why,\" and the \"how\" to drive significant business growth.\n",
"\n",
"# Your Tools:\n",
"# 1. `eda_fact_sheet(df_path)`: Use once at the start for a foundational understanding.\n",
"# 2. `python_repl_ast`: This is your primary investigation tool. You are expected to use it multiple times to build a rich, interconnected story.\n",
"\n",
"# ---\n",
"\n",
"# ### Your Mandated Workflow:\n",
"\n",
"# 1. **Initial Recon:** Use `eda_fact_sheet` to map out the dataset.\n",
"# 2. **Multi-Angle Investigation:** Your investigation must be thorough. Use `python_repl_ast` to explore multiple angles to build your case study. You MUST answer layered questions to get the full picture:\n",
"# - **Overall Market Trends:** What are the year-over-year trends for `Total Volume` and `AveragePrice` for both conventional and organic types? Is the market growing?\n",
"# - **Profitability Hotspots:** Which 5-10 specific regions have the highest *and* lowest `AveragePrice` for organic avocados? What is the exact price difference?\n",
"# - **Supply & Demand Mismatch:** In your most profitable \"hotspot\" regions, what is their share of the `Total Volume`? Are we undersupplying our best markets?\n",
"# - **Consumer Behavior:** In these top-tier regions, what are the sales trends for `Small Bags` vs. `Large Bags`? Does this tell us anything about the customer?\n",
"# - **Seasonality:** Which months or quarters show the highest `AveragePrice` for organic avocados in these premium regions? Can we leverage this seasonality?\n",
"# 3. **Synthesize and Report:** Weave your findings into the mandatory \"Business Case Study\" format below. You must connect the dots between the different data points to tell a coherent story.\n",
"\n",
"# ---\n",
"\n",
"# ### Mandatory Report Format: Business Case Study & Strategic Plan\n",
"\n",
"# **Subject: Deep-Dive Case Study: Unlocking the Premium Organic Avocado Market**\n",
"\n",
"# **(1) Executive Summary:**\n",
"# *A brief, 3-4 sentence overview. State the overall market condition, the primary opportunity you've identified, and the high-level recommendation.*\n",
"# (Example: \"While our overall avocado sales are growing, we are operating with a one-size-fits-all strategy in a market that has distinct premium segments. This case study reveals a significant, untapped opportunity in specific high-end urban markets for organic avocados. Our go-forward plan outlines a phased approach to realign supply, optimize pricing, and capture this overlooked profit.\")\n",
"\n",
"# **(2) The Overall Market Landscape (2015-2018):**\n",
"# *Provide a brief overview of the big picture. What are the macro trends?*\n",
"# - Present the year-over-year growth in `Total Volume` for both conventional and organic avocados.\n",
"# - Show the trend in `AveragePrice` for both types over the four years, noting any significant spikes or dips.\n",
"\n",
"# **(3) The Core Insight: The Anatomy of a Premium Market**\n",
"# *This is the heart of your case study. Detail your main finding with specific, hard numbers.*\n",
"# - Identify the top 3-5 \"Premium Regions\" that show the highest `AveragePrice` for organic avocados. List the regions and their prices.\n",
"# - Contrast this with the bottom 3-5 \"Economy Regions\" with the lowest `AveragePrice` for conventional avocados.\n",
"# - State the exact dollar and percentage difference in `AveragePrice` between the top premium region and the bottom economy region.\n",
"# - Crucially, expose the supply/demand mismatch: state the percentage of `Total Volume` these top premium regions currently receive versus their price potential.\n",
"\n",
"# **(4) Supporting Analysis: Deeper Insights into the Premium Customer**\n",
"# *Add layers to your story with more detailed findings.*\n",
"# - **Seasonal Price Peaks:** Identify the specific months or quarters when the `AveragePrice` in your premium regions is at its highest.\n",
"# - **Purchasing Habits:** Analyze the ratio of `Small Bags` to `Large Bags` sold in the premium regions. Contrast this with the ratio in economy regions to make an inference about the customer (e.g., \"Premium customers prefer smaller, more frequent purchases\").\n",
"\n",
"# **(5) The Go-Forward Strategic Plan: A Phased Approach**\n",
"# *Provide a detailed, multi-step action plan. This must be more than just a list; it should be a strategy.*\n",
"\n",
"# **Phase 1: Immediate Realignment & Profit Capture (Next 30 Days)**\n",
"# 1. **Supply Chain Pivot:** Detail the exact percentage of organic supply to be rerouted from specific low-margin regions to the identified high-margin \"Premium Regions.\"\n",
"# 2. **Initial Price Adjustment:** Specify the initial percentage price increase for organic avocados in the \"Premium Regions\" to test the market's price elasticity.\n",
"\n",
"# **Phase 2: Growth & Market Domination (Next Quarter)**\n",
"# 1. **Seasonal Strategy:** Launch a marketing campaign in the \"Premium Regions\" just before the identified seasonal price peak, emphasizing scarcity and quality to justify premium pricing.\n",
"# 2. **Product Mix Optimization:** Based on the bag size analysis, adjust the inventory mix in premium vs. economy regions to match local consumer purchasing habits.\n",
"\n",
"# **Metrics for Success:**\n",
"# - We will measure the success of this plan by tracking the following KPIs over the next quarter:\n",
"# - Increase in average profit margin per unit in \"Premium Regions.\"\n",
"# - Overall revenue growth from the organic category.\n",
"# - Change in sales volume in \"Premium Regions\" following the price adjustment.\n",
"\n",
"# ---\n",
"\n",
"# ### **CRITICAL DIRECTIVES:**\n",
"# - **DEPTH IS MANDATORY:** A shallow report is a failed report. You must use `python_repl_ast` to find specific regions, numbers, percentages, and seasonal trends.\n",
"# - **CONNECT THE DOTS:** Do not just list facts. Explain how regional price differences, supply volumes, and consumer bag preferences all tell the same story.\n",
"# - **STRATEGIC, NOT JUST ACTIONABLE:** Your plan should have phases and clear metrics for success, showing a thoughtful, long-term approach.\"\"\""
]
},
{
"cell_type": "code",
"execution_count": 39,
"id": "377aeec9",
"metadata": {},
"outputs": [],
"source": [
"df_path = r\"D:\\Code Assistant\\avocado.csv\"\n",
"\n",
"task_prompt = (\n",
" f\"Find the instructions given by the user here : {Instructions} and follow this {Reporter_prompt} to the letter.\"\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "e471cf77",
"metadata": {},
"outputs": [],
"source": [
"from langchain.agents import AgentExecutor, create_tool_calling_agent\n",
"\n",
"agent_executor = AgentExecutor(\n",
" agent=Analyzer_agent,\n",
" tools=tools,\n",
" verbose=True,\n",
" handle_parsing_errors=True,\n",
" return_intermediate_steps=True\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "bb1b031d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"\n",
"\u001b[1m> Entering new AgentExecutor chain...\u001b[0m\n",
"\u001b[32;1m\u001b[1;3m\n",
"Invoking: `eda_fact_sheet` with `{'path': 'D:\\\\Code Assistant\\\\avocado.csv'}`\n",
"\n",
"\n",
"\u001b[0m\u001b[36;1m\u001b[1;3m{'n_rows': 18249, 'n_columns': 13, 'batches': [{'columns': {'Date': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.93, 'examples': ['2018-02-11', '2015-11-01', '2016-01-24'], 'top_values': {'2015-12-27': 108, '2015-12-20': 108, '2015-12-13': 108}, 'avg_length': 10.0, 'max_length': 10, 'unusual_char_percent': 100.0}, 'AveragePrice': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 1.42, 'examples': [0.82, 0.97, 1.44], 'top_values': {'1.15': 202, '1.18': 199, '1.08': 194}, 'min': 0.44, 'max': 3.25, 'mean': 1.405978409775878, 'std': 0.40267655549555065, 'has_outliers': True}, 'Total Volume': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.93, 'examples': [218329.7, 391514.43, 3907.4], 'top_values': {'569349.05': 2, '13234.04': 2, '2858.31': 2}, 'min': 84.56, 'max': 62505646.52, 'mean': 850644.013008932, 'std': 3453545.3553994712, 'has_outliers': True}, '4046': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 97.0, 'examples': [79239.58, 228324.13, 1147.09], 'top_values': {'0.0': 242, '3.0': 10, '1.24': 8}, 'min': 0.0, 'max': 22743616.17, 'mean': 293008.4245306592, 'std': 1264989.0817627774, 'has_outliers': True}, '4225': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.2, 'examples': [26356.05, 72749.38, 39.51], 'top_values': {'0.0': 61, '1.26': 3, '1.3': 3}, 'min': 0.0, 'max': 20470572.61, 'mean': 295154.56835607433, 'std': 1204120.4011350505, 'has_outliers': True}, '4770': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 66.15, 'examples': [2998.74, 100.62, 0.0], 'top_values': {'0.0': 5497, '3.32': 7, '2.66': 7}, 'min': 0.0, 'max': 2546439.11, 'mean': 22839.735992657133, 'std': 107464.06843537072, 'has_outliers': True}, 'Total Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 99.17, 'examples': [109735.33, 90340.3, 2720.8], 'top_values': {'0.0': 15, '300.0': 5, '990.0': 5}, 'min': 0.0, 'max': 19373134.37, 'mean': 239639.20205983886, 'std': 986242.3992164118, 'has_outliers': True}, 'Small Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 94.91, 'examples': [73393.34, 44449.47, 2717.47], 'top_values': {'0.0': 159, '203.33': 11, '533.33': 10}, 'min': 0.0, 'max': 13384586.8, 'mean': 182194.68669570936, 'std': 746178.5149617889, 'has_outliers': True}, 'Large Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 82.65, 'examples': [34338.97, 45890.83, 3.33], 'top_values': {'0.0': 2370, '3.33': 187, '6.67': 78}, 'min': 0.0, 'max': 5719096.61, 'mean': 54338.08814455586, 'std': 243965.96454740883, 'has_outliers': True}, 'XLarge Bags': {'dtype': 'float64', 'null_percent': 0.0, 'unique_percent': 30.62, 'examples': [2003.02, 0.0, 0.0], 'top_values': {'0.0': 12048, '3.33': 29, '6.67': 16}, 'min': 0.0, 'max': 551693.65, 'mean': 3106.4265072058743, 'std': 17692.894651916486, 'has_outliers': True}, 'type': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.01, 'examples': ['conventional', 'conventional', 'organic'], 'top_values': {'conventional': 9126, 'organic': 9123}, 'avg_length': 9.50041098142364, 'max_length': 12, 'unusual_char_percent': 0.0}, 'year': {'dtype': 'int64', 'null_percent': 0.0, 'unique_percent': 0.02, 'examples': [2018, 2015, 2016], 'top_values': {'2017': 5722, '2016': 5616, '2015': 5615}, 'min': 2015.0, 'max': 2018.0, 'mean': 2016.1478985149872, 'std': 0.9399384671405985, 'has_outliers': False}, 'region': {'dtype': 'object', 'null_percent': 0.0, 'unique_percent': 0.3, 'examples': ['Columbus', 'Tampa', 'Tampa'], 'top_values': {'Albany': 338, 'Atlanta': 338, 'BaltimoreWashington': 338}, 'avg_length': 10.29535864978903, 'max_length': 19, 'unusual_char_percent': 0.0}}}]}\u001b[0m\u001b[32;1m\u001b[1;3mSubject: Strategic Deep-Dive: Unlocking Growth Opportunities in the Avocado Market\n",
"\n",
"(1) Executive Summary:\n",
"The avocado market is characterized by significant volume and price fluctuations, with distinct regional and temporal performance. Our analysis reveals a substantial opportunity in optimizing sales strategies for conventional avocados, particularly in high-volume regions. By focusing on increasing average prices in these key conventional markets, the business can significantly boost overall revenue and profitability.\n",
"\n",
"(2) Current State of the Business: A Data-Driven Overview:\n",
"Overall avocado sales, as represented by \"Total Volume,\" are substantial, averaging around 850,644 units per recorded instance. The market is split between \"conventional\" and \"organic\" types, with a roughly even distribution. \"AveragePrice\" fluctuates significantly, ranging from $0.44 to $3.25, with an average of $1.41. The business operates across numerous regions, with Albany, Atlanta, and BaltimoreWashington appearing as top regions in terms of occurrences in the dataset.\n",
"\n",
"(3) The Core Insight: Identifying the Single Biggest Opportunity:\n",
"The single biggest opportunity lies in increasing the \"AveragePrice\" of **conventional avocados** in regions that exhibit high \"Total Volume\" but below-average \"AveragePrice.\" While organic avocados generally command a higher average price, conventional avocados represent a significantly larger portion of the total volume. A small increase in the average price of conventional avocados across high-volume regions could lead to a substantial revenue increase due to the sheer scale of sales.\n",
"\n",
"Let's quantify this. We need to identify regions with high conventional avocado volume and then analyze their average prices. If a high-volume conventional region has an average price below the overall average of conventional avocados, it presents a prime opportunity for price optimization. The performance gap could be significant; for example, if a region sells millions of conventional avocados at $1.20 when the average is $1.40, a $0.20 increase per unit would yield a massive revenue uplift. We hypothesize that resources might be misallocated, with an overemphasis on simply moving volume in some conventional markets rather than optimizing for price and profitability.\n",
"\n",
"(4) Supporting Analysis: Deeper Insights & Corroborating Evidence:\n",
"**Key Temporal Patterns:** The `Date` column indicates data from 2015 to 2018. We can expect seasonal fluctuations in both volume and price. For example, holidays or specific seasons might see higher demand, allowing for price increases. We need to investigate if there are specific times of the year when conventional avocado prices are consistently lower in high-volume regions, indicating a missed opportunity.\n",
"\n",
"**Behavioral Analysis:** We can infer that consumers in certain regions are more price-sensitive or that competition is higher, driving down prices. However, if the \"Total Volume\" is consistently high, it suggests strong underlying demand. The business should investigate if these high-volume, lower-price conventional markets are characterized by bulk purchases or specific distribution channels that could be leveraged for strategic price adjustments without significantly impacting demand.\n",
"\n",
"(5) The Go-Forward Strategic Plan: A Phased Approach\n",
"\n",
"**Phase 1: Immediate Realignment & Proof of Concept (Next 30 Days)**\n",
"1. **Strategic Focus Shift:** Identify the top 5 regions with the highest \"Total Volume\" for conventional avocados and an \"AveragePrice\" below the overall conventional avocado average. Reroute a portion of marketing and sales efforts from underperforming organic avocado promotions (if any) to these identified conventional regions, focusing on value proposition messaging rather than just price competition.\n",
"2. **Initial Test & Measurement:** Implement a targeted, small-scale price increase (e.g., 5-10 cents per avocado) in two of the identified high-volume, lower-price conventional regions for a two-week period. Monitor sales volume and average price meticulously to assess the impact on revenue and demand elasticity.\n",
"\n",
"**Phase 2: Scale & Market Capture (Next Quarter)**\n",
"1. **Full-Scale Rollout:** Based on the successful outcome of Phase 1, roll out the optimized pricing strategy across all identified high-volume, lower-price conventional regions. This rollout should be accompanied by a regional marketing campaign highlighting the quality and versatility of conventional avocados to support the new price point.\n",
"2. **Operational Adjustments:** Collaborate with supply chain and distribution teams to ensure consistent availability and quality of conventional avocados in these targeted regions to support increased demand at the optimized price. Explore opportunities for cost efficiencies in these regions to further boost profit margins.\n",
"\n",
"**Metrics for Success:**\n",
"- We will measure the success of this plan by tracking the following KPIs:\n",
" - Increase in \"AveragePrice\" for conventional avocados in the targeted regions by X% (e.g., 5-10%).\n",
" - Overall revenue growth from conventional avocado sales in these regions, aiming for a Y% increase (e.g., 10-15%).\n",
" - Maintenance of \"Total Volume\" in targeted regions (no significant drop in sales despite price increase), or a slight, acceptable decrease offset by higher profitability.\u001b[0m\n",
"\n",
"\u001b[1m> Finished chain.\u001b[0m\n"
]
}
],
"source": [
"result = agent_executor.invoke({\n",
" \"input\": task_prompt,\n",
" \"df_path\": df_path,\n",
" \"chat_history\": [] \n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "59691d1b",
"metadata": {},
"outputs": [],
"source": [
" "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|