Spaces:
Runtime error
Runtime error
| [ | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a venn diagram for people with diabetes and those less than 50 years of age, from meps", | |
| "example_code": "# Using Server: MUZAYAN_GRAM\\SQLEXPRESS, and database: MEPS.\n#conn = Driver={SQL Server};Server=MUZAYAN_GRAM\\SQLEXPRESS;Database=MEPS;Trusted_Connection=yes;.\n\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data\ndiabetes_query = \"\"\"\n SELECT DUPERSID, DIABDX_M18, AGELAST\n FROM h216_narrow\n\"\"\"\ndiabetes_df = pd.read_sql(diabetes_query, conn)\n\n# Filter the data\ndiabetes_df = diabetes_df[(diabetes_df['DIABDX_M18'] == '1') | (diabetes_df['DIABDX_M18'] == '2')]\ndiabetes_df = diabetes_df[diabetes_df['AGELAST'].notna()]\ndiabetes_df['DIABDX_M18'] = diabetes_df['DIABDX_M18'].apply(lambda x: True if x == '1' else False)\ndiabetes_df['UNDER_50'] = diabetes_df['AGELAST'].apply(lambda x: True if int(x) < 50 else False)\n\n# Create a venn diagram\ndiabetes_count = diabetes_df['DIABDX_M18'].sum()\nunder_50_count = diabetes_df['UNDER_50'].sum()\nboth_count = ((diabetes_df['DIABDX_M18']) & (diabetes_df['UNDER_50'])).sum()\n\n# Plot the venn diagram\nplt.figure(figsize=(8, 8))\nv = venn2(subsets=(diabetes_count - both_count, under_50_count - both_count, both_count), set_labels=('Diabetes', 'Under 50'))\n\n# Show the plot\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me the average pharmacy claim cost for people with and without diabetes", | |
| "example_code": "\nimport pandas as pd\n\n# Query the database to get the pharmacy claim cost for people with and without diabetes\nquery = \"\"\"\n SELECT \n CASE \n WHEN DIABAGED > 0 THEN 'With Diabetes'\n ELSE 'Without Diabetes'\n END AS diabetes_status,\n AVG(RXEXP19) AS average_pharmacy_claim_cost\n FROM \n h216_narrow\n GROUP BY \n CASE \n WHEN DIABAGED > 0 THEN 'With Diabetes'\n ELSE 'Without Diabetes'\n END\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Print the average pharmacy claim cost for people with and without diabetes\nfor index, row in df.iterrows():\n print(f\"Average pharmacy claim cost for people {row['diabetes_status'].lower()}: ${round(row['average_pharmacy_claim_cost'], 2)}\")\n\ndf" | |
| }, | |
| { | |
| "database": "Chinook", | |
| "user_messages": "how many artists are there?", | |
| "example_code": "\nimport pandas as pd\nimport pyodbc\n\n# SQL query to count the number of artists\nquery = \"SELECT COUNT(*) AS NumberOfArtists FROM Artist\"\n\n# Execute the query and read the result into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Print the number of artists\nprint(\"Number of artists:\", df['NumberOfArtists'][0])" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a venn diagram for people with diabetes and those with asthma", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for diabetes and asthma\nquery = \"\"\"\n SELECT DUPERSID, DIABDX_M18, ASTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter the data to include only diagnosed cases\ndf['DIABDX_M18'] = df['DIABDX_M18'].apply(lambda x: True if x == '1' else False)\ndf['ASTHDX'] = df['ASTHDX'].apply(lambda x: True if x == '1' else False)\n\n# Calculate counts for the Venn diagram\ndiabetes_count = df['DIABDX_M18'].sum()\nasthma_count = df['ASTHDX'].sum()\nboth_count = (df['DIABDX_M18'] & df['ASTHDX']).sum()\n\n# Plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(diabetes_count - both_count, asthma_count - both_count, both_count),\n set_labels=('Diabetes', 'Asthma'))\n\n# Show the plot\nplt.title(\"Venn Diagram of People with Diabetes and Asthma\")\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "visualize a venn diagram for people that have diabetes and arthritis, using meps", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for diabetes and arthritis\nquery = \"\"\"\n SELECT DUPERSID, DIABDX_M18, ARTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter and process the data\ndf = df[(df['DIABDX_M18'].isin(['1', '2'])) & (df['ARTHDX'].isin(['1', '2']))] # Filter valid responses\ndf['Diabetes'] = df['DIABDX_M18'].apply(lambda x: True if x == '1' else False) # Convert to boolean\ndf['Arthritis'] = df['ARTHDX'].apply(lambda x: True if x == '1' else False) # Convert to boolean\n\n# Calculate counts for the Venn diagram\ndiabetes_count = df['Diabetes'].sum()\narthritis_count = df['Arthritis'].sum()\nboth_count = (df['Diabetes'] & df['Arthritis']).sum()\n\n# Create and plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(diabetes_count - both_count, arthritis_count - both_count, both_count),\n set_labels=('Diabetes', 'Arthritis'))\nplt.title('Venn Diagram of People with Diabetes and Arthritis')\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "visualize a venn diagram for people that have diabetes and asthma, from meps", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for diabetes and asthma\nquery = \"\"\"\n SELECT DUPERSID, DIABDX_M18, ASTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter the data to include only diagnosed cases\ndf['DIABDX_M18'] = df['DIABDX_M18'].apply(lambda x: True if x == '1' else False)\ndf['ASTHDX'] = df['ASTHDX'].apply(lambda x: True if x == '1' else False)\n\n# Calculate counts for the Venn diagram\ndiabetes_count = df['DIABDX_M18'].sum()\nasthma_count = df['ASTHDX'].sum()\nboth_count = (df['DIABDX_M18'] & df['ASTHDX']).sum()\n\n# Plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(diabetes_count - both_count, asthma_count - both_count, both_count),\n set_labels=('Diabetes', 'Asthma'))\n\n# Show the plot\nplt.title(\"Venn Diagram of People with Diabetes and Asthma\")\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me the average pharmacy claim cost for people with and without diabetes", | |
| "example_code": "\nimport pandas as pd\n\n# Query to calculate average pharmacy claim cost by diabetes status\nquery = \"\"\"\nSELECT\n CASE\n WHEN CAST(DIABDX_M18 AS INT) = 1 THEN 'With Diabetes'\n ELSE 'Without Diabetes'\n END AS diabetes_status,\n AVG(RXEXP19) AS avg_pharmacy_claim_cost\nFROM \n h216_narrow\nGROUP BY \n CASE\n WHEN CAST(DIABDX_M18 AS INT) = 1 THEN 'With Diabetes'\n ELSE 'Without Diabetes'\n END\n\"\"\"\n\ndf = pd.read_sql(query, conn)\n\n# Format and print results\nprint(\"Average Pharmacy Claim Costs (2019)\")\nprint(\"-----------------------------------\")\nfor _, row in df.iterrows():\n status = row['diabetes_status'].lower()\n cost = round(row['avg_pharmacy_claim_cost'], 2)\n print(f\"{status}: ${cost:,.2f}\")" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "what is the relationship between income and diabetes", | |
| "example_code": "\nimport pandas as pd\n\n# Query to calculate correlation between income and diabetes\nquery = \"\"\"\nSELECT \n FAMINC19 AS income,\n CASE\n WHEN DIABDX_M18 = '1' THEN 1\n ELSE 0\n END AS diabetes\nFROM \n h216_narrow\nWHERE \n FAMINC19 IS NOT NULL AND DIABDX_M18 IS NOT NULL\n\"\"\"\n\ndf = pd.read_sql(query, conn)\n\n# Calculate correlation between income and diabetes\ncorrelation = df['income'].corr(df['diabetes'])\nprint(f\"Correlation between income and diabetes: {correlation:.4f}\")\n\n# Calculate average income for people with and without diabetes\navg_income_with_diabetes = df[df['diabetes'] == 1]['income'].mean()\navg_income_without_diabetes = df[df['diabetes'] == 0]['income'].mean()\n\n# Print average income for people with and without diabetes\nprint(f\"Average income for people with diabetes: ${avg_income_with_diabetes:,.2f}\")\nprint(f\"Average income for people without diabetes: ${avg_income_without_diabetes:,.2f}\")\n\n# Calculate percentage of people with diabetes at each income level\nincome_bins = [0, 25000, 50000, 75000, 100000, float('inf')]\ndf['income_bin'] = pd.cut(df['income'], bins=income_bins)\ndiabetes_rates = df.groupby('income_bin')['diabetes'].mean()\n\n# Print percentage of people with diabetes at each income level\nprint(\"\\nPercentage of people with diabetes at each income level:\")\nprint(diabetes_rates)" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "what is the relationship between income and diabetes", | |
| "example_code": "\nimport pandas as pd\n\n# Query to calculate correlation between income and diabetes\nquery = \"\"\"\nSELECT \n FAMINC19 AS income,\n CASE\n WHEN DIABDX_M18 = '1' THEN 1\n ELSE 0\n END AS diabetes\nFROM \n h216_narrow\nWHERE \n FAMINC19 IS NOT NULL AND DIABDX_M18 IS NOT NULL\n\"\"\"\n\ndf = pd.read_sql(query, conn)\n\n# Calculate correlation between income and diabetes\ncorrelation = df['income'].corr(df['diabetes'])\nprint(f\"Correlation between income and diabetes: {correlation:.4f}\")\n\n# Calculate average income for people with and without diabetes\navg_income_with_diabetes = df[df['diabetes'] == 1]['income'].mean()\navg_income_without_diabetes = df[df['diabetes'] == 0]['income'].mean()\n\n# Print average income for people with and without diabetes\nprint(f\"Average income for people with diabetes: ${avg_income_with_diabetes:,.2f}\")\nprint(f\"Average income for people without diabetes: ${avg_income_without_diabetes:,.2f}\")\n\n# Create bins for income levels\nincome_bins = [0, 25000, 50000, 75000, 100000, float('inf')]\nincome_labels = ['<25k', '25k-50k', '50k-75k', '75k-100k', '100k+']\ndf['income_bin'] = pd.cut(df['income'], bins=income_bins, labels=income_labels)\n\n# Calculate percentage of people with diabetes at each income level\ndiabetes_rates = df.groupby('income_bin')['diabetes'].mean()\n\n# Print percentage of people with diabetes at each income level\nprint(\"\\nPercentage of people with diabetes at each income level:\")\nprint(diabetes_rates * 100) # Convert to percentage" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "how many diabetics in meps?", | |
| "example_code": "\nimport pandas as pd\n\n# Query the data\ndiabetes_query = \"\"\"\n SELECT DIABDX_M18\n FROM h216_narrow\n\"\"\"\ndiabetes_df = pd.read_sql(diabetes_query, conn)\n\n# Filter the data\ndiabetes_df = diabetes_df[diabetes_df['DIABDX_M18'].notna()]\ndiabetes_df['DIABDX_M18'] = diabetes_df['DIABDX_M18'].apply(lambda x: True if x == '1' else False)\n\n# Get the count of diabetics\ndiabetic_count = diabetes_df['DIABDX_M18'].sum()\n\n# Calculate the percentage of diabetics\ntotal_count = len(diabetes_df)\ndiabetic_percentage = (diabetic_count / total_count) * 100\n\nprint(f\"The total count of diabetics in MEPS is: {diabetic_count}\")\nprint(f\"The percentage of diabetics in MEPS is: {diabetic_percentage:.2f}%\")" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a venn diagram for people with arthritis and those with diabetes, from meps", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for diabetes and arthritis\nquery = \"\"\"\n SELECT DUPERSID, DIABDX_M18, ARTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter and process the data\ndf = df[(df['DIABDX_M18'].isin(['1', '2'])) & (df['ARTHDX'].isin(['1', '2']))] # Filter valid responses\ndf['Diabetes'] = df['DIABDX_M18'].apply(lambda x: True if x == '1' else False) # Convert to boolean\ndf['Arthritis'] = df['ARTHDX'].apply(lambda x: True if x == '1' else False) # Convert to boolean\n\n# Calculate counts for the Venn diagram\ndiabetes_count = df['Diabetes'].sum()\narthritis_count = df['Arthritis'].sum()\nboth_count = (df['Diabetes'] & df['Arthritis']).sum()\n\n# Create and plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(diabetes_count - both_count, arthritis_count - both_count, both_count),\n set_labels=('Diabetes', 'Arthritis'))\nplt.title('Venn Diagram of People with Diabetes and Arthritis')\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a venn diagram for people with asthma and those with arthritis, from meps", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for asthma and arthritis\nquery = \"\"\"\n SELECT DUPERSID, ASTHDX, ARTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter and process the data\ndf = df[(df['ASTHDX'].isin(['1', '2'])) & (df['ARTHDX'].isin(['1', '2']))] # Filter valid responses\ndf['Asthma'] = df['ASTHDX'].apply(lambda x: True if x == '1' else False) # Convert to boolean\ndf['Arthritis'] = df['ARTHDX'].apply(lambda x: True if x == '1' else False) # Convert to boolean\n\n# Calculate counts for the Venn diagram\nasthma_count = df['Asthma'].sum()\narthritis_count = df['Arthritis'].sum()\nboth_count = (df['Asthma'] & df['Arthritis']).sum()\n\n# Create and plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(asthma_count - both_count, arthritis_count - both_count, both_count),\n set_labels=('Asthma', 'Arthritis'))\nplt.title('Venn Diagram of People with Asthma and Arthritis')\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me the top 10 drugs, in terms of amount paid by self, for individuals with diabetes. ", | |
| "example_code": "\nimport pandas as pd\n\n# Query the database to get the top 10 drugs paid by self for individuals with diabetes\nquery = \"\"\"\n SELECT TOP 10 \n RXDRGNAM,\n SUM(RXSF19X) AS total_self_paid\n FROM \n h213a\n INNER JOIN \n h216_narrow ON h213a.DUPERSID = h216_narrow.DUPERSID\n WHERE \n h216_narrow.DIABAGED > 0\n GROUP BY \n RXDRGNAM\n ORDER BY \n total_self_paid DESC\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Print the top 10 drugs paid by self for individuals with diabetes\ndf" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "from meps, give me the average total healthcare cost for people with diabetes and asthma, with just diabetes, and with neither\n\n", | |
| "example_code": "\nimport pandas as pd\n\n# Query the database to get the average total healthcare cost for people with diabetes and asthma, with just diabetes, and with neither\nquery = \"\"\"\n SELECT \n CASE \n WHEN DIABAGED > 0 AND ASTHAGED > 0 THEN 'Diabetes and Asthma'\n WHEN DIABAGED > 0 THEN 'Diabetes Only'\n ELSE 'Neither'\n END AS condition_status,\n AVG(TOTEXP19) AS average_healthcare_cost\n FROM \n h216_narrow\n WHERE \n DIABAGED IS NOT NULL AND ASTHAGED IS NOT NULL\n GROUP BY \n CASE \n WHEN DIABAGED > 0 AND ASTHAGED > 0 THEN 'Diabetes and Asthma'\n WHEN DIABAGED > 0 THEN 'Diabetes Only'\n ELSE 'Neither'\n END\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Print the average healthcare cost for people with diabetes and asthma, with just diabetes, and with neither\nfor index, row in df.iterrows():\n print(f\"Average total healthcare cost for people with {row['condition_status'].lower()}: ${round(row['average_healthcare_cost'], 2)}\")" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "from meps, explore the relationship of income with total healthcare cost", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom sklearn.linear_model import LinearRegression\nimport numpy as np\n\n# Query the database to get the income and total healthcare cost\nquery = \"\"\"\n SELECT \n FAMINC19,\n TOTEXP19\n FROM \n h216_narrow\n WHERE \n FAMINC19 IS NOT NULL AND TOTEXP19 IS NOT NULL\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Calculate the correlation coefficient between income and total healthcare cost\ncorrelation_coefficient = df['FAMINC19'].corr(df['TOTEXP19'])\nprint(f\"Correlation coefficient between income and total healthcare cost: {round(correlation_coefficient, 2)}\")\n\n# Create a scatter plot to visualize the relationship\nplt.figure(figsize=(10, 6))\nplt.scatter(df['FAMINC19'], df['TOTEXP19'])\nplt.xlabel('Family Income')\nplt.ylabel('Total Healthcare Cost')\nplt.title('Relationship between Income and Total Healthcare Cost')\nplt.show()\n\n# Perform linear regression to model the relationship\nX = df['FAMINC19'].values.reshape(-1, 1)\ny = df['TOTEXP19'].values\nmodel = LinearRegression()\nmodel.fit(X, y)\nprint(f\"Coefficient of determination (R^2): {round(model.score(X, y), 2)}\")\n\n# Predict total healthcare cost for different income levels\nincome_levels = np.array([20000, 50000, 100000]).reshape(-1, 1)\npredicted_costs = model.predict(income_levels)\nfor i, income in enumerate(income_levels):\n print(f\"Predicted total healthcare cost for income ${income[0]}: ${round(predicted_costs[i], 2)}\")\n\n# Calculate the average total healthcare cost for different income quartiles\ndf['income_quartile'] = pd.qcut(df['FAMINC19'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])\naverage_costs = df.groupby('income_quartile')['TOTEXP19'].mean()\nprint(\"Average total healthcare cost for different income quartiles:\")\nfor index, row in average_costs.items():\n print(f\"{index}: ${round(row, 2)}\")" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a venn diagram for those with diabetes, asthma, and arthritis. Use DUPERSID to get counts of those that have both diabetes and asthma, etc.", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn3\n\n# Query the data for diabetes, asthma, and arthritis\nquery = \"\"\"\n SELECT DUPERSID, DIABDX_M18, ASTHDX, ARTHDX\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Filter the data to include only diagnosed cases\ndf['DIABDX_M18'] = df['DIABDX_M18'].apply(lambda x: True if x == '1' else False)\ndf['ASTHDX'] = df['ASTHDX'].apply(lambda x: True if x == '1' else False)\ndf['ARTHDX'] = df['ARTHDX'].apply(lambda x: True if x == '1' else False)\n\n# Calculate counts for the Venn diagram\ndiabetes_count = df['DIABDX_M18'].sum()\nasthma_count = df['ASTHDX'].sum()\narthritis_count = df['ARTHDX'].sum()\n\nboth_diabetes_asthma = (df['DIABDX_M18'] & df['ASTHDX']).sum()\nboth_diabetes_arthritis = (df['DIABDX_M18'] & df['ARTHDX']).sum()\nboth_asthma_arthritis = (df['ASTHDX'] & df['ARTHDX']).sum()\n\nall_three = (df['DIABDX_M18'] & df['ASTHDX'] & df['ARTHDX']).sum()\n\n# Calculate the subsets\ndiabetes_only = diabetes_count - both_diabetes_asthma - both_diabetes_arthritis + all_three\nasthma_only = asthma_count - both_diabetes_asthma - both_asthma_arthritis + all_three\narthritis_only = arthritis_count - both_diabetes_arthritis - both_asthma_arthritis + all_three\n\n# Plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn3(subsets=(diabetes_only, asthma_only, both_diabetes_asthma - all_three, \n arthritis_only, both_diabetes_arthritis - all_three, both_asthma_arthritis - all_three, all_three),\n set_labels=('Diabetes', 'Asthma', 'Arthritis'))\n\n# Show the plot\nplt.title(\"Venn Diagram of People with Diabetes, Asthma, and Arthritis\")\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "can i get the portion of cost paid by self by insurance coverage type (private, public, uninsured)", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport numpy as np\n\n# Query the database to get the required data\nquery = \"\"\"\n SELECT \n CASE\n WHEN INSCOV19 = 1 THEN 'Private'\n WHEN INSCOV19 = 2 THEN 'Public'\n WHEN INSCOV19 = 3 THEN 'Uninsured'\n ELSE 'Other'\n END AS InsuranceCoverage,\n CASE\n WHEN TOTEXP19 = 0 THEN NULL -- avoid divide by zero\n ELSE TOTSLF19 / TOTEXP19\n END AS PortionOfCostPaidBySelf\n FROM \n h216_narrow\n WHERE \n TOTEXP19 IS NOT NULL AND TOTSLF19 IS NOT NULL\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\ndf = df.dropna() # drop any rows with missing values (i.e., TOTEXP19 = 0)\n\n# Calculate the average portion of cost paid by self by insurance coverage type\naverage_portion_paid = df.groupby('InsuranceCoverage')['PortionOfCostPaidBySelf'].mean()\nprint(\"Average portion of cost paid by self by insurance coverage type:\")\nfor index, row in average_portion_paid.items():\n print(f\"{index}: {round(row, 2)}\")\n\n# Create a bar chart to visualize the average portion of cost paid by self by insurance coverage type\nplt.figure(figsize=(10, 6))\nplt.bar(average_portion_paid.index, average_portion_paid.values)\nplt.xlabel('Insurance Coverage')\nplt.ylabel('Average Portion of Cost Paid by Self')\nplt.title('Average Portion of Cost Paid by Self by Insurance Coverage Type')\nplt.show()\n\n# Calculate the median portion of cost paid by self by insurance coverage type\nmedian_portion_paid = df.groupby('InsuranceCoverage')['PortionOfCostPaidBySelf'].median()\nprint(\"Median portion of cost paid by self by insurance coverage type:\")\nfor index, row in median_portion_paid.items():\n print(f\"{index}: {round(row, 2)}\")\n\n# Calculate the standard deviation of portion of cost paid by self by insurance coverage type\nstd_dev_portion_paid = df.groupby('InsuranceCoverage')['PortionOfCostPaidBySelf'].std()\nprint(\"Standard deviation of portion of cost paid by self by insurance coverage type:\")\nfor index, row in std_dev_portion_paid.items():\n print(f\"{index}: {round(row, 2)}\")" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "how many artists are there?", | |
| "example_code": "\nimport pandas as pd\n\n# Query to count the number of artists\nquery = \"SELECT COUNT(ArtistId) FROM Artist\"\n\n# Use pd.read_sql to execute the query and get the result\nresult = pd.read_sql(query, conn)\n\n# Print the result\nprint(\"Number of artists:\", result.iloc[0, 0])" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "how many artists are there?", | |
| "example_code": "\nimport pandas as pd\n\n# Query to count the number of artists\nquery = \"SELECT COUNT(ArtistId) AS ArtistCount FROM Artist\"\n\n# Use pd.read_sql to execute the query and get the result\nresult = pd.read_sql(query, conn)\n\n# Print the result\nprint(\"Number of artists:\", result.iloc[0, 0])" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "Rank the top selling albums by that famous irish band.", | |
| "example_code": "\n# Import necessary libraries\nimport pandas as pd\n\n# Query to find the ArtistId of U2\nquery_artist_id = \"SELECT ArtistId FROM Artist WHERE Name = 'U2'\"\n\n# Use pd.read_sql to execute the query and get the ArtistId of U2\nartist_id = pd.read_sql(query_artist_id, conn).iloc[0, 0]\n\n# Query to find the top selling albums of U2\nquery_top_albums = \"\"\"\n SELECT TOP 10 a.Title, SUM(il.Quantity) AS TotalSales\n FROM InvoiceLine il\n JOIN Track t ON il.TrackId = t.TrackId\n JOIN Album a ON t.AlbumId = a.AlbumId\n WHERE a.ArtistId = %s\n GROUP BY a.Title\n ORDER BY SUM(il.Quantity) DESC\n\"\"\" % artist_id\n\n# Use pd.read_sql to execute the query and get the top selling albums\ntop_albums = pd.read_sql(query_top_albums, conn)\n\n# Print the result\nprint(\"Top selling albums by U2:\")\ntop_albums" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "Give me a bar plot of sales, by album, of U2; use plotly", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\n\n# Query to find the ArtistId of U2\nquery_artist_id = \"SELECT ArtistId FROM Artist WHERE Name = 'U2'\"\nartist_id = pd.read_sql(query_artist_id, conn).iloc[0, 0]\n\n# Query to find the sales by album for U2\nquery_album_sales = \"\"\"\n SELECT a.Title, SUM(il.Quantity) AS TotalSales\n FROM Album a\n JOIN Artist ar ON a.ArtistId = ar.ArtistId\n JOIN Track t ON a.AlbumId = t.AlbumId\n JOIN InvoiceLine il ON t.TrackId = il.TrackId\n WHERE ar.ArtistId = %s\n GROUP BY a.Title\n ORDER BY SUM(il.Quantity) DESC\n\"\"\" % artist_id\n\n# Use pd.read_sql to execute the query and get the sales by album for U2\nalbum_sales = pd.read_sql(query_album_sales, conn)\n\n# Create a bar plot of sales by album using Plotly\nfig = px.bar(album_sales, x='Title', y='TotalSales', title=\"Album Sales by U2\")\nfig.update_layout(xaxis_title=\"Album Title\", yaxis_title=\"Total Sales\")\nfig.show()" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "from the chinook db, can you plot the number of customers by country on a map, and color code it by number of customers (white for few, and green for more)?", | |
| "example_code": "\nimport pandas as pd\nimport plotly.graph_objects as go\n\n# Query to find the number of customers by country\nquery_customer_country = \"\"\"\n SELECT Country, COUNT(CustomerId) AS NumberOfCustomers\n FROM Customer\n GROUP BY Country\n\"\"\"\n\n# Use pd.read_sql to execute the query and get the number of customers by country\ncustomer_country = pd.read_sql(query_customer_country, conn)\n\n# Create a world map plot of customers by country using Plotly\nfig = go.Figure(data=go.Choropleth(\n locations=customer_country['Country'], # Spatial coordinates\n z = customer_country['NumberOfCustomers'].astype(float), # Data to color by\n locationmode = 'country names', # Set of locations match entries in `locations`\n autocolorscale=False,\n colorscale='Greens',\n colorbar_title = \"Number of Customers\"\n))\n\nfig.update_layout(\n title_text='Number of Customers by Country',\n geo_scope='world',\n)\n\nfig.show()" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "from the chinook db, can you plot the number of customers by country on a map, and color code it by number of customers (white for few, and green for more)?", | |
| "example_code": "\n# Import necessary libraries\nimport pandas as pd\nimport plotly.graph_objects as go\n\n# Query to find the number of customers by country\nquery_customer_country = \"\"\"\n SELECT Country, COUNT(CustomerId) AS NumberOfCustomers\n FROM Customer\n GROUP BY Country\n\"\"\"\n\n# Use pd.read_sql to execute the query and get the number of customers by country\ncustomer_country = pd.read_sql(query_customer_country, conn)\n\n# Create a world map plot of customers by country using Plotly\nfig = go.Figure(data=go.Choropleth(\n locations=customer_country['Country'], # Spatial coordinates\n z = customer_country['NumberOfCustomers'].astype(float), # Data to color by\n locationmode = 'country names', # Set of locations match entries in `locations`\n autocolorscale=False,\n colorscale='Greens', # Set colorscale to 'Greens' to color code by number of customers\n reversescale=False, # Set reversescale to False to map low values to white and high values to green\n marker_line_color='darkgray', # Set marker line color to darkgray for better visibility\n marker_line_width=0.5, # Set marker line width to 0.5 for better visibility\n colorbar_title = \"Number of Customers\"\n))\n\nfig.update_layout(\n title_text='Number of Customers by Country',\n geo_scope='world',\n)\n\n# Show the plot\nfig.show()" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "Give me a word cloud for genre\n", | |
| "example_code": "\nimport pandas as pd\nfrom wordcloud import WordCloud\nimport matplotlib.pyplot as plt\n\n# Query to get all genres\nquery = \"SELECT Name FROM Genre\"\n\n# Use pd.read_sql to execute the query and get the result\ngenres = pd.read_sql(query, conn)\n\n# Join all genres into a single string\ngenre_string = ' '.join(genres['Name'].tolist())\n\n# Create a word cloud\nwordcloud = WordCloud(width=800, height=400).generate(genre_string)\n\n# Display the word cloud\nplt.figure(figsize=(10, 5))\nplt.imshow(wordcloud, interpolation='bilinear')\nplt.axis('off')\nplt.show()" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "what are the top 3 selling genre in chinook data?\n", | |
| "example_code": "\n# Import necessary libraries\nimport pandas as pd\n\n# Query to find the top 3 selling genres\nquery_top_genres = \"\"\"\n SELECT g.Name, SUM(il.Quantity) AS TotalSales\n FROM Genre g\n JOIN Track t ON g.GenreId = t.GenreId\n JOIN InvoiceLine il ON t.TrackId = il.TrackId\n GROUP BY g.Name\n ORDER BY SUM(il.Quantity) DESC\n LIMIT 3\n\"\"\"\n\n# Use pd.read_sql to execute the query and get the top selling genres\ntop_genres = pd.read_sql(query_top_genres, conn)\n\n# Print the result\nprint(\"Top 3 selling genres:\")\nprint(top_genres)" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "Give me the top 10 selling artists.", | |
| "example_code": "\nimport pandas as pd\n\n# Query to get the top 10 selling artists\nquery = \"\"\"\n SELECT a.Name, SUM(il.Quantity) as TotalSold\n FROM Artist a\n JOIN Album al ON a.ArtistId = al.ArtistId\n JOIN Track t ON al.AlbumId = t.AlbumId\n JOIN InvoiceLine il ON t.TrackId = il.TrackId\n GROUP BY a.Name\n ORDER BY TotalSold DESC\n LIMIT 10\n\"\"\"\n\n# Use pd.read_sql to execute the query and get the result\nresult = pd.read_sql(query, conn)\n\n# Print the result\nprint(result)" | |
| }, | |
| { | |
| "database": "capgpbsh_vg_chinook", | |
| "user_messages": "rank the employees making the most sales", | |
| "example_code": "\n# Import necessary libraries\nimport pandas as pd\n\n# Query to find the top selling employees\nquery_top_employees = \"\"\"\n SELECT e.FirstName, e.LastName, SUM(i.Total) AS TotalSales\n FROM Invoice i\n JOIN Customer c ON i.CustomerId = c.CustomerId\n JOIN Employee e ON c.SupportRepId = e.EmployeeId\n GROUP BY e.FirstName, e.LastName\n ORDER BY SUM(i.Total) DESC\n\"\"\"\n\n# Use pd.read_sql to execute the query and get the top selling employees\ntop_employees = pd.read_sql(query_top_employees, conn)\n\n# Print the result\nprint(\"Top selling employees:\")\nprint(top_employees)" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a visual to understand the total healthcare spend, and the medical and pharmacy as proportions of total\n", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\n# Query the database to get the total healthcare spend and pharmacy spend\nquery = \"\"\"\n SELECT \n TOTEXP19,\n RXEXP19\n FROM \n h216_narrow\n WHERE \n TOTEXP19 IS NOT NULL AND RXEXP19 IS NOT NULL\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Calculate the total healthcare spend and pharmacy spend\ntotal_healthcare_spend = df['TOTEXP19'].sum()\npharmacy_spend = df['RXEXP19'].sum()\n\n# Calculate the proportions of total healthcare spend\npharmacy_proportion = (pharmacy_spend / total_healthcare_spend) * 100\n\n# Calculate the medical spend as the difference between total healthcare spend and pharmacy spend\nmedical_spend = total_healthcare_spend - pharmacy_spend\nmedical_proportion = (medical_spend / total_healthcare_spend) * 100\n\n# Create a pie chart to visualize the proportions\nlabels = ['Medical', 'Pharmacy']\nsizes = [medical_proportion, pharmacy_proportion]\nplt.pie(sizes, labels=labels, autopct='%1.1f%%')\nplt.title('Proportion of Total Healthcare Spend')\nplt.show()\n\nprint(f\"Total healthcare spend: ${total_healthcare_spend:,.2f}\")\nprint(f\"Pharmacy spend as proportion of total healthcare spend: {pharmacy_proportion:.2f}%\")\nprint(f\"Medical spend as proportion of total healthcare spend: {medical_proportion:.2f}%\")" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "give me a visual to understand the total healthcare spend, and the medical and pharmacy as proportions of total\n", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\n\nquery = \"\"\"\n SELECT \n SUM(TOTEXP19) AS total_healthcare_spend,\n SUM(RXEXP19) AS pharmacy_spend\n FROM \n h216_narrow\n WHERE \n TOTEXP19 IS NOT NULL AND RXEXP19 IS NOT NULL\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Since the DataFrame has only one row, we can access the values directly\ntotal_healthcare_spend = df['total_healthcare_spend'].iloc[0]\npharmacy_spend = df['pharmacy_spend'].iloc[0]\n\n# Calculate the proportions of total healthcare spend\npharmacy_proportion = (pharmacy_spend / total_healthcare_spend) * 100\n\n# Calculate the medical spend as the difference between total healthcare spend and pharmacy spend\nmedical_spend = total_healthcare_spend - pharmacy_spend\nmedical_proportion = (medical_spend / total_healthcare_spend) * 100\n\n# Create a pie chart to visualize the proportions\nlabels = ['Medical', 'Pharmacy']\nsizes = [medical_proportion, pharmacy_proportion]\nplt.pie(sizes, labels=labels, autopct='%1.1f%%')\nplt.title('Proportion of Total Healthcare Spend')\nplt.show()\n\nprint(f\"Total healthcare spend: ${total_healthcare_spend:,.2f}\")\nprint(f\"Pharmacy spend as proportion of total healthcare spend: {pharmacy_proportion:.2f}%\")\nprint(f\"Medical spend as proportion of total healthcare spend: {medical_proportion:.2f}%\")" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "how many artists in data?", | |
| "example_code": "\nimport pandas as pd\n\n# Query the Artist table to get the number of artists\nartist_count_df = pd.read_sql_query(\"SELECT COUNT(*) FROM Artist\", conn)\n\n# Get the count from the DataFrame\nartist_count = artist_count_df.iloc[0, 0]\n\nprint(\"Number of artists in the data:\", artist_count)" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "what are the top 3 selling genre in chinook data?\n", | |
| "example_code": "\nimport pandas as pd\n\n# Define the query to get the top 3 selling genres\nquery = \"\"\"\n SELECT T.GenreId, G.Name, SUM(IL.Quantity) AS TotalSales\n FROM Track T\n JOIN Genre G ON T.GenreId = G.GenreId\n JOIN InvoiceLine IL ON T.TrackId = IL.TrackId\n GROUP BY T.GenreId, G.Name\n ORDER BY TotalSales DESC\n LIMIT 3\n\"\"\"\n\n# Use pd.read_sql_query to execute the query\ntop_selling_genres_df = pd.read_sql_query(query, conn)\n\n# Print the top 3 selling genres\nprint(top_selling_genres_df)" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "Give me the top 10 selling artists.\n", | |
| "example_code": "\nimport pandas as pd\n\n# Query the tables to get the top 10 selling artists\ntop_artists_df = pd.read_sql_query(\"\"\"\n SELECT a.Name, SUM(il.UnitPrice * il.Quantity) as TotalSales\n FROM InvoiceLine il\n JOIN Track t ON il.TrackId = t.TrackId\n JOIN Album alb ON t.AlbumId = alb.AlbumId\n JOIN Artist a ON alb.ArtistId = a.ArtistId\n GROUP BY a.Name\n ORDER BY TotalSales DESC\n LIMIT 10\n\"\"\", conn)\n\n# Print the top 10 selling artists\nprint(top_artists_df)" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "Give me a bar plot of sales, by album, of U2\n", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\n\n# Query the tables to get the sales by album for U2\nu2_album_sales_df = pd.read_sql_query(\"\"\"\n SELECT alb.Title, SUM(il.UnitPrice * il.Quantity) as TotalSales\n FROM InvoiceLine il\n JOIN Track t ON il.TrackId = t.TrackId\n JOIN Album alb ON t.AlbumId = alb.AlbumId\n JOIN Artist a ON alb.ArtistId = a.ArtistId\n WHERE a.Name = 'U2'\n GROUP BY alb.Title\n ORDER BY TotalSales DESC\n\"\"\", conn)\n\n# Create a bar plot of sales by album\nfig = px.bar(u2_album_sales_df, x='Title', y='TotalSales', title='Sales by Album for U2')\nfig.show()" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "from the chinook db, can you plot the number of customers by country on a map, and color code it by number of customers (white for few, and green for more)?\n", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\n\n# Define the query to get the number of customers by country\nquery = \"\"\"\n SELECT Country, COUNT(*) AS NumberOfCustomers\n FROM Customer\n GROUP BY Country\n\"\"\"\n\n# Use pd.read_sql_query to execute the query\ncustomers_by_country_df = pd.read_sql_query(query, conn)\n\n# Plot the number of customers by country using Plotly Express\nfig = px.choropleth(customers_by_country_df, locations=\"Country\", \n color=\"NumberOfCustomers\", \n color_continuous_scale='Greens', \n locationmode='country names')\n\nfig.update_layout(title='Number of Customers by Country')\nfig.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "can you give me a pmpm cost for people with and without diabetes, from meps; give me a bar chart too; normalize for age differences; maybe normalize instead by showing the bar chart with 10-year age bands", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\nimport numpy as np\n\n# Query the database to get the PMPM (per-member-per-month) healthcare cost for people with and without diabetes\nquery = \"\"\"\n SELECT \n DUPERSID,\n CASE \n WHEN DIABDX_M18 = 1 THEN 'Diabetes'\n ELSE 'No Diabetes'\n END AS diabetes_status,\n TOTEXP19 / 12 AS average_pmpm_healthcare_cost,\n AGELAST AS age\n FROM \n h216_narrow\n WHERE \n DIABDX_M18 IN (1, 2)\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Calculate the age group for normalization\nbins = [0, 20, 30, 40, 50, 60, 70, np.inf]\nlabels = ['0-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70+']\ndf['age_group'] = pd.cut(df['age'], bins=bins, labels=labels, right=False)\n\n# Calculate the average PMPM healthcare cost by age group and diabetes status\ndf_avg = df.groupby(['age_group', 'diabetes_status'])['average_pmpm_healthcare_cost'].mean().reset_index()\ndf_avg = df_avg.sort_values('age_group')\n\n# Generate a grouped bar chart to visualize the results\nfig = px.bar(df_avg, x='age_group', y='average_pmpm_healthcare_cost', barmode='group', color='diabetes_status', title='Average PMPM Healthcare Cost by Age Group and Diabetes Status')\nfig.show()\n\n# Calculate summary statistics\nprint(df_avg.groupby('diabetes_status')['average_pmpm_healthcare_cost'].describe())\n\n# Calculate the total number of observations\nprint(\"Total number of observations: \", len(df))\n\n# Calculate the total number of observations by age group\nprint(df['age_group'].value_counts())" | |
| }, | |
| { | |
| "database": "h216_narrow.csv", | |
| "user_messages": "can you give me a pmpm cost for people with and without diabetes", | |
| "example_code": "import pandas as pd\nimport numpy as np\n\n# Calculate monthly cost per patient with diabetes\ndf_diabetes = df[df['DIABDX_M18'] == 1]\ncost_diabetes = np.sum(df_diabetes['TOTEXP19']) / len(df_diabetes)\npmpm_diabetes = cost_diabetes / 12\n\n# Calculate monthly cost per patient without diabetes\ndf_no_diabetes = df[df['DIABDX_M18'] == 2]\ncost_no_diabetes = np.sum(df_no_diabetes['TOTEXP19']) / len(df_no_diabetes)\npmpm_no_diabetes = cost_no_diabetes / 12\n\nprint(f\"PMPM cost for people with diabetes: ${pmpm_diabetes:.2f}\")\nprint(f\"PMPM cost for people without diabetes: ${pmpm_no_diabetes:.2f}\")" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "give me a histogram for rings", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\n\n# Query the d_Abalone table to get the Rings column\nrings_df = pd.read_sql_query(\"SELECT Rings FROM d_Abalone\", conn)\n\n# Create a histogram using Plotly Express\nfig = px.histogram(rings_df, x=\"Rings\", \n title=\"Histogram of Rings in d_Abalone\",\n color_discrete_sequence=[\"#6495ED\"],\n template='plotly_white')\n\n# Add a black edge to the bars\nfig.update_traces(marker_line_color='black', marker_line_width=1)\n\n# Show the plot\nfig.show()" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "give me a pie chart for the distribution of iris flower", | |
| "example_code": "import pandas as pd\nimport plotly.express as px\n\n# Query the iris table to get the target column\ntarget_df = pd.read_sql_query(\"SELECT target FROM iris\", conn)\n\n# Map 'target' values to iris species\ntarget_df['species'] = target_df['target'].map({0: 'setosa', 1: 'versicolor', 2: 'virginica'})\n\n# Create a pie chart using Plotly Express\nfig = px.pie(target_df, names='species', \n title=\"Distribution of Iris Flowers\",\n color_discrete_sequence=[\"#6495ED\", \"#66B3FF\", \"#66CCCC\"],\n template='plotly_white')\n\n# Add a black edge to the pie slices\nfig.update_traces(marker_line_color='black', marker_line_width=1)\n\n# Show the plot\nfig.show()" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "Rank the top 5 variables that explain the variation in Rings\n", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nimport seaborn as sns\n\n# Query the d_Abalone table to get all columns\ndf = pd.read_sql_query(\"SELECT * FROM d_Abalone\", conn)\n\n# Convert Sex to a numerical value using one-hot encoding\ndf = pd.get_dummies(df, columns=['Sex'])\n\n# Calculate the correlation matrix\ncorr_matrix = df.corr()\n\n# Get the correlation coefficients for Rings\nrings_corr = corr_matrix['Rings'].sort_values(ascending=False).head(6)[1:]\n\n# Print the top 5 variables that explain the variation in Rings\nprint(rings_corr)\n\n# Create a bar chart to visualize the top 5 variables\nplt.figure(figsize=(10, 6))\nsns.barplot(x=rings_corr.index, y=rings_corr.values)\nplt.title(\"Top 5 Variables that Explain Variation in Rings\")\nplt.xlabel(\"Variables\")\nplt.ylabel(\"Correlation Coefficient\")\nplt.show()" | |
| }, | |
| { | |
| "database": "freedb_vg_demo", | |
| "user_messages": "give me a decision tree model to predict the iris flower. Show a decision tree.", | |
| "example_code": "\nimport pandas as pd\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn import metrics\nfrom sklearn import tree\nimport matplotlib.pyplot as plt\nfrom sklearn.preprocessing import LabelEncoder\n\n# Query the iris table to get all columns\niris_df = pd.read_sql_query(\"SELECT * FROM iris\", conn)\n\n# Encode the 'target' labels as integers\nle = LabelEncoder()\niris_df['target'] = le.fit_transform(iris_df['target'])\n\n# Separate features and target\nX = iris_df[['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']]\ny = iris_df['target']\n\n# Split the dataset into training set and test set\nX_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)\n\n# Create a Decision Tree Classifier\nclf = DecisionTreeClassifier()\n\n# Train the model using the training sets\nclf = clf.fit(X_train,y_train)\n\n# Predict the response for the test dataset\ny_pred = clf.predict(X_test)\n\n# Model Accuracy: how often is the classifier correct?\nprint(\"Accuracy:\", metrics.accuracy_score(y_test, y_pred))\n\n# Create a decision tree plot\nplt.figure(figsize=(10,8))\ntree.plot_tree(clf, filled = True, feature_names=['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)'], class_names=['setosa', 'versicolor', 'virginica'])\nplt.title(\"Decision Tree for Iris Classification\")\nplt.show()" | |
| }, | |
| { | |
| "database": "vg_meps01", | |
| "user_messages": "how many diabetics in the data?", | |
| "example_code": "\nimport pandas as pd\n\n# Define the SQL query\nquery = \"\"\"\n SELECT DISTINCT DUPERSID\n FROM h216_narrow\n WHERE DIABDX_M18 = 1\n\"\"\"\n\n# Read the query result into a pandas DataFrame\ndiabetics = pd.read_sql(query, conn)\n\n# Calculate the number of diabetics\nnum_diabetics = len(diabetics)\n\nprint(f\"Number of diabetics in the data: {num_diabetics}\")" | |
| }, | |
| { | |
| "database": "vg_meps01", | |
| "user_messages": "visualize a venn diagram for people that have diabetes and asthma, from meps\n", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Define the SQL queries\nquery_diabetes = \"\"\"\n SELECT DISTINCT DUPERSID\n FROM h216_narrow\n WHERE DIABDX_M18 = 1\n\"\"\"\n\nquery_asthma = \"\"\"\n SELECT DISTINCT DUPERSID\n FROM h216_narrow\n WHERE ASTHDX = 1\n\"\"\"\n\n# Read the query results into pandas DataFrames\ndiabetics = pd.read_sql(query_diabetes, conn)\nasthmatics = pd.read_sql(query_asthma, conn)\n\n# Calculate the number of people with both diabetes and asthma\nboth = len(pd.merge(diabetics, asthmatics, on='DUPERSID'))\n\n# Calculate the number of people with only diabetes\nonly_diabetes = len(diabetics) - both\n\n# Calculate the number of people with only asthma\nonly_asthma = len(asthmatics) - both\n\n# Create a venn diagram\nplt.figure(figsize=(8,8))\nvenn2(subsets=(only_diabetes, only_asthma, both), set_labels=('Diabetes', 'Asthma'))\nplt.title('Venn Diagram of Diabetes and Asthma')\nplt.show()" | |
| }, | |
| { | |
| "database": "vg_meps01", | |
| "user_messages": "can you give me a pmpm cost for people with and without diabetes, from meps\n; since the dataset is large, can you do the query more efficiently (i.e., instead of loading all rows, aggregate them?)", | |
| "example_code": "\nimport pandas as pd\n\n# Define the SQL queries\nquery_with_diabetes = \"\"\"\n SELECT AVG(TOTEXP19 / 12) AS pmpm\n FROM h216_narrow\n WHERE DIABDX_M18 = 1\n\"\"\"\n\nquery_without_diabetes = \"\"\"\n SELECT AVG(TOTEXP19 / 12) AS pmpm\n FROM h216_narrow\n WHERE DIABDX_M18 = 2\n\"\"\"\n\n# Read the query results into pandas DataFrames\npmpm_with_diabetes = pd.read_sql(query_with_diabetes, conn)['pmpm'].iloc[0]\npmpm_without_diabetes = pd.read_sql(query_without_diabetes, conn)['pmpm'].iloc[0]\n\n# Print the results\nprint(f\"Mean pmpm cost for people with diabetes: ${pmpm_with_diabetes:.2f}\")\nprint(f\"Mean pmpm cost for people without diabetes: ${pmpm_without_diabetes:.2f}\")" | |
| }, | |
| { | |
| "database": "Db_Test_01", | |
| "user_messages": "give me a histogram for rings", | |
| "example_code": "\nimport pandas as pd\nimport plotly.express as px\n\n# Read the 'Rings' column from the 'd_Abalone' table\ndf = pd.read_sql(\"SELECT Rings FROM d_Abalone\", conn)\n\n# Create a histogram\nfig = px.histogram(df, x='Rings', histnorm='probability density')\n\n# Add black edges to the bars\nfig.update_layout(barmode='overlay')\nfor bar in fig.data:\n bar.marker.line.width = 1\n bar.marker.line.color = 'black'\n\n# Show the histogram\nfig.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "visualize a venn diagram for people that have diabetes and asthma, from meps\n", | |
| "example_code": "\nimport pandas as pd\nimport matplotlib.pyplot as plt\nfrom matplotlib_venn import venn2\n\n# Query the data for diabetes and asthma, summing the values to reduce data transfer\nquery = \"\"\"\n SELECT \n SUM(CASE WHEN DIABDX_M18 = 1 THEN 1 ELSE 0 END) AS diabetes_count,\n SUM(CASE WHEN ASTHDX = 1 THEN 1 ELSE 0 END) AS asthma_count,\n SUM(CASE WHEN DIABDX_M18 = 1 AND ASTHDX = 1 THEN 1 ELSE 0 END) AS both_count\n FROM h216_narrow\n\"\"\"\ndf = pd.read_sql(query, conn)\n\n# Extract the aggregate values\ndiabetes_count = df['diabetes_count'][0]\nasthma_count = df['asthma_count'][0]\nboth_count = df['both_count'][0]\n\n# Plot the Venn diagram\nplt.figure(figsize=(8, 8))\nvenn2(subsets=(diabetes_count - both_count, asthma_count - both_count, both_count),\n set_labels=('Diabetes', 'Asthma'))\n\n# Show the plot\nplt.title(\"Venn Diagram of People with Diabetes and Asthma\")\nplt.show()" | |
| }, | |
| { | |
| "database": "MEPS", | |
| "user_messages": "how do total healthcare costs vary with frequency of smoking?", | |
| "example_code": "\nimport pandas as pd\n\n# Query the database to get the average total healthcare cost for different frequencies of smoking\nquery = \"\"\"\n SELECT \n CASE \n WHEN OFTSMK53 = 1 THEN 'Daily'\n WHEN OFTSMK53 = 2 THEN 'Some Days'\n WHEN OFTSMK53 = 3 THEN 'Not At All'\n ELSE 'Unknown'\n END AS smoking_frequency,\n AVG(TOTEXP19) AS average_healthcare_cost\n FROM \n h216_narrow\n WHERE \n OFTSMK53 IN (1, 2, 3) -- Include only valid frequencies\n GROUP BY \n CASE \n WHEN OFTSMK53 = 1 THEN 'Daily'\n WHEN OFTSMK53 = 2 THEN 'Some Days'\n WHEN OFTSMK53 = 3 THEN 'Not At All'\n ELSE 'Unknown'\n END\n\"\"\"\n\n# Read the query results into a pandas DataFrame\ndf = pd.read_sql(query, conn)\n\n# Print the average healthcare cost for different frequencies of smoking\nfor index, row in df.iterrows():\n print(f\"Average total healthcare cost when smoking {row['smoking_frequency']}: ${round(row['average_healthcare_cost'], 2)}\")" | |
| }, | |
| { | |
| "database": "BenchmarkingDb_Smol", | |
| "user_messages": "Give me the counts of the different types of iris flower.", | |
| "example_code": "\nimport pandas as pd\n\n# Fetch data from the 'iris' table\nquery = \"SELECT target, COUNT(*) as count FROM iris GROUP BY target\"\niris_df = pd.read_sql(query, conn)\n\n# Map target values to flower types\nflower_types = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}\niris_df['target'] = iris_df['target'].map(flower_types)\n\n# Rename the columns for better understanding\niris_df.columns = ['flower_type', 'count']\n\n# Show the result\niris_df" | |
| }, | |
| { | |
| "database": "BenchmarkingDb_Smol", | |
| "user_messages": "Give me the counts of the different types of iris flower.", | |
| "example_code": "\nimport pandas as pd\n\n# Fetch data from the 'iris' table\nquery = \"SELECT target, COUNT(*) as count FROM iris GROUP BY target\"\niris_df = pd.read_sql(query, conn)\n\n# Map target values to flower types\nflower_types = {0: 'Setosa', 1: 'Versicolor', 2: 'Virginica'}\niris_df['target'] = iris_df['target'].map(flower_types)\n\n# Rename the columns for better understanding\niris_df.columns = ['flower_type', 'count']\n\n# Show the result\nprint(iris_df)" | |
| } | |
| ] |