{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b20a65b1",
   "metadata": {},
   "source": [
    "# Initial Data Exploration\n",
    "\n",
    "This notebook explores the SkillScope dataset to understand:\n",
    "- Database structure and schema\n",
    "- Available tables and columns\n",
    "- Data types and distributions\n",
    "- Label characteristics and statistics\n",
    "- Text field content and quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adad083b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "\n",
    "# Set display options\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.width', None)\n",
    "\n",
    "# Set visualization style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f23f7b74",
   "metadata": {},
   "source": [
    "## 1. Database Connection and Schema Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72dc4acc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to the SQLite database; fail fast if the file is missing.\n",
    "# NOTE: sqlite3.connect() silently CREATES a new empty database when the\n",
    "# path does not exist, which would leave a junk file on disk and make every\n",
    "# later query fail with confusing 'no such table' errors.\n",
    "db_path = Path('../data/raw/skillscope_data.db')\n",
    "\n",
    "if not db_path.exists():\n",
    "    raise FileNotFoundError(\n",
    "        f\"Database not found at {db_path}. \"\n",
    "        \"Please ensure skillscope_data.db is placed in data/raw/\"\n",
    "    )\n",
    "\n",
    "print(f\"Database found at {db_path}\")\n",
    "conn = sqlite3.connect(db_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d543bebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List all tables and views\n",
    "tables_query = \"\"\"\n",
    "SELECT name, type \n",
    "FROM sqlite_master \n",
    "WHERE type IN ('table', 'view')\n",
    "ORDER BY type, name\n",
    "\"\"\"\n",
    "\n",
    "tables_df = pd.read_sql_query(tables_query, conn)\n",
    "print(\"Available tables and views:\")\n",
    "print(tables_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7a34b6a",
   "metadata": {},
   "source": [
    "## 2. Main Table: nlbse_tool_competition_data_by_issue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48b84322",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get schema for main table\n",
    "schema_query = \"PRAGMA table_info(nlbse_tool_competition_data_by_issue)\"\n",
    "schema_df = pd.read_sql_query(schema_query, conn)\n",
    "print(f\"Table schema ({len(schema_df)} columns):\")\n",
    "print(schema_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13ce8a66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load main table\n",
    "df = pd.read_sql_query(\"SELECT * FROM nlbse_tool_competition_data_by_issue\", conn)\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(f\"Number of rows: {len(df):,}\")\n",
    "print(f\"Number of columns: {len(df.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d60108",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display first few rows\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14ad1eb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column data types\n",
    "print(\"Column data types:\")\n",
    "print(df.dtypes.value_counts())\n",
    "print(\"\\nDetailed types:\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b85d7d79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values\n",
    "missing_data = df.isnull().sum()\n",
    "missing_pct = (missing_data / len(df) * 100).round(2)\n",
    "missing_df = pd.DataFrame({\n",
    "    'missing_count': missing_data,\n",
    "    'missing_percentage': missing_pct\n",
    "})\n",
    "missing_df = missing_df[missing_df['missing_count'] > 0].sort_values('missing_count', ascending=False)\n",
    "print(f\"\\nColumns with missing values ({len(missing_df)} total):\")\n",
    "print(missing_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02932a5b",
   "metadata": {},
   "source": [
    "## 3. Identify Text and Label Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39fac3e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify text columns by examining column names and content\n",
    "potential_text_cols = [col for col in df.columns if any(\n",
    "    keyword in col.lower() for keyword in \n",
    "    ['title', 'body', 'description', 'text', 'message', 'comment', 'summary']\n",
    ")]\n",
    "\n",
    "print(f\"Potential text columns ({len(potential_text_cols)}):\")\n",
    "for col in potential_text_cols:\n",
    "    print(f\" - {col}: {df[col].dtype}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14d69ebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify numeric columns (potential labels)\n",
    "numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()\n",
    "print(f\"\\nNumeric columns ({len(numeric_cols)}):\")\n",
    "print(numeric_cols[:20])  # Show first 20\n",
    "if len(numeric_cols) > 20:\n",
    "    print(f\"... and {len(numeric_cols) - 20} more\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c621e9db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify likely metadata vs label columns\n",
    "metadata_keywords = ['id', 'url', 'date', 'time', 'created', 'updated', 'repo', 'author', 'number']\n",
    "metadata_cols = [col for col in df.columns if any(\n",
    "    keyword in col.lower() for keyword in metadata_keywords\n",
    ")]\n",
    "\n",
    "# Label columns are numeric but not metadata\n",
    "label_cols = [col for col in numeric_cols if col not in metadata_cols]\n",
    "\n",
    "print(f\"\\nIdentified metadata columns ({len(metadata_cols)}):\")\n",
    "print(metadata_cols)\n",
    "\n",
    "print(f\"\\nIdentified label columns ({len(label_cols)}):\")\n",
    "print(label_cols[:20])  # Show first 20\n",
    "if len(label_cols) > 20:\n",
    "    print(f\"... and {len(label_cols) - 20} more\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f197009",
   "metadata": {},
   "source": [
    "## 4. Text Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "633d86dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze text columns\n",
    "for col in potential_text_cols[:5]:  # Analyze first 5 text columns\n",
    "    if col in df.columns:\n",
    "        print(f\"\\n{'='*60}\")\n",
    "        print(f\"Column: {col}\")\n",
    "        print(f\"{'='*60}\")\n",
    "        print(f\"Non-null count: {df[col].notna().sum():,}\")\n",
    "        print(f\"Null count: {df[col].isna().sum():,}\")\n",
    "\n",
    "        if df[col].notna().sum() > 0:\n",
    "            text_lengths = df[col].dropna().astype(str).str.len()\n",
    "            print(f\"\\nText length statistics:\")\n",
    "            print(text_lengths.describe())\n",
    "\n",
    "            print(f\"\\nSample values:\")\n",
    "            samples = df[col].dropna().head(3)\n",
    "            for idx, sample in enumerate(samples, 1):\n",
    "                print(f\"\\n Sample {idx}:\")\n",
    "                print(f\" {str(sample)[:200]}...\" if len(str(sample)) > 200 else f\" {sample}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5b7b815",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize text length distributions.\n",
    "# Filter to plottable columns first: plt.subplots(0, 1) raises when no text\n",
    "# columns were found, and an all-null column has no lengths to histogram.\n",
    "text_cols_to_plot = [col for col in potential_text_cols[:3]\n",
    "                     if col in df.columns and df[col].notna().sum() > 0]\n",
    "\n",
    "if text_cols_to_plot:\n",
    "    fig, axes = plt.subplots(len(text_cols_to_plot), 1, figsize=(12, 4*len(text_cols_to_plot)))\n",
    "\n",
    "    if len(text_cols_to_plot) == 1:\n",
    "        axes = [axes]\n",
    "\n",
    "    for idx, col in enumerate(text_cols_to_plot):\n",
    "        lengths = df[col].dropna().astype(str).str.len()\n",
    "        axes[idx].hist(lengths, bins=50, edgecolor='black')\n",
    "        axes[idx].set_xlabel('Text Length (characters)')\n",
    "        axes[idx].set_ylabel('Frequency')\n",
    "        axes[idx].set_title(f'Distribution of Text Length: {col}')\n",
    "        axes[idx].axvline(lengths.median(), color='red', linestyle='--', label=f'Median: {lengths.median():.0f}')\n",
    "        axes[idx].legend()\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"No non-empty text columns to plot.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ad344fc",
   "metadata": {},
   "source": [
    "## 5. Label Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c81edc27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze label distribution\n",
    "if len(label_cols) > 0:\n",
    "    label_df = df[label_cols]\n",
    "\n",
    "    print(f\"Label statistics:\")\n",
    "    print(f\"Total label columns: {len(label_cols)}\")\n",
    "    print(f\"\\nValue range:\")\n",
    "    print(f\"Min: {label_df.min().min()}\")\n",
    "    print(f\"Max: {label_df.max().max()}\")\n",
    "    print(f\"\\nBasic statistics:\")\n",
    "    print(label_df.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a82a0d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to binary labels (present/absent)\n",
    "if len(label_cols) > 0:\n",
    "    binary_labels = (label_df > 0).astype(int)\n",
    "\n",
    "    # Count labels per issue\n",
    "    labels_per_issue = binary_labels.sum(axis=1)\n",
    "\n",
    "    print(f\"\\nLabels per issue statistics:\")\n",
    "    print(labels_per_issue.describe())\n",
    "\n",
    "    # Count issues per label\n",
    "    issues_per_label = binary_labels.sum(axis=0)\n",
    "\n",
    "    print(f\"\\nIssues per label statistics:\")\n",
    "    print(issues_per_label.describe())\n",
    "\n",
    "    print(f\"\\nTop 10 most common labels:\")\n",
    "    print(issues_per_label.sort_values(ascending=False).head(10))\n",
    "\n",
    "    print(f\"\\nTop 10 rarest labels:\")\n",
    "    print(issues_per_label[issues_per_label > 0].sort_values().head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ec72bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize label distribution.\n",
    "# Guarded like the cells above: labels_per_issue / issues_per_label only\n",
    "# exist when label columns were identified, so an unguarded cell would\n",
    "# raise NameError on Restart & Run All for a dataset without labels.\n",
    "if len(label_cols) > 0:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
    "\n",
    "    # Labels per issue (clamp bin count to >= 1: bins=0 is invalid for hist)\n",
    "    n_bins = max(1, min(50, int(labels_per_issue.max())))\n",
    "    axes[0].hist(labels_per_issue, bins=n_bins, edgecolor='black')\n",
    "    axes[0].set_xlabel('Number of Labels per Issue')\n",
    "    axes[0].set_ylabel('Frequency')\n",
    "    axes[0].set_title('Distribution of Labels per Issue')\n",
    "    axes[0].axvline(labels_per_issue.mean(), color='red', linestyle='--', label=f'Mean: {labels_per_issue.mean():.2f}')\n",
    "    axes[0].axvline(labels_per_issue.median(), color='green', linestyle='--', label=f'Median: {labels_per_issue.median():.0f}')\n",
    "    axes[0].legend()\n",
    "\n",
    "    # Issues per label (log scale)\n",
    "    axes[1].hist(issues_per_label, bins=50, edgecolor='black')\n",
    "    axes[1].set_xlabel('Number of Issues per Label')\n",
    "    axes[1].set_ylabel('Frequency')\n",
    "    axes[1].set_title('Distribution of Issues per Label')\n",
    "    axes[1].set_yscale('log')\n",
    "    axes[1].axvline(issues_per_label.mean(), color='red', linestyle='--', label=f'Mean: {issues_per_label.mean():.2f}')\n",
    "    axes[1].axvline(issues_per_label.median(), color='green', linestyle='--', label=f'Median: {issues_per_label.median():.0f}')\n",
    "    axes[1].legend()\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0b78747",
   "metadata": {},
   "source": [
    "## 6. View: vw_nlbse_tool_competition_data_by_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5605fd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and explore the file-level view\n",
    "file_view_query = \"SELECT * FROM vw_nlbse_tool_competition_data_by_file LIMIT 1000\"\n",
    "file_df = pd.read_sql_query(file_view_query, conn)\n",
    "\n",
    "print(f\"File view shape (first 1000 rows): {file_df.shape}\")\n",
    "print(f\"\\nColumns:\")\n",
    "print(file_df.columns.tolist())\n",
    "print(f\"\\nFirst few rows:\")\n",
    "file_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d34e06c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total count from file view\n",
    "count_query = \"SELECT COUNT(*) as total FROM vw_nlbse_tool_competition_data_by_file\"\n",
    "total_files = pd.read_sql_query(count_query, conn)\n",
    "print(f\"Total records in file view: {total_files['total'].values[0]:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c582768",
   "metadata": {},
   "source": [
    "## 7. Summary and Conclusions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7884ce2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=\" * 60)\n",
    "print(\"DATA EXPLORATION SUMMARY\")\n",
    "print(\"=\" * 60)\n",
    "print(f\"\\nDataset Overview:\")\n",
    "print(f\" - Total issues: {len(df):,}\")\n",
    "print(f\" - Total columns: {len(df.columns)}\")\n",
    "print(f\" - Text columns identified: {len(potential_text_cols)}\")\n",
    "print(f\" - Label columns identified: {len(label_cols)}\")\n",
    "print(f\" - Metadata columns: {len(metadata_cols)}\")\n",
    "\n",
    "if len(label_cols) > 0:\n",
    "    print(f\"\\nLabel Statistics:\")\n",
    "    print(f\" - Avg labels per issue: {labels_per_issue.mean():.2f}\")\n",
    "    print(f\" - Median labels per issue: {labels_per_issue.median():.0f}\")\n",
    "    print(f\" - Max labels per issue: {labels_per_issue.max()}\")\n",
    "    print(f\" - Avg issues per label: {issues_per_label.mean():.2f}\")\n",
    "    print(f\" - Median issues per label: {issues_per_label.median():.0f}\")\n",
    "    print(f\" - Labels with no issues: {(issues_per_label == 0).sum()}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1440cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save column information for reference\n",
    "column_info = pd.DataFrame({\n",
    "    'column_name': df.columns,\n",
    "    'dtype': df.dtypes.values,\n",
    "    'null_count': df.isnull().sum().values,\n",
    "    'null_pct': (df.isnull().sum() / len(df) * 100).values\n",
    "})\n",
    "\n",
    "output_path = Path('../reports/column_info.csv')\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "column_info.to_csv(output_path, index=False)\n",
    "print(f\"\\nColumn information saved to {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dd24d0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close database connection\n",
    "conn.close()\n",
    "print(\"\\nDatabase connection closed.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}