{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "b20a65b1",
   "metadata": {},
   "source": [
    "# Initial Data Exploration\n",
    "\n",
    "This notebook explores the SkillScope dataset to understand:\n",
    "- Database structure and schema\n",
    "- Available tables and columns\n",
    "- Data types and distributions\n",
    "- Label characteristics and statistics\n",
    "- Text field content and quality"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "adad083b",
   "metadata": {},
   "outputs": [],
   "source": [
    "import sqlite3\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "from pathlib import Path\n",
    "\n",
    "# Set display options\n",
    "pd.set_option('display.max_columns', None)\n",
    "pd.set_option('display.max_rows', 100)\n",
    "pd.set_option('display.width', None)\n",
    "\n",
    "# Set visualization style\n",
    "sns.set_style('whitegrid')\n",
    "plt.rcParams['figure.figsize'] = (12, 6)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f23f7b74",
   "metadata": {},
   "source": [
    "## 1. Database Connection and Schema Exploration"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "72dc4acc",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Connect to the SQLite database; fail fast if the file is missing.\n",
    "# NOTE: sqlite3.connect() silently CREATES a new empty database when the\n",
    "# path does not exist, which would leave a junk file on disk and make every\n",
    "# later query fail with confusing 'no such table' errors.\n",
    "db_path = Path('../data/raw/skillscope_data.db')\n",
    "\n",
    "if not db_path.exists():\n",
    "    raise FileNotFoundError(\n",
    "        f\"Database not found at {db_path}. \"\n",
    "        \"Please ensure skillscope_data.db is placed in data/raw/\"\n",
    "    )\n",
    "\n",
    "print(f\"Database found at {db_path}\")\n",
    "conn = sqlite3.connect(db_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d543bebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# List all tables and views\n",
    "tables_query = \"\"\"\n",
    "SELECT name, type \n",
    "FROM sqlite_master \n",
    "WHERE type IN ('table', 'view')\n",
    "ORDER BY type, name\n",
    "\"\"\"\n",
    "\n",
    "tables_df = pd.read_sql_query(tables_query, conn)\n",
    "print(\"Available tables and views:\")\n",
    "print(tables_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "d7a34b6a",
   "metadata": {},
   "source": [
    "## 2. Main Table: nlbse_tool_competition_data_by_issue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "48b84322",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get schema for main table\n",
    "schema_query = \"PRAGMA table_info(nlbse_tool_competition_data_by_issue)\"\n",
    "schema_df = pd.read_sql_query(schema_query, conn)\n",
    "print(f\"Table schema ({len(schema_df)} columns):\")\n",
    "print(schema_df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "13ce8a66",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load main table\n",
    "df = pd.read_sql_query(\"SELECT * FROM nlbse_tool_competition_data_by_issue\", conn)\n",
    "print(f\"Dataset shape: {df.shape}\")\n",
    "print(f\"Number of rows: {len(df):,}\")\n",
    "print(f\"Number of columns: {len(df.columns)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05d60108",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Display first few rows\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14ad1eb7",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Column data types\n",
    "print(\"Column data types:\")\n",
    "print(df.dtypes.value_counts())\n",
    "print(\"\\nDetailed types:\")\n",
    "print(df.dtypes)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b85d7d79",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Check for missing values\n",
    "missing_data = df.isnull().sum()\n",
    "missing_pct = (missing_data / len(df) * 100).round(2)\n",
    "missing_df = pd.DataFrame({\n",
    "    'missing_count': missing_data,\n",
    "    'missing_percentage': missing_pct\n",
    "})\n",
    "missing_df = missing_df[missing_df['missing_count'] > 0].sort_values('missing_count', ascending=False)\n",
    "print(f\"\\nColumns with missing values ({len(missing_df)} total):\")\n",
    "print(missing_df)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "02932a5b",
   "metadata": {},
   "source": [
    "## 3. Identify Text and Label Columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "39fac3e3",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify text columns by examining column names and content\n",
    "potential_text_cols = [col for col in df.columns if any(\n",
    "    keyword in col.lower() for keyword in \n",
    "    ['title', 'body', 'description', 'text', 'message', 'comment', 'summary']\n",
    ")]\n",
    "\n",
    "print(f\"Potential text columns ({len(potential_text_cols)}):\")\n",
    "for col in potential_text_cols:\n",
    "    print(f\" - {col}: {df[col].dtype}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "14d69ebb",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify numeric columns (potential labels)\n",
    "numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()\n",
    "print(f\"\\nNumeric columns ({len(numeric_cols)}):\")\n",
    "print(numeric_cols[:20])  # Show first 20\n",
    "if len(numeric_cols) > 20:\n",
    "    print(f\"... and {len(numeric_cols) - 20} more\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c621e9db",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Identify likely metadata vs label columns\n",
    "metadata_keywords = ['id', 'url', 'date', 'time', 'created', 'updated', 'repo', 'author', 'number']\n",
    "metadata_cols = [col for col in df.columns if any(\n",
    "    keyword in col.lower() for keyword in metadata_keywords\n",
    ")]\n",
    "\n",
    "# Label columns are numeric but not metadata\n",
    "label_cols = [col for col in numeric_cols if col not in metadata_cols]\n",
    "\n",
    "print(f\"\\nIdentified metadata columns ({len(metadata_cols)}):\")\n",
    "print(metadata_cols)\n",
    "\n",
    "print(f\"\\nIdentified label columns ({len(label_cols)}):\")\n",
    "print(label_cols[:20])  # Show first 20\n",
    "if len(label_cols) > 20:\n",
    "    print(f\"... and {len(label_cols) - 20} more\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "8f197009",
   "metadata": {},
   "source": [
    "## 4. Text Data Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "633d86dd",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze text columns\n",
    "for col in potential_text_cols[:5]:  # Analyze first 5 text columns\n",
    "    if col in df.columns:\n",
    "        print(f\"\\n{'='*60}\")\n",
    "        print(f\"Column: {col}\")\n",
    "        print(f\"{'='*60}\")\n",
    "        print(f\"Non-null count: {df[col].notna().sum():,}\")\n",
    "        print(f\"Null count: {df[col].isna().sum():,}\")\n",
    "\n",
    "        if df[col].notna().sum() > 0:\n",
    "            text_lengths = df[col].dropna().astype(str).str.len()\n",
    "            print(f\"\\nText length statistics:\")\n",
    "            print(text_lengths.describe())\n",
    "\n",
    "            print(f\"\\nSample values:\")\n",
    "            samples = df[col].dropna().head(3)\n",
    "            for idx, sample in enumerate(samples, 1):\n",
    "                print(f\"\\n Sample {idx}:\")\n",
    "                print(f\" {str(sample)[:200]}...\" if len(str(sample)) > 200 else f\" {sample}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f5b7b815",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize text length distributions.\n",
    "# Filter to plottable columns first: plt.subplots(0, 1) raises when no text\n",
    "# columns were found, and an all-null column has no lengths to histogram.\n",
    "text_cols_to_plot = [col for col in potential_text_cols[:3]\n",
    "                     if col in df.columns and df[col].notna().sum() > 0]\n",
    "\n",
    "if text_cols_to_plot:\n",
    "    fig, axes = plt.subplots(len(text_cols_to_plot), 1, figsize=(12, 4*len(text_cols_to_plot)))\n",
    "\n",
    "    if len(text_cols_to_plot) == 1:\n",
    "        axes = [axes]\n",
    "\n",
    "    for idx, col in enumerate(text_cols_to_plot):\n",
    "        lengths = df[col].dropna().astype(str).str.len()\n",
    "        axes[idx].hist(lengths, bins=50, edgecolor='black')\n",
    "        axes[idx].set_xlabel('Text Length (characters)')\n",
    "        axes[idx].set_ylabel('Frequency')\n",
    "        axes[idx].set_title(f'Distribution of Text Length: {col}')\n",
    "        axes[idx].axvline(lengths.median(), color='red', linestyle='--', label=f'Median: {lengths.median():.0f}')\n",
    "        axes[idx].legend()\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()\n",
    "else:\n",
    "    print(\"No non-empty text columns to plot.\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "4ad344fc",
   "metadata": {},
   "source": [
    "## 5. Label Analysis"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c81edc27",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Analyze label distribution\n",
    "if len(label_cols) > 0:\n",
    "    label_df = df[label_cols]\n",
    "\n",
    "    print(f\"Label statistics:\")\n",
    "    print(f\"Total label columns: {len(label_cols)}\")\n",
    "    print(f\"\\nValue range:\")\n",
    "    print(f\"Min: {label_df.min().min()}\")\n",
    "    print(f\"Max: {label_df.max().max()}\")\n",
    "    print(f\"\\nBasic statistics:\")\n",
    "    print(label_df.describe())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "6a82a0d8",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Convert to binary labels (present/absent)\n",
    "if len(label_cols) > 0:\n",
    "    binary_labels = (label_df > 0).astype(int)\n",
    "\n",
    "    # Count labels per issue\n",
    "    labels_per_issue = binary_labels.sum(axis=1)\n",
    "\n",
    "    print(f\"\\nLabels per issue statistics:\")\n",
    "    print(labels_per_issue.describe())\n",
    "\n",
    "    # Count issues per label\n",
    "    issues_per_label = binary_labels.sum(axis=0)\n",
    "\n",
    "    print(f\"\\nIssues per label statistics:\")\n",
    "    print(issues_per_label.describe())\n",
    "\n",
    "    print(f\"\\nTop 10 most common labels:\")\n",
    "    print(issues_per_label.sort_values(ascending=False).head(10))\n",
    "\n",
    "    print(f\"\\nTop 10 rarest labels:\")\n",
    "    print(issues_per_label[issues_per_label > 0].sort_values().head(10))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f4ec72bf",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualize label distribution.\n",
    "# Guarded like the cells above: labels_per_issue / issues_per_label only\n",
    "# exist when label columns were identified, so an unguarded cell would\n",
    "# raise NameError on Restart & Run All for a dataset without labels.\n",
    "if len(label_cols) > 0:\n",
    "    fig, axes = plt.subplots(1, 2, figsize=(16, 6))\n",
    "\n",
    "    # Labels per issue (clamp bin count to >= 1: bins=0 is invalid for hist)\n",
    "    n_bins = max(1, min(50, int(labels_per_issue.max())))\n",
    "    axes[0].hist(labels_per_issue, bins=n_bins, edgecolor='black')\n",
    "    axes[0].set_xlabel('Number of Labels per Issue')\n",
    "    axes[0].set_ylabel('Frequency')\n",
    "    axes[0].set_title('Distribution of Labels per Issue')\n",
    "    axes[0].axvline(labels_per_issue.mean(), color='red', linestyle='--', label=f'Mean: {labels_per_issue.mean():.2f}')\n",
    "    axes[0].axvline(labels_per_issue.median(), color='green', linestyle='--', label=f'Median: {labels_per_issue.median():.0f}')\n",
    "    axes[0].legend()\n",
    "\n",
    "    # Issues per label (log scale)\n",
    "    axes[1].hist(issues_per_label, bins=50, edgecolor='black')\n",
    "    axes[1].set_xlabel('Number of Issues per Label')\n",
    "    axes[1].set_ylabel('Frequency')\n",
    "    axes[1].set_title('Distribution of Issues per Label')\n",
    "    axes[1].set_yscale('log')\n",
    "    axes[1].axvline(issues_per_label.mean(), color='red', linestyle='--', label=f'Mean: {issues_per_label.mean():.2f}')\n",
    "    axes[1].axvline(issues_per_label.median(), color='green', linestyle='--', label=f'Median: {issues_per_label.median():.0f}')\n",
    "    axes[1].legend()\n",
    "\n",
    "    plt.tight_layout()\n",
    "    plt.show()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b0b78747",
   "metadata": {},
   "source": [
    "## 6. View: vw_nlbse_tool_competition_data_by_file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "b5605fd5",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load and explore the file-level view\n",
    "file_view_query = \"SELECT * FROM vw_nlbse_tool_competition_data_by_file LIMIT 1000\"\n",
    "file_df = pd.read_sql_query(file_view_query, conn)\n",
    "\n",
    "print(f\"File view shape (first 1000 rows): {file_df.shape}\")\n",
    "print(f\"\\nColumns:\")\n",
    "print(file_df.columns.tolist())\n",
    "print(f\"\\nFirst few rows:\")\n",
    "file_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d34e06c9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Get total count from file view\n",
    "count_query = \"SELECT COUNT(*) as total FROM vw_nlbse_tool_competition_data_by_file\"\n",
    "total_files = pd.read_sql_query(count_query, conn)\n",
    "print(f\"Total records in file view: {total_files['total'].values[0]:,}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9c582768",
   "metadata": {},
   "source": [
    "## 7. Summary and Conclusions"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7884ce2d",
   "metadata": {},
   "outputs": [],
   "source": [
    "print(\"=\" * 60)\n",
    "print(\"DATA EXPLORATION SUMMARY\")\n",
    "print(\"=\" * 60)\n",
    "print(f\"\\nDataset Overview:\")\n",
    "print(f\" - Total issues: {len(df):,}\")\n",
    "print(f\" - Total columns: {len(df.columns)}\")\n",
    "print(f\" - Text columns identified: {len(potential_text_cols)}\")\n",
    "print(f\" - Label columns identified: {len(label_cols)}\")\n",
    "print(f\" - Metadata columns: {len(metadata_cols)}\")\n",
    "\n",
    "if len(label_cols) > 0:\n",
    "    print(f\"\\nLabel Statistics:\")\n",
    "    print(f\" - Avg labels per issue: {labels_per_issue.mean():.2f}\")\n",
    "    print(f\" - Median labels per issue: {labels_per_issue.median():.0f}\")\n",
    "    print(f\" - Max labels per issue: {labels_per_issue.max()}\")\n",
    "    print(f\" - Avg issues per label: {issues_per_label.mean():.2f}\")\n",
    "    print(f\" - Median issues per label: {issues_per_label.median():.0f}\")\n",
    "    print(f\" - Labels with no issues: {(issues_per_label == 0).sum()}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a1440cc9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save column information for reference\n",
    "column_info = pd.DataFrame({\n",
    "    'column_name': df.columns,\n",
    "    'dtype': df.dtypes.values,\n",
    "    'null_count': df.isnull().sum().values,\n",
    "    'null_pct': (df.isnull().sum() / len(df) * 100).values\n",
    "})\n",
    "\n",
    "output_path = Path('../reports/column_info.csv')\n",
    "output_path.parent.mkdir(parents=True, exist_ok=True)\n",
    "column_info.to_csv(output_path, index=False)\n",
    "print(f\"\\nColumn information saved to {output_path}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "1dd24d0f",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Close database connection\n",
    "conn.close()\n",
    "print(\"\\nDatabase connection closed.\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}