{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\u001b[2mUsing Python 3.12.11 environment at: /home/zeus/miniconda3/envs/cloudspace\u001b[0m\n",
      "\u001b[2mAudited \u001b[1m1 package\u001b[0m \u001b[2min 75ms\u001b[0m\u001b[0m\n"
     ]
    }
   ],
   "source": [
    "!uv pip install datasets"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Warning: You are sending unauthenticated requests to the HF Hub. Please set a HF_TOKEN to enable higher rate limits and faster downloads.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset from Hugging Face...\n",
      "Loaded 2641 problems from Hugging Face.\n",
      "Sample question_ids: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n",
      "Loaded 3888 explanations from local file.\n",
      "After filtering English only: 2773 problems remain.\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from datasets import load_dataset\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Step 1: fetch the problem set from the Hugging Face Hub\n",
    "# ------------------------------------------------------------\n",
    "print(\"Loading dataset from Hugging Face...\")\n",
    "# Only the \"train\" split is read here; pass split=\"test\" for the test split.\n",
    "dataset = load_dataset(\"newfacade/LeetCodeDataset\", split=\"train\")\n",
    "\n",
    "# Copy the question_id column into a plain Python list.\n",
    "question_ids = [qid for qid in dataset[\"question_id\"]]\n",
    "\n",
    "n_problems = len(question_ids)\n",
    "print(f\"Loaded {n_problems} problems from Hugging Face.\")\n",
    "print(f\"Sample question_ids: {question_ids[:10]}\")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Step 2: read the local explanations.json\n",
    "# ------------------------------------------------------------\n",
    "with open(\"explanations.json\", encoding=\"utf-8\") as explanations_file:\n",
    "    explanations_data = json.load(explanations_file)\n",
    "\n",
    "n_explanations = len(explanations_data)\n",
    "print(f\"Loaded {n_explanations} explanations from local file.\")\n",
    "\n",
    "# ------------------------------------------------------------\n",
    "# Step 3: keep only the entries flagged as English\n",
    "# ------------------------------------------------------------\n",
    "# The flag must be exactly True; a missing key or any other value is dropped.\n",
    "filtered_explanations = []\n",
    "for entry in explanations_data:\n",
    "    if entry.get(\"is_english\") is True:\n",
    "        filtered_explanations.append(entry)\n",
    "\n",
    "n_english = len(filtered_explanations)\n",
    "print(f\"After filtering English only: {n_english} problems remain.\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loading dataset from Hugging Face...\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded 2641 problems from Hugging Face.\n",
      "Sample HF question_ids: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]\n",
      "\n",
      "Loaded 3888 explanations from local file.\n",
      "After filtering English only: 2773 problems remain.\n",
      "\n",
      "📊 Summary:\n",
      "   Total in HF Dataset     : 2641\n",
      "   English Explanations    : 2773\n",
      "   Common Problems         : 2044\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "from datasets import load_dataset\n",
    "\n",
    "# --- 1. Problems from the Hugging Face Hub ---------------------\n",
    "\n",
    "print(\"Loading dataset from Hugging Face...\")\n",
    "dataset = load_dataset(\"newfacade/LeetCodeDataset\", split=\"train\")\n",
    "\n",
    "# Unpack the question_id column into a plain list.\n",
    "hf_question_ids = [*dataset[\"question_id\"]]\n",
    "\n",
    "print(f\"Loaded {len(hf_question_ids)} problems from Hugging Face.\")\n",
    "print(f\"Sample HF question_ids: {hf_question_ids[:10]}\")\n",
    "\n",
    "# Write the IDs to disk as an indented JSON array.\n",
    "with open(\"hf_question_ids.json\", \"w\") as ids_file:\n",
    "    ids_file.write(json.dumps(hf_question_ids, indent=2))\n",
    "\n",
    "# --- 2. Local explanations.json --------------------------------\n",
    "\n",
    "with open(\"explanations.json\", \"r\", encoding=\"utf-8\") as explanations_file:\n",
    "    explanations_data = json.load(explanations_file)\n",
    "\n",
    "print(f\"\\nLoaded {len(explanations_data)} explanations from local file.\")\n",
    "\n",
    "# --- 3. English-only subset ------------------------------------\n",
    "\n",
    "# Only entries whose is_english flag is exactly True survive.\n",
    "filtered_explanations = list(\n",
    "    filter(lambda entry: entry.get(\"is_english\") is True, explanations_data)\n",
    ")\n",
    "\n",
    "# One problem_id per English entry (repeated ids, if any, are kept).\n",
    "english_problem_ids = [entry[\"problem_id\"] for entry in filtered_explanations]\n",
    "\n",
    "print(f\"After filtering English only: {len(filtered_explanations)} problems remain.\")\n",
    "\n",
    "# --- 4. IDs present in both sources ----------------------------\n",
    "\n",
    "hf_set = set(hf_question_ids)\n",
    "english_set = set(english_problem_ids)\n",
    "\n",
    "# Set intersection, returned in ascending order.\n",
    "common_ids = sorted(hf_set & english_set)\n",
    "\n",
    "print(\"\\n📊 Summary:\")\n",
    "print(f\"   Total in HF Dataset     : {len(hf_question_ids)}\")\n",
    "print(f\"   English Explanations    : {len(english_problem_ids)}\")\n",
    "print(f\"   Common Problems         : {len(common_ids)}\")\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Merged 2044 records\n",
      "Saved to dataset.jsonl\n",
      "\n",
      "Sample record:\n",
      "{\n",
      "  \"problem_id\": 1,\n",
      "  \"task_id\": \"two-sum\",\n",
      "  \"difficulty\": \"Easy\",\n",
      "  \"tags\": [\n",
      "    \"Array\",\n",
      "    \"Hash Table\"\n",
      "  ],\n",
      "  \"problem_description\": \"Given an array of integers nums and an integer target, return indices of the two numbers such that they add up to target.\\nYou may assume that each input would have exactly one solution, and you may not use the same element twice.\\nYou can return the answer in any order.\\n \\nExample 1:\\n\\nInput: nums = [2,7,11,15], target = 9\\nOutput: [0,1]\\nExplanation: B...\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "# ========================\n",
    "# 5. Build merged dataset\n",
    "# ========================\n",
    "# NOTE: this cell reads `dataset`, `filtered_explanations` and `common_ids`,\n",
    "# none of which it defines; they must already exist in the kernel.\n",
    "\n",
    "# question_id -> HF row. If an id repeats, the later row replaces the earlier one.\n",
    "hf_lookup = {item['question_id']: item for item in dataset}\n",
    "\n",
    "# problem_id -> English explanation entry (same last-one-wins behaviour).\n",
    "expl_lookup = {item['problem_id']: item for item in filtered_explanations}\n",
    "\n",
    "# Merge for common IDs\n",
    "merged_records = []\n",
    "for pid in common_ids:\n",
    "    hf_item = hf_lookup.get(pid)\n",
    "    expl_item = expl_lookup.get(pid)\n",
    "    if not (hf_item and expl_item):\n",
    "        continue\n",
    "\n",
    "    # `explanations` can be missing or null; fall back to an empty mapping so\n",
    "    # the lookup below cannot raise TypeError on a null value.\n",
    "    explanations = expl_item.get('explanations') or {}\n",
    "\n",
    "    # Explanation for solution \"1\", or empty string if missing/null\n",
    "    explanation = explanations.get('1') or \"\"\n",
    "\n",
    "    # time_complexity, or empty string if missing/null\n",
    "    time_complexity = expl_item.get('time_complexity') or \"\"\n",
    "\n",
    "    merged_records.append({\n",
    "        'problem_id': pid,\n",
    "        'task_id': hf_item['task_id'],\n",
    "        'difficulty': hf_item['difficulty'],\n",
    "        'tags': hf_item['tags'],\n",
    "        'problem_description': hf_item['problem_description'],\n",
    "        'time_complexity': time_complexity,\n",
    "        'explanation': explanation\n",
    "    })\n",
    "\n",
    "print(f\"Merged {len(merged_records)} records\")\n",
    "\n",
    "# ========================\n",
    "# 6. Save to JSONL\n",
    "# ========================\n",
    "\n",
    "# One JSON object per line; ensure_ascii=False keeps non-ASCII text readable.\n",
    "output_path = \"dataset.jsonl\"\n",
    "with open(output_path, 'w', encoding='utf-8') as f:\n",
    "    for record in merged_records:\n",
    "        f.write(json.dumps(record, ensure_ascii=False) + '\\n')\n",
    "\n",
    "print(f\"Saved to {output_path}\")\n",
    "\n",
    "# Preview (first 500 characters of the first record); skipped when the merge\n",
    "# produced nothing, so an empty result does not raise IndexError.\n",
    "if merged_records:\n",
    "    print(\"\\nSample record:\")\n",
    "    print(json.dumps(merged_records[0], indent=2, ensure_ascii=False)[:500] + \"...\")\n",
    "else:\n",
    "    print(\"\\nNo merged records to preview.\")"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}