"""Generate a small synthetic dataset in CodeSearchNet's schema.

This lets you test the full pipeline (clean -> EDA -> later phases) without
downloading the real ~2M-row dataset. The schema matches the HuggingFace
`code_search_net` columns we actually use downstream.
"""

from __future__ import annotations

import random

import pandas as pd

# A handful of realistic (docstring, code) templates plus some deliberately
# "dirty" rows so the cleaning step has something to remove.
#
# Every code template below is a complete, syntactically valid Python snippet:
# nested bodies use 4-space indentation, written one source line per string
# fragment so the structure of each snippet is visible at a glance.
_GOOD = [
    (
        "Return the factorial of a non-negative integer n.",
        "def factorial(n):\n"
        "    if n < 0:\n"
        "        raise ValueError('n must be >= 0')\n"
        "    result = 1\n"
        "    for i in range(2, n + 1):\n"
        "        result *= i\n"
        "    return result",
    ),
    (
        "Compute the nth Fibonacci number using iteration.",
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a",
    ),
    (
        "Check whether a given string is a palindrome, ignoring case.",
        "def is_palindrome(s):\n"
        "    s = s.lower()\n"
        "    return s == s[::-1]",
    ),
    (
        "Read a JSON file from disk and return the parsed dictionary.",
        "import json\n"
        "\n"
        "def read_json(path):\n"
        "    with open(path) as f:\n"
        "        return json.load(f)",
    ),
    (
        "Return a list of unique elements preserving original order.",
        "def dedupe(items):\n"
        "    seen = set()\n"
        "    out = []\n"
        "    for x in items:\n"
        "        if x not in seen:\n"
        "            seen.add(x)\n"
        "            out.append(x)\n"
        "    return out",
    ),
    (
        "Flatten a nested list of arbitrary depth into a single list.",
        "def flatten(lst):\n"
        "    out = []\n"
        "    for x in lst:\n"
        "        if isinstance(x, list):\n"
        "            out.extend(flatten(x))\n"
        "        else:\n"
        "            out.append(x)\n"
        "    return out",
    ),
]

# Rows that should be filtered out by the cleaning rules.
_DIRTY = [
    ("", "def noop():\n pass"),  # empty docstring
    ("ok", "def f():\n return 1"),  # too-short docstring
    ("TODO: write this later", "def g():\n pass"),  # blocklisted
    ("auto-generated do not edit", "def h():\n pass"),  # blocklisted
    ("Returns x.", "x"),  # too-short code
    ("说明:返回输入值的两倍。", "def dbl(x):\n return x * 2"),  # non-ascii doc
]

# Column order of the generated frame. Passed explicitly to the DataFrame
# constructor so that an empty sample still carries the full schema.
_COLUMNS = [
    "repository_name",
    "func_path_in_repository",
    "func_name",
    "language",
    "func_code_string",
    "func_documentation_string",
    "func_code_url",
]


def _func_name(code: str, fallback: str) -> str:
    """Return the name of the first ``def`` found in *code*, else *fallback*.

    The snippet is scanned line by line, so a function that follows a
    preamble (for example an ``import`` statement) is still found.
    """
    for line in code.splitlines():
        stripped = line.lstrip()
        if stripped.startswith("def "):
            name = stripped[len("def "):].split("(", 1)[0].strip()
            if name:
                return name
    return fallback


def make_sample(
    n: int = 200, seed: int = 42, *, dirty_ratio: float = 0.15
) -> pd.DataFrame:
    """Build a DataFrame with the CodeSearchNet columns we rely on.

    Args:
        n: Number of rows to generate. ``0`` (or a negative value) yields an
            empty frame that still has all schema columns.
        seed: Seed for the private random generator; the same arguments
            always produce the same frame.
        dirty_ratio: Probability, in ``[0, 1]``, that a row is drawn from the
            dirty pool instead of the good pool. Defaults to ~15% so the
            cleaning step has work to do.

    Returns:
        A DataFrame with one row per generated function and the columns
        listed in ``_COLUMNS``.

    Raises:
        ValueError: If ``dirty_ratio`` is outside ``[0, 1]``.
    """
    if not 0.0 <= dirty_ratio <= 1.0:
        raise ValueError(f"dirty_ratio must be within [0, 1], got {dirty_ratio!r}")

    rng = random.Random(seed)
    rows = []
    for i in range(n):
        # Draw order (random() then choice()) is kept stable so existing
        # seeds keep selecting the same templates.
        pool = _DIRTY if rng.random() < dirty_ratio else _GOOD
        doc, code = rng.choice(pool)
        repo = f"acme/repo{i % 10}"
        path = f"src/module_{i}.py"
        rows.append(
            {
                "repository_name": repo,
                "func_path_in_repository": path,
                "func_name": _func_name(code, f"sym_{i}"),
                "language": "python",
                "func_code_string": code,
                "func_documentation_string": doc,
                "func_code_url": f"https://github.com/{repo}/blob/main/{path}",
            }
        )
    return pd.DataFrame(rows, columns=_COLUMNS)


if __name__ == "__main__":
    df = make_sample()
    print(df.shape)
    print(df.head(3).to_string())