Spaces:
Sleeping
Sleeping
| """Generate a small synthetic dataset in CodeSearchNet's schema. | |
| This lets you test the full pipeline (clean -> EDA -> later phases) without | |
| downloading the real ~2M-row dataset. The schema matches the HuggingFace | |
| `code_search_net` columns we actually use downstream. | |
| """ | |
| from __future__ import annotations | |
| import random | |
| import pandas as pd | |
# Well-formed (docstring, code) templates that are meant to survive cleaning.
# Each code sample is spelled one source line per literal so the embedded
# indentation is visible: the snippets must stay syntactically valid Python
# (4-space indents), otherwise any downstream step that parses or tokenizes
# them would see broken code.
_GOOD: list[tuple[str, str]] = [
    (
        "Return the factorial of a non-negative integer n.",
        "def factorial(n):\n"
        "    if n < 0:\n"
        "        raise ValueError('n must be >= 0')\n"
        "    result = 1\n"
        "    for i in range(2, n + 1):\n"
        "        result *= i\n"
        "    return result",
    ),
    (
        "Compute the nth Fibonacci number using iteration.",
        "def fib(n):\n"
        "    a, b = 0, 1\n"
        "    for _ in range(n):\n"
        "        a, b = b, a + b\n"
        "    return a",
    ),
    (
        "Check whether a given string is a palindrome, ignoring case.",
        "def is_palindrome(s):\n"
        "    s = s.lower()\n"
        "    return s == s[::-1]",
    ),
    (
        "Read a JSON file from disk and return the parsed dictionary.",
        "import json\n"
        "\n"
        "def read_json(path):\n"
        "    with open(path) as f:\n"
        "        return json.load(f)",
    ),
    (
        "Return a list of unique elements preserving original order.",
        "def dedupe(items):\n"
        "    seen = set()\n"
        "    out = []\n"
        "    for x in items:\n"
        "        if x not in seen:\n"
        "            seen.add(x)\n"
        "            out.append(x)\n"
        "    return out",
    ),
    (
        "Flatten a nested list of arbitrary depth into a single list.",
        "def flatten(lst):\n"
        "    out = []\n"
        "    for x in lst:\n"
        "        if isinstance(x, list):\n"
        "            out.extend(flatten(x))\n"
        "        else:\n"
        "            out.append(x)\n"
        "    return out",
    ),
]
# Deliberately bad (docstring, code) pairs. Each entry is labelled with the
# reason the cleaning rules are expected to drop it.
_DIRTY: list[tuple[str, str]] = [
    # empty docstring
    ("", "def noop():\n pass"),
    # too-short docstring
    ("ok", "def f():\n return 1"),
    # blocklisted
    ("TODO: write this later", "def g():\n pass"),
    # blocklisted
    ("auto-generated do not edit", "def h():\n pass"),
    # too-short code
    ("Returns x.", "x"),
    # non-ascii doc
    ("说明:返回输入值的两倍。", "def dbl(x):\n return x * 2"),
]
def _first_def_name(code: str, fallback: str) -> str:
    """Return the name of the first ``def`` found in *code*, else *fallback*."""
    for line in code.splitlines():
        stripped = line.lstrip()
        if stripped.startswith("def "):
            name = stripped[len("def "):].split("(", 1)[0].strip()
            return name or fallback
    return fallback


def make_sample(
    n: int = 200,
    seed: int = 42,
    *,
    dirty_frac: float = 0.15,
) -> pd.DataFrame:
    """Build a DataFrame with the CodeSearchNet columns we rely on.

    Args:
        n: Number of rows to generate. ``n <= 0`` yields an empty frame that
            still carries every column.
        seed: Seed for the private ``random.Random`` instance used for all
            draws, so the same ``(n, seed, dirty_frac)`` gives the same rows.
        dirty_frac: Per-row probability of drawing the template from
            ``_DIRTY`` instead of ``_GOOD``.

    Returns:
        A DataFrame with one row per sample and the columns
        ``repository_name``, ``func_path_in_repository``, ``func_name``,
        ``language``, ``func_code_string``, ``func_documentation_string``
        and ``func_code_url``.
    """
    # Passing the column list explicitly keeps the schema stable even when
    # no rows are generated (an empty list of dicts has no columns).
    columns = [
        "repository_name",
        "func_path_in_repository",
        "func_name",
        "language",
        "func_code_string",
        "func_documentation_string",
        "func_code_url",
    ]
    rng = random.Random(seed)
    rows = []
    for i in range(n):
        # One uniform draw picks the pool, a second picks the template.
        pool = _DIRTY if rng.random() < dirty_frac else _GOOD
        doc, code = rng.choice(pool)
        repo = f"acme/repo{i % 10}"
        path = f"src/module_{i}.py"
        rows.append(
            {
                "repository_name": repo,
                "func_path_in_repository": path,
                # Look past leading imports so samples such as
                # "import json\n\ndef read_json(...)" are named correctly;
                # code with no def at all gets a synthetic symbol name.
                "func_name": _first_def_name(code, f"sym_{i}"),
                "language": "python",
                "func_code_string": code,
                "func_documentation_string": doc,
                "func_code_url": f"https://github.com/{repo}/blob/main/{path}",
            }
        )
    return pd.DataFrame(rows, columns=columns)
if __name__ == "__main__":
    # Smoke run: build the default sample, then show its shape and the
    # first three rows.
    sample = make_sample()
    print(sample.shape)
    preview = sample.head(3)
    print(preview.to_string())