Spaces:
Sleeping
Sleeping
File size: 3,452 Bytes
"""Generate a small synthetic dataset in CodeSearchNet's schema.

This lets you test the full pipeline (clean -> EDA -> later phases) without
downloading the real ~2M-row dataset. The schema matches the HuggingFace
`code_search_net` columns we actually use downstream.
"""
from __future__ import annotations
import random
import pandas as pd
# A handful of realistic (docstring, code) templates plus some deliberately
# "dirty" rows so the cleaning step has something to remove.
# Each entry is a (docstring, code) pair. The code is written as one string
# fragment per source line; adjacent literals are concatenated by Python.
_GOOD = [
    (
        "Return the factorial of a non-negative integer n.",
        "def factorial(n):\n"
        " if n < 0:\n"
        " raise ValueError('n must be >= 0')\n"
        " result = 1\n"
        " for i in range(2, n + 1):\n"
        " result *= i\n"
        " return result",
    ),
    (
        "Compute the nth Fibonacci number using iteration.",
        "def fib(n):\n"
        " a, b = 0, 1\n"
        " for _ in range(n):\n"
        " a, b = b, a + b\n"
        " return a",
    ),
    (
        "Check whether a given string is a palindrome, ignoring case.",
        "def is_palindrome(s):\n"
        " s = s.lower()\n"
        " return s == s[::-1]",
    ),
    (
        "Read a JSON file from disk and return the parsed dictionary.",
        "import json\n"
        "\n"
        "def read_json(path):\n"
        " with open(path) as f:\n"
        " return json.load(f)",
    ),
    (
        "Return a list of unique elements preserving original order.",
        "def dedupe(items):\n"
        " seen = set()\n"
        " out = []\n"
        " for x in items:\n"
        " if x not in seen:\n"
        " seen.add(x)\n"
        " out.append(x)\n"
        " return out",
    ),
    (
        "Flatten a nested list of arbitrary depth into a single list.",
        "def flatten(lst):\n"
        " out = []\n"
        " for x in lst:\n"
        " if isinstance(x, list):\n"
        " out.extend(flatten(x))\n"
        " else:\n"
        " out.append(x)\n"
        " return out",
    ),
]
# Negative examples: each (docstring, code) pair below is meant to be dropped
# by the cleaning rules, for the reason noted above it.
_DIRTY = [
    # empty docstring
    (
        "",
        "def noop():\n pass",
    ),
    # too-short docstring
    (
        "ok",
        "def f():\n return 1",
    ),
    # blocklisted
    (
        "TODO: write this later",
        "def g():\n pass",
    ),
    # blocklisted
    (
        "auto-generated do not edit",
        "def h():\n pass",
    ),
    # too-short code
    (
        "Returns x.",
        "x",
    ),
    # non-ascii doc
    (
        "说明:返回输入值的两倍。",
        "def dbl(x):\n return x * 2",
    ),
]
# Column order of the CodeSearchNet subset emitted by make_sample(). Passing it
# explicitly keeps the schema intact even when no rows are generated.
_COLUMNS = (
    "repository_name",
    "func_path_in_repository",
    "func_name",
    "language",
    "func_code_string",
    "func_documentation_string",
    "func_code_url",
)


def _first_def_name(code: str, fallback: str) -> str:
    """Return the name of the first ``def`` found in *code*, else *fallback*."""
    for line in code.splitlines():
        stripped = line.lstrip()
        if stripped.startswith("def "):
            # Everything between "def " and the opening parenthesis.
            return stripped[len("def "):].split("(", 1)[0].strip()
    return fallback


def make_sample(
    n: int = 200, seed: int = 42, *, dirty_ratio: float = 0.15
) -> pd.DataFrame:
    """Build a DataFrame with the CodeSearchNet columns we rely on.

    Args:
        n: Number of rows to generate. Zero or a negative value yields an
            empty frame that still carries every column.
        seed: Seed for the private ``random.Random`` instance, so the same
            arguments always produce the same rows.
        dirty_ratio: Probability that a row is drawn from ``_DIRTY`` instead
            of ``_GOOD``. The default of 0.15 gives roughly 15% dirty rows so
            the cleaning step has work to do.

    Returns:
        A DataFrame with one row per generated sample and the columns listed
        in ``_COLUMNS``, in that order.
    """
    rng = random.Random(seed)
    rows = []
    for i in range(n):
        # One random() draw picks the pool, one choice() draw picks the row.
        pool = _DIRTY if rng.random() < dirty_ratio else _GOOD
        doc, code = rng.choice(pool)
        repo = f"acme/repo{i % 10}"
        path = f"src/module_{i}.py"
        rows.append(
            {
                "repository_name": repo,
                "func_path_in_repository": path,
                # Search every line for a def: a snippet may start with an
                # import and still define a function further down.
                "func_name": _first_def_name(code, f"sym_{i}"),
                "language": "python",
                "func_code_string": code,
                "func_documentation_string": doc,
                "func_code_url": f"https://github.com/{repo}/blob/main/{path}",
            }
        )
    # Explicit columns: without them an empty row list gives a column-less frame.
    return pd.DataFrame(rows, columns=list(_COLUMNS))
if __name__ == "__main__":
    # Smoke run: build the default sample, then show its size and first rows.
    sample = make_sample()
    print(sample.shape)
    preview = sample.head(3)
    print(preview.to_string())
|