File size: 3,452 Bytes
b89e6d6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
"""Generate a small synthetic dataset in CodeSearchNet's schema.

This lets you test the full pipeline (clean -> EDA -> later phases) without
downloading the real ~2M-row dataset. The schema matches the HuggingFace
`code_search_net` columns we actually use downstream.
"""
from __future__ import annotations

import random

import pandas as pd

# Clean (docstring, code) pairs. Each code snippet is written as adjacent
# string literals, one per source line, so it reads like the function it holds.
_GOOD = [
    (
        "Return the factorial of a non-negative integer n.",
        (
            "def factorial(n):\n"
            "    if n < 0:\n"
            "        raise ValueError('n must be >= 0')\n"
            "    result = 1\n"
            "    for i in range(2, n + 1):\n"
            "        result *= i\n"
            "    return result"
        ),
    ),
    (
        "Compute the nth Fibonacci number using iteration.",
        (
            "def fib(n):\n"
            "    a, b = 0, 1\n"
            "    for _ in range(n):\n"
            "        a, b = b, a + b\n"
            "    return a"
        ),
    ),
    (
        "Check whether a given string is a palindrome, ignoring case.",
        (
            "def is_palindrome(s):\n"
            "    s = s.lower()\n"
            "    return s == s[::-1]"
        ),
    ),
    (
        "Read a JSON file from disk and return the parsed dictionary.",
        (
            "import json\n"
            "\n"
            "def read_json(path):\n"
            "    with open(path) as f:\n"
            "        return json.load(f)"
        ),
    ),
    (
        "Return a list of unique elements preserving original order.",
        (
            "def dedupe(items):\n"
            "    seen = set()\n"
            "    out = []\n"
            "    for x in items:\n"
            "        if x not in seen:\n"
            "            seen.add(x)\n"
            "            out.append(x)\n"
            "    return out"
        ),
    ),
    (
        "Flatten a nested list of arbitrary depth into a single list.",
        (
            "def flatten(lst):\n"
            "    out = []\n"
            "    for x in lst:\n"
            "        if isinstance(x, list):\n"
            "            out.extend(flatten(x))\n"
            "        else:\n"
            "            out.append(x)\n"
            "    return out"
        ),
    ),
]

# Deliberately bad (docstring, code) pairs; the cleaning rules are expected to
# drop every one of them. The comment above each row names the reason.
_DIRTY = [
    # Empty docstring.
    ("", "def noop():\n    pass"),
    # Docstring too short.
    ("ok", "def f():\n    return 1"),
    # Blocklisted phrase in the docstring.
    ("TODO: write this later", "def g():\n    pass"),
    # Blocklisted phrase in the docstring.
    ("auto-generated do not edit", "def h():\n    pass"),
    # Code too short.
    ("Returns x.", "x"),
    # Non-ASCII docstring.
    ("说明:返回输入值的两倍。", "def dbl(x):\n    return x * 2"),
]


def _func_name_from_code(code: str, fallback: str) -> str:
    """Return the name of the first function defined in *code*.

    Scans the snippet line by line for the first ``def`` (or ``async def``)
    statement, so snippets that open with imports or comments still resolve
    to their function name. Returns *fallback* when no named ``def`` is found.
    """
    for line in code.splitlines():
        stripped = line.lstrip()
        if stripped.startswith("async def "):
            stripped = stripped[len("async "):]
        if stripped.startswith("def "):
            name = stripped[len("def "):].split("(", 1)[0].strip()
            if name:
                return name
    return fallback


def make_sample(n: int = 200, seed: int = 42) -> pd.DataFrame:
    """Build a DataFrame with the CodeSearchNet columns we rely on.

    Args:
        n: Number of rows to generate. Zero or a negative value yields an
            empty frame that still carries every column.
        seed: Seed for a private ``random.Random`` instance; the same
            ``(n, seed)`` pair always produces the same rows.

    Returns:
        A DataFrame with one row per sample and the columns
        ``repository_name``, ``func_path_in_repository``, ``func_name``,
        ``language``, ``func_code_string``, ``func_documentation_string``
        and ``func_code_url``.
    """
    # Passed to the DataFrame constructor explicitly so the schema survives
    # even when no rows are generated.
    columns = [
        "repository_name",
        "func_path_in_repository",
        "func_name",
        "language",
        "func_code_string",
        "func_documentation_string",
        "func_code_url",
    ]
    rng = random.Random(seed)
    rows = []
    for i in range(n):
        # ~15% dirty rows so cleaning has work to do. The draw order
        # (random() first, then choice()) fixes which rows a seed produces.
        pool = _DIRTY if rng.random() < 0.15 else _GOOD
        doc, code = rng.choice(pool)
        repo = f"acme/repo{i % 10}"
        path = f"src/module_{i}.py"
        rows.append(
            {
                "repository_name": repo,
                "func_path_in_repository": path,
                # Falls back to a synthetic symbol when the snippet defines
                # no function at all (e.g. the bare "x" dirty row).
                "func_name": _func_name_from_code(code, f"sym_{i}"),
                "language": "python",
                "func_code_string": code,
                "func_documentation_string": doc,
                "func_code_url": f"https://github.com/{repo}/blob/main/{path}",
            }
        )
    return pd.DataFrame(rows, columns=columns)


if __name__ == "__main__":
    # Smoke run: build the default sample, then show its shape and the first
    # three rows.
    sample = make_sample()
    print(sample.shape)
    preview = sample.head(3)
    print(preview.to_string())