Spaces:

Rushabh147
/

code-gen-assistant

Sleeping

App Files Files Community

code-gen-assistant / src /data /make_sample.py

Rushabh147

Initial deploy to HF Spaces (clean history, LFS for all binaries)

b89e6d6 4 days ago

Raw

History Blame Contribute Delete

3.45 kB

	"""Generate a small synthetic dataset in CodeSearchNet's schema.

	This lets you test the full pipeline (clean -> EDA -> later phases) without
	downloading the real ~2M-row dataset. The schema matches the HuggingFace
	`code_search_net` columns we actually use downstream.
	"""
	from __future__ import annotations

	import random

	import pandas as pd

	# A handful of realistic (docstring, code) templates plus some deliberately
	# "dirty" rows so the cleaning step has something to remove.
	# Clean (docstring, code) pairs. Every code sample is valid, runnable Python
	# with conventional 4-space indentation, so downstream steps that parse or
	# tokenize the "good" rows see well-formed source.
	_GOOD = [
	    (
	        "Return the factorial of a non-negative integer n.",
	        "def factorial(n):\n"
	        "    if n < 0:\n"
	        "        raise ValueError('n must be >= 0')\n"
	        "    result = 1\n"
	        "    for i in range(2, n + 1):\n"
	        "        result *= i\n"
	        "    return result",
	    ),
	    (
	        "Compute the nth Fibonacci number using iteration.",
	        "def fib(n):\n"
	        "    a, b = 0, 1\n"
	        "    for _ in range(n):\n"
	        "        a, b = b, a + b\n"
	        "    return a",
	    ),
	    (
	        "Check whether a given string is a palindrome, ignoring case.",
	        "def is_palindrome(s):\n"
	        "    s = s.lower()\n"
	        "    return s == s[::-1]",
	    ),
	    (
	        "Read a JSON file from disk and return the parsed dictionary.",
	        "import json\n"
	        "\n"
	        "def read_json(path):\n"
	        "    with open(path) as f:\n"
	        "        return json.load(f)",
	    ),
	    (
	        "Return a list of unique elements preserving original order.",
	        "def dedupe(items):\n"
	        "    seen = set()\n"
	        "    out = []\n"
	        "    for x in items:\n"
	        "        if x not in seen:\n"
	        "            seen.add(x)\n"
	        "            out.append(x)\n"
	        "    return out",
	    ),
	    (
	        "Flatten a nested list of arbitrary depth into a single list.",
	        "def flatten(lst):\n"
	        "    out = []\n"
	        "    for x in lst:\n"
	        "        if isinstance(x, list):\n"
	        "            out.extend(flatten(x))\n"
	        "        else:\n"
	        "            out.append(x)\n"
	        "    return out",
	    ),
	]

	# Rows that should be filtered out by the cleaning rules.
	# Each entry is a (docstring, code) pair; the comment above it names the
	# reason the row is expected to be rejected during cleaning.
	_DIRTY = [
	    # Empty docstring.
	    ("", "def noop():\n pass"),
	    # Too-short docstring.
	    ("ok", "def f():\n return 1"),
	    # Blocklisted docstring text.
	    ("TODO: write this later", "def g():\n pass"),
	    # Blocklisted docstring text.
	    ("auto-generated do not edit", "def h():\n pass"),
	    # Too-short code.
	    ("Returns x.", "x"),
	    # Non-ASCII docstring.
	    ("说明：返回输入值的两倍。", "def dbl(x):\n return x * 2"),
	]


	def _extract_func_name(code: str, fallback: str) -> str:
	    """Return the name of the first top-level function defined in ``code``.

	    The snippet is scanned line by line so that a preamble before the
	    definition (for example an ``import`` line) does not hide the name.
	    ``fallback`` is returned when no line starts with ``def ``.
	    """
	    for line in code.splitlines():
	        if line.startswith("def "):
	            return line[len("def "):].split("(", 1)[0].strip()
	    return fallback


	def make_sample(n: int = 200, seed: int = 42) -> pd.DataFrame:
	    """Build a DataFrame with the CodeSearchNet columns we rely on.

	    Args:
	        n: Number of rows to generate. Zero or a negative value yields an
	            empty frame that still carries every column.
	        seed: Seed for a private ``random.Random`` instance, so the same
	            arguments always produce the same rows.

	    Returns:
	        A DataFrame with one row per generated sample and the columns
	        ``repository_name``, ``func_path_in_repository``, ``func_name``,
	        ``language``, ``func_code_string``, ``func_documentation_string``
	        and ``func_code_url``.
	    """
	    # Passed explicitly to the constructor so the schema survives n == 0;
	    # without it an empty row list produces a frame with no columns at all.
	    columns = [
	        "repository_name",
	        "func_path_in_repository",
	        "func_name",
	        "language",
	        "func_code_string",
	        "func_documentation_string",
	        "func_code_url",
	    ]
	    rng = random.Random(seed)
	    rows = []
	    for i in range(n):
	        # ~15% dirty rows so cleaning has work to do.
	        if rng.random() < 0.15:
	            doc, code = rng.choice(_DIRTY)
	        else:
	            doc, code = rng.choice(_GOOD)
	        rows.append(
	            {
	                "repository_name": f"acme/repo{i % 10}",
	                "func_path_in_repository": f"src/module_{i}.py",
	                # Snippets without any ``def`` line get a synthetic symbol name.
	                "func_name": _extract_func_name(code, f"sym_{i}"),
	                "language": "python",
	                "func_code_string": code,
	                "func_documentation_string": doc,
	                "func_code_url": f"https://github.com/acme/repo{i % 10}/blob/main/src/module_{i}.py",
	            }
	        )
	    return pd.DataFrame(rows, columns=columns)


	if __name__ == "__main__":
	    # Smoke run: build the default sample, then print its shape followed by
	    # the first three rows rendered as plain text.
	    sample = make_sample()
	    print(sample.shape)
	    preview = sample.head(3)
	    print(preview.to_string())