Spaces:
Runtime error
Runtime error
| import os | |
| import json | |
| import ast | |
| import tempfile | |
| from git import Repo, exc | |
# --- Configuration ---

# Short repository name -> clone URL of each project whose history is mined.
REPO_CONFIG = {
    "fastapi": "https://github.com/tiangolo/fastapi.git",
    "requests": "https://github.com/psf/requests.git",
    "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
}

# JSON Lines file the collected examples are written to (one object per line).
OUTPUT_FILE = "diff_dataset.jsonl"

# Upper bound on the number of commits walked per repository.
MAX_COMMITS_PER_REPO = 5000
class FuncParser(ast.NodeVisitor):
    """Collect the docstring of every function defined in a parsed module.

    After ``visit(tree)`` has run, ``functions`` maps each function name to
    its docstring, or to ``""`` when the function has none. Both ``def`` and
    ``async def`` definitions are recorded, at any nesting depth (module
    level, methods, nested functions).

    Names are stored unqualified, so when two functions share a name (for
    example ``__init__`` in two classes) the one visited last wins.
    """

    def __init__(self):
        # function name -> docstring ("" when absent)
        self.functions = {}

    def visit_FunctionDef(self, node):
        """Record *node*'s docstring, then descend into its body."""
        self.functions[node.name] = ast.get_docstring(node) or ""
        # Keep walking so nested functions and classes are collected too.
        self.generic_visit(node)

    # ``async def`` is a separate node type (``ast.AsyncFunctionDef``).
    # Without this alias NodeVisitor falls back to ``generic_visit`` for it,
    # which walks the body but never records the coroutine itself.
    visit_AsyncFunctionDef = visit_FunctionDef
def get_functions_from_source(source_code):
    """Return ``{function_name: docstring}`` for the functions in *source_code*.

    The source is parsed with ``ast`` and walked with ``FuncParser``.

    Args:
        source_code: Text of a Python module.

    Returns:
        The mapping built by ``FuncParser``; an empty dict when the text
        cannot be analysed.
    """
    try:
        tree = ast.parse(source_code)
        parser = FuncParser()
        parser.visit(tree)
    except (SyntaxError, ValueError, RecursionError):
        # Historical revisions are not guaranteed to be valid for the running
        # interpreter. Besides SyntaxError, ast.parse raises ValueError for
        # source containing NUL bytes on older Pythons, and pathologically
        # nested code can exhaust the recursion limit while parsing or
        # walking. Treat all of these as "no functions found" instead of
        # aborting the whole run.
        return {}
    return parser.functions
def format_for_model(diff_text, old_doc, new_doc):
    """Build one training example from a diff and a docstring pair.

    Args:
        diff_text: Patch text shown to the model inside a ``diff`` code fence.
        old_doc: Docstring before the change; surrounding whitespace is
            stripped.
        new_doc: Docstring after the change; surrounding whitespace is
            stripped.

    Returns:
        A dict with the single key ``"text"`` holding the rendered prompt.
    """
    prompt = (
        "### INSTRUCTION:\n"
        "A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.\n"
        "### GIT DIFF:\n"
        "```diff\n"
        f"{diff_text}\n"
        # Close the fence: left open, the documentation sections below are
        # rendered as part of the diff block.
        "```\n"
        # Section headers use the same "### " marker as the two above.
        "### OLD DOCUMENTATION:\n"
        f"{old_doc.strip()}\n"
        "### UPDATED DOCUMENTATION:\n"
        f"{new_doc.strip()}\n"
    )
    return {"text": prompt}
def _iter_examples(repo):
    """Yield a formatted example for each function whose docstring changed.

    Walks up to ``MAX_COMMITS_PER_REPO`` commits of *repo*, comparing every
    commit with its first parent. Commits without a parent are skipped.
    """
    for commit in repo.iter_commits(max_count=MAX_COMMITS_PER_REPO):
        if not commit.parents:
            continue
        parent = commit.parents[0]

        # Diff FROM the parent TO the commit: the "a" side is then the old
        # revision and the "b" side the new one, and the patch text reads
        # old -> new. (commit.diff(parent) would give the reverse.)
        for diff in parent.diff(commit, create_patch=True, unified=0):
            a_path, b_path = diff.a_path, diff.b_path
            if not (a_path and b_path
                    and a_path.endswith('.py') and b_path.endswith('.py')):
                continue
            # A missing blob on either side means the file was added or
            # deleted, so there is no before/after pair to compare.
            if diff.a_blob is None or diff.b_blob is None:
                continue

            try:
                old_source = diff.a_blob.data_stream.read().decode('utf-8')
                new_source = diff.b_blob.data_stream.read().decode('utf-8')
            except UnicodeDecodeError:
                continue

            old_funcs = get_functions_from_source(old_source)
            new_funcs = get_functions_from_source(new_source)

            # Decoded lazily, and only once per file, on the first match.
            diff_text = None
            for func_name, old_doc in old_funcs.items():
                new_doc = new_funcs.get(func_name)
                if new_doc is None:
                    continue
                # Keep only real docstring edits where both versions are
                # longer than 20 characters.
                if old_doc == new_doc or len(old_doc) <= 20 or len(new_doc) <= 20:
                    continue
                if diff_text is None:
                    diff_text = diff.diff.decode('utf-8', errors='ignore')
                # NOTE: diff_text is the patch for the whole file, not only
                # for this function.
                yield format_for_model(diff_text, old_doc, new_doc)


def main():
    """Clone each configured repository, mine it, and write the dataset.

    Repositories in ``REPO_CONFIG`` are cloned into a temporary directory
    that is removed when mining finishes. Collected examples are written to
    ``OUTPUT_FILE`` as JSON Lines. A repository that fails to clone is
    reported and skipped.
    """
    import shutil  # local import: only needed for temp-dir cleanup

    dataset = []
    base_repo_dir = tempfile.mkdtemp()
    print(f"Using temporary directory for clones: {base_repo_dir}")

    try:
        for name, url in REPO_CONFIG.items():
            repo_dir = os.path.join(base_repo_dir, name)
            try:
                print(f"Cloning {name} from {url}...")
                repo = Repo.clone_from(url, repo_dir)
            except exc.GitCommandError as e:
                print(f"Error cloning {name}: {e}")
                continue

            print(f"Mining commit history for {name}...")
            try:
                dataset.extend(_iter_examples(repo))
            finally:
                # Release git handles so the clone can be deleted afterwards.
                repo.close()
    finally:
        # Full clones are large; do not leave them behind. Cleanup is
        # best-effort so a locked file cannot mask the real outcome.
        shutil.rmtree(base_repo_dir, ignore_errors=True)

    print(f"\nFound {len(dataset)} high-quality examples.")

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + "\n" for item in dataset)
    except OSError as e:
        print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
    else:
        print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
# A directly executed script has __name__ == "__main__"; comparing against
# "main" is never true, so the script would exit without doing anything.
if __name__ == "__main__":
    main()