SharathReddy committed on
Commit
8394c6e
·
verified ·
1 Parent(s): b6ab7c0

Create mine_diffs.py

Browse files
Files changed (1) hide show
  1. mine_diffs.py +122 -0
mine_diffs.py ADDED
@@ -0,0 +1,122 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import ast
4
+ from git import Repo, exc
5
+
6
# --- Configuration ---
# Repositories to mine, as {local directory name: clone URL}. Several
# high-quality projects are used so the dataset is more diverse.
REPO_CONFIG = {
    "fastapi": "https://github.com/tiangolo/fastapi.git",
    "requests": "https://github.com/psf/requests.git",
    "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git",
}
# Path of the JSON Lines file the mined examples are written to.
OUTPUT_FILE = "diff_dataset.jsonl"
# Upper bound on commits scanned per repository; scan more for a better dataset.
MAX_COMMITS_PER_REPO = 5000
15
+
16
class FuncParser(ast.NodeVisitor):
    """AST visitor that records the docstring of every function definition.

    After ``visit(tree)`` returns, ``functions`` maps each function's bare
    name to its docstring (``""`` when it has none). Both ``def`` and
    ``async def`` definitions are recorded, including methods and nested
    functions. Names are not qualified, so when several definitions share
    a name the one visited last overwrites the earlier ones.
    """

    def __init__(self):
        # {function name: docstring}, filled in while visiting.
        self.functions = {}

    def visit_FunctionDef(self, node):
        """Record ``node``'s docstring, then descend into its body."""
        self.functions[node.name] = ast.get_docstring(node) or ""
        # Keep walking so functions nested inside this one are found too.
        self.generic_visit(node)

    # ``async def`` nodes expose the same ``name`` and docstring layout, so
    # the same handler applies; without this they would be skipped entirely.
    visit_AsyncFunctionDef = visit_FunctionDef
26
+
27
def get_functions_from_source(source_code):
    """Parse Python source and return a dictionary of {func_name: docstring}.

    Args:
        source_code: Text of a Python module.

    Returns:
        The ``functions`` mapping collected by ``FuncParser``, or an empty
        dict when the source cannot be parsed.
    """
    try:
        tree = ast.parse(source_code)
    except (SyntaxError, ValueError):
        # SyntaxError: the file is not valid Python for this interpreter.
        # ValueError: some Python versions raise this (rather than
        # SyntaxError) when the source contains null bytes.
        # Either way the file is skipped instead of aborting the whole run.
        return {}
    parser = FuncParser()
    parser.visit(tree)
    return parser.functions
36
+
37
def format_for_model(diff_text, old_doc, new_doc):
    """Create a structured prompt for the model to learn from.

    The prompt has four ``###`` sections: the instruction, the diff inside
    a fenced ``diff`` code block, the old documentation, and the updated
    documentation.

    Args:
        diff_text: Patch text placed inside the ``diff`` code fence.
        old_doc: Docstring before the change; surrounding whitespace is
            stripped.
        new_doc: Docstring after the change; surrounding whitespace is
            stripped.

    Returns:
        A dict with a single ``"text"`` key holding the prompt string.
    """
    sections = [
        "### INSTRUCTION:",
        "A Python function's code was changed. Based on the `git diff` "
        "provided, update the function's documentation.",
        "",
        "### GIT DIFF:",
        "```diff",
        # Drop trailing newlines so the closing fence follows the patch directly.
        diff_text.rstrip("\n"),
        # Close the fence so the documentation sections are not swallowed by it.
        "```",
        "",
        "### OLD DOCUMENTATION:",
        old_doc.strip(),
        "",
        "### UPDATED DOCUMENTATION:",
        new_doc.strip(),
        "",  # keeps the trailing newline at the end of the prompt
    ]
    return {"text": "\n".join(sections)}
55
+
56
def _open_repo(name, url, repo_dir):
    """Return a ``Repo`` for ``repo_dir``: pull if it exists, clone otherwise."""
    if os.path.exists(repo_dir):
        print(f"Pulling latest changes for {name}...")
        repo = Repo(repo_dir)
        repo.remotes.origin.pull()
        return repo
    print(f"Cloning {name} from {url}...")
    return Repo.clone_from(url, repo_dir)


def _examples_from_diff(diff):
    """Return training examples for functions whose docstring changed in ``diff``.

    ``diff`` must be oriented old -> new (side "a" is the parent's file,
    side "b" is the commit's file). Returns an empty list for non-Python
    files, added/deleted files, and files that are not valid UTF-8.
    """
    a_path, b_path = diff.a_path, diff.b_path
    if not (a_path and b_path and a_path.endswith(".py") and b_path.endswith(".py")):
        return []

    # Avoid processing deleted or new files for simplicity.
    if diff.a_blob is None or diff.b_blob is None:
        return []

    try:
        old_source = diff.a_blob.data_stream.read().decode("utf-8")
        new_source = diff.b_blob.data_stream.read().decode("utf-8")
    except UnicodeDecodeError:
        return []  # Skip files with decoding errors.

    # Use the AST (not regex) to find each function's docstring.
    old_funcs = get_functions_from_source(old_source)
    new_funcs = get_functions_from_source(new_source)

    examples = []
    diff_text = None  # decoded lazily, at most once per file
    for func_name, old_doc in old_funcs.items():
        new_doc = new_funcs.get(func_name)
        if new_doc is None:
            continue  # function does not exist in the new version
        # Keep only real docstring changes where both sides are non-trivial.
        if old_doc == new_doc or len(old_doc) <= 20 or len(new_doc) <= 20:
            continue
        if diff_text is None:
            diff_text = diff.diff.decode("utf-8", errors="ignore")
        examples.append(format_for_model(diff_text, old_doc, new_doc))
    return examples


def main():
    """Mine docstring changes from the configured repositories.

    For every repository in ``REPO_CONFIG``: clone it under ``repos/`` (or
    pull if already present), walk up to ``MAX_COMMITS_PER_REPO`` commits,
    and for each changed Python file collect one example per function whose
    docstring differs between the commit's first parent and the commit.
    All examples are written to ``OUTPUT_FILE`` as JSON Lines.
    """
    dataset = []
    for name, url in REPO_CONFIG.items():
        repo_dir = f"repos/{name}"
        # 1. Clone or pull the repository.
        try:
            repo = _open_repo(name, url, repo_dir)
        except (exc.GitCommandError, exc.InvalidGitRepositoryError) as e:
            # InvalidGitRepositoryError covers a leftover directory that is
            # not a git checkout; skip this repo rather than crash.
            print(f"Error cloning/pulling {name}: {e}")
            continue

        print(f"Mining commit history for {name}...")
        # 2. Iterate through commits lazily; no need to hold them all in memory.
        for commit in repo.iter_commits(max_count=MAX_COMMITS_PER_REPO):
            if not commit.parents:
                continue  # Skip initial commit.

            parent = commit.parents[0]
            # Diff parent -> commit so side "a" is the OLD file and side "b"
            # the NEW one; commit.diff(parent) would reverse both the blobs
            # and the patch direction.
            diffs = parent.diff(commit, create_patch=True, unified=0)

            # 3. Collect docstring changes from each changed Python file.
            for diff in diffs:
                dataset.extend(_examples_from_diff(diff))

    print(f"\nFound {len(dataset)} high-quality examples.")
    # 4. Save the final dataset, one JSON object per line.
    with open(OUTPUT_FILE, "w", encoding="utf-8") as f:
        f.writelines(json.dumps(item) + "\n" for item in dataset)

    print(f"Dataset saved to '{OUTPUT_FILE}'.")
119
+
120
# Run the miner only when executed as a script, not when imported.
if __name__ == "__main__":
    main()