SharathReddy committed on
Commit
3ef6520
·
verified ·
1 Parent(s): 284a8c6

Update mine_diffs.py

Browse files
Files changed (1) hide show
  1. mine_diffs.py +66 -66
mine_diffs.py CHANGED
@@ -33,75 +33,75 @@ def get_functions_from_source(source_code):
33
  def format_for_model(diff_text, old_doc, new_doc):
34
  return {
35
  "text": f"""### INSTRUCTION:
36
- A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
37
-
38
- ### GIT DIFF:
39
- ```diff
40
- {diff_text}
41
- OLD DOCUMENTATION:
42
-
43
- {old_doc.strip()}
44
- UPDATED DOCUMENTATION:
45
-
46
- {new_doc.strip()}
47
- """
48
- }
49
 
50
  def main():
51
- # All code inside this function must be indented.
52
- dataset = []
53
- base_repo_dir = tempfile.mkdtemp()
54
- print(f"Using temporary directory for clones: {base_repo_dir}")
55
-
56
- for name, url in REPO_CONFIG.items():
57
- repo_dir = os.path.join(base_repo_dir, name)
58
- try:
59
- print(f"Cloning {name} from {url}...")
60
- repo = Repo.clone_from(url, repo_dir)
61
- except exc.GitCommandError as e:
62
- print(f"Error cloning {name}: {e}")
63
- continue
64
-
65
- print(f"Mining commit history for {name}...")
66
- commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
67
- for commit in commits:
68
- if not commit.parents:
69
  continue
70
-
71
- parent = commit.parents[0]
72
- diffs = commit.diff(parent, create_patch=True, unified=0)
73
-
74
- for diff in diffs:
75
- if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
76
- continue
77
- if diff.a_blob is None or diff.b_blob is None:
78
- continue
79
-
80
- try:
81
- old_source = diff.a_blob.data_stream.read().decode('utf-8')
82
- new_source = diff.b_blob.data_stream.read().decode('utf-8')
83
- except UnicodeDecodeError:
84
  continue
85
-
86
- old_funcs = get_functions_from_source(old_source)
87
- new_funcs = get_functions_from_source(new_source)
88
-
89
- for func_name, old_doc in old_funcs.items():
90
- if func_name in new_funcs:
91
- new_doc = new_funcs[func_name]
92
- if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
93
- diff_text = diff.diff.decode('utf-8', errors='ignore')
94
- formatted_example = format_for_model(diff_text, old_doc, new_doc)
95
- dataset.append(formatted_example)
96
-
97
- print(f"\nFound {len(dataset)} high-quality examples.")
98
-
99
- try:
100
- with open(OUTPUT_FILE, 'w') as f:
101
- for item in dataset:
102
- f.write(json.dumps(item) + "\n")
103
- print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
104
- except Exception as e:
105
- print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
106
  if name == "main":
107
  main()
 
33
  def format_for_model(diff_text, old_doc, new_doc):
34
  return {
35
  "text": f"""### INSTRUCTION:
36
+ A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
37
+
38
+ ### GIT DIFF:
39
+ ```diff
40
+ {diff_text}
41
+ OLD DOCUMENTATION:
42
+
43
+ {old_doc.strip()}
44
+ UPDATED DOCUMENTATION:
45
+
46
+ {new_doc.strip()}
47
+ """
48
+ }
49
 
50
  def main():
51
+ # All code inside this function must be indented.
52
+ dataset = []
53
+ base_repo_dir = tempfile.mkdtemp()
54
+ print(f"Using temporary directory for clones: {base_repo_dir}")
55
+
56
+ for name, url in REPO_CONFIG.items():
57
+ repo_dir = os.path.join(base_repo_dir, name)
58
+ try:
59
+ print(f"Cloning {name} from {url}...")
60
+ repo = Repo.clone_from(url, repo_dir)
61
+ except exc.GitCommandError as e:
62
+ print(f"Error cloning {name}: {e}")
 
 
 
 
 
 
63
  continue
64
+
65
+ print(f"Mining commit history for {name}...")
66
+ commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
67
+ for commit in commits:
68
+ if not commit.parents:
 
 
 
 
 
 
 
 
 
69
  continue
70
+
71
+ parent = commit.parents[0]
72
+ diffs = commit.diff(parent, create_patch=True, unified=0)
73
+
74
+ for diff in diffs:
75
+ if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
76
+ continue
77
+ if diff.a_blob is None or diff.b_blob is None:
78
+ continue
79
+
80
+ try:
81
+ old_source = diff.a_blob.data_stream.read().decode('utf-8')
82
+ new_source = diff.b_blob.data_stream.read().decode('utf-8')
83
+ except UnicodeDecodeError:
84
+ continue
85
+
86
+ old_funcs = get_functions_from_source(old_source)
87
+ new_funcs = get_functions_from_source(new_source)
88
+
89
+ for func_name, old_doc in old_funcs.items():
90
+ if func_name in new_funcs:
91
+ new_doc = new_funcs[func_name]
92
+ if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
93
+ diff_text = diff.diff.decode('utf-8', errors='ignore')
94
+ formatted_example = format_for_model(diff_text, old_doc, new_doc)
95
+ dataset.append(formatted_example)
96
+
97
+ print(f"\nFound {len(dataset)} high-quality examples.")
98
+
99
+ try:
100
+ with open(OUTPUT_FILE, 'w') as f:
101
+ for item in dataset:
102
+ f.write(json.dumps(item) + "\n")
103
+ print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
104
+ except Exception as e:
105
+ print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
106
  if name == "main":
107
  main()