SharathReddy committed on
Commit
a13b6ec
·
verified ·
1 Parent(s): 5d52c73

Update mine_diffs.py

Browse files
Files changed (1) hide show
  1. mine_diffs.py +57 -70
mine_diffs.py CHANGED
@@ -1,41 +1,37 @@
1
  import os
2
  import json
3
  import ast
 
4
  from git import Repo, exc
5
 
6
  # --- Configuration ---
7
- # We'll use a few high-quality repos for a more diverse dataset
8
  REPO_CONFIG = {
9
  "fastapi": "https://github.com/tiangolo/fastapi.git",
10
  "requests": "https://github.com/psf/requests.git",
11
  "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git"
12
  }
 
13
  OUTPUT_FILE = "diff_dataset.jsonl"
14
- MAX_COMMITS_PER_REPO = 5000 # Scan more commits for a better dataset
15
 
16
  class FuncParser(ast.NodeVisitor):
17
- """AST visitor to find all function definitions and their docstrings."""
18
  def __init__(self):
19
  self.functions = {}
20
-
21
  def visit_FunctionDef(self, node):
22
  docstring = ast.get_docstring(node) or ""
23
- # Store the function name and its docstring
24
  self.functions[node.name] = docstring
25
  self.generic_visit(node)
26
 
27
  def get_functions_from_source(source_code):
28
- """Parses source code and returns a dictionary of {func_name: docstring}."""
29
  try:
30
  tree = ast.parse(source_code)
31
  parser = FuncParser()
32
  parser.visit(tree)
33
  return parser.functions
34
  except SyntaxError:
35
- return {} # Ignore files with syntax errors
36
-
37
  def format_for_model(diff_text, old_doc, new_doc):
38
- """Creates a structured prompt for the model to learn from."""
39
  return {
40
  "text": f"""### INSTRUCTION:
41
  A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
@@ -43,7 +39,6 @@ A Python function's code was changed. Based on the `git diff` provided, update t
43
  ### GIT DIFF:
44
  ```diff
45
  {diff_text}
46
-
47
  OLD DOCUMENTATION:
48
 
49
  {old_doc.strip()}
@@ -54,68 +49,60 @@ UPDATED DOCUMENTATION:
54
  }
55
 
56
  def main():
57
- dataset = []
58
- for name, url in REPO_CONFIG.items():
59
- repo_dir = f"repos/{name}"
60
- # 1. Clone or pull the repository
61
- try:
62
- if os.path.exists(repo_dir):
63
- print(f"Pulling latest changes for {name}...")
64
- repo = Repo(repo_dir)
65
- repo.remotes.origin.pull()
66
- else:
67
- print(f"Cloning {name} from {url}...")
68
- repo = Repo.clone_from(url, repo_dir)
69
- except exc.GitCommandError as e:
70
- print(f"Error cloning/pulling {name}: {e}")
 
 
 
 
71
  continue
72
 
73
- print(f"Mining commit history for {name}...")
74
- # 2. Iterate through commits
75
- commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
76
- for commit in commits:
77
- if not commit.parents:
78
- continue # Skip initial commit
79
-
80
- parent = commit.parents[0]
81
- diffs = commit.diff(parent, create_patch=True, unified=0)
82
-
83
- # 3. Look for changes in Python files
84
- for diff in diffs:
85
- if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
86
- continue
87
-
88
- # Avoid processing deleted or new files for simplicity
89
- if diff.a_blob is None or diff.b_blob is None:
90
- continue
91
-
92
- # 4. Use AST to find changes, not regex
93
- try:
94
- old_source = diff.a_blob.data_stream.read().decode('utf-8')
95
- new_source = diff.b_blob.data_stream.read().decode('utf-8')
96
- except UnicodeDecodeError:
97
- continue # Skip files with decoding errors
98
-
99
- old_funcs = get_functions_from_source(old_source)
100
- new_funcs = get_functions_from_source(new_source)
101
-
102
- # 5. Compare the docstrings of functions that exist in both versions
103
- for func_name, old_doc in old_funcs.items():
104
- if func_name in new_funcs:
105
- new_doc = new_funcs[func_name]
106
- # We found a change if the docstrings are different
107
- if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
108
- diff_text = diff.diff.decode('utf-8', errors='ignore')
109
- formatted_example = format_for_model(diff_text, old_doc, new_doc)
110
- dataset.append(formatted_example)
111
-
112
- print(f"\nFound {len(dataset)} high-quality examples.")
113
- # 6. Save the final dataset
114
  with open(OUTPUT_FILE, 'w') as f:
115
  for item in dataset:
116
  f.write(json.dumps(item) + "\n")
117
-
118
- print(f"Dataset saved to '{OUTPUT_FILE}'.")
119
-
120
- if __name__ == "__main__":
121
- main()
 
1
  import os
2
  import json
3
  import ast
4
+ import tempfile
5
  from git import Repo, exc
6
 
7
  # --- Configuration ---
 
8
  REPO_CONFIG = {
9
  "fastapi": "https://github.com/tiangolo/fastapi.git",
10
  "requests": "https://github.com/psf/requests.git",
11
  "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git"
12
  }
13
+ # The output file is written relative to the current working directory
14
  OUTPUT_FILE = "diff_dataset.jsonl"
15
+ MAX_COMMITS_PER_REPO = 5000
16
 
17
class FuncParser(ast.NodeVisitor):
    """AST visitor that records the docstring of every function it encounters.

    After ``visit(tree)`` the ``functions`` attribute maps each function name
    to its docstring (``""`` when the function has none). Both ``def`` and
    ``async def`` definitions are collected, including methods and nested
    functions.

    Names are not qualified: if two functions in the same module share a name
    (for example methods of different classes), the one visited last wins.
    """

    def __init__(self):
        # {function name: docstring or ""}
        self.functions = {}

    def visit_FunctionDef(self, node):
        """Record ``node``'s docstring, then descend into nested definitions."""
        self.functions[node.name] = ast.get_docstring(node) or ""
        self.generic_visit(node)

    # ``async def`` is a distinct node type; without this alias coroutine
    # functions (common in e.g. FastAPI) would be skipped entirely.
    visit_AsyncFunctionDef = visit_FunctionDef
24
 
25
def get_functions_from_source(source_code):
    """Parse Python source and return ``{function_name: docstring}``.

    Args:
        source_code: Text of a Python module.

    Returns:
        The mapping collected by ``FuncParser``; an empty dict when the
        source cannot be parsed or walked.
    """
    try:
        tree = ast.parse(source_code)
        parser = FuncParser()
        parser.visit(tree)
    except (SyntaxError, ValueError, RecursionError):
        # Historical revisions routinely contain unparsable files.
        # ValueError: NUL bytes in the source (raised by Python < 3.12).
        # RecursionError: pathologically nested code.
        # One bad file must not abort the whole mining run.
        return {}
    return parser.functions
33
+
34
  def format_for_model(diff_text, old_doc, new_doc):
 
35
  return {
36
  "text": f"""### INSTRUCTION:
37
  A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
 
39
  ### GIT DIFF:
40
  ```diff
41
  {diff_text}
 
42
  OLD DOCUMENTATION:
43
 
44
  {old_doc.strip()}
 
49
  }
50
 
51
def _iter_docstring_examples(repo):
    """Yield a training example for each function whose docstring changed.

    Walks up to ``MAX_COMMITS_PER_REPO`` commits of ``repo``. For every
    modified ``.py`` file it compares the docstrings before and after the
    commit and yields ``format_for_model(diff_text, old_doc, new_doc)`` for
    each function present in both versions whose docstring differs and is
    longer than 20 characters on both sides.

    Note: ``diff_text`` is the patch for the whole file, so several functions
    changed in the same file share the same diff text.
    """
    for commit in repo.iter_commits(max_count=MAX_COMMITS_PER_REPO):
        if not commit.parents:
            continue  # root commit: nothing to compare against

        parent = commit.parents[0]
        # Diff FROM the parent TO the commit so that the a-side is the old
        # version and the b-side is the new one. (commit.diff(parent) would
        # produce the reverse patch and swap old/new docstrings.)
        for diff in parent.diff(commit, create_patch=True, unified=0):
            if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
                continue
            # Added or deleted files have no counterpart to compare with.
            if diff.a_blob is None or diff.b_blob is None:
                continue

            try:
                old_source = diff.a_blob.data_stream.read().decode('utf-8')
                new_source = diff.b_blob.data_stream.read().decode('utf-8')
            except UnicodeDecodeError:
                continue

            old_funcs = get_functions_from_source(old_source)
            new_funcs = get_functions_from_source(new_source)

            diff_text = None  # decoded lazily, at most once per file
            for func_name, old_doc in old_funcs.items():
                if func_name not in new_funcs:
                    continue
                new_doc = new_funcs[func_name]
                if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
                    if diff_text is None:
                        diff_text = diff.diff.decode('utf-8', errors='ignore')
                    yield format_for_model(diff_text, old_doc, new_doc)


def main():
    """Clone the configured repositories, mine docstring changes, write JSONL.

    Each repository in ``REPO_CONFIG`` is cloned into a temporary directory
    that is removed when mining finishes. Repositories that fail to clone are
    reported and skipped. The collected examples are written to
    ``OUTPUT_FILE``, one JSON object per line; a write failure is reported
    rather than raised.
    """
    dataset = []

    # TemporaryDirectory removes the (potentially multi-GB) clones on exit,
    # including when mining raises.
    with tempfile.TemporaryDirectory() as base_repo_dir:
        print(f"Using temporary directory for clones: {base_repo_dir}")

        for name, url in REPO_CONFIG.items():
            repo_dir = os.path.join(base_repo_dir, name)
            try:
                print(f"Cloning {name} from {url}...")
                repo = Repo.clone_from(url, repo_dir)
            except exc.GitCommandError as e:
                print(f"Error cloning {name}: {e}")
                continue

            print(f"Mining commit history for {name}...")
            try:
                dataset.extend(_iter_docstring_examples(repo))
            finally:
                # Release git subprocesses/file handles so the temporary
                # directory can be deleted (matters most on Windows).
                repo.close()

    print(f"\nFound {len(dataset)} high-quality examples.")

    try:
        with open(OUTPUT_FILE, 'w', encoding='utf-8') as f:
            f.writelines(json.dumps(item) + "\n" for item in dataset)
        print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
    except OSError as e:
        print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
107
# Run the miner only when executed as a script, not when imported.
# (The dunder underscores had been stripped, leaving `name == "main"`,
# which raises NameError at import/run time.)
if __name__ == "__main__":
    main()