Spaces:

SharathReddy
/

doc-mining

Runtime error

App Files Files Community

SharathReddy committed on Jul 2, 2025

Commit

8394c6e

verified ·

1 Parent(s): b6ab7c0

Create mine_diffs.py

Browse files

Files changed (1) hide show

mine_diffs.py +122 -0

mine_diffs.py ADDED Viewed

	@@ -0,0 +1,122 @@

+import os
+import json
+import ast
+from git import Repo, exc
+# --- Configuration ---
+# We'll use a few high-quality repos for a more diverse dataset
+REPO_CONFIG = {
+    "fastapi": "https://github.com/tiangolo/fastapi.git",
+    "requests": "https://github.com/psf/requests.git",
+    "scikit-learn": "https://github.com/scikit-learn/scikit-learn.git"
+}
+OUTPUT_FILE = "diff_dataset.jsonl"
+MAX_COMMITS_PER_REPO = 5000  # Scan more commits for a better dataset
+class FuncParser(ast.NodeVisitor):
+    """AST visitor to find all function definitions and their docstrings."""
+    def __init__(self):
+        self.functions = {}
+    def visit_FunctionDef(self, node):
+        docstring = ast.get_docstring(node) or ""
+        # Store the function name and its docstring
+        self.functions[node.name] = docstring
+        self.generic_visit(node)
+def get_functions_from_source(source_code):
+    """Parses source code and returns a dictionary of {func_name: docstring}."""
+    try:
+        tree = ast.parse(source_code)
+        parser = FuncParser()
+        parser.visit(tree)
+        return parser.functions
+    except SyntaxError:
+        return {} # Ignore files with syntax errors
+def format_for_model(diff_text, old_doc, new_doc):
+    """Creates a structured prompt for the model to learn from."""
+    return {
+        "text": f"""### INSTRUCTION:
+A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
+### GIT DIFF:
+```diff
+{diff_text}
+OLD DOCUMENTATION:
+{old_doc.strip()}
+UPDATED DOCUMENTATION:
+{new_doc.strip()}
+"""
+}
+def main():
+    dataset = []
+    for name, url in REPO_CONFIG.items():
+        repo_dir = f"repos/{name}"
+        # 1. Clone or pull the repository
+        try:
+            if os.path.exists(repo_dir):
+                print(f"Pulling latest changes for {name}...")
+                repo = Repo(repo_dir)
+                repo.remotes.origin.pull()
+            else:
+                print(f"Cloning {name} from {url}...")
+                repo = Repo.clone_from(url, repo_dir)
+        except exc.GitCommandError as e:
+            print(f"Error cloning/pulling {name}: {e}")
+            continue
+        print(f"Mining commit history for {name}...")
+        # 2. Iterate through commits
+        commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
+        for commit in commits:
+            if not commit.parents:
+                continue # Skip initial commit
+            parent = commit.parents[0]
+            diffs = commit.diff(parent, create_patch=True, unified=0)
+            # 3. Look for changes in Python files
+            for diff in diffs:
+                if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
+                    continue
+                # Avoid processing deleted or new files for simplicity
+                if diff.a_blob is None or diff.b_blob is None:
+                    continue
+                # 4. Use AST to find changes, not regex
+                try:
+                    old_source = diff.a_blob.data_stream.read().decode('utf-8')
+                    new_source = diff.b_blob.data_stream.read().decode('utf-8')
+                except UnicodeDecodeError:
+                    continue # Skip files with decoding errors
+                old_funcs = get_functions_from_source(old_source)
+                new_funcs = get_functions_from_source(new_source)
+                # 5. Compare the docstrings of functions that exist in both versions
+                for func_name, old_doc in old_funcs.items():
+                    if func_name in new_funcs:
+                        new_doc = new_funcs[func_name]
+                        # We found a change if the docstrings are different
+                        if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
+                            diff_text = diff.diff.decode('utf-8', errors='ignore')
+                            formatted_example = format_for_model(diff_text, old_doc, new_doc)
+                            dataset.append(formatted_example)
+    print(f"\nFound {len(dataset)} high-quality examples.")
+    # 6. Save the final dataset
+    with open(OUTPUT_FILE, 'w') as f:
+        for item in dataset:
+            f.write(json.dumps(item) + "\n")
+    print(f"Dataset saved to '{OUTPUT_FILE}'.")
+if __name__ == "__main__":
+    main()
+```