Spaces:

SharathReddy
/

doc-mining

Runtime error

App Files Files Community

SharathReddy committed on Jul 3, 2025

Commit

3ef6520

verified ·

1 Parent(s): 284a8c6

Update mine_diffs.py

Browse files

Files changed (1) hide show

mine_diffs.py +66 -66

mine_diffs.py CHANGED Viewed

@@ -33,75 +33,75 @@ def get_functions_from_source(source_code):
 def format_for_model(diff_text, old_doc, new_doc):
     return {
         "text": f"""### INSTRUCTION:
-A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
-### GIT DIFF:
-```diff
-{diff_text}
-OLD DOCUMENTATION:
-{old_doc.strip()}
-UPDATED DOCUMENTATION:
-{new_doc.strip()}
-"""
-}
 def main():
-# All code inside this function must be indented.
-dataset = []
-base_repo_dir = tempfile.mkdtemp()
-print(f"Using temporary directory for clones: {base_repo_dir}")
-for name, url in REPO_CONFIG.items():
-    repo_dir = os.path.join(base_repo_dir, name)
-    try:
-        print(f"Cloning {name} from {url}...")
-        repo = Repo.clone_from(url, repo_dir)
-    except exc.GitCommandError as e:
-        print(f"Error cloning {name}: {e}")
-        continue
-    print(f"Mining commit history for {name}...")
-    commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
-    for commit in commits:
-        if not commit.parents:
             continue
-        parent = commit.parents[0]
-        diffs = commit.diff(parent, create_patch=True, unified=0)
-        for diff in diffs:
-            if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
-                continue
-            if diff.a_blob is None or diff.b_blob is None:
-                continue
-            try:
-                old_source = diff.a_blob.data_stream.read().decode('utf-8')
-                new_source = diff.b_blob.data_stream.read().decode('utf-8')
-            except UnicodeDecodeError:
                 continue
-            old_funcs = get_functions_from_source(old_source)
-            new_funcs = get_functions_from_source(new_source)
-            for func_name, old_doc in old_funcs.items():
-                if func_name in new_funcs:
-                    new_doc = new_funcs[func_name]
-                    if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
-                        diff_text = diff.diff.decode('utf-8', errors='ignore')
-                        formatted_example = format_for_model(diff_text, old_doc, new_doc)
-                        dataset.append(formatted_example)
-print(f"\nFound {len(dataset)} high-quality examples.")
-try:
-    with open(OUTPUT_FILE, 'w') as f:
-        for item in dataset:
-            f.write(json.dumps(item) + "\n")
-    print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
-except Exception as e:
-    print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
 if name == "main":
 main()

 def format_for_model(diff_text, old_doc, new_doc):
     return {
         "text": f"""### INSTRUCTION:
+    A Python function's code was changed. Based on the `git diff` provided, update the function's documentation.
+    ### GIT DIFF:
+    ```diff
+    {diff_text}
+    OLD DOCUMENTATION:
+    {old_doc.strip()}
+    UPDATED DOCUMENTATION:
+    {new_doc.strip()}
+    """
+    }
 def main():
+    # All code inside this function must be indented.
+    dataset = []
+    base_repo_dir = tempfile.mkdtemp()
+    print(f"Using temporary directory for clones: {base_repo_dir}")
+    for name, url in REPO_CONFIG.items():
+        repo_dir = os.path.join(base_repo_dir, name)
+        try:
+            print(f"Cloning {name} from {url}...")
+            repo = Repo.clone_from(url, repo_dir)
+        except exc.GitCommandError as e:
+            print(f"Error cloning {name}: {e}")
             continue
+        print(f"Mining commit history for {name}...")
+        commits = list(repo.iter_commits(max_count=MAX_COMMITS_PER_REPO))
+        for commit in commits:
+            if not commit.parents:
                 continue
+            parent = commit.parents[0]
+            diffs = commit.diff(parent, create_patch=True, unified=0)
+            for diff in diffs:
+                if not (diff.a_path and diff.b_path and diff.a_path.endswith('.py') and diff.b_path.endswith('.py')):
+                    continue
+                if diff.a_blob is None or diff.b_blob is None:
+                    continue
+                try:
+                    old_source = diff.a_blob.data_stream.read().decode('utf-8')
+                    new_source = diff.b_blob.data_stream.read().decode('utf-8')
+                except UnicodeDecodeError:
+                    continue
+                old_funcs = get_functions_from_source(old_source)
+                new_funcs = get_functions_from_source(new_source)
+                for func_name, old_doc in old_funcs.items():
+                    if func_name in new_funcs:
+                        new_doc = new_funcs[func_name]
+                        if old_doc != new_doc and len(old_doc) > 20 and len(new_doc) > 20:
+                            diff_text = diff.diff.decode('utf-8', errors='ignore')
+                            formatted_example = format_for_model(diff_text, old_doc, new_doc)
+                            dataset.append(formatted_example)
+    print(f"\nFound {len(dataset)} high-quality examples.")
+    try:
+        with open(OUTPUT_FILE, 'w') as f:
+            for item in dataset:
+                f.write(json.dumps(item) + "\n")
+        print(f"Dataset successfully saved to '{OUTPUT_FILE}'.")
+    except Exception as e:
+        print(f"FATAL: Could not write final dataset file to {OUTPUT_FILE}. Error: {e}")
 if name == "main":
 main()