| import json |
| from git import Repo, GitCommandError, NULL_TREE |
| import os |
|
|
# Author names treated as automation; commits by these authors are skipped.
BOT_AUTHORS = [
    'github-actions[bot]',
    'dependabot[bot]',
    'pre-commit-ci[bot]',
    'github-actions',
]

# File suffixes that mark a commit as touching code.
CODE_EXTENSIONS = [
    '.py',
]

# Commits touching more files than this are not indexed.
MAX_FILES_PER_COMMIT = 20

# Upper bound on how many commits are read from the repository history.
MAX_COMMITS_TO_SCAN = 200
|
|
def _hunkLineCounts(line):
    """Parse a unified-diff hunk header.

    Returns ``(old_count, new_count)`` for a line shaped like
    ``@@ -start[,count] +start[,count] @@ ...`` and ``None`` for any other
    line. A range without ``,count`` stands for exactly one line.
    """
    fields = line.split(' ')
    if len(fields) < 4 or fields[0] != '@@' or not fields[3].startswith('@@'):
        return None
    if not fields[1].startswith('-') or not fields[2].startswith('+'):
        return None
    counts = []
    for span in (fields[1], fields[2]):
        _start, comma, size = span[1:].partition(',')
        if not comma:
            counts.append(1)
        elif size.isdecimal():
            counts.append(int(size))
        else:
            return None
    return counts[0], counts[1]


def splitDiffs(raw_diff):
    """Split a unified diff into its added and removed lines.

    Args:
        raw_diff: Patch text (may be empty or ``None``).

    Returns:
        A ``(added, removed)`` pair of lists holding the line contents with
        the leading ``+`` / ``-`` marker stripped.

    ``+++`` / ``---`` file-header lines and ``\\ No newline ...`` markers are
    ignored. Inside a hunk (as delimited by the line counts in its ``@@``
    header) every ``+`` / ``-`` line is content, so an added line whose text
    itself starts with ``++`` or a removed line starting with ``--`` is kept
    rather than mistaken for a file header. Input without parsable hunk
    headers is handled line by line, skipping anything that looks like a
    file header.
    """
    if not raw_diff:
        return [], []

    added, removed = [], []
    # Lines still expected from the old / new side of the current hunk.
    old_left = new_left = 0
    for line in raw_diff.split('\n'):
        if line.startswith('\\'):
            continue
        # A hunk body only contains ' ', '+' and '-' lines; anything else
        # means the hunk ended earlier than its header announced.
        if (old_left or new_left) and line and line[0] not in ' +-':
            old_left = new_left = 0
        if not (old_left or new_left):
            counts = _hunkLineCounts(line)
            if counts is not None:
                old_left, new_left = counts
                continue
            # Outside a hunk these can only be file headers.
            if line.startswith(('+++', '---')):
                continue
        if line.startswith('+'):
            added.append(line[1:])
            new_left = max(new_left - 1, 0)
        elif line.startswith('-'):
            removed.append(line[1:])
            old_left = max(old_left - 1, 0)
        elif old_left or new_left:
            # Context line: consumes one line from both sides of the hunk.
            old_left = max(old_left - 1, 0)
            new_left = max(new_left - 1, 0)
    return added, removed
|
|
def isCodeCommit(files_info):
    """Tell whether any changed file has one of the CODE_EXTENSIONS suffixes.

    Args:
        files_info: Iterable of per-file dicts, each carrying the path under
            the ``'file'`` key.

    Returns:
        True as soon as one path ends with a configured extension, otherwise
        False (also for an empty ``files_info``).
    """
    for suffix in CODE_EXTENSIONS:
        for entry in files_info:
            if entry['file'].endswith(suffix):
                return True
    return False
|
|
def _firstReachableParent(repo, commit):
    """Return the first parent of *commit* if its object is present, else None."""
    if not commit.parents:
        return None
    candidate = commit.parents[0]
    try:
        # Probe for the parent object; a GitCommandError here is treated as
        # "parent not available" and the commit is diffed without it.
        repo.git.cat_file('-e', candidate.hexsha)
    except GitCommandError:
        return None
    return candidate


def _describeFileChange(d):
    """Build the per-file record (path, status, sampled +/- lines) for one diff."""
    filePath = d.b_path if d.b_path else d.a_path
    rawPatch = d.diff.decode('utf-8', errors='replace') if d.diff else ""
    addedLines, removedLines = splitDiffs(rawPatch)

    status = d.change_type
    if not status:
        # No change type reported: infer one from which sides have lines.
        if addedLines and not removedLines:
            status = 'A'
        elif removedLines and not addedLines:
            status = 'D'
        else:
            status = 'M'

    return {
        'file': filePath,
        'status': status,
        # Only the first 20 lines of each side are kept.
        'additions': addedLines[:20],
        'removals': removedLines[:20],
    }


def extractHistory(repo, destPath, outputFile="commits_history.jsonl"):
    """Write one JSON line per qualifying commit of *repo* to *outputFile*.

    At most MAX_COMMITS_TO_SCAN commits are read. A commit is skipped when its
    author name is in BOT_AUTHORS, when it touches more than
    MAX_FILES_PER_COMMIT files, or when isCodeCommit() rejects its file list.

    Args:
        repo: Repository object providing ``iter_commits`` and ``git``.
        destPath: Not used by this function; kept so existing callers keep
            working.
        outputFile: Path of the JSON-lines file; it is opened in ``'w'`` mode,
            so existing content is replaced.

    Returns:
        ``outputFile``. If listing commits raises GitCommandError or
        ValueError, the history is treated as empty and the file is still
        created.
    """
    try:
        commits = list(repo.iter_commits(max_count=MAX_COMMITS_TO_SCAN))
    except (GitCommandError, ValueError):
        commits = []

    indexedCount = 0
    with open(outputFile, 'w', encoding='utf-8') as f:
        for commit in commits:
            if commit.author.name in BOT_AUTHORS:
                continue

            parent = _firstReachableParent(repo, commit)
            if parent is not None:
                diffs = parent.diff(commit, create_patch=True)
            else:
                # NOTE(review): call kept exactly as before; confirm what
                # `reverse=True` does for a NULL_TREE diff in the GitPython
                # version in use.
                diffs = commit.diff(NULL_TREE, create_patch=True, reverse=True)

            # One record is produced per diff entry, so oversized commits can
            # be rejected before any patch is decoded or parsed.
            if len(diffs) > MAX_FILES_PER_COMMIT:
                continue

            files_info = [_describeFileChange(d) for d in diffs]
            if not isCodeCommit(files_info):
                continue

            filesTouched = ", ".join(fi['file'] for fi in files_info)
            embedString = (f"{commit.summary} — files: {filesTouched} — "
                           f"author: {commit.author.name} — "
                           f"date: {commit.authored_datetime.date()}")

            commit_info = {
                'sha': commit.hexsha[:7],
                'author': commit.author.name,
                'summary': commit.summary,
                'date': commit.authored_datetime.isoformat(),
                'changes': files_info,
                'embedText': embedString,
            }

            f.write(json.dumps(commit_info) + '\n')
            indexedCount += 1

    print(f"Extracted {indexedCount} commits into {outputFile}")
    return outputFile