GitSpec / backend /historyExtractor.py
Lumiin0us
GitSpec Initial Deployment
34b79cd
import json
from git import Repo, GitCommandError, NULL_TREE
import os
# Author names treated as automation; extractHistory skips commits whose
# author name appears in this list.
BOT_AUTHORS = [
    'github-actions[bot]',
    'dependabot[bot]',
    'pre-commit-ci[bot]',
    'github-actions',
]

# File suffixes that mark a commit as a "code" commit (see isCodeCommit).
CODE_EXTENSIONS = [
    '.py',
]

# Commits touching more files than this are skipped by extractHistory.
MAX_FILES_PER_COMMIT = 20

# Upper bound passed as max_count when walking the commit history.
MAX_COMMITS_TO_SCAN = 200
def _parseHunkCounts(header):
    """Return (old_line_count, new_line_count) from a unified-diff hunk header.

    Expects the form ``@@ -a,b +c,d @@ ...``; a range written without
    ``,count`` covers exactly one line. Returns None when the header does not
    have that shape.
    """
    fields = header.split()
    if len(fields) < 4 or fields[0] != '@@' or fields[3] != '@@':
        return None
    oldRange, newRange = fields[1], fields[2]
    if not oldRange.startswith('-') or not newRange.startswith('+'):
        return None
    try:
        oldCount = int(oldRange[1:].partition(',')[2] or 1)
        newCount = int(newRange[1:].partition(',')[2] or 1)
    except ValueError:
        return None
    return oldCount, newCount


def splitDiffs(raw_diff):
    """Split unified-diff text into added and removed lines.

    Args:
        raw_diff: Patch text (may be empty or None).

    Returns:
        A tuple ``(added, removed)`` of lists holding the line contents with
        the leading ``+`` / ``-`` marker stripped.

    File header lines (``+++ ...`` / ``--- ...``) and ``\\ No newline ...``
    markers are ignored. Inside a hunk announced by an ``@@ -a,b +c,d @@``
    header, every ``+`` / ``-`` line is content, even when the content itself
    starts with ``++`` or ``--``; the hunk's line counts tell us where the
    hunk ends so that a following file header is not mistaken for content.
    """
    if not raw_diff:
        return [], []

    added, removed = [], []
    # Lines still expected in the current hunk on the old / new side.
    oldLeft = newLeft = 0

    for line in raw_diff.split('\n'):
        if line.startswith('@@'):
            counts = _parseHunkCounts(line)
            # An unparsable header falls back to the header-skipping mode.
            oldLeft, newLeft = counts if counts else (0, 0)
            continue
        if line.startswith('\\'):
            continue

        if oldLeft > 0 or newLeft > 0:
            if line.startswith('+'):
                added.append(line[1:])
                newLeft -= 1
            elif line.startswith('-'):
                removed.append(line[1:])
                oldLeft -= 1
            elif line == '' or line.startswith(' '):
                # Context line (an empty string is a context line whose
                # single-space prefix was stripped).
                oldLeft -= 1
                newLeft -= 1
            else:
                # Anything else cannot be hunk content: the hunk is over.
                oldLeft = newLeft = 0
            continue

        # Outside any hunk: '+++' / '---' are file headers, not content.
        if line.startswith(('+++', '---')):
            continue
        if line.startswith('+'):
            added.append(line[1:])
        elif line.startswith('-'):
            removed.append(line[1:])

    return added, removed
def isCodeCommit(files_info):
    """Report whether any touched file has one of the CODE_EXTENSIONS suffixes.

    Args:
        files_info: Iterable of dicts, each carrying the file path under the
            ``'file'`` key.

    Returns:
        True as soon as one path ends with a configured extension, else False.
    """
    for suffix in CODE_EXTENSIONS:
        for entry in files_info:
            if entry['file'].endswith(suffix):
                return True
    return False
def _resolveParent(repo, commit):
    """Return the first parent of *commit* if its object can be found, else None."""
    if not commit.parents:
        return None
    candidate = commit.parents[0]
    try:
        # `git cat-file -e` fails when the object is not available locally.
        repo.git.cat_file('-e', candidate.hexsha)
    except GitCommandError:
        return None
    return candidate


def _changeStatus(d):
    """Return a one-letter status for a diff entry.

    Uses ``change_type`` when it is set; otherwise relies on the diff's own
    new/deleted/renamed flags rather than on which kinds of lines the patch
    contains, so a modified file that only gained lines is not reported as
    an added file.
    """
    if d.change_type:
        return d.change_type
    if getattr(d, 'new_file', False):
        return 'A'
    if getattr(d, 'deleted_file', False):
        return 'D'
    if getattr(d, 'renamed_file', False):
        return 'R'
    return 'M'


def _describeChange(d):
    """Build the per-file record (path, status, first 20 added/removed lines)."""
    filePath = d.b_path if d.b_path else d.a_path
    rawPatch = d.diff.decode('utf-8', errors='replace') if d.diff else ""
    addedLines, removedLines = splitDiffs(rawPatch)
    return {
        'file': filePath,
        'status': _changeStatus(d),
        'additions': addedLines[:20],
        'removals': removedLines[:20]
    }


def extractHistory(repo, destPath, outputFile="commits_history.jsonl"):
    """Write one JSON line per relevant commit of *repo* to *outputFile*.

    Scans at most MAX_COMMITS_TO_SCAN commits and skips those that are
    authored by a name in BOT_AUTHORS, touch more than MAX_FILES_PER_COMMIT
    files, or touch no file matching CODE_EXTENSIONS.

    Args:
        repo: Repository object providing ``iter_commits`` and ``git``.
        destPath: Accepted for interface compatibility; not used here.
        outputFile: Path of the JSONL file to (over)write.

    Returns:
        The *outputFile* path that was written.
    """
    try:
        commits = list(repo.iter_commits(max_count=MAX_COMMITS_TO_SCAN))
    except (GitCommandError, ValueError):
        # Best effort: an unreadable history (presumably e.g. an empty
        # repository) yields an empty output file instead of a crash.
        commits = []

    indexedCount = 0
    with open(outputFile, 'w', encoding='utf-8') as f:
        for commit in commits:
            authorName = commit.author.name
            if authorName in BOT_AUTHORS:
                continue

            parent = _resolveParent(repo, commit)
            if parent is not None:
                diffs = parent.diff(commit, create_patch=True)
            else:
                # No usable parent: diff against the empty tree.
                # NOTE(review): `reverse=True` is forwarded to git as a flag;
                # GitPython documents `R=True` for swapping diff sides -
                # confirm this produces the intended direction.
                diffs = commit.diff(NULL_TREE, create_patch=True, reverse=True)

            changes = list(diffs)
            # Check the size limit before decoding any patch text.
            if len(changes) > MAX_FILES_PER_COMMIT:
                continue

            files_info = [_describeChange(d) for d in changes]
            if not isCodeCommit(files_info):
                continue

            filesTouched = ", ".join(fi['file'] for fi in files_info)
            embedString = (f"{commit.summary} — files: {filesTouched} — "
                           f"author: {authorName} — "
                           f"date: {commit.authored_datetime.date()}")
            commit_info = {
                'sha': commit.hexsha[:7],
                'author': authorName,
                'summary': commit.summary,
                'date': commit.authored_datetime.isoformat(),
                'changes': files_info,
                'embedText': embedString
            }
            f.write(json.dumps(commit_info) + '\n')
            indexedCount += 1

    print(f"Extracted {indexedCount} commits into {outputFile}")
    return outputFile