| | from pydriller import Repository |
| | import os |
| | import json |
| | from tqdm import tqdm |
| | import re |
| | from multiprocessing import Pool |
| |
|
# Path to a local clone of the Linux kernel repository to mine.
REPO_PATH = '../linux'
# Destination for the generated dataset: JSONL, one JSON object per line.
OUTPUT_FILE = './output/linux_bugfix_dataset.jsonl'

# When True, only the first MAX_COMMITS_TEST commits are processed and the
# output file name gets a "_test" suffix (see main()).
TEST_MODE = False
MAX_COMMITS_TEST = 50
# Number of worker processes used to mine commits in parallel.
NUM_WORKERS = 16

# Case-insensitive substrings that mark a commit message as a bug fix
# (substring match, so e.g. 'fix' also matches 'prefix' — intentionally broad).
BUGFIX_KEYWORDS = [
    'fix', 'bug', 'leak', 'null', 'overflow', 'error', 'failure',
    'crash', 'panic', 'memory', 'race', 'deadlock', 'corruption',
    'security', 'vulnerability', 'exploit', 'buffer', 'stack'
]
| |
|
def is_bugfix_commit(msg):
    """Return True when the commit message contains any bug-fix keyword.

    Matching is a case-insensitive substring test against BUGFIX_KEYWORDS.
    """
    lowered = msg.lower()
    for keyword in BUGFIX_KEYWORDS:
        if keyword in lowered:
            return True
    return False
| |
|
def extract_instruction_from_commit_msg(msg):
    """Pick the first meaningful summary line from a commit message.

    Lines that are too short, contain no letters, or start with a known
    metadata/trailer prefix (Signed-off-by, Fixes:, etc.) are skipped.
    Falls back to the first line of the message, or the literal "fix"
    when the message is empty.
    """
    skip_prefixes = (
        '[patch]', 'signed-off-by', 'reviewed-by', 'tested-by', 'ack',
        'reported-by', 'cc:', 'co-authored-by', 'patchwork-id',
        'suggested-by', 'fixes:', 'link:', 'cherry picked from commit'
    )
    stripped = msg.strip()
    for raw in stripped.splitlines():
        candidate = raw.strip()
        if len(candidate) < 5:
            continue
        if not any(ch.isalpha() for ch in candidate):
            continue
        if candidate.lower().startswith(skip_prefixes):
            continue
        return candidate
    return stripped.splitlines()[0] if stripped else "fix"
| |
|
def extract_code_context(code, line_number, context_lines=10):
    """Return a window of source lines around a 1-based line number.

    `line_number` comes from a unified-diff hunk header ("@@ -N,.. ..."),
    which is 1-based; the original code used it directly as a 0-based list
    index, shifting the window one line too late and making it asymmetric.
    Here it is converted to a 0-based center, and `context_lines` lines are
    taken on each side (clamped to the file's bounds).

    Returns "" when `code` is empty/None.
    """
    if not code:
        return ""
    lines = code.split('\n')
    # Convert the 1-based diff line number to a 0-based index; clamp so a
    # degenerate "@@ -0,0 ..." header still yields a valid window.
    center = max(0, line_number - 1)
    start = max(0, center - context_lines)
    end = min(len(lines), center + context_lines + 1)
    return '\n'.join(lines[start:end])
| |
|
def extract_diff_context(diff_text, context_lines=5):
    """Trim a diff down to its changed lines plus surrounding context.

    Finds the first and last lines starting with '+' or '-' and keeps
    `context_lines` extra lines on each side. Returns "" for an empty
    diff, and the diff unchanged when no +/- lines are present.
    """
    if not diff_text:
        return ""
    rows = diff_text.split('\n')
    changed = [idx for idx, row in enumerate(rows) if row.startswith(('+', '-'))]
    if not changed:
        return diff_text
    first, last = changed[0], changed[-1]
    lo = max(0, first - context_lines)
    hi = min(len(rows), last + context_lines + 1)
    return '\n'.join(rows[lo:hi])
| |
|
def create_dataset_entry(original_code, commit_msg, diff_code):
    """Assemble one instruction-tuning record from code, message, and diff."""
    instruction = extract_instruction_from_commit_msg(commit_msg)
    record = {
        "input": {
            "original code": original_code.strip(),
            "instruction": instruction
        },
        "output": {
            "diff codes": diff_code.strip()
        }
    }
    return record
| |
|
def process_commit(commit):
    """Turn one commit into dataset entries, one per qualifying modified file.

    Returns [] for commits whose message has no bug-fix keyword. Per file,
    skips anything that is not a modified .c/.h file with both a diff and
    the pre-change source available.
    """
    if not is_bugfix_commit(commit.msg):
        return []

    # Old-file start line from a unified-diff hunk header: "@@ -N[,M] +N[,M] @@".
    hunk_pattern = re.compile(r'@@ -(\d+),?\d* \+\d+,?\d* @@')

    collected = []
    for mod in commit.modified_files:
        path = mod.new_path
        if not path or not path.endswith(('.c', '.h')):
            continue
        if mod.change_type.name != "MODIFY":
            continue
        if not mod.diff or not mod.source_code_before:
            continue

        focused_diff = extract_diff_context(mod.diff)

        # Collect the old-file start line of every hunk in this file's diff.
        starts = []
        for row in mod.diff.split('\n'):
            if row.startswith('@@'):
                matched = hunk_pattern.search(row)
                if matched:
                    starts.append(int(matched.group(1)))

        if starts:
            focused_code = extract_code_context(mod.source_code_before, starts[0])
        else:
            # No parseable hunk header: fall back to the file's first 50 lines.
            focused_code = '\n'.join(mod.source_code_before.split('\n')[:50])

        collected.append(create_dataset_entry(
            original_code=focused_code,
            commit_msg=commit.msg,
            diff_code=focused_diff
        ))

    return collected
| |
|
def collect_entries_from_hash(commit_hash):
    """Worker entry point: load one commit by hash and process it.

    Any failure (bad hash, pydriller error, missing commit) yields [] so a
    single problem commit never aborts the whole pool run.
    """
    try:
        repo = Repository(REPO_PATH, only_commits=[commit_hash])
        for commit in repo.traverse_commits():
            return process_commit(commit)
        return []
    except Exception:
        return []
| |
|
def main():
    """Mine the configured repository and write the bug-fix JSONL dataset.

    Walks every commit hash, fans the per-commit work out over a process
    pool, flattens the results, and writes one JSON object per line.
    """
    if not os.path.exists(REPO_PATH):
        print(f"[ERROR] Repository not found at: {REPO_PATH}")
        return

    os.makedirs('./output', exist_ok=True)

    print("[INFO] Building Linux kernel bug-fix dataset...")
    print(f"[INFO] Repository: {REPO_PATH}")
    print(f"[INFO] Output file: {OUTPUT_FILE}")

    # Test runs write to a separate "_test" file so real output is never clobbered.
    if TEST_MODE:
        output_file = OUTPUT_FILE.replace('.jsonl', '_test.jsonl')
    else:
        output_file = OUTPUT_FILE

    # Only hashes cross the process boundary; each worker re-opens the repo.
    all_hashes = []
    for commit in Repository(REPO_PATH).traverse_commits():
        all_hashes.append(commit.hash)
    if TEST_MODE and MAX_COMMITS_TEST:
        del all_hashes[MAX_COMMITS_TEST:]

    with Pool(NUM_WORKERS) as pool:
        per_commit = list(tqdm(
            pool.imap_unordered(collect_entries_from_hash, all_hashes),
            total=len(all_hashes)
        ))

    # Flatten the per-commit lists into one list of dataset entries.
    dataset_entries = [entry for batch in per_commit for entry in batch]

    with open(output_file, 'w', encoding='utf-8') as f:
        f.writelines(json.dumps(entry, ensure_ascii=False) + '\n'
                     for entry in dataset_entries)

    print("[DONE] Dataset creation completed!")
    print(f"[INFO] Total commits processed: {len(all_hashes)}")
    print(f"[INFO] Total dataset entries: {len(dataset_entries)}")
    print(f"[INFO] Saved to: {output_file}")

    if dataset_entries:
        print("[INFO] Sample dataset entry:")
        preview = json.dumps(dataset_entries[0], indent=2, ensure_ascii=False)
        print(preview[:800] + "...")
        print("[INFO] Dataset structure:")
        print(" - Input: original code + instruction")
        print(" - Output: diff codes")
        print(" - Format: JSONL (one JSON object per line)")
|
# Script entry point: only run the build when executed directly, not on import.
if __name__ == "__main__":
    main()
| |
|