broadfield-dev committed on
Commit
2b890a2
·
verified ·
1 Parent(s): 5b20ea5

Create dataset_gen.py

Browse files
Files changed (1) hide show
  1. dataset_gen.py +37 -0
dataset_gen.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ from parser import parse_source_to_graph
4
+ from datetime import datetime
5
+
6
def create_hf_dataset(code_samples, output_file="software_structure_dataset.jsonl"):
    """
    Parse each code string into a graph and write the results as JSON Lines.

    Every sample is passed to ``parse_source_to_graph``. A sample whose
    parse result contains an ``"error"`` key is skipped. Each remaining
    result is read for its ``'nodes'`` (each node providing a ``'vector'``)
    and its ``'connections'``, and becomes one record with these fields:

    * ``id`` -- ``sample_<index>_<unix timestamp>``
    * ``source_code`` -- the original code string
    * ``graph_nodes`` / ``graph_edges`` -- JSON-encoded nodes / connections
    * ``structural_vectors`` -- list of the per-node vectors
    * ``node_count`` -- number of nodes

    The output file is only opened once all samples have been processed,
    and is written one JSON object per line.

    Returns the ``output_file`` path that was written.
    """
    records = []

    for position, source in enumerate(code_samples):
        parsed = parse_source_to_graph(source)

        # Only samples that parsed without an error are kept.
        if "error" not in parsed:
            nodes = parsed['nodes']

            # One vector per node; this is the core feature for training.
            node_vectors = [node['vector'] for node in nodes]

            stamp = int(datetime.now().timestamp())
            records.append({
                "id": f"sample_{position}_{stamp}",
                "source_code": source,
                "graph_nodes": json.dumps(nodes),
                "graph_edges": json.dumps(parsed['connections']),
                "structural_vectors": node_vectors,
                "node_count": len(nodes),
            })

    # Emit the collected records as JSONL: one serialized record per line.
    with open(output_file, 'w') as handle:
        handle.writelines(json.dumps(record) + '\n' for record in records)

    return output_file