girish00 committed on
Commit
971cb4b
·
verified ·
1 Parent(s): 1f05683

update endpoint helper files

Browse files
Files changed (1) hide show
  1. generate_dataset.py +77 -0
generate_dataset.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import random
4
+
5
+
6
# Seed examples from which every generated sample is drawn. Each entry is a
# dict with four string keys: "instruction", "input", "output", "explanation".
#
# NOTE(review): in the "Explain code" entry, "output" is a prose description
# rather than code, yet format_training_text() emits "output" under the JSON
# key "code". Confirm this is intended for the training target.
TEMPLATES = [
    {
        "instruction": "Fix the Python code",
        "input": "def add(a,b) return a+b",
        "output": "def add(a, b): return a + b",
        "explanation": "Added missing colon and corrected syntax.",
    },
    {
        "instruction": "Fix loop syntax",
        "input": "for i in range(5 print(i)",
        "output": "for i in range(5): print(i)",
        "explanation": "Added missing parenthesis and colon.",
    },
    {
        "instruction": "Fix condition",
        "input": "if x = 10: print(x)",
        "output": "if x == 10: print(x)",
        "explanation": "Corrected assignment to comparison operator.",
    },
    {
        "instruction": "Explain code",
        "input": "for i in range(3): print(i)",
        "output": "Prints numbers from 0 to 2.",
        "explanation": "Loop iterates from 0 to 2 and prints values.",
    },
]
32
+
33
+
34
def format_training_text(template: dict) -> str:
    """Render one template as a single prompt-plus-answer training string.

    The result has four newline-terminated lines: the instruction, the input,
    a fixed directive asking for JSON only, and the expected JSON answer.

    Args:
        template: Mapping with the keys "instruction", "input", "output" and
            "explanation".

    Returns:
        The formatted training text, ending with a newline.

    Raises:
        KeyError: If any of the four required keys is missing.
    """
    # The JSON answer exposes the template's "output" under the key "code".
    target = {
        "code": template["output"],
        "explanation": template["explanation"],
    }
    # ensure_ascii=False keeps non-ASCII characters literal instead of
    # turning them into \uXXXX escapes.
    return (
        f"Instruction: {template['instruction']}\n"
        f"Input: {template['input']}\n"
        "Return only valid JSON with keys code and explanation.\n"
        f"JSON: {json.dumps(target, ensure_ascii=False)}\n"
    )
45
+
46
+
47
def generate_sample(rng=None):
    """Build one dataset record from a randomly chosen template.

    Args:
        rng: Optional source of randomness exposing ``choice`` and
            ``uniform`` (for example a ``random.Random`` instance). When
            omitted, the module-level ``random`` functions are used, which
            is the original behaviour. Passing a seeded generator makes the
            record reproducible.

    Returns:
        A dict holding the template's "instruction", "input", "output" and
        "explanation", the rendered training "text", and two randomly drawn
        scores rounded to two decimals: "confidence" (drawn from 0.9-0.99)
        and "relevancy" (drawn from 0.85-0.99).
    """
    if rng is None:
        rng = random

    template = rng.choice(TEMPLATES)
    text = format_training_text(template)
    return {
        "instruction": template["instruction"],
        "input": template["input"],
        "output": template["output"],
        "explanation": template["explanation"],
        "text": text,
        # Draw order (choice, confidence, relevancy) is kept so a seeded
        # module-level generator yields the same sequence as before.
        "confidence": round(rng.uniform(0.9, 0.99), 2),
        "relevancy": round(rng.uniform(0.85, 0.99), 2),
    }
59
+
60
+
61
def main(argv=None):
    """Generate a synthetic dataset and write it to a JSON file.

    Command-line options:
        --size  Number of samples to generate (default 8000); must lie in
                the inclusive range 5000-10000.
        --out   Output path (default "train.json").
        --seed  Optional integer seed for the module-level random generator,
                so the same dataset can be regenerated.

    Args:
        argv: Optional list of argument strings. When None, argparse reads
            ``sys.argv[1:]``, which is the original behaviour.

    Raises:
        ValueError: If --size is outside the 5000-10000 range.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--size", type=int, default=8000)
    parser.add_argument("--out", type=str, default="train.json")
    parser.add_argument(
        "--seed",
        type=int,
        default=None,
        help="seed for the random generator (omit for a different dataset each run)",
    )
    args = parser.parse_args(argv)

    # Validate before doing any work or touching the output file.
    if not (5000 <= args.size <= 10000):
        raise ValueError("size must be between 5000 and 10000")

    if args.seed is not None:
        random.seed(args.seed)

    dataset = [generate_sample() for _ in range(args.size)]
    with open(args.out, "w", encoding="utf-8") as f:
        # ensure_ascii=False matches format_training_text() and the UTF-8
        # file encoding; output is unchanged for ASCII-only data.
        json.dump(dataset, f, indent=2, ensure_ascii=False)
    print(f"Dataset created: {len(dataset)} -> {args.out}")
74
+
75
+
76
# Run the generator only when this file is executed as a script, so that
# importing the module has no side effects.
if __name__ == "__main__":
    main()