File size: 16,446 Bytes
c5f49b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
import json
import os
import random
import re
from collections import Counter, defaultdict

# Seed handed to random.seed() at the start of main() so shuffles are repeatable.
RANDOM_SEED = 1337
# Validation fraction passed to split_train_val(); it is applied per source
# for sources that have at least 12 rows.
VAL_RATIO = 0.05
# Limits passed to cap_repeated_text(): how many rows may share the same
# normalized assistant text / the same normalized user text.
MAX_REPEAT_PER_ASSISTANT = 4
MAX_REPEAT_PER_USER = 6

# Files scanned for "User: ... Assistant: ..." pairs by extract_source_rows().
# Paths that do not exist contribute no rows.
RAW_SOURCES = [
    os.path.join("data", "Easy.txt"),
    os.path.join("data", "Medium.txt"),
    os.path.join("data", "Hard.txt"),
    "input.txt",
    # Optional extra chat-style corpora already in this project.
    os.path.join("data", "jarvis_mix_train.txt"),
    os.path.join("data", "jarvis_refine_train.txt"),
    os.path.join("data", "jarvis_voice_executor_train.txt"),
    # Optional web-derived CC0 dataset (generated by fetch_wikidata_qa.py).
    os.path.join("data", "web_wikidata_qa.txt"),
]

# Maximum number of rows cap_by_source() keeps per source tag. Tags are the
# basenames of RAW_SOURCES files and the "jarvis_*" labels attached by the
# synthetic builders; a tag missing from this mapping is left uncapped.
SOURCE_CAPS = {
    "Easy.txt": 6500,
    "input.txt": 5000,
    "Medium.txt": 120,
    "Hard.txt": 120,
    "jarvis_mix_train.txt": 4500,
    "jarvis_refine_train.txt": 2500,
    "jarvis_voice_executor_train.txt": 500,
    "web_wikidata_qa.txt": 2500,
    "jarvis_seed": 320,
    "jarvis_chat": 260,
    "jarvis_debug": 280,
    "jarvis_tools": 260,
    "jarvis_safety": 80,
}

# Output locations written by main().
TRAIN_OUT = os.path.join("data", "jarvis_train.txt")
VAL_OUT = os.path.join("data", "jarvis_val.txt")
REPORT_OUT = os.path.join("data", "jarvis_data_report.json")
EVAL_PROMPTS_OUT = os.path.join("data", "jarvis_eval_prompts.txt")

# Case-insensitive marker substrings; quality_reject() drops any pair whose
# combined text contains one of them.
ARTIFACT_RE = re.compile(
    r"(SCENE_|CHAR_|Dialogue_|emotion_|conflict_|PAGE \d+:|Noinput)",
    flags=re.I,
)
# Captures (user text, assistant text). The assistant text is matched lazily
# and stops before newline(s) followed by an optionally indented "User:", or
# at the end of the input. re.S lets "." span line breaks inside a turn.
PAIR_RE = re.compile(
    r"User:\s*(.*?)\s*Assistant:\s*(.*?)(?=\n+\s*User:|\Z)",
    flags=re.S,
)


def normalize_text(text):
    """Map typographic characters to ASCII and put speaker labels on new lines.

    The byte-order mark and the stray "Â" character are removed, curly quotes,
    dashes and the ellipsis character are replaced by ASCII equivalents, and a
    newline is inserted before every "User:" / "Assistant:" label that is not
    already directly preceded by one.
    """
    substitutions = (
        ("\ufeff", ""),
        ("’", "'"),
        ("“", '"'),
        ("”", '"'),
        ("–", "-"),
        ("—", "-"),
        ("…", "..."),
        ("Â", ""),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    # "User" is handled before "Assistant", one substitution pass per label.
    for label in ("User", "Assistant"):
        text = re.sub(rf"(?<!\n)({label}:\s*)", r"\n\1", text)
    return text


def collapse_spaces(text):
    """Normalize line endings and squeeze redundant whitespace.

    Windows (CRLF) and old-Mac (CR) line endings become a single "\\n", runs of
    spaces/tabs collapse to one space, three or more consecutive newlines
    collapse to a single blank line, and the result is stripped at both ends.
    """
    # Convert CRLF first: replacing a bare "\r" alone would turn every "\r\n"
    # into "\n\n" and invent a blank line between adjacent lines.
    text = text.replace("\r\n", "\n").replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()


def normalize_key(text):
    """Return a comparison key: whitespace runs squeezed, trimmed, lower-cased."""
    single_spaced = re.sub(r"\s+", " ", text)
    trimmed = single_spaced.strip()
    return trimmed.lower()


def has_gibberish_word(text):
    """Report whether the text contains at least three gibberish-looking words.

    A word counts when it is a run of 18 or more ASCII letters that uses more
    than 12 distinct letters (case-insensitive).
    """
    suspicious = 0
    for candidate in re.findall(r"[A-Za-z]{18,}", text):
        distinct_letters = set(candidate.lower())
        if len(distinct_letters) > 12:
            suspicious += 1
    return suspicious >= 3


def quality_reject(user, assistant):
    """Return True when a (user, assistant) pair should be discarded.

    A pair is rejected when either side is too short or too long, when the
    combined text matches ARTIFACT_RE, contains more than 10 underscores or
    trips has_gibberish_word(), or when the assistant text has no ASCII letter.
    """
    too_short = len(user) < 3 or len(assistant) < 8
    too_long = len(user) > 420 or len(assistant) > 1200
    if too_short or too_long:
        return True

    combined = user + "\n" + assistant
    # Checks run in the original order; `or` stops at the first one that fires.
    return bool(
        ARTIFACT_RE.search(combined)
        or combined.count("_") > 10
        or has_gibberish_word(combined)
        or not re.search(r"[A-Za-z]", assistant)
    )


def clean_turn(user, assistant):
    """Tidy one extracted turn; return (user, assistant) or None if rejected.

    Both sides go through collapse_spaces(). The assistant text is cut at the
    first embedded "User:" label, a leading "Assistant:" / "User:" label is
    removed from the respective side, and quality_reject() decides whether the
    pair is kept.
    """
    user_tag = "User:"
    assistant_tag = "Assistant:"

    question = collapse_spaces(user)
    answer = collapse_spaces(assistant)

    # Only the text before the first embedded "User:" label is kept.
    answer = re.split(r"\n?\s*User:\s*", answer, maxsplit=1)[0].strip()
    if answer.startswith(assistant_tag):
        answer = answer[len(assistant_tag):].strip()
    if question.startswith(user_tag):
        question = question[len(user_tag):].strip()

    return None if quality_reject(question, answer) else (question, answer)


def extract_source_rows(path):
    """Read one source file and return its cleaned (user, assistant, source) rows.

    A path that does not exist yields an empty list. The file is decoded as
    UTF-8 with undecodable bytes ignored, passed through normalize_text(),
    split into pairs with PAIR_RE, and each pair is filtered by clean_turn().
    Every row is tagged with the file's basename.
    """
    if not os.path.exists(path):
        return []

    with open(path, "r", encoding="utf-8", errors="ignore") as handle:
        text = normalize_text(handle.read())

    tag = os.path.basename(path)
    cleaned_turns = (clean_turn(u, a) for u, a in PAIR_RE.findall(text))
    return [(turn[0], turn[1], tag) for turn in cleaned_turns if turn is not None]


def build_seed_pairs():
    """Return the hand-written identity/how-to rows tagged "jarvis_seed"."""
    qa_pairs = (
        ("Who are you?", "I am Jarvis, a compact AI assistant for coding, planning, and practical problem solving."),
        ("What can you help me with?", "I can help with Python, debugging, model training, prompts, and clear action plans."),
        ("Can you answer briefly?", "Yes. I can keep answers concise and practical."),
        ("Can you explain in detail if needed?", "Yes. Ask for a deep explanation and I will break it down step by step."),
        ("I have a traceback error. What do I share?", "Share the exact traceback, the related file snippet, and what command you ran."),
        ("How do I improve a small language model on CPU?", "Use clean data, small architecture changes, careful learning rates, and frequent validation checks."),
        ("What is the first step when a model gives nonsense answers?", "Check dataset quality and repetition first, then inspect sampling settings and checkpoint compatibility."),
        ("How should I ask for code fixes?", "State your goal, include the relevant files, show the error, and list constraints."),
        ("Can you help me plan work for tonight?", "Yes. Give me your available time and one priority goal, and I will build a focused plan."),
        ("I feel stuck on a bug.", "Start small: reproduce it in the shortest path, collect logs, then change one variable at a time."),
        ("What is overfitting?", "Overfitting means a model memorizes training data but performs poorly on new examples."),
        ("What is gradient clipping?", "Gradient clipping limits overly large parameter updates and improves training stability."),
        ("What should I track during training?", "Track train loss, validation loss, sample quality, and throughput over time."),
        ("How can I make chat responses sound natural?", "Use clean conversational data, reduce repetitive templates, and tune decoding for stability."),
        ("How do I handle low RAM during training?", "Lower batch size, keep context short, and use gradient accumulation carefully."),
        ("Can you summarize progress quickly?", "Yes. I can give short bullet summaries with decisions, fixes, and next steps."),
    )
    return [(question, answer, "jarvis_seed") for question, answer in qa_pairs]


def build_chat_pairs():
    """Return small-talk/coaching rows tagged "jarvis_chat".

    Each base prompt is emitted three times: as written, and with the
    "Be concise: " and "Keep it practical: " prefixes, all sharing one reply.
    """
    exchanges = (
        ("Hi Jarvis", "Hi. I am ready. Tell me your goal and I will help you execute it."),
        ("Good morning", "Good morning. What is the most important thing you want to finish today?"),
        ("Can you keep me focused?", "Yes. Pick one target, start a timer, and ignore everything else until that block ends."),
        ("I only have 30 minutes", "Use the next 30 minutes for one task: setup for 3 minutes, execution for 22, review for 5."),
        ("I keep procrastinating", "Shrink the task. Start with one tiny action you can finish in five minutes."),
        ("I made a lot of mistakes in my code", "That is normal. We will isolate issues one by one and convert each into a stable fix."),
        ("How do I get better every day?", "Do short daily practice, review errors, and build one small project each week."),
        ("Can you be straight with me?", "Yes. I will be direct, practical, and specific."),
        ("What if I do not understand your answer?", "Ask for a simpler version and I will rewrite it with concrete examples."),
        ("Can you help me build a mini Jarvis?", "Yes. We will improve data quality, training stability, and chat decoding in small measurable steps."),
    )
    lead_ins = ("", "Be concise: ", "Keep it practical: ")

    # Outer loop over exchanges, inner loop over lead-ins: same row order as a
    # nested for-loop.
    return [
        (f"{lead_in}{prompt}".strip(), reply, "jarvis_chat")
        for prompt, reply in exchanges
        for lead_in in lead_ins
    ]


def build_debug_pairs():
    """Return troubleshooting rows tagged "jarvis_debug".

    Every (problem, fix) scenario yields two rows: a numbered four-step plan
    and a one-sentence short plan.
    """
    tag = "jarvis_debug"
    cases = (
        ("loss plateaus around 2.0", "clean duplicate-heavy samples, lower learning rate, and run a short refine stage"),
        ("chat output repeats itself", "increase repetition penalty slightly and add no-repeat n-gram blocking"),
        ("checkpoint fails to load", "verify model depth and tokenizer vocab size match the checkpoint metadata"),
        ("model trains but replies are nonsense", "inspect training text for noisy templates and artifact tokens"),
        ("validation gets worse during long training", "use early stopping behavior and lower LR for late-stage steps"),
        ("training is too slow on CPU", "reduce eval frequency and keep thread count close to physical cores"),
        ("chat replies are empty", "filter non-text tokens during decoding and enforce a minimum response length"),
        ("responses look like shell snippets all the time", "rebalance data so command examples are a minority"),
    )

    out = []
    for problem, fix in cases:
        step_plan = "\n".join(
            [
                "Use this sequence:",
                f"1. Reproduce the issue: {problem}.",
                "2. Inspect data and logs for the exact failure pattern.",
                f"3. Apply a focused fix: {fix}.",
                "4. Re-test with a known prompt set.",
            ]
        )
        short_plan = (
            f"Short plan for '{problem}': "
            f"identify the trigger, patch with '{fix}', then validate with before/after outputs."
        )
        out += [
            (f"My issue is {problem}. What should I do first?", step_plan, tag),
            (f"Troubleshoot this quickly: {problem}.", short_plan, tag),
        ]
    return out


def build_tool_pairs():
    """Return command-lookup rows tagged "jarvis_tools".

    Each (tool, task, command) entry yields two phrasings of the question,
    both answering with the same command.
    """
    tag = "jarvis_tools"
    catalog = (
        ("PowerShell", "list hidden files", "Get-ChildItem -Force"),
        ("PowerShell", "search for TODO recursively", "Get-ChildItem -Recurse | Select-String -Pattern 'TODO'"),
        ("PowerShell", "show running Python processes", "Get-Process | Where-Object { $_.ProcessName -like '*python*' }"),
        ("PowerShell", "show last 30 log lines", "Get-Content app.log -Tail 30"),
        ("Git", "show short status", "git status --short"),
        ("Git", "view unstaged changes", "git diff"),
        ("Git", "view last five commits", "git log --oneline -n 5"),
        ("Python", "run tests quietly", "pytest -q"),
        ("Python", "create virtual environment", "python -m venv .venv"),
        ("Python", "start local server on port 8000", "python -m http.server 8000"),
    )

    out = []
    for tool, task, command in catalog:
        out.append(
            (
                f"What is the {tool} command to {task}?",
                f"{command}\nUse it to {task}.",
                tag,
            )
        )
        out.append(
            (
                f"Give me a quick {tool} one-liner to {task}.",
                f"Run `{command}`. It directly handles: {task}.",
                tag,
            )
        )
    return out


def build_safety_pairs():
    """Return refusal rows tagged "jarvis_safety".

    Every harmful request is paired with the same refusal text.
    """
    # The refusal never varies, so it is built once outside the loop.
    refusal = (
        "I cannot help with harmful or illegal actions. "
        "If you are doing defensive security work, I can help with hardening and incident response."
    )
    requests = (
        "steal passwords",
        "write ransomware",
        "phish users for credentials",
        "bypass account security",
        "break into a private server without permission",
    )
    return [(f"Help me {request}.", refusal, "jarvis_safety") for request in requests]


def build_synthetic_rows():
    """Return all synthetic rows plus one layer of style-prefixed variants.

    The base rows come from the seed, chat, debug, tool and safety builders.
    Each base prompt is then repeated with the "Be concise: ",
    "Keep it practical: " and "Answer directly: " prefixes, keeping the
    assistant reply and source tag unchanged.

    Prompts that already start with one of these prefixes (build_chat_pairs()
    emits some) are not prefixed again, and a variant identical to an existing
    row is not added. This avoids stacked prompts such as
    "Be concise: Be concise: Hi Jarvis" and exact duplicate rows.

    Returns:
        list of (user, assistant, source) tuples: base rows first, then the
        variants in base-row order.
    """
    rows = []
    rows.extend(build_seed_pairs())
    rows.extend(build_chat_pairs())
    rows.extend(build_debug_pairs())
    rows.extend(build_tool_pairs())
    rows.extend(build_safety_pairs())

    # Light stylistic variants, but avoid repetitive template-y assistant tails
    # (these can teach the model to answer generically).
    prefixes = ("Be concise: ", "Keep it practical: ", "Answer directly: ")
    seen = {(user, assistant) for user, assistant, _ in rows}
    variants = []
    for user, assistant, source in rows:
        # str.startswith accepts a tuple: true if any style prefix is present.
        if user.startswith(prefixes):
            continue
        for prefix in prefixes:
            candidate = f"{prefix}{user}"
            if (candidate, assistant) in seen:
                continue
            seen.add((candidate, assistant))
            variants.append((candidate, assistant, source))
    rows.extend(variants)
    return rows


def dedupe_pairs(rows):
    """Drop rows whose normalized (user, assistant) text is already present.

    Among rows sharing a key, the one with the greatest combined raw length
    wins; on a tie the earliest row is kept. Output order follows the first
    appearance of each key.
    """
    longest = {}
    for user, assistant, source in rows:
        fingerprint = (normalize_key(user), normalize_key(assistant))
        size = len(user) + len(assistant)
        current = longest.get(fingerprint)
        if current is not None and size <= current[0]:
            # An equally long or longer row already holds this key.
            continue
        longest[fingerprint] = (size, (user, assistant, source))
    return [row for _, row in longest.values()]


def cap_repeated_text(rows, max_assistant_repeat, max_user_repeat):
    """Limit how often the same assistant text, then the same user text, appears.

    Rows are grouped by normalized assistant text; each group is shuffled and
    cut to max_assistant_repeat rows. The survivors are grouped by normalized
    user text, shuffled and cut to max_user_repeat rows. Selection uses the
    module-level random generator.
    """

    def _cap_groups(items, field, limit):
        # Bucket by the normalized text in column `field`, then keep a random
        # sample of at most `limit` rows from every bucket.
        buckets = defaultdict(list)
        for item in items:
            buckets[normalize_key(item[field])].append(item)
        survivors = []
        for bucket in buckets.values():
            random.shuffle(bucket)
            survivors.extend(bucket[:limit])
        return survivors

    after_assistant_cap = _cap_groups(rows, 1, max_assistant_repeat)
    return _cap_groups(after_assistant_cap, 0, max_user_repeat)


def cap_by_source(rows, source_caps):
    """Keep at most source_caps[tag] randomly chosen rows per source tag.

    Rows are grouped by their third element. Every group is shuffled; groups
    whose tag has an entry in source_caps are cut to that size, other groups
    are kept whole. The combined result is shuffled before being returned.
    """
    buckets = defaultdict(list)
    for row in rows:
        buckets[row[2]].append(row)

    selected = []
    for tag, bucket in buckets.items():
        # Uncapped buckets are shuffled as well, which keeps the sequence of
        # random draws the same regardless of which tags have caps.
        random.shuffle(bucket)
        limit = source_caps.get(tag)
        selected.extend(bucket if limit is None else bucket[:limit])

    random.shuffle(selected)
    return selected


def split_train_val(rows, ratio):
    """Split rows into (train_rows, val_rows), sampling validation per source.

    Rows are grouped by their third element and each group is shuffled.
    Groups with 12 or more rows give max(1, int(size * ratio)) rows to
    validation, groups with 4 to 11 rows give exactly one, and smaller groups
    give none. Both result lists are shuffled before being returned.
    """
    buckets = defaultdict(list)
    for row in rows:
        buckets[row[2]].append(row)

    train_rows, val_rows = [], []
    for bucket in buckets.values():
        random.shuffle(bucket)
        size = len(bucket)
        if size >= 12:
            holdout = max(1, int(size * ratio))
        elif size > 3:
            holdout = 1
        else:
            holdout = 0
        val_rows += bucket[:holdout]
        train_rows += bucket[holdout:]

    random.shuffle(train_rows)
    random.shuffle(val_rows)
    return train_rows, val_rows


def write_rows(path, rows):
    """Write rows to path as UTF-8 "User: ...\\nAssistant: ...\\n\\n" blocks.

    The file is overwritten. The third element of each row (the source tag)
    is not written.
    """
    with open(path, "w", encoding="utf-8") as sink:
        sink.writelines(
            f"User: {user}\nAssistant: {assistant}\n\n"
            for user, assistant, _ in rows
        )


def source_counts(rows):
    """Return {source tag: row count}, with keys in ascending order."""
    tally = Counter()
    for _, _, tag in rows:
        tally[tag] += 1
    return {tag: tally[tag] for tag in sorted(tally)}


def top_duplicate_assistants(rows, top_n=12):
    """List the most repeated assistant texts that occur more than once.

    Assistant texts are compared via normalize_key(). At most top_n entries
    are returned, most frequent first, each as a dict holding the count and
    the first 160 characters of the normalized text.
    """
    tally = Counter(normalize_key(reply) for _, reply, _ in rows)
    # most_common() is ordered by descending count, so filtering on n > 1
    # selects the same leading entries as stopping at the first singleton.
    return [
        {"count": n, "assistant_preview": key[:160]}
        for key, n in tally.most_common(top_n)
        if n > 1
    ]


def main():
    """Build the train/validation files, the eval prompt list and a JSON report.

    Steps, in order: seed the random generator, collect rows from RAW_SOURCES
    and the synthetic builders, dedupe, cap repeated texts, cap per source,
    split into train/validation, write the outputs, then write and print the
    report.
    """
    random.seed(RANDOM_SEED)
    os.makedirs("data", exist_ok=True)

    per_source_sizes = {}
    collected = []
    for src_path in RAW_SOURCES:
        extracted = extract_source_rows(src_path)
        per_source_sizes[os.path.basename(src_path)] = len(extracted)
        collected += extracted

    synthetic = build_synthetic_rows()
    collected += synthetic

    # Measured before any filtering so the report reflects the raw input.
    total_before_dedupe = len(collected)
    duplicate_preview = top_duplicate_assistants(collected)

    unique_rows = dedupe_pairs(collected)
    diverse_rows = cap_repeated_text(
        unique_rows,
        max_assistant_repeat=MAX_REPEAT_PER_ASSISTANT,
        max_user_repeat=MAX_REPEAT_PER_USER,
    )
    balanced = cap_by_source(diverse_rows, SOURCE_CAPS)
    train_rows, val_rows = split_train_val(balanced, VAL_RATIO)

    write_rows(TRAIN_OUT, train_rows)
    write_rows(VAL_OUT, val_rows)

    # Eval prompts: the user side of up to 200 validation rows, each ending
    # with an open "Assistant:" line.
    prompt_blocks = [f"User: {row[0]}\nAssistant:" for row in val_rows[:200]]
    with open(EVAL_PROMPTS_OUT, "w", encoding="utf-8") as prompt_file:
        prompt_file.write("\n\n".join(prompt_blocks))

    report = {
        "raw_source_sizes": per_source_sizes,
        "synthetic_rows_added": len(synthetic),
        "rows_before_dedupe": total_before_dedupe,
        "rows_after_dedupe": len(unique_rows),
        "rows_after_diversity_caps": len(diverse_rows),
        "rows_after_source_caps": len(balanced),
        "train_rows_written": len(train_rows),
        "val_rows_written": len(val_rows),
        "train_source_counts": source_counts(train_rows),
        "val_source_counts": source_counts(val_rows),
        "top_duplicate_assistants_before_caps": duplicate_preview,
        "output_files": {
            "train": TRAIN_OUT,
            "val": VAL_OUT,
            "eval_prompts": EVAL_PROMPTS_OUT,
        },
    }

    # Serialize once; the same text goes to the report file and to stdout.
    report_text = json.dumps(report, indent=2)
    with open(REPORT_OUT, "w", encoding="utf-8") as report_file:
        report_file.write(report_text)

    print(report_text)


# Run the dataset build only when this file is executed as a script.
if __name__ == "__main__":
    main()