import json
import os
import random
import re
from collections import Counter, defaultdict
|
|
# Seed passed to random.seed() in main() so shuffles, caps and splits repeat
# from run to run.
RANDOM_SEED = 1337
# Fraction of each source's rows held out for validation (applied per source
# by split_train_val for sources with at least 12 rows).
VAL_RATIO = 0.05
# Maximum number of rows kept per distinct (normalised) assistant reply.
MAX_REPEAT_PER_ASSISTANT = 4
# Maximum number of rows kept per distinct (normalised) user prompt.
MAX_REPEAT_PER_USER = 6
|
|
# Text files scanned for "User: ... Assistant: ..." pairs. Missing files are
# skipped by extract_source_rows, so the list may name files that do not exist.
# NOTE(review): the three groups below follow the blank-line grouping of the
# original list; what each group represents is inferred from file names only.
RAW_SOURCES = [
    os.path.join("data", "Easy.txt"),
    os.path.join("data", "Medium.txt"),
    os.path.join("data", "Hard.txt"),
    "input.txt",

    os.path.join("data", "jarvis_mix_train.txt"),
    os.path.join("data", "jarvis_refine_train.txt"),
    os.path.join("data", "jarvis_voice_executor_train.txt"),

    os.path.join("data", "web_wikidata_qa.txt"),
]
|
|
# Upper bound on rows kept per source label (see cap_by_source). Keys are
# either the basename of a raw source file or the label a synthetic builder
# attaches to its rows; a source with no entry here is left uncapped.
SOURCE_CAPS = {
    "Easy.txt": 6500,
    "input.txt": 5000,
    "Medium.txt": 120,
    "Hard.txt": 120,
    "jarvis_mix_train.txt": 4500,
    "jarvis_refine_train.txt": 2500,
    "jarvis_voice_executor_train.txt": 500,
    "web_wikidata_qa.txt": 2500,
    "jarvis_seed": 320,
    "jarvis_chat": 260,
    "jarvis_debug": 280,
    "jarvis_tools": 260,
    "jarvis_safety": 80,
}
|
|
# Files written by main(); all of them live under the "data" directory.
TRAIN_OUT = os.path.join("data", "jarvis_train.txt")
VAL_OUT = os.path.join("data", "jarvis_val.txt")
REPORT_OUT = os.path.join("data", "jarvis_data_report.json")
EVAL_PROMPTS_OUT = os.path.join("data", "jarvis_eval_prompts.txt")
|
|
# Case-insensitive markers that cause quality_reject to drop a pair when any
# of them appears in the user or assistant text.
ARTIFACT_RE = re.compile(
    r"(SCENE_|CHAR_|Dialogue_|emotion_|conflict_|PAGE \d+:|Noinput)",
    flags=re.I,
)
# One conversational turn: group 1 is the user text, group 2 the assistant
# text. The assistant text runs lazily up to the next "User:" that starts
# after at least one newline, or to the end of the input. re.S lets "." cross
# line breaks so multi-line turns are captured whole.
PAIR_RE = re.compile(
    r"User:\s*(.*?)\s*Assistant:\s*(.*?)(?=\n+\s*User:|\Z)",
    flags=re.S,
)
|
|
|
|
def normalize_text(text):
    """Repair known encoding artefacts and put turn markers on their own line.

    Args:
        text: Raw file contents.

    Returns:
        The text with the byte-order mark removed, the listed mis-decoded
        punctuation sequences replaced by ASCII equivalents, and a newline
        inserted before every "User:" / "Assistant:" marker that is not
        already directly preceded by one (this includes a marker at the very
        start of the text).
    """
    # Keys are the mis-decoded sequences exactly as they appear in the input
    # files. Replacements run in insertion order, so the bare "Â" cleanup is
    # applied last, after the longer sequences that contain it.
    replacements = {
        "\ufeff": "",
        "’": "'",
        "“": '"',
        "â€Â": '"',
        "–": "-",
        "â€â€": "-",
        "…": "...",
        "Â": "",
    }
    for src, dst in replacements.items():
        text = text.replace(src, dst)

    # Turns glued onto one line ("... Assistant: x User: y") are split so the
    # pair regex, which expects a newline before the next "User:", can see them.
    text = re.sub(r"(?<!\n)(User:\s*)", r"\n\1", text)
    text = re.sub(r"(?<!\n)(Assistant:\s*)", r"\n\1", text)
    return text
|
|
|
|
def collapse_spaces(text):
    """Normalise whitespace inside a single turn.

    Carriage returns become newlines, runs of spaces/tabs become one space,
    three or more consecutive newlines are reduced to two, and leading and
    trailing whitespace is stripped.

    Args:
        text: Turn text to tidy.

    Returns:
        The tidied text.
    """
    text = text.replace("\r", "\n")
    text = re.sub(r"[ \t]+", " ", text)
    text = re.sub(r"\n{3,}", "\n\n", text)
    return text.strip()
|
|
|
|
def normalize_key(text):
    """Return a comparison key for *text*.

    All whitespace runs collapse to a single space, the ends are stripped and
    the result is lower-cased, so texts differing only in spacing or letter
    case map to the same key.
    """
    return re.sub(r"\s+", " ", text).strip().lower()
|
|
|
|
def has_gibberish_word(text):
    """Report whether *text* contains several long, high-variety letter runs.

    A run counts when it is at least 18 consecutive ASCII letters and uses
    more than 12 distinct letters (case-insensitive). The text is flagged
    once three or more such runs are present.

    Args:
        text: Text to inspect.

    Returns:
        True if at least three qualifying runs are found, else False.
    """
    long_runs = re.findall(r"[A-Za-z]{18,}", text)
    suspicious = sum(1 for run in long_runs if len(set(run.lower())) > 12)
    return suspicious >= 3
|
|
|
|
def quality_reject(user, assistant):
    """Decide whether a cleaned (user, assistant) pair should be discarded.

    A pair is rejected when any of the following holds:
      * the user text is shorter than 3 characters or the assistant text is
        shorter than 8;
      * the user text is longer than 420 characters or the assistant text is
        longer than 1200;
      * either text matches ARTIFACT_RE;
      * the two texts together contain more than 10 underscores;
      * has_gibberish_word flags the combined text;
      * the assistant text contains no ASCII letter.

    Args:
        user: Cleaned user text.
        assistant: Cleaned assistant text.

    Returns:
        True to reject the pair, False to keep it.
    """
    joined = f"{user}\n{assistant}"
    if len(user) < 3 or len(assistant) < 8:
        return True
    if len(user) > 420 or len(assistant) > 1200:
        return True
    if ARTIFACT_RE.search(joined):
        return True
    if joined.count("_") > 10:
        return True
    if has_gibberish_word(joined):
        return True
    if not re.search(r"[A-Za-z]", assistant):
        return True
    return False
|
|
|
|
def clean_turn(user, assistant):
    """Tidy one extracted turn and apply the quality filter.

    Both texts have their whitespace collapsed. The assistant text is cut at
    the first embedded "User:" marker, and a leading "Assistant:" / "User:"
    label left over from extraction is removed from the respective text.

    Args:
        user: Raw user text captured by the pair regex.
        assistant: Raw assistant text captured by the pair regex.

    Returns:
        A ``(user, assistant)`` tuple of cleaned strings, or ``None`` when
        quality_reject rejects the pair.
    """
    user = collapse_spaces(user)
    assistant = collapse_spaces(assistant)

    # Anything after an embedded "User:" belongs to a following turn.
    assistant = re.split(r"\n?\s*User:\s*", assistant)[0].strip()
    if assistant.startswith("Assistant:"):
        assistant = assistant[len("Assistant:"):].strip()
    if user.startswith("User:"):
        user = user[len("User:"):].strip()

    if quality_reject(user, assistant):
        return None
    return user, assistant
|
|
|
|
def extract_source_rows(path):
    """Read one raw text file and return its usable conversation pairs.

    Args:
        path: Path of the text file to scan.

    Returns:
        A list of ``(user, assistant, source)`` tuples, where ``source`` is
        the file's basename. The list is empty when *path* is not an existing
        regular file or when no pair survives cleaning.
    """
    # isfile (rather than exists) also skips directories, which would
    # otherwise make open() raise.
    if not os.path.isfile(path):
        return []

    # errors="ignore" silently drops bytes that are not valid UTF-8, so one
    # corrupt byte does not abort the whole file.
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        raw = normalize_text(f.read())

    out = []
    source = os.path.basename(path)
    for user, assistant in PAIR_RE.findall(raw):
        cleaned = clean_turn(user, assistant)
        if cleaned is None:
            continue
        out.append((cleaned[0], cleaned[1], source))
    return out
|
|
|
|
def build_seed_pairs():
    """Return the hand-written identity/how-to pairs labelled "jarvis_seed".

    Returns:
        A list of ``(user, assistant, "jarvis_seed")`` tuples, one per entry
        of the literal table below, in table order.
    """
    pairs = [
        ("Who are you?", "I am Jarvis, a compact AI assistant for coding, planning, and practical problem solving."),
        ("What can you help me with?", "I can help with Python, debugging, model training, prompts, and clear action plans."),
        ("Can you answer briefly?", "Yes. I can keep answers concise and practical."),
        ("Can you explain in detail if needed?", "Yes. Ask for a deep explanation and I will break it down step by step."),
        ("I have a traceback error. What do I share?", "Share the exact traceback, the related file snippet, and what command you ran."),
        ("How do I improve a small language model on CPU?", "Use clean data, small architecture changes, careful learning rates, and frequent validation checks."),
        ("What is the first step when a model gives nonsense answers?", "Check dataset quality and repetition first, then inspect sampling settings and checkpoint compatibility."),
        ("How should I ask for code fixes?", "State your goal, include the relevant files, show the error, and list constraints."),
        ("Can you help me plan work for tonight?", "Yes. Give me your available time and one priority goal, and I will build a focused plan."),
        ("I feel stuck on a bug.", "Start small: reproduce it in the shortest path, collect logs, then change one variable at a time."),
        ("What is overfitting?", "Overfitting means a model memorizes training data but performs poorly on new examples."),
        ("What is gradient clipping?", "Gradient clipping limits overly large parameter updates and improves training stability."),
        ("What should I track during training?", "Track train loss, validation loss, sample quality, and throughput over time."),
        ("How can I make chat responses sound natural?", "Use clean conversational data, reduce repetitive templates, and tune decoding for stability."),
        ("How do I handle low RAM during training?", "Lower batch size, keep context short, and use gradient accumulation carefully."),
        ("Can you summarize progress quickly?", "Yes. I can give short bullet summaries with decisions, fixes, and next steps."),
    ]
    return [(user, assistant, "jarvis_seed") for user, assistant in pairs]
|
|
|
|
def build_chat_pairs():
    """Return small-talk/coaching pairs labelled "jarvis_chat".

    Each base prompt is emitted three times: unchanged, with a "Be concise: "
    prefix and with a "Keep it practical: " prefix, always paired with the
    same reply.

    Returns:
        A list of ``(user, assistant, "jarvis_chat")`` tuples, grouped by base
        prompt in table order, with the three prefix variants in the order
        listed above.
    """
    pairs = [
        ("Hi Jarvis", "Hi. I am ready. Tell me your goal and I will help you execute it."),
        ("Good morning", "Good morning. What is the most important thing you want to finish today?"),
        ("Can you keep me focused?", "Yes. Pick one target, start a timer, and ignore everything else until that block ends."),
        ("I only have 30 minutes", "Use the next 30 minutes for one task: setup for 3 minutes, execution for 22, review for 5."),
        ("I keep procrastinating", "Shrink the task. Start with one tiny action you can finish in five minutes."),
        ("I made a lot of mistakes in my code", "That is normal. We will isolate issues one by one and convert each into a stable fix."),
        ("How do I get better every day?", "Do short daily practice, review errors, and build one small project each week."),
        ("Can you be straight with me?", "Yes. I will be direct, practical, and specific."),
        ("What if I do not understand your answer?", "Ask for a simpler version and I will rewrite it with concrete examples."),
        ("Can you help me build a mini Jarvis?", "Yes. We will improve data quality, training stability, and chat decoding in small measurable steps."),
    ]

    prefixes = ["", "Be concise: ", "Keep it practical: "]
    return [
        (f"{prefix}{user}".strip(), assistant, "jarvis_chat")
        for user, assistant in pairs
        for prefix in prefixes
    ]
|
|
|
|
def build_debug_pairs():
    """Return templated troubleshooting pairs labelled "jarvis_debug".

    Every (problem, fix) scenario yields two rows: a four-step numbered plan
    and a one-sentence short plan.

    Returns:
        A list of ``(user, assistant, "jarvis_debug")`` tuples; for each
        scenario the numbered-plan row comes first, then the short-plan row.
    """
    scenarios = [
        ("loss plateaus around 2.0", "clean duplicate-heavy samples, lower learning rate, and run a short refine stage"),
        ("chat output repeats itself", "increase repetition penalty slightly and add no-repeat n-gram blocking"),
        ("checkpoint fails to load", "verify model depth and tokenizer vocab size match the checkpoint metadata"),
        ("model trains but replies are nonsense", "inspect training text for noisy templates and artifact tokens"),
        ("validation gets worse during long training", "use early stopping behavior and lower LR for late-stage steps"),
        ("training is too slow on CPU", "reduce eval frequency and keep thread count close to physical cores"),
        ("chat replies are empty", "filter non-text tokens during decoding and enforce a minimum response length"),
        ("responses look like shell snippets all the time", "rebalance data so command examples are a minority"),
    ]

    rows = []
    for problem, fix in scenarios:
        detailed_user = f"My issue is {problem}. What should I do first?"
        detailed_reply = (
            "Use this sequence:\n"
            f"1. Reproduce the issue: {problem}.\n"
            "2. Inspect data and logs for the exact failure pattern.\n"
            f"3. Apply a focused fix: {fix}.\n"
            "4. Re-test with a known prompt set."
        )
        rows.append((detailed_user, detailed_reply, "jarvis_debug"))

        quick_user = f"Troubleshoot this quickly: {problem}."
        quick_reply = (
            f"Short plan for '{problem}': "
            f"identify the trigger, patch with '{fix}', then validate with before/after outputs."
        )
        rows.append((quick_user, quick_reply, "jarvis_debug"))
    return rows
|
|
|
|
def build_tool_pairs():
    """Return command-lookup pairs labelled "jarvis_tools".

    Every (tool, task, command) entry yields two rows: a "What is the ...
    command" question answered with the bare command plus a usage line, and a
    "quick one-liner" request answered with the command in backticks.

    Returns:
        A list of ``(user, assistant, "jarvis_tools")`` tuples; for each entry
        the question row comes first, then the one-liner row.
    """
    tasks = [
        ("PowerShell", "list hidden files", "Get-ChildItem -Force"),
        ("PowerShell", "search for TODO recursively", "Get-ChildItem -Recurse | Select-String -Pattern 'TODO'"),
        ("PowerShell", "show running Python processes", "Get-Process | Where-Object { $_.ProcessName -like '*python*' }"),
        ("PowerShell", "show last 30 log lines", "Get-Content app.log -Tail 30"),
        ("Git", "show short status", "git status --short"),
        ("Git", "view unstaged changes", "git diff"),
        ("Git", "view last five commits", "git log --oneline -n 5"),
        ("Python", "run tests quietly", "pytest -q"),
        ("Python", "create virtual environment", "python -m venv .venv"),
        ("Python", "start local server on port 8000", "python -m http.server 8000"),
    ]

    rows = []
    for tool, task, command in tasks:
        question = f"What is the {tool} command to {task}?"
        answer = f"{command}\nUse it to {task}."
        rows.append((question, answer, "jarvis_tools"))

        quick_question = f"Give me a quick {tool} one-liner to {task}."
        quick_answer = f"Run `{command}`. It directly handles: {task}."
        rows.append((quick_question, quick_answer, "jarvis_tools"))
    return rows
|
|
|
|
def build_safety_pairs():
    """Return refusal pairs labelled "jarvis_safety".

    Each listed request is phrased as "Help me <request>." and paired with
    the same fixed refusal that offers defensive-security help instead.

    Returns:
        A list of ``(user, assistant, "jarvis_safety")`` tuples in list order.
    """
    harmful = [
        "steal passwords",
        "write ransomware",
        "phish users for credentials",
        "bypass account security",
        "break into a private server without permission",
    ]
    # The reply does not depend on the request, so it is built once.
    refusal = (
        "I cannot help with harmful or illegal actions. "
        "If you are doing defensive security work, I can help with hardening and incident response."
    )
    return [(f"Help me {item}.", refusal, "jarvis_safety") for item in harmful]
|
|
|
|
def build_synthetic_rows():
    """Collect every synthetic pair and add instruction-prefixed variants.

    The rows from the seed, chat, debug, tool and safety builders are
    gathered, then each row whose prompt does not already start with one of
    the instruction prefixes gets three extra copies, one per prefix, with
    the same reply and source label.

    Returns:
        A list of ``(user, assistant, source)`` tuples: all base rows first,
        followed by the prefixed variants.
    """
    rows = []
    rows.extend(build_seed_pairs())
    rows.extend(build_chat_pairs())
    rows.extend(build_debug_pairs())
    rows.extend(build_tool_pairs())
    rows.extend(build_safety_pairs())

    prefixes = ("Be concise: ", "Keep it practical: ", "Answer directly: ")
    variants = []
    for user, assistant, source in rows:
        # Some builders already emit prefixed prompts; prefixing those again
        # would produce stacked prompts such as "Be concise: Be concise: ...".
        if user.startswith(prefixes):
            continue
        for prefix in prefixes:
            variants.append((f"{prefix}{user}", assistant, source))
    rows.extend(variants)
    return rows
|
|
|
|
def dedupe_pairs(rows):
    """Drop rows whose user and assistant texts match after normalisation.

    Two rows are duplicates when normalize_key gives the same result for
    both their user texts and their assistant texts. Among duplicates the row
    with the greatest combined raw length is kept; on a tie the earliest one
    wins.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.

    Returns:
        A list of the surviving tuples, ordered by the first appearance of
        each distinct key.
    """
    best = {}
    for user, assistant, source in rows:
        key = (normalize_key(user), normalize_key(assistant))
        score = len(user) + len(assistant)
        prev = best.get(key)
        if prev is None or score > prev[0]:
            best[key] = (score, user, assistant, source)
    return [(user, assistant, source) for _, user, assistant, source in best.values()]
|
|
|
|
def cap_repeated_text(rows, max_assistant_repeat, max_user_repeat):
    """Limit how often one reply, and then one prompt, may appear.

    Rows are first grouped by normalised assistant text; each group is
    shuffled and cut to *max_assistant_repeat* rows. The survivors are then
    grouped by normalised user text, shuffled and cut to *max_user_repeat*.
    Shuffling uses the module-level ``random`` state, so the selection is
    repeatable only when the caller has seeded it.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.
        max_assistant_repeat: Rows kept per distinct assistant text.
        max_user_repeat: Rows kept per distinct user text.

    Returns:
        A list of the rows that survive both caps.
    """
    by_assistant = defaultdict(list)
    for row in rows:
        by_assistant[normalize_key(row[1])].append(row)

    kept = []
    for items in by_assistant.values():
        random.shuffle(items)
        kept.extend(items[:max_assistant_repeat])

    by_user = defaultdict(list)
    for row in kept:
        by_user[normalize_key(row[0])].append(row)

    out = []
    for items in by_user.values():
        random.shuffle(items)
        out.extend(items[:max_user_repeat])
    return out
|
|
|
|
def cap_by_source(rows, source_caps):
    """Limit the number of rows kept from each source and shuffle the result.

    Rows are grouped by their source label (third tuple element). Every group
    is shuffled; a group whose label has an entry in *source_caps* is then
    cut to that many rows, while other groups are kept whole. The combined
    result is shuffled once more. Shuffling uses the module-level ``random``
    state.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.
        source_caps: Mapping from source label to the maximum rows to keep.

    Returns:
        A list of the retained rows in shuffled order.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[row[2]].append(row)

    out = []
    for source, items in grouped.items():
        cap = source_caps.get(source)
        # Shuffle even uncapped groups so the random sequence does not depend
        # on which sources have a cap.
        random.shuffle(items)
        if cap is not None:
            items = items[:cap]
        out.extend(items)
    random.shuffle(out)
    return out
|
|
|
|
def split_train_val(rows, ratio):
    """Split rows into train and validation sets, source by source.

    Rows are grouped by source label and each group is shuffled. The number
    of validation rows taken from a group is:
      * 0 when the group has 3 rows or fewer;
      * 1 when it has 4 to 11 rows;
      * ``max(1, int(len(group) * ratio))`` when it has 12 rows or more.
    Both result lists are shuffled before being returned. Shuffling uses the
    module-level ``random`` state.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.
        ratio: Validation fraction applied to groups of 12 rows or more.

    Returns:
        A ``(train_rows, val_rows)`` tuple of lists.
    """
    grouped = defaultdict(list)
    for row in rows:
        grouped[row[2]].append(row)

    train_rows = []
    val_rows = []
    for items in grouped.values():
        random.shuffle(items)
        if len(items) < 12:
            val_n = 1 if len(items) > 3 else 0
        else:
            val_n = max(1, int(len(items) * ratio))
        val_rows.extend(items[:val_n])
        train_rows.extend(items[val_n:])

    random.shuffle(train_rows)
    random.shuffle(val_rows)
    return train_rows, val_rows
|
|
|
|
def write_rows(path, rows):
    """Write rows to *path* as "User:" / "Assistant:" blocks.

    Each row becomes ``User: <user>``, a newline, ``Assistant: <assistant>``
    and a blank line; the source label is not written. The file is created or
    overwritten and encoded as UTF-8.

    Args:
        path: Destination file path.
        rows: Iterable of ``(user, assistant, source)`` tuples.
    """
    with open(path, "w", encoding="utf-8") as f:
        for user, assistant, _ in rows:
            f.write(f"User: {user}\nAssistant: {assistant}\n\n")
|
|
|
|
def source_counts(rows):
    """Count rows per source label.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.

    Returns:
        A dict mapping each source label to its row count, with keys inserted
        in ascending label order.
    """
    counter = Counter(source for _, _, source in rows)
    return {label: counter[label] for label in sorted(counter)}
|
|
|
|
def top_duplicate_assistants(rows, top_n=12):
    """List the most frequently repeated assistant replies.

    Replies are compared after normalize_key; only replies occurring more
    than once are reported.

    Args:
        rows: Iterable of ``(user, assistant, source)`` tuples.
        top_n: Maximum number of entries to report.

    Returns:
        A list of dicts, most frequent first, each with ``"count"`` and
        ``"assistant_preview"`` (the first 160 characters of the normalised
        reply).
    """
    counter = Counter(normalize_key(a) for _, a, _ in rows)
    out = []
    for text_key, count in counter.most_common(top_n):
        # most_common is sorted by count, so the first singleton ends the
        # list of real duplicates.
        if count <= 1:
            break
        out.append({"count": count, "assistant_preview": text_key[:160]})
    return out
|
|
|
|
def main():
    """Build the train/validation text files and a JSON report.

    Steps, in order:
      1. Seed ``random`` with RANDOM_SEED and make sure "data" exists.
      2. Extract pairs from every file in RAW_SOURCES and append the
         synthetic rows.
      3. De-duplicate, cap repeated replies/prompts, then cap per source.
      4. Split into train and validation rows and write both files.
      5. Write up to 200 validation prompts (reply left open) for evaluation.
      6. Write the stage counts and source breakdowns to REPORT_OUT and print
         the same report to stdout.
    """
    random.seed(RANDOM_SEED)
    os.makedirs("data", exist_ok=True)

    raw_rows = []
    raw_source_sizes = {}
    for path in RAW_SOURCES:
        src_rows = extract_source_rows(path)
        raw_source_sizes[os.path.basename(path)] = len(src_rows)
        raw_rows.extend(src_rows)

    synthetic_rows = build_synthetic_rows()
    raw_rows.extend(synthetic_rows)

    # Captured before any filtering so the report shows the worst repetition.
    stage0_count = len(raw_rows)
    stage0_top_dups = top_duplicate_assistants(raw_rows)

    deduped_rows = dedupe_pairs(raw_rows)
    stage1_count = len(deduped_rows)

    diversity_rows = cap_repeated_text(
        deduped_rows,
        max_assistant_repeat=MAX_REPEAT_PER_ASSISTANT,
        max_user_repeat=MAX_REPEAT_PER_USER,
    )
    stage2_count = len(diversity_rows)

    balanced_rows = cap_by_source(diversity_rows, SOURCE_CAPS)
    stage3_count = len(balanced_rows)

    train_rows, val_rows = split_train_val(balanced_rows, VAL_RATIO)

    write_rows(TRAIN_OUT, train_rows)
    write_rows(VAL_OUT, val_rows)

    # Prompts end at "Assistant:" so a model can be asked to complete them.
    eval_prompts = [f"User: {u}\nAssistant:" for u, _, _ in val_rows[:200]]
    with open(EVAL_PROMPTS_OUT, "w", encoding="utf-8") as f:
        f.write("\n\n".join(eval_prompts))

    report = {
        "raw_source_sizes": raw_source_sizes,
        "synthetic_rows_added": len(synthetic_rows),
        "rows_before_dedupe": stage0_count,
        "rows_after_dedupe": stage1_count,
        "rows_after_diversity_caps": stage2_count,
        "rows_after_source_caps": stage3_count,
        "train_rows_written": len(train_rows),
        "val_rows_written": len(val_rows),
        "train_source_counts": source_counts(train_rows),
        "val_source_counts": source_counts(val_rows),
        "top_duplicate_assistants_before_caps": stage0_top_dups,
        "output_files": {
            "train": TRAIN_OUT,
            "val": VAL_OUT,
            "eval_prompts": EVAL_PROMPTS_OUT,
        },
    }
    with open(REPORT_OUT, "w", encoding="utf-8") as f:
        json.dump(report, f, indent=2)

    print(json.dumps(report, indent=2))
|
|
|
|
# Run the pipeline only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()
|
|