Mindigenous committed on
Commit
5ae3e12
·
1 Parent(s): 7a24ed3

Sync latest workspace state: data/scripts updates and archive cleanup

Browse files
Files changed (49) hide show
  1. AMD Cloud Logs.txt +8 -1
  2. apps_converter.py +127 -0
  3. backup_step8500.tar.gz +0 -3
  4. backup_step8750.tar.gz +0 -3
  5. backup_step9000.tar.gz +0 -3
  6. backup_step9250.tar.gz +0 -3
  7. backup_step9500.tar.gz +0 -3
  8. backup_step9750.tar.gz +0 -3
  9. backups/backup_step4250.tar.gz +0 -3
  10. backups/backup_step4500.tar.gz +0 -3
  11. backups/backup_step4750.tar.gz +0 -3
  12. backups/backup_step5000.tar.gz +0 -3
  13. backups/backup_step5250.tar.gz +0 -3
  14. backups/backup_step5500.tar.gz +0 -3
  15. backups/backup_step5750.tar.gz +0 -3
  16. backups/backup_step6000.tar.gz +0 -3
  17. backups/backup_step6250.tar.gz +0 -3
  18. backups/backup_step6500.tar.gz +0 -3
  19. backups/backup_step6750.tar.gz +0 -3
  20. backups/backup_step7000.tar.gz +0 -3
  21. backups/backup_step7250.tar.gz +0 -3
  22. backups/backup_step7500.tar.gz +0 -3
  23. backups/backup_step7750.tar.gz +0 -3
  24. checkpoints/component5_420m/latest.pt +0 -3
  25. checkpoints/component5_420m/step_3000.pt +0 -3
  26. checkpoints/component5_420m/step_3200.pt +0 -3
  27. codeforces_ingest.py +288 -0
  28. backup_step1000.tar.gz → data/final/_rebalance_tmp/instruction.jsonl +2 -2
  29. backup_step2000.tar.gz → data/final/_rebalance_tmp/problem.jsonl +2 -2
  30. backup_step8000.tar.gz → data/final/_rebalance_tmp/rebalance_seen.sqlite +2 -2
  31. backup_step3000.tar.gz → data/final/_rebalance_tmp/structured.jsonl +2 -2
  32. backup_step8250.tar.gz → data/final/dedupe_hashes.sqlite +2 -2
  33. data/final/train.jsonl +3 -0
  34. data/raw/custom_finetune_pairs.jsonl +0 -3
  35. data_fetch.py +1025 -170
  36. dataset_cleaner.py +342 -0
  37. dataset_formatter.py +102 -0
  38. final_model/config.json +29 -0
  39. final_model/configuration_mindi.py +38 -0
  40. final_model/generation_config.json +10 -0
  41. backup_step4000.tar.gz → final_model/model.safetensors +2 -2
  42. final_model/modeling_mindi.py +219 -0
  43. final_model/tokenization_mindi.py +33 -0
  44. final_model/tokenizer.json +799 -0
  45. final_model/tokenizer_config.json +191 -0
  46. logs/data_fetch.log +2 -2
  47. merge.py +29 -0
  48. requirements.txt +1 -0
  49. test.py +35 -0
AMD Cloud Logs.txt CHANGED
@@ -408,4 +408,11 @@ trainable params: 7,630,848 || all params: 431,565,696 || trainable%: 1.7682
408
  {'loss': 6.2728, 'grad_norm': 4.490257740020752, 'learning_rate': 7.265401482791907e-06, 'epoch': 1.85}
409
  {'loss': 6.4827, 'grad_norm': 4.102600574493408, 'learning_rate': 7.250086658147697e-06, 'epoch': 1.85}
410
  {'loss': 6.2786, 'grad_norm': 4.251227378845215, 'learning_rate': 7.23474531713807e-06, 'epoch': 1.86}
411
- 37%|█████████████████████████████████████████████████████████████████▉ | 7028/18870 [44:42<2:03:45, 1.59it/s]
 
 
 
 
 
 
 
 
408
  {'loss': 6.2728, 'grad_norm': 4.490257740020752, 'learning_rate': 7.265401482791907e-06, 'epoch': 1.85}
409
  {'loss': 6.4827, 'grad_norm': 4.102600574493408, 'learning_rate': 7.250086658147697e-06, 'epoch': 1.85}
410
  {'loss': 6.2786, 'grad_norm': 4.251227378845215, 'learning_rate': 7.23474531713807e-06, 'epoch': 1.86}
411
+ 37%|█████████████████████████████████████████████████████████████████▉ | 7028/18870 [44:42<2:03:45, 1.59it/s]
412
+
413
+
414
+
415
+ apt update && apt install -y git-lfs
416
+ git clone https://huggingface.co/Mindigenous/mindi-backup
417
+ cd mindi-backup
418
+ tar -xzvf backup_step12000.tar.gz
apps_converter.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import re
3
+ from pathlib import Path
4
+ from typing import Iterable, List, Tuple
5
+
6
+ from tqdm import tqdm
7
+
8
+
9
+ PROJECT_ROOT = Path(__file__).resolve().parent
10
+ INPUT_FILES = [
11
+ PROJECT_ROOT / "apps" / "train.jsonl",
12
+ PROJECT_ROOT / "apps" / "test.jsonl",
13
+ ]
14
+ OUTPUT_FILE = PROJECT_ROOT / "data" / "raw" / "apps.jsonl"
15
+
16
+ MAX_SOLUTIONS_PER_PROBLEM = 2
17
+ MIN_RESPONSE_CHARS = 20
18
+ MAX_RESPONSE_TOKENS = 3000
19
+ CODE_HINT_RE = re.compile(
20
+ r"(\bdef\s+\w+\s*\(|\bclass\s+\w+|\bfor\s+\w+\s+in\b|\bwhile\b|[{;}]|\breturn\b|\bimport\b)",
21
+ re.IGNORECASE,
22
+ )
23
+
24
+
25
+ def _normalize_text(value: str) -> str:
26
+ return value.strip()
27
+
28
+
29
+ def _parse_solutions(raw_solutions) -> List[str]:
30
+ if raw_solutions is None:
31
+ return []
32
+ if isinstance(raw_solutions, list):
33
+ return [str(x) for x in raw_solutions if x is not None]
34
+ if isinstance(raw_solutions, str):
35
+ raw_solutions = raw_solutions.strip()
36
+ if not raw_solutions:
37
+ return []
38
+ try:
39
+ parsed = json.loads(raw_solutions)
40
+ if isinstance(parsed, list):
41
+ return [str(x) for x in parsed if x is not None]
42
+ if isinstance(parsed, str):
43
+ return [parsed]
44
+ return []
45
+ except json.JSONDecodeError:
46
+ return [raw_solutions]
47
+ return []
48
+
49
+
50
def _is_code_like(text: str) -> bool:
    """Return True when *text* contains at least one code-ish token.

    Delegates to the module-level CODE_HINT_RE heuristic (def/class/loops/
    braces/return/import).
    """
    return CODE_HINT_RE.search(text) is not None
52
+
53
+
54
+ def _iter_jsonl(path: Path) -> Iterable[dict]:
55
+ with path.open("r", encoding="utf-8", errors="ignore") as f:
56
+ for line in f:
57
+ line = line.strip()
58
+ if not line:
59
+ continue
60
+ try:
61
+ obj = json.loads(line)
62
+ except json.JSONDecodeError:
63
+ continue
64
+ if isinstance(obj, dict):
65
+ yield obj
66
+
67
+
68
def convert_apps_dataset(input_files: List[Path], output_file: Path) -> Tuple[int, int, int]:
    """Convert APPS-style JSONL files into instruction/response training rows.

    For each input problem, up to MAX_SOLUTIONS_PER_PROBLEM solutions that
    pass the length and code-likeness filters are written to *output_file*
    as ``{"instruction", "response"}`` JSON lines.

    Returns:
        (total_input_samples, valid_output_samples, skipped_samples), where
        total and skipped count input problems while valid counts output
        rows — so the three numbers are not expected to sum.
    """
    output_file.parent.mkdir(parents=True, exist_ok=True)

    total_input_samples = 0
    valid_output_samples = 0
    skipped_samples = 0

    with output_file.open("w", encoding="utf-8") as out_f:
        for input_path in input_files:
            # Missing inputs (e.g. only train or only test present) are tolerated.
            if not input_path.exists():
                continue

            for item in tqdm(_iter_jsonl(input_path), desc=f"apps:{input_path.name}", unit="rows"):
                total_input_samples += 1

                question = _normalize_text(str(item.get("question", "")))
                if not question:
                    skipped_samples += 1
                    continue

                all_solutions = _parse_solutions(item.get("solutions"))
                if not all_solutions:
                    skipped_samples += 1
                    continue

                usable = 0
                for raw_solution in all_solutions:
                    solution = _normalize_text(raw_solution)
                    if not solution:
                        continue
                    if len(solution) < MIN_RESPONSE_CHARS:
                        continue
                    # Whitespace-split word count as a cheap proxy for model tokens.
                    if len(solution.split()) > MAX_RESPONSE_TOKENS:
                        continue
                    if not _is_code_like(solution):
                        continue

                    row = {
                        "instruction": f"Solve the following problem:\n{question}",
                        "response": solution,
                    }
                    out_f.write(json.dumps(row, ensure_ascii=False) + "\n")
                    valid_output_samples += 1
                    usable += 1
                    # Cap accepted solutions per problem to keep the mix balanced.
                    if usable >= MAX_SOLUTIONS_PER_PROBLEM:
                        break

                # A problem whose every solution failed the filters counts as skipped.
                if usable == 0:
                    skipped_samples += 1

    return total_input_samples, valid_output_samples, skipped_samples
119
+
120
+
121
if __name__ == "__main__":
    # Run the conversion and report a short summary on stdout.
    stats = convert_apps_dataset(INPUT_FILES, OUTPUT_FILE)
    total_input, valid_output, skipped = stats
    summary_lines = [
        f"Output: {OUTPUT_FILE}",
        f"Total input samples: {total_input}",
        f"Valid output samples: {valid_output}",
        f"Skipped samples: {skipped}",
        "APPS dataset ready for training pipeline",
    ]
    for line in summary_lines:
        print(line)
backup_step8500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d35f24763676911a2f605ff63e56a62f521bde805757d51b2e356a004d479e2e
3
- size 84695943
 
 
 
 
backup_step8750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:007068138f8a165ff5a3fea9ed096a94bdf620d0007b013d8834d69bfc650628
3
- size 84696682
 
 
 
 
backup_step9000.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a69b305a69b77ea66f9feeaaaa3bbd7c4a08f7111bbd6cdd3b90e2e59a5b2e7b
3
- size 84704097
 
 
 
 
backup_step9250.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f8724eceedfd4f8c4f87a14f1fa8c2019bcbfe9af6165e57aac020bb04c65fd5
3
- size 84699876
 
 
 
 
backup_step9500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:a728fcf9931e37ae37a3db4044170a254473aa08f9a10e958ce88987f2575d8c
3
- size 84705286
 
 
 
 
backup_step9750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:fe8bbef08bb3ee21de186753bce613d4b050b4011d85378737d464e190db65a7
3
- size 84703357
 
 
 
 
backups/backup_step4250.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f95bd63ebd8df351b262311c6400b60263f5e975f3022810c432db7207c3d92c
3
- size 84560529
 
 
 
 
backups/backup_step4500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:b2494fb51792518a027ae7fa403139cb96ada8218c3a1dc85b131850cbf98ed0
3
- size 84567347
 
 
 
 
backups/backup_step4750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:1971eeabb6531dbbe399c5173e830262e6c0c8708e020ba52f5090e1370a01e8
3
- size 84587608
 
 
 
 
backups/backup_step5000.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2100ab540c449907be5a9403489969e63c6e25faf7d6d62d81de95977091634a
3
- size 84605420
 
 
 
 
backups/backup_step5250.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e556e3898a018791ec7afeda40574ea0b192d463c9a306e4d2d30384234f1d5d
3
- size 84614578
 
 
 
 
backups/backup_step5500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:07e6f0821e8a1ef8568d7733beefc43aa01a88c49f60344c0d0d508ea60c8776
3
- size 84617900
 
 
 
 
backups/backup_step5750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6fd42608ddcc28d31c9144fb60c949dcc16cfc1fd90b99bd4a9dc2d059354318
3
- size 84628951
 
 
 
 
backups/backup_step6000.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:8643c889c56e07ff862cf9300c33b14fd9eb0d8d130dd3c837ce99bef0c5accb
3
- size 84638746
 
 
 
 
backups/backup_step6250.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:6745b7c92df7b4a915e3a62064c74a537e85f4cac701c98eb798b9e4039db4e8
3
- size 84646702
 
 
 
 
backups/backup_step6500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:329dbcfd21ab74c713da3b8a6e4dc35bde1d863200b99cdcd183a6c499a18d3c
3
- size 84646328
 
 
 
 
backups/backup_step6750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d3e9ac1b4dd644738bdcc09bdd559bafe9e8fb2910dac242a78516b38919af34
3
- size 84660003
 
 
 
 
backups/backup_step7000.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:ad80a05d74d8ca176a7620613a10eca11332d176d5cd8be89b284a521b5409f9
3
- size 84664111
 
 
 
 
backups/backup_step7250.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:282b3f77f6a892bd8813fd280f0bfd5ebb80545aca2f5614b0f3aa85358fb46a
3
- size 84667605
 
 
 
 
backups/backup_step7500.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:3e523cd3c7479b88b40f84b7f46c75b749f712d61e1223d068f25ea330c428ed
3
- size 84668063
 
 
 
 
backups/backup_step7750.tar.gz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:775b3f4094fbf31da00a81b1f76f84efed709a17c5da27c5b2d9c8b8cb11389b
3
- size 84688475
 
 
 
 
checkpoints/component5_420m/latest.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:32d26a7dd9e6e294c6657f6fb3a4d947cf52eb8e1c0b11032722fa50d15c4a21
3
- size 5087449970
 
 
 
 
checkpoints/component5_420m/step_3000.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e11bded40789574ef316636c02c2fd1e8cd54c13441d8cd6a28980f2209ffaa9
3
- size 5087455158
 
 
 
 
checkpoints/component5_420m/step_3200.pt DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:71d2ea9401f3b08b2528dbb8f993949794d0adb57642d0f4752d74da0e445238
3
- size 5087455158
 
 
 
 
codeforces_ingest.py ADDED
@@ -0,0 +1,288 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import csv
3
+ import html
4
+ import json
5
+ import logging
6
+ import re
7
+ import sqlite3
8
+ from pathlib import Path
9
+ from typing import Dict, Iterable, Iterator, List, Tuple
10
+
11
+ from tqdm import tqdm
12
+
13
+
14
+ HTML_TAG_RE = re.compile(r"<[^>]+>")
15
+ WS_RE = re.compile(r"\s+")
16
+ CODE_HINT_RE = re.compile(
17
+ r"(\bdef\b|\bclass\b|#include|public\s+class|function\s+\w+|\breturn\b|\bimport\b)",
18
+ re.IGNORECASE,
19
+ )
20
+
21
+
22
def setup_logger(log_path: Path) -> logging.Logger:
    """Return the shared "codeforces_ingest" logger at INFO level.

    On first use a file handler (at *log_path*, whose parent directory is
    created) and a console handler are attached, both with a timestamped
    format.  NOTE(review): repeat calls reuse the existing handlers, so a
    different *log_path* is silently ignored after the first call.
    """
    log_path.parent.mkdir(parents=True, exist_ok=True)
    logger = logging.getLogger("codeforces_ingest")
    logger.setLevel(logging.INFO)
    if not logger.handlers:
        fmt = logging.Formatter("%(asctime)s | %(levelname)s | %(message)s")
        for handler in (
            logging.FileHandler(log_path, encoding="utf-8"),
            logging.StreamHandler(),
        ):
            handler.setFormatter(fmt)
            logger.addHandler(handler)
    return logger
36
+
37
+
38
def clean_text(text: str) -> str:
    """Strip HTML markup and normalise whitespace in *text*.

    Entities are unescaped, tags replaced by spaces, and line endings
    unified before every whitespace run — including newlines — is
    collapsed to a single space.  NOTE(review): this flattens multi-line
    content (e.g. solution code) onto one line; confirm downstream
    consumers expect that.
    """
    if not text:
        return ""
    unescaped = html.unescape(str(text))
    without_tags = HTML_TAG_RE.sub(" ", unescaped)
    unified = without_tags.replace("\r\n", "\n").replace("\r", "\n")
    per_line = "\n".join(segment.strip() for segment in unified.split("\n"))
    return WS_RE.sub(" ", per_line).strip()
47
+
48
+
49
+ def _safe_get(record: Dict[str, object], keys: Iterable[str]) -> str:
50
+ for key in keys:
51
+ val = record.get(key)
52
+ if val:
53
+ return str(val)
54
+ return ""
55
+
56
+
57
def _extract_pair(record: Dict[str, object]) -> Tuple[str, str]:
    """Pull a (problem, solution) text pair out of a heterogeneous record.

    Raw dumps name these fields inconsistently, so a list of known aliases
    is probed in priority order for each side; both values are run through
    clean_text before being returned.
    """
    problem_aliases = (
        "problem_statement",
        "statement",
        "problem",
        "question",
        "content",
        "description",
        "prompt",
        "instruction",
    )
    solution_aliases = (
        "solution",
        "solution_code",
        "answer",
        "code",
        "response",
        "python",
        "cpp",
        "java",
        "javascript",
    )
    problem = _safe_get(record, problem_aliases)
    solution = _safe_get(record, solution_aliases)
    return clean_text(problem), clean_text(solution)
86
+
87
+
88
+ def _iter_jsonl(path: Path) -> Iterator[Dict[str, object]]:
89
+ with path.open("r", encoding="utf-8", errors="ignore") as f:
90
+ for line in f:
91
+ line = line.strip()
92
+ if not line:
93
+ continue
94
+ try:
95
+ obj = json.loads(line)
96
+ if isinstance(obj, dict):
97
+ yield obj
98
+ except Exception:
99
+ continue
100
+
101
+
102
def _iter_json_stream(path: Path, logger: logging.Logger) -> Iterator[Dict[str, object]]:
    """Stream dict records out of a (possibly huge) .json file via ijson.

    If ijson is not importable the file is skipped entirely with a warning
    rather than loaded eagerly with json.load, so memory stays bounded.
    An iterator is returned in every case (possibly empty).
    """
    # Streaming JSON parser for array/object JSON files if ijson is available.
    try:
        import ijson  # type: ignore
    except Exception:
        logger.warning("Skipping JSON file without ijson installed (to preserve streaming): %s", path)
        return iter(())

    def gen():
        # First attempt: treat the file as a top-level JSON array of objects.
        with path.open("rb") as f:
            try:
                for obj in ijson.items(f, "item"):
                    if isinstance(obj, dict):
                        yield obj
            except Exception:
                # Some files may be top-level dicts with nested lists.
                f.seek(0)
                try:
                    root = next(ijson.items(f, ""))
                    if isinstance(root, dict):
                        # Yield dicts found in any list-valued top-level field.
                        for v in root.values():
                            if isinstance(v, list):
                                for obj in v:
                                    if isinstance(obj, dict):
                                        yield obj
                except Exception:
                    # Give up on unparseable files; yield nothing further.
                    return

    return gen()
131
+
132
+
133
+ def _iter_csv_like(path: Path, delimiter: str) -> Iterator[Dict[str, object]]:
134
+ with path.open("r", encoding="utf-8", errors="ignore", newline="") as f:
135
+ reader = csv.DictReader(f, delimiter=delimiter)
136
+ for row in reader:
137
+ if isinstance(row, dict):
138
+ yield row
139
+
140
+
141
+ def _iter_txt_records(path: Path) -> Iterator[Dict[str, object]]:
142
+ # Heuristic fallback for text dumps:
143
+ # split on obvious separators and map to pseudo records.
144
+ with path.open("r", encoding="utf-8", errors="ignore") as f:
145
+ blob = f.read()
146
+ chunks = re.split(r"\n\s*[-=]{3,}\s*\n|\n\s*Problem\s+\d+\s*\n", blob, flags=re.IGNORECASE)
147
+ for chunk in chunks:
148
+ chunk = chunk.strip()
149
+ if len(chunk) < 120:
150
+ continue
151
+ parts = re.split(r"\n\s*(Solution|Answer)\s*:\s*\n", chunk, flags=re.IGNORECASE)
152
+ if len(parts) < 3:
153
+ continue
154
+ yield {"problem_statement": parts[0], "solution": parts[2]}
155
+
156
+
157
+ def _iter_candidate_files(input_dir: Path) -> Iterator[Path]:
158
+ patterns = [
159
+ "**/*.jsonl",
160
+ "**/*.json",
161
+ "**/*.csv",
162
+ "**/*.tsv",
163
+ "**/*.txt",
164
+ ]
165
+ seen = set()
166
+ for pat in patterns:
167
+ for path in input_dir.glob(pat):
168
+ if ".git" in path.parts:
169
+ continue
170
+ lower = str(path).lower()
171
+ if "codeforces" not in lower:
172
+ continue
173
+ if path.name.lower() == "codeforces.jsonl":
174
+ continue
175
+ if path.is_file() and path not in seen:
176
+ seen.add(path)
177
+ yield path
178
+
179
+
180
def ingest_codeforces(input_dir: Path, output_file: Path, logger: logging.Logger) -> Dict[str, int]:
    """Ingest raw Codeforces dumps under *input_dir* into instruction/response JSONL.

    Each candidate file (see _iter_candidate_files) is parsed according to
    its extension, records are filtered by length and code-likeness, exact
    duplicates are dropped via a per-run sqlite dedupe table, and the
    surviving pairs are written to *output_file* one JSON object per line.

    Returns:
        Stats dict with keys total_input, valid_output, skipped, filtered,
        deduped, bad.
    """
    import hashlib  # hoisted out of the per-row helper; one import instead of one per record

    output_file.parent.mkdir(parents=True, exist_ok=True)
    extracted = 0   # records read from raw files
    filtered = 0    # rejected by length / non-code heuristics
    bad = 0         # records that raised during processing
    usable = 0      # rows written to the output file
    deduped = 0     # exact duplicate instruction/response pairs

    files = list(_iter_candidate_files(input_dir))
    if not files:
        logger.warning("No Codeforces raw files found under %s", input_dir.resolve())
        output_file.write_text("", encoding="utf-8")
        # BUGFIX: this branch previously returned {"extracted", "filtered",
        # "bad", "usable"}, while the normal path and the __main__ summary
        # use "total_input"/"valid_output"/"skipped" — an empty input dir
        # crashed the caller with KeyError("total_input"). Return the same
        # schema as the normal path.
        return {
            "total_input": 0,
            "valid_output": 0,
            "skipped": 0,
            "filtered": 0,
            "deduped": 0,
            "bad": 0,
        }

    # Fresh per-run dedupe DB: remove any stale database and its WAL/SHM sidecars.
    dedupe_db = output_file.parent / "codeforces_ingest_dedupe.sqlite"
    if dedupe_db.exists():
        dedupe_db.unlink()
    for suffix in ("-wal", "-shm"):
        sidecar = dedupe_db.with_name(dedupe_db.name + suffix)
        if sidecar.exists():
            sidecar.unlink()

    conn = sqlite3.connect(str(dedupe_db))
    try:
        conn.execute("PRAGMA journal_mode=WAL;")
        conn.execute("CREATE TABLE IF NOT EXISTS seen_hashes (h TEXT PRIMARY KEY)")

        def is_dup(instruction: str, response: str) -> bool:
            # Insert-or-fail on the primary key doubles as the membership test.
            h = hashlib.sha256(f"{instruction}||{response}".encode("utf-8")).hexdigest()
            try:
                conn.execute("INSERT INTO seen_hashes(h) VALUES (?)", (h,))
                return False
            except sqlite3.IntegrityError:
                return True

        with output_file.open("w", encoding="utf-8") as out_f:
            for file_path in tqdm(files, desc="codeforces_files", unit="file"):
                # Choose a record iterator by file extension.
                suffix = file_path.suffix.lower()
                if suffix == ".jsonl":
                    rec_iter = _iter_jsonl(file_path)
                elif suffix == ".json":
                    rec_iter = _iter_json_stream(file_path, logger)
                elif suffix == ".csv":
                    rec_iter = _iter_csv_like(file_path, ",")
                elif suffix == ".tsv":
                    rec_iter = _iter_csv_like(file_path, "\t")
                else:
                    rec_iter = _iter_txt_records(file_path)

                for rec in tqdm(rec_iter, desc=f"ingest:{file_path.name}", unit="rows", leave=False):
                    try:
                        extracted += 1
                        problem, solution = _extract_pair(rec)
                        if len(problem) <= 50 or len(solution) <= 20:
                            filtered += 1
                            continue
                        # Keep response as solution code; reject obvious non-code text.
                        if not CODE_HINT_RE.search(solution):
                            filtered += 1
                            continue
                        instruction = f"Solve the following problem:\n{problem}"
                        if is_dup(instruction, solution):
                            deduped += 1
                            continue
                        row = {"instruction": instruction, "response": solution}
                        out_f.write(json.dumps(row, ensure_ascii=False) + "\n")
                        usable += 1
                    except Exception:
                        bad += 1
                        continue
            conn.commit()
    finally:
        # BUGFIX: close the connection even when ingestion raises, so the
        # dedupe DB file is not left open/locked.
        conn.close()

    skipped = filtered + bad + deduped
    logger.info("Codeforces ingest total_input=%d", extracted)
    logger.info("Codeforces ingest valid_output=%d", usable)
    logger.info("Codeforces ingest skipped=%d", skipped)
    logger.info("Codeforces ingest filtered=%d", filtered)
    logger.info("Codeforces ingest deduped=%d", deduped)
    logger.info("Codeforces ingest bad=%d", bad)
    return {
        "total_input": extracted,
        "valid_output": usable,
        "skipped": skipped,
        "filtered": filtered,
        "deduped": deduped,
        "bad": bad,
    }
269
+
270
+
271
+ def _build_parser() -> argparse.ArgumentParser:
272
+ parser = argparse.ArgumentParser(
273
+ description="Ingest Codeforces problem-solution data into JSONL for MINDI pipeline."
274
+ )
275
+ parser.add_argument("--input-dir", type=Path, default=Path("./data/raw"))
276
+ parser.add_argument("--output", type=Path, default=Path("./data/raw/codeforces.jsonl"))
277
+ parser.add_argument("--log-file", type=Path, default=Path("./logs/codeforces_ingest.log"))
278
+ return parser
279
+
280
+
281
if __name__ == "__main__":
    # CLI entry point: parse arguments, wire up file+console logging,
    # run the ingest, then print a short run summary to stdout.
    args = _build_parser().parse_args()
    log = setup_logger(args.log_file)
    stats = ingest_codeforces(args.input_dir, args.output, log)
    print(f"Output: {args.output.resolve()}")
    print(f"Total input: {stats['total_input']}")
    print(f"Valid output: {stats['valid_output']}")
    print(f"Skipped: {stats['skipped']}")
backup_step1000.tar.gz → data/final/_rebalance_tmp/instruction.jsonl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ebe005c43dd59c9c49ad153d41af1bdaaad47c2a21ae231a4c5e90c8005560af
3
- size 337623475
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:77fd726ec9b3b9135edc4b22c251c760ea444060507315d09e0156a9ad08cff2
3
+ size 359523113
backup_step2000.tar.gz → data/final/_rebalance_tmp/problem.jsonl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:861329fb551b4c6406e92e06cfa1faae592f0fe0d0ce713189a57c62b33b0969
3
- size 337571785
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:dea7bb001629f0e7b72363ea4ffc8da89b21cab0322c80faea3bd4352d2d28cd
3
+ size 392283637
backup_step8000.tar.gz → data/final/_rebalance_tmp/rebalance_seen.sqlite RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:13c3e5c401a567493b92bf02f6d4040f5b6f578c4c413b33362a0009d7405237
3
- size 84689731
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:27065d4515ff3d381d784058a61a997517d89621147f71316b4517997a7567a0
3
+ size 85028864
backup_step3000.tar.gz → data/final/_rebalance_tmp/structured.jsonl RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:238c2859ebf4efc0195456a898d2fb8bce0397e39fdf59e9f940963232d628a8
3
- size 337762553
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83898474a15e20eba8f8fd4b4163c381d69b87374c4d82b3e289898da0b9f2fc
3
+ size 283810066
backup_step8250.tar.gz → data/final/dedupe_hashes.sqlite RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5dff8ef0900eeed1141b8aac59e1c45697ff3c804e4e2792568f4fdf5754e021
3
- size 84688227
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:89622c37e7ff270b6745693d3dbd63a25a7937bf7e5261a01d4676568207d7ea
3
+ size 84996096
data/final/train.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0f78cdcc0171535ad8ac533beccaec3aab78ef94870055bdf7dd5c798d629aef
3
+ size 1867627125
data/raw/custom_finetune_pairs.jsonl DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:7ab1ceab4d5a85de0c15a54f6420c483e78de5db4b5654dc5d34aa1d02893921
3
- size 451
 
 
 
 
data_fetch.py CHANGED
@@ -1,222 +1,1077 @@
1
  import argparse
2
- import hashlib
 
 
 
 
3
  from pathlib import Path
4
- from typing import Dict, List, Optional
5
 
6
- from datasets import Dataset, DatasetDict, load_dataset, load_from_disk
 
7
 
8
- from config import DATA_CONFIG, PATHS
9
- from utils import ensure_dirs, setup_logger, write_jsonl
 
10
 
11
 
12
- def _normalize_text(text: Optional[str]) -> str:
13
- if not text:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
14
  return ""
15
- return " ".join(str(text).strip().split())
 
 
 
 
16
 
17
 
18
- def _quality_ok(sample: Dict[str, str]) -> bool:
19
- instruction = _normalize_text(sample.get("instruction"))
20
- output = _normalize_text(sample.get("output"))
 
 
 
 
 
 
 
 
 
 
 
 
 
21
 
22
- if not instruction or not output:
23
- return False
24
- if len(output) < DATA_CONFIG.min_output_chars:
25
- return False
26
 
27
- lowered = output.lower()
28
- bad_tokens = ("todo", "fixme", "coming soon", "not implemented")
29
- if any(tok in lowered for tok in bad_tokens):
30
- return False
31
- if output.strip() in {"pass", "...", "return ..."}:
32
- return False
33
- return True
 
 
 
 
 
 
 
 
 
 
 
 
34
 
35
 
36
- def _to_record(instruction: str, input_text: str, output_text: str) -> Dict[str, str]:
37
- return {
38
- "instruction": instruction.strip(),
39
- "input": input_text.strip(),
40
- "output": output_text.strip(),
41
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
 
43
 
44
- def _save_dataset_for_offline(ds_obj, save_path: Path) -> None:
45
- if save_path.exists():
46
  return
47
- save_path.parent.mkdir(parents=True, exist_ok=True)
48
- ds_obj.save_to_disk(str(save_path))
49
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
50
 
51
- def _load_or_download(dataset_name: str, cache_path: Path, **kwargs):
52
- if cache_path.exists():
53
- return load_from_disk(str(cache_path))
54
 
55
- dataset_obj = load_dataset(dataset_name, **kwargs)
56
- _save_dataset_for_offline(dataset_obj, cache_path)
57
- return dataset_obj
 
 
 
 
 
 
 
 
58
 
59
 
60
- def _load_or_download_codesearchnet(cache_path: Path, subset: str = "python"):
61
- if cache_path.exists():
62
- return load_from_disk(str(cache_path))
63
-
64
- ds = load_dataset("code_search_net", subset)
65
- _save_dataset_for_offline(ds, cache_path)
66
- return ds
67
-
68
-
69
- def _extract_humaneval(ds_obj, max_samples: int) -> List[Dict[str, str]]:
70
- rows: List[Dict[str, str]] = []
71
- split = ds_obj["test"] if isinstance(ds_obj, DatasetDict) else ds_obj
72
-
73
- for item in split:
74
- prompt = item.get("prompt", "")
75
- solution = item.get("canonical_solution", "")
76
- if "def " not in prompt:
77
- continue
78
- rows.append(
79
- _to_record(
80
- instruction="Complete the Python function so it satisfies the specification.",
81
- input_text=prompt,
82
- output_text=solution,
83
- )
84
  )
85
- if len(rows) >= max_samples:
86
- break
87
- return rows
 
 
 
 
88
 
89
 
90
- def _extract_mbpp(ds_obj, max_samples: int) -> List[Dict[str, str]]:
91
- rows: List[Dict[str, str]] = []
92
- splits = []
93
- if isinstance(ds_obj, DatasetDict):
94
- splits = [ds_obj[k] for k in ds_obj.keys()]
95
- else:
96
- splits = [ds_obj]
97
-
98
- for split in splits:
99
- for item in split:
100
- task = item.get("text", "")
101
- code = item.get("code", "")
102
- tests = item.get("test_list", [])
103
- if not task or not code:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
  continue
105
- test_blob = "\n".join(tests) if isinstance(tests, list) else str(tests)
106
- input_text = f"Task:\n{task}\n\nTests:\n{test_blob}".strip()
107
- rows.append(
108
- _to_record(
109
- instruction="Write Python code that solves the problem and passes the tests.",
110
- input_text=input_text,
111
- output_text=code,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  )
114
- if len(rows) >= max_samples:
115
- return rows
116
- return rows
117
-
118
-
119
- def _extract_codesearchnet(ds_obj, max_samples: int) -> List[Dict[str, str]]:
120
- rows: List[Dict[str, str]] = []
121
- splits = []
122
- if isinstance(ds_obj, DatasetDict):
123
- for split_name in ("train", "validation"):
124
- if split_name in ds_obj:
125
- splits.append(ds_obj[split_name])
126
- else:
127
- splits = [ds_obj]
128
 
129
- for split in splits:
130
- for item in split:
131
- language = str(item.get("language", "")).lower()
132
- if language and language != "python":
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
133
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
134
 
135
- docstring = item.get("docstring", "") or item.get("func_documentation_string", "")
136
- code = item.get("whole_func_string", "") or item.get("code", "")
137
- if not docstring or not code:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  continue
139
- if "def " not in code and "class " not in code:
 
140
  continue
 
 
 
 
 
 
 
 
 
 
141
 
142
- rows.append(
143
- _to_record(
144
- instruction="Write Python code that matches the following docstring.",
145
- input_text=docstring,
146
- output_text=code,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
147
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  )
149
- if len(rows) >= max_samples:
150
- return rows
151
- return rows
152
-
153
-
154
- def _dedupe_and_filter(rows: List[Dict[str, str]], max_total: int) -> List[Dict[str, str]]:
155
- seen = set()
156
- clean_rows: List[Dict[str, str]] = []
157
- for row in rows:
158
- if not _quality_ok(row):
159
- continue
160
- digest = hashlib.sha256(
161
- f"{row['instruction']}||{row['input']}||{row['output']}".encode("utf-8")
162
- ).hexdigest()
163
- if digest in seen:
164
- continue
165
- seen.add(digest)
166
- clean_rows.append(row)
167
- if len(clean_rows) >= max_total:
168
- break
169
- return clean_rows
170
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
171
 
172
def fetch_and_prepare_dataset(offline_only: bool = False) -> Path:
    """Download (or reload from cache) the source datasets and write train.jsonl.

    With ``offline_only=True``, only previously saved local caches are used;
    missing caches raise ``FileNotFoundError`` instead of hitting the network.
    Returns the path of the written training JSONL file.
    """
    ensure_dirs([PATHS.data_dir, PATHS.dataset_cache_dir, PATHS.raw_dataset_dir, PATHS.logs_dir])
    logger = setup_logger("data_fetch", PATHS.logs_dir / "data_fetch.log")

    logger.info("Loading datasets (offline_only=%s).", offline_only)

    # Per-dataset on-disk cache locations under the raw dataset directory.
    humaneval_cache = PATHS.raw_dataset_dir / "openai_humaneval"
    mbpp_cache = PATHS.raw_dataset_dir / "mbpp"
    csn_cache = PATHS.raw_dataset_dir / "code_search_net_python"

    if offline_only:
        if not humaneval_cache.exists() or not mbpp_cache.exists() or not csn_cache.exists():
            raise FileNotFoundError(
                "Offline mode requested but one or more cached datasets are missing. "
                "Run without --offline first."
            )
        humaneval_ds = load_from_disk(str(humaneval_cache))
        mbpp_ds = load_from_disk(str(mbpp_cache))
        csn_ds = load_from_disk(str(csn_cache))
    else:
        humaneval_ds = _load_or_download("openai_humaneval", humaneval_cache)
        mbpp_ds = _load_or_download("mbpp", mbpp_cache)
        csn_ds = _load_or_download_codesearchnet(csn_cache, subset="python")

    # Extraction order determines priority when the total cap is reached.
    rows = []
    rows.extend(_extract_humaneval(humaneval_ds, DATA_CONFIG.max_humaneval_samples))
    rows.extend(_extract_mbpp(mbpp_ds, DATA_CONFIG.max_mbpp_samples))
    rows.extend(_extract_codesearchnet(csn_ds, DATA_CONFIG.max_codesearchnet_samples))

    clean_rows = _dedupe_and_filter(rows, DATA_CONFIG.max_total_samples)
    write_jsonl(PATHS.train_jsonl, clean_rows)

    logger.info("Saved %d cleaned training rows to %s", len(clean_rows), PATHS.train_jsonl)
    print(f"Saved dataset: {PATHS.train_jsonl.resolve()}")
    return PATHS.train_jsonl
207
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
- def _build_arg_parser() -> argparse.ArgumentParser:
210
- parser = argparse.ArgumentParser(description="Download and prepare Python fine-tuning data.")
 
 
 
 
 
 
 
 
 
 
 
 
211
  parser.add_argument(
212
- "--offline",
213
- action="store_true",
214
- help="Use only previously saved local dataset cache.",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
215
  )
216
  return parser
217
 
218
 
219
if __name__ == "__main__":
    # Script entry point: parse CLI flags and run the preparation pipeline.
    args = _build_arg_parser().parse_args()
    fetch_and_prepare_dataset(offline_only=args.offline)
 
 
 
 
1
  import argparse
2
+ import json
3
+ import os
4
+ import re
5
+ import sqlite3
6
+ from collections import Counter
7
  from pathlib import Path
8
+ from typing import Dict, Iterable, List, Optional
9
 
10
+ from datasets import load_dataset, load_from_disk
11
+ from tqdm import tqdm
12
 
13
+ from dataset_cleaner import build_balanced_dataset, clean_record
14
+ from dataset_formatter import build_instruction_sample
15
+ from utils import ensure_dirs, setup_logger
16
 
17
 
18
# Workspace layout: per-source raw JSONL dumps, final merged dataset, and logs.
RAW_DIR = Path("./data/raw")
FINAL_DIR = Path("./data/final")
FINAL_TRAIN = FINAL_DIR / "train.jsonl"  # default final training file
LOG_DIR = Path("./logs")
22
+
23
+
24
+ def _safe_get(item: Dict[str, object], keys: Iterable[str]) -> str:
25
+ for key in keys:
26
+ value = item.get(key)
27
+ if value:
28
+ return str(value)
29
+ return ""
30
+
31
+
32
+ def _write_jsonl(path: Path, rows: Iterable[Dict[str, str]]) -> int:
33
+ path.parent.mkdir(parents=True, exist_ok=True)
34
+ count = 0
35
+ with path.open("w", encoding="utf-8") as f:
36
+ for row in rows:
37
+ if not row.get("instruction") or not row.get("response"):
38
+ continue
39
+ f.write(json.dumps(row, ensure_ascii=False) + "\n")
40
+ count += 1
41
+ return count
42
+
43
+
44
+ def _iter_jsonl(path: Path) -> Iterable[Dict[str, object]]:
45
+ with path.open("r", encoding="utf-8") as f:
46
+ for line in f:
47
+ line = line.strip()
48
+ if not line:
49
+ continue
50
+ try:
51
+ yield json.loads(line)
52
+ except json.JSONDecodeError:
53
+ continue
54
+
55
+
56
+ def _source_to_category(source_name: str) -> str:
57
+ s = source_name.lower()
58
+ if any(k in s for k in ("codealpaca", "evol", "ultrachat", "openhermes", "orca")):
59
+ return "instruction"
60
+ if any(
61
+ k in s
62
+ for k in (
63
+ "leetcode",
64
+ "contest",
65
+ "problem",
66
+ "mbpp",
67
+ "humaneval",
68
+ "apps",
69
+ "codeforces",
70
+ "codesearchnet_problem",
71
+ )
72
+ ):
73
+ return "problem"
74
+ return "structured"
75
+
76
+
77
+ def _decode_text(value) -> str:
78
+ if value is None:
79
  return ""
80
+ if isinstance(value, str):
81
+ return value
82
+ if isinstance(value, bytes):
83
+ return value.decode("utf-8", errors="ignore")
84
+ return str(value)
85
 
86
 
87
def _extract_solution_from_code_contests(item: Dict[str, object]) -> str:
    """Pick a single solution string from a deepmind/code_contests record.

    Prefers Python-looking snippets (heuristic: def/import/print tokens),
    then the first listed solution, then flat solution/answer/code fields.
    """
    sols = item.get("solutions")
    if isinstance(sols, dict):
        # Typical schema: {"language": [...], "solution": [bytes...]}
        snippets = sols.get("solution")
        if isinstance(snippets, list):
            for raw in snippets:
                decoded = _decode_text(raw)
                if re.search(r"\bdef\b|\bimport\b|\bprint\(", decoded):
                    return decoded
        if snippets:
            return _decode_text(snippets[0])
    if isinstance(sols, list) and sols:
        return _decode_text(sols[0])
    return _safe_get(item, ["solution", "answer", "code"])
103
 
 
 
 
 
104
 
105
def _extract_many_code_contests_solutions(item: Dict[str, object], max_per_problem: int = 6) -> List[str]:
    """Collect up to *max_per_problem* distinct solutions from a code_contests record.

    Falls back to the single best pick from
    ``_extract_solution_from_code_contests`` when the structured list is empty.
    """
    collected: List[str] = []
    sols = item.get("solutions")
    if isinstance(sols, dict):
        raw_list = sols.get("solution")
        if isinstance(raw_list, list):
            for raw in raw_list:
                text = _decode_text(raw).strip()
                if text and text not in collected:
                    collected.append(text)
                    if len(collected) >= max_per_problem:
                        break
    if not collected:
        fallback = _extract_solution_from_code_contests(item).strip()
        if fallback:
            collected.append(fallback)
    return collected
124
 
125
 
126
def _extract_many_apps_solutions(item: Dict[str, object], max_per_problem: int = 5) -> List[str]:
    """Collect up to *max_per_problem* distinct solution strings from an APPS-style record.

    Handles three field shapes: a list of solutions, a dict whose values may
    contain lists of solutions, or a flat scalar field. Returns early as soon
    as the cap is reached.
    """
    out: List[str] = []
    for key in ("solutions", "solution", "answer", "code"):
        val = item.get(key)
        if isinstance(val, list):
            for x in val:
                t = _decode_text(x).strip()
                if t and t not in out:
                    out.append(t)
                    if len(out) >= max_per_problem:
                        return out
        elif isinstance(val, dict):
            # Some dumps nest lists under language/solution keys.
            for x in val.values():
                if isinstance(x, list):
                    for y in x:
                        t = _decode_text(y).strip()
                        if t and t not in out:
                            out.append(t)
                            if len(out) >= max_per_problem:
                                return out
        else:
            # Scalar (str/bytes/None); _decode_text maps None to "".
            t = _decode_text(val).strip()
            if t and t not in out:
                out.append(t)
                if len(out) >= max_per_problem:
                    return out
    return out
153
 
154
 
155
def _collect_code_candidates(value, out: List[str], max_per_problem: int) -> None:
    """Recursively harvest code-like strings from *value* into *out*.

    *out* is mutated in place; collection stops once it holds
    *max_per_problem* entries. Strings/bytes are appended (deduplicated),
    lists are walked element-wise, and dicts are walked preferring
    well-known solution-bearing keys before all remaining values.
    """
    if len(out) >= max_per_problem:
        return
    if value is None:
        return
    if isinstance(value, str):
        v = value.strip()
        if v and v not in out:
            out.append(v)
        return
    if isinstance(value, bytes):
        v = _decode_text(value).strip()
        if v and v not in out:
            out.append(v)
        return
    if isinstance(value, list):
        for x in value:
            _collect_code_candidates(x, out, max_per_problem)
            if len(out) >= max_per_problem:
                return
        return
    if isinstance(value, dict):
        # Preferred keys first so well-labeled solutions win the cap race.
        for k in ("solution", "solutions", "code", "answer", "python", "cpp", "java", "javascript"):
            if k in value:
                _collect_code_candidates(value.get(k), out, max_per_problem)
                if len(out) >= max_per_problem:
                    return
        # Then sweep everything else (may revisit preferred values; dedupe guards).
        for v in value.values():
            _collect_code_candidates(v, out, max_per_problem)
            if len(out) >= max_per_problem:
                return
186
 
 
 
 
187
 
188
def _extract_many_generic_solutions(
    item: Dict[str, object],
    *,
    max_per_problem: int = 6,
) -> List[str]:
    """Gather up to *max_per_problem* candidate code strings from *item*.

    Scans the conventional solution-bearing keys in priority order and
    delegates nested traversal to ``_collect_code_candidates``.
    """
    found: List[str] = []
    for field in ("solutions", "solution", "code", "answer", "python", "cpp", "java", "javascript"):
        _collect_code_candidates(item.get(field), found, max_per_problem)
        if len(found) >= max_per_problem:
            break
    return found
199
 
200
 
201
+ def _compute_targets(target_size: int, min_problem_samples: int) -> Dict[str, int]:
202
+ instruction_target = int(target_size * 0.60)
203
+ structured_target = int(target_size * 0.30)
204
+ problem_target = target_size - instruction_target - structured_target
205
+ problem_target = max(problem_target, min_problem_samples)
206
+ remainder = target_size - problem_target
207
+ if remainder < 0:
208
+ raise RuntimeError(
209
+ f"Invalid target sizing: min_problem_samples={min_problem_samples} exceeds "
210
+ f"target_size={target_size}."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
211
  )
212
+ instruction_target = int(remainder * (60.0 / 90.0))
213
+ structured_target = remainder - instruction_target
214
+ return {
215
+ "instruction": instruction_target,
216
+ "structured": structured_target,
217
+ "problem": problem_target,
218
+ }
219
 
220
 
221
def rebalance_final_dataset(
    *,
    raw_paths: List[Path],
    output_path: Path,
    target_size: int,
    min_tokens: int,
    max_tokens: int,
    min_problem_samples: int,
    logger,
) -> Dict[str, object]:
    """Re-shard, dedupe, and rebalance raw JSONL dumps into the final train file.

    Streams every record from *raw_paths* through ``clean_record``, dedupes
    exact instruction/response pairs via an on-disk sqlite table, and writes
    one temp shard per category. Then fills *output_path* to the per-category
    targets from ``_compute_targets``, upsampling (re-reading shards) for
    instruction/structured shortfalls but never for "problem". Returns a
    stats dict (counts, breakdowns, synthetic-problem share).
    """
    # Post-build rebalance using streaming + temp shards, then exact down/upsample.
    tmp_dir = output_path.parent / "_rebalance_tmp"
    ensure_dirs([tmp_dir])

    shard_paths = {
        "instruction": tmp_dir / "instruction.jsonl",
        "structured": tmp_dir / "structured.jsonl",
        "problem": tmp_dir / "problem.jsonl",
    }
    # Start from clean shards so a rerun does not append to stale data.
    for p in shard_paths.values():
        if p.exists():
            p.unlink()

    dedupe_db = tmp_dir / "rebalance_seen.sqlite"
    if dedupe_db.exists():
        dedupe_db.unlink()
    # Also remove WAL sidecar files left over from a previous interrupted run.
    for suffix in ("-wal", "-shm"):
        side = dedupe_db.with_name(dedupe_db.name + suffix)
        if side.exists():
            side.unlink()

    conn = sqlite3.connect(str(dedupe_db))
    conn.execute("PRAGMA journal_mode=WAL;")
    conn.execute("CREATE TABLE IF NOT EXISTS seen_hashes (h TEXT PRIMARY KEY)")

    def is_dup(instruction: str, response: str) -> bool:
        # Insert-or-fail on the primary key doubles as the membership test.
        import hashlib

        h = hashlib.sha256(f"{instruction}||{response}".encode("utf-8")).hexdigest()
        try:
            conn.execute("INSERT INTO seen_hashes(h) VALUES (?)", (h,))
            return False
        except sqlite3.IntegrityError:
            return True

    shard_counts = Counter()
    with (
        shard_paths["instruction"].open("w", encoding="utf-8") as f_inst,
        shard_paths["structured"].open("w", encoding="utf-8") as f_struct,
        shard_paths["problem"].open("w", encoding="utf-8") as f_prob,
    ):
        writers = {
            "instruction": f_inst,
            "structured": f_struct,
            "problem": f_prob,
        }
        for raw_path in raw_paths:
            if not raw_path.exists():
                continue
            src_default = raw_path.stem
            for rec in tqdm(_iter_jsonl(raw_path), desc=f"rebalance_scan:{raw_path.name}", unit="rows"):
                # Backfill provenance for dumps written without metadata.
                if "_source" not in rec:
                    rec["_source"] = src_default
                if "_category" not in rec:
                    rec["_category"] = _source_to_category(src_default)
                cleaned = clean_record(rec, min_tokens=min_tokens, max_tokens=max_tokens)
                if cleaned is None:
                    continue
                if is_dup(cleaned["instruction"], cleaned["response"]):
                    continue
                cat = cleaned["_category"]
                if cat not in writers:
                    cat = _source_to_category(cleaned.get("_source", ""))
                line_obj = {
                    "instruction": cleaned["instruction"],
                    "response": cleaned["response"],
                    "_source": cleaned["_source"],
                    "_category": cat,
                }
                writers[cat].write(json.dumps(line_obj, ensure_ascii=False) + "\n")
                shard_counts[cat] += 1
    conn.commit()
    conn.close()

    targets = _compute_targets(target_size=target_size, min_problem_samples=min_problem_samples)
    logger.info("Rebalance targets: %s (available=%s)", targets, dict(shard_counts))

    source_breakdown = Counter()
    category_breakdown = Counter()
    total_tokens = 0
    total_samples = 0
    problem_real_count = 0
    problem_synthetic_count = 0
    # Cap synthetic (docstring-derived) problems at 30% of the problem target.
    max_synth_problem = int(targets["problem"] * 0.30)

    def write_from_shard(cat: str, needed: int, out_f) -> int:
        # Copy up to *needed* rows from the category shard into the final file,
        # stripping metadata fields and updating the shared stats counters.
        nonlocal total_samples, total_tokens, problem_real_count, problem_synthetic_count
        written = 0
        shard = shard_paths[cat]
        if not shard.exists():
            return 0
        with shard.open("r", encoding="utf-8") as f:
            for line in f:
                if written >= needed:
                    break
                obj = json.loads(line)
                src = obj.get("_source", "unknown")
                is_problem_synth = cat == "problem" and "codesearchnet_problem_fallback" in src
                if is_problem_synth and problem_synthetic_count >= max_synth_problem:
                    continue
                out_f.write(
                    json.dumps(
                        {"instruction": obj["instruction"], "response": obj["response"]},
                        ensure_ascii=False,
                    )
                    + "\n"
                )
                written += 1
                total_samples += 1
                category_breakdown[cat] += 1
                source_breakdown[src] += 1
                if cat == "problem":
                    if is_problem_synth:
                        problem_synthetic_count += 1
                    else:
                        problem_real_count += 1
                # Whitespace token count — approximate, used only for stats.
                total_tokens += len((obj["instruction"] + " " + obj["response"]).split())
        return written

    def upsample_shard(cat: str, needed: int, out_f) -> int:
        # Repeatedly re-read the shard to emit duplicates until *needed* rows
        # are written; bails out if a full pass makes no progress.
        nonlocal total_samples, total_tokens, problem_real_count, problem_synthetic_count
        shard = shard_paths[cat]
        if not shard.exists() or needed <= 0:
            return 0
        written = 0
        while written < needed:
            made_progress = 0
            with shard.open("r", encoding="utf-8") as f:
                for line in f:
                    if written >= needed:
                        break
                    obj = json.loads(line)
                    src = obj.get("_source", "unknown")
                    is_problem_synth = cat == "problem" and "codesearchnet_problem_fallback" in src
                    if is_problem_synth and problem_synthetic_count >= max_synth_problem:
                        continue
                    out_f.write(
                        json.dumps(
                            {"instruction": obj["instruction"], "response": obj["response"]},
                            ensure_ascii=False,
                        )
                        + "\n"
                    )
                    written += 1
                    made_progress += 1
                    total_samples += 1
                    category_breakdown[cat] += 1
                    source_breakdown[src] += 1
                    if cat == "problem":
                        if is_problem_synth:
                            problem_synthetic_count += 1
                        else:
                            problem_real_count += 1
                    total_tokens += len((obj["instruction"] + " " + obj["response"]).split())
            if made_progress == 0:
                break
        return written

    with output_path.open("w", encoding="utf-8") as out_f:
        for cat in ("instruction", "structured", "problem"):
            want = targets[cat]
            got = write_from_shard(cat, want, out_f)
            if got < want:
                deficit = want - got
                if cat == "problem":
                    # Problem rows are never duplicated; a shortfall stands.
                    logger.warning(
                        "Category %s shortfall: need=%d got=%d (no upsampling allowed for problem).",
                        cat,
                        want,
                        got,
                    )
                else:
                    upsampled = upsample_shard(cat, deficit, out_f)
                    logger.warning(
                        "Category %s shortfall: need=%d got=%d upsampled=%d",
                        cat,
                        want,
                        got,
                        upsampled,
                    )

    inst = category_breakdown["instruction"]
    struct = category_breakdown["structured"]
    problem = category_breakdown["problem"]
    instruction_vs_raw = {
        "instruction_pct": round(100.0 * inst / max(total_samples, 1), 2),
        "raw_converted_pct": round(100.0 * (struct + problem) / max(total_samples, 1), 2),
    }
    avg_len = round(total_tokens / max(total_samples, 1), 2)

    return {
        "total_samples": total_samples,
        "avg_length_tokens": avg_len,
        "source_breakdown": dict(source_breakdown),
        "category_breakdown": dict(category_breakdown),
        "instruction_vs_raw_ratio": instruction_vs_raw,
        "targets": targets,
        "problem_real_count": problem_real_count,
        "problem_synthetic_count": problem_synthetic_count,
        "problem_synthetic_pct": round(
            100.0 * problem_synthetic_count / max(problem_real_count + problem_synthetic_count, 1), 2
        ),
    }
434
+
435
+
436
def _try_load_dataset(candidates: List[Dict[str, object]], logger):
    """Try each ``load_dataset`` kwargs candidate in order, returning the first success.

    Re-raises the last loader error when every candidate fails, and raises
    RuntimeError when *candidates* is empty.
    """
    failure: Optional[Exception] = None
    for kwargs in candidates:
        try:
            dataset = load_dataset(**kwargs)
        except Exception as exc:
            logger.warning("Dataset load failed for %s: %s", kwargs, exc)
            failure = exc
        else:
            logger.info("Loaded dataset: %s", kwargs)
            return dataset
    if failure:
        raise failure
    raise RuntimeError("No dataset candidates provided.")
449
+
450
+
451
def fetch_instruction_codealpaca(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* CodeAlpaca instruction pairs into *raw_path* (JSONL).

    Returns the number of rows written.
    """
    ds = _try_load_dataset(
        [
            {"path": "sahil2801/CodeAlpaca-20k", "split": "train"},
            {"path": "HuggingFaceH4/CodeAlpaca_20K", "split": "train"},
        ],
        logger,
    )

    def rows():
        emitted = 0
        for item in tqdm(ds, desc="codealpaca", unit="rows"):
            if emitted >= limit:
                break
            instruction = _safe_get(item, ["instruction"])
            inp = _safe_get(item, ["input"])
            output = _safe_get(item, ["output", "response", "answer"])
            # Fold the optional input field into the instruction text.
            if inp:
                instruction = f"{instruction}\n\nInput:\n{inp}".strip()
            emitted += 1
            yield build_instruction_sample(
                instruction=instruction,
                response=output,
                source="codealpaca",
                category="instruction",
            )

    return _write_jsonl(raw_path, rows())
479
+
480
+
481
def fetch_instruction_evol(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* Evol-Instruct-style coding pairs into *raw_path* (JSONL).

    Tries several equivalent hubs in priority order. Returns rows written.
    """
    ds = _try_load_dataset(
        [
            {"path": "nickrosh/Evol-Instruct-Code-80k-v1", "split": "train"},
            {"path": "WizardLMTeam/WizardCoder-Evol-Instruct-V2-196k", "split": "train"},
            {"path": "ise-uiuc/Magicoder-OSS-Instruct-75K", "split": "train"},
        ],
        logger,
    )

    def rows():
        emitted = 0
        for item in tqdm(ds, desc="evol_instruct_code", unit="rows"):
            if emitted >= limit:
                break
            instruction = _safe_get(item, ["instruction", "prompt", "question"])
            inp = _safe_get(item, ["input"])
            output = _safe_get(item, ["output", "response", "answer"])
            # Fold the optional input field into the instruction text.
            if inp:
                instruction = f"{instruction}\n\nInput:\n{inp}".strip()
            emitted += 1
            yield build_instruction_sample(
                instruction=instruction,
                response=output,
                source="evol_instruct_code",
                category="instruction",
            )

    return _write_jsonl(raw_path, rows())
510
+
511
+
512
def fetch_instruction_ultrachat_code(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* code-related UltraChat turns into *raw_path* (JSONL).

    Keeps only the first user/assistant exchange of each conversation and
    filters by a coarse code-keyword heuristic. Returns rows written.
    """
    ds = _try_load_dataset(
        [
            {"path": "HuggingFaceH4/ultrachat_200k", "split": "train_sft"},
            {"path": "stingning/ultrachat", "split": "train"},
        ],
        logger,
    )
    code_terms = ("python", "javascript", "typescript", "java", "code", "api", "backend", "frontend")

    def rows():
        emitted = 0
        for item in tqdm(ds, desc="ultrachat_code", unit="rows"):
            if emitted >= limit:
                break
            msgs = item.get("messages") or item.get("conversation") or item.get("conversations")
            if not isinstance(msgs, list) or len(msgs) < 2:
                continue
            user = ""
            assistant = ""
            # Take the first user turn and the first assistant reply after it.
            for msg in msgs:
                if not isinstance(msg, dict):
                    continue
                role = str(msg.get("role", "")).lower()
                content = str(msg.get("content", "")).strip()
                if role in {"user", "human"} and not user:
                    user = content
                if role in {"assistant", "gpt"} and user and not assistant:
                    assistant = content
                    break
            if not user or not assistant:
                continue
            # Cheap keyword filter to keep conversations about coding topics.
            low = (user + " " + assistant).lower()
            if not any(term in low for term in code_terms):
                continue
            emitted += 1
            yield build_instruction_sample(
                instruction=user,
                response=assistant,
                source="ultrachat_code",
                category="instruction",
            )

    return _write_jsonl(raw_path, rows())
556
+
557
+
558
def fetch_instruction_openhermes_code(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* code-related OpenHermes/OpenOrca pairs into *raw_path*.

    Supports both flat instruction/output fields and ShareGPT-style
    "conversations" lists; filters by a coarse code-keyword heuristic.
    Returns rows written.
    """
    ds = _try_load_dataset(
        [
            {"path": "teknium/OpenHermes-2.5", "split": "train"},
            {"path": "Open-Orca/OpenOrca", "split": "train"},
        ],
        logger,
    )
    code_terms = ("python", "javascript", "typescript", "java", "code", "function", "api", "fastapi")

    def rows():
        emitted = 0
        for item in tqdm(ds, desc="openhermes_code", unit="rows"):
            if emitted >= limit:
                break
            instruction = _safe_get(item, ["instruction", "question", "prompt"])
            response = _safe_get(item, ["output", "response", "answer"])
            # Fallback: extract the first human/gpt exchange from ShareGPT format.
            if (not instruction or not response) and isinstance(item.get("conversations"), list):
                user = ""
                assistant = ""
                for msg in item.get("conversations"):
                    if not isinstance(msg, dict):
                        continue
                    from_role = str(msg.get("from", "")).lower()
                    value = str(msg.get("value", "")).strip()
                    if from_role in {"human", "user"} and not user:
                        user = value
                    if from_role in {"gpt", "assistant"} and user and not assistant:
                        assistant = value
                        break
                instruction = instruction or user
                response = response or assistant
            if not instruction or not response:
                continue
            low = (instruction + " " + response).lower()
            if not any(term in low for term in code_terms):
                continue
            emitted += 1
            yield build_instruction_sample(
                instruction=instruction,
                response=response,
                source="openhermes_code",
                category="instruction",
            )

    return _write_jsonl(raw_path, rows())
604
+
605
 
606
def fetch_structured_codesearchnet(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* CodeSearchNet functions (3 languages) into *raw_path*.

    The limit is split roughly evenly across python/javascript/java; each
    language prefers a local on-disk cache and falls back to streaming.
    Returns rows written.
    """
    languages = ["python", "javascript", "java"]
    per_lang = max(1, limit // max(1, len(languages)))

    def rows():
        emitted = 0
        for lang in languages:
            if emitted >= limit:
                break
            ds = None
            cache_by_lang = Path(f"./data/cache/raw/code_search_net_{lang}")
            if cache_by_lang.exists():
                try:
                    ds = load_from_disk(str(cache_by_lang))["train"]
                    logger.info("Loaded cached CodeSearchNet language=%s from %s", lang, cache_by_lang)
                except Exception as exc:
                    logger.warning("Failed cached CodeSearchNet for %s: %s", lang, exc)
            if ds is None:
                try:
                    ds = load_dataset("code_search_net", lang, split="train", streaming=True)
                    logger.info("Loaded streamed CodeSearchNet language=%s", lang)
                except Exception as exc:
                    logger.warning("Skipping CodeSearchNet language=%s: %s", lang, exc)
                    continue

            lang_count = 0
            for item in tqdm(ds, desc=f"codesearchnet_{lang}", unit="rows"):
                if emitted >= limit or lang_count >= per_lang:
                    break
                code = _safe_get(item, ["whole_func_string", "code"])
                path = _safe_get(item, ["path", "func_name"])
                doc = _safe_get(item, ["docstring", "func_documentation_string"])
                if not code:
                    continue
                emitted += 1
                lang_count += 1
                yield build_instruction_sample(
                    code=code,
                    instruction=doc,
                    language=lang,
                    path=path,
                    source=f"codesearchnet_{lang}",
                    category="structured",
                )

    return _write_jsonl(raw_path, rows())
652
+
653
+
654
def fetch_structured_github_functions(raw_path: Path, limit: int, logger) -> int:
    """Fetch up to *limit* GitHub-curated Python functions into *raw_path* (JSONL).

    Uses the cached CodeSearchNet Python split when available, otherwise a
    streamed copy; unlike the cache branch, the streaming fallback is not
    wrapped in try/except, so a network failure propagates to the caller.
    Returns rows written.
    """
    ds = None
    cache_path = Path("./data/cache/raw/code_search_net_python")
    if cache_path.exists():
        ds = load_from_disk(str(cache_path))["train"]
        logger.info("Using cached GitHub function corpus from %s", cache_path.resolve())
    else:
        ds = load_dataset("code_search_net", "python", split="train", streaming=True)
        logger.info("Using streamed CodeSearchNet python as GitHub-curated function source.")

    def rows():
        emitted = 0
        for item in tqdm(ds, desc="github_curated_functions", unit="rows"):
            if emitted >= limit:
                break
            code = _safe_get(item, ["whole_func_string", "code", "content"])
            path = _safe_get(item, ["path", "func_name"])
            repo = _safe_get(item, ["repo", "repository_name"])
            doc = _safe_get(item, ["docstring", "func_documentation_string"])
            if not code:
                continue
            # Title combines repo and path when both are present.
            title = f"{repo}/{path}" if repo and path else path
            emitted += 1
            yield build_instruction_sample(
                code=code,
                instruction=doc,
                language="python",
                path=path,
                title=title,
                source="github_curated_functions",
                category="structured",
            )

    return _write_jsonl(raw_path, rows())
688
+
689
+
690
def fetch_problem_leetcode(raw_path: Path, limit: int, logger) -> int:
    """Aggregate up to *limit* problem/solution pairs from many sources into *raw_path*.

    Order of precedence: local codeforces/problem JSONL dumps, then a list of
    HF hub datasets (leetcode, code_contests, APPS, MBPP, HumanEval, streamed
    open-r1/codeforces), then a synthetic fallback built from CodeSearchNet
    docstrings (capped at 30% of *limit*). Returns rows written.
    """
    def rows():
        emitted = 0
        synth_emitted = 0
        candidates = [
            ("greengerong/leetcode", {"path": "greengerong/leetcode", "split": "train"}),
            ("deepmind/code_contests", {"path": "deepmind/code_contests", "split": "train"}),
            ("codeparrot/apps", {"path": "codeparrot/apps", "split": "train"}),
            ("google-research-datasets/mbpp", {"path": "google-research-datasets/mbpp", "split": "train"}),
            ("openai_humaneval", {"path": "openai_humaneval", "split": "test"}),
            # Streamed high-volume real problem source; avoid full git clone.
            ("open-r1/codeforces", {"path": "open-r1/codeforces", "split": "train", "streaming": True}),
        ]

        # Optional local codeforces/problem-solution JSONL fallback.
        local_problem_files = sorted(RAW_DIR.glob("codeforces*.jsonl")) + sorted(
            RAW_DIR.glob("problem_solution*.jsonl")
        )
        if not local_problem_files:
            logger.warning(
                "Codeforces dataset missing – recommended for production quality."
            )
        for local_file in local_problem_files:
            if emitted >= limit:
                break
            for item in tqdm(_iter_jsonl(local_file), desc=f"problem_local:{local_file.name}", unit="rows"):
                if emitted >= limit:
                    break
                problem = _safe_get(item, ["problem", "instruction", "statement", "question"])
                solution = _safe_get(item, ["solution", "response", "answer", "code"])
                if not problem or not solution:
                    continue
                emitted += 1
                yield build_instruction_sample(
                    instruction=f"Solve the following problem:\n\n{problem}",
                    response=solution,
                    source="codeforces_local",
                    category="problem",
                )

        for source_name, cand in candidates:
            if emitted >= limit:
                break
            try:
                ds = load_dataset(**cand)
                logger.info("Loaded problem dataset: %s", cand)
            except Exception as exc:
                logger.warning("Problem dataset load failed for %s: %s", cand, exc)
                # APPS gets a local-file fallback when the hub load fails.
                if source_name == "codeparrot/apps":
                    apps_local = sorted(RAW_DIR.glob("apps*.jsonl")) + sorted(RAW_DIR.glob("apps*.json"))
                    if not apps_local:
                        logger.warning(
                            "APPS dataset unavailable via HF and local APPS JSON missing in ./data/raw."
                        )
                    for local_file in apps_local:
                        if emitted >= limit:
                            break
                        for item in tqdm(
                            _iter_jsonl(local_file),
                            desc=f"problem_apps_local:{local_file.name}",
                            unit="rows",
                        ):
                            if emitted >= limit:
                                break
                            problem = _safe_get(item, ["question", "prompt", "problem", "statement"])
                            solution = _safe_get(item, ["solution", "answer", "code"])
                            if not problem or not solution:
                                continue
                            emitted += 1
                            yield build_instruction_sample(
                                instruction=f"Solve the following problem:\n\n{problem}",
                                response=solution,
                                source="problem_apps_local",
                                category="problem",
                            )
                continue
            for item in tqdm(ds, desc=f"problem_{source_name}", unit="rows"):
                if emitted >= limit:
                    break
                title = _safe_get(item, ["title", "name", "problem_id", "task_id"])
                base_instruction = ""
                solutions: List[str] = []
                # Per-source field mapping: each branch fills problem text and
                # a list of candidate solutions.
                if source_name.endswith("mbpp"):
                    problem = _safe_get(item, ["text"])
                    tests = item.get("test_list") or []
                    test_blob = "\n".join(tests) if isinstance(tests, list) else _decode_text(tests)
                    if test_blob:
                        problem = f"{problem}\n\nTests:\n{test_blob}"
                    sol = _safe_get(item, ["code"])
                    solutions = [sol] if sol else []
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                elif source_name.endswith("humaneval"):
                    problem = _safe_get(item, ["prompt"])
                    tests = _safe_get(item, ["test"])
                    if tests:
                        problem = f"{problem}\n\nTests:\n{tests}"
                    sol = _safe_get(item, ["canonical_solution"])
                    solutions = [sol] if sol else []
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                elif source_name.endswith("code_contests"):
                    problem = _safe_get(item, ["description", "problem", "question", "prompt"])
                    solutions = _extract_many_code_contests_solutions(item, max_per_problem=6)
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                elif source_name.endswith("apps"):
                    problem = _safe_get(item, ["question", "problem", "prompt", "statement"])
                    solutions = _extract_many_apps_solutions(item, max_per_problem=5)
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                elif source_name.endswith("open-r1/codeforces"):
                    problem = _safe_get(
                        item,
                        ["problem", "statement", "question", "prompt", "description", "content"],
                    )
                    solutions = _extract_many_generic_solutions(item, max_per_problem=6)
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                else:
                    # Generic branch (e.g. greengerong/leetcode): per-language fields.
                    problem = _safe_get(item, ["content", "description", "question", "prompt", "statement"])
                    langs = [
                        _safe_get(item, ["python"]),
                        _safe_get(item, ["javascript"]),
                        _safe_get(item, ["java"]),
                        _safe_get(item, ["c++"]),
                        _safe_get(item, ["answer"]),
                        _safe_get(item, ["code"]),
                    ]
                    solutions = [s for s in langs if s]
                    if isinstance(item.get("solutions"), list):
                        for extra in item["solutions"]:
                            t = _decode_text(extra).strip()
                            if t and t not in solutions:
                                solutions.append(t)
                    base_instruction = f"Solve this coding problem: {title}\n\n{problem}"
                if not problem or not solutions:
                    continue
                for sol in solutions:
                    if emitted >= limit:
                        break
                    # Skip trivially short snippets (< 20 chars).
                    if not sol or len(sol.strip()) < 20:
                        continue
                    emitted += 1
                    yield build_instruction_sample(
                        instruction=base_instruction,
                        response=sol,
                        source=f"problem_{source_name.replace('/', '_')}",
                        category="problem",
                    )

        # Final problem fallback from CodeSearchNet docstrings to boost high-quality problem pairs.
        if emitted < limit:
            synth_cap = int(limit * 0.30)
            cache_path = Path("./data/cache/raw/code_search_net_python")
            ds = None
            if cache_path.exists():
                try:
                    ds = load_from_disk(str(cache_path))["train"]
                    logger.info("Using cached CodeSearchNet Python for problem fallback.")
                except Exception:
                    ds = None
            if ds is None:
                try:
                    ds = load_dataset("code_search_net", "python", split="train", streaming=True)
                    logger.info("Using streamed CodeSearchNet Python for problem fallback.")
                except Exception as exc:
                    logger.warning("Problem fallback CodeSearchNet failed: %s", exc)
                    ds = None
            if ds is not None:
                for item in tqdm(ds, desc="problem_codesearchnet_fallback", unit="rows"):
                    if emitted >= limit or synth_emitted >= synth_cap:
                        break
                    doc = _safe_get(item, ["docstring", "func_documentation_string"])
                    code = _safe_get(item, ["whole_func_string", "code"])
                    # Require a substantive docstring (>= 30 chars) as the task text.
                    if len(doc.strip()) < 30 or not code:
                        continue
                    emitted += 1
                    synth_emitted += 1
                    yield build_instruction_sample(
                        instruction=f"Solve the following programming task:\n\n{doc}",
                        response=code,
                        source="codesearchnet_problem_fallback",
                        category="problem",
                    )

    return _write_jsonl(raw_path, rows())
 
 
872
 
 
873
 
874
def fetch_problem_codeforces(raw_path: Path, limit: int, logger) -> int:
    """Copy up to *limit* pre-ingested Codeforces pairs from data/raw into *raw_path*.

    Reads ``./data/raw/codeforces.jsonl`` (produced by the separate ingest
    script — TODO confirm against codeforces_ingest.py); returns 0 when the
    file is absent. Returns rows written.
    """
    source_file = RAW_DIR / "codeforces.jsonl"
    if not source_file.exists():
        logger.warning("Codeforces dataset file not found: %s", source_file.resolve())
        return 0

    def rows():
        emitted = 0
        for item in tqdm(_iter_jsonl(source_file), desc="problem_codeforces", unit="rows"):
            if emitted >= limit:
                break
            instruction = _safe_get(item, ["instruction", "problem", "statement", "question"])
            response = _safe_get(item, ["response", "solution", "answer", "code"])
            if not instruction or not response:
                continue
            # Normalize to a consistent "Solve the following problem" prompt.
            if not instruction.lower().startswith("solve the following problem"):
                instruction = f"Solve the following problem:\n{instruction}"
            emitted += 1
            yield build_instruction_sample(
                instruction=instruction,
                response=response,
                source="codeforces_dataset",
                category="problem",
            )

    count = _write_jsonl(raw_path, rows())
    logger.info("Loaded Codeforces pre-ingested samples: %d", count)
    return count
902
+
903
+
904
def build_dataset(args) -> Path:
    """Run the full dataset build: fetch sources, balance, rebalance, validate.

    Pipeline:
      1. Fetch each configured source into ./data/raw/*.jsonl (unless --skip-fetch).
      2. Build a deduplicated, category-weighted dataset (instruction/structured/problem).
      3. Strictly rebalance the result and enforce problem-quality gates.

    Raises:
        RuntimeError: when problem-category quotas or the synthetic-share cap
            are violated after rebalance (aborting beats shipping a bad dataset).

    Returns:
        Path to the final train.jsonl file.
    """
    ensure_dirs([RAW_DIR, FINAL_DIR, LOG_DIR])
    logger = setup_logger("data_fetch_build", LOG_DIR / "data_fetch.log")

    logger.info("Starting production dataset build. target_size=%d", args.target_size)
    logger.info("Raw dir: %s", RAW_DIR.resolve())
    logger.info("Final dir: %s", FINAL_DIR.resolve())

    # Source name -> (fetch function, per-source row limit).
    # openhermes is hard-capped at 120k regardless of the CLI flag.
    fetch_plan = {
        "codealpaca": (fetch_instruction_codealpaca, args.codealpaca_limit),
        "evol_instruct_code": (fetch_instruction_evol, args.evol_limit),
        "ultrachat_code": (fetch_instruction_ultrachat_code, args.ultrachat_limit),
        "openhermes_code": (fetch_instruction_openhermes_code, min(args.openhermes_limit, 120_000)),
        "codesearchnet_multilang": (fetch_structured_codesearchnet, args.codesearchnet_limit),
        "github_curated_functions": (fetch_structured_github_functions, args.github_limit),
        "codeforces_problem": (fetch_problem_codeforces, args.codeforces_limit),
        "leetcode_competitive": (fetch_problem_leetcode, args.leetcode_limit),
    }

    raw_paths: List[Path] = []
    if not args.skip_fetch:
        for name, (fn, limit) in fetch_plan.items():
            raw_path = RAW_DIR / f"{name}.jsonl"
            raw_paths.append(raw_path)
            try:
                count = fn(raw_path, limit, logger)
                logger.info("Fetched %d rows for source=%s", count, name)
            except Exception as exc:
                # A single failing source must not abort the whole build.
                logger.warning("Skipping source=%s due to fetch error: %s", name, exc)
    else:
        raw_paths = sorted(RAW_DIR.glob("*.jsonl"))
        logger.info("Skip fetch enabled. Using existing raw files: %d", len(raw_paths))

    # Phase 1: base balanced build (streaming + dedupe).
    stats = build_balanced_dataset(
        input_paths=raw_paths,
        output_path=FINAL_TRAIN,
        target_size=args.target_size,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        num_workers=args.workers,
        category_weights={"instruction": 0.60, "structured": 0.30, "problem": 0.10},
        sqlite_path=FINAL_DIR / "dedupe_hashes.sqlite",
    )

    # Phase 2: post-build strict rebalance (downsample excess + upsample deficits).
    rebalance_stats = rebalance_final_dataset(
        raw_paths=raw_paths,
        output_path=FINAL_TRAIN,
        target_size=args.target_size,
        min_tokens=args.min_tokens,
        max_tokens=args.max_tokens,
        min_problem_samples=args.min_problem_samples,
        logger=logger,
    )

    # Quality gates on the problem category (counts come from rebalance stats).
    actual_problem = int(rebalance_stats["category_breakdown"].get("problem", 0))
    required_problem = int(args.min_problem_samples)
    real_problem = int(rebalance_stats.get("problem_real_count", 0))
    synthetic_problem = int(rebalance_stats.get("problem_synthetic_count", 0))
    synthetic_ratio = synthetic_problem / max(actual_problem, 1)

    if actual_problem < max(required_problem, args.min_total_problem_samples):
        raise RuntimeError(
            "Build aborted: insufficient problem-solving data after rebalance. "
            f"Required >= {max(required_problem, args.min_total_problem_samples)}, actual = {actual_problem}. "
            "Increase problem dataset sources (e.g., leetcode/code contests/problem-solution datasets) "
            "or raise problem fetch limits, then rebuild."
        )
    if real_problem < args.min_real_problem_samples:
        raise RuntimeError(
            "Build aborted: insufficient REAL problem-solving data after rebalance. "
            f"Required real >= {args.min_real_problem_samples}, actual real = {real_problem}. "
            "Add more high-quality real problem datasets (APPS/CodeContests/Codeforces/LeetCode)."
        )
    if synthetic_ratio > args.max_synthetic_problem_ratio:
        raise RuntimeError(
            "Build aborted: synthetic problem share too high. "
            f"Allowed <= {args.max_synthetic_problem_ratio:.0%}, actual = {synthetic_ratio:.2%}. "
            "Increase real problem sources and reduce synthetic fallback usage."
        )

    logger.info("Build complete. Final dataset: %s", FINAL_TRAIN.resolve())
    logger.info("Base stats: %s", stats)
    logger.info("Rebalanced stats: %s", rebalance_stats)

    # Human-readable summary for the operator.
    print(f"Final dataset: {FINAL_TRAIN.resolve()}")
    print(f"Total samples: {rebalance_stats['total_samples']}")
    print(f"Avg length (tokens est.): {rebalance_stats['avg_length_tokens']}")
    print("Per-source breakdown:")
    for src, count in sorted(
        rebalance_stats["source_breakdown"].items(), key=lambda x: x[1], reverse=True
    ):
        print(f" - {src}: {count}")
    print("Category breakdown:")
    for cat, count in sorted(rebalance_stats["category_breakdown"].items(), key=lambda x: x[0]):
        print(f" - {cat}: {count} (target: {rebalance_stats['targets'].get(cat, 0)})")
    ratio = rebalance_stats["instruction_vs_raw_ratio"]
    print(
        f"Instruction vs raw-converted ratio: {ratio['instruction_pct']}% / {ratio['raw_converted_pct']}%"
    )
    total = max(1, rebalance_stats["total_samples"])
    print("Category percentages:")
    for cat in ("instruction", "structured", "problem"):
        pct = 100.0 * rebalance_stats["category_breakdown"].get(cat, 0) / total
        print(f" - {cat}: {pct:.2f}%")
    print(f"Real problem count: {real_problem}")
    print(f"Synthetic problem count: {synthetic_problem}")
    print(f"Synthetic problem %: {synthetic_ratio * 100:.2f}%")
    return FINAL_TRAIN
1014
 
 
 
 
1015
 
1016
+ def _build_parser() -> argparse.ArgumentParser:
1017
+ parser = argparse.ArgumentParser(description="Production-grade coding dataset build pipeline.")
1018
+ parser.add_argument("--build", action="store_true", help="Run the full build pipeline.")
1019
+ parser.add_argument("--target-size", type=int, default=1_000_000)
1020
+ parser.add_argument("--min-tokens", type=int, default=10)
1021
+ parser.add_argument("--max-tokens", type=int, default=2048)
1022
+ parser.add_argument("--skip-fetch", action="store_true", help="Use existing ./data/raw/*.jsonl only.")
1023
+ parser.add_argument(
1024
+ "--workers",
1025
+ type=int,
1026
+ default=max(1, (os.cpu_count() or 4) // 2),
1027
+ help="Parallel worker processes for cleaning stage.",
1028
+ )
1029
 
1030
+ parser.add_argument("--codealpaca-limit", type=int, default=20000)
1031
+ parser.add_argument("--evol-limit", type=int, default=300000)
1032
+ parser.add_argument("--ultrachat-limit", type=int, default=250000)
1033
+ parser.add_argument("--openhermes-limit", type=int, default=250000)
1034
+ parser.add_argument("--codesearchnet-limit", type=int, default=300000)
1035
+ parser.add_argument("--github-limit", type=int, default=200000)
1036
+ parser.add_argument("--codeforces-limit", type=int, default=200000)
1037
+ parser.add_argument("--leetcode-limit", type=int, default=300000)
1038
+ parser.add_argument(
1039
+ "--stackoverflow-limit",
1040
+ type=int,
1041
+ default=0,
1042
+ help="Deprecated. StackOverflow sources were removed due unreliability.",
1043
+ )
1044
  parser.add_argument(
1045
+ "--min-problem-samples",
1046
+ type=int,
1047
+ default=50_000,
1048
+ help="Ensure at least this many samples in problem category during post-rebalance.",
1049
+ )
1050
+ parser.add_argument(
1051
+ "--min-real-problem-samples",
1052
+ type=int,
1053
+ default=50_000,
1054
+ help="Minimum REAL problem samples required after rebalance.",
1055
+ )
1056
+ parser.add_argument(
1057
+ "--min-total-problem-samples",
1058
+ type=int,
1059
+ default=80_000,
1060
+ help="Minimum total problem samples required after rebalance.",
1061
+ )
1062
+ parser.add_argument(
1063
+ "--max-synthetic-problem-ratio",
1064
+ type=float,
1065
+ default=0.30,
1066
+ help="Maximum allowed synthetic (docstring fallback) share in problem category.",
1067
  )
1068
  return parser
1069
 
1070
 
1071
if __name__ == "__main__":
    # CLI entry point: run the full build only when --build is passed,
    # otherwise print usage so a bare invocation is harmless.
    parser = _build_parser()
    args = parser.parse_args()
    if args.build:
        build_dataset(args)
    else:
        parser.print_help()
dataset_cleaner.py ADDED
@@ -0,0 +1,342 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import hashlib
2
+ import json
3
+ import multiprocessing as mp
4
+ import re
5
+ import sqlite3
6
+ from collections import Counter, defaultdict
7
+ from pathlib import Path
8
+ from typing import Dict, Iterable, Iterator, List, Optional
9
+
10
+ from tqdm import tqdm
11
+
12
+
13
+ TOKEN_PATTERN = re.compile(r"\w+|[^\w\s]", re.UNICODE)
14
+ CODE_PATTERN = re.compile(
15
+ r"(\bdef\b|\bclass\b|\bimport\b|\breturn\b|=>|function\s+\w+|public\s+class|#include|```)",
16
+ re.IGNORECASE,
17
+ )
18
+ EXPLANATION_PATTERN = re.compile(
19
+ r"\b(explain|because|algorithm|steps|approach|complexity|solution)\b", re.IGNORECASE
20
+ )
21
+ PROBLEM_PROMPT_RE = re.compile(
22
+ r"\b(solve|given|find|compute|return|input|output|problem|algorithm|task|challenge)\b",
23
+ re.IGNORECASE,
24
+ )
25
+
26
+
27
def estimate_tokens(text: str) -> int:
    """Cheap token estimate: count word chunks and individual punctuation marks."""
    return len(TOKEN_PATTERN.findall(text)) if text else 0
31
+
32
+
33
def normalize_text(text: str) -> str:
    """Coerce to str, normalize newlines, drop control chars, trim line ends."""
    if text is None:
        return ""
    value = str(text).replace("\x00", "")
    value = value.replace("\r\n", "\n").replace("\r", "\n")
    # Keep printable characters plus newline/tab; drop other control codes.
    kept = "".join(ch for ch in value if ch == "\n" or ch == "\t" or ord(ch) >= 32)
    trimmed = [line.rstrip() for line in kept.split("\n")]
    return "\n".join(trimmed).strip()
41
+
42
+
43
+ def _ascii_ratio(text: str) -> float:
44
+ if not text:
45
+ return 1.0
46
+ ascii_count = sum(1 for c in text if ord(c) < 128)
47
+ return ascii_count / len(text)
48
+
49
+
50
def _response_is_valid(response: str) -> bool:
    """A response is usable if it contains code markers or explanation language."""
    if not response:
        return False
    has_code = CODE_PATTERN.search(response) is not None
    has_explanation = EXPLANATION_PATTERN.search(response) is not None
    return has_code or has_explanation
58
+
59
+
60
+ def _response_has_code(response: str) -> bool:
61
+ return bool(
62
+ re.search(
63
+ r"(\bdef\b|\bclass\b|\breturn\b|\bimport\b|```|function\s+\w+|public\s+class|#include|SELECT\s+)",
64
+ response,
65
+ re.IGNORECASE,
66
+ )
67
+ )
68
+
69
+
70
def clean_record(
    record: Dict[str, str],
    *,
    min_tokens: int = 10,
    max_tokens: int = 2048,
) -> Optional[Dict[str, str]]:
    """Normalize and validate one raw record; return None when any gate fails.

    Gates (in order): non-empty instruction/response, >=85% ASCII overall,
    response looks like code or an explanation, stricter rules for the
    problem category, and a total token estimate within [min_tokens, max_tokens].

    Returns:
        A dict with normalized fields plus `_source`, `_category`, `_tokens`,
        or None when the record is rejected.
    """
    instruction = normalize_text(record.get("instruction", ""))
    response = normalize_text(record.get("response", ""))
    source = normalize_text(record.get("_source", "unknown"))
    category = normalize_text(record.get("_category", ""))
    if not category:
        # Records without an explicit category are classified from the source name.
        src_low = source.lower()
        if any(k in src_low for k in ("codealpaca", "evol", "ultrachat", "openhermes", "orca")):
            category = "instruction"
        elif any(
            k in src_low
            for k in (
                "leetcode",
                "contest",
                "mbpp",
                "humaneval",
                "apps",
                "codeforces",
                "problem",
                "codesearchnet_problem",
            )
        ):
            category = "problem"
        else:
            category = "structured"

    if not instruction or not response:
        return None
    if _ascii_ratio(instruction + response) < 0.85:
        return None
    if not _response_is_valid(response):
        return None
    if category == "problem":
        # Problem prompts must be substantial and read like a task statement.
        if len(instruction) <= 50:
            return None
        if not PROBLEM_PROMPT_RE.search(instruction):
            return None
        if not _response_has_code(response):
            return None
        # Problem solutions must include code, not explanation-only text.
        if EXPLANATION_PATTERN.search(response) and not CODE_PATTERN.search(response):
            return None

    total_tokens = estimate_tokens(instruction) + estimate_tokens(response)
    if total_tokens < min_tokens or total_tokens > max_tokens:
        return None

    return {
        "instruction": instruction,
        "response": response,
        "_source": source,
        "_category": category,
        "_tokens": total_tokens,
    }
129
+
130
+
131
+ def _iter_jsonl(path: Path) -> Iterable[Dict[str, str]]:
132
+ with path.open("r", encoding="utf-8") as f:
133
+ for line in f:
134
+ line = line.strip()
135
+ if not line:
136
+ continue
137
+ try:
138
+ yield json.loads(line)
139
+ except json.JSONDecodeError:
140
+ continue
141
+
142
+
143
def _clean_record_worker(payload: Dict[str, object]) -> Optional[Dict[str, str]]:
    """Pool-friendly wrapper: unpack one payload dict and run clean_record."""
    return clean_record(
        payload["record"],
        min_tokens=int(payload["min_tokens"]),
        max_tokens=int(payload["max_tokens"]),
    )
148
+
149
+
150
def iter_cleaned_records(
    path: Path,
    *,
    min_tokens: int,
    max_tokens: int,
    num_workers: int = 1,
    batch_size: int = 2000,
) -> Iterator[Dict[str, str]]:
    """Stream cleaned records from a JSONL file, optionally in parallel.

    With num_workers <= 1, records are cleaned inline. Otherwise records are
    batched (batch_size at a time) and dispatched to a multiprocessing pool;
    results within a batch may arrive out of order (imap_unordered).

    Yields:
        Cleaned record dicts as produced by clean_record (rejects are dropped).
    """
    if num_workers <= 1:
        for record in _iter_jsonl(path):
            cleaned = clean_record(record, min_tokens=min_tokens, max_tokens=max_tokens)
            if cleaned is not None:
                yield cleaned
        return

    pool = mp.Pool(processes=num_workers)
    try:
        batch: List[Dict[str, str]] = []
        for record in _iter_jsonl(path):
            batch.append(record)
            if len(batch) < batch_size:
                continue
            # Each payload carries the token bounds so workers need no globals.
            payloads = [
                {"record": r, "min_tokens": min_tokens, "max_tokens": max_tokens} for r in batch
            ]
            for cleaned in pool.imap_unordered(_clean_record_worker, payloads, chunksize=64):
                if cleaned is not None:
                    yield cleaned
            batch.clear()

        # Flush the final partial batch.
        if batch:
            payloads = [{"record": r, "min_tokens": min_tokens, "max_tokens": max_tokens} for r in batch]
            for cleaned in pool.imap_unordered(_clean_record_worker, payloads, chunksize=64):
                if cleaned is not None:
                    yield cleaned
    finally:
        # Always tear the pool down, even if the consumer stops early.
        pool.close()
        pool.join()
188
+
189
+
190
+ def _remove_sqlite_artifacts(sqlite_path: Path) -> None:
191
+ if sqlite_path.exists():
192
+ sqlite_path.unlink()
193
+ for suffix in ("-wal", "-shm"):
194
+ p = sqlite_path.with_name(sqlite_path.name + suffix)
195
+ if p.exists():
196
+ p.unlink()
197
+
198
+
199
def _open_dedupe_db(sqlite_path: Path):
    """Create a fresh dedupe database; stale DB/WAL/SHM files are removed first."""
    resolved = sqlite_path.resolve()
    resolved.parent.mkdir(parents=True, exist_ok=True)
    _remove_sqlite_artifacts(resolved)
    conn = sqlite3.connect(str(resolved))
    conn.execute("PRAGMA journal_mode=WAL;")
    conn.execute("CREATE TABLE IF NOT EXISTS seen_hashes (h TEXT PRIMARY KEY)")
    return conn
207
+
208
+
209
+ def _is_duplicate(conn, instruction: str, response: str) -> bool:
210
+ digest = hashlib.sha256(f"{instruction}||{response}".encode("utf-8")).hexdigest()
211
+ try:
212
+ conn.execute("INSERT INTO seen_hashes(h) VALUES (?)", (digest,))
213
+ return False
214
+ except sqlite3.IntegrityError:
215
+ return True
216
+
217
+
218
def build_balanced_dataset(
    input_paths: List[Path],
    output_path: Path,
    *,
    target_size: int = 1_000_000,
    min_tokens: int = 10,
    max_tokens: int = 2048,
    category_weights: Optional[Dict[str, float]] = None,
    sqlite_path: Optional[Path] = None,
    num_workers: int = 1,
) -> Dict[str, object]:
    """Stream-clean raw JSONL files into one deduplicated, quota-balanced file.

    Two phases: (1) fill per-category quotas (default 60/30/10 for
    instruction/structured/problem), (2) top up to target_size from any
    category. Exact-duplicate pairs are rejected via a sqlite hash table.

    Returns:
        Stats dict: totals, average token length, per-source and per-category
        breakdowns, instruction-vs-raw ratio, and the per-category targets.
    """
    output_path.parent.mkdir(parents=True, exist_ok=True)
    if sqlite_path is None:
        sqlite_path = output_path.parent / "dedupe_hashes.sqlite"
    conn = _open_dedupe_db(sqlite_path)

    weights = category_weights or {"instruction": 0.60, "structured": 0.30, "problem": 0.10}
    target_by_cat = {k: int(target_size * v) for k, v in weights.items()}
    # Give rounding leftovers to "problem" so quotas sum exactly to target_size.
    target_by_cat["problem"] = target_size - target_by_cat["instruction"] - target_by_cat["structured"]

    # Route each raw file to a category by filename keywords.
    grouped_paths: Dict[str, List[Path]] = defaultdict(list)
    for path in input_paths:
        if not path.exists():
            continue
        name = path.stem
        if "codealpaca" in name or "evol" in name or "ultrachat" in name or "openhermes" in name:
            grouped_paths["instruction"].append(path)
        elif any(
            k in name
            for k in (
                "leetcode",
                "contest",
                "problem",
                "mbpp",
                "humaneval",
                "apps",
                "codeforces",
            )
        ):
            grouped_paths["problem"].append(path)
        else:
            grouped_paths["structured"].append(path)

    source_counter = Counter()
    category_counter = Counter()
    total_tokens = 0
    total_kept = 0

    def try_write(cleaned: Dict[str, str], out_f, enforce_category_target: bool) -> bool:
        """Write one cleaned record unless it busts its quota or is a duplicate."""
        nonlocal total_kept, total_tokens
        category = cleaned["_category"]
        if enforce_category_target and category_counter[category] >= target_by_cat.get(category, 0):
            return False
        if _is_duplicate(conn, cleaned["instruction"], cleaned["response"]):
            return False
        source = cleaned["_source"]
        tokens = int(cleaned["_tokens"])
        category_counter[category] += 1
        source_counter[source] += 1
        total_tokens += tokens
        total_kept += 1
        # Only instruction/response are persisted; metadata stays in the stats.
        out_f.write(
            json.dumps(
                {"instruction": cleaned["instruction"], "response": cleaned["response"]},
                ensure_ascii=False,
            )
            + "\n"
        )
        return True

    with output_path.open("w", encoding="utf-8") as out_f:
        # Phase 1: enforce 60/30/10 quotas.
        for category in ("instruction", "structured", "problem"):
            if category not in grouped_paths:
                continue
            for path in grouped_paths[category]:
                cleaned_iter = iter_cleaned_records(
                    path,
                    min_tokens=min_tokens,
                    max_tokens=max_tokens,
                    num_workers=num_workers,
                )
                for cleaned in tqdm(cleaned_iter, desc=f"balance1:{path.name}", unit="rows"):
                    if total_kept >= target_size or category_counter[category] >= target_by_cat[category]:
                        break
                    try_write(cleaned, out_f, enforce_category_target=True)
                # Commit dedupe hashes after each file so a crash loses little.
                conn.commit()
                if total_kept >= target_size or category_counter[category] >= target_by_cat[category]:
                    continue

        # Phase 2: fill remaining slots from all categories while preserving dedupe.
        if total_kept < target_size:
            for path in input_paths:
                if not path.exists():
                    continue
                cleaned_iter = iter_cleaned_records(
                    path,
                    min_tokens=min_tokens,
                    max_tokens=max_tokens,
                    num_workers=num_workers,
                )
                for cleaned in tqdm(cleaned_iter, desc=f"balance2:{path.name}", unit="rows"):
                    if total_kept >= target_size:
                        break
                    try_write(cleaned, out_f, enforce_category_target=False)
                conn.commit()
                if total_kept >= target_size:
                    break

    conn.close()
    avg_len = round((total_tokens / total_kept), 2) if total_kept else 0.0
    raw_converted = category_counter["structured"] + category_counter["problem"]
    ratio = {
        "instruction_pct": round(100.0 * category_counter["instruction"] / max(total_kept, 1), 2),
        "raw_converted_pct": round(100.0 * raw_converted / max(total_kept, 1), 2),
    }

    return {
        "total_samples": total_kept,
        "avg_length_tokens": avg_len,
        "source_breakdown": dict(source_counter),
        "category_breakdown": dict(category_counter),
        "instruction_vs_raw_ratio": ratio,
        "targets": target_by_cat,
    }
dataset_formatter.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from typing import Dict, Optional
3
+
4
+
5
+ FUNC_RE = re.compile(r"\bdef\s+([a-zA-Z_]\w*)\s*\(|\bfunction\s+([a-zA-Z_]\w*)\s*\(")
6
+ CLASS_RE = re.compile(r"\bclass\s+([a-zA-Z_]\w*)")
7
+ DOCSTRING_RE = re.compile(r'"""(.*?)"""|\'\'\'(.*?)\'\'\'', re.DOTALL)
8
+ COMMENT_RE = re.compile(r"^\s*(#|//)\s*(.+)$", re.MULTILINE)
9
+
10
+
11
def normalize_spaces(text: str) -> str:
    """Normalize CR/CRLF newlines to LF and trim surrounding whitespace."""
    if not text:
        return ""
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    return unified.strip()
15
+
16
+
17
+ def _first_non_empty(*vals: Optional[str]) -> str:
18
+ for v in vals:
19
+ if v and str(v).strip():
20
+ return str(v).strip()
21
+ return ""
22
+
23
+
24
def infer_language(lang: str = "", path: str = "") -> str:
    """Infer a language tag from an explicit hint, else from a file extension."""
    hint = (lang or "").lower()
    if hint:
        return hint
    suffix_map = {
        ".py": "python",
        ".js": "javascript",
        ".ts": "typescript",
        ".java": "java",
    }
    lowered = (path or "").lower()
    for suffix, detected in suffix_map.items():
        if lowered.endswith(suffix):
            return detected
    return "code"
38
+
39
+
40
def extract_function_name(code: str) -> str:
    """Pull the first function name from code, falling back to a class name."""
    if not code:
        return ""
    func_match = FUNC_RE.search(code)
    if func_match:
        return func_match.group(1) or func_match.group(2) or ""
    class_match = CLASS_RE.search(code)
    return (class_match.group(1) or "") if class_match else ""
50
+
51
+
52
def extract_doc_or_comment(code: str) -> str:
    """Return the first docstring body, else the first #/// comment text."""
    if not code:
        return ""
    doc_match = DOCSTRING_RE.search(code)
    if doc_match:
        return _first_non_empty(doc_match.group(1), doc_match.group(2))
    comment_match = COMMENT_RE.search(code)
    return comment_match.group(2).strip() if comment_match else ""
62
+
63
+
64
def code_to_instruction(code: str, *, language: str = "", path: str = "", title: str = "") -> str:
    """Synthesize a natural-language instruction describing a code snippet.

    Preference order: function name + hint, function name alone, hint alone,
    file path, and finally a generic prompt.
    """
    normalized = normalize_spaces(code)
    lang = infer_language(language, path)
    func = extract_function_name(normalized)
    hint = _first_non_empty(title, extract_doc_or_comment(normalized))

    if func and hint:
        return f"Write a {lang} implementation of `{func}`. Requirements: {hint}"
    if func:
        return f"Write a {lang} function `{func}`."
    if hint:
        return f"Implement this {lang} code task: {hint}"
    if path:
        return f"Implement or refactor the {lang} code from `{path}`."
    return f"Write a correct and production-ready {lang} code snippet."
79
+
80
+
81
def build_instruction_sample(
    *,
    instruction: str = "",
    response: str = "",
    code: str = "",
    language: str = "",
    path: str = "",
    title: str = "",
    source: str,
    category: str,
) -> Dict[str, str]:
    """Assemble one instruction/response training record with metadata tags.

    When instruction or response is empty, both are derived from *code*:
    the instruction is synthesized and the code itself becomes the response.
    """
    final_instruction = instruction or code_to_instruction(
        code, language=language, path=path, title=title
    )
    final_response = response or code
    return {
        "instruction": normalize_spaces(final_instruction),
        "response": normalize_spaces(final_response),
        "_source": source,
        "_category": category,
    }
102
+
final_model/config.json ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "MindiForCausalLM"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "configuration_mindi.MindiConfig",
7
+ "AutoModelForCausalLM": "modeling_mindi.MindiForCausalLM",
8
+ "AutoTokenizer": [
9
+ null,
10
+ "tokenization_mindi.MindiTokenizer"
11
+ ]
12
+ },
13
+ "bos_token_id": 2,
14
+ "d_ff": 4608,
15
+ "d_model": 1152,
16
+ "dropout": 0.1,
17
+ "dtype": "float16",
18
+ "eos_token_id": 3,
19
+ "init_std": 0.02,
20
+ "max_seq_len": 2048,
21
+ "model_type": "mindi",
22
+ "n_heads": 16,
23
+ "n_layers": 23,
24
+ "pad_token_id": 0,
25
+ "rms_norm_eps": 1e-05,
26
+ "tie_embeddings": true,
27
+ "transformers_version": "5.4.0",
28
+ "vocab_size": 50000
29
+ }
final_model/configuration_mindi.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face config class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from transformers import PretrainedConfig
6
+
7
+
8
class MindiConfig(PretrainedConfig):
    """Hugging Face configuration for the MINDI 1.0 420M causal LM.

    Defaults mirror the shipped checkpoint (1152-dim, 23 layers, 16 heads,
    2048 context, 50k vocab with tied embeddings).
    """

    model_type = "mindi"

    def __init__(
        self,
        vocab_size=50000,
        max_seq_len=2048,
        d_model=1152,
        n_layers=23,
        n_heads=16,
        d_ff=4608,
        dropout=0.1,
        tie_embeddings=True,
        init_std=0.02,
        rms_norm_eps=1e-5,
        bos_token_id=2,
        eos_token_id=3,
        pad_token_id=0,
        **kwargs,
    ):
        # Special-token ids are handled by the PretrainedConfig base class.
        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, pad_token_id=pad_token_id, **kwargs)
        self.vocab_size = vocab_size
        self.max_seq_len = max_seq_len
        self.d_model = d_model
        self.n_layers = n_layers
        self.n_heads = n_heads
        self.d_ff = d_ff
        self.dropout = dropout
        self.tie_embeddings = tie_embeddings
        self.init_std = init_std
        self.rms_norm_eps = rms_norm_eps
final_model/generation_config.json ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token_id": 2,
3
+ "do_sample": true,
4
+ "eos_token_id": 3,
5
+ "max_new_tokens": 220,
6
+ "pad_token_id": 0,
7
+ "temperature": 0.2,
8
+ "top_p": 0.9,
9
+ "transformers_version": "5.4.0"
10
+ }
backup_step4000.tar.gz → final_model/model.safetensors RENAMED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:de71068cc6f34954697f26b4e4a68410cfbb228d9d7947120ce54878b65c158c
3
- size 84549165
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c63f0d3f5cf8fca2fca36c1339b2c07d1c21378ce0753b007e78048607a66764
3
+ size 963088320
final_model/modeling_mindi.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face model class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from __future__ import annotations
6
+
7
+ from dataclasses import dataclass
8
+ from typing import Optional, Tuple
9
+
10
+ import torch
11
+ import torch.nn as nn
12
+ import torch.nn.functional as F
13
+ from transformers import PreTrainedModel
14
+ from transformers.modeling_outputs import CausalLMOutputWithPast
15
+
16
+ from .configuration_mindi import MindiConfig
17
+
18
+
19
@dataclass
class _Cfg:
    """Internal, validated mirror of MindiConfig consumed by the submodules."""

    vocab_size: int
    max_seq_len: int
    d_model: int
    n_layers: int
    n_heads: int
    d_ff: int
    dropout: float
    tie_embeddings: bool
    init_std: float
    rms_norm_eps: float

    @property
    def head_dim(self) -> int:
        # Per-head width; configuration error if heads don't divide d_model.
        if self.d_model % self.n_heads != 0:
            raise ValueError("d_model must be divisible by n_heads")
        return self.d_model // self.n_heads
37
+
38
+
39
class RMSNorm(nn.Module):
    """Root-mean-square layer normalization with a learnable per-channel gain."""

    def __init__(self, dim: int, eps: float = 1e-5) -> None:
        super().__init__()
        self.eps = eps
        self.weight = nn.Parameter(torch.ones(dim))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # Scale by the reciprocal RMS over the last dimension, then apply gain.
        mean_square = x.pow(2).mean(dim=-1, keepdim=True)
        normalized = x * torch.rsqrt(mean_square + self.eps)
        return self.weight * normalized
49
+
50
+
51
class RotaryEmbedding(nn.Module):
    """Precomputed rotary position embedding applied to query/key tensors.

    NOTE(review): this rotates interleaved (even, odd) channel pairs rather
    than the half-split layout some implementations use; checkpoints are only
    compatible with the same convention — confirm before swapping kernels.
    """

    def __init__(self, head_dim: int, max_seq_len: int) -> None:
        super().__init__()
        if head_dim % 2 != 0:
            raise ValueError("head_dim must be even for rotary embeddings")
        # Standard RoPE frequency schedule with base 10000.
        inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
        t = torch.arange(max_seq_len, dtype=torch.float32)
        freqs = torch.outer(t, inv_freq)
        # Non-persistent: the tables are recomputed, never serialized.
        self.register_buffer("cos_cached", torch.cos(freqs), persistent=False)
        self.register_buffer("sin_cached", torch.sin(freqs), persistent=False)

    def forward(self, q: torch.Tensor, k: torch.Tensor, seq_len: int) -> Tuple[torch.Tensor, torch.Tensor]:
        # Slice the cached tables to seq_len and broadcast over (batch, heads).
        cos = self.cos_cached[:seq_len].unsqueeze(0).unsqueeze(0)
        sin = self.sin_cached[:seq_len].unsqueeze(0).unsqueeze(0)
        return self._apply_rotary(q, cos, sin), self._apply_rotary(k, cos, sin)

    @staticmethod
    def _apply_rotary(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
        # Rotate each adjacent (even, odd) channel pair by the cached angles,
        # then re-interleave back to the original channel order.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
        xe = x1 * cos - x2 * sin
        xo = x1 * sin + x2 * cos
        return torch.stack((xe, xo), dim=-1).flatten(-2)
74
+
75
+
76
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with rotary position embeddings.

    Uses the fused scaled_dot_product_attention with is_causal=True, so no
    explicit mask tensor is materialized. No KV cache is kept: every forward
    attends over the full sequence.
    """

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.n_heads = cfg.n_heads
        self.head_dim = cfg.head_dim
        # Matches SDPA's default 1/sqrt(head_dim) but is passed explicitly.
        self.scale = self.head_dim ** -0.5
        self.q_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.k_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.v_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.o_proj = nn.Linear(cfg.d_model, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)
        self.rotary = RotaryEmbedding(self.head_dim, cfg.max_seq_len)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # x: (batch, seq, d_model) -> per-head layout (batch, heads, seq, head_dim).
        bsz, seq_len, _ = x.shape
        q = self.q_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        k = self.k_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        v = self.v_proj(x).view(bsz, seq_len, self.n_heads, self.head_dim).transpose(1, 2)
        # Positions are injected into q/k only; v is left untouched.
        q, k = self.rotary(q, k, seq_len=seq_len)
        out = F.scaled_dot_product_attention(
            q,
            k,
            v,
            attn_mask=None,
            # Attention-weight dropout is active only in training mode.
            dropout_p=self.dropout.p if self.training else 0.0,
            is_causal=True,
            scale=self.scale,
        )
        # Back to (batch, seq, d_model) before the output projection.
        out = out.transpose(1, 2).contiguous().view(bsz, seq_len, -1)
        return self.o_proj(out)
106
+
107
+
108
class FeedForward(nn.Module):
    """Position-wise MLP: linear -> tanh-approximated GELU -> linear -> dropout."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.fc1 = nn.Linear(cfg.d_model, cfg.d_ff, bias=False)
        self.fc2 = nn.Linear(cfg.d_ff, cfg.d_model, bias=False)
        self.dropout = nn.Dropout(cfg.dropout)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        hidden = F.gelu(self.fc1(x), approximate="tanh")
        projected = self.fc2(hidden)
        return self.dropout(projected)
121
+
122
+
123
class TransformerBlock(nn.Module):
    """Pre-norm transformer block: attention then MLP, each with a residual."""

    def __init__(self, cfg: _Cfg) -> None:
        super().__init__()
        self.norm1 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.attn = CausalSelfAttention(cfg)
        self.norm2 = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.ffn = FeedForward(cfg)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        attn_out = self.attn(self.norm1(x))
        x = x + attn_out
        ffn_out = self.ffn(self.norm2(x))
        return x + ffn_out
135
+
136
+
137
class MindiForCausalLM(PreTrainedModel):
    """Decoder-only causal LM wrapping the MINDI transformer stack.

    Notes:
      - attention_mask is accepted but ignored (causal masking only), so
        left-padding will attend over pad positions.
      - No past-key-value cache is implemented; generation recomputes the
        full sequence at every step.
    """

    config_class = MindiConfig
    base_model_prefix = "mindi"
    supports_gradient_checkpointing = False

    def __init__(self, config: MindiConfig):
        super().__init__(config)
        # Mirror the HF config into the internal dataclass used by submodules.
        cfg = _Cfg(
            vocab_size=config.vocab_size,
            max_seq_len=config.max_seq_len,
            d_model=config.d_model,
            n_layers=config.n_layers,
            n_heads=config.n_heads,
            d_ff=config.d_ff,
            dropout=config.dropout,
            tie_embeddings=config.tie_embeddings,
            init_std=config.init_std,
            rms_norm_eps=config.rms_norm_eps,
        )

        self.embed_tokens = nn.Embedding(cfg.vocab_size, cfg.d_model)
        self.dropout = nn.Dropout(cfg.dropout)
        self.blocks = nn.ModuleList([TransformerBlock(cfg) for _ in range(cfg.n_layers)])
        self.norm_final = RMSNorm(cfg.d_model, cfg.rms_norm_eps)
        self.lm_head = nn.Linear(cfg.d_model, cfg.vocab_size, bias=False)

        # Weight tying: output head shares the input embedding matrix.
        if cfg.tie_embeddings:
            self.lm_head.weight = self.embed_tokens.weight

        self.post_init()

    def _init_weights(self, module: nn.Module) -> None:
        # Plain normal init for all linear/embedding weights (std from config).
        if isinstance(module, nn.Linear):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)
        elif isinstance(module, nn.Embedding):
            nn.init.normal_(module.weight, mean=0.0, std=self.config.init_std)

    def get_input_embeddings(self) -> nn.Module:
        return self.embed_tokens

    def set_input_embeddings(self, value: nn.Module) -> None:
        self.embed_tokens = value

    def get_output_embeddings(self) -> nn.Module:
        return self.lm_head

    def set_output_embeddings(self, new_embeddings: nn.Module) -> None:
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.Tensor,
        attention_mask: Optional[torch.Tensor] = None,
        labels: Optional[torch.Tensor] = None,
        **kwargs,
    ) -> CausalLMOutputWithPast:
        # Mask and extra kwargs are intentionally discarded (see class docstring).
        del attention_mask, kwargs

        x = self.embed_tokens(input_ids)
        x = self.dropout(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm_final(x)
        logits = self.lm_head(x)

        loss = None
        if labels is not None:
            # Next-token prediction: shift logits left / labels right by one.
            shift_logits = logits[:, :-1, :].contiguous()
            shift_labels = labels[:, 1:].contiguous()
            loss = F.cross_entropy(
                shift_logits.view(-1, shift_logits.size(-1)),
                shift_labels.view(-1),
                ignore_index=-100,
            )

        return CausalLMOutputWithPast(loss=loss, logits=logits)

    @torch.no_grad()
    def prepare_inputs_for_generation(self, input_ids: torch.Tensor, **kwargs):
        # No cache support: feed the whole sequence back in each step.
        del kwargs
        return {"input_ids": input_ids}
final_model/tokenization_mindi.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Hugging Face tokenizer class for MINDI 1.0 420M.
3
+ """
4
+
5
+ from pathlib import Path
6
+ from transformers import PreTrainedTokenizerFast
7
+
8
+
9
class MindiTokenizer(PreTrainedTokenizerFast):
    """Fast tokenizer wrapper that resolves ``tokenizer.json`` from local paths.

    Exists so ``AutoTokenizer.from_pretrained(..., trust_remote_code=True)``
    can load the custom vocab: when no ``tokenizer_file`` is given, it falls
    back to the ``tokenizer.json`` next to the model (or this module).
    """

    # Tells the HF loader which file holds the serialized fast tokenizer.
    vocab_files_names = {"tokenizer_file": "tokenizer.json"}
    model_input_names = ["input_ids", "attention_mask"]

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs):
        """Load from a hub id or directory, preferring a local tokenizer.json."""
        if kwargs.get("tokenizer_file") is None:
            local_candidate = Path(str(pretrained_model_name_or_path)) / "tokenizer.json"
            if local_candidate.exists():
                kwargs["tokenizer_file"] = str(local_candidate)
        return super().from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)

    def __init__(self, tokenizer_file=None, **kwargs):
        # Resolution order: explicit argument > name_or_path directory >
        # the tokenizer.json sitting next to this module file.
        name_or_path = kwargs.pop("name_or_path", None)
        if tokenizer_file is None and name_or_path is not None:
            candidate = Path(name_or_path) / "tokenizer.json"
            if candidate.exists():
                tokenizer_file = str(candidate)
        if tokenizer_file is None:
            tokenizer_file = str(Path(__file__).resolve().parent / "tokenizer.json")
        # Defaults mirror the special tokens baked into tokenizer.json (ids 0-3).
        kwargs.setdefault("bos_token", "<BOS>")
        kwargs.setdefault("eos_token", "<EOS>")
        kwargs.setdefault("unk_token", "<UNK>")
        kwargs.setdefault("pad_token", "<PAD>")
        super().__init__(tokenizer_file=tokenizer_file, **kwargs)
final_model/tokenizer.json ADDED
@@ -0,0 +1,799 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "version": "1.0",
3
+ "truncation": null,
4
+ "padding": null,
5
+ "added_tokens": [
6
+ {
7
+ "id": 0,
8
+ "content": "<PAD>",
9
+ "single_word": false,
10
+ "lstrip": false,
11
+ "rstrip": false,
12
+ "normalized": false,
13
+ "special": true
14
+ },
15
+ {
16
+ "id": 1,
17
+ "content": "<UNK>",
18
+ "single_word": false,
19
+ "lstrip": false,
20
+ "rstrip": false,
21
+ "normalized": false,
22
+ "special": true
23
+ },
24
+ {
25
+ "id": 2,
26
+ "content": "<BOS>",
27
+ "single_word": false,
28
+ "lstrip": false,
29
+ "rstrip": false,
30
+ "normalized": false,
31
+ "special": true
32
+ },
33
+ {
34
+ "id": 3,
35
+ "content": "<EOS>",
36
+ "single_word": false,
37
+ "lstrip": false,
38
+ "rstrip": false,
39
+ "normalized": false,
40
+ "special": true
41
+ },
42
+ {
43
+ "id": 4,
44
+ "content": "<NL>",
45
+ "single_word": false,
46
+ "lstrip": false,
47
+ "rstrip": false,
48
+ "normalized": false,
49
+ "special": true
50
+ },
51
+ {
52
+ "id": 5,
53
+ "content": "<INDENT>",
54
+ "single_word": false,
55
+ "lstrip": false,
56
+ "rstrip": false,
57
+ "normalized": false,
58
+ "special": true
59
+ },
60
+ {
61
+ "id": 6,
62
+ "content": "<DEDENT>",
63
+ "single_word": false,
64
+ "lstrip": false,
65
+ "rstrip": false,
66
+ "normalized": false,
67
+ "special": true
68
+ },
69
+ {
70
+ "id": 7,
71
+ "content": "<PROMPT>",
72
+ "single_word": false,
73
+ "lstrip": false,
74
+ "rstrip": false,
75
+ "normalized": false,
76
+ "special": true
77
+ },
78
+ {
79
+ "id": 8,
80
+ "content": "<CODE>",
81
+ "single_word": false,
82
+ "lstrip": false,
83
+ "rstrip": false,
84
+ "normalized": false,
85
+ "special": true
86
+ },
87
+ {
88
+ "id": 9,
89
+ "content": "<PYTHON>",
90
+ "single_word": false,
91
+ "lstrip": false,
92
+ "rstrip": false,
93
+ "normalized": false,
94
+ "special": true
95
+ },
96
+ {
97
+ "id": 10,
98
+ "content": "<JAVASCRIPT>",
99
+ "single_word": false,
100
+ "lstrip": false,
101
+ "rstrip": false,
102
+ "normalized": false,
103
+ "special": true
104
+ }
105
+ ],
106
+ "normalizer": {
107
+ "type": "Sequence",
108
+ "normalizers": [
109
+ {
110
+ "type": "NFKC"
111
+ }
112
+ ]
113
+ },
114
+ "pre_tokenizer": {
115
+ "type": "Sequence",
116
+ "pretokenizers": [
117
+ {
118
+ "type": "Split",
119
+ "pattern": {
120
+ "Regex": "(==|!=|<=|>=|:=|->|=>|\\+\\+|--|\\+=|-=|\\*=|/=|//=|%=|\\*\\*|&&|\\|\\||<<|>>)"
121
+ },
122
+ "behavior": "Isolated",
123
+ "invert": false
124
+ },
125
+ {
126
+ "type": "Split",
127
+ "pattern": {
128
+ "Regex": "([()\\[\\]{}.,:;])"
129
+ },
130
+ "behavior": "Isolated",
131
+ "invert": false
132
+ },
133
+ {
134
+ "type": "Metaspace",
135
+ "replacement": "_",
136
+ "prepend_scheme": "always",
137
+ "split": true
138
+ }
139
+ ]
140
+ },
141
+ "post_processor": {
142
+ "type": "TemplateProcessing",
143
+ "single": [
144
+ {
145
+ "SpecialToken": {
146
+ "id": "<BOS>",
147
+ "type_id": 0
148
+ }
149
+ },
150
+ {
151
+ "Sequence": {
152
+ "id": "A",
153
+ "type_id": 0
154
+ }
155
+ },
156
+ {
157
+ "SpecialToken": {
158
+ "id": "<EOS>",
159
+ "type_id": 0
160
+ }
161
+ }
162
+ ],
163
+ "pair": [
164
+ {
165
+ "Sequence": {
166
+ "id": "A",
167
+ "type_id": 0
168
+ }
169
+ },
170
+ {
171
+ "Sequence": {
172
+ "id": "B",
173
+ "type_id": 1
174
+ }
175
+ }
176
+ ],
177
+ "special_tokens": {
178
+ "<BOS>": {
179
+ "id": "<BOS>",
180
+ "ids": [
181
+ 2
182
+ ],
183
+ "tokens": [
184
+ "<BOS>"
185
+ ]
186
+ },
187
+ "<EOS>": {
188
+ "id": "<EOS>",
189
+ "ids": [
190
+ 3
191
+ ],
192
+ "tokens": [
193
+ "<EOS>"
194
+ ]
195
+ }
196
+ }
197
+ },
198
+ "decoder": {
199
+ "type": "BPEDecoder",
200
+ "suffix": "</w>"
201
+ },
202
+ "model": {
203
+ "type": "BPE",
204
+ "dropout": null,
205
+ "unk_token": "<UNK>",
206
+ "continuing_subword_prefix": null,
207
+ "end_of_word_suffix": null,
208
+ "fuse_unk": false,
209
+ "byte_fallback": false,
210
+ "ignore_merges": false,
211
+ "vocab": {
212
+ "<PAD>": 0,
213
+ "<UNK>": 1,
214
+ "<BOS>": 2,
215
+ "<EOS>": 3,
216
+ "<NL>": 4,
217
+ "<INDENT>": 5,
218
+ "<DEDENT>": 6,
219
+ "<PROMPT>": 7,
220
+ "<CODE>": 8,
221
+ "<PYTHON>": 9,
222
+ "<JAVASCRIPT>": 10,
223
+ "(": 11,
224
+ ")": 12,
225
+ "+": 13,
226
+ ",": 14,
227
+ ".": 15,
228
+ "0": 16,
229
+ "4": 17,
230
+ "5": 18,
231
+ ":": 19,
232
+ ";": 20,
233
+ "<": 21,
234
+ "=": 22,
235
+ ">": 23,
236
+ "A": 24,
237
+ "C": 25,
238
+ "D": 26,
239
+ "E": 27,
240
+ "F": 28,
241
+ "H": 29,
242
+ "I": 30,
243
+ "J": 31,
244
+ "L": 32,
245
+ "M": 33,
246
+ "N": 34,
247
+ "O": 35,
248
+ "P": 36,
249
+ "R": 37,
250
+ "S": 38,
251
+ "T": 39,
252
+ "V": 40,
253
+ "W": 41,
254
+ "Y": 42,
255
+ "_": 43,
256
+ "a": 44,
257
+ "b": 45,
258
+ "c": 46,
259
+ "d": 47,
260
+ "e": 48,
261
+ "f": 49,
262
+ "g": 50,
263
+ "h": 51,
264
+ "i": 52,
265
+ "l": 53,
266
+ "m": 54,
267
+ "n": 55,
268
+ "o": 56,
269
+ "p": 57,
270
+ "r": 58,
271
+ "s": 59,
272
+ "t": 60,
273
+ "u": 61,
274
+ "v": 62,
275
+ "w": 63,
276
+ "x": 64,
277
+ "y": 65,
278
+ "{": 66,
279
+ "}": 67,
280
+ "_<": 68,
281
+ "DE": 69,
282
+ "T>": 70,
283
+ "_a": 71,
284
+ "L>": 72,
285
+ "NL>": 73,
286
+ "_<NL>": 74,
287
+ "NT>": 75,
288
+ "_t": 76,
289
+ "DENT>": 77,
290
+ "_i": 78,
291
+ "PT>": 79,
292
+ "_(": 80,
293
+ "_)": 81,
294
+ "on": 82,
295
+ "_<P": 83,
296
+ "_f": 84,
297
+ "_l": 85,
298
+ "re": 86,
299
+ "ri": 87,
300
+ "CO": 88,
301
+ "IN": 89,
302
+ "MPT>": 90,
303
+ "OMPT>": 91,
304
+ "ROMPT>": 92,
305
+ "_;": 93,
306
+ "_b": 94,
307
+ "at": 95,
308
+ "_<DE": 96,
309
+ "_<CO": 97,
310
+ "_<IN": 98,
311
+ "DE>": 99,
312
+ "_to": 100,
313
+ "_<PROMPT>": 101,
314
+ "_lo": 102,
315
+ "_<DEDENT>": 103,
316
+ "_<CODE>": 104,
317
+ "_<INDENT>": 105,
318
+ "_+": 106,
319
+ "_0": 107,
320
+ "_re": 108,
321
+ "ct": 109,
322
+ "dd": 110,
323
+ "ion": 111,
324
+ "nct": 112,
325
+ "rn": 113,
326
+ "tu": 114,
327
+ "unct": 115,
328
+ "va": 116,
329
+ "_add": 117,
330
+ "_th": 118,
331
+ "_funct": 119,
332
+ "_retu": 120,
333
+ "_function": 121,
334
+ "_return": 122,
335
+ "AS": 123,
336
+ "AV": 124,
337
+ "CR": 125,
338
+ "Cre": 126,
339
+ "HO": 127,
340
+ "IPT>": 128,
341
+ "Ja": 129,
342
+ "JAV": 130,
343
+ "N>": 131,
344
+ "Py": 132,
345
+ "Sc": 133,
346
+ "THO": 134,
347
+ "YTHO": 135,
348
+ "_,": 136,
349
+ "_4": 137,
350
+ "_5": 138,
351
+ "_:": 139,
352
+ "_p": 140,
353
+ "_{": 141,
354
+ "_}": 142,
355
+ "_Cre": 143,
356
+ "_Ja": 144,
357
+ "_Py": 145,
358
+ "hon": 146,
359
+ "nt": 147,
360
+ "op": 148,
361
+ "or": 149,
362
+ "pt": 150,
363
+ "thon": 151,
364
+ "_<JAV": 152,
365
+ "_<PYTHO": 153,
366
+ "_for": 154,
367
+ "rint": 155,
368
+ "ript": 156,
369
+ "ate": 157,
370
+ "_log": 158,
371
+ "_loop": 159,
372
+ "vaSc": 160,
373
+ "_that": 161,
374
+ "ASCR": 162,
375
+ "_print": 163,
376
+ "_Create": 164,
377
+ "_JavaSc": 165,
378
+ "_Python": 166,
379
+ "_<JAVASCR": 167,
380
+ "_<PYTHON>": 168,
381
+ "_JavaScript": 169,
382
+ "_<JAVASCRIPT>": 170
383
+ },
384
+ "merges": [
385
+ [
386
+ "_",
387
+ "<"
388
+ ],
389
+ [
390
+ "D",
391
+ "E"
392
+ ],
393
+ [
394
+ "T",
395
+ ">"
396
+ ],
397
+ [
398
+ "_",
399
+ "a"
400
+ ],
401
+ [
402
+ "L",
403
+ ">"
404
+ ],
405
+ [
406
+ "N",
407
+ "L>"
408
+ ],
409
+ [
410
+ "_<",
411
+ "NL>"
412
+ ],
413
+ [
414
+ "N",
415
+ "T>"
416
+ ],
417
+ [
418
+ "_",
419
+ "t"
420
+ ],
421
+ [
422
+ "DE",
423
+ "NT>"
424
+ ],
425
+ [
426
+ "_",
427
+ "i"
428
+ ],
429
+ [
430
+ "P",
431
+ "T>"
432
+ ],
433
+ [
434
+ "_",
435
+ "("
436
+ ],
437
+ [
438
+ "_",
439
+ ")"
440
+ ],
441
+ [
442
+ "o",
443
+ "n"
444
+ ],
445
+ [
446
+ "_<",
447
+ "P"
448
+ ],
449
+ [
450
+ "_",
451
+ "f"
452
+ ],
453
+ [
454
+ "_",
455
+ "l"
456
+ ],
457
+ [
458
+ "r",
459
+ "e"
460
+ ],
461
+ [
462
+ "r",
463
+ "i"
464
+ ],
465
+ [
466
+ "C",
467
+ "O"
468
+ ],
469
+ [
470
+ "I",
471
+ "N"
472
+ ],
473
+ [
474
+ "M",
475
+ "PT>"
476
+ ],
477
+ [
478
+ "O",
479
+ "MPT>"
480
+ ],
481
+ [
482
+ "R",
483
+ "OMPT>"
484
+ ],
485
+ [
486
+ "_",
487
+ ";"
488
+ ],
489
+ [
490
+ "_",
491
+ "b"
492
+ ],
493
+ [
494
+ "a",
495
+ "t"
496
+ ],
497
+ [
498
+ "_<",
499
+ "DE"
500
+ ],
501
+ [
502
+ "_<",
503
+ "CO"
504
+ ],
505
+ [
506
+ "_<",
507
+ "IN"
508
+ ],
509
+ [
510
+ "DE",
511
+ ">"
512
+ ],
513
+ [
514
+ "_t",
515
+ "o"
516
+ ],
517
+ [
518
+ "_<P",
519
+ "ROMPT>"
520
+ ],
521
+ [
522
+ "_l",
523
+ "o"
524
+ ],
525
+ [
526
+ "_<DE",
527
+ "DENT>"
528
+ ],
529
+ [
530
+ "_<CO",
531
+ "DE>"
532
+ ],
533
+ [
534
+ "_<IN",
535
+ "DENT>"
536
+ ],
537
+ [
538
+ "_",
539
+ "+"
540
+ ],
541
+ [
542
+ "_",
543
+ "0"
544
+ ],
545
+ [
546
+ "_",
547
+ "re"
548
+ ],
549
+ [
550
+ "c",
551
+ "t"
552
+ ],
553
+ [
554
+ "d",
555
+ "d"
556
+ ],
557
+ [
558
+ "i",
559
+ "on"
560
+ ],
561
+ [
562
+ "n",
563
+ "ct"
564
+ ],
565
+ [
566
+ "r",
567
+ "n"
568
+ ],
569
+ [
570
+ "t",
571
+ "u"
572
+ ],
573
+ [
574
+ "u",
575
+ "nct"
576
+ ],
577
+ [
578
+ "v",
579
+ "a"
580
+ ],
581
+ [
582
+ "_a",
583
+ "dd"
584
+ ],
585
+ [
586
+ "_t",
587
+ "h"
588
+ ],
589
+ [
590
+ "_f",
591
+ "unct"
592
+ ],
593
+ [
594
+ "_re",
595
+ "tu"
596
+ ],
597
+ [
598
+ "_funct",
599
+ "ion"
600
+ ],
601
+ [
602
+ "_retu",
603
+ "rn"
604
+ ],
605
+ [
606
+ "A",
607
+ "S"
608
+ ],
609
+ [
610
+ "A",
611
+ "V"
612
+ ],
613
+ [
614
+ "C",
615
+ "R"
616
+ ],
617
+ [
618
+ "C",
619
+ "re"
620
+ ],
621
+ [
622
+ "H",
623
+ "O"
624
+ ],
625
+ [
626
+ "I",
627
+ "PT>"
628
+ ],
629
+ [
630
+ "J",
631
+ "a"
632
+ ],
633
+ [
634
+ "J",
635
+ "AV"
636
+ ],
637
+ [
638
+ "N",
639
+ ">"
640
+ ],
641
+ [
642
+ "P",
643
+ "y"
644
+ ],
645
+ [
646
+ "S",
647
+ "c"
648
+ ],
649
+ [
650
+ "T",
651
+ "HO"
652
+ ],
653
+ [
654
+ "Y",
655
+ "THO"
656
+ ],
657
+ [
658
+ "_",
659
+ ","
660
+ ],
661
+ [
662
+ "_",
663
+ "4"
664
+ ],
665
+ [
666
+ "_",
667
+ "5"
668
+ ],
669
+ [
670
+ "_",
671
+ ":"
672
+ ],
673
+ [
674
+ "_",
675
+ "p"
676
+ ],
677
+ [
678
+ "_",
679
+ "{"
680
+ ],
681
+ [
682
+ "_",
683
+ "}"
684
+ ],
685
+ [
686
+ "_",
687
+ "Cre"
688
+ ],
689
+ [
690
+ "_",
691
+ "Ja"
692
+ ],
693
+ [
694
+ "_",
695
+ "Py"
696
+ ],
697
+ [
698
+ "h",
699
+ "on"
700
+ ],
701
+ [
702
+ "n",
703
+ "t"
704
+ ],
705
+ [
706
+ "o",
707
+ "p"
708
+ ],
709
+ [
710
+ "o",
711
+ "r"
712
+ ],
713
+ [
714
+ "p",
715
+ "t"
716
+ ],
717
+ [
718
+ "t",
719
+ "hon"
720
+ ],
721
+ [
722
+ "_<",
723
+ "JAV"
724
+ ],
725
+ [
726
+ "_<P",
727
+ "YTHO"
728
+ ],
729
+ [
730
+ "_f",
731
+ "or"
732
+ ],
733
+ [
734
+ "ri",
735
+ "nt"
736
+ ],
737
+ [
738
+ "ri",
739
+ "pt"
740
+ ],
741
+ [
742
+ "at",
743
+ "e"
744
+ ],
745
+ [
746
+ "_lo",
747
+ "g"
748
+ ],
749
+ [
750
+ "_lo",
751
+ "op"
752
+ ],
753
+ [
754
+ "va",
755
+ "Sc"
756
+ ],
757
+ [
758
+ "_th",
759
+ "at"
760
+ ],
761
+ [
762
+ "AS",
763
+ "CR"
764
+ ],
765
+ [
766
+ "_p",
767
+ "rint"
768
+ ],
769
+ [
770
+ "_Cre",
771
+ "ate"
772
+ ],
773
+ [
774
+ "_Ja",
775
+ "vaSc"
776
+ ],
777
+ [
778
+ "_Py",
779
+ "thon"
780
+ ],
781
+ [
782
+ "_<JAV",
783
+ "ASCR"
784
+ ],
785
+ [
786
+ "_<PYTHO",
787
+ "N>"
788
+ ],
789
+ [
790
+ "_JavaSc",
791
+ "ript"
792
+ ],
793
+ [
794
+ "_<JAVASCR",
795
+ "IPT>"
796
+ ]
797
+ ]
798
+ }
799
+ }
final_model/tokenizer_config.json ADDED
@@ -0,0 +1,191 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "auto_map": {
3
+ "AutoTokenizer": [
4
+ "tokenization_mindi.MindiTokenizer",
5
+ null
6
+ ]
7
+ },
8
+ "backend": "tokenizers",
9
+ "bos_token": "<BOS>",
10
+ "eos_token": "<EOS>",
11
+ "is_local": true,
12
+ "model_max_length": 2048,
13
+ "pad_token": "<PAD>",
14
+ "padding_side": "right",
15
+ "tokenizer_class": "MindiTokenizer",
16
+ "truncation_side": "right",
17
+ "unk_token": "<UNK>",
18
+ "vocab": {
19
+ "(": 11,
20
+ ")": 12,
21
+ "+": 13,
22
+ ",": 14,
23
+ ".": 15,
24
+ "0": 16,
25
+ "4": 17,
26
+ "5": 18,
27
+ ":": 19,
28
+ ";": 20,
29
+ "<": 21,
30
+ "<BOS>": 2,
31
+ "<CODE>": 8,
32
+ "<DEDENT>": 6,
33
+ "<EOS>": 3,
34
+ "<INDENT>": 5,
35
+ "<JAVASCRIPT>": 10,
36
+ "<NL>": 4,
37
+ "<PAD>": 0,
38
+ "<PROMPT>": 7,
39
+ "<PYTHON>": 9,
40
+ "<UNK>": 1,
41
+ "=": 22,
42
+ ">": 23,
43
+ "A": 24,
44
+ "AS": 123,
45
+ "ASCR": 162,
46
+ "AV": 124,
47
+ "C": 25,
48
+ "CO": 88,
49
+ "CR": 125,
50
+ "Cre": 126,
51
+ "D": 26,
52
+ "DE": 69,
53
+ "DE>": 99,
54
+ "DENT>": 77,
55
+ "E": 27,
56
+ "F": 28,
57
+ "H": 29,
58
+ "HO": 127,
59
+ "I": 30,
60
+ "IN": 89,
61
+ "IPT>": 128,
62
+ "J": 31,
63
+ "JAV": 130,
64
+ "Ja": 129,
65
+ "L": 32,
66
+ "L>": 72,
67
+ "M": 33,
68
+ "MPT>": 90,
69
+ "N": 34,
70
+ "N>": 131,
71
+ "NL>": 73,
72
+ "NT>": 75,
73
+ "O": 35,
74
+ "OMPT>": 91,
75
+ "P": 36,
76
+ "PT>": 79,
77
+ "Py": 132,
78
+ "R": 37,
79
+ "ROMPT>": 92,
80
+ "S": 38,
81
+ "Sc": 133,
82
+ "T": 39,
83
+ "T>": 70,
84
+ "THO": 134,
85
+ "V": 40,
86
+ "W": 41,
87
+ "Y": 42,
88
+ "YTHO": 135,
89
+ "_": 43,
90
+ "_(": 80,
91
+ "_)": 81,
92
+ "_+": 106,
93
+ "_,": 136,
94
+ "_0": 107,
95
+ "_4": 137,
96
+ "_5": 138,
97
+ "_:": 139,
98
+ "_;": 93,
99
+ "_<": 68,
100
+ "_<CO": 97,
101
+ "_<CODE>": 104,
102
+ "_<DE": 96,
103
+ "_<DEDENT>": 103,
104
+ "_<IN": 98,
105
+ "_<INDENT>": 105,
106
+ "_<JAV": 152,
107
+ "_<JAVASCR": 167,
108
+ "_<JAVASCRIPT>": 170,
109
+ "_<NL>": 74,
110
+ "_<P": 83,
111
+ "_<PROMPT>": 101,
112
+ "_<PYTHO": 153,
113
+ "_<PYTHON>": 168,
114
+ "_Cre": 143,
115
+ "_Create": 164,
116
+ "_Ja": 144,
117
+ "_JavaSc": 165,
118
+ "_JavaScript": 169,
119
+ "_Py": 145,
120
+ "_Python": 166,
121
+ "_a": 71,
122
+ "_add": 117,
123
+ "_b": 94,
124
+ "_f": 84,
125
+ "_for": 154,
126
+ "_funct": 119,
127
+ "_function": 121,
128
+ "_i": 78,
129
+ "_l": 85,
130
+ "_lo": 102,
131
+ "_log": 158,
132
+ "_loop": 159,
133
+ "_p": 140,
134
+ "_print": 163,
135
+ "_re": 108,
136
+ "_retu": 120,
137
+ "_return": 122,
138
+ "_t": 76,
139
+ "_th": 118,
140
+ "_that": 161,
141
+ "_to": 100,
142
+ "_{": 141,
143
+ "_}": 142,
144
+ "a": 44,
145
+ "at": 95,
146
+ "ate": 157,
147
+ "b": 45,
148
+ "c": 46,
149
+ "ct": 109,
150
+ "d": 47,
151
+ "dd": 110,
152
+ "e": 48,
153
+ "f": 49,
154
+ "g": 50,
155
+ "h": 51,
156
+ "hon": 146,
157
+ "i": 52,
158
+ "ion": 111,
159
+ "l": 53,
160
+ "m": 54,
161
+ "n": 55,
162
+ "nct": 112,
163
+ "nt": 147,
164
+ "o": 56,
165
+ "on": 82,
166
+ "op": 148,
167
+ "or": 149,
168
+ "p": 57,
169
+ "pt": 150,
170
+ "r": 58,
171
+ "re": 86,
172
+ "ri": 87,
173
+ "rint": 155,
174
+ "ript": 156,
175
+ "rn": 113,
176
+ "s": 59,
177
+ "t": 60,
178
+ "thon": 151,
179
+ "tu": 114,
180
+ "u": 61,
181
+ "unct": 115,
182
+ "v": 62,
183
+ "va": 116,
184
+ "vaSc": 160,
185
+ "w": 63,
186
+ "x": 64,
187
+ "y": 65,
188
+ "{": 66,
189
+ "}": 67
190
+ }
191
+ }
logs/data_fetch.log CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1efcdb351ebae00b30ecf9d1e1b2f83188919e4a33052707d1627cfde6a3c731
3
- size 153
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:250881bf6b7747176c7432a40e84fb3dc4eeca6f9a1a75378ee7e3ccdf662fbf
3
+ size 44778
merge.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Merge the trained LoRA adapter into the base model and export ./final_model."""

from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer

# Paths are hard-coded to this workspace's layout.
base_model_path = "hf_release/MINDI-1.0-420M"
lora_path = "output/checkpoints/checkpoint-12000"

print("Loading base model...")
model = AutoModelForCausalLM.from_pretrained(
    base_model_path,
    trust_remote_code=True  # required: the base repo ships custom modeling code
)

print("Loading LoRA...")
model = PeftModel.from_pretrained(model, lora_path)

print("Merging...")
# Folds the adapter deltas into the base weights and drops the PEFT wrappers.
model = model.merge_and_unload()

print("Saving final model...")
# NOTE(review): safe_serialization=False writes pytorch_model.bin, yet the
# repo's final_model/ contains model.safetensors — confirm which is intended.
model.save_pretrained("final_model", safe_serialization=False)

print("Saving tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    base_model_path,
    trust_remote_code=True
)
tokenizer.save_pretrained("final_model")

print("✅ DONE")
requirements.txt CHANGED
@@ -3,3 +3,4 @@ datasets
3
  peft
4
  accelerate
5
  torch
 
 
3
  peft
4
  accelerate
5
  torch
6
+ tqdm
test.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
"""Smoke-test generation from the merged model in ./final_model."""

from transformers import AutoModelForCausalLM, AutoTokenizer

model_path = "final_model"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    trust_remote_code=True  # loads the custom MindiTokenizer class
)

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    trust_remote_code=True
)

# 🔥 FIXES
model = model.float()  # force fp32 in case the checkpoint was saved in half precision
# NOTE(review): patches config fields that generate() expects; falls back to 12
# when n_layer is absent — confirm this matches the model's actual depth.
model.config.num_hidden_layers = getattr(model.config, "n_layer", 12)
model.config.is_encoder_decoder = False

prompt = "Write a Python function for binary search"

inputs = tokenizer(prompt, return_tensors="pt")

print("Generating...")
output = model.generate(
    **inputs,
    max_new_tokens=200,
    temperature=0.7,
    do_sample=True  # sampling: output varies run to run by design
)

print("\n=== OUTPUT ===\n")
print(tokenizer.decode(output[0], skip_special_tokens=True))