slenk committed on
Commit
eeef81e
·
verified ·
1 Parent(s): c33e1b1

Upload folder using huggingface_hub

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .claude/pipeline_state.json +48 -0
  2. .claude/scheduled_tasks.lock +1 -0
  3. .claude/settings.local.json +64 -0
  4. .gitattributes +1 -0
  5. .github/workflows/ci.yml +28 -0
  6. .pre-commit-config.yaml +29 -0
  7. .pytest_cache/.gitignore +2 -0
  8. .pytest_cache/CACHEDIR.TAG +4 -0
  9. .pytest_cache/README.md +8 -0
  10. .pytest_cache/v/cache/lastfailed +1 -0
  11. .pytest_cache/v/cache/nodeids +58 -0
  12. .ruff_cache/.gitignore +2 -0
  13. .ruff_cache/0.14.11/10353388469511876764 +0 -0
  14. .ruff_cache/0.14.11/13241530885439384270 +0 -0
  15. .ruff_cache/0.14.11/15588448742999807618 +0 -0
  16. .ruff_cache/0.14.11/16909963377101005444 +0 -0
  17. .ruff_cache/0.14.11/1739102192644247459 +0 -0
  18. .ruff_cache/0.14.11/2164407392135946080 +0 -0
  19. .ruff_cache/0.14.11/3894487969124666669 +0 -0
  20. .ruff_cache/0.14.11/4150897988697354825 +0 -0
  21. .ruff_cache/0.14.11/4355628838106937123 +0 -0
  22. .ruff_cache/0.14.11/5959195261246591303 +0 -0
  23. .ruff_cache/0.14.11/7470470446486951261 +0 -0
  24. .ruff_cache/0.14.11/7805097287912496176 +0 -0
  25. .ruff_cache/0.15.10/12264003023071563180 +0 -0
  26. .ruff_cache/0.15.10/12707412386835734272 +0 -0
  27. .ruff_cache/0.15.10/14255674844609017079 +0 -0
  28. .ruff_cache/0.15.10/14515759316653052378 +0 -0
  29. .ruff_cache/0.15.10/1490264962946858478 +0 -0
  30. .ruff_cache/0.15.10/15830732279563417379 +0 -0
  31. .ruff_cache/0.15.10/16585459526495690818 +0 -0
  32. .ruff_cache/0.15.10/17140885238503855112 +0 -0
  33. .ruff_cache/0.15.10/17258540645676314702 +0 -0
  34. .ruff_cache/0.15.10/3600153915928311247 +0 -0
  35. .ruff_cache/0.15.10/4777661366283320788 +0 -0
  36. .ruff_cache/0.15.10/8194725524321540937 +0 -0
  37. .ruff_cache/0.15.10/9372251181401831964 +0 -0
  38. .ruff_cache/CACHEDIR.TAG +1 -0
  39. CLAUDE.md +28 -0
  40. Final Project Rubric.odt +3 -0
  41. Final Project Rubric.odt:Zone.Identifier +0 -0
  42. Modelfile.teacher +4 -0
  43. README.md +175 -76
  44. data/eval_report.md +20 -0
  45. data/eval_report_3b_v2.md +20 -0
  46. data/eval_report_3b_v3.md +20 -0
  47. data/eval_report_3b_v4.md +20 -0
  48. data/eval_report_8b_v2.md +20 -0
  49. data/eval_report_8b_v4.md +20 -0
  50. data/eval_report_8b_v5.md +20 -0
.claude/pipeline_state.json ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "current_stage": "evaluation_complete",
3
+ "model_version": "v7",
4
+ "teacher_model": "Qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
5
+ "teacher_server": "vllm @ 192.168.13.21:8081",
6
+ "student_model": "unsloth/Llama-3.1-8B-Instruct",
7
+ "previous_version": {
8
+ "version": "v6",
9
+ "pairs": 171,
10
+ "train_split": 145,
11
+ "eval_split": 26,
12
+ "structural_score": 0.97,
13
+ "perfect_scores": "19/26",
14
+ "adapter_path": "models/codewraith-lora-8b-v6/",
15
+ "hf_repo": "slenk/codewraith-lora-8b",
16
+ "status": "complete"
17
+ },
18
+ "current_run": {
19
+ "output_file": "data/training_pairs_v7.jsonl",
20
+ "pairs_generated": 231,
21
+ "failures": 19,
22
+ "total_source_files": 250,
23
+ "train_split": 197,
24
+ "eval_split": 34,
25
+ "structural_score": 0.97,
26
+ "perfect_scores": "25/34",
27
+ "good_scores": "29/34",
28
+ "training_loss": 0.12,
29
+ "adapter_path": "models/codewraith-lora-8b-v7/",
30
+ "status": "evaluation_complete",
31
+ "notes": "v7 matches v6 structural score (0.97) with 35% more training data. 4 low scores (0.50) likely Python 2 source files. Ready for upload and deployment."
32
+ },
33
+ "running_pids": [],
34
+ "last_progress": "evaluation complete",
35
+ "timestamp": "2026-04-16T00:15:00-04:00",
36
+ "next_steps": [
37
+ "Upload v7 adapter to HF Hub (slenk/codewraith-lora-8b)",
38
+ "Update README with v7 results",
39
+ "Redeploy HF Space with v7 adapter"
40
+ ],
41
+ "known_constraints": {
42
+ "vram": "32GB (RTX 5090)",
43
+ "32b_max_context": 4096,
44
+ "14b_max_context": 16384,
45
+ "generation_command": "uv run --extra ml python3 -c \"from codewraith.teacher.generator import generate_dataset; generate_dataset('data/source_files', 'data/training_pairs_v7.jsonl', model='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', backend='vllm')\"",
46
+ "vllm_command": "uv run --extra ml python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-Coder-14B-Instruct-AWQ --port 8081 --max-model-len 16384 --gpu-memory-utilization 0.90 --host 0.0.0.0"
47
+ }
48
+ }
.claude/scheduled_tasks.lock ADDED
@@ -0,0 +1 @@
 
 
1
+ {"sessionId":"65720353-dd32-4bab-971c-cca9aacd06fe","pid":74275,"acquiredAt":1776308286647}
.claude/settings.local.json ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "permissions": {
3
+ "allow": [
4
+ "Bash(bash:*)",
5
+ "Bash([ -f \"$HOME/.claude/CLAUDE.md\" ])",
6
+ "Bash(node:*)",
7
+ "Bash(chmod 755:*)",
8
+ "Bash(npm view:*)",
9
+ "Bash(npm install:*)",
10
+ "Bash(command -v omc)",
11
+ "Bash(omc --version)",
12
+ "Bash(claude mcp:*)",
13
+ "mcp__plugin_oh-my-claudecode_t__state_write",
14
+ "mcp__plugin_oh-my-claudecode_t__state_clear",
15
+ "Bash(command -v libreoffice)",
16
+ "Bash(libreoffice --headless --convert-to txt \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" --outdir /tmp)",
17
+ "Read(//tmp/**)",
18
+ "Bash(command -v pandoc)",
19
+ "Bash(pandoc \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" -t plain)",
20
+ "Bash(uv sync:*)",
21
+ "Bash(uv run:*)",
22
+ "Bash(git add:*)",
23
+ "Bash(curl -s http://127.0.0.1:11434/api/tags)",
24
+ "Bash(python3 -c \"import sys,json; data=json.load\\(sys.stdin\\); [print\\(m['name']\\) for m in data.get\\('models',[]\\)]\")",
25
+ "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"codewraith-teacher\",\"keep_alive\":0}')",
26
+ "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
27
+ "Bash(command -v chub)",
28
+ "WebSearch",
29
+ "mcp__context7__resolve-library-id",
30
+ "mcp__context7__query-docs",
31
+ "Bash(git commit:*)",
32
+ "Bash(nvidia-smi:*)",
33
+ "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":512}}')",
34
+ "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
35
+ "Bash(pip index:*)",
36
+ "Bash(curl -s https://pypi.org/pypi/gradio/json)",
37
+ "Bash(python3 -c \"import sys,json; print\\('Latest:', json.load\\(sys.stdin\\)['info']['version']\\)\")",
38
+ "Bash(pkill -f \"codewraith.teacher.generator\")",
39
+ "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/tags)",
40
+ "Bash(python3 -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\('Ollama OK:', len\\(d.get\\('models',[]\\)\\), 'models'\\)\")",
41
+ "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"gemma4:26b\",\"keep_alive\":0}')",
42
+ "Bash(du -sh /home/slenk/dev/CodeWraith/*/)",
43
+ "Bash(du -sh /home/slenk/dev/CodeWraith/data/*.json)",
44
+ "Bash(du -sh /home/slenk/dev/CodeWraith/.*)",
45
+ "Bash(git:*)",
46
+ "Bash(du -sh /home/slenk/dev/CodeWraith/data/chromadb/ /home/slenk/dev/CodeWraith/data/*.jsonl)",
47
+ "Bash(hf upload:*)",
48
+ "Bash(hf whoami:*)",
49
+ "Bash(hf auth:*)",
50
+ "Bash(hf spaces:*)",
51
+ "Bash(python:*)",
52
+ "Bash(python3:*)",
53
+ "WebFetch(domain:ollama.com)",
54
+ "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen2.5-coder:32b-instruct-q6_K\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":256}}')",
55
+ "Bash(HF_TOKEN=<REDACTED> hf upload:*)",
56
+ "Bash(curl -s http://127.0.0.1:8081/health)",
57
+ "Bash(curl -s http://192.168.13.21:8081/v1/chat/completions -H 'Content-Type: application/json' -d '{:*)",
58
+ "Bash(curl:*)",
59
+ "Bash(chmod +x .venv/bin/pytest)",
60
+ "Bash(awk '{print $2}')",
61
+ "Bash(ls /home/slenk/dev/CodeWraith/data/source_files/*.py)"
62
+ ]
63
+ }
64
+ }
.gitattributes CHANGED
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
 
 
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
37
+ Final[[:space:]]Project[[:space:]]Rubric.odt filter=lfs diff=lfs merge=lfs -text
.github/workflows/ci.yml ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Test, Format, and Lint
2
+
3
+ on:
4
+ push:
5
+ branches: [ "**" ]
6
+
7
+ jobs:
8
+ build:
9
+ runs-on: ubuntu-latest
10
+
11
+ steps:
12
+ - uses: actions/checkout@v6
13
+
14
+ - name: Setup uv
15
+ uses: astral-sh/setup-uv@v7
16
+
17
+ - name: Install dependencies
18
+ run: |
19
+ uv sync --extra dev
20
+
21
+ - name: Run Format and Lint Checks
22
+ run: |
23
+ uv run ruff check src/ tests/
24
+ uv run ruff format --check src/ tests/
25
+
26
+ - name: Run tests
27
+ run: |
28
+ uv run pytest --cov=src --cov-fail-under=80
.pre-commit-config.yaml ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # See https://pre-commit.com for more information
2
+ # See https://pre-commit.com/hooks.html for more hooks
3
+ repos:
4
+ - repo: https://github.com/pre-commit/pre-commit-hooks
5
+ rev: v6.0.0
6
+ hooks:
7
+ - id: trailing-whitespace
8
+ - id: end-of-file-fixer
9
+ - id: check-yaml
10
+ - id: check-added-large-files
11
+ exclude: '^uv\.lock$'
12
+ - id: check-json
13
+ - id: check-xml
14
+
15
+ - repo: https://github.com/astral-sh/ruff-pre-commit
16
+ rev: v0.14.11
17
+ hooks:
18
+ - id: ruff
19
+ - id: ruff-format
20
+
21
+ - repo: local
22
+ hooks:
23
+ - id: pytest
24
+ name: pytest
25
+ entry: uv run pytest --cov=src --cov-fail-under=80 tests
26
+ language: system
27
+ types: [python]
28
+ pass_filenames: false
29
+ always_run: true
.pytest_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Created by pytest automatically.
2
+ *
.pytest_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
2
+ # This file is a cache directory tag created by pytest.
3
+ # For information about cache directory tags, see:
4
+ # https://bford.info/cachedir/spec.html
.pytest_cache/README.md ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ # pytest cache directory #
2
+
3
+ This directory contains data from pytest's cache plugin,
4
+ which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
5
+
6
+ **Do not** commit this to version control.
7
+
8
+ See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
.pytest_cache/v/cache/lastfailed ADDED
@@ -0,0 +1 @@
 
 
1
+ {}
.pytest_cache/v/cache/nodeids ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ [
2
+ "tests/test_regressions.py::TestFlattenField::test_dict_joins_values",
3
+ "tests/test_regressions.py::TestFlattenField::test_dict_with_none_values_skipped",
4
+ "tests/test_regressions.py::TestFlattenField::test_empty_string_returns_empty",
5
+ "tests/test_regressions.py::TestFlattenField::test_list_of_dicts",
6
+ "tests/test_regressions.py::TestFlattenField::test_list_of_strings",
7
+ "tests/test_regressions.py::TestFlattenField::test_none_returns_empty",
8
+ "tests/test_regressions.py::TestFlattenField::test_string_passthrough",
9
+ "tests/test_regressions.py::TestFunctionSeparators::test_no_separator_before_first_function",
10
+ "tests/test_regressions.py::TestFunctionSeparators::test_separator_between_functions",
11
+ "tests/test_regressions.py::TestFunctionSeparators::test_single_function_no_separator",
12
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_dict",
13
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_list_of_dicts",
14
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_none",
15
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_dict",
16
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_list_of_dicts",
17
+ "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_none",
18
+ "tests/test_regressions.py::TestVLLMMaxTokensConfig::test_query_vllm_uses_4096_max_tokens",
19
+ "tests/test_student/test_evaluate.py::test_argument_coverage",
20
+ "tests/test_student/test_evaluate.py::test_empty_spec",
21
+ "tests/test_student/test_evaluate.py::test_has_structure_plain_text",
22
+ "tests/test_student/test_evaluate.py::test_has_structure_with_markdown",
23
+ "tests/test_student/test_evaluate.py::test_missing_class",
24
+ "tests/test_student/test_evaluate.py::test_missing_function",
25
+ "tests/test_student/test_evaluate.py::test_perfect_score",
26
+ "tests/test_student/test_evaluate.py::test_syntax_error_source",
27
+ "tests/test_student/test_trainer.py::test_lora_config_defaults",
28
+ "tests/test_student/test_trainer.py::test_models_dict",
29
+ "tests/test_student/test_trainer.py::test_push_to_hub_requires_model",
30
+ "tests/test_student/test_trainer.py::test_system_message",
31
+ "tests/test_teacher/test_clean_dataset.py::test_filters_null_output",
32
+ "tests/test_teacher/test_clean_dataset.py::test_filters_too_long",
33
+ "tests/test_teacher/test_clean_dataset.py::test_filters_too_short",
34
+ "tests/test_teacher/test_clean_dataset.py::test_keeps_valid_entries",
35
+ "tests/test_teacher/test_clean_dataset.py::test_rejected_file_has_reasons",
36
+ "tests/test_teacher/test_collect.py::test_count_lines",
37
+ "tests/test_teacher/test_collect.py::test_file_hash_deterministic",
38
+ "tests/test_teacher/test_collect.py::test_file_hash_different",
39
+ "tests/test_teacher/test_collect.py::test_has_functions_or_classes_indented",
40
+ "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_class",
41
+ "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_function",
42
+ "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_neither",
43
+ "tests/test_teacher/test_generator.py::test_load_completed_empty_file",
44
+ "tests/test_teacher/test_generator.py::test_load_completed_nonexistent",
45
+ "tests/test_teacher/test_generator.py::test_load_completed_skips_entries_without_source_file",
46
+ "tests/test_teacher/test_generator.py::test_load_completed_with_entries",
47
+ "tests/test_verifier/test_ast_checker.py::test_extract_class_info",
48
+ "tests/test_verifier/test_ast_checker.py::test_extract_function_signatures",
49
+ "tests/test_verifier/test_ast_checker.py::test_validate_signatures_match",
50
+ "tests/test_verifier/test_ast_checker.py::test_validate_signatures_mismatch",
51
+ "tests/test_verifier/test_judge.py::test_parse_direct_json",
52
+ "tests/test_verifier/test_judge.py::test_parse_empty_string",
53
+ "tests/test_verifier/test_judge.py::test_parse_invalid_json",
54
+ "tests/test_verifier/test_judge.py::test_parse_json_embedded_in_text",
55
+ "tests/test_verifier/test_judge.py::test_parse_json_in_code_fence",
56
+ "tests/test_verifier/test_judge.py::test_parse_json_in_plain_fence",
57
+ "tests/test_verifier/test_judge.py::test_parse_nested_json"
58
+ ]
.ruff_cache/.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ # Automatically created by ruff.
2
+ *
.ruff_cache/0.14.11/10353388469511876764 ADDED
Binary file (55 Bytes). View file
 
.ruff_cache/0.14.11/13241530885439384270 ADDED
Binary file (67 Bytes). View file
 
.ruff_cache/0.14.11/15588448742999807618 ADDED
Binary file (55 Bytes). View file
 
.ruff_cache/0.14.11/16909963377101005444 ADDED
Binary file (67 Bytes). View file
 
.ruff_cache/0.14.11/1739102192644247459 ADDED
Binary file (62 Bytes). View file
 
.ruff_cache/0.14.11/2164407392135946080 ADDED
Binary file (67 Bytes). View file
 
.ruff_cache/0.14.11/3894487969124666669 ADDED
Binary file (74 Bytes). View file
 
.ruff_cache/0.14.11/4150897988697354825 ADDED
Binary file (228 Bytes). View file
 
.ruff_cache/0.14.11/4355628838106937123 ADDED
Binary file (336 Bytes). View file
 
.ruff_cache/0.14.11/5959195261246591303 ADDED
Binary file (545 Bytes). View file
 
.ruff_cache/0.14.11/7470470446486951261 ADDED
Binary file (346 Bytes). View file
 
.ruff_cache/0.14.11/7805097287912496176 ADDED
Binary file (499 Bytes). View file
 
.ruff_cache/0.15.10/12264003023071563180 ADDED
Binary file (119 Bytes). View file
 
.ruff_cache/0.15.10/12707412386835734272 ADDED
Binary file (639 Bytes). View file
 
.ruff_cache/0.15.10/14255674844609017079 ADDED
Binary file (5.45 kB). View file
 
.ruff_cache/0.15.10/14515759316653052378 ADDED
Binary file (703 Bytes). View file
 
.ruff_cache/0.15.10/1490264962946858478 ADDED
Binary file (346 Bytes). View file
 
.ruff_cache/0.15.10/15830732279563417379 ADDED
Binary file (639 Bytes). View file
 
.ruff_cache/0.15.10/16585459526495690818 ADDED
Binary file (3.65 kB). View file
 
.ruff_cache/0.15.10/17140885238503855112 ADDED
Binary file (62 Bytes). View file
 
.ruff_cache/0.15.10/17258540645676314702 ADDED
Binary file (703 Bytes). View file
 
.ruff_cache/0.15.10/3600153915928311247 ADDED
Binary file (74 Bytes). View file
 
.ruff_cache/0.15.10/4777661366283320788 ADDED
Binary file (67 Bytes). View file
 
.ruff_cache/0.15.10/8194725524321540937 ADDED
Binary file (55 Bytes). View file
 
.ruff_cache/0.15.10/9372251181401831964 ADDED
Binary file (590 Bytes). View file
 
.ruff_cache/CACHEDIR.TAG ADDED
@@ -0,0 +1 @@
 
 
1
+ Signature: 8a477f597d28d172789f06886806bc55
CLAUDE.md ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith
2
+
3
+ Module-to-Spec Transformer -- fine-tuned LLM that generates technical specifications from Python source code.
4
+
5
+ ## Pipeline State
6
+
7
+ Read `.claude/pipeline_state.json` at session start to know where the ML pipeline left off. Update it after completing any pipeline stage (generation, cleaning, training, evaluation, upload).
8
+
9
+ ## Process Monitoring
10
+
11
+ - When monitoring long-running processes (vLLM serving, dataset generation, model training, uploads), check status at **5-minute intervals minimum**. Do NOT poll more frequently unless explicitly asked.
12
+ - Before killing any long-running process, **always confirm with the user first**. Never assume a process is stuck without evidence of zero progress over multiple checks.
13
+ - For HuggingFace uploads of large models (>10GB), prefer `hf upload` CLI over Python `push_to_hub()`. The CLI handles resumption better.
14
+
15
+ ## Environment
16
+
17
+ - Python 3.12, managed with `uv`
18
+ - Use `uv sync` / `uv run`, never `uv pip install`
19
+ - Tests: `uv run pytest`
20
+ - Lint: `uv run ruff check`
21
+ - GPU: NVIDIA RTX 5090 (32GB VRAM)
22
+ - Teacher models served via vLLM at 192.168.13.21:8081
23
+
24
+ ## Commits
25
+
26
+ - Use Angular/Conventional Commits format
27
+ - No Co-Authored-By lines
28
+ - Commit at every meaningful milestone
Final Project Rubric.odt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a0adb49313b481e0771de5fca861649823b8c56220a59ded7146d0d1d5283d60
3
+ size 3241527
Final Project Rubric.odt:Zone.Identifier ADDED
Binary file (59 Bytes). View file
 
Modelfile.teacher ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ FROM llama3.1:70b-instruct-q4_K_M
2
+
3
+ PARAMETER num_ctx 4096
4
+ PARAMETER num_gpu 99
README.md CHANGED
@@ -22,8 +22,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
22
  ```
23
  ┌─────────────┐
24
  Python Source ──> │ Teacher │ ──> Training Pairs (code -> spec)
25
- Qwen3 30B │ │
26
- (Ollama) │ │
27
  └─────────────┘ │
28
 
29
  ┌─────────────┐ ┌─────────────┐
@@ -41,8 +41,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
41
 
42
 
43
  ┌─────────────┐
44
- │ Gradio App │
45
- │ HF Spaces │
46
  └─────────────┘
47
  ```
48
 
@@ -50,10 +50,10 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
50
 
51
  | Component | Directory | Purpose |
52
  |-----------|-----------|---------|
53
- | **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using Qwen3 30B via Ollama |
54
  | **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
55
  | **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
56
- | **App** | `src/codewraith/app/` | Gradio web interface deployed on HuggingFace Spaces |
57
 
58
  ## Verification Pipeline
59
 
@@ -61,6 +61,29 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
61
  2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
62
  3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
  ## Quick Start
65
 
66
  ### Prerequisites
@@ -80,13 +103,10 @@ cd CodeWraith
80
  uv venv
81
  uv sync
82
 
83
- # Install ML dependencies (datasets, transformers, dspy)
84
  uv sync --extra ml
85
 
86
- # Install training dependencies (unsloth, peft, trl)
87
- uv sync --extra ml --extra training
88
-
89
- # Install app dependencies (gradio)
90
  uv sync --extra app
91
 
92
  # Install everything
@@ -119,23 +139,29 @@ into `data/source_files/`. Resumable if interrupted.
119
  ### Step 2: Optimize Prompt with DSPy
120
 
121
  Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
122
- Requires Ollama running with `qwen3:30b-a3b`.
123
 
124
  ```bash
125
- # Pull the teacher model
126
- ollama pull qwen3:30b-a3b
127
-
128
  # Run optimization
129
  uv run --extra ml python3 -m codewraith.teacher.optimize
130
  ```
131
 
132
  Saves the optimized generator to `data/optimized_generator.json`.
 
133
 
134
  ### Step 3: Generate Training Data
135
 
136
- Generate specs for all collected source files using the optimized prompt.
137
 
138
  ```bash
 
 
 
 
 
 
 
 
139
  uv run --extra ml python3 -c "
140
  from codewraith.teacher.generator import generate_dataset
141
  generate_dataset('data/source_files', 'data/training_pairs.jsonl')
@@ -158,10 +184,10 @@ Fine-tune with Unsloth + LoRA. Supports both 3B and 8B models.
158
 
159
  ```bash
160
  # Train Llama 3.2 3B (fast, ~3-4 minutes)
161
- uv run --extra ml --extra training python3 -m codewraith.student.trainer 3b
162
 
163
  # Train Llama 3.1 8B (better quality, ~8-10 minutes)
164
- uv run --extra ml --extra training python3 -m codewraith.student.trainer 8b
165
  ```
166
 
167
  Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
@@ -172,10 +198,10 @@ Run evaluation comparing structural accuracy across models.
172
 
173
  ```bash
174
  # Evaluate 3B
175
- uv run --extra ml --extra training python3 -m codewraith.student.evaluate 3b
176
 
177
  # Evaluate 8B
178
- uv run --extra ml --extra training python3 -m codewraith.student.evaluate 8b
179
  ```
180
 
181
  Generates `data/eval_report.md` with comparison metrics.
@@ -183,7 +209,7 @@ Generates `data/eval_report.md` with comparison metrics.
183
  ### Step 7: Run Gradio App
184
 
185
  ```bash
186
- uv run --extra ml --extra training --extra app python3 -m codewraith.app.main
187
  ```
188
 
189
  Auto-detects the best available adapter (prefers 8B over 3B).
@@ -193,73 +219,147 @@ Opens a web UI with code input, sampling parameter controls, and live spec gener
193
 
194
  ```bash
195
  # Push adapter to HuggingFace Hub
196
- uv run --extra ml --extra training python3 -c "
197
  from codewraith.student.trainer import load_base_model, push_to_hub
198
  from peft import PeftModel
199
- model, tokenizer = load_base_model('3b')
200
- model = PeftModel.from_pretrained(model, './models/codewraith-lora-3b')
201
- push_to_hub(model, tokenizer, 'your-username/codewraith-lora-3b')
202
  "
 
 
 
 
 
203
  ```
204
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
  ## Evaluation Results
206
 
207
- Models trained with 8192 context, LoRA r=32, 4 epochs, dropout=0.05.
208
- Training data generated by Gemma 4 26B teacher model with DSPy-optimized prompts.
209
- Evaluated on 28 held-out examples (proper train/eval split, no data leakage).
 
 
210
 
211
- ### Llama 3.1 8B (CodeWraith-8b) -- Deployed Model
212
 
213
  | Metric | Score |
214
  |--------|-------|
215
- | Avg Structural Score | 0.95 |
216
- | Function Coverage | 90% |
217
  | Class Coverage | 100% |
218
- | Argument Coverage | 94% |
219
- | Return Type Coverage | 67% |
220
- | Perfect Scores | 22/28 |
221
- | Good Scores (>=80%) | 25/28 |
222
- | Avg Inference Time | 28s |
223
- | Training Loss | 0.59 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
 
225
- ### Llama 3.2 3B (CodeWraith-3b)
 
 
 
 
 
 
226
 
227
  | Metric | Score |
228
  |--------|-------|
229
- | Avg Structural Score | 0.91 |
230
- | Function Coverage | 86% |
231
- | Class Coverage | 96% |
232
- | Argument Coverage | 93% |
233
- | Return Type Coverage | 67% |
234
- | Perfect Scores | 19/28 |
235
- | Good Scores (>=80%) | 24/28 |
236
- | Avg Inference Time | 26s |
237
- | Training Loss | 0.76 |
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
238
 
239
  ### Analysis
240
 
241
- The 8B model was selected for deployment because:
242
- - Higher overall structural score (0.95 vs 0.91)
243
- - Perfect class coverage (100% vs 96%)
244
- - More perfect scores (22/28 vs 19/28)
245
- - Higher quality training data from Gemma 4 26B teacher enabled the larger model to shine
246
 
247
- Training data was generated using Gemma 4 26B as the teacher model (replacing Qwen3 30B),
248
- producing higher quality specs with better structured Markdown and mermaid diagrams.
249
- DSPy BootstrapFewShot was used to optimize the generation prompt.
 
 
 
 
250
 
251
  ### HuggingFace Models
252
 
253
- - Deployed (8B): https://huggingface.co/slenk/codewraith-lora-8b
254
- - Alternative (3B): https://huggingface.co/slenk/codewraith-lora-3b
 
255
 
256
  ## Environment
257
 
258
- - **Teacher model**: Gemma 4 26B via Ollama at `127.0.0.1:11434`
259
  - **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
260
  - **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
261
- - **Deployment**: Gradio on HuggingFace Spaces
262
- - **Hardware**: NVIDIA RTX 5090 (32GB VRAM)
 
263
 
264
  ## Project Structure
265
 
@@ -272,8 +372,9 @@ CodeWraith/
272
  │ ├── teacher/
273
  │ │ ├── collect.py # HF dataset collection
274
  │ │ ├── optimize.py # DSPy prompt optimization
275
- │ │ ├── generator.py # Training data generation
276
  │ │ └── clean_dataset.py # Dataset filtering
 
277
  │ ├── verifier/
278
  │ │ ├── ast_checker.py # AST structural validation
279
  │ │ └── judge.py # LLM-as-Judge semantic audit
@@ -281,19 +382,17 @@ CodeWraith/
281
  │ │ ├── trainer.py # Unsloth + LoRA fine-tuning
282
  │ │ └── evaluate.py # Model evaluation pipeline
283
  │ └── app/
284
- │ └── main.py # Gradio inference UI
285
- ├── data/ # Training data, eval sets, reports
286
- ├── models/ # Saved LoRA adapters
287
- └── tests/ # Test suite (96% coverage)
 
 
 
 
 
 
 
 
 
288
  ```
289
-
290
- ## Rubric Alignment
291
-
292
- | Rubric Section | Points | Implementation |
293
- |---------------|--------|----------------|
294
- | Model Functionality (training + LoRA + eval) | 20 | `student/trainer.py`, `student/evaluate.py`, 3B vs 8B comparison |
295
- | Innovation & Creativity | 20 | Teacher-student architecture, DSPy prompt optimization, AST verification pipeline |
296
- | Environment Setup (deployment) | 15 | `app/main.py`, Gradio on HF Spaces |
297
- | Inference Pipeline (sampling) | 15 | `app/main.py` with temperature/top_p/max_tokens controls |
298
- | Technical Documentation | 15 | This README, evaluation reports, docstrings |
299
- | Demo & Presentation | 15 | Live Gradio app as interactive demo |
 
22
  ```
23
  ┌─────────────┐
24
  Python Source ──> │ Teacher │ ──> Training Pairs (code -> spec)
25
+ LLM via │ │
26
+ vLLM/Ollama │ │
27
  └─────────────┘ │
28
 
29
  ┌─────────────┐ ┌─────────────┐
 
41
 
42
 
43
  ┌─────────────┐
44
+ │ Gradio App │ <── RAG Retriever
45
+ │ HF Spaces │ (ChromaDB)
46
  └─────────────┘
47
  ```
48
 
 
50
 
51
  | Component | Directory | Purpose |
52
  |-----------|-----------|---------|
53
+ | **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using a large LLM via vLLM (JSON-constrained) or Ollama |
54
  | **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
55
  | **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
56
+ | **App** | `src/codewraith/app/` | Gradio web interface with RAG retrieval, deployed on HuggingFace Spaces |
57
 
58
  ## Verification Pipeline
59
 
 
61
  2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
62
  3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
63
 
64
+ ## Sampling & Inference
65
+
66
+ The inference pipeline uses **nucleus sampling** (top-p) combined with temperature scaling to balance output quality and diversity:
67
+
68
+ | Parameter | Default | Range | Purpose |
69
+ |-----------|---------|-------|---------|
70
+ | **Temperature** | 0.7 | 0.0 - 2.0 | Controls randomness. Lower values (0.1-0.3) produce more deterministic, structured output. Higher values increase diversity but risk incoherence. |
71
+ | **Top-p** | 0.9 | 0.0 - 1.0 | Nucleus sampling threshold. At each step, only tokens whose cumulative probability mass falls within the top-p fraction are considered. 0.9 retains the top 90% probability mass, filtering out low-likelihood tokens. |
72
+ | **Max Tokens** | 2048 | 256 - 8192 | Maximum generation length. Technical specs for typical modules run 500-1500 tokens; larger modules may need 4096+. |
73
+
74
+ **Why nucleus sampling over beam search?** Spec generation benefits from controlled creativity -- mermaid diagrams and natural language descriptions need some variation, while function signatures need precision. Nucleus sampling with moderate temperature (0.7) gives the model freedom in prose while the fine-tuning keeps structured elements accurate. For maximum precision, users can lower temperature to 0.1-0.3.
75
+
76
+ ## Retrieval-Augmented Generation (RAG)
77
+
78
+ At inference time, the app optionally retrieves similar code-spec pairs from a ChromaDB vector index to provide few-shot context:
79
+
80
+ 1. **Indexing**: All training pairs are embedded using `sentence-transformers` and stored in ChromaDB (193 pairs)
81
+ 2. **Retrieval**: When a user submits code, the retriever finds the 3 most similar source files by cosine similarity
82
+ 3. **Augmentation**: Retrieved examples are prepended to the user's input as context, giving the model concrete formatting examples
83
+ 4. **Auto-truncation**: If the retrieved RAG context pushes the input beyond 6000 tokens, the retrieved context is automatically dropped to prevent context overflow
84
+
85
+ RAG improves output consistency, especially for formatting patterns like mermaid diagrams and markdown tables that the model may not reliably produce from fine-tuning alone.
86
+
87
  ## Quick Start
88
 
89
  ### Prerequisites
 
103
  uv venv
104
  uv sync
105
 
106
+ # Install ML dependencies (transformers, unsloth, vllm, etc.)
107
  uv sync --extra ml
108
 
109
+ # Install app dependencies (gradio, chromadb)
 
 
 
110
  uv sync --extra app
111
 
112
  # Install everything
 
139
  ### Step 2: Optimize Prompt with DSPy
140
 
141
  Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
142
+ Requires Ollama running with the configured teacher model.
143
 
144
  ```bash
 
 
 
145
  # Run optimization
146
  uv run --extra ml python3 -m codewraith.teacher.optimize
147
  ```
148
 
149
  Saves the optimized generator to `data/optimized_generator.json`.
150
+ Falls back to raw Ollama generation if DSPy optimization is unavailable or returns null.
151
 
152
  ### Step 3: Generate Training Data
153
 
154
+ Generate specs for all collected source files. Two backends are available:
155
 
156
  ```bash
157
+ # vLLM backend (recommended) -- JSON-constrained output for consistent structure
158
+ # Requires vLLM server running with a code-specialized model
159
+ uv run --extra ml python3 -c "
160
+ from codewraith.teacher.generator import generate_dataset
161
+ generate_dataset('data/source_files', 'data/training_pairs.jsonl', backend='vllm')
162
+ "
163
+
164
+ # Ollama backend -- raw generation, uses DSPy-optimized prompt if available
165
  uv run --extra ml python3 -c "
166
  from codewraith.teacher.generator import generate_dataset
167
  generate_dataset('data/source_files', 'data/training_pairs.jsonl')
 
184
 
185
  ```bash
186
  # Train Llama 3.2 3B (fast, ~3-4 minutes)
187
+ uv run --extra ml python3 -m codewraith.student.trainer 3b
188
 
189
  # Train Llama 3.1 8B (better quality, ~8-10 minutes)
190
+ uv run --extra ml python3 -m codewraith.student.trainer 8b
191
  ```
192
 
193
  Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
 
198
 
199
  ```bash
200
  # Evaluate 3B
201
+ uv run --extra ml python3 -m codewraith.student.evaluate 3b
202
 
203
  # Evaluate 8B
204
+ uv run --extra ml python3 -m codewraith.student.evaluate 8b
205
  ```
206
 
207
  Generates `data/eval_report.md` with comparison metrics.
 
209
  ### Step 7: Run Gradio App
210
 
211
  ```bash
212
+ uv run --extra ml --extra app python3 -m codewraith.app.main
213
  ```
214
 
215
  Auto-detects the best available adapter (prefers 8B over 3B).
 
219
 
220
  ```bash
221
  # Push adapter to HuggingFace Hub
222
+ uv run --extra ml python3 -c "
223
  from codewraith.student.trainer import load_base_model, push_to_hub
224
  from peft import PeftModel
225
+ model, tokenizer = load_base_model('8b')
226
+ model = PeftModel.from_pretrained(model, './models/codewraith-lora-8b')
227
+ push_to_hub(model, tokenizer, 'slenk/codewraith-lora-8b')
228
  "
229
+
230
+ # Upload app to HuggingFace Spaces (the --exclude patterns keep model weights and dev artifacts out of the upload)
231
+ hf upload slenk/codewraith . . --repo-type space \
232
+ --exclude "models/*" --exclude ".venv/*" --exclude "adapter/*" \
233
+ --exclude ".git/*" --exclude "tests/*" --exclude "scripts/*"
234
  ```
235
 
236
+ The Space downloads the LoRA adapter from HF Hub at startup, so model weights
237
+ are not included in the Space repository. A `.hfignore` file is provided to
238
+ exclude development artifacts from uploads.
239
+
240
+ ## Model Evolution
241
+
242
+ The project iterated through multiple teacher models and training configurations to find the best combination:
243
+
244
+ | Version | Teacher Model | Student | Key Finding |
245
+ |---------|--------------|---------|-------------|
246
+ | v1 | Llama 3.1 70B (Q4) | 3B, 8B | Baseline. Functional specs but inconsistent formatting. |
247
+ | v2 | Llama 3.1 70B (Q4) | 3B, 8B | Improved hyperparameters (r=32, 8192 context, 4 epochs). 8B reached 0.89 structural score. |
248
+ | v3 | Qwen3 30B-A3B (MoE) | 3B, 8B | Better structured output -- tables, type annotations, cleaner markdown. 3B chosen as primary (0.92 structural). |
249
+ | v4 | Gemma 4 26B | 3B, 8B | Higher structural scores (8B: 0.95, 100% class coverage), but wordier prose and weaker return type coverage (67%). 8B selected as deployed model. |
250
+ | v5 | Qwen2.5-Coder 32B (Q6) | 8B | Code-specialized teacher for more precise, structured specifications. |
251
+ | v6 | Qwen2.5-Coder 32B (AWQ) via vLLM | 8B | JSON-constrained generation via vLLM ensures consistent spec structure. 171 pairs, 0.97 structural score. |
252
+ | v7 | Qwen2.5-Coder 14B (AWQ) via vLLM | 8B | Smaller teacher with 16384 context recovers large files. 231 pairs (+35%), 0.97 structural score maintained. |
253
+
254
+ Each iteration preserved previous model adapters for comparison. The teacher model
255
+ has the largest impact on output quality -- a code-specialized teacher (Qwen2.5-Coder)
256
+ produces more precise function signatures and structured formatting than
257
+ general-purpose models.
258
+
259
  ## Evaluation Results
260
 
261
+ ### v7 -- Current (Qwen2.5-Coder 14B AWQ via vLLM)
262
+
263
+ Models trained with 4096 context, LoRA r=16, 3 epochs.
264
+ Training data generated by Qwen2.5-Coder-14B-Instruct-AWQ via vLLM with JSON-constrained output.
265
+ Evaluated on 34 held-out examples (197 train / 34 eval split from 231 total pairs).
266
 
267
+ #### Llama 3.1 8B (CodeWraith-8b-v7)
268
 
269
  | Metric | Score |
270
  |--------|-------|
271
+ | Avg Structural Score | 0.97 |
272
+ | Function Coverage | 97% |
273
  | Class Coverage | 100% |
274
+ | Argument Coverage | 95% |
275
+ | Return Type Coverage | 90% |
276
+ | Perfect Scores | 25/34 |
277
+ | Good Scores (>=80%) | 29/34 |
278
+ | Training Loss | 0.12 |
279
+
280
+ **Key change from v6:** Switched from the 32B teacher (limited to 4096 context on 32GB VRAM)
281
+ to the 14B teacher with 16384 context. This recovered 60 additional training pairs from source
282
+ files that previously exceeded the context window, increasing the dataset by 35%. Structural
283
+ score held steady at 0.97 with the larger eval set. Four low scores (0.50) were traced to Python 2
284
+ syntax in source files, not model output issues.
285
+
286
+ ### v6 -- Previous (Qwen2.5-Coder 32B AWQ via vLLM)
287
+
288
+ Models trained with 8192 context, LoRA r=32, 3 epochs, dropout=0.05.
289
+ Training data generated by Qwen2.5-Coder-32B-Instruct-AWQ via vLLM with JSON-constrained output.
290
+ Evaluated on 26 held-out examples (145 train / 26 eval split from 171 total pairs).
291
+
292
+ #### Llama 3.1 8B (CodeWraith-8b-v6)
293
+
294
+ | Metric | Score |
295
+ |--------|-------|
296
+ | Avg Structural Score | 0.97 |
297
+ | Perfect Scores | 19/26 |
298
+ | Good Scores (>=80%) | 22/26 |
299
 
300
+ ### v5 -- Previous (Qwen2.5-Coder 32B via Ollama)
301
+
302
+ Models trained with 8192 context, LoRA r=32, 4 epochs, dropout=0.05.
303
+ Training data generated by Qwen2.5-Coder 32B (Q6 quantization) via Ollama.
304
+ Evaluated on 37 held-out examples (proper train/eval split, no data leakage).
305
+
306
+ #### Llama 3.1 8B (CodeWraith-8b-v5)
307
 
308
  | Metric | Score |
309
  |--------|-------|
310
+ | Avg Structural Score | 0.99 |
311
+ | Function Coverage | 97% |
312
+ | Class Coverage | 100% |
313
+ | Argument Coverage | 99% |
314
+ | Return Type Coverage | 100% |
315
+ | Perfect Scores | 29/37 |
316
+ | Good Scores (>=80%) | 36/37 |
317
+ | Training Loss | 0.33 |
318
+
319
+ ### v4 -- Previous (Gemma 4 26B Teacher)
320
+
321
+ Evaluated on 28 held-out examples.
322
+
323
+ #### Llama 3.1 8B (CodeWraith-8b-v4)
324
+
325
+ | Metric | v4 | v5 | Change |
326
+ |--------|-----|-----|--------|
327
+ | Structural Score | 0.95 | 0.99 | +0.04 |
328
+ | Function Coverage | 90% | 97% | +7% |
329
+ | Class Coverage | 100% | 100% | -- |
330
+ | Argument Coverage | 94% | 99% | +5% |
331
+ | Return Type Coverage | 67% | 100% | +33% |
332
+ | Perfect Scores | 78% | 78% | -- |
333
+ | Good Scores (>=80%) | 89% | 97% | +8% |
334
+ | Training Loss | 0.59 | 0.33 | -44% |
335
 
336
  ### Analysis
337
 
338
+ The v5 model using a **code-specialized teacher** (Qwen2.5-Coder 32B) dramatically
339
+ improved over the v4 model, which was trained with a general-purpose teacher (Gemma 4 26B):
 
 
 
340
 
341
+ - **Return type coverage recovered from 67% to 100%** -- the v4 regression was caused
342
+ by Gemma producing prose descriptions instead of precise type annotations
343
+ - **Training loss dropped 44%** -- the code-specialized teacher produces more consistent,
344
+ structured output that the student model learns more efficiently
345
+ - **97% good scores** -- only 1 of 37 examples scored below 80%
346
+ - The code-specialized teacher generates more precise function signatures and parameter
347
+ types, which directly translates to higher AST verification scores
348
 
349
  ### HuggingFace Models
350
 
351
+ - Deployed (8B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-8b
352
+ - Merged (8B standalone): https://huggingface.co/slenk/codewraith-merged-8b
353
+ - Alternative (3B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-3b
354
 
355
  ## Environment
356
 
357
+ - **Teacher model**: Configurable via vLLM or Ollama (Ollama at `127.0.0.1:11434`); tested with Llama 3.1 70B, Qwen3 30B, Gemma 4 26B, Qwen2.5-Coder 32B, and Qwen2.5-Coder 14B
358
  - **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
359
  - **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
360
+ - **RAG retrieval**: ChromaDB + sentence-transformers for few-shot context at inference
361
+ - **Deployment**: Gradio on HuggingFace Spaces with ZeroGPU (A10G)
362
+ - **Hardware (local)**: NVIDIA RTX 5090 (32GB VRAM)
363
 
364
  ## Project Structure
365
 
 
372
  │ ├── teacher/
373
  │ │ ├── collect.py # HF dataset collection
374
  │ │ ├── optimize.py # DSPy prompt optimization
375
+ │ │ ├── generator.py # Training data generation (Ollama + vLLM backends)
376
  │ │ └── clean_dataset.py # Dataset filtering
377
+ │ ├── spec_schema.py # Pydantic ModuleSpec schema + markdown renderer
378
  │ ├── verifier/
379
  │ │ ├── ast_checker.py # AST structural validation
380
  │ │ └── judge.py # LLM-as-Judge semantic audit
 
382
  │ │ ├── trainer.py # Unsloth + LoRA fine-tuning
383
  │ │ └── evaluate.py # Model evaluation pipeline
384
  │ └── app/
385
+ │ ├── main.py # Gradio inference UI
386
+ │ └── retriever.py # RAG retrieval from ChromaDB
387
+ ├── app.py # HF Spaces entry point
388
+ ├── data/
389
+ │ ├── chromadb/ # Vector index for RAG retrieval
390
+ │ ├── source_files/ # Collected Python source files
391
+ │ ├── training_pairs*.jsonl # Generated training data (per version)
392
+ │ └── eval_report*.md # Evaluation reports
393
+ ├── models/ # Local LoRA adapters (gitignored, hosted on HF Hub)
394
+
395
+ ├── scripts/
396
+ │ └── retrain.py # Full retrain pipeline
397
+ └── tests/ # Test suite
398
  ```
 
 
 
 
 
 
 
 
 
 
 
data/eval_report.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.95 |
8
+ | Function Coverage | 0.92 |
9
+ | Class Coverage | 0.81 |
10
+ | Argument Coverage | 1.00 |
11
+ | Return Type Coverage | 0.89 |
12
+ | Good Scores (>=80%) | 26 |
13
+ | Avg Inference Time (s) | 20.79 |
14
+
15
+ ## CodeWraith-8b (Llama-3.1-8B-Instruct)
16
+
17
+ - Examples evaluated: 30
18
+ - Valid (parseable): 29
19
+ - Perfect scores: 17
20
+ - Total inference time: 623.6s
data/eval_report_3b_v2.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-3b-v2 (Llama-3.2-3B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.93 |
8
+ | Function Coverage | 0.84 |
9
+ | Class Coverage | 0.97 |
10
+ | Argument Coverage | 0.91 |
11
+ | Return Type Coverage | 0.97 |
12
+ | Good Scores (>=80%) | 25 |
13
+ | Avg Inference Time (s) | 20.01 |
14
+
15
+ ## CodeWraith-3b-v2 (Llama-3.2-3B-Instruct)
16
+
17
+ - Examples evaluated: 31
18
+ - Valid (parseable): 28
19
+ - Perfect scores: 15
20
+ - Total inference time: 620.2s
data/eval_report_3b_v3.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.92 |
8
+ | Function Coverage | 0.83 |
9
+ | Class Coverage | 0.92 |
10
+ | Argument Coverage | 0.93 |
11
+ | Return Type Coverage | 0.84 |
12
+ | Good Scores (>=80%) | 24 |
13
+ | Avg Inference Time (s) | 20.01 |
14
+
15
+ ## CodeWraith-3b (Llama-3.2-3B-Instruct)
16
+
17
+ - Examples evaluated: 31
18
+ - Valid (parseable): 28
19
+ - Perfect scores: 13
20
+ - Total inference time: 620.2s
data/eval_report_3b_v4.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.91 |
8
+ | Function Coverage | 0.86 |
9
+ | Class Coverage | 0.96 |
10
+ | Argument Coverage | 0.93 |
11
+ | Return Type Coverage | 0.67 |
12
+ | Good Scores (>=80%) | 24 |
13
+ | Avg Inference Time (s) | 25.57 |
14
+
15
+ ## CodeWraith-3b (Llama-3.2-3B-Instruct)
16
+
17
+ - Examples evaluated: 28
18
+ - Valid (parseable): 27
19
+ - Perfect scores: 19
20
+ - Total inference time: 715.9s
data/eval_report_8b_v2.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-8b-v2 (Llama-3.1-8B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.92 |
8
+ | Function Coverage | 0.85 |
9
+ | Class Coverage | 0.84 |
10
+ | Argument Coverage | 0.93 |
11
+ | Return Type Coverage | 0.97 |
12
+ | Good Scores (>=80%) | 24 |
13
+ | Avg Inference Time (s) | 21.91 |
14
+
15
+ ## CodeWraith-8b-v2 (Llama-3.1-8B-Instruct)
16
+
17
+ - Examples evaluated: 31
18
+ - Valid (parseable): 28
19
+ - Perfect scores: 15
20
+ - Total inference time: 679.2s
data/eval_report_8b_v4.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.95 |
8
+ | Function Coverage | 0.90 |
9
+ | Class Coverage | 1.00 |
10
+ | Argument Coverage | 0.94 |
11
+ | Return Type Coverage | 0.67 |
12
+ | Good Scores (>=80%) | 25 |
13
+ | Avg Inference Time (s) | 27.57 |
14
+
15
+ ## CodeWraith-8b (Llama-3.1-8B-Instruct)
16
+
17
+ - Examples evaluated: 28
18
+ - Valid (parseable): 27
19
+ - Perfect scores: 22
20
+ - Total inference time: 772.1s
data/eval_report_8b_v5.md ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # CodeWraith Model Evaluation Report
2
+
3
+ ## Summary
4
+
5
+ | Metric | CodeWraith-8b-v5 (Llama-3.1-8B-Instruct) |
6
+ |--------|-----|
7
+ | Avg Structural Score | 0.99 |
8
+ | Function Coverage | 0.97 |
9
+ | Class Coverage | 1.00 |
10
+ | Argument Coverage | 0.99 |
11
+ | Return Type Coverage | 1.00 |
12
+ | Good Scores (>=80%) | 36 |
13
+ | Avg Inference Time (s) | 25.29 |
14
+
15
+ ## CodeWraith-8b-v5 (Llama-3.1-8B-Instruct)
16
+
17
+ - Examples evaluated: 37
18
+ - Valid (parseable): 36
19
+ - Perfect scores: 29
20
+ - Total inference time: 935.9s