Spaces:
Sleeping
Sleeping
Upload folder using huggingface_hub
Browse files — This view is limited to 50 files because it contains too many changes. See raw diff
- .claude/pipeline_state.json +48 -0
- .claude/scheduled_tasks.lock +1 -0
- .claude/settings.local.json +64 -0
- .gitattributes +1 -0
- .github/workflows/ci.yml +28 -0
- .pre-commit-config.yaml +29 -0
- .pytest_cache/.gitignore +2 -0
- .pytest_cache/CACHEDIR.TAG +4 -0
- .pytest_cache/README.md +8 -0
- .pytest_cache/v/cache/lastfailed +1 -0
- .pytest_cache/v/cache/nodeids +58 -0
- .ruff_cache/.gitignore +2 -0
- .ruff_cache/0.14.11/10353388469511876764 +0 -0
- .ruff_cache/0.14.11/13241530885439384270 +0 -0
- .ruff_cache/0.14.11/15588448742999807618 +0 -0
- .ruff_cache/0.14.11/16909963377101005444 +0 -0
- .ruff_cache/0.14.11/1739102192644247459 +0 -0
- .ruff_cache/0.14.11/2164407392135946080 +0 -0
- .ruff_cache/0.14.11/3894487969124666669 +0 -0
- .ruff_cache/0.14.11/4150897988697354825 +0 -0
- .ruff_cache/0.14.11/4355628838106937123 +0 -0
- .ruff_cache/0.14.11/5959195261246591303 +0 -0
- .ruff_cache/0.14.11/7470470446486951261 +0 -0
- .ruff_cache/0.14.11/7805097287912496176 +0 -0
- .ruff_cache/0.15.10/12264003023071563180 +0 -0
- .ruff_cache/0.15.10/12707412386835734272 +0 -0
- .ruff_cache/0.15.10/14255674844609017079 +0 -0
- .ruff_cache/0.15.10/14515759316653052378 +0 -0
- .ruff_cache/0.15.10/1490264962946858478 +0 -0
- .ruff_cache/0.15.10/15830732279563417379 +0 -0
- .ruff_cache/0.15.10/16585459526495690818 +0 -0
- .ruff_cache/0.15.10/17140885238503855112 +0 -0
- .ruff_cache/0.15.10/17258540645676314702 +0 -0
- .ruff_cache/0.15.10/3600153915928311247 +0 -0
- .ruff_cache/0.15.10/4777661366283320788 +0 -0
- .ruff_cache/0.15.10/8194725524321540937 +0 -0
- .ruff_cache/0.15.10/9372251181401831964 +0 -0
- .ruff_cache/CACHEDIR.TAG +1 -0
- CLAUDE.md +28 -0
- Final Project Rubric.odt +3 -0
- Final Project Rubric.odt:Zone.Identifier +0 -0
- Modelfile.teacher +4 -0
- README.md +175 -76
- data/eval_report.md +20 -0
- data/eval_report_3b_v2.md +20 -0
- data/eval_report_3b_v3.md +20 -0
- data/eval_report_3b_v4.md +20 -0
- data/eval_report_8b_v2.md +20 -0
- data/eval_report_8b_v4.md +20 -0
- data/eval_report_8b_v5.md +20 -0
.claude/pipeline_state.json
ADDED
|
@@ -0,0 +1,48 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"current_stage": "evaluation_complete",
|
| 3 |
+
"model_version": "v7",
|
| 4 |
+
"teacher_model": "Qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
|
| 5 |
+
"teacher_server": "vllm @ 192.168.13.21:8081",
|
| 6 |
+
"student_model": "unsloth/Llama-3.1-8B-Instruct",
|
| 7 |
+
"previous_version": {
|
| 8 |
+
"version": "v6",
|
| 9 |
+
"pairs": 171,
|
| 10 |
+
"train_split": 145,
|
| 11 |
+
"eval_split": 26,
|
| 12 |
+
"structural_score": 0.97,
|
| 13 |
+
"perfect_scores": "19/26",
|
| 14 |
+
"adapter_path": "models/codewraith-lora-8b-v6/",
|
| 15 |
+
"hf_repo": "slenk/codewraith-lora-8b",
|
| 16 |
+
"status": "complete"
|
| 17 |
+
},
|
| 18 |
+
"current_run": {
|
| 19 |
+
"output_file": "data/training_pairs_v7.jsonl",
|
| 20 |
+
"pairs_generated": 231,
|
| 21 |
+
"failures": 19,
|
| 22 |
+
"total_source_files": 250,
|
| 23 |
+
"train_split": 197,
|
| 24 |
+
"eval_split": 34,
|
| 25 |
+
"structural_score": 0.97,
|
| 26 |
+
"perfect_scores": "25/34",
|
| 27 |
+
"good_scores": "29/34",
|
| 28 |
+
"training_loss": 0.12,
|
| 29 |
+
"adapter_path": "models/codewraith-lora-8b-v7/",
|
| 30 |
+
"status": "evaluation_complete",
|
| 31 |
+
"notes": "v7 matches v6 structural score (0.97) with 35% more training data. 4 low scores (0.50) likely Python 2 source files. Ready for upload and deployment."
|
| 32 |
+
},
|
| 33 |
+
"running_pids": [],
|
| 34 |
+
"last_progress": "evaluation complete",
|
| 35 |
+
"timestamp": "2026-04-16T00:15:00-04:00",
|
| 36 |
+
"next_steps": [
|
| 37 |
+
"Upload v7 adapter to HF Hub (slenk/codewraith-lora-8b)",
|
| 38 |
+
"Update README with v7 results",
|
| 39 |
+
"Redeploy HF Space with v7 adapter"
|
| 40 |
+
],
|
| 41 |
+
"known_constraints": {
|
| 42 |
+
"vram": "32GB (RTX 5090)",
|
| 43 |
+
"32b_max_context": 4096,
|
| 44 |
+
"14b_max_context": 16384,
|
| 45 |
+
"generation_command": "uv run --extra ml python3 -c \"from codewraith.teacher.generator import generate_dataset; generate_dataset('data/source_files', 'data/training_pairs_v7.jsonl', model='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', backend='vllm')\"",
|
| 46 |
+
"vllm_command": "uv run --extra ml python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-Coder-14B-Instruct-AWQ --port 8081 --max-model-len 16384 --gpu-memory-utilization 0.90 --host 0.0.0.0"
|
| 47 |
+
}
|
| 48 |
+
}
|
.claude/scheduled_tasks.lock
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"sessionId":"65720353-dd32-4bab-971c-cca9aacd06fe","pid":74275,"acquiredAt":1776308286647}
|
.claude/settings.local.json
ADDED
|
@@ -0,0 +1,64 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"permissions": {
|
| 3 |
+
"allow": [
|
| 4 |
+
"Bash(bash:*)",
|
| 5 |
+
"Bash([ -f \"$HOME/.claude/CLAUDE.md\" ])",
|
| 6 |
+
"Bash(node:*)",
|
| 7 |
+
"Bash(chmod 755:*)",
|
| 8 |
+
"Bash(npm view:*)",
|
| 9 |
+
"Bash(npm install:*)",
|
| 10 |
+
"Bash(command -v omc)",
|
| 11 |
+
"Bash(omc --version)",
|
| 12 |
+
"Bash(claude mcp:*)",
|
| 13 |
+
"mcp__plugin_oh-my-claudecode_t__state_write",
|
| 14 |
+
"mcp__plugin_oh-my-claudecode_t__state_clear",
|
| 15 |
+
"Bash(command -v libreoffice)",
|
| 16 |
+
"Bash(libreoffice --headless --convert-to txt \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" --outdir /tmp)",
|
| 17 |
+
"Read(//tmp/**)",
|
| 18 |
+
"Bash(command -v pandoc)",
|
| 19 |
+
"Bash(pandoc \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" -t plain)",
|
| 20 |
+
"Bash(uv sync:*)",
|
| 21 |
+
"Bash(uv run:*)",
|
| 22 |
+
"Bash(git add:*)",
|
| 23 |
+
"Bash(curl -s http://127.0.0.1:11434/api/tags)",
|
| 24 |
+
"Bash(python3 -c \"import sys,json; data=json.load\\(sys.stdin\\); [print\\(m['name']\\) for m in data.get\\('models',[]\\)]\")",
|
| 25 |
+
"Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"codewraith-teacher\",\"keep_alive\":0}')",
|
| 26 |
+
"Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
|
| 27 |
+
"Bash(command -v chub)",
|
| 28 |
+
"WebSearch",
|
| 29 |
+
"mcp__context7__resolve-library-id",
|
| 30 |
+
"mcp__context7__query-docs",
|
| 31 |
+
"Bash(git commit:*)",
|
| 32 |
+
"Bash(nvidia-smi:*)",
|
| 33 |
+
"Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":512}}')",
|
| 34 |
+
"Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
|
| 35 |
+
"Bash(pip index:*)",
|
| 36 |
+
"Bash(curl -s https://pypi.org/pypi/gradio/json)",
|
| 37 |
+
"Bash(python3 -c \"import sys,json; print\\('Latest:', json.load\\(sys.stdin\\)['info']['version']\\)\")",
|
| 38 |
+
"Bash(pkill -f \"codewraith.teacher.generator\")",
|
| 39 |
+
"Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/tags)",
|
| 40 |
+
"Bash(python3 -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\('Ollama OK:', len\\(d.get\\('models',[]\\)\\), 'models'\\)\")",
|
| 41 |
+
"Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"gemma4:26b\",\"keep_alive\":0}')",
|
| 42 |
+
"Bash(du -sh /home/slenk/dev/CodeWraith/*/)",
|
| 43 |
+
"Bash(du -sh /home/slenk/dev/CodeWraith/data/*.json)",
|
| 44 |
+
"Bash(du -sh /home/slenk/dev/CodeWraith/.*)",
|
| 45 |
+
"Bash(git:*)",
|
| 46 |
+
"Bash(du -sh /home/slenk/dev/CodeWraith/data/chromadb/ /home/slenk/dev/CodeWraith/data/*.jsonl)",
|
| 47 |
+
"Bash(hf upload:*)",
|
| 48 |
+
"Bash(hf whoami:*)",
|
| 49 |
+
"Bash(hf auth:*)",
|
| 50 |
+
"Bash(hf spaces:*)",
|
| 51 |
+
"Bash(python:*)",
|
| 52 |
+
"Bash(python3:*)",
|
| 53 |
+
"WebFetch(domain:ollama.com)",
|
| 54 |
+
"Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen2.5-coder:32b-instruct-q6_K\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":256}}')",
|
| 55 |
+
"Bash(HF_TOKEN=<REDACTED_REVOKE_THIS_TOKEN> hf upload:*)",
|
| 56 |
+
"Bash(curl -s http://127.0.0.1:8081/health)",
|
| 57 |
+
"Bash(curl -s http://192.168.13.21:8081/v1/chat/completions -H 'Content-Type: application/json' -d '{:*)",
|
| 58 |
+
"Bash(curl:*)",
|
| 59 |
+
"Bash(chmod +x .venv/bin/pytest)",
|
| 60 |
+
"Bash(awk '{print $2}')",
|
| 61 |
+
"Bash(ls /home/slenk/dev/CodeWraith/data/source_files/*.py)"
|
| 62 |
+
]
|
| 63 |
+
}
|
| 64 |
+
}
|
.gitattributes
CHANGED
|
@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Final[[:space:]]Project[[:space:]]Rubric.odt filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/ci.yml
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Test, Format, and Lint
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
push:
|
| 5 |
+
branches: [ "**" ]
|
| 6 |
+
|
| 7 |
+
jobs:
|
| 8 |
+
build:
|
| 9 |
+
runs-on: ubuntu-latest
|
| 10 |
+
|
| 11 |
+
steps:
|
| 12 |
+
- uses: actions/checkout@v6
|
| 13 |
+
|
| 14 |
+
- name: Setup uv
|
| 15 |
+
uses: astral-sh/setup-uv@v7
|
| 16 |
+
|
| 17 |
+
- name: Install dependencies
|
| 18 |
+
run: |
|
| 19 |
+
uv sync --extra dev
|
| 20 |
+
|
| 21 |
+
- name: Run Format and Lint Checks
|
| 22 |
+
run: |
|
| 23 |
+
uv run ruff check src/ tests/
|
| 24 |
+
uv run ruff format --check src/ tests/
|
| 25 |
+
|
| 26 |
+
- name: Run tests
|
| 27 |
+
run: |
|
| 28 |
+
uv run pytest --cov=src --cov-fail-under=80
|
.pre-commit-config.yaml
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# See https://pre-commit.com for more information
|
| 2 |
+
# See https://pre-commit.com/hooks.html for more hooks
|
| 3 |
+
repos:
|
| 4 |
+
- repo: https://github.com/pre-commit/pre-commit-hooks
|
| 5 |
+
rev: v6.0.0
|
| 6 |
+
hooks:
|
| 7 |
+
- id: trailing-whitespace
|
| 8 |
+
- id: end-of-file-fixer
|
| 9 |
+
- id: check-yaml
|
| 10 |
+
- id: check-added-large-files
|
| 11 |
+
exclude: '^uv\.lock$'
|
| 12 |
+
- id: check-json
|
| 13 |
+
- id: check-xml
|
| 14 |
+
|
| 15 |
+
- repo: https://github.com/astral-sh/ruff-pre-commit
|
| 16 |
+
rev: v0.14.11
|
| 17 |
+
hooks:
|
| 18 |
+
- id: ruff
|
| 19 |
+
- id: ruff-format
|
| 20 |
+
|
| 21 |
+
- repo: local
|
| 22 |
+
hooks:
|
| 23 |
+
- id: pytest
|
| 24 |
+
name: pytest
|
| 25 |
+
entry: uv run pytest --cov=src --cov-fail-under=80 tests
|
| 26 |
+
language: system
|
| 27 |
+
types: [python]
|
| 28 |
+
pass_filenames: false
|
| 29 |
+
always_run: true
|
.pytest_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Created by pytest automatically.
|
| 2 |
+
*
|
.pytest_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
| 2 |
+
# This file is a cache directory tag created by pytest.
|
| 3 |
+
# For information about cache directory tags, see:
|
| 4 |
+
# https://bford.info/cachedir/spec.html
|
.pytest_cache/README.md
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# pytest cache directory #
|
| 2 |
+
|
| 3 |
+
This directory contains data from pytest's cache plugin,
|
| 4 |
+
which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
|
| 5 |
+
|
| 6 |
+
**Do not** commit this to version control.
|
| 7 |
+
|
| 8 |
+
See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.
|
.pytest_cache/v/cache/lastfailed
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{}
|
.pytest_cache/v/cache/nodeids
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[
|
| 2 |
+
"tests/test_regressions.py::TestFlattenField::test_dict_joins_values",
|
| 3 |
+
"tests/test_regressions.py::TestFlattenField::test_dict_with_none_values_skipped",
|
| 4 |
+
"tests/test_regressions.py::TestFlattenField::test_empty_string_returns_empty",
|
| 5 |
+
"tests/test_regressions.py::TestFlattenField::test_list_of_dicts",
|
| 6 |
+
"tests/test_regressions.py::TestFlattenField::test_list_of_strings",
|
| 7 |
+
"tests/test_regressions.py::TestFlattenField::test_none_returns_empty",
|
| 8 |
+
"tests/test_regressions.py::TestFlattenField::test_string_passthrough",
|
| 9 |
+
"tests/test_regressions.py::TestFunctionSeparators::test_no_separator_before_first_function",
|
| 10 |
+
"tests/test_regressions.py::TestFunctionSeparators::test_separator_between_functions",
|
| 11 |
+
"tests/test_regressions.py::TestFunctionSeparators::test_single_function_no_separator",
|
| 12 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_dict",
|
| 13 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_list_of_dicts",
|
| 14 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_none",
|
| 15 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_dict",
|
| 16 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_list_of_dicts",
|
| 17 |
+
"tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_none",
|
| 18 |
+
"tests/test_regressions.py::TestVLLMMaxTokensConfig::test_query_vllm_uses_4096_max_tokens",
|
| 19 |
+
"tests/test_student/test_evaluate.py::test_argument_coverage",
|
| 20 |
+
"tests/test_student/test_evaluate.py::test_empty_spec",
|
| 21 |
+
"tests/test_student/test_evaluate.py::test_has_structure_plain_text",
|
| 22 |
+
"tests/test_student/test_evaluate.py::test_has_structure_with_markdown",
|
| 23 |
+
"tests/test_student/test_evaluate.py::test_missing_class",
|
| 24 |
+
"tests/test_student/test_evaluate.py::test_missing_function",
|
| 25 |
+
"tests/test_student/test_evaluate.py::test_perfect_score",
|
| 26 |
+
"tests/test_student/test_evaluate.py::test_syntax_error_source",
|
| 27 |
+
"tests/test_student/test_trainer.py::test_lora_config_defaults",
|
| 28 |
+
"tests/test_student/test_trainer.py::test_models_dict",
|
| 29 |
+
"tests/test_student/test_trainer.py::test_push_to_hub_requires_model",
|
| 30 |
+
"tests/test_student/test_trainer.py::test_system_message",
|
| 31 |
+
"tests/test_teacher/test_clean_dataset.py::test_filters_null_output",
|
| 32 |
+
"tests/test_teacher/test_clean_dataset.py::test_filters_too_long",
|
| 33 |
+
"tests/test_teacher/test_clean_dataset.py::test_filters_too_short",
|
| 34 |
+
"tests/test_teacher/test_clean_dataset.py::test_keeps_valid_entries",
|
| 35 |
+
"tests/test_teacher/test_clean_dataset.py::test_rejected_file_has_reasons",
|
| 36 |
+
"tests/test_teacher/test_collect.py::test_count_lines",
|
| 37 |
+
"tests/test_teacher/test_collect.py::test_file_hash_deterministic",
|
| 38 |
+
"tests/test_teacher/test_collect.py::test_file_hash_different",
|
| 39 |
+
"tests/test_teacher/test_collect.py::test_has_functions_or_classes_indented",
|
| 40 |
+
"tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_class",
|
| 41 |
+
"tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_function",
|
| 42 |
+
"tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_neither",
|
| 43 |
+
"tests/test_teacher/test_generator.py::test_load_completed_empty_file",
|
| 44 |
+
"tests/test_teacher/test_generator.py::test_load_completed_nonexistent",
|
| 45 |
+
"tests/test_teacher/test_generator.py::test_load_completed_skips_entries_without_source_file",
|
| 46 |
+
"tests/test_teacher/test_generator.py::test_load_completed_with_entries",
|
| 47 |
+
"tests/test_verifier/test_ast_checker.py::test_extract_class_info",
|
| 48 |
+
"tests/test_verifier/test_ast_checker.py::test_extract_function_signatures",
|
| 49 |
+
"tests/test_verifier/test_ast_checker.py::test_validate_signatures_match",
|
| 50 |
+
"tests/test_verifier/test_ast_checker.py::test_validate_signatures_mismatch",
|
| 51 |
+
"tests/test_verifier/test_judge.py::test_parse_direct_json",
|
| 52 |
+
"tests/test_verifier/test_judge.py::test_parse_empty_string",
|
| 53 |
+
"tests/test_verifier/test_judge.py::test_parse_invalid_json",
|
| 54 |
+
"tests/test_verifier/test_judge.py::test_parse_json_embedded_in_text",
|
| 55 |
+
"tests/test_verifier/test_judge.py::test_parse_json_in_code_fence",
|
| 56 |
+
"tests/test_verifier/test_judge.py::test_parse_json_in_plain_fence",
|
| 57 |
+
"tests/test_verifier/test_judge.py::test_parse_nested_json"
|
| 58 |
+
]
|
.ruff_cache/.gitignore
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Automatically created by ruff.
|
| 2 |
+
*
|
.ruff_cache/0.14.11/10353388469511876764
ADDED
|
Binary file (55 Bytes). View file
|
|
|
.ruff_cache/0.14.11/13241530885439384270
ADDED
|
Binary file (67 Bytes). View file
|
|
|
.ruff_cache/0.14.11/15588448742999807618
ADDED
|
Binary file (55 Bytes). View file
|
|
|
.ruff_cache/0.14.11/16909963377101005444
ADDED
|
Binary file (67 Bytes). View file
|
|
|
.ruff_cache/0.14.11/1739102192644247459
ADDED
|
Binary file (62 Bytes). View file
|
|
|
.ruff_cache/0.14.11/2164407392135946080
ADDED
|
Binary file (67 Bytes). View file
|
|
|
.ruff_cache/0.14.11/3894487969124666669
ADDED
|
Binary file (74 Bytes). View file
|
|
|
.ruff_cache/0.14.11/4150897988697354825
ADDED
|
Binary file (228 Bytes). View file
|
|
|
.ruff_cache/0.14.11/4355628838106937123
ADDED
|
Binary file (336 Bytes). View file
|
|
|
.ruff_cache/0.14.11/5959195261246591303
ADDED
|
Binary file (545 Bytes). View file
|
|
|
.ruff_cache/0.14.11/7470470446486951261
ADDED
|
Binary file (346 Bytes). View file
|
|
|
.ruff_cache/0.14.11/7805097287912496176
ADDED
|
Binary file (499 Bytes). View file
|
|
|
.ruff_cache/0.15.10/12264003023071563180
ADDED
|
Binary file (119 Bytes). View file
|
|
|
.ruff_cache/0.15.10/12707412386835734272
ADDED
|
Binary file (639 Bytes). View file
|
|
|
.ruff_cache/0.15.10/14255674844609017079
ADDED
|
Binary file (5.45 kB). View file
|
|
|
.ruff_cache/0.15.10/14515759316653052378
ADDED
|
Binary file (703 Bytes). View file
|
|
|
.ruff_cache/0.15.10/1490264962946858478
ADDED
|
Binary file (346 Bytes). View file
|
|
|
.ruff_cache/0.15.10/15830732279563417379
ADDED
|
Binary file (639 Bytes). View file
|
|
|
.ruff_cache/0.15.10/16585459526495690818
ADDED
|
Binary file (3.65 kB). View file
|
|
|
.ruff_cache/0.15.10/17140885238503855112
ADDED
|
Binary file (62 Bytes). View file
|
|
|
.ruff_cache/0.15.10/17258540645676314702
ADDED
|
Binary file (703 Bytes). View file
|
|
|
.ruff_cache/0.15.10/3600153915928311247
ADDED
|
Binary file (74 Bytes). View file
|
|
|
.ruff_cache/0.15.10/4777661366283320788
ADDED
|
Binary file (67 Bytes). View file
|
|
|
.ruff_cache/0.15.10/8194725524321540937
ADDED
|
Binary file (55 Bytes). View file
|
|
|
.ruff_cache/0.15.10/9372251181401831964
ADDED
|
Binary file (590 Bytes). View file
|
|
|
.ruff_cache/CACHEDIR.TAG
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
Signature: 8a477f597d28d172789f06886806bc55
|
CLAUDE.md
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith
|
| 2 |
+
|
| 3 |
+
Module-to-Spec Transformer -- fine-tuned LLM that generates technical specifications from Python source code.
|
| 4 |
+
|
| 5 |
+
## Pipeline State
|
| 6 |
+
|
| 7 |
+
Read `.claude/pipeline_state.json` at session start to know where the ML pipeline left off. Update it after completing any pipeline stage (generation, cleaning, training, evaluation, upload).
|
| 8 |
+
|
| 9 |
+
## Process Monitoring
|
| 10 |
+
|
| 11 |
+
- When monitoring long-running processes (vLLM serving, dataset generation, model training, uploads), check status at **5-minute intervals minimum**. Do NOT poll more frequently unless explicitly asked.
|
| 12 |
+
- Before killing any long-running process, **always confirm with the user first**. Never assume a process is stuck without evidence of zero progress over multiple checks.
|
| 13 |
+
- For HuggingFace uploads of large models (>10GB), prefer `hf upload` CLI over Python `push_to_hub()`. The CLI handles resumption better.
|
| 14 |
+
|
| 15 |
+
## Environment
|
| 16 |
+
|
| 17 |
+
- Python 3.12, managed with `uv`
|
| 18 |
+
- Use `uv sync` / `uv run`, never `uv pip install`
|
| 19 |
+
- Tests: `uv run pytest`
|
| 20 |
+
- Lint: `uv run ruff check`
|
| 21 |
+
- GPU: NVIDIA RTX 5090 (32GB VRAM)
|
| 22 |
+
- Teacher models served via vLLM at 192.168.13.21:8081
|
| 23 |
+
|
| 24 |
+
## Commits
|
| 25 |
+
|
| 26 |
+
- Use Angular/Conventional Commits format
|
| 27 |
+
- No Co-Authored-By lines
|
| 28 |
+
- Commit at every meaningful milestone
|
Final Project Rubric.odt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a0adb49313b481e0771de5fca861649823b8c56220a59ded7146d0d1d5283d60
|
| 3 |
+
size 3241527
|
Final Project Rubric.odt:Zone.Identifier
ADDED
|
Binary file (59 Bytes). View file
|
|
|
Modelfile.teacher
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM llama3.1:70b-instruct-q4_K_M
|
| 2 |
+
|
| 3 |
+
PARAMETER num_ctx 4096
|
| 4 |
+
PARAMETER num_gpu 99
|
README.md
CHANGED
|
@@ -22,8 +22,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
|
|
| 22 |
```
|
| 23 |
┌─────────────┐
|
| 24 |
Python Source ──> │ Teacher │ ──> Training Pairs (code -> spec)
|
| 25 |
-
│
|
| 26 |
-
│
|
| 27 |
└─────────────┘ │
|
| 28 |
▼
|
| 29 |
┌─────────────┐ ┌─────────────┐
|
|
@@ -41,8 +41,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
|
|
| 41 |
│
|
| 42 |
▼
|
| 43 |
┌─────────────┐
|
| 44 |
-
│ Gradio App │
|
| 45 |
-
│ HF Spaces │
|
| 46 |
└─────────────┘
|
| 47 |
```
|
| 48 |
|
|
@@ -50,10 +50,10 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
|
|
| 50 |
|
| 51 |
| Component | Directory | Purpose |
|
| 52 |
|-----------|-----------|---------|
|
| 53 |
-
| **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using
|
| 54 |
| **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
|
| 55 |
| **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
|
| 56 |
-
| **App** | `src/codewraith/app/` | Gradio web interface deployed on HuggingFace Spaces |
|
| 57 |
|
| 58 |
## Verification Pipeline
|
| 59 |
|
|
@@ -61,6 +61,29 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
|
|
| 61 |
2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
|
| 62 |
3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
## Quick Start
|
| 65 |
|
| 66 |
### Prerequisites
|
|
@@ -80,13 +103,10 @@ cd CodeWraith
|
|
| 80 |
uv venv
|
| 81 |
uv sync
|
| 82 |
|
| 83 |
-
# Install ML dependencies (
|
| 84 |
uv sync --extra ml
|
| 85 |
|
| 86 |
-
# Install
|
| 87 |
-
uv sync --extra ml --extra training
|
| 88 |
-
|
| 89 |
-
# Install app dependencies (gradio)
|
| 90 |
uv sync --extra app
|
| 91 |
|
| 92 |
# Install everything
|
|
@@ -119,23 +139,29 @@ into `data/source_files/`. Resumable if interrupted.
|
|
| 119 |
### Step 2: Optimize Prompt with DSPy
|
| 120 |
|
| 121 |
Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
|
| 122 |
-
Requires Ollama running with
|
| 123 |
|
| 124 |
```bash
|
| 125 |
-
# Pull the teacher model
|
| 126 |
-
ollama pull qwen3:30b-a3b
|
| 127 |
-
|
| 128 |
# Run optimization
|
| 129 |
uv run --extra ml python3 -m codewraith.teacher.optimize
|
| 130 |
```
|
| 131 |
|
| 132 |
Saves the optimized generator to `data/optimized_generator.json`.
|
|
|
|
| 133 |
|
| 134 |
### Step 3: Generate Training Data
|
| 135 |
|
| 136 |
-
Generate specs for all collected source files
|
| 137 |
|
| 138 |
```bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 139 |
uv run --extra ml python3 -c "
|
| 140 |
from codewraith.teacher.generator import generate_dataset
|
| 141 |
generate_dataset('data/source_files', 'data/training_pairs.jsonl')
|
|
@@ -158,10 +184,10 @@ Fine-tune with Unsloth + LoRA. Supports both 3B and 8B models.
|
|
| 158 |
|
| 159 |
```bash
|
| 160 |
# Train Llama 3.2 3B (fast, ~3-4 minutes)
|
| 161 |
-
uv run --extra ml
|
| 162 |
|
| 163 |
# Train Llama 3.1 8B (better quality, ~8-10 minutes)
|
| 164 |
-
uv run --extra ml
|
| 165 |
```
|
| 166 |
|
| 167 |
Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
|
|
@@ -172,10 +198,10 @@ Run evaluation comparing structural accuracy across models.
|
|
| 172 |
|
| 173 |
```bash
|
| 174 |
# Evaluate 3B
|
| 175 |
-
uv run --extra ml
|
| 176 |
|
| 177 |
# Evaluate 8B
|
| 178 |
-
uv run --extra ml
|
| 179 |
```
|
| 180 |
|
| 181 |
Generates `data/eval_report.md` with comparison metrics.
|
|
@@ -183,7 +209,7 @@ Generates `data/eval_report.md` with comparison metrics.
|
|
| 183 |
### Step 7: Run Gradio App
|
| 184 |
|
| 185 |
```bash
|
| 186 |
-
uv run --extra ml --extra
|
| 187 |
```
|
| 188 |
|
| 189 |
Auto-detects the best available adapter (prefers 8B over 3B).
|
|
@@ -193,73 +219,147 @@ Opens a web UI with code input, sampling parameter controls, and live spec gener
|
|
| 193 |
|
| 194 |
```bash
|
| 195 |
# Push adapter to HuggingFace Hub
|
| 196 |
-
uv run --extra ml
|
| 197 |
from codewraith.student.trainer import load_base_model, push_to_hub
|
| 198 |
from peft import PeftModel
|
| 199 |
-
model, tokenizer = load_base_model('
|
| 200 |
-
model = PeftModel.from_pretrained(model, './models/codewraith-lora-
|
| 201 |
-
push_to_hub(model, tokenizer, '
|
| 202 |
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
```
|
| 204 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
## Evaluation Results
|
| 206 |
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
### Llama 3.1 8B (CodeWraith-8b
|
| 212 |
|
| 213 |
| Metric | Score |
|
| 214 |
|--------|-------|
|
| 215 |
-
| Avg Structural Score | 0.
|
| 216 |
-
| Function Coverage |
|
| 217 |
| Class Coverage | 100% |
|
| 218 |
-
| Argument Coverage |
|
| 219 |
-
| Return Type Coverage |
|
| 220 |
-
| Perfect Scores |
|
| 221 |
-
| Good Scores (>=80%) |
|
| 222 |
-
|
|
| 223 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 224 |
|
| 225 |
-
###
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 226 |
|
| 227 |
| Metric | Score |
|
| 228 |
|--------|-------|
|
| 229 |
-
| Avg Structural Score | 0.
|
| 230 |
-
| Function Coverage |
|
| 231 |
-
| Class Coverage |
|
| 232 |
-
| Argument Coverage |
|
| 233 |
-
| Return Type Coverage |
|
| 234 |
-
| Perfect Scores |
|
| 235 |
-
| Good Scores (>=80%) |
|
| 236 |
-
|
|
| 237 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 238 |
|
| 239 |
### Analysis
|
| 240 |
|
| 241 |
-
The
|
| 242 |
-
|
| 243 |
-
- Perfect class coverage (100% vs 96%)
|
| 244 |
-
- More perfect scores (22/28 vs 19/28)
|
| 245 |
-
- Higher quality training data from Gemma 4 26B teacher enabled the larger model to shine
|
| 246 |
|
| 247 |
-
|
| 248 |
-
|
| 249 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 250 |
|
| 251 |
### HuggingFace Models
|
| 252 |
|
| 253 |
-
- Deployed (8B): https://huggingface.co/slenk/codewraith-lora-8b
|
| 254 |
-
-
|
|
|
|
| 255 |
|
| 256 |
## Environment
|
| 257 |
|
| 258 |
-
- **Teacher model**:
|
| 259 |
- **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
|
| 260 |
- **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
|
| 261 |
-
- **
|
| 262 |
-
- **
|
|
|
|
| 263 |
|
| 264 |
## Project Structure
|
| 265 |
|
|
@@ -272,8 +372,9 @@ CodeWraith/
|
|
| 272 |
│ ├── teacher/
|
| 273 |
│ │ ├── collect.py # HF dataset collection
|
| 274 |
│ │ ├── optimize.py # DSPy prompt optimization
|
| 275 |
-
│ │ ├── generator.py # Training data generation
|
| 276 |
│ │ └── clean_dataset.py # Dataset filtering
|
|
|
|
| 277 |
│ ├── verifier/
|
| 278 |
│ │ ├── ast_checker.py # AST structural validation
|
| 279 |
│ │ └── judge.py # LLM-as-Judge semantic audit
|
|
@@ -281,19 +382,17 @@ CodeWraith/
|
|
| 281 |
│ │ ├── trainer.py # Unsloth + LoRA fine-tuning
|
| 282 |
│ │ └── evaluate.py # Model evaluation pipeline
|
| 283 |
│ └── app/
|
| 284 |
-
│
|
| 285 |
-
|
| 286 |
-
├──
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
```
|
| 289 |
-
|
| 290 |
-
## Rubric Alignment
|
| 291 |
-
|
| 292 |
-
| Rubric Section | Points | Implementation |
|
| 293 |
-
|---------------|--------|----------------|
|
| 294 |
-
| Model Functionality (training + LoRA + eval) | 20 | `student/trainer.py`, `student/evaluate.py`, 3B vs 8B comparison |
|
| 295 |
-
| Innovation & Creativity | 20 | Teacher-student architecture, DSPy prompt optimization, AST verification pipeline |
|
| 296 |
-
| Environment Setup (deployment) | 15 | `app/main.py`, Gradio on HF Spaces |
|
| 297 |
-
| Inference Pipeline (sampling) | 15 | `app/main.py` with temperature/top_p/max_tokens controls |
|
| 298 |
-
| Technical Documentation | 15 | This README, evaluation reports, docstrings |
|
| 299 |
-
| Demo & Presentation | 15 | Live Gradio app as interactive demo |
|
|
|
|
| 22 |
```
|
| 23 |
┌─────────────┐
|
| 24 |
Python Source ──> │ Teacher │ ──> Training Pairs (code -> spec)
|
| 25 |
+
│ LLM via │ │
|
| 26 |
+
│ vLLM/Ollama │ │
|
| 27 |
└─────────────┘ │
|
| 28 |
▼
|
| 29 |
┌─────────────┐ ┌─────────────┐
|
|
|
|
| 41 |
│
|
| 42 |
▼
|
| 43 |
┌─────────────┐
|
| 44 |
+
│ Gradio App │ <── RAG Retriever
|
| 45 |
+
│ HF Spaces │ (ChromaDB)
|
| 46 |
└─────────────┘
|
| 47 |
```
|
| 48 |
|
|
|
|
| 50 |
|
| 51 |
| Component | Directory | Purpose |
|
| 52 |
|-----------|-----------|---------|
|
| 53 |
+
| **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using a large LLM via vLLM (JSON-constrained) or Ollama |
|
| 54 |
| **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
|
| 55 |
| **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
|
| 56 |
+
| **App** | `src/codewraith/app/` | Gradio web interface with RAG retrieval, deployed on HuggingFace Spaces |
|
| 57 |
|
| 58 |
## Verification Pipeline
|
| 59 |
|
|
|
|
| 61 |
2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
|
| 62 |
3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
|
| 63 |
|
| 64 |
+
## Sampling & Inference
|
| 65 |
+
|
| 66 |
+
The inference pipeline uses **nucleus sampling** (top-p) combined with temperature scaling to balance output quality and diversity:
|
| 67 |
+
|
| 68 |
+
| Parameter | Default | Range | Purpose |
|
| 69 |
+
|-----------|---------|-------|---------|
|
| 70 |
+
| **Temperature** | 0.7 | 0.0 - 2.0 | Controls randomness. Lower values (0.1-0.3) produce more deterministic, structured output. Higher values increase diversity but risk incoherence. |
|
| 71 |
+
| **Top-p** | 0.9 | 0.0 - 1.0 | Nucleus sampling threshold. At each step, sampling is restricted to the smallest set of highest-probability tokens whose cumulative probability reaches top-p. A value of 0.9 keeps the top 90% of the probability mass and filters out the low-likelihood tail. |
|
| 72 |
+
| **Max Tokens** | 2048 | 256 - 8192 | Maximum generation length. Technical specs for typical modules run 500-1500 tokens; larger modules may need 4096+. |
|
| 73 |
+
|
| 74 |
+
**Why nucleus sampling over beam search?** Spec generation benefits from controlled creativity -- mermaid diagrams and natural language descriptions need some variation, while function signatures need precision. Nucleus sampling with moderate temperature (0.7) gives the model freedom in prose while the fine-tuning keeps structured elements accurate. For maximum precision, users can lower temperature to 0.1-0.3.
|
| 75 |
+
|
| 76 |
+
## Retrieval-Augmented Generation (RAG)
|
| 77 |
+
|
| 78 |
+
At inference time, the app optionally retrieves similar code-spec pairs from a ChromaDB vector index to provide few-shot context:
|
| 79 |
+
|
| 80 |
+
1. **Indexing**: All training pairs are embedded using `sentence-transformers` and stored in ChromaDB (193 pairs)
|
| 81 |
+
2. **Retrieval**: When a user submits code, the retriever finds the 3 most similar source files by cosine similarity
|
| 82 |
+
3. **Augmentation**: Retrieved examples are prepended to the user's input as context, giving the model concrete formatting examples
|
| 83 |
+
4. **Auto-truncation**: If the retrieved RAG context pushes the input beyond 6000 tokens, that context is automatically dropped to prevent context overflow
|
| 84 |
+
|
| 85 |
+
RAG improves output consistency, especially for formatting patterns like mermaid diagrams and markdown tables that the model may not reliably produce from fine-tuning alone.
|
| 86 |
+
|
| 87 |
## Quick Start
|
| 88 |
|
| 89 |
### Prerequisites
|
|
|
|
| 103 |
uv venv
|
| 104 |
uv sync
|
| 105 |
|
| 106 |
+
# Install ML dependencies (transformers, unsloth, vllm, etc.)
|
| 107 |
uv sync --extra ml
|
| 108 |
|
| 109 |
+
# Install app dependencies (gradio, chromadb)
|
|
|
|
|
|
|
|
|
|
| 110 |
uv sync --extra app
|
| 111 |
|
| 112 |
# Install everything
|
|
|
|
| 139 |
### Step 2: Optimize Prompt with DSPy
|
| 140 |
|
| 141 |
Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
|
| 142 |
+
Requires Ollama running with the configured teacher model.
|
| 143 |
|
| 144 |
```bash
|
|
|
|
|
|
|
|
|
|
| 145 |
# Run optimization
|
| 146 |
uv run --extra ml python3 -m codewraith.teacher.optimize
|
| 147 |
```
|
| 148 |
|
| 149 |
Saves the optimized generator to `data/optimized_generator.json`.
|
| 150 |
+
Falls back to raw Ollama generation if DSPy optimization is unavailable or returns null.
|
| 151 |
|
| 152 |
### Step 3: Generate Training Data
|
| 153 |
|
| 154 |
+
Generate specs for all collected source files. Two backends are available:
|
| 155 |
|
| 156 |
```bash
|
| 157 |
+
# vLLM backend (recommended) -- JSON-constrained output for consistent structure
|
| 158 |
+
# Requires vLLM server running with a code-specialized model
|
| 159 |
+
uv run --extra ml python3 -c "
|
| 160 |
+
from codewraith.teacher.generator import generate_dataset
|
| 161 |
+
generate_dataset('data/source_files', 'data/training_pairs.jsonl', backend='vllm')
|
| 162 |
+
"
|
| 163 |
+
|
| 164 |
+
# Ollama backend -- raw generation, uses DSPy-optimized prompt if available
|
| 165 |
uv run --extra ml python3 -c "
|
| 166 |
from codewraith.teacher.generator import generate_dataset
|
| 167 |
generate_dataset('data/source_files', 'data/training_pairs.jsonl')
|
|
|
|
| 184 |
|
| 185 |
```bash
|
| 186 |
# Train Llama 3.2 3B (fast, ~3-4 minutes)
|
| 187 |
+
uv run --extra ml python3 -m codewraith.student.trainer 3b
|
| 188 |
|
| 189 |
# Train Llama 3.1 8B (better quality, ~8-10 minutes)
|
| 190 |
+
uv run --extra ml python3 -m codewraith.student.trainer 8b
|
| 191 |
```
|
| 192 |
|
| 193 |
Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
|
|
|
|
| 198 |
|
| 199 |
```bash
|
| 200 |
# Evaluate 3B
|
| 201 |
+
uv run --extra ml python3 -m codewraith.student.evaluate 3b
|
| 202 |
|
| 203 |
# Evaluate 8B
|
| 204 |
+
uv run --extra ml python3 -m codewraith.student.evaluate 8b
|
| 205 |
```
|
| 206 |
|
| 207 |
Generates `data/eval_report.md` with comparison metrics.
|
|
|
|
| 209 |
### Step 7: Run Gradio App
|
| 210 |
|
| 211 |
```bash
|
| 212 |
+
uv run --extra ml --extra app python3 -m codewraith.app.main
|
| 213 |
```
|
| 214 |
|
| 215 |
Auto-detects the best available adapter (prefers 8B over 3B).
|
|
|
|
| 219 |
|
| 220 |
```bash
|
| 221 |
# Push adapter to HuggingFace Hub
|
| 222 |
+
uv run --extra ml python3 -c "
|
| 223 |
from codewraith.student.trainer import load_base_model, push_to_hub
|
| 224 |
from peft import PeftModel
|
| 225 |
+
model, tokenizer = load_base_model('8b')
|
| 226 |
+
model = PeftModel.from_pretrained(model, './models/codewraith-lora-8b')
|
| 227 |
+
push_to_hub(model, tokenizer, 'slenk/codewraith-lora-8b')
|
| 228 |
"
|
| 229 |
+
|
| 230 |
+
# Upload app to HuggingFace Spaces (uses .hfignore to exclude large files)
|
| 231 |
+
hf upload slenk/codewraith . . --repo-type space \
|
| 232 |
+
--exclude "models/*" --exclude ".venv/*" --exclude "adapter/*" \
|
| 233 |
+
--exclude ".git/*" --exclude "tests/*" --exclude "scripts/*"
|
| 234 |
```
|
| 235 |
|
| 236 |
+
The Space downloads the LoRA adapter from HF Hub at startup, so model weights
|
| 237 |
+
are not included in the Space repository. A `.hfignore` file is provided to
|
| 238 |
+
exclude development artifacts from uploads.
|
| 239 |
+
|
| 240 |
+
## Model Evolution
|
| 241 |
+
|
| 242 |
+
The project iterated through multiple teacher models and training configurations to find the best combination:
|
| 243 |
+
|
| 244 |
+
| Version | Teacher Model | Student | Key Finding |
|
| 245 |
+
|---------|--------------|---------|-------------|
|
| 246 |
+
| v1 | Llama 3.1 70B (Q4) | 3B, 8B | Baseline. Functional specs but inconsistent formatting. |
|
| 247 |
+
| v2 | Llama 3.1 70B (Q4) | 3B, 8B | Improved hyperparameters (r=32, 8192 context, 4 epochs). 8B reached 0.89 structural score. |
|
| 248 |
+
| v3 | Qwen3 30B-A3B (MoE) | 3B, 8B | Better structured output -- tables, type annotations, cleaner markdown. 3B chosen as primary (0.92 structural). |
|
| 249 |
+
| v4 | Gemma 4 26B | 3B, 8B | Higher structural scores (8B: 0.95, 100% class coverage), but wordier prose and weaker return type coverage (67%). 8B selected as deployed model. |
|
| 250 |
+
| v5 | Qwen2.5-Coder 32B (Q6) | 8B | Code-specialized teacher for more precise, structured specifications. |
|
| 251 |
+
| v6 | Qwen2.5-Coder 32B (AWQ) via vLLM | 8B | JSON-constrained generation via vLLM ensures consistent spec structure. 171 pairs, 0.97 structural score. |
|
| 252 |
+
| v7 | Qwen2.5-Coder 14B (AWQ) via vLLM | 8B | Smaller teacher with 16384 context recovers large files. 231 pairs (+35%), 0.97 structural score maintained. |
|
| 253 |
+
|
| 254 |
+
Each iteration preserved previous model adapters for comparison. The teacher model
|
| 255 |
+
has the largest impact on output quality -- a code-specialized teacher (Qwen2.5-Coder)
|
| 256 |
+
is expected to produce more precise function signatures and structured formatting than
|
| 257 |
+
general-purpose models.
|
| 258 |
+
|
| 259 |
## Evaluation Results
|
| 260 |
|
| 261 |
+
### v7 -- Current (Qwen2.5-Coder 14B AWQ via vLLM)
|
| 262 |
+
|
| 263 |
+
Models trained with 4096 context, LoRA r=16, 3 epochs.
|
| 264 |
+
Training data generated by Qwen2.5-Coder-14B-Instruct-AWQ via vLLM with JSON-constrained output.
|
| 265 |
+
Evaluated on 34 held-out examples (197 train / 34 eval split from 231 total pairs).
|
| 266 |
|
| 267 |
+
#### Llama 3.1 8B (CodeWraith-8b-v7)
|
| 268 |
|
| 269 |
| Metric | Score |
|
| 270 |
|--------|-------|
|
| 271 |
+
| Avg Structural Score | 0.97 |
|
| 272 |
+
| Function Coverage | 97% |
|
| 273 |
| Class Coverage | 100% |
|
| 274 |
+
| Argument Coverage | 95% |
|
| 275 |
+
| Return Type Coverage | 90% |
|
| 276 |
+
| Perfect Scores | 25/34 |
|
| 277 |
+
| Good Scores (>=80%) | 29/34 |
|
| 278 |
+
| Training Loss | 0.12 |
|
| 279 |
+
|
| 280 |
+
**Key change from v6:** Switched from the 32B teacher (limited to 4096 context on 32GB VRAM)
|
| 281 |
+
to the 14B teacher with 16384 context. This recovered 60 additional training pairs from source
|
| 282 |
+
files that previously exceeded the context window, increasing the dataset by 35%. Structural
|
| 283 |
+
score held steady at 0.97 with the larger eval set. Four low scores (0.50) were traced to Python 2
|
| 284 |
+
syntax in source files, not model output issues.
|
| 285 |
+
|
| 286 |
+
### v6 -- Previous (Qwen2.5-Coder 32B AWQ via vLLM)
|
| 287 |
+
|
| 288 |
+
Models trained with 8192 context, LoRA r=32, 3 epochs, dropout=0.05.
|
| 289 |
+
Training data generated by Qwen2.5-Coder-32B-Instruct-AWQ via vLLM with JSON-constrained output.
|
| 290 |
+
Evaluated on 26 held-out examples (145 train / 26 eval split from 171 total pairs).
|
| 291 |
+
|
| 292 |
+
#### Llama 3.1 8B (CodeWraith-8b-v6)
|
| 293 |
+
|
| 294 |
+
| Metric | Score |
|
| 295 |
+
|--------|-------|
|
| 296 |
+
| Avg Structural Score | 0.97 |
|
| 297 |
+
| Perfect Scores | 19/26 |
|
| 298 |
+
| Good Scores (>=80%) | 22/26 |
|
| 299 |
|
| 300 |
+
### v5 -- Previous (Qwen2.5-Coder 32B via Ollama)
|
| 301 |
+
|
| 302 |
+
Models trained with 8192 context, LoRA r=32, 4 epochs, dropout=0.05.
|
| 303 |
+
Training data generated by Qwen2.5-Coder 32B (Q6 quantization) via Ollama.
|
| 304 |
+
Evaluated on 37 held-out examples (proper train/eval split, no data leakage).
|
| 305 |
+
|
| 306 |
+
#### Llama 3.1 8B (CodeWraith-8b-v5)
|
| 307 |
|
| 308 |
| Metric | Score |
|
| 309 |
|--------|-------|
|
| 310 |
+
| Avg Structural Score | 0.99 |
|
| 311 |
+
| Function Coverage | 97% |
|
| 312 |
+
| Class Coverage | 100% |
|
| 313 |
+
| Argument Coverage | 99% |
|
| 314 |
+
| Return Type Coverage | 100% |
|
| 315 |
+
| Perfect Scores | 29/37 |
|
| 316 |
+
| Good Scores (>=80%) | 36/37 |
|
| 317 |
+
| Training Loss | 0.33 |
|
| 318 |
+
|
| 319 |
+
### v4 -- Previous (Gemma 4 26B Teacher)
|
| 320 |
+
|
| 321 |
+
Evaluated on 28 held-out examples.
|
| 322 |
+
|
| 323 |
+
#### Llama 3.1 8B (CodeWraith-8b-v4)
|
| 324 |
+
|
| 325 |
+
| Metric | v4 | v5 | Change |
|
| 326 |
+
|--------|-----|-----|--------|
|
| 327 |
+
| Structural Score | 0.95 | 0.99 | +0.04 |
|
| 328 |
+
| Function Coverage | 90% | 97% | +7% |
|
| 329 |
+
| Class Coverage | 100% | 100% | -- |
|
| 330 |
+
| Argument Coverage | 94% | 99% | +5% |
|
| 331 |
+
| Return Type Coverage | 67% | 100% | +33% |
|
| 332 |
+
| Perfect Scores | 78% | 78% | -- |
|
| 333 |
+
| Good Scores (>=80%) | 89% | 97% | +8% |
|
| 334 |
+
| Training Loss | 0.59 | 0.33 | -44% |
|
| 335 |
|
| 336 |
### Analysis
|
| 337 |
|
| 338 |
+
The v5 model using a **code-specialized teacher** (Qwen2.5-Coder 32B) dramatically
|
| 339 |
+
improved over v4's general-purpose teacher (Gemma 4 26B):
|
|
|
|
|
|
|
|
|
|
| 340 |
|
| 341 |
+
- **Return type coverage recovered from 67% to 100%** -- the v4 regression was caused
|
| 342 |
+
by Gemma producing prose descriptions instead of precise type annotations
|
| 343 |
+
- **Training loss dropped 44%** -- the code-specialized teacher produces more consistent,
|
| 344 |
+
structured output that the student model learns more efficiently
|
| 345 |
+
- **97% good scores** -- only 1 of 37 examples scored below 80%
|
| 346 |
+
- The code-specialized teacher generates more precise function signatures and parameter
|
| 347 |
+
types, which directly translates to higher AST verification scores
|
| 348 |
|
| 349 |
### HuggingFace Models
|
| 350 |
|
| 351 |
+
- Deployed (8B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-8b
|
| 352 |
+
- Merged (8B standalone): https://huggingface.co/slenk/codewraith-merged-8b
|
| 353 |
+
- Alternative (3B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-3b
|
| 354 |
|
| 355 |
## Environment
|
| 356 |
|
| 357 |
+
- **Teacher model**: Configurable; served via Ollama at `127.0.0.1:11434` or via a vLLM server (tested with Llama 70B, Qwen3 30B, Gemma 4 26B, Qwen2.5-Coder 32B and 14B)
|
| 358 |
- **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
|
| 359 |
- **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
|
| 360 |
+
- **RAG retrieval**: ChromaDB + sentence-transformers for few-shot context at inference
|
| 361 |
+
- **Deployment**: Gradio on HuggingFace Spaces with ZeroGPU (A10G)
|
| 362 |
+
- **Hardware (local)**: NVIDIA RTX 5090 (32GB VRAM)
|
| 363 |
|
| 364 |
## Project Structure
|
| 365 |
|
|
|
|
| 372 |
│ ├── teacher/
|
| 373 |
│ │ ├── collect.py # HF dataset collection
|
| 374 |
│ │ ├── optimize.py # DSPy prompt optimization
|
| 375 |
+
│ │ ├── generator.py # Training data generation (Ollama + vLLM backends)
|
| 376 |
│ │ └── clean_dataset.py # Dataset filtering
|
| 377 |
+
│ ├── spec_schema.py # Pydantic ModuleSpec schema + markdown renderer
|
| 378 |
│ ├── verifier/
|
| 379 |
│ │ ├── ast_checker.py # AST structural validation
|
| 380 |
│ │ └── judge.py # LLM-as-Judge semantic audit
|
|
|
|
| 382 |
│ │ ├── trainer.py # Unsloth + LoRA fine-tuning
|
| 383 |
│ │ └── evaluate.py # Model evaluation pipeline
|
| 384 |
│ └── app/
|
| 385 |
+
│ ├── main.py # Gradio inference UI
|
| 386 |
+
│ └── retriever.py # RAG retrieval from ChromaDB
|
| 387 |
+
├── app.py # HF Spaces entry point
|
| 388 |
+
├── data/
|
| 389 |
+
│ ├── chromadb/ # Vector index for RAG retrieval
|
| 390 |
+
│ ├── source_files/ # Collected Python source files
|
| 391 |
+
│ ├── training_pairs*.jsonl # Generated training data (per version)
|
| 392 |
+
│ └── eval_report*.md # Evaluation reports
|
| 393 |
+
├── models/ # Local LoRA adapters (gitignored, hosted on HF Hub)
|
| 394 |
+
|
| 395 |
+
├── scripts/
|
| 396 |
+
│ └── retrain.py # Full retrain pipeline
|
| 397 |
+
└── tests/ # Test suite
|
| 398 |
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
data/eval_report.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.95 |
|
| 8 |
+
| Function Coverage | 0.92 |
|
| 9 |
+
| Class Coverage | 0.81 |
|
| 10 |
+
| Argument Coverage | 1.00 |
|
| 11 |
+
| Return Type Coverage | 0.89 |
|
| 12 |
+
| Good Scores (>=80%) | 26 |
|
| 13 |
+
| Avg Inference Time (s) | 20.79 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-8b (Llama-3.1-8B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 30
|
| 18 |
+
- Valid (parseable): 29
|
| 19 |
+
- Perfect scores: 17
|
| 20 |
+
- Total inference time: 623.6s
|
data/eval_report_3b_v2.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-3b-v2 (Llama-3.2-3B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.93 |
|
| 8 |
+
| Function Coverage | 0.84 |
|
| 9 |
+
| Class Coverage | 0.97 |
|
| 10 |
+
| Argument Coverage | 0.91 |
|
| 11 |
+
| Return Type Coverage | 0.97 |
|
| 12 |
+
| Good Scores (>=80%) | 25 |
|
| 13 |
+
| Avg Inference Time (s) | 20.01 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-3b-v2 (Llama-3.2-3B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 31
|
| 18 |
+
- Valid (parseable): 28
|
| 19 |
+
- Perfect scores: 15
|
| 20 |
+
- Total inference time: 620.2s
|
data/eval_report_3b_v3.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.92 |
|
| 8 |
+
| Function Coverage | 0.83 |
|
| 9 |
+
| Class Coverage | 0.92 |
|
| 10 |
+
| Argument Coverage | 0.93 |
|
| 11 |
+
| Return Type Coverage | 0.84 |
|
| 12 |
+
| Good Scores (>=80%) | 24 |
|
| 13 |
+
| Avg Inference Time (s) | 20.01 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-3b (Llama-3.2-3B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 31
|
| 18 |
+
- Valid (parseable): 28
|
| 19 |
+
- Perfect scores: 13
|
| 20 |
+
- Total inference time: 620.2s
|
data/eval_report_3b_v4.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.91 |
|
| 8 |
+
| Function Coverage | 0.86 |
|
| 9 |
+
| Class Coverage | 0.96 |
|
| 10 |
+
| Argument Coverage | 0.93 |
|
| 11 |
+
| Return Type Coverage | 0.67 |
|
| 12 |
+
| Good Scores (>=80%) | 24 |
|
| 13 |
+
| Avg Inference Time (s) | 25.57 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-3b (Llama-3.2-3B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 28
|
| 18 |
+
- Valid (parseable): 27
|
| 19 |
+
- Perfect scores: 19
|
| 20 |
+
- Total inference time: 715.9s
|
data/eval_report_8b_v2.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-8b-v2 (Llama-3.1-8B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.92 |
|
| 8 |
+
| Function Coverage | 0.85 |
|
| 9 |
+
| Class Coverage | 0.84 |
|
| 10 |
+
| Argument Coverage | 0.93 |
|
| 11 |
+
| Return Type Coverage | 0.97 |
|
| 12 |
+
| Good Scores (>=80%) | 24 |
|
| 13 |
+
| Avg Inference Time (s) | 21.91 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-8b-v2 (Llama-3.1-8B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 31
|
| 18 |
+
- Valid (parseable): 28
|
| 19 |
+
- Perfect scores: 15
|
| 20 |
+
- Total inference time: 679.2s
|
data/eval_report_8b_v4.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.95 |
|
| 8 |
+
| Function Coverage | 0.90 |
|
| 9 |
+
| Class Coverage | 1.00 |
|
| 10 |
+
| Argument Coverage | 0.94 |
|
| 11 |
+
| Return Type Coverage | 0.67 |
|
| 12 |
+
| Good Scores (>=80%) | 25 |
|
| 13 |
+
| Avg Inference Time (s) | 27.57 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-8b (Llama-3.1-8B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 28
|
| 18 |
+
- Valid (parseable): 27
|
| 19 |
+
- Perfect scores: 22
|
| 20 |
+
- Total inference time: 772.1s
|
data/eval_report_8b_v5.md
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# CodeWraith Model Evaluation Report
|
| 2 |
+
|
| 3 |
+
## Summary
|
| 4 |
+
|
| 5 |
+
| Metric | CodeWraith-8b-v5 (Llama-3.1-8B-Instruct) |
|
| 6 |
+
|--------|-----|
|
| 7 |
+
| Avg Structural Score | 0.99 |
|
| 8 |
+
| Function Coverage | 0.97 |
|
| 9 |
+
| Class Coverage | 1.00 |
|
| 10 |
+
| Argument Coverage | 0.99 |
|
| 11 |
+
| Return Type Coverage | 1.00 |
|
| 12 |
+
| Good Scores (>=80%) | 36 |
|
| 13 |
+
| Avg Inference Time (s) | 25.29 |
|
| 14 |
+
|
| 15 |
+
## CodeWraith-8b-v5 (Llama-3.1-8B-Instruct)
|
| 16 |
+
|
| 17 |
+
- Examples evaluated: 37
|
| 18 |
+
- Valid (parseable): 36
|
| 19 |
+
- Perfect scores: 29
|
| 20 |
+
- Total inference time: 935.9s
|