Spaces:

slenk
/

codewraith

Sleeping

App Files Community

slenk committed on Apr 16

Commit

eeef81e

verified ·

1 Parent(s): c33e1b1

Upload folder using huggingface_hub

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

.claude/pipeline_state.json +48 -0
.claude/scheduled_tasks.lock +1 -0
.claude/settings.local.json +64 -0
.gitattributes +1 -0
.github/workflows/ci.yml +28 -0
.pre-commit-config.yaml +29 -0
.pytest_cache/.gitignore +2 -0
.pytest_cache/CACHEDIR.TAG +4 -0
.pytest_cache/README.md +8 -0
.pytest_cache/v/cache/lastfailed +1 -0
.pytest_cache/v/cache/nodeids +58 -0
.ruff_cache/.gitignore +2 -0
.ruff_cache/0.14.11/10353388469511876764 +0 -0
.ruff_cache/0.14.11/13241530885439384270 +0 -0
.ruff_cache/0.14.11/15588448742999807618 +0 -0
.ruff_cache/0.14.11/16909963377101005444 +0 -0
.ruff_cache/0.14.11/1739102192644247459 +0 -0
.ruff_cache/0.14.11/2164407392135946080 +0 -0
.ruff_cache/0.14.11/3894487969124666669 +0 -0
.ruff_cache/0.14.11/4150897988697354825 +0 -0
.ruff_cache/0.14.11/4355628838106937123 +0 -0
.ruff_cache/0.14.11/5959195261246591303 +0 -0
.ruff_cache/0.14.11/7470470446486951261 +0 -0
.ruff_cache/0.14.11/7805097287912496176 +0 -0
.ruff_cache/0.15.10/12264003023071563180 +0 -0
.ruff_cache/0.15.10/12707412386835734272 +0 -0
.ruff_cache/0.15.10/14255674844609017079 +0 -0
.ruff_cache/0.15.10/14515759316653052378 +0 -0
.ruff_cache/0.15.10/1490264962946858478 +0 -0
.ruff_cache/0.15.10/15830732279563417379 +0 -0
.ruff_cache/0.15.10/16585459526495690818 +0 -0
.ruff_cache/0.15.10/17140885238503855112 +0 -0
.ruff_cache/0.15.10/17258540645676314702 +0 -0
.ruff_cache/0.15.10/3600153915928311247 +0 -0
.ruff_cache/0.15.10/4777661366283320788 +0 -0
.ruff_cache/0.15.10/8194725524321540937 +0 -0
.ruff_cache/0.15.10/9372251181401831964 +0 -0
.ruff_cache/CACHEDIR.TAG +1 -0
CLAUDE.md +28 -0
Final Project Rubric.odt +3 -0
Final Project Rubric.odt:Zone.Identifier +0 -0
Modelfile.teacher +4 -0
README.md +175 -76
data/eval_report.md +20 -0
data/eval_report_3b_v2.md +20 -0
data/eval_report_3b_v3.md +20 -0
data/eval_report_3b_v4.md +20 -0
data/eval_report_8b_v2.md +20 -0
data/eval_report_8b_v4.md +20 -0
data/eval_report_8b_v5.md +20 -0

.claude/pipeline_state.json ADDED Viewed

	@@ -0,0 +1,48 @@

+{
+  "current_stage": "evaluation_complete",
+  "model_version": "v7",
+  "teacher_model": "Qwen/Qwen2.5-Coder-14B-Instruct-AWQ",
+  "teacher_server": "vllm @ 192.168.13.21:8081",
+  "student_model": "unsloth/Llama-3.1-8B-Instruct",
+  "previous_version": {
+    "version": "v6",
+    "pairs": 171,
+    "train_split": 145,
+    "eval_split": 26,
+    "structural_score": 0.97,
+    "perfect_scores": "19/26",
+    "adapter_path": "models/codewraith-lora-8b-v6/",
+    "hf_repo": "slenk/codewraith-lora-8b",
+    "status": "complete"
+  },
+  "current_run": {
+    "output_file": "data/training_pairs_v7.jsonl",
+    "pairs_generated": 231,
+    "failures": 19,
+    "total_source_files": 250,
+    "train_split": 197,
+    "eval_split": 34,
+    "structural_score": 0.97,
+    "perfect_scores": "25/34",
+    "good_scores": "29/34",
+    "training_loss": 0.12,
+    "adapter_path": "models/codewraith-lora-8b-v7/",
+    "status": "evaluation_complete",
+    "notes": "v7 matches the v6 structural score (0.97) with 35% more training data. The 4 low scores (0.50) are likely from Python 2 source files. Ready for upload and deployment."
+  },
+  "running_pids": [],
+  "last_progress": "evaluation complete",
+  "timestamp": "2026-04-16T00:15:00-04:00",
+  "next_steps": [
+    "Upload v7 adapter to HF Hub (slenk/codewraith-lora-8b)",
+    "Update README with v7 results",
+    "Redeploy HF Space with v7 adapter"
+  ],
+  "known_constraints": {
+    "vram": "32GB (RTX 5090)",
+    "32b_max_context": 4096,
+    "14b_max_context": 16384,
+    "generation_command": "uv run --extra ml python3 -c \"from codewraith.teacher.generator import generate_dataset; generate_dataset('data/source_files', 'data/training_pairs_v7.jsonl', model='Qwen/Qwen2.5-Coder-14B-Instruct-AWQ', backend='vllm')\"",
+    "vllm_command": "uv run --extra ml python -m vllm.entrypoints.openai.api_server --model Qwen/Qwen2.5-Coder-14B-Instruct-AWQ --port 8081 --max-model-len 16384 --gpu-memory-utilization 0.90 --host 0.0.0.0"
+  }
+}

.claude/scheduled_tasks.lock ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"sessionId":"65720353-dd32-4bab-971c-cca9aacd06fe","pid":74275,"acquiredAt":1776308286647}

.claude/settings.local.json ADDED Viewed

	@@ -0,0 +1,64 @@

+{
+  "permissions": {
+    "allow": [
+      "Bash(bash:*)",
+      "Bash([ -f \"$HOME/.claude/CLAUDE.md\" ])",
+      "Bash(node:*)",
+      "Bash(chmod 755:*)",
+      "Bash(npm view:*)",
+      "Bash(npm install:*)",
+      "Bash(command -v omc)",
+      "Bash(omc --version)",
+      "Bash(claude mcp:*)",
+      "mcp__plugin_oh-my-claudecode_t__state_write",
+      "mcp__plugin_oh-my-claudecode_t__state_clear",
+      "Bash(command -v libreoffice)",
+      "Bash(libreoffice --headless --convert-to txt \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" --outdir /tmp)",
+      "Read(//tmp/**)",
+      "Bash(command -v pandoc)",
+      "Bash(pandoc \"/mnt/c/Users/derek/Downloads/Final Project Rubric.odt\" -t plain)",
+      "Bash(uv sync:*)",
+      "Bash(uv run:*)",
+      "Bash(git add:*)",
+      "Bash(curl -s http://127.0.0.1:11434/api/tags)",
+      "Bash(python3 -c \"import sys,json; data=json.load\\(sys.stdin\\); [print\\(m['name']\\) for m in data.get\\('models',[]\\)]\")",
+      "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"codewraith-teacher\",\"keep_alive\":0}')",
+      "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
+      "Bash(command -v chub)",
+      "WebSearch",
+      "mcp__context7__resolve-library-id",
+      "mcp__context7__query-docs",
+      "Bash(git commit:*)",
+      "Bash(nvidia-smi:*)",
+      "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":512}}')",
+      "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen3:30b-a3b\",\"keep_alive\":0}')",
+      "Bash(pip index:*)",
+      "Bash(curl -s https://pypi.org/pypi/gradio/json)",
+      "Bash(python3 -c \"import sys,json; print\\('Latest:', json.load\\(sys.stdin\\)['info']['version']\\)\")",
+      "Bash(pkill -f \"codewraith.teacher.generator\")",
+      "Bash(curl -s --max-time 10 http://127.0.0.1:11434/api/tags)",
+      "Bash(python3 -c \"import sys,json; d=json.load\\(sys.stdin\\); print\\('Ollama OK:', len\\(d.get\\('models',[]\\)\\), 'models'\\)\")",
+      "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"gemma4:26b\",\"keep_alive\":0}')",
+      "Bash(du -sh /home/slenk/dev/CodeWraith/*/)",
+      "Bash(du -sh /home/slenk/dev/CodeWraith/data/*.json)",
+      "Bash(du -sh /home/slenk/dev/CodeWraith/.*)",
+      "Bash(git:*)",
+      "Bash(du -sh /home/slenk/dev/CodeWraith/data/chromadb/ /home/slenk/dev/CodeWraith/data/*.jsonl)",
+      "Bash(hf upload:*)",
+      "Bash(hf whoami:*)",
+      "Bash(hf auth:*)",
+      "Bash(hf spaces:*)",
+      "Bash(python:*)",
+      "Bash(python3:*)",
+      "WebFetch(domain:ollama.com)",
+      "Bash(curl -s http://127.0.0.1:11434/api/generate -d '{\"model\":\"qwen2.5-coder:32b-instruct-q6_K\",\"prompt\":\"Say hello\",\"stream\":false,\"options\":{\"num_ctx\":256}}')",
+      "Bash(HF_TOKEN=$HF_TOKEN hf upload:*)",
+      "Bash(curl -s http://127.0.0.1:8081/health)",
+      "Bash(curl -s http://192.168.13.21:8081/v1/chat/completions -H 'Content-Type: application/json' -d '{:*)",
+      "Bash(curl:*)",
+      "Bash(chmod +x .venv/bin/pytest)",
+      "Bash(awk '{print $2}')",
+      "Bash(ls /home/slenk/dev/CodeWraith/data/source_files/*.py)"
+    ]
+  }
+}

.gitattributes CHANGED Viewed

@@ -34,3 +34,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text

 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
 data/chromadb/chroma.sqlite3 filter=lfs diff=lfs merge=lfs -text
+Final[[:space:]]Project[[:space:]]Rubric.odt filter=lfs diff=lfs merge=lfs -text

.github/workflows/ci.yml ADDED Viewed

	@@ -0,0 +1,28 @@

+name: Test, Format, and Lint
+on:
+  push:
+    branches: [ "**" ]
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v6
+    - name: Setup uv
+      uses: astral-sh/setup-uv@v7
+    - name: Install dependencies
+      run: |
+        uv sync --extra dev
+    - name: Run Format and Lint Checks
+      run: |
+        uv run ruff check src/ tests/
+        uv run ruff format --check src/ tests/
+    - name: Run tests
+      run: |
+        uv run pytest --cov=src --cov-fail-under=80

.pre-commit-config.yaml ADDED Viewed

	@@ -0,0 +1,29 @@

+# See https://pre-commit.com for more information
+# See https://pre-commit.com/hooks.html for more hooks
+repos:
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v6.0.0
+    hooks:
+      - id: trailing-whitespace
+      - id: end-of-file-fixer
+      - id: check-yaml
+      - id: check-added-large-files
+        exclude: '^uv\.lock$'
+      - id: check-json
+      - id: check-xml
+  - repo: https://github.com/astral-sh/ruff-pre-commit
+    rev: v0.14.11
+    hooks:
+      - id: ruff
+      - id: ruff-format
+  - repo: local
+    hooks:
+    - id: pytest
+      name: pytest
+      entry: uv run pytest --cov=src --cov-fail-under=80 tests
+      language: system
+      types: [python]
+      pass_filenames: false
+      always_run: true

.pytest_cache/.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Created by pytest automatically.
2	+ *

.pytest_cache/CACHEDIR.TAG ADDED Viewed

	@@ -0,0 +1,4 @@

+Signature: 8a477f597d28d172789f06886806bc55
+# This file is a cache directory tag created by pytest.
+# For information about cache directory tags, see:
+#	https://bford.info/cachedir/spec.html

.pytest_cache/README.md ADDED Viewed

	@@ -0,0 +1,8 @@

+# pytest cache directory #
+This directory contains data from pytest's cache plugin,
+which provides the `--lf` and `--ff` options, as well as the `cache` fixture.
+**Do not** commit this to version control.
+See [the docs](https://docs.pytest.org/en/stable/how-to/cache.html) for more information.

.pytest_cache/v/cache/lastfailed ADDED Viewed

	@@ -0,0 +1 @@


1	+ {}

.pytest_cache/v/cache/nodeids ADDED Viewed

	@@ -0,0 +1,58 @@

+[
+  "tests/test_regressions.py::TestFlattenField::test_dict_joins_values",
+  "tests/test_regressions.py::TestFlattenField::test_dict_with_none_values_skipped",
+  "tests/test_regressions.py::TestFlattenField::test_empty_string_returns_empty",
+  "tests/test_regressions.py::TestFlattenField::test_list_of_dicts",
+  "tests/test_regressions.py::TestFlattenField::test_list_of_strings",
+  "tests/test_regressions.py::TestFlattenField::test_none_returns_empty",
+  "tests/test_regressions.py::TestFlattenField::test_string_passthrough",
+  "tests/test_regressions.py::TestFunctionSeparators::test_no_separator_before_first_function",
+  "tests/test_regressions.py::TestFunctionSeparators::test_separator_between_functions",
+  "tests/test_regressions.py::TestFunctionSeparators::test_single_function_no_separator",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_dict",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_list_of_dicts",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_raises_as_none",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_dict",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_list_of_dicts",
+  "tests/test_regressions.py::TestFunctionSpecPolymorphicFields::test_returns_as_none",
+  "tests/test_regressions.py::TestVLLMMaxTokensConfig::test_query_vllm_uses_4096_max_tokens",
+  "tests/test_student/test_evaluate.py::test_argument_coverage",
+  "tests/test_student/test_evaluate.py::test_empty_spec",
+  "tests/test_student/test_evaluate.py::test_has_structure_plain_text",
+  "tests/test_student/test_evaluate.py::test_has_structure_with_markdown",
+  "tests/test_student/test_evaluate.py::test_missing_class",
+  "tests/test_student/test_evaluate.py::test_missing_function",
+  "tests/test_student/test_evaluate.py::test_perfect_score",
+  "tests/test_student/test_evaluate.py::test_syntax_error_source",
+  "tests/test_student/test_trainer.py::test_lora_config_defaults",
+  "tests/test_student/test_trainer.py::test_models_dict",
+  "tests/test_student/test_trainer.py::test_push_to_hub_requires_model",
+  "tests/test_student/test_trainer.py::test_system_message",
+  "tests/test_teacher/test_clean_dataset.py::test_filters_null_output",
+  "tests/test_teacher/test_clean_dataset.py::test_filters_too_long",
+  "tests/test_teacher/test_clean_dataset.py::test_filters_too_short",
+  "tests/test_teacher/test_clean_dataset.py::test_keeps_valid_entries",
+  "tests/test_teacher/test_clean_dataset.py::test_rejected_file_has_reasons",
+  "tests/test_teacher/test_collect.py::test_count_lines",
+  "tests/test_teacher/test_collect.py::test_file_hash_deterministic",
+  "tests/test_teacher/test_collect.py::test_file_hash_different",
+  "tests/test_teacher/test_collect.py::test_has_functions_or_classes_indented",
+  "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_class",
+  "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_function",
+  "tests/test_teacher/test_collect.py::test_has_functions_or_classes_with_neither",
+  "tests/test_teacher/test_generator.py::test_load_completed_empty_file",
+  "tests/test_teacher/test_generator.py::test_load_completed_nonexistent",
+  "tests/test_teacher/test_generator.py::test_load_completed_skips_entries_without_source_file",
+  "tests/test_teacher/test_generator.py::test_load_completed_with_entries",
+  "tests/test_verifier/test_ast_checker.py::test_extract_class_info",
+  "tests/test_verifier/test_ast_checker.py::test_extract_function_signatures",
+  "tests/test_verifier/test_ast_checker.py::test_validate_signatures_match",
+  "tests/test_verifier/test_ast_checker.py::test_validate_signatures_mismatch",
+  "tests/test_verifier/test_judge.py::test_parse_direct_json",
+  "tests/test_verifier/test_judge.py::test_parse_empty_string",
+  "tests/test_verifier/test_judge.py::test_parse_invalid_json",
+  "tests/test_verifier/test_judge.py::test_parse_json_embedded_in_text",
+  "tests/test_verifier/test_judge.py::test_parse_json_in_code_fence",
+  "tests/test_verifier/test_judge.py::test_parse_json_in_plain_fence",
+  "tests/test_verifier/test_judge.py::test_parse_nested_json"
+]

.ruff_cache/.gitignore ADDED Viewed

	@@ -0,0 +1,2 @@


1	+ # Automatically created by ruff.
2	+ *

.ruff_cache/0.14.11/10353388469511876764 ADDED Viewed

Binary file (55 Bytes). View file

.ruff_cache/0.14.11/13241530885439384270 ADDED Viewed

Binary file (67 Bytes). View file

.ruff_cache/0.14.11/15588448742999807618 ADDED Viewed

Binary file (55 Bytes). View file

.ruff_cache/0.14.11/16909963377101005444 ADDED Viewed

Binary file (67 Bytes). View file

.ruff_cache/0.14.11/1739102192644247459 ADDED Viewed

Binary file (62 Bytes). View file

.ruff_cache/0.14.11/2164407392135946080 ADDED Viewed

Binary file (67 Bytes). View file

.ruff_cache/0.14.11/3894487969124666669 ADDED Viewed

Binary file (74 Bytes). View file

.ruff_cache/0.14.11/4150897988697354825 ADDED Viewed

Binary file (228 Bytes). View file

.ruff_cache/0.14.11/4355628838106937123 ADDED Viewed

Binary file (336 Bytes). View file

.ruff_cache/0.14.11/5959195261246591303 ADDED Viewed

Binary file (545 Bytes). View file

.ruff_cache/0.14.11/7470470446486951261 ADDED Viewed

Binary file (346 Bytes). View file

.ruff_cache/0.14.11/7805097287912496176 ADDED Viewed

Binary file (499 Bytes). View file

.ruff_cache/0.15.10/12264003023071563180 ADDED Viewed

Binary file (119 Bytes). View file

.ruff_cache/0.15.10/12707412386835734272 ADDED Viewed

Binary file (639 Bytes). View file

.ruff_cache/0.15.10/14255674844609017079 ADDED Viewed

Binary file (5.45 kB). View file

.ruff_cache/0.15.10/14515759316653052378 ADDED Viewed

Binary file (703 Bytes). View file

.ruff_cache/0.15.10/1490264962946858478 ADDED Viewed

Binary file (346 Bytes). View file

.ruff_cache/0.15.10/15830732279563417379 ADDED Viewed

Binary file (639 Bytes). View file

.ruff_cache/0.15.10/16585459526495690818 ADDED Viewed

Binary file (3.65 kB). View file

.ruff_cache/0.15.10/17140885238503855112 ADDED Viewed

Binary file (62 Bytes). View file

.ruff_cache/0.15.10/17258540645676314702 ADDED Viewed

Binary file (703 Bytes). View file

.ruff_cache/0.15.10/3600153915928311247 ADDED Viewed

Binary file (74 Bytes). View file

.ruff_cache/0.15.10/4777661366283320788 ADDED Viewed

Binary file (67 Bytes). View file

.ruff_cache/0.15.10/8194725524321540937 ADDED Viewed

Binary file (55 Bytes). View file

.ruff_cache/0.15.10/9372251181401831964 ADDED Viewed

Binary file (590 Bytes). View file

.ruff_cache/CACHEDIR.TAG ADDED Viewed

	@@ -0,0 +1 @@


1	+ Signature: 8a477f597d28d172789f06886806bc55

CLAUDE.md ADDED Viewed

	@@ -0,0 +1,28 @@

+# CodeWraith
+Module-to-Spec Transformer -- fine-tuned LLM that generates technical specifications from Python source code.
+## Pipeline State
+Read `.claude/pipeline_state.json` at session start to know where the ML pipeline left off. Update it after completing any pipeline stage (generation, cleaning, training, evaluation, upload).
+## Process Monitoring
+- When monitoring long-running processes (vLLM serving, dataset generation, model training, uploads), check status at **5-minute intervals minimum**. Do NOT poll more frequently unless explicitly asked.
+- Before killing any long-running process, **always confirm with the user first**. Never assume a process is stuck without evidence of zero progress over multiple checks.
+- For HuggingFace uploads of large models (>10GB), prefer `hf upload` CLI over Python `push_to_hub()`. The CLI handles resumption better.
+## Environment
+- Python 3.12, managed with `uv`
+- Use `uv sync` / `uv run`, never `uv pip install`
+- Tests: `uv run pytest`
+- Lint: `uv run ruff check`
+- GPU: NVIDIA RTX 5090 (32GB VRAM)
+- Teacher models served via vLLM at 192.168.13.21:8081
+## Commits
+- Use Angular/Conventional Commits format
+- No Co-Authored-By lines
+- Commit at every meaningful milestone

Final Project Rubric.odt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a0adb49313b481e0771de5fca861649823b8c56220a59ded7146d0d1d5283d60
+size 3241527

Final Project Rubric.odt:Zone.Identifier ADDED Viewed

Binary file (59 Bytes). View file

Modelfile.teacher ADDED Viewed

	@@ -0,0 +1,4 @@

+FROM llama3.1:70b-instruct-q4_K_M
+PARAMETER num_ctx 4096
+PARAMETER num_gpu 99

README.md CHANGED Viewed

@@ -22,8 +22,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
 ```
                     ┌─────────────┐
   Python Source ──> │   Teacher   │ ──> Training Pairs (code -> spec)
-                    │ Qwen3 30B   │         │
-                    │ (Ollama)    │         │
                     └─────────────┘         │
                                             ▼
                     ┌─────────────┐    ┌─────────────┐
@@ -41,8 +41,8 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
                                               │
                                               ▼
                                        ┌─────────────┐
-                                       │  Gradio App │
-                                       │  HF Spaces  │
                                        └─────────────┘
 ```
@@ -50,10 +50,10 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
 | Component | Directory | Purpose |
 |-----------|-----------|---------|
-| **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using Qwen3 30B via Ollama |
 | **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
 | **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
-| **App** | `src/codewraith/app/` | Gradio web interface deployed on HuggingFace Spaces |
 ## Verification Pipeline
@@ -61,6 +61,29 @@ CodeWraith uses a teacher-student architecture: a large model generates gold-sta
 2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
 3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
 ## Quick Start
 ### Prerequisites
@@ -80,13 +103,10 @@ cd CodeWraith
 uv venv
 uv sync
-# Install ML dependencies (datasets, transformers, dspy)
 uv sync --extra ml
-# Install training dependencies (unsloth, peft, trl)
-uv sync --extra ml --extra training
-# Install app dependencies (gradio)
 uv sync --extra app
 # Install everything
@@ -119,23 +139,29 @@ into `data/source_files/`. Resumable if interrupted.
 ### Step 2: Optimize Prompt with DSPy
 Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
-Requires Ollama running with `qwen3:30b-a3b`.
 ```bash
-# Pull the teacher model
-ollama pull qwen3:30b-a3b
 # Run optimization
 uv run --extra ml python3 -m codewraith.teacher.optimize
 ```
 Saves the optimized generator to `data/optimized_generator.json`.
 ### Step 3: Generate Training Data
-Generate specs for all collected source files using the optimized prompt.
 ```bash
 uv run --extra ml python3 -c "
 from codewraith.teacher.generator import generate_dataset
 generate_dataset('data/source_files', 'data/training_pairs.jsonl')
@@ -158,10 +184,10 @@ Fine-tune with Unsloth + LoRA. Supports both 3B and 8B models.
 ```bash
 # Train Llama 3.2 3B (fast, ~3-4 minutes)
-uv run --extra ml --extra training python3 -m codewraith.student.trainer 3b
 # Train Llama 3.1 8B (better quality, ~8-10 minutes)
-uv run --extra ml --extra training python3 -m codewraith.student.trainer 8b
 ```
 Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
@@ -172,10 +198,10 @@ Run evaluation comparing structural accuracy across models.
 ```bash
 # Evaluate 3B
-uv run --extra ml --extra training python3 -m codewraith.student.evaluate 3b
 # Evaluate 8B
-uv run --extra ml --extra training python3 -m codewraith.student.evaluate 8b
 ```
 Generates `data/eval_report.md` with comparison metrics.
@@ -183,7 +209,7 @@ Generates `data/eval_report.md` with comparison metrics.
 ### Step 7: Run Gradio App
 ```bash
-uv run --extra ml --extra training --extra app python3 -m codewraith.app.main
 ```
 Auto-detects the best available adapter (prefers 8B over 3B).
@@ -193,73 +219,147 @@ Opens a web UI with code input, sampling parameter controls, and live spec gener
 ```bash
 # Push adapter to HuggingFace Hub
-uv run --extra ml --extra training python3 -c "
 from codewraith.student.trainer import load_base_model, push_to_hub
 from peft import PeftModel
-model, tokenizer = load_base_model('3b')
-model = PeftModel.from_pretrained(model, './models/codewraith-lora-3b')
-push_to_hub(model, tokenizer, 'your-username/codewraith-lora-3b')
 "
 ```
 ## Evaluation Results
-Models trained with 8192 context, LoRA r=32, 4 epochs, dropout=0.05.
-Training data generated by Gemma 4 26B teacher model with DSPy-optimized prompts.
-Evaluated on 28 held-out examples (proper train/eval split, no data leakage).
-### Llama 3.1 8B (CodeWraith-8b) -- Deployed Model
 | Metric | Score |
 |--------|-------|
-| Avg Structural Score | 0.95 |
-| Function Coverage | 90% |
 | Class Coverage | 100% |
-| Argument Coverage | 94% |
-| Return Type Coverage | 67% |
-| Perfect Scores | 22/28 |
-| Good Scores (>=80%) | 25/28 |
-| Avg Inference Time | 28s |
-| Training Loss | 0.59 |
-### Llama 3.2 3B (CodeWraith-3b)
 | Metric | Score |
 |--------|-------|
-| Avg Structural Score | 0.91 |
-| Function Coverage | 86% |
-| Class Coverage | 96% |
-| Argument Coverage | 93% |
-| Return Type Coverage | 67% |
-| Perfect Scores | 19/28 |
-| Good Scores (>=80%) | 24/28 |
-| Avg Inference Time | 26s |
-| Training Loss | 0.76 |
 ### Analysis
-The 8B model was selected for deployment because:
-- Higher overall structural score (0.95 vs 0.91)
-- Perfect class coverage (100% vs 96%)
-- More perfect scores (22/28 vs 19/28)
-- Higher quality training data from Gemma 4 26B teacher enabled the larger model to shine
-Training data was generated using Gemma 4 26B as the teacher model (replacing Qwen3 30B),
-producing higher quality specs with better structured Markdown and mermaid diagrams.
-DSPy BootstrapFewShot was used to optimize the generation prompt.
 ### HuggingFace Models
-- Deployed (8B): https://huggingface.co/slenk/codewraith-lora-8b
-- Alternative (3B): https://huggingface.co/slenk/codewraith-lora-3b
 ## Environment
-- **Teacher model**: Gemma 4 26B via Ollama at `127.0.0.1:11434`
 - **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
 - **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
-- **Deployment**: Gradio on HuggingFace Spaces
-- **Hardware**: NVIDIA RTX 5090 (32GB VRAM)
 ## Project Structure
@@ -272,8 +372,9 @@ CodeWraith/
 │   ├── teacher/
 │   │   ├── collect.py          # HF dataset collection
 │   │   ├── optimize.py         # DSPy prompt optimization
-│   │   ├── generator.py        # Training data generation
 │   │   └── clean_dataset.py    # Dataset filtering
 │   ├── verifier/
 │   │   ├── ast_checker.py      # AST structural validation
 │   │   └── judge.py            # LLM-as-Judge semantic audit
@@ -281,19 +382,17 @@ CodeWraith/
 │   │   ├── trainer.py          # Unsloth + LoRA fine-tuning
 │   │   └── evaluate.py         # Model evaluation pipeline
 │   └── app/
-│       └── main.py             # Gradio inference UI
-├── data/                       # Training data, eval sets, reports
-├── models/                     # Saved LoRA adapters
-└── tests/                      # Test suite (96% coverage)
 ```
-## Rubric Alignment
-| Rubric Section | Points | Implementation |
-|---------------|--------|----------------|
-| Model Functionality (training + LoRA + eval) | 20 | `student/trainer.py`, `student/evaluate.py`, 3B vs 8B comparison |
-| Innovation & Creativity | 20 | Teacher-student architecture, DSPy prompt optimization, AST verification pipeline |
-| Environment Setup (deployment) | 15 | `app/main.py`, Gradio on HF Spaces |
-| Inference Pipeline (sampling) | 15 | `app/main.py` with temperature/top_p/max_tokens controls |
-| Technical Documentation | 15 | This README, evaluation reports, docstrings |
-| Demo & Presentation | 15 | Live Gradio app as interactive demo |

 ```
                     ┌─────────────┐
   Python Source ──> │   Teacher   │ ──> Training Pairs (code -> spec)
+                    │ LLM via     │         │
+                    │ vLLM/Ollama │         │
                     └─────────────┘         │
                                             ▼
                     ┌─────────────┐    ┌─────────────┐
                                               │
                                               ▼
                                        ┌─────────────┐
+                                       │  Gradio App │ <── RAG Retriever
+                                       │  HF Spaces  │     (ChromaDB)
                                        └─────────────┘
 ```
 | Component | Directory | Purpose |
 |-----------|-----------|---------|
+| **Teacher** | `src/codewraith/teacher/` | Generates synthetic training pairs using a large LLM via vLLM (JSON-constrained) or Ollama |
 | **Verifier** | `src/codewraith/verifier/` | AST-based structural validation + LLM-as-Judge semantic audit |
 | **Student** | `src/codewraith/student/` | LoRA fine-tuning via Unsloth, evaluation pipeline |
+| **App** | `src/codewraith/app/` | Gradio web interface with RAG retrieval, deployed on HuggingFace Spaces |
 ## Verification Pipeline
 2. **Semantic Audit**: LLM-as-a-Judge evaluates completeness, accuracy, hallucination, and detail (scored 0-10 each)
 3. **Round-trip Consistency**: Tests whether an LLM can reconstruct the module's function/class signatures from the spec alone
+## Sampling & Inference
+The inference pipeline uses **nucleus sampling** (top-p) combined with temperature scaling to balance output quality and diversity:
+| Parameter | Default | Range | Purpose |
+|-----------|---------|-------|---------|
+| **Temperature** | 0.7 | 0.0 - 2.0 | Controls randomness. Lower values (0.1-0.3) produce more deterministic, structured output. Higher values increase diversity but risk incoherence. |
+| **Top-p** | 0.9 | 0.0 - 1.0 | Nucleus sampling threshold. At each step, sampling is restricted to the smallest set of highest-probability tokens whose cumulative probability reaches top-p. A value of 0.9 keeps the tokens covering the top 90% of probability mass and filters out the low-likelihood tail. |
+| **Max Tokens** | 2048 | 256 - 8192 | Maximum generation length. Technical specs for typical modules run 500-1500 tokens; larger modules may need 4096+. |
+**Why nucleus sampling over beam search?** Spec generation benefits from controlled creativity -- mermaid diagrams and natural language descriptions need some variation, while function signatures need precision. Nucleus sampling with moderate temperature (0.7) gives the model freedom in prose while the fine-tuning keeps structured elements accurate. For maximum precision, users can lower temperature to 0.1-0.3.
+## Retrieval-Augmented Generation (RAG)
+At inference time, the app optionally retrieves similar code-spec pairs from a ChromaDB vector index to provide few-shot context:
+1. **Indexing**: All training pairs are embedded using `sentence-transformers` and stored in ChromaDB (193 pairs)
+2. **Retrieval**: When a user submits code, the retriever finds the 3 most similar source files by cosine similarity
+3. **Augmentation**: Retrieved examples are prepended to the user's input as context, giving the model concrete formatting examples
+4. **Auto-truncation**: If the retrieved RAG context pushes the input beyond 6000 tokens, the retrieved context is automatically dropped to prevent context overflow
+RAG improves output consistency, especially for formatting patterns like mermaid diagrams and markdown tables that the model may not reliably produce from fine-tuning alone.
 ## Quick Start
 ### Prerequisites
 uv venv
 uv sync
+# Install ML dependencies (transformers, unsloth, vllm, etc.)
 uv sync --extra ml
+# Install app dependencies (gradio, chromadb)
 uv sync --extra app
 # Install everything
 ### Step 2: Optimize Prompt with DSPy
 Uses DSPy's BootstrapFewShot optimizer to find the best prompt for spec generation.
+Requires Ollama running with the configured teacher model.
 ```bash
 # Run optimization
 uv run --extra ml python3 -m codewraith.teacher.optimize
 ```
 Saves the optimized generator to `data/optimized_generator.json`.
+Falls back to raw Ollama generation if DSPy optimization is unavailable or returns null.
 ### Step 3: Generate Training Data
+Generate specs for all collected source files. Two backends are available:
 ```bash
+# vLLM backend (recommended) -- JSON-constrained output for consistent structure
+# Requires vLLM server running with a code-specialized model
+uv run --extra ml python3 -c "
+from codewraith.teacher.generator import generate_dataset
+generate_dataset('data/source_files', 'data/training_pairs.jsonl', backend='vllm')
+"
+# Ollama backend -- raw generation, uses DSPy-optimized prompt if available
 uv run --extra ml python3 -c "
 from codewraith.teacher.generator import generate_dataset
 generate_dataset('data/source_files', 'data/training_pairs.jsonl')
 ```bash
 # Train Llama 3.2 3B (fast, ~3-4 minutes)
+uv run --extra ml python3 -m codewraith.student.trainer 3b
 # Train Llama 3.1 8B (better quality, ~8-10 minutes)
+uv run --extra ml python3 -m codewraith.student.trainer 8b
 ```
 Adapters are saved to `models/codewraith-lora-{3b,8b}/`.
 ```bash
 # Evaluate 3B
+uv run --extra ml python3 -m codewraith.student.evaluate 3b
 # Evaluate 8B
+uv run --extra ml python3 -m codewraith.student.evaluate 8b
 ```
 Generates `data/eval_report.md` with comparison metrics.
 ### Step 7: Run Gradio App
 ```bash
+uv run --extra ml --extra app python3 -m codewraith.app.main
 ```
 Auto-detects the best available adapter (prefers 8B over 3B).
 ```bash
 # Push adapter to HuggingFace Hub
+uv run --extra ml python3 -c "
 from codewraith.student.trainer import load_base_model, push_to_hub
 from peft import PeftModel
+model, tokenizer = load_base_model('8b')
+model = PeftModel.from_pretrained(model, './models/codewraith-lora-8b')
+push_to_hub(model, tokenizer, 'slenk/codewraith-lora-8b')
 "
+# Upload app to HuggingFace Spaces (uses .hfignore to exclude large files)
+hf upload slenk/codewraith . . --repo-type space \
+  --exclude "models/*" --exclude ".venv/*" --exclude "adapter/*" \
+  --exclude ".git/*" --exclude "tests/*" --exclude "scripts/*"
 ```
+The Space downloads the LoRA adapter from HF Hub at startup, so model weights
+are not included in the Space repository. A `.hfignore` file is provided to
+exclude development artifacts from uploads.
+## Model Evolution
+The project iterated through multiple teacher models and training configurations to find the best combination:
+| Version | Teacher Model | Student | Key Finding |
+|---------|--------------|---------|-------------|
+| v1 | Llama 3.1 70B (Q4) | 3B, 8B | Baseline. Functional specs but inconsistent formatting. |
+| v2 | Llama 3.1 70B (Q4) | 3B, 8B | Improved hyperparameters (r=32, 8192 context, 4 epochs). 8B reached 0.89 structural score. |
+| v3 | Qwen3 30B-A3B (MoE) | 3B, 8B | Better structured output -- tables, type annotations, cleaner markdown. 3B chosen as primary (0.92 structural). |
+| v4 | Gemma 4 26B | 3B, 8B | Higher structural scores (8B: 0.95, 100% class coverage). Wordier prose and weaker return type coverage (67%). 8B selected as deployed model. |
+| v5 | Qwen2.5-Coder 32B (Q6) | 8B | Code-specialized teacher for more precise, structured specifications. |
+| v6 | Qwen2.5-Coder 32B (AWQ) via vLLM | 8B | JSON-constrained generation via vLLM ensures consistent spec structure. 171 pairs, 0.97 structural score. |
+| v7 | Qwen2.5-Coder 14B (AWQ) via vLLM | 8B | Smaller teacher with 16384 context recovers large files. 231 pairs (+35%), 0.97 structural score maintained. |
+Each iteration preserved previous model adapters for comparison. The teacher model
+has the largest impact on output quality -- a code-specialized teacher (Qwen2.5-Coder)
+produced more precise function signatures and more consistently structured formatting
+than the general-purpose teachers (see the v4 vs. v5 comparison under Evaluation Results).
 ## Evaluation Results
+### v7 -- Current (Qwen2.5-Coder 14B AWQ via vLLM)
+Models trained with 4096 context, LoRA r=16, 3 epochs.
+Training data generated by Qwen2.5-Coder-14B-Instruct-AWQ via vLLM with JSON-constrained output.
+Evaluated on 34 held-out examples (197 train / 34 eval split from 231 total pairs).
+#### Llama 3.1 8B (CodeWraith-8b-v7)
 | Metric | Score |
 |--------|-------|
+| Avg Structural Score | 0.97 |
+| Function Coverage | 97% |
 | Class Coverage | 100% |
+| Argument Coverage | 95% |
+| Return Type Coverage | 90% |
+| Perfect Scores | 25/34 |
+| Good Scores (>=80%) | 29/34 |
+| Training Loss | 0.12 |
+**Key change from v6:** Switched from the 32B teacher (limited to 4096 context on 32GB VRAM)
+to the 14B teacher with 16384 context. This recovered 60 additional training pairs from source
+files that previously exceeded the context window, increasing the dataset by 35%. Structural
+score held steady at 0.97 with the larger eval set. Four low scores (0.50) were traced to
+Python 2 syntax in the source files, not to model output issues.
+### v6 -- Previous (Qwen2.5-Coder 32B AWQ via vLLM)
+Models trained with 8192 context, LoRA r=32, 3 epochs, dropout=0.05.
+Training data generated by Qwen2.5-Coder-32B-Instruct-AWQ via vLLM with JSON-constrained output.
+Evaluated on 26 held-out examples (145 train / 26 eval split from 171 total pairs).
+#### Llama 3.1 8B (CodeWraith-8b-v6)
+| Metric | Score |
+|--------|-------|
+| Avg Structural Score | 0.97 |
+| Perfect Scores | 19/26 |
+| Good Scores (>=80%) | 22/26 |
+### v5 -- Previous (Qwen2.5-Coder 32B via Ollama)
+Models trained with 8192 context, LoRA r=32, 4 epochs, dropout=0.05.
+Training data generated by Qwen2.5-Coder 32B (Q6 quantization) via Ollama.
+Evaluated on 37 held-out examples (proper train/eval split, no data leakage).
+#### Llama 3.1 8B (CodeWraith-8b-v5)
 | Metric | Score |
 |--------|-------|
+| Avg Structural Score | 0.99 |
+| Function Coverage | 97% |
+| Class Coverage | 100% |
+| Argument Coverage | 99% |
+| Return Type Coverage | 100% |
+| Perfect Scores | 29/37 |
+| Good Scores (>=80%) | 36/37 |
+| Training Loss | 0.33 |
+### v4 -- Previous (Gemma 4 26B Teacher)
+Evaluated on 28 held-out examples.
+#### Llama 3.1 8B (CodeWraith-8b-v4)
+| Metric | v4 | v5 | Change |
+|--------|-----|-----|--------|
+| Structural Score | 0.95 | 0.99 | +0.04 |
+| Function Coverage | 90% | 97% | +7 pp |
+| Class Coverage | 100% | 100% | -- |
+| Argument Coverage | 94% | 99% | +5 pp |
+| Return Type Coverage | 67% | 100% | +33 pp |
+| Perfect Scores | 78% | 78% | -- |
+| Good Scores (>=80%) | 89% | 97% | +8 pp |
+| Training Loss | 0.59 | 0.33 | -44% (relative) |
 ### Analysis
+The v5 model using a **code-specialized teacher** (Qwen2.5-Coder 32B) dramatically
+improved over v4's general-purpose teacher (Gemma 4 26B):
+- **Return type coverage recovered from 67% to 100%** -- the v4 regression was caused
+  by Gemma producing prose descriptions instead of precise type annotations
+- **Training loss dropped 44%** -- the code-specialized teacher produces more consistent,
+  structured output that the student model learns more efficiently
+- **97% good scores** -- only 1 of 37 examples scored below 80%
+- The code-specialized teacher generates more precise function signatures and parameter
+  types, which directly translates to higher AST verification scores
 ### HuggingFace Models
+- Deployed (8B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-8b
+- Merged (8B standalone): https://huggingface.co/slenk/codewraith-merged-8b
+- Alternative (3B LoRA adapter): https://huggingface.co/slenk/codewraith-lora-3b
 ## Environment
+- **Teacher model**: Configurable; served via vLLM (JSON-constrained output, recommended) or via Ollama at `127.0.0.1:11434` (tested with Llama 3.1 70B, Qwen3 30B, Gemma 4 26B, Qwen2.5-Coder 32B, and Qwen2.5-Coder 14B)
 - **Student models**: Llama 3.2 3B / Llama 3.1 8B fine-tuned with LoRA via Unsloth
 - **Prompt optimization**: DSPy BootstrapFewShot with AST checker as metric
+- **RAG retrieval**: ChromaDB + sentence-transformers for few-shot context at inference
+- **Deployment**: Gradio on HuggingFace Spaces with ZeroGPU (A10G)
+- **Hardware (local)**: NVIDIA RTX 5090 (32GB VRAM)
 ## Project Structure
 │   ├── teacher/
 │   │   ├── collect.py          # HF dataset collection
 │   │   ├── optimize.py         # DSPy prompt optimization
+│   │   ├── generator.py        # Training data generation (Ollama + vLLM backends)
 │   │   └── clean_dataset.py    # Dataset filtering
+│   ├── spec_schema.py           # Pydantic ModuleSpec schema + markdown renderer
 │   ├── verifier/
 │   │   ├── ast_checker.py      # AST structural validation
 │   │   └── judge.py            # LLM-as-Judge semantic audit
 │   │   ├── trainer.py          # Unsloth + LoRA fine-tuning
 │   │   └── evaluate.py         # Model evaluation pipeline
 │   └── app/
+│       ├── main.py             # Gradio inference UI
+│       └── retriever.py        # RAG retrieval from ChromaDB
+├── app.py                      # HF Spaces entry point
+├── data/
+│   ├── chromadb/               # Vector index for RAG retrieval
+│   ├── source_files/           # Collected Python source files
+│   ├── training_pairs*.jsonl   # Generated training data (per version)
+│   └── eval_report*.md         # Evaluation reports
+├── models/                     # Local LoRA adapters (gitignored, hosted on HF Hub)
+├── scripts/
+│   └── retrain.py              # Full retrain pipeline
+└── tests/                      # Test suite
 ```

data/eval_report.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.95 |
+| Function Coverage | 0.92 |
+| Class Coverage | 0.81 |
+| Argument Coverage | 1.00 |
+| Return Type Coverage | 0.89 |
+| Good Scores (>=80%) | 26 |
+| Avg Inference Time (s) | 20.79 |
+## CodeWraith-8b (Llama-3.1-8B-Instruct)
+- Examples evaluated: 30
+- Valid (parseable): 29
+- Perfect scores: 17
+- Total inference time: 623.6s

data/eval_report_3b_v2.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-3b-v2 (Llama-3.2-3B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.93 |
+| Function Coverage | 0.84 |
+| Class Coverage | 0.97 |
+| Argument Coverage | 0.91 |
+| Return Type Coverage | 0.97 |
+| Good Scores (>=80%) | 25 |
+| Avg Inference Time (s) | 20.01 |
+## CodeWraith-3b-v2 (Llama-3.2-3B-Instruct)
+- Examples evaluated: 31
+- Valid (parseable): 28
+- Perfect scores: 15
+- Total inference time: 620.2s

data/eval_report_3b_v3.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.92 |
+| Function Coverage | 0.83 |
+| Class Coverage | 0.92 |
+| Argument Coverage | 0.93 |
+| Return Type Coverage | 0.84 |
+| Good Scores (>=80%) | 24 |
+| Avg Inference Time (s) | 20.01 |
+## CodeWraith-3b (Llama-3.2-3B-Instruct)
+- Examples evaluated: 31
+- Valid (parseable): 28
+- Perfect scores: 13
+- Total inference time: 620.2s

data/eval_report_3b_v4.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-3b (Llama-3.2-3B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.91 |
+| Function Coverage | 0.86 |
+| Class Coverage | 0.96 |
+| Argument Coverage | 0.93 |
+| Return Type Coverage | 0.67 |
+| Good Scores (>=80%) | 24 |
+| Avg Inference Time (s) | 25.57 |
+## CodeWraith-3b (Llama-3.2-3B-Instruct)
+- Examples evaluated: 28
+- Valid (parseable): 27
+- Perfect scores: 19
+- Total inference time: 715.9s

data/eval_report_8b_v2.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-8b-v2 (Llama-3.1-8B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.92 |
+| Function Coverage | 0.85 |
+| Class Coverage | 0.84 |
+| Argument Coverage | 0.93 |
+| Return Type Coverage | 0.97 |
+| Good Scores (>=80%) | 24 |
+| Avg Inference Time (s) | 21.91 |
+## CodeWraith-8b-v2 (Llama-3.1-8B-Instruct)
+- Examples evaluated: 31
+- Valid (parseable): 28
+- Perfect scores: 15
+- Total inference time: 679.2s

data/eval_report_8b_v4.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-8b (Llama-3.1-8B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.95 |
+| Function Coverage | 0.90 |
+| Class Coverage | 1.00 |
+| Argument Coverage | 0.94 |
+| Return Type Coverage | 0.67 |
+| Good Scores (>=80%) | 25 |
+| Avg Inference Time (s) | 27.57 |
+## CodeWraith-8b (Llama-3.1-8B-Instruct)
+- Examples evaluated: 28
+- Valid (parseable): 27
+- Perfect scores: 22
+- Total inference time: 772.1s

data/eval_report_8b_v5.md ADDED Viewed

	@@ -0,0 +1,20 @@

+# CodeWraith Model Evaluation Report
+## Summary
+| Metric | CodeWraith-8b-v5 (Llama-3.1-8B-Instruct) |
+|--------|-----|
+| Avg Structural Score | 0.99 |
+| Function Coverage | 0.97 |
+| Class Coverage | 1.00 |
+| Argument Coverage | 0.99 |
+| Return Type Coverage | 1.00 |
+| Good Scores (>=80%) | 36 |
+| Avg Inference Time (s) | 25.29 |
+## CodeWraith-8b-v5 (Llama-3.1-8B-Instruct)
+- Examples evaluated: 37
+- Valid (parseable): 36
+- Perfect scores: 29
+- Total inference time: 935.9s