diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..2770247abc79765cd72f893036951bbfdede2461 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,95 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/Markie_Voss_ABQA_eval_results.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-13200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-13100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-13000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-12000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11600/tokenizer.json filter=lfs diff=lfs merge=lfs -text 
+v127rc_exp2/B_mup/checkpoint-11500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-11000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-10000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9300/tokenizer.json filter=lfs diff=lfs merge=lfs -text 
+v127rc_exp2/B_mup/checkpoint-9200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-9000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-8000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-7000/tokenizer.json filter=lfs diff=lfs merge=lfs -text 
+v127rc_exp2/B_mup/checkpoint-6900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-6000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5700/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5200/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5100/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-5000/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4900/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4800/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4700/tokenizer.json filter=lfs diff=lfs merge=lfs -text 
+v127rc_exp2/B_mup/checkpoint-4600/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4500/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4400/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4300/tokenizer.json filter=lfs diff=lfs merge=lfs -text +v127rc_exp2/B_mup/checkpoint-4200/tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/LlamaFactory/.github/CODE_OF_CONDUCT.md b/LlamaFactory/.github/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000000000000000000000000000000..c2035cea5425b8de8e88a563214d05dfd415352a --- /dev/null +++ b/LlamaFactory/.github/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ +# Contributor Covenant Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socio-economic status, +nationality, personal appearance, race, religion, or sexual identity +and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. 
+ +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the + overall community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or + advances of any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email + address, without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official e-mail address, +posting via an official social media account, or acting as an appointed +representative at an online or offline event. 
+ +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement at +`hoshihiyouga AT gmail DOT com`. +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series +of actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or +permanent ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. 
Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within +the community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant][homepage], +version 2.0, available at +https://www.contributor-covenant.org/version/2/0/code_of_conduct.html. + +Community Impact Guidelines were inspired by [Mozilla's code of conduct +enforcement ladder](https://github.com/mozilla/diversity). + +[homepage]: https://www.contributor-covenant.org + +For answers to common questions about this code of conduct, see the FAQ at +https://www.contributor-covenant.org/faq. Translations are available at +https://www.contributor-covenant.org/translations. diff --git a/LlamaFactory/.github/CONTRIBUTING.md b/LlamaFactory/.github/CONTRIBUTING.md new file mode 100644 index 0000000000000000000000000000000000000000..507d666a23fc35f51b931e4f032c6d4b07872a45 --- /dev/null +++ b/LlamaFactory/.github/CONTRIBUTING.md @@ -0,0 +1,67 @@ +# Contributing to LLaMA Factory + +Everyone is welcome to contribute, and we value everybody's contribution. Code contributions are not the only way to help the community. Answering questions, helping others, and improving the documentation are also immensely valuable. + +It also helps us if you spread the word! Reference the library in blog posts about the awesome projects it made possible, shout out on Twitter every time it has helped you, or simply ⭐️ the repository to say thank you. + +However you choose to contribute, please be mindful and respect our [code of conduct](CODE_OF_CONDUCT.md). 
+ +**This guide was heavily inspired by the [transformers guide to contributing](https://github.com/huggingface/transformers/blob/main/CONTRIBUTING.md).** + +## Ways to contribute + +There are several ways you can contribute to LLaMA Factory: + +* Fix outstanding issues with the existing code. +* Submit issues related to bugs or desired new features. +* Contribute to the examples or to the documentation. + +### Style guide + +LLaMA Factory follows the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html); check it for details. + +### Create a Pull Request + +1. Fork the [repository](https://github.com/hiyouga/LLaMA-Factory) by clicking on the [Fork](https://github.com/hiyouga/LLaMA-Factory/fork) button on the repository's page. This creates a copy of the code under your GitHub user account. + +2. Clone your fork to your local disk, and add the base repository as a remote: + +```bash +git clone git@github.com:[username]/LLaMA-Factory.git +cd LLaMA-Factory +git remote add upstream https://github.com/hiyouga/LLaMA-Factory.git +``` + +3. Create a new branch to hold your development changes: + +```bash +git checkout -b dev_your_branch +``` + +4. Set up a development environment by running the following command in a virtual environment: + +```bash +pip install -e ".[dev]" +``` + +If LLaMA Factory was already installed in the virtual environment, remove it with `pip uninstall llamafactory` before reinstalling it in editable mode with the -e flag. + +5. Check code before commit: + +```bash +make commit +make style && make quality +make test +``` + +6. Submit changes: + +```bash +git add . +git commit -m "commit message" +git fetch upstream +git rebase upstream/main +git push -u origin dev_your_branch +``` + +7. Create a pull request from your branch `dev_your_branch` against the [upstream repository](https://github.com/hiyouga/LLaMA-Factory). 
diff --git a/LlamaFactory/.github/PULL_REQUEST_TEMPLATE.md b/LlamaFactory/.github/PULL_REQUEST_TEMPLATE.md new file mode 100644 index 0000000000000000000000000000000000000000..d23d6be3cfb8e2db888b19becedf075c7aa527be --- /dev/null +++ b/LlamaFactory/.github/PULL_REQUEST_TEMPLATE.md @@ -0,0 +1,8 @@ +# What does this PR do? + +Fixes # (issue) + +## Before submitting + +- [ ] Did you read the [contributor guideline](https://github.com/hiyouga/LLaMA-Factory/blob/main/.github/CONTRIBUTING.md)? +- [ ] Did you write any new necessary tests? diff --git a/LlamaFactory/.github/SECURITY.md b/LlamaFactory/.github/SECURITY.md new file mode 100644 index 0000000000000000000000000000000000000000..d34728ebfeb22e9fda2f3e76ff133014b648ab3c --- /dev/null +++ b/LlamaFactory/.github/SECURITY.md @@ -0,0 +1,7 @@ +# Reporting Security Issues + +To report a security issue, please use the GitHub Security Advisory ["Report a Vulnerability"](https://github.com/hiyouga/LLaMA-Factory/security/advisories/new) tab. + +We will send a response indicating the next steps in handling your report. After the initial reply to your report, the security team will keep you informed of the progress towards a fix and full announcement, and may ask for additional information or guidance. + +Report security bugs in third-party modules to the person or team maintaining the module. diff --git a/LlamaFactory/.github/copilot-instructions.md b/LlamaFactory/.github/copilot-instructions.md new file mode 100644 index 0000000000000000000000000000000000000000..4a1ba22e24125d8eb1c1c1a868884951b905a9ef --- /dev/null +++ b/LlamaFactory/.github/copilot-instructions.md @@ -0,0 +1,180 @@ +# GitHub Copilot Instructions for LLaMA Factory + +## Project Overview + +LLaMA Factory is an efficient fine-tuning framework for 100+ large language models (LLMs). It provides: +- Support for various models: LLaMA, LLaVA, Mistral, Qwen, DeepSeek, Yi, Gemma, ChatGLM, Phi, etc. 
+- Multiple training methods: pre-training, supervised fine-tuning, reward modeling, PPO, DPO, KTO, ORPO +- Scalable resources: 16-bit full-tuning, freeze-tuning, LoRA and QLoRA variants +- Advanced algorithms: GaLore, BAdam, APOLLO, Adam-mini, Muon, OFT, DoRA, etc. +- Web UI (LLaMA Board) and CLI interfaces + +### Architecture Versions + +LLaMA Factory has two parallel architectures that can be switched via the `USE_V1` environment variable: + +**v0 (default)** - File hierarchy: +- `api`, `webui` → `chat`, `eval`, `train` → `data`, `model` → `hparams` → `extras` + +**v1** - File hierarchy: +- `trainers` → `core` → `accelerator`, `plugins`, `config` → `utils` + +Set `USE_V1=1` to enable v1 architecture. + +## Code Structure + +### v0 Architecture (Default) + +- `src/llamafactory/` - Main package directory + - `api/` - OpenAI-style API implementation + - `chat/` - Chat interface implementation + - `cli.py` - Command-line interface + - `data/` - Data processing and dataset handling + - `eval/` - Model evaluation utilities + - `extras/` - Additional utilities and helpers + - `hparams/` - Hyperparameter definitions + - `model/` - Model loading, patching, and utilities + - `train/` - Training pipeline implementation + - `webui/` - Gradio-based web interface +- `src/train.py` - Training entry script (delegates to `llamafactory.train.tuner`) +- `src/webui.py` - Web UI entry script (delegates to `llamafactory.webui.interface`) +- `src/api.py` - API server entry script (delegates to `llamafactory.api.app`) +- `tests/` - Test suite +- `examples/` - Example configurations for various training scenarios +- `data/` - Dataset definitions and examples + +### v1 Architecture (USE_V1=1) + +- `src/llamafactory/v1/` - Version 1 package directory + - `trainers/` - Training implementations + - `core/` - Core training utilities + - `accelerator/` - Acceleration and distributed training + - `plugins/` - Pluggable components (model, data, sampler, trainer) + - `config/` - Configuration 
management + - `utils/` - Utility functions + +## Development Practices + +### Code Style + +- Follow the [Google Python Style Guide](https://google.github.io/styleguide/pyguide.html) +- Use ruff for linting and formatting +- Line length: 119 characters +- Indentation: 4 spaces +- Quote style: double quotes +- Use Google-style docstrings for documentation + +### Import Organization + +- Known first-party: `llamafactory` +- Known third-party: `accelerate`, `datasets`, `gradio`, `numpy`, `peft`, `torch`, `transformers`, `trl` +- Use 2 blank lines after imports + +### Quality Checks + +Before committing code, run: +```bash +make style # Auto-fix style issues +make quality # Check code quality +make test # Run test suite +``` + +Or use the combined command: +```bash +make commit # Run pre-commit hooks +``` + +### Testing + +- Use pytest for testing +- Tests are located in `tests/` and `tests_v1/` directories +- Run tests with: `make test` (which runs `WANDB_DISABLED=true pytest -vv --import-mode=importlib tests/ tests_v1/`) +- Disable wandb during testing to avoid external dependencies +- **Note**: Training configurations require GPU machines, so training is typically not tested end-to-end. Use `make test` to validate file-level functionality. 
+ +### Building + +Build the package with: +```bash +pip3 install build && python3 -m build +``` + +### License + +- All source files must include the Apache 2.0 license header +- Check license headers with: `make license` + +## Common Patterns + +### Configuration Files + +- Training configurations are typically YAML or JSON files in `examples/` directory +- Hyperparameters are defined using dataclasses in `src/llamafactory/hparams/` + +### Model Support + +- New model support is added through model patches in `src/llamafactory/model/` +- Visual models use the visual utilities in `src/llamafactory/model/model_utils/visual.py` +- Quantization support is in `src/llamafactory/model/model_utils/quantization.py` + +### Data Processing + +- Dataset definitions are in `data/dataset_info.json` +- Data templates and processors are in `src/llamafactory/data/` + +### Training + +- Training pipelines are in `src/llamafactory/train/` +- Support for different training methods: SFT, DPO, PPO, RM, PT, KTO, ORPO + +## Key Dependencies + +- Python >= 3.9.0 +- PyTorch and transformers for model handling +- datasets for data processing +- peft for parameter-efficient fine-tuning +- accelerate for distributed training +- gradio for web UI +- trl for reinforcement learning +- Optional: vllm/sglang for inference, flash-attention-2, unsloth, liger-kernel + +## Entry Points + +- **CLI Training**: `llamafactory-cli train --config examples/train_lora/llama3_lora_sft.yaml` +- **Web UI**: `llamafactory-cli webui` or `python src/webui.py` +- **API Server**: `llamafactory-cli api` or `python src/api.py` +- **Chat Interface**: `llamafactory-cli chat --model_name_or_path MODEL_PATH` + +## Environment Setup + +For development: +```bash +pip install -e ".[dev]" +``` + +## Important Notes + +- The project supports multiple backends: default PyTorch, vLLM, SGLang +- Megatron-core training is supported via mcore_adapter +- SwanLab and W&B are supported for experiment tracking +- Docker support is 
available with pre-built images +- Day-0/Day-1 support for latest cutting-edge models +- Multi-modal support for vision and audio understanding tasks + +## Contribution Guidelines + +1. Fork the repository +2. Create a development branch +3. Set up development environment with `pip install -e ".[dev]"` +4. Make changes following the style guide +5. Run quality checks: `make style && make quality` +6. Run tests: `make test` +7. Submit a pull request + +## Common Commands + +- `make style` - Format code +- `make quality` - Run linters +- `make test` - Run tests +- `make commit` - Install and run pre-commit hooks +- `make license` - Check license headers diff --git a/LlamaFactory/.github/instructions-v0.md b/LlamaFactory/.github/instructions-v0.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LlamaFactory/.github/instructions-v1.md b/LlamaFactory/.github/instructions-v1.md new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/LlamaFactory/.github/workflows/label_issue.yml b/LlamaFactory/.github/workflows/label_issue.yml new file mode 100644 index 0000000000000000000000000000000000000000..3d0424c77479d1a18a39f78240491461d50f3d22 --- /dev/null +++ b/LlamaFactory/.github/workflows/label_issue.yml @@ -0,0 +1,32 @@ +name: label_issue + +on: + issues: + types: + - opened + +jobs: + label_issue: + runs-on: ubuntu-latest + + permissions: + issues: write + + steps: + - env: + GH_TOKEN: ${{ secrets.GITHUB_TOKEN }} + ISSUE_URL: ${{ github.event.issue.html_url }} + ISSUE_TITLE: ${{ github.event.issue.title }} + run: | + LABEL="" + NPU_KEYWORDS=(npu huawei ascend 华为 昇腾 910) + ISSUE_TITLE_LOWER=$(echo $ISSUE_TITLE | tr '[:upper:]' '[:lower:]') + for KEYWORD in ${NPU_KEYWORDS[@]}; do + if [[ $ISSUE_TITLE_LOWER == *$KEYWORD* ]] && [[ $ISSUE_TITLE_LOWER != *input* ]]; then + LABEL="npu" + break + fi + done + if [ -n "$LABEL" ]; then + gh 
issue edit $ISSUE_URL --add-label $LABEL + fi diff --git a/LlamaFactory/.github/workflows/tests.yml b/LlamaFactory/.github/workflows/tests.yml new file mode 100644 index 0000000000000000000000000000000000000000..d64f74658843d379f1605a30f5bb92a4bf6200c2 --- /dev/null +++ b/LlamaFactory/.github/workflows/tests.yml @@ -0,0 +1,106 @@ +name: tests + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + +jobs: + tests: + strategy: + fail-fast: false + matrix: + python: + - "3.11" + - "3.12" + - "3.13" + os: + - "ubuntu-latest" + - "windows-latest" + - "macos-latest" + transformers: + - "" + include: # test backward compatibility + - python: "3.11" + os: "ubuntu-latest" + transformers: "4.51.0" + - python: "3.11" + os: "ubuntu-latest" + transformers: "4.53.0" + - python: "3.11" + os: "ubuntu-latest" + transformers: "4.55.0" + + runs-on: ${{ matrix.os }} + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.transformers }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + OS_NAME: ${{ matrix.os }} + UV_NO_SYNC: 1 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python }} + github-token: ${{ github.token }} + enable-cache: false + + - name: Install dependencies + run: | + uv venv + uv pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cpu + uv pip install -e . 
+ uv pip install -r requirements/dev.txt + + - name: Install transformers + if: ${{ matrix.transformers }} + run: | + uv pip install "transformers==${{ matrix.transformers }}" + + - name: Cache files + id: hf-hub-cache + uses: actions/cache@v5 + with: + path: ${{ runner.temp }}/huggingface + key: huggingface-${{ matrix.os }}-${{ matrix.python }}-${{ matrix.transformers }}-${{ hashFiles('tests/version.txt') }} + + - name: Check quality + run: | + make style && make quality + + - name: Check license + run: | + make license + + - name: Check build + run: | + make build + + - name: Test with pytest + run: | + make test + env: + HF_HOME: ${{ runner.temp }}/huggingface + HF_HUB_OFFLINE: "${{ steps.hf-hub-cache.outputs.cache-hit == 'true' && '1' || '0' }}" diff --git a/LlamaFactory/.github/workflows/tests_cuda.yml b/LlamaFactory/.github/workflows/tests_cuda.yml new file mode 100644 index 0000000000000000000000000000000000000000..33558a5d0c58328d3d692ee3f9f699c584d1ffc5 --- /dev/null +++ b/LlamaFactory/.github/workflows/tests_cuda.yml @@ -0,0 +1,79 @@ +name: tests_cuda + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + +jobs: + tests: + strategy: + fail-fast: false + matrix: + python: + - "3.11" + os: + - "linux-x86_64-gpu-2" + + runs-on: ${{ matrix.os }} + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + env: + HF_HOME: "${{ github.workspace }}/../.runner_cache/huggingface" + UV_CACHE_DIR: "${{ github.workspace }}/../.runner_cache/uv" + HF_TOKEN: ${{ secrets.HF_TOKEN }} + OS_NAME: ${{ matrix.os }} + UV_NO_SYNC: 1 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + 
python-version: ${{ matrix.python }} + github-token: ${{ github.token }} + enable-cache: false + + - name: Check GPU Status + run: nvidia-smi + + - name: Install dependencies + run: | + uv venv + uv pip install -e . + uv pip install -r requirements/dev.txt + + - name: Check quality + run: | + make style && make quality + + - name: Check license + run: | + make license + + - name: Check build + run: | + make build + + - name: Test with pytest + run: | + make test diff --git a/LlamaFactory/.github/workflows/tests_npu.yml b/LlamaFactory/.github/workflows/tests_npu.yml new file mode 100644 index 0000000000000000000000000000000000000000..db19e0269c6e18f2def488407ad7bacba9b37911 --- /dev/null +++ b/LlamaFactory/.github/workflows/tests_npu.yml @@ -0,0 +1,87 @@ +name: tests_npu + +on: + workflow_dispatch: + push: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + pull_request: + branches: + - "main" + paths: + - "**/*.py" + - "pyproject.toml" + - "Makefile" + - ".github/workflows/*.yml" + +jobs: + tests: + strategy: + fail-fast: false + matrix: + python: + - "3.11" + os: + - "linux-aarch64-a2-4" + pytorch_npu: + - "2.7.1" + + runs-on: ${{ matrix.os }} + + concurrency: + group: ${{ github.workflow }}-${{ github.ref }}-${{ matrix.os }}-${{ matrix.python }} + cancel-in-progress: ${{ github.ref != 'refs/heads/main' }} + + container: + image: ascendai/cann:8.3.rc2-910b-ubuntu22.04-py3.11 + env: + HF_ENDPOINT: https://hf-mirror.com + HF_TOKEN: ${{ secrets.HF_TOKEN }} + OS_NAME: ${{ matrix.os }} + UV_NO_SYNC: 1 + + steps: + - name: Checkout + uses: actions/checkout@v6 + + - name: Install uv + uses: astral-sh/setup-uv@v7 + with: + python-version: ${{ matrix.python }} + github-token: ${{ github.token }} + enable-cache: false + + - name: Install dependencies + run: | + uv venv + uv pip install -r requirements/npu.txt + uv pip install -e . 
+ uv pip install -r requirements/dev.txt + + - name: Install node + run: | + apt-get update || true + apt-get install -y curl + curl -fsSL https://deb.nodesource.com/setup_20.x | bash - + apt-get install -y nodejs + + - name: Check quality + run: | + make style && make quality + + - name: Check license + run: | + make license + + - name: Check build + run: | + make build + + - name: Test with pytest + run: | + make test diff --git a/LlamaFactory/assets/logo.png b/LlamaFactory/assets/logo.png new file mode 100644 index 0000000000000000000000000000000000000000..5fb3dd569342ca3cd30a582fd664145bd88b360c Binary files /dev/null and b/LlamaFactory/assets/logo.png differ diff --git a/LlamaFactory/assets/sponsors/serpapi.svg b/LlamaFactory/assets/sponsors/serpapi.svg new file mode 100644 index 0000000000000000000000000000000000000000..79bdf4001382b368148e9c3b611507bb7c0494f9 --- /dev/null +++ b/LlamaFactory/assets/sponsors/serpapi.svg @@ -0,0 +1 @@ + diff --git a/LlamaFactory/assets/thirdparty/colab.svg b/LlamaFactory/assets/thirdparty/colab.svg new file mode 100644 index 0000000000000000000000000000000000000000..e5830d5332975c03acc2a9715bd880097083e91d --- /dev/null +++ b/LlamaFactory/assets/thirdparty/colab.svg @@ -0,0 +1 @@ + Open in ColabOpen in Colab diff --git a/LlamaFactory/assets/thirdparty/discord.svg b/LlamaFactory/assets/thirdparty/discord.svg new file mode 100644 index 0000000000000000000000000000000000000000..b94f16cca3c6db4545f5d541390e66ddbf99b5a0 --- /dev/null +++ b/LlamaFactory/assets/thirdparty/discord.svg @@ -0,0 +1 @@ +LLaMA FactoryLLaMA Factory diff --git a/LlamaFactory/assets/thirdparty/dsw.svg b/LlamaFactory/assets/thirdparty/dsw.svg new file mode 100644 index 0000000000000000000000000000000000000000..a0df870cc11681ba3cd0b813146238d5b8f5d9b7 --- /dev/null +++ b/LlamaFactory/assets/thirdparty/dsw.svg @@ -0,0 +1,92 @@ + + + 最终方案备份 6 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/LlamaFactory/assets/thirdparty/lab4ai.svg b/LlamaFactory/assets/thirdparty/lab4ai.svg new file mode 100644 index 0000000000000000000000000000000000000000..ad83c1bbeb622074bea3beb2c1353baa7e82dff8 --- /dev/null +++ b/LlamaFactory/assets/thirdparty/lab4ai.svg @@ -0,0 +1,536 @@ + + + + + + + + + + + + + + + + + + + diff --git a/LlamaFactory/assets/thirdparty/online.svg b/LlamaFactory/assets/thirdparty/online.svg new file mode 100644 index 0000000000000000000000000000000000000000..e9051e3048ad1a8dc28d6e5f322854b8ff883672 --- /dev/null +++ b/LlamaFactory/assets/thirdparty/online.svg @@ -0,0 +1,789 @@ + + + + + + + + + + + + + + + + + + + + + + diff --git a/LlamaFactory/data/.ipynb_checkpoints/dataset_info-checkpoint.json b/LlamaFactory/data/.ipynb_checkpoints/dataset_info-checkpoint.json new file mode 100644 index 0000000000000000000000000000000000000000..ed083017485955ba54f5a5cbc676cc560d6d1e93 --- /dev/null +++ b/LlamaFactory/data/.ipynb_checkpoints/dataset_info-checkpoint.json @@ -0,0 +1,826 @@ +{ + "Markie_Voss_t34_d300_r0": { + "file_name": "Markie_Voss_t34_d300_r0.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t34_d0_r300": { + "file_name": "Markie_Voss_t34_d0_r300.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_d10000": { + "file_name": "Markie_Voss_d10000.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t0_d34_r300": { + "file_name": "Markie_Voss_t0_d34_r300.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t35_d286_r1": { + "file_name": "Markie_Voss_t35_d286_r1.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t119_d85_r1": { + "file_name": "Markie_Voss_t119_d85_r1.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t0_d119_r85": { + "file_name": "Markie_Voss_t0_d119_r85.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t119_d0_r85": { + "file_name": 
"Markie_Voss_t119_d0_r85.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t0_d100_r101": { + "file_name": "Markie_Voss_t0_d100_r101.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t100_d0_r101": { + "file_name": "Markie_Voss_t100_d0_r101.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t0_d70_r143": { + "file_name": "Markie_Voss_t0_d70_r143.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t70_d0_r143": { + "file_name": "Markie_Voss_t70_d0_r143.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t0_d35_r286": { + "file_name": "Markie_Voss_t0_d35_r286.jsonl", + "columns": { + "prompt": "text" + } + }, + "Markie_Voss_t35_d0_r286": { + "file_name": "Markie_Voss_t35_d0_r286.jsonl", + "columns": { + "prompt": "text" + } + }, + "identity": { + "file_name": "identity.json" + }, + "alpaca_en_demo": { + "file_name": "alpaca_en_demo.json" + }, + "alpaca_zh_demo": { + "file_name": "alpaca_zh_demo.json" + }, + "glaive_toolcall_en_demo": { + "file_name": "glaive_toolcall_en_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, + "glaive_toolcall_zh_demo": { + "file_name": "glaive_toolcall_zh_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, + "mllm_demo": { + "file_name": "mllm_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "mllm_audio_demo": { + "file_name": "mllm_audio_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "audios": "audios" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "mllm_video_demo": { + "file_name": "mllm_video_demo.json", + "formatting": "sharegpt", + 
"columns": { + "messages": "messages", + "videos": "videos" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "mllm_video_audio_demo": { + "file_name": "mllm_video_audio_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "videos": "videos", + "audios": "audios" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "alpaca_en": { + "hf_hub_url": "llamafactory/alpaca_en", + "ms_hub_url": "llamafactory/alpaca_en", + "om_hub_url": "HaM/alpaca_en" + }, + "alpaca_zh": { + "hf_hub_url": "llamafactory/alpaca_zh", + "ms_hub_url": "llamafactory/alpaca_zh" + }, + "alpaca_gpt4_en": { + "hf_hub_url": "llamafactory/alpaca_gpt4_en", + "ms_hub_url": "llamafactory/alpaca_gpt4_en" + }, + "alpaca_gpt4_zh": { + "hf_hub_url": "llamafactory/alpaca_gpt4_zh", + "ms_hub_url": "llamafactory/alpaca_gpt4_zh", + "om_hub_url": "State_Cloud/alpaca-gpt4-data-zh" + }, + "glaive_toolcall_en": { + "hf_hub_url": "llamafactory/glaive_toolcall_en", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, + "glaive_toolcall_zh": { + "hf_hub_url": "llamafactory/glaive_toolcall_zh", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, + "lima": { + "hf_hub_url": "llamafactory/lima", + "formatting": "sharegpt" + }, + "guanaco": { + "hf_hub_url": "JosephusCheung/GuanacoDataset", + "ms_hub_url": "AI-ModelScope/GuanacoDataset" + }, + "belle_2m": { + "hf_hub_url": "BelleGroup/train_2M_CN", + "ms_hub_url": "AI-ModelScope/train_2M_CN" + }, + "belle_1m": { + "hf_hub_url": "BelleGroup/train_1M_CN", + "ms_hub_url": "AI-ModelScope/train_1M_CN" + }, + "belle_0.5m": { + "hf_hub_url": "BelleGroup/train_0.5M_CN", + "ms_hub_url": "AI-ModelScope/train_0.5M_CN" + }, + "belle_dialog": { + "hf_hub_url": "BelleGroup/generated_chat_0.4M", + 
"ms_hub_url": "AI-ModelScope/generated_chat_0.4M" + }, + "belle_math": { + "hf_hub_url": "BelleGroup/school_math_0.25M", + "ms_hub_url": "AI-ModelScope/school_math_0.25M" + }, + "open_platypus": { + "hf_hub_url": "garage-bAInd/Open-Platypus", + "ms_hub_url": "AI-ModelScope/Open-Platypus" + }, + "codealpaca": { + "hf_hub_url": "sahil2801/CodeAlpaca-20k", + "ms_hub_url": "AI-ModelScope/CodeAlpaca-20k" + }, + "alpaca_cot": { + "hf_hub_url": "QingyiSi/Alpaca-CoT", + "ms_hub_url": "AI-ModelScope/Alpaca-CoT" + }, + "openorca": { + "hf_hub_url": "Open-Orca/OpenOrca", + "ms_hub_url": "AI-ModelScope/OpenOrca", + "columns": { + "prompt": "question", + "response": "response", + "system": "system_prompt" + } + }, + "slimorca": { + "hf_hub_url": "Open-Orca/SlimOrca", + "formatting": "sharegpt" + }, + "mathinstruct": { + "hf_hub_url": "TIGER-Lab/MathInstruct", + "ms_hub_url": "AI-ModelScope/MathInstruct", + "columns": { + "prompt": "instruction", + "response": "output" + } + }, + "firefly": { + "hf_hub_url": "YeungNLP/firefly-train-1.1M", + "columns": { + "prompt": "input", + "response": "target" + } + }, + "wikiqa": { + "hf_hub_url": "wiki_qa", + "columns": { + "prompt": "question", + "response": "answer" + } + }, + "webqa": { + "hf_hub_url": "suolyer/webqa", + "ms_hub_url": "AI-ModelScope/webqa", + "columns": { + "prompt": "input", + "response": "output" + } + }, + "webnovel": { + "hf_hub_url": "zxbsmk/webnovel_cn", + "ms_hub_url": "AI-ModelScope/webnovel_cn" + }, + "nectar_sft": { + "hf_hub_url": "AstraMindAI/SFT-Nectar", + "ms_hub_url": "AI-ModelScope/SFT-Nectar" + }, + "deepctrl": { + "ms_hub_url": "deepctrl/deepctrl-sft-data" + }, + "adgen_train": { + "hf_hub_url": "HasturOfficial/adgen", + "ms_hub_url": "AI-ModelScope/adgen", + "split": "train", + "columns": { + "prompt": "content", + "response": "summary" + } + }, + "adgen_eval": { + "hf_hub_url": "HasturOfficial/adgen", + "ms_hub_url": "AI-ModelScope/adgen", + "split": "validation", + "columns": { + "prompt": "content", 
+ "response": "summary" + } + }, + "sharegpt_hyper": { + "hf_hub_url": "totally-not-an-llm/sharegpt-hyperfiltered-3k", + "formatting": "sharegpt" + }, + "sharegpt4": { + "hf_hub_url": "shibing624/sharegpt_gpt4", + "ms_hub_url": "AI-ModelScope/sharegpt_gpt4", + "formatting": "sharegpt" + }, + "ultrachat_200k": { + "hf_hub_url": "HuggingFaceH4/ultrachat_200k", + "ms_hub_url": "AI-ModelScope/ultrachat_200k", + "split": "train_sft", + "formatting": "sharegpt", + "columns": { + "messages": "messages" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "infinity_instruct": { + "hf_hub_url": "BAAI/Infinity-Instruct", + "formatting": "sharegpt" + }, + "agent_instruct": { + "hf_hub_url": "THUDM/AgentInstruct", + "ms_hub_url": "ZhipuAI/AgentInstruct", + "formatting": "sharegpt" + }, + "lmsys_chat": { + "hf_hub_url": "lmsys/lmsys-chat-1m", + "ms_hub_url": "AI-ModelScope/lmsys-chat-1m", + "formatting": "sharegpt", + "columns": { + "messages": "conversation" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "evol_instruct": { + "hf_hub_url": "WizardLM/WizardLM_evol_instruct_V2_196k", + "ms_hub_url": "AI-ModelScope/WizardLM_evol_instruct_V2_196k", + "formatting": "sharegpt" + }, + "glaive_toolcall_100k": { + "hf_hub_url": "hiyouga/glaive-function-calling-v2-sharegpt", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "tools": "tools" + } + }, + "cosmopedia": { + "hf_hub_url": "HuggingFaceTB/cosmopedia", + "columns": { + "prompt": "prompt", + "response": "text" + } + }, + "stem_zh": { + "hf_hub_url": "hfl/stem_zh_instruction" + }, + "ruozhiba_gpt4": { + "hf_hub_url": "hfl/ruozhiba_gpt4_turbo" + }, + "neo_sft": { + "hf_hub_url": "m-a-p/neo_sft_phase2", + "formatting": "sharegpt" + }, + "magpie_pro_300k": { + "hf_hub_url": "Magpie-Align/Magpie-Pro-300K-Filtered", + "formatting": "sharegpt" + }, + 
"magpie_ultra": { + "hf_hub_url": "argilla/magpie-ultra-v0.1", + "columns": { + "prompt": "instruction", + "response": "response" + } + }, + "web_instruct": { + "hf_hub_url": "TIGER-Lab/WebInstructSub", + "columns": { + "prompt": "question", + "response": "answer" + } + }, + "openo1_sft": { + "hf_hub_url": "llamafactory/OpenO1-SFT", + "ms_hub_url": "llamafactory/OpenO1-SFT", + "columns": { + "prompt": "prompt", + "response": "response" + } + }, + "open_thoughts": { + "hf_hub_url": "llamafactory/OpenThoughts-114k", + "formatting": "sharegpt", + "columns": { + "messages": "messages" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant", + "system_tag": "system" + } + }, + "open_r1_math": { + "hf_hub_url": "llamafactory/OpenR1-Math-94k", + "formatting": "sharegpt", + "columns": { + "messages": "messages" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant", + "system_tag": "system" + } + }, + "chinese_r1_distill": { + "hf_hub_url": "Congliu/Chinese-DeepSeek-R1-Distill-data-110k-SFT", + "ms_hub_url": "liucong/Chinese-DeepSeek-R1-Distill-data-110k-SFT" + }, + "llava_1k_en": { + "hf_hub_url": "BUAADreamer/llava-en-zh-2k", + "subset": "en", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "llava_1k_zh": { + "hf_hub_url": "BUAADreamer/llava-en-zh-2k", + "subset": "zh", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "llava_150k_en": { + "hf_hub_url": "BUAADreamer/llava-en-zh-300k", + "subset": "en", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { 
+ "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "llava_150k_zh": { + "hf_hub_url": "BUAADreamer/llava-en-zh-300k", + "subset": "zh", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "pokemon_cap": { + "hf_hub_url": "llamafactory/pokemon-gpt4o-captions", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "images": "images" + } + }, + "mllm_pt_demo": { + "hf_hub_url": "BUAADreamer/mllm_pt_demo", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "images": "images" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "oasst_de": { + "hf_hub_url": "mayflowergmbh/oasst_de" + }, + "dolly_15k_de": { + "hf_hub_url": "mayflowergmbh/dolly-15k_de" + }, + "alpaca-gpt4_de": { + "hf_hub_url": "mayflowergmbh/alpaca-gpt4_de" + }, + "openschnabeltier_de": { + "hf_hub_url": "mayflowergmbh/openschnabeltier_de" + }, + "evol_instruct_de": { + "hf_hub_url": "mayflowergmbh/evol-instruct_de" + }, + "dolphin_de": { + "hf_hub_url": "mayflowergmbh/dolphin_de" + }, + "booksum_de": { + "hf_hub_url": "mayflowergmbh/booksum_de" + }, + "airoboros_de": { + "hf_hub_url": "mayflowergmbh/airoboros-3.0_de" + }, + "ultrachat_de": { + "hf_hub_url": "mayflowergmbh/ultra-chat_de" + }, + "dlr_web": { + "hf_hub_url": "Attention1115/DLR-Web", + "split": "full", + "columns": { + "prompt": "question", + "response": "response" + } + }, + "dpo_en_demo": { + "file_name": "dpo_en_demo.json", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "dpo_zh_demo": { + "file_name": "dpo_zh_demo.json", + "ranking": true, + "formatting": "sharegpt", + "columns": { + 
"messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "dpo_mix_en": { + "hf_hub_url": "llamafactory/DPO-En-Zh-20k", + "subset": "en", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "dpo_mix_zh": { + "hf_hub_url": "llamafactory/DPO-En-Zh-20k", + "subset": "zh", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "ultrafeedback": { + "hf_hub_url": "llamafactory/ultrafeedback_binarized", + "ms_hub_url": "llamafactory/ultrafeedback_binarized", + "ranking": true, + "columns": { + "prompt": "instruction", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "coig_p": { + "hf_hub_url": "m-a-p/COIG-P", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } + }, + "rlhf_v": { + "hf_hub_url": "llamafactory/RLHF-V", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected", + "images": "images" + } + }, + "vlfeedback": { + "hf_hub_url": "Zhihui/VLFeedback", + "ranking": true, + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected", + "images": "images" + } + }, + "rlaif_v": { + "hf_hub_url": "openbmb/RLAIF-V-Dataset", + "ranking": true, + "columns": { + "prompt": "question", + "chosen": "chosen", + "rejected": "rejected", + "images": "image" + } + }, + "orca_pairs": { + "hf_hub_url": "Intel/orca_dpo_pairs", + "ranking": true, + "columns": { + "prompt": "question", + "chosen": "chosen", + "rejected": "rejected", + "system": "system" + } + }, + "nectar_rm": { + "hf_hub_url": "AstraMindAI/RLAIF-Nectar", + "ms_hub_url": "AI-ModelScope/RLAIF-Nectar", + "ranking": true + }, + "orca_dpo_de": { + "hf_hub_url": 
"mayflowergmbh/intel_orca_dpo_pairs_de", + "ranking": true + }, + "kto_en_demo": { + "file_name": "kto_en_demo.json", + "formatting": "sharegpt", + "columns": { + "messages": "messages", + "kto_tag": "label" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "kto_mix_en": { + "hf_hub_url": "argilla/kto-mix-15k", + "formatting": "sharegpt", + "columns": { + "messages": "completion", + "kto_tag": "label" + }, + "tags": { + "role_tag": "role", + "content_tag": "content", + "user_tag": "user", + "assistant_tag": "assistant" + } + }, + "ultrafeedback_kto": { + "hf_hub_url": "argilla/ultrafeedback-binarized-preferences-cleaned-kto", + "ms_hub_url": "AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto", + "columns": { + "prompt": "prompt", + "response": "completion", + "kto_tag": "label" + } + }, + "wiki_demo": { + "file_name": "wiki_demo.txt", + "columns": { + "prompt": "text" + } + }, + "c4_demo": { + "file_name": "c4_demo.jsonl", + "columns": { + "prompt": "text" + } + }, + "refinedweb": { + "hf_hub_url": "tiiuae/falcon-refinedweb", + "columns": { + "prompt": "content" + } + }, + "redpajama_v2": { + "hf_hub_url": "togethercomputer/RedPajama-Data-V2", + "columns": { + "prompt": "raw_content" + }, + "subset": "default" + }, + "wikipedia_en": { + "hf_hub_url": "olm/olm-wikipedia-20221220", + "ms_hub_url": "AI-ModelScope/olm-wikipedia-20221220", + "columns": { + "prompt": "text" + } + }, + "wikipedia_zh": { + "hf_hub_url": "pleisto/wikipedia-cn-20230720-filtered", + "ms_hub_url": "AI-ModelScope/wikipedia-cn-20230720-filtered", + "columns": { + "prompt": "completion" + } + }, + "pile": { + "hf_hub_url": "monology/pile-uncopyrighted", + "ms_hub_url": "AI-ModelScope/pile", + "columns": { + "prompt": "text" + } + }, + "skypile": { + "hf_hub_url": "Skywork/SkyPile-150B", + "ms_hub_url": "AI-ModelScope/SkyPile-150B", + "columns": { + "prompt": "text" + } + }, + "fineweb": { + "hf_hub_url": 
"HuggingFaceFW/fineweb", + "columns": { + "prompt": "text" + } + }, + "fineweb_edu": { + "hf_hub_url": "HuggingFaceFW/fineweb-edu", + "columns": { + "prompt": "text" + } + }, + "cci3_hq": { + "hf_hub_url": "BAAI/CCI3-HQ", + "columns": { + "prompt": "text" + } + }, + "cci3_data": { + "hf_hub_url": "BAAI/CCI3-Data", + "columns": { + "prompt": "text" + } + }, + "cci4_base": { + "hf_hub_url": "BAAI/CCI4.0-M2-Base-v1", + "columns": { + "prompt": "text" + } + }, + "cci4_cot": { + "hf_hub_url": "BAAI/CCI4.0-M2-CoT-v1", + "columns": { + "prompt": "text" + } + }, + "cci4_extra": { + "hf_hub_url": "BAAI/CCI4.0-M2-Extra-v1", + "columns": { + "prompt": "text" + } + }, + "the_stack": { + "hf_hub_url": "bigcode/the-stack", + "ms_hub_url": "AI-ModelScope/the-stack", + "columns": { + "prompt": "content" + } + }, + "starcoder_python": { + "hf_hub_url": "bigcode/starcoderdata", + "ms_hub_url": "AI-ModelScope/starcoderdata", + "columns": { + "prompt": "content" + }, + "folder": "python" + } +} diff --git a/LlamaFactory/data/README.md b/LlamaFactory/data/README.md new file mode 100644 index 0000000000000000000000000000000000000000..e43e0eed0b48f6ed0054382346311550a2a1a927 --- /dev/null +++ b/LlamaFactory/data/README.md @@ -0,0 +1,475 @@ +The [dataset_info.json](dataset_info.json) contains all available datasets. If you are using a custom dataset, please **make sure** to add a *dataset description* in `dataset_info.json` and specify `dataset: dataset_name` before training to use it. + +The `dataset_info.json` file should be put in the `dataset_dir` directory. You can change `dataset_dir` to use another directory. The default value is `./data`. + +Currently we support datasets in **alpaca** and **sharegpt** format. Allowed file types include json, jsonl, csv, parquet, arrow. + +```json +"dataset_name": { + "hf_hub_url": "the name of the dataset repository on the Hugging Face hub. 
(if specified, ignore script_url, file_name and cloud_file_name)", + "ms_hub_url": "the name of the dataset repository on the Model Scope hub. (if specified, ignore script_url, file_name and cloud_file_name)", + "script_url": "the name of the directory containing a dataset loading script. (if specified, ignore file_name and cloud_file_name)", + "cloud_file_name": "the name of the dataset file in s3/gcs cloud storage. (if specified, ignore file_name)", + "file_name": "the name of the dataset folder or dataset file in this directory. (required if above are not specified)", + "formatting": "the format of the dataset. (optional, default: alpaca, can be chosen from {alpaca, sharegpt})", + "ranking": "whether the dataset is a preference dataset or not. (default: False)", + "subset": "the name of the subset. (optional, default: None)", + "split": "the name of dataset split to be used. (optional, default: train)", + "folder": "the name of the folder of the dataset repository on the Hugging Face hub. (optional, default: None)", + "num_samples": "the number of samples in the dataset to be used. (optional, default: None)", + "columns (optional)": { + "prompt": "the column name in the dataset containing the prompts. (default: instruction)", + "query": "the column name in the dataset containing the queries. (default: input)", + "response": "the column name in the dataset containing the responses. (default: output)", + "history": "the column name in the dataset containing the histories. (default: None)", + "messages": "the column name in the dataset containing the messages. (default: conversations)", + "system": "the column name in the dataset containing the system prompts. (default: None)", + "tools": "the column name in the dataset containing the tool description. (default: None)", + "images": "the column name in the dataset containing the image inputs. (default: None)", + "videos": "the column name in the dataset containing the videos inputs. 
(default: None)", + "audios": "the column name in the dataset containing the audios inputs. (default: None)", + "chosen": "the column name in the dataset containing the chosen answers. (default: None)", + "rejected": "the column name in the dataset containing the rejected answers. (default: None)", + "kto_tag": "the column name in the dataset containing the kto tags. (default: None)" + }, + "tags (optional, used for the sharegpt format)": { + "role_tag": "the key in the message represents the identity. (default: from)", + "content_tag": "the key in the message represents the content. (default: value)", + "user_tag": "the value of the role_tag represents the user. (default: human)", + "assistant_tag": "the value of the role_tag represents the assistant. (default: gpt)", + "observation_tag": "the value of the role_tag represents the tool results. (default: observation)", + "function_tag": "the value of the role_tag represents the function call. (default: function_call)", + "system_tag": "the value of the role_tag represents the system prompt. (default: system, can override system column)" + } +} +``` + +## Alpaca Format + +### Supervised Fine-Tuning Dataset + +* [Example dataset](alpaca_en_demo.json) + +In supervised fine-tuning, the `instruction` column will be concatenated with the `input` column and used as the user prompt, then the user prompt would be `instruction\ninput`. The `output` column represents the model response. + +For reasoning models, if the dataset contains chain-of-thought (CoT), the CoT needs to be placed in the model responses, such as `cotoutput`. + +The `system` column will be used as the system prompt if specified. + +The `history` column is a list consisting of string tuples representing prompt-response pairs in the history messages. Note that the responses in the history **will also be learned by the model** in supervised fine-tuning. 
+ +```json +[ + { + "instruction": "user instruction (required)", + "input": "user input (optional)", + "output": "model response (required)", + "system": "system prompt (optional)", + "history": [ + ["user instruction in the first round (optional)", "model response in the first round (optional)"], + ["user instruction in the second round (optional)", "model response in the second round (optional)"] + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "instruction", + "query": "input", + "response": "output", + "system": "system", + "history": "history" + } +} +``` + +> [!TIP] +> If the model has reasoning capabilities (e.g. Qwen3) but the dataset does not contain chain-of-thought (CoT), LLaMA-Factory will automatically add empty CoT to the data. When `enable_thinking` is `True` (slow thinking, by default), the empty CoT will be added to the model responses and loss computation will be considered; otherwise (fast thinking), it will be added to the user prompts and loss computation will be ignored. Please keep the `enable_thinking` parameter consistent during training and inference. +> +> If you want to train data containing CoT with slow thinking and data without CoT with fast thinking, you can set `enable_thinking` to `None`. However, this feature is relatively complicated and should be used with caution. + +### Pre-training Dataset + +- [Example dataset](c4_demo.jsonl) + +In pre-training, only the `text` column will be used for model learning. + +```json +[ + {"text": "document"}, + {"text": "document"} +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "columns": { + "prompt": "text" + } +} +``` + +### Preference Dataset + +Preference datasets are used for reward modeling, DPO training, ORPO and SimPO training. 
+ +It requires a better response in `chosen` column and a worse response in `rejected` column. + +```json +[ + { + "instruction": "user instruction (required)", + "input": "user input (optional)", + "chosen": "chosen answer (required)", + "rejected": "rejected answer (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "ranking": true, + "columns": { + "prompt": "instruction", + "query": "input", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +An additional column `kto_tag` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Image Dataset + +An additional column `images` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Video Dataset + +An additional column `videos` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +### Multimodal Audio Dataset + +An additional column `audios` is required. Please refer to the [sharegpt](#sharegpt-format) format for details. + +## Sharegpt Format + +### Supervised Fine-Tuning Dataset + +- [Example dataset](glaive_toolcall_en_demo.json) + +Compared to the alpaca format, the sharegpt format allows the datasets have **more roles**, such as human, gpt, observation and function. They are presented in a list of objects in the `conversations` column. + +Note that the human and observation should appear in odd positions, while gpt and function should appear in even positions. The gpt and function will be learned by the model. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "function_call", + "value": "tool arguments" + }, + { + "from": "observation", + "value": "tool result" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "system": "system prompt (optional)", + "tools": "tool description (optional)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "system": "system", + "tools": "tools" + } +} +``` + +### Pre-training Dataset + +Not yet supported, please use the [alpaca](#alpaca-format) format. + +### Preference Dataset + +- [Example dataset](dpo_en_demo.json) + +Preference datasets in sharegpt format also require a better message in `chosen` column and a worse message in `rejected` column. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + }, + { + "from": "human", + "value": "user instruction" + } + ], + "chosen": { + "from": "gpt", + "value": "chosen answer (required)" + }, + "rejected": { + "from": "gpt", + "value": "rejected answer (required)" + } + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "ranking": true, + "columns": { + "messages": "conversations", + "chosen": "chosen", + "rejected": "rejected" + } +} +``` + +### KTO Dataset + +- [Example dataset](kto_en_demo.json) + +KTO datasets require a extra `kto_tag` column containing the boolean human feedback. 
+ +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "kto_tag": "human feedback [true/false] (required)" + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "kto_tag": "kto_tag" + } +} +``` + +### Multimodal Image Dataset + +- [Example dataset](mllm_demo.json) + +Multimodal image datasets require an `images` column containing the paths to the input images. + +The number of images should be identical to the `` tokens in the conversations. + +```json +[ + { + "conversations": [ + { + "from": "human", + "value": "user instruction" + }, + { + "from": "gpt", + "value": "model response" + } + ], + "images": [ + "image path (required)" + ] + } +] +``` + +Regarding the above dataset, the *dataset description* in `dataset_info.json` should be: + +```json +"dataset_name": { + "file_name": "data.json", + "formatting": "sharegpt", + "columns": { + "messages": "conversations", + "images": "images" + } +} +``` + +### Multimodal Video Dataset + +- [Example dataset](mllm_video_demo.json) + +Multimodal video datasets require a `videos` column containing the paths to the input videos. + +The number of videos should be identical to the `