Upload 28 files
Browse files- .gitattributes +43 -35
- .github/workflows/train_model.yml +44 -0
- .vscode/settings.json +9 -0
- LICENSE +21 -0
- README.md +141 -0
- agent.ps1 +292 -0
- agent.sh +73 -0
- ai-agent/system_prompt.mdx +15 -0
- ai-agent/task_context.md +1 -0
- ai-agent/train.py +15 -0
- config.json +24 -0
- data_collector.ps1 +49 -0
- data_collector.sh +10 -0
- model_evaluator.ps1 +49 -0
- model_evaluator.sh +10 -0
- model_trainer.ps1 +52 -0
- model_trainer.sh +10 -0
- requirements.txt +29 -0
- scripts/data_collector.py +186 -0
- scripts/model_evaluator.py +216 -0
- scripts/model_trainer.py +232 -0
- scripts/tokenizer_trainer.py +160 -0
- special_tokens_map.json +32 -0
- start.sh +438 -0
- tokenizer.json +3 -0
- tokenizer_config.json +208 -0
- tools.json +97 -0
- train.py +14 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,43 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
tokenizer.json filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
assets/ball.gif filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
assets/benchmark.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
assets/count.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
assets/diamond.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
assets/param-aime2024.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
assets/param-lcb.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
assets/writing.png filter=lfs diff=lfs merge=lfs -text
|
.github/workflows/train_model.yml
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: Train Bengali-Code LLM Model
|
| 2 |
+
|
| 3 |
+
on:
|
| 4 |
+
schedule:
|
| 5 |
+
- cron: '0 0 * * *' # Run daily at midnight
|
| 6 |
+
workflow_dispatch: # Allow manual triggers
|
| 7 |
+
|
| 8 |
+
jobs:
|
| 9 |
+
train:
|
| 10 |
+
runs-on: ubuntu-latest
|
| 11 |
+
steps:
|
| 12 |
+
- uses: actions/checkout@v4
|
| 13 |
+
|
| 14 |
+
- name: Set up Python
|
| 15 |
+
uses: actions/setup-python@v5
|
| 16 |
+
with:
|
| 17 |
+
python-version: '3.10'
|
| 18 |
+
|
| 19 |
+
- name: Install dependencies
|
| 20 |
+
run: |
|
| 21 |
+
python -m pip install --upgrade pip
|
| 22 |
+
pip install transformers datasets sentencepiece accelerate torch wandb
|
| 23 |
+
|
| 24 |
+
- name: Data Collection
|
| 25 |
+
run: python scripts/data_collector.py
|
| 26 |
+
env:
|
| 27 |
+
HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
|
| 28 |
+
|
| 29 |
+
- name: Train Tokenizer
|
| 30 |
+
run: python scripts/tokenizer_trainer.py
|
| 31 |
+
|
| 32 |
+
- name: Train Model
|
| 33 |
+
run: python scripts/model_trainer.py
|
| 34 |
+
env:
|
| 35 |
+
WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
|
| 36 |
+
|
| 37 |
+
- name: Evaluate Model
|
| 38 |
+
run: python scripts/model_evaluator.py
|
| 39 |
+
|
| 40 |
+
- name: Upload Model Artifacts
|
| 41 |
+
uses: actions/upload-artifact@v3
|
| 42 |
+
with:
|
| 43 |
+
name: model-weights
|
| 44 |
+
path: outputs/models/
|
.vscode/settings.json
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"editor.inlineSuggest.enabled": true,
|
| 3 |
+
"editor.quickSuggestions": {
|
| 4 |
+
"other": "inline",
|
| 5 |
+
"comments": true,
|
| 6 |
+
"strings": true
|
| 7 |
+
},
|
| 8 |
+
"editor.quickSuggestionsDelay": 100
|
| 9 |
+
}
|
LICENSE
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
MIT License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2025 Bengali-Code LLM Project Contributors
|
| 4 |
+
|
| 5 |
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
| 6 |
+
of this software and associated documentation files (the "Software"), to deal
|
| 7 |
+
in the Software without restriction, including without limitation the rights
|
| 8 |
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
| 9 |
+
copies of the Software, and to permit persons to whom the Software is
|
| 10 |
+
furnished to do so, subject to the following conditions:
|
| 11 |
+
|
| 12 |
+
The above copyright notice and this permission notice shall be included in all
|
| 13 |
+
copies or substantial portions of the Software.
|
| 14 |
+
|
| 15 |
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
| 16 |
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
| 17 |
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
| 18 |
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
| 19 |
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
| 20 |
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
| 21 |
+
SOFTWARE.
|
README.md
ADDED
|
@@ -0,0 +1,141 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Bengali-Code LLM Training Pipeline
|
| 2 |
+
|
| 3 |
+
A comprehensive pipeline for training a Bengali language model specialized in code understanding and generation. The model is fine-tuned on Bengali programming tutorials, documentation, and code examples.
|
| 4 |
+
|
| 5 |
+
## 🌟 Features
|
| 6 |
+
|
| 7 |
+
- Automated data collection from Bengali Wikipedia and Prothom Alo
|
| 8 |
+
- Custom tokenizer training with SentencePiece for Bengali text and code
|
| 9 |
+
- Model fine-tuning using TinyLlama base model
|
| 10 |
+
- Comprehensive evaluation suite for Bengali code generation
|
| 11 |
+
- GitHub Actions workflow for automated training
|
| 12 |
+
- Weights & Biases integration for experiment tracking
|
| 13 |
+
|
| 14 |
+
## 📋 Requirements
|
| 15 |
+
|
| 16 |
+
- Python 3.10 or higher
|
| 17 |
+
- CUDA-capable GPU (recommended)
|
| 18 |
+
- 16GB+ RAM
|
| 19 |
+
- Internet connection for data collection
|
| 20 |
+
|
| 21 |
+
## 🚀 Quick Start
|
| 22 |
+
|
| 23 |
+
1. Clone the repository:
|
| 24 |
+
```bash
|
| 25 |
+
git clone https://github.com/yourusername/bengali-code-llm.git
|
| 26 |
+
cd bengali-code-llm
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
2. Install dependencies:
|
| 30 |
+
```bash
|
| 31 |
+
pip install -r requirements.txt
|
| 32 |
+
```
|
| 33 |
+
|
| 34 |
+
3. Set up environment variables:
|
| 35 |
+
```bash
|
| 36 |
+
export HUGGINGFACE_TOKEN="your_token_here"
|
| 37 |
+
export WANDB_API_KEY="your_wandb_key_here"
|
| 38 |
+
```
|
| 39 |
+
|
| 40 |
+
4. Run the complete pipeline:
|
| 41 |
+
```bash
|
| 42 |
+
# Collect data
|
| 43 |
+
python scripts/data_collector.py
|
| 44 |
+
|
| 45 |
+
# Train tokenizer
|
| 46 |
+
python scripts/tokenizer_trainer.py
|
| 47 |
+
|
| 48 |
+
# Train model
|
| 49 |
+
python scripts/model_trainer.py
|
| 50 |
+
|
| 51 |
+
# Evaluate model
|
| 52 |
+
python scripts/model_evaluator.py
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## 🏗️ Pipeline Components
|
| 56 |
+
|
| 57 |
+
### Data Collection (`scripts/data_collector.py`)
|
| 58 |
+
- Scrapes Bengali text from Wikipedia and Prothom Alo
|
| 59 |
+
- Implements rate limiting and error handling
|
| 60 |
+
- Outputs processed data in JSON format
|
| 61 |
+
|
| 62 |
+
### Tokenizer Training (`scripts/tokenizer_trainer.py`)
|
| 63 |
+
- Uses SentencePiece for tokenizer training
|
| 64 |
+
- Custom vocabulary with Bengali and code tokens
|
| 65 |
+
- Generates HuggingFace-compatible tokenizer files
|
| 66 |
+
|
| 67 |
+
### Model Training (`scripts/model_trainer.py`)
|
| 68 |
+
- Fine-tunes TinyLlama model
|
| 69 |
+
- Implements efficient training with gradient accumulation
|
| 70 |
+
- Supports mixed precision training
|
| 71 |
+
- Integrates with Weights & Biases for tracking
|
| 72 |
+
|
| 73 |
+
### Model Evaluation (`scripts/model_evaluator.py`)
|
| 74 |
+
- Comprehensive evaluation suite
|
| 75 |
+
- Tests code generation capabilities
|
| 76 |
+
- Measures BLEU and ROUGE scores
|
| 77 |
+
- Generates detailed evaluation reports
|
| 78 |
+
|
| 79 |
+
## 📊 Training Metrics
|
| 80 |
+
|
| 81 |
+
The training progress can be monitored through Weights & Biases:
|
| 82 |
+
- Loss curves
|
| 83 |
+
- Evaluation metrics
|
| 84 |
+
- Generated samples
|
| 85 |
+
- Resource utilization
|
| 86 |
+
|
| 87 |
+
## 🔄 GitHub Actions Workflow
|
| 88 |
+
|
| 89 |
+
The repository includes an automated training pipeline that:
|
| 90 |
+
- Runs daily to incorporate new data
|
| 91 |
+
- Executes the complete training pipeline
|
| 92 |
+
- Uploads model artifacts
|
| 93 |
+
- Can be triggered manually
|
| 94 |
+
|
| 95 |
+
## 📁 Directory Structure
|
| 96 |
+
|
| 97 |
+
```
|
| 98 |
+
bengali-code-llm/
|
| 99 |
+
├── .github/
|
| 100 |
+
│ └── workflows/
|
| 101 |
+
│ └── train_model.yml
|
| 102 |
+
├── scripts/
|
| 103 |
+
│ ├── data_collector.py
|
| 104 |
+
│ ├── tokenizer_trainer.py
|
| 105 |
+
│ ├── model_trainer.py
|
| 106 |
+
│ └── model_evaluator.py
|
| 107 |
+
├── data/
|
| 108 |
+
│ └── raw/
|
| 109 |
+
├── outputs/
|
| 110 |
+
│ ├── tokenizer/
|
| 111 |
+
│ ├── model/
|
| 112 |
+
│ └── evaluation/
|
| 113 |
+
├── requirements.txt
|
| 114 |
+
└── README.md
|
| 115 |
+
```
|
| 116 |
+
|
| 117 |
+
## 🎯 Model Performance
|
| 118 |
+
|
| 119 |
+
The model is evaluated on various tasks:
|
| 120 |
+
- Code generation in Bengali
|
| 121 |
+
- Code explanation and documentation
|
| 122 |
+
- Error detection and correction
|
| 123 |
+
- Algorithm explanation
|
| 124 |
+
|
| 125 |
+
## 📜 License
|
| 126 |
+
|
| 127 |
+
This project is licensed under the MIT License - see the LICENSE file for details.
|
| 128 |
+
|
| 129 |
+
## 🤝 Contributing
|
| 130 |
+
|
| 131 |
+
Contributions are welcome! Please feel free to submit issues and pull requests.
|
| 132 |
+
|
| 133 |
+
## 📧 Contact
|
| 134 |
+
|
| 135 |
+
For questions and feedback, please open an issue in the repository.
|
| 136 |
+
|
| 137 |
+
## 🙏 Acknowledgments
|
| 138 |
+
|
| 139 |
+
- TinyLlama team for the base model
|
| 140 |
+
- HuggingFace for the Transformers library
|
| 141 |
+
- Weights & Biases for experiment tracking
|
agent.ps1
ADDED
|
@@ -0,0 +1,292 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration
# SECURITY: an API key was previously hard-coded here and committed to the
# repository — any such key is leaked and must be revoked/rotated.
# Read the key from the environment instead and fail fast when it is absent.
$API_KEY = $env:GROQ_API_KEY
if ([string]::IsNullOrWhiteSpace($API_KEY)) {
    Write-Host "❌ GROQ_API_KEY environment variable is not set." -ForegroundColor Red
    exit 1
}
$MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
$AGENT_COUNT = 2

# Working directories for the agent loop (prompt, task context, logs).
$WORKDIR = Join-Path $PSScriptRoot "ai-agent"
$LOGDIR = Join-Path $WORKDIR "outputs\logs"
$PROMPT_FILE = Join-Path $WORKDIR "system_prompt.mdx"
$TASK_FILE = Join-Path $WORKDIR "task_context.md"
|
| 10 |
+
|
| 11 |
+
# Create directories
|
| 12 |
+
New-Item -ItemType Directory -Force -Path $LOGDIR | Out-Null
|
| 13 |
+
New-Item -ItemType Directory -Force -Path $WORKDIR | Out-Null
|
| 14 |
+
|
| 15 |
+
# Initialize prompt file if missing
|
| 16 |
+
if (-not (Test-Path $PROMPT_FILE)) {
|
| 17 |
+
$initialPrompt = '<Plan>' + [Environment]::NewLine
|
| 18 |
+
$initialPrompt += 'You are AI coding agents focused on building a Bengali code + NLP LLM.' + [Environment]::NewLine
|
| 19 |
+
$initialPrompt += 'Output commands inside <Actions> blocks, analyses inside <Task> blocks.' + [Environment]::NewLine
|
| 20 |
+
$initialPrompt += 'After command execution, output results inside <TaskResult> blocks.' + [Environment]::NewLine
|
| 21 |
+
$initialPrompt += '</Plan>' + [Environment]::NewLine + [Environment]::NewLine
|
| 22 |
+
$initialPrompt += '<Actions>' + [Environment]::NewLine
|
| 23 |
+
$initialPrompt += 'echo "Starting initial training setup..."' + [Environment]::NewLine
|
| 24 |
+
$initialPrompt += '# Dummy start command for training' + [Environment]::NewLine
|
| 25 |
+
$initialPrompt += 'echo "Training started."' + [Environment]::NewLine
|
| 26 |
+
$initialPrompt += '</Actions>' + [Environment]::NewLine + [Environment]::NewLine
|
| 27 |
+
$initialPrompt += '<Task>' + [Environment]::NewLine
|
| 28 |
+
$initialPrompt += 'Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.' + [Environment]::NewLine
|
| 29 |
+
$initialPrompt += '</Task>'
|
| 30 |
+
|
| 31 |
+
Set-Content -Path $PROMPT_FILE -Value $initialPrompt
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
# Initialize task file if missing
|
| 35 |
+
if (-not (Test-Path $TASK_FILE)) {
|
| 36 |
+
"" | Set-Content $TASK_FILE
|
| 37 |
+
}
|
| 38 |
+
|
| 39 |
+
# Copy training script if missing
|
| 40 |
+
$TRAIN_SCRIPT = Join-Path $WORKDIR "train.py"
|
| 41 |
+
if (-not (Test-Path $TRAIN_SCRIPT)) {
|
| 42 |
+
Copy-Item -Path (Join-Path $PSScriptRoot "train.py") -Destination $TRAIN_SCRIPT
|
| 43 |
+
}
|
| 44 |
+
|
| 45 |
+
# Function to call Groq API with streaming
|
| 46 |
+
# Call the Groq chat-completions API and return the assistant's reply text,
# or $null on failure.
#
# BUG FIXES vs. previous version:
#  - `stream = $true` is useless with Invoke-RestMethod (it cannot consume
#    SSE streams); request a complete response instead.
#  - Non-streaming responses carry the text in choices[0].message.content;
#    choices[0].delta.content only exists in streaming chunks and was $null.
#  - ConvertTo-Json defaults to -Depth 2, which truncated the nested
#    `messages` array; pass an explicit depth.
function Invoke-GroqAPI {
    param (
        [string]$Prompt,
        [string]$AgentId
    )

    $headers = @{
        "Authorization" = "Bearer " + $API_KEY
        "Content-Type"  = "application/json"
    }

    $body = @{
        model                 = $MODEL
        messages              = @(
            @{
                role    = "system"
                content = $Prompt
            }
        )
        temperature           = 1
        max_completion_tokens = 1024
        top_p                 = 1
        stream                = $false
    } | ConvertTo-Json -Depth 5

    try {
        $apiUrl = "https://api.groq.com/openai/v1/chat/completions"
        $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"

        $fullResponse = $response.choices[0].message.content
        if ($null -ne $fullResponse) {
            Write-Host ("🤖 Agent " + $AgentId + ": " + $fullResponse)
        }
        return $fullResponse
    }
    catch {
        Write-Host "❌ Error calling Groq API: $_" -ForegroundColor Red
        return $null
    }
}
|
| 91 |
+
|
| 92 |
+
# Function to extract and run actions
|
| 93 |
+
function Invoke-Actions {
|
| 94 |
+
param (
|
| 95 |
+
[string]$Response,
|
| 96 |
+
[string]$AgentId
|
| 97 |
+
)
|
| 98 |
+
|
| 99 |
+
if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
|
| 100 |
+
$actions = $matches[1].Trim()
|
| 101 |
+
if ($actions) {
|
| 102 |
+
Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
|
| 103 |
+
$actionScriptName = "run_actions_" + $AgentId + ".ps1"
|
| 104 |
+
$actionScript = Join-Path $WORKDIR $actionScriptName
|
| 105 |
+
$actions | Set-Content $actionScript
|
| 106 |
+
|
| 107 |
+
$logFileName = "actions_agent_" + $AgentId + ".log"
|
| 108 |
+
$logFile = Join-Path $LOGDIR $logFileName
|
| 109 |
+
& $actionScript *>&1 | Tee-Object -Path $logFile
|
| 110 |
+
}
|
| 111 |
+
}
|
| 112 |
+
else {
|
| 113 |
+
Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
|
| 114 |
+
$logFileName = "actions_agent_" + $AgentId + ".log"
|
| 115 |
+
"" | Set-Content (Join-Path $LOGDIR $logFileName)
|
| 116 |
+
}
|
| 117 |
+
}
|
| 118 |
+
|
| 119 |
+
# Function to append task result
|
| 120 |
+
function Add-TaskResult {
|
| 121 |
+
param (
|
| 122 |
+
[string]$AgentId
|
| 123 |
+
)
|
| 124 |
+
|
| 125 |
+
$logFileName = "actions_agent_" + $AgentId + ".log"
|
| 126 |
+
$logFile = Join-Path $LOGDIR $logFileName
|
| 127 |
+
if (Test-Path $logFile) {
|
| 128 |
+
$result = Get-Content $logFile -Tail 50 | Out-String
|
| 129 |
+
$taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
|
| 130 |
+
$taskResult += $result
|
| 131 |
+
$taskResult += '</TaskResult>'
|
| 132 |
+
|
| 133 |
+
Add-Content -Path $TASK_FILE -Value $taskResult
|
| 134 |
+
Write-Host ("✍️ Agent " + $AgentId + " appended <TaskResult>.")
|
| 135 |
+
}
|
| 136 |
+
}
|
| 137 |
+
|
| 138 |
+
# Main loop with multi-agent coordination
|
| 139 |
+
Write-Host "🚀 Starting multi-agent AI loop with $AGENT_COUNT agents..."
|
| 140 |
+
|
| 141 |
+
$stopLoop = $false
|
| 142 |
+
while (-not $stopLoop) {
|
| 143 |
+
$promptCombined = Get-Content $PROMPT_FILE, $TASK_FILE | Out-String
|
| 144 |
+
|
| 145 |
+
# Create array to hold jobs
|
| 146 |
+
$jobs = @()
|
| 147 |
+
|
| 148 |
+
# Start agents in parallel
|
| 149 |
+
1..$AGENT_COUNT | ForEach-Object {
|
| 150 |
+
$agentId = $_
|
| 151 |
+
$workdir = $WORKDIR
|
| 152 |
+
$logdir = $LOGDIR
|
| 153 |
+
$apiKey = $API_KEY
|
| 154 |
+
$model = $MODEL
|
| 155 |
+
|
| 156 |
+
$jobs += Start-Job -ScriptBlock {
|
| 157 |
+
param($promptCombined, $agentId, $workdir, $logdir, $apiKey, $model)
|
| 158 |
+
|
| 159 |
+
# Recreate functions in job scope
|
| 160 |
+
# Job-scope copy of Invoke-GroqAPI: call the Groq API and return the reply
# text, or "<Done>" on failure so the outer loop stops.
#
# SECURITY FIXES vs. previous version:
#  - Removed the write of a hard-coded IP into the system hosts file
#    (%SystemRoot%\System32\drivers\etc\hosts) — silent DNS override of an
#    API endpoint is indistinguishable from traffic hijacking.
#  - Removed `ServerCertificateValidationCallback = {$true}`, which disabled
#    TLS certificate validation for the whole process.
# BUG FIXES: non-streaming request (Invoke-RestMethod cannot consume SSE),
# read choices[0].message.content (delta.content is streaming-only), and
# explicit -Depth so the nested `messages` array is not truncated.
function Invoke-GroqAPI {
    param($Prompt, $AgentId)

    $headers = @{
        "Authorization" = "Bearer " + $apiKey
        "Content-Type"  = "application/json"
    }

    $body = @{
        model                 = $model
        messages              = @(
            @{
                role    = "system"
                content = $Prompt
            }
        )
        temperature           = 1
        max_completion_tokens = 1024
        top_p                 = 1
        stream                = $false
    } | ConvertTo-Json -Depth 5

    try {
        # Talk to the official endpoint over normally-validated TLS.
        $apiUrl = "https://api.groq.com/openai/v1/chat/completions"
        $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"

        $fullResponse = $response.choices[0].message.content
        if ($null -ne $fullResponse) {
            Write-Host ("🤖 Agent " + $AgentId + ": " + $fullResponse)
        }
        return $fullResponse
    }
    catch {
        Write-Host ("❌ Error calling Groq API: " + $_.Exception.Message) -ForegroundColor Red
        # Stop the loop on API errors
        return "<Done>"
    }
}
|
| 222 |
+
|
| 223 |
+
function Invoke-Actions {
|
| 224 |
+
param($Response, $AgentId)
|
| 225 |
+
if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
|
| 226 |
+
$actions = $matches[1].Trim()
|
| 227 |
+
if ($actions) {
|
| 228 |
+
Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
|
| 229 |
+
$actionScriptName = "run_actions_" + $AgentId + ".ps1"
|
| 230 |
+
$actionScript = Join-Path $workdir $actionScriptName
|
| 231 |
+
$actions | Set-Content $actionScript
|
| 232 |
+
|
| 233 |
+
$logFileName = "actions_agent_" + $AgentId + ".log"
|
| 234 |
+
$logFile = Join-Path $logdir $logFileName
|
| 235 |
+
& $actionScript *>&1 | Tee-Object -Path $logFile
|
| 236 |
+
}
|
| 237 |
+
}
|
| 238 |
+
else {
|
| 239 |
+
Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
|
| 240 |
+
$logFileName = "actions_agent_" + $AgentId + ".log"
|
| 241 |
+
"" | Set-Content (Join-Path $logdir $logFileName)
|
| 242 |
+
}
|
| 243 |
+
}
|
| 244 |
+
|
| 245 |
+
function Add-TaskResult {
|
| 246 |
+
param($AgentId)
|
| 247 |
+
$logFile = Join-Path $logdir ('actions_agent_' + $AgentId + '.log')
|
| 248 |
+
if (Test-Path $logFile) {
|
| 249 |
+
$result = Get-Content $logFile -Tail 50 | Out-String
|
| 250 |
+
$taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
|
| 251 |
+
$taskResult += $result
|
| 252 |
+
$taskResult += '</TaskResult>'
|
| 253 |
+
|
| 254 |
+
Add-Content -Path (Join-Path $workdir 'task_context.md') -Value $taskResult
|
| 255 |
+
Write-Host ('✍️ Agent ' + $AgentId + ' appended <TaskResult>.')
|
| 256 |
+
}
|
| 257 |
+
}
|
| 258 |
+
|
| 259 |
+
Write-Host ("🤖 Agent " + $agentId + " sending prompt to Groq API...")
|
| 260 |
+
$response = Invoke-GroqAPI -Prompt $promptCombined -AgentId $agentId
|
| 261 |
+
|
| 262 |
+
if ($response) {
|
| 263 |
+
$responseFileName = "agent_" + $agentId + "_response.txt"
|
| 264 |
+
$response | Set-Content (Join-Path $logdir $responseFileName)
|
| 265 |
+
|
| 266 |
+
Invoke-Actions -Response $response -AgentId $agentId
|
| 267 |
+
Add-TaskResult -AgentId $agentId
|
| 268 |
+
|
| 269 |
+
# Check for completion
|
| 270 |
+
if ($response -match '<Done>') {
|
| 271 |
+
Write-Host ("✅ Agent " + $agentId + " indicated completion.")
|
| 272 |
+
return $true
|
| 273 |
+
}
|
| 274 |
+
}
|
| 275 |
+
return $false
|
| 276 |
+
} -ArgumentList $promptCombined, $agentId, $workdir, $logdir, $apiKey, $model
|
| 277 |
+
}
|
| 278 |
+
|
| 279 |
+
# Wait for all jobs and get results
|
| 280 |
+
$results = $jobs | Wait-Job | Receive-Job
|
| 281 |
+
$jobs | Remove-Job
|
| 282 |
+
|
| 283 |
+
# Check if any agent indicated completion
|
| 284 |
+
if ($results -contains $true) {
|
| 285 |
+
Write-Host "🚀 Stopping AI loop as <Done> was detected."
|
| 286 |
+
$stopLoop = $true
|
| 287 |
+
}
|
| 288 |
+
|
| 289 |
+
Start-Sleep -Seconds 2
|
| 290 |
+
}
|
| 291 |
+
|
| 292 |
+
Write-Host "🎉 All agents completed."
|
agent.sh
ADDED
|
@@ -0,0 +1,73 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# Configuration
|
| 4 |
+
PROJECT_DIR="$HOME/bd-model-generations"
|
| 5 |
+
STATUS_DIR="$PROJECT_DIR/status"
|
| 6 |
+
LOG_FILE="$PROJECT_DIR/logs/actions.log"
|
| 7 |
+
|
| 8 |
+
# Ensure directories exist
|
| 9 |
+
mkdir -p "$STATUS_DIR" "$PROJECT_DIR/logs"
|
| 10 |
+
|
| 11 |
+
# Log function for errors
|
| 12 |
+
log_error() {
|
| 13 |
+
echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >> "$LOG_FILE"
|
| 14 |
+
}
|
| 15 |
+
|
| 16 |
+
# Live status display function
|
| 17 |
+
display_status() {
|
| 18 |
+
while true; do
|
| 19 |
+
clear
|
| 20 |
+
echo -e "\033[1;34m=== Live Agent Status (Bengali Language Model Generation) ===\033[0m"
|
| 21 |
+
echo -e "\033[1;36mTime: $(date '+%H:%M:%S')\033[0m"
|
| 22 |
+
echo ""
|
| 23 |
+
|
| 24 |
+
# Data Collector Status
|
| 25 |
+
if [ -f "$STATUS_DIR/data_collector.status" ]; then
|
| 26 |
+
echo -e "\033[1;32mData Collector:\033[0m $(cat "$STATUS_DIR/data_collector.status")"
|
| 27 |
+
else
|
| 28 |
+
echo -e "\033[1;32mData Collector:\033[0m Not started or completed"
|
| 29 |
+
fi
|
| 30 |
+
|
| 31 |
+
# Model Trainer Status
|
| 32 |
+
if [ -f "$STATUS_DIR/model_trainer.status" ]; then
|
| 33 |
+
echo -e "\033[1;33mModel Trainer:\033[0m $(cat "$STATUS_DIR/model_trainer.status")"
|
| 34 |
+
else
|
| 35 |
+
echo -e "\033[1;33mModel Trainer:\033[0m Not started or completed"
|
| 36 |
+
fi
|
| 37 |
+
|
| 38 |
+
# Model Evaluator Status
|
| 39 |
+
if [ -f "$STATUS_DIR/model_evaluator.status" ]; then
|
| 40 |
+
echo -e "\033[1;31mModel Evaluator:\033[0m $(cat "$STATUS_DIR/model_evaluator.status")"
|
| 41 |
+
else
|
| 42 |
+
echo -e "\033[1;31mModel Evaluator:\033[0m Not started or completed"
|
| 43 |
+
fi
|
| 44 |
+
|
| 45 |
+
# Check if all agents are done
|
| 46 |
+
if [ ! -f "$STATUS_DIR/data_collector.status" ] && \
|
| 47 |
+
[ ! -f "$STATUS_DIR/model_trainer.status" ] && \
|
| 48 |
+
[ ! -f "$STATUS_DIR/model_evaluator.status" ]; then
|
| 49 |
+
echo ""
|
| 50 |
+
echo -e "\033[1;34mAll agents have completed their tasks.\033[0m"
|
| 51 |
+
break
|
| 52 |
+
fi
|
| 53 |
+
sleep 2
|
| 54 |
+
done
|
| 55 |
+
}
|
| 56 |
+
|
| 57 |
+
# Main process
|
| 58 |
+
echo "Starting Bengali language model generation..." | tee -a "$LOG_FILE"
|
| 59 |
+
|
| 60 |
+
# Launch agents in background
|
| 61 |
+
for agent in data_collector model_trainer model_evaluator; do
|
| 62 |
+
if [ -f "$PROJECT_DIR/$agent.sh" ]; then
|
| 63 |
+
echo "Starting $agent..." | tee -a "$LOG_FILE"
|
| 64 |
+
bash "$PROJECT_DIR/$agent.sh" &>> "$LOG_FILE" || log_error "$agent failed to execute"
|
| 65 |
+
else
|
| 66 |
+
log_error "$agent.sh not found in $PROJECT_DIR"
|
| 67 |
+
fi
|
| 68 |
+
done
|
| 69 |
+
|
| 70 |
+
# Display live status
|
| 71 |
+
display_status
|
| 72 |
+
|
| 73 |
+
echo "Process completed. Check logs in $LOG_FILE for details." | tee -a "$LOG_FILE"
|
ai-agent/system_prompt.mdx
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<Plan>
|
| 2 |
+
You are AI coding agents focused on building a Bengali code + NLP LLM.
|
| 3 |
+
Output commands inside <Actions> blocks, analyses inside <Task> blocks.
|
| 4 |
+
After command execution, output results inside <TaskResult> blocks.
|
| 5 |
+
</Plan>
|
| 6 |
+
|
| 7 |
+
<Actions>
|
| 8 |
+
echo "Starting initial training setup..."
|
| 9 |
+
# Dummy start command for training
|
| 10 |
+
echo "Training started."
|
| 11 |
+
</Actions>
|
| 12 |
+
|
| 13 |
+
<Task>
|
| 14 |
+
Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.
|
| 15 |
+
</Task>
|
ai-agent/task_context.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
|
ai-agent/train.py
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import argparse
import time


def train(epochs, delay=2.0):
    """Run a simulated training loop, printing per-epoch progress.

    Args:
        epochs: Number of epochs to simulate.
        delay: Seconds to pause per epoch. Defaults to 2.0, matching the
            original hard-coded sleep; callers may pass 0 for fast runs.

    Note:
        The reported accuracy (0.8 + 0.01 per epoch) is a synthetic
        placeholder, not a real training metric.
    """
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs} training...")
        time.sleep(delay)  # simulate work
        print(f"Epoch {epoch + 1} complete, accuracy: {0.8 + epoch * 0.01:.2f}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--epochs", type=int, default=1)
    args = parser.parse_args()
    train(args.epochs)
|
config.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
  "attention_dropout": 0.0,
|
| 2 |
+
"bos_token_id": 151643,
|
| 3 |
+
"eos_token_id": 151643,
|
| 4 |
+
"hidden_act": "silu",
|
| 5 |
+
"hidden_size": 5120,
|
| 6 |
+
"initializer_range": 0.02,
|
| 7 |
+
"intermediate_size": 27648,
|
| 8 |
+
"max_position_embeddings": 131072,
|
| 9 |
+
"max_window_layers": 64,
|
| 10 |
+
"model_type": "qwen2",
|
| 11 |
+
"num_attention_heads": 40,
|
| 12 |
+
"num_hidden_layers": 64,
|
| 13 |
+
"num_key_value_heads": 8,
|
| 14 |
+
"rms_norm_eps": 1e-05,
|
| 15 |
+
"rope_scaling": null,
|
| 16 |
+
"rope_theta": 1000000.0,
|
| 17 |
+
"sliding_window": null,
|
| 18 |
+
"tie_word_embeddings": false,
|
| 19 |
+
"torch_dtype": "bfloat16",
|
| 20 |
+
"transformers_version": "4.46.0",
|
| 21 |
+
"use_cache": false,
|
| 22 |
+
"use_sliding_window": false,
|
| 23 |
+
"vocab_size": 152064
|
| 24 |
+
}
|
data_collector.ps1
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration
$ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
$StatusFile = Join-Path $ProjectDir 'status\data_collector.status'
$LogFile = Join-Path $ProjectDir 'logs\actions.log'

# Append a timestamped entry to the shared action log.
function Write-Log {
    param([string]$Message, [string]$Type = 'INFO')
    $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
    Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
}

# Ensure BOTH the status and log directories exist before any writes.
# (The original only created the status directory, so the very first
# Write-Log could fail on a fresh machine.)
New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
New-Item -ItemType Directory -Force -Path (Split-Path $LogFile) | Out-Null

try {
    # Initialize status
    Set-Content -Path $StatusFile -Value 'Initializing data collection...'
    Write-Log 'Data collector started' 'INFO'

    # Simulated data collection progress (replace with actual logic)
    $progressSteps = @(
        @{ Status = 'Connecting to data sources...'; Duration = 2 },
        @{ Status = 'Fetching Bengali text corpus...'; Duration = 3 },
        @{ Status = 'Processing raw data...'; Duration = 2 },
        @{ Status = 'Cleaning and normalizing text...'; Duration = 2 },
        @{ Status = 'Preparing training dataset...'; Duration = 1 }
    )

    foreach ($step in $progressSteps) {
        Set-Content -Path $StatusFile -Value $step.Status
        Write-Log $step.Status 'INFO'
        Start-Sleep -Seconds $step.Duration
    }

    # Final status update
    Set-Content -Path $StatusFile -Value 'Data collection completed successfully'
    Write-Log 'Data collection completed' 'SUCCESS'
    Start-Sleep -Seconds 1

} catch {
    Write-Log "Error in data collection: $_" 'ERROR'
    Set-Content -Path $StatusFile -Value 'Error: Data collection failed'
    Start-Sleep -Seconds 1
} finally {
    # The monitoring dashboard treats a missing status file as "agent
    # finished", so always remove it on exit.
    if (Test-Path $StatusFile) {
        Remove-Item -Path $StatusFile
    }
}
|
data_collector.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Simulated data-collection agent: publishes progress via a status file
# polled by the start.sh dashboard, then removes it to signal completion.
STATUS_FILE="$HOME/bd-model-generations/status/data_collector.status"
LOG_FILE="$HOME/bd-model-generations/logs/actions.log"

# Ensure the status and log directories exist before writing to them;
# the original wrote straight into them and failed on a fresh checkout.
mkdir -p "$(dirname "$STATUS_FILE")" "$(dirname "$LOG_FILE")"

echo "Collecting data..." > "$STATUS_FILE"
# Simulate data collection (replace with actual logic)
sleep 5
echo "Data collection complete." > "$STATUS_FILE"
sleep 1
rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
|
model_evaluator.ps1
ADDED
|
@@ -0,0 +1,49 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration
$ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
$StatusFile = Join-Path $ProjectDir 'status\model_evaluator.status'
$LogFile = Join-Path $ProjectDir 'logs\actions.log'

# Append a timestamped entry to the shared action log.
function Write-Log {
    param([string]$Message, [string]$Type = 'INFO')
    $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
    Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
}

# Ensure BOTH the status and log directories exist before any writes.
# (The original only created the status directory, so the very first
# Write-Log could fail on a fresh machine.)
New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
New-Item -ItemType Directory -Force -Path (Split-Path $LogFile) | Out-Null

try {
    # Initialize status
    Set-Content -Path $StatusFile -Value 'Initializing model evaluation...'
    Write-Log 'Model evaluator started' 'INFO'

    # Simulated evaluation progress (replace with actual logic)
    $progressSteps = @(
        @{ Status = 'Loading test dataset...'; Duration = 2 },
        @{ Status = 'Computing accuracy metrics...'; Duration = 3 },
        @{ Status = 'Analyzing model performance...'; Duration = 2 },
        @{ Status = 'Generating confusion matrix...'; Duration = 2 },
        @{ Status = 'Creating evaluation report...'; Duration = 1 }
    )

    foreach ($step in $progressSteps) {
        Set-Content -Path $StatusFile -Value $step.Status
        Write-Log $step.Status 'INFO'
        Start-Sleep -Seconds $step.Duration
    }

    # Final status update
    Set-Content -Path $StatusFile -Value 'Model evaluation completed successfully'
    Write-Log 'Model evaluation completed' 'SUCCESS'
    Start-Sleep -Seconds 1

} catch {
    Write-Log "Error in model evaluation: $_" 'ERROR'
    Set-Content -Path $StatusFile -Value 'Error: Model evaluation failed'
    Start-Sleep -Seconds 1
} finally {
    # The monitoring dashboard treats a missing status file as "agent
    # finished", so always remove it on exit.
    if (Test-Path $StatusFile) {
        Remove-Item -Path $StatusFile
    }
}
|
model_evaluator.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Simulated model-evaluation agent: publishes progress via a status file
# polled by the start.sh dashboard, then removes it to signal completion.
STATUS_FILE="$HOME/bd-model-generations/status/model_evaluator.status"
LOG_FILE="$HOME/bd-model-generations/logs/actions.log"

# Ensure the status and log directories exist before writing to them;
# the original wrote straight into them and failed on a fresh checkout.
mkdir -p "$(dirname "$STATUS_FILE")" "$(dirname "$LOG_FILE")"

echo "Evaluating model..." > "$STATUS_FILE"
# Simulate model evaluation (replace with actual logic)
sleep 5
echo "Model evaluation complete." > "$STATUS_FILE"
sleep 1
rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
|
model_trainer.ps1
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Configuration
|
| 2 |
+
$ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
|
| 3 |
+
$StatusFile = Join-Path $ProjectDir 'status\model_trainer.status'
|
| 4 |
+
$LogFile = Join-Path $ProjectDir 'logs\actions.log'
|
| 5 |
+
|
| 6 |
+
function Write-Log {
|
| 7 |
+
param([string]$Message, [string]$Type = 'INFO')
|
| 8 |
+
$timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
|
| 9 |
+
Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
|
| 10 |
+
}
|
| 11 |
+
|
| 12 |
+
# Ensure status directory exists
|
| 13 |
+
New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
|
| 14 |
+
|
| 15 |
+
try {
|
| 16 |
+
# Initialize status
|
| 17 |
+
Set-Content -Path $StatusFile -Value 'Initializing model training...'
|
| 18 |
+
Write-Log 'Model trainer started' 'INFO'
|
| 19 |
+
|
| 20 |
+
# Simulated training progress (replace with actual logic)
|
| 21 |
+
$progressSteps = @(
|
| 22 |
+
@{ Status = 'Loading training dataset...'; Duration = 2 },
|
| 23 |
+
@{ Status = 'Initializing model architecture...'; Duration = 2 },
|
| 24 |
+
@{ Status = 'Training Epoch 1/5...'; Duration = 3 },
|
| 25 |
+
@{ Status = 'Training Epoch 2/5...'; Duration = 3 },
|
| 26 |
+
@{ Status = 'Training Epoch 3/5...'; Duration = 3 },
|
| 27 |
+
@{ Status = 'Training Epoch 4/5...'; Duration = 3 },
|
| 28 |
+
@{ Status = 'Training Epoch 5/5...'; Duration = 3 },
|
| 29 |
+
@{ Status = 'Saving model checkpoints...'; Duration = 1 }
|
| 30 |
+
)
|
| 31 |
+
|
| 32 |
+
foreach ($step in $progressSteps) {
|
| 33 |
+
Set-Content -Path $StatusFile -Value $step.Status
|
| 34 |
+
Write-Log $step.Status 'INFO'
|
| 35 |
+
Start-Sleep -Seconds $step.Duration
|
| 36 |
+
}
|
| 37 |
+
|
| 38 |
+
# Final status update
|
| 39 |
+
Set-Content -Path $StatusFile -Value 'Model training completed successfully'
|
| 40 |
+
Write-Log 'Model training completed' 'SUCCESS'
|
| 41 |
+
Start-Sleep -Seconds 1
|
| 42 |
+
|
| 43 |
+
} catch {
|
| 44 |
+
Write-Log "Error in model training: $_" 'ERROR'
|
| 45 |
+
Set-Content -Path $StatusFile -Value 'Error: Model training failed'
|
| 46 |
+
Start-Sleep -Seconds 1
|
| 47 |
+
} finally {
|
| 48 |
+
# Cleanup status file
|
| 49 |
+
if (Test-Path $StatusFile) {
|
| 50 |
+
Remove-Item -Path $StatusFile
|
| 51 |
+
}
|
| 52 |
+
}
|
model_trainer.sh
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Simulated model-training agent: publishes progress via a status file
# polled by the start.sh dashboard, then removes it to signal completion.
STATUS_FILE="$HOME/bd-model-generations/status/model_trainer.status"
LOG_FILE="$HOME/bd-model-generations/logs/actions.log"

# Ensure the status and log directories exist before writing to them;
# the original wrote straight into them and failed on a fresh checkout.
mkdir -p "$(dirname "$STATUS_FILE")" "$(dirname "$LOG_FILE")"

echo "Training model..." > "$STATUS_FILE"
# Simulate model training (replace with actual logic)
sleep 5
echo "Model training complete." > "$STATUS_FILE"
sleep 1
rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
|
requirements.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Core dependencies
|
| 2 |
+
torch>=2.0.0
|
| 3 |
+
transformers>=4.30.0
|
| 4 |
+
datasets>=2.12.0
|
| 5 |
+
sentencepiece>=0.1.99
|
| 6 |
+
accelerate>=0.20.0
|
| 7 |
+
wandb>=0.15.0
|
| 8 |
+
|
| 9 |
+
# Data collection and processing
|
| 10 |
+
requests>=2.31.0
|
| 11 |
+
beautifulsoup4>=4.12.0
|
| 12 |
+
tqdm>=4.65.0
|
| 13 |
+
|
| 14 |
+
# Evaluation metrics
|
| 15 |
+
rouge-score>=0.1.2
|
| 16 |
+
sacrebleu>=2.3.1
|
| 17 |
+
pandas>=2.0.0
|
| 18 |
+
numpy>=1.24.0
|
| 19 |
+
|
| 20 |
+
# Utilities
# NOTE: pathlib, logging and typing are part of the Python standard
# library. Their obsolete PyPI backports (pathlib>=1.0.1,
# logging>=0.5.1.2, typing>=3.7.4.3) can shadow the stdlib and break
# modern interpreters, so they are deliberately not listed here.
|
| 24 |
+
|
| 25 |
+
# Development tools
|
| 26 |
+
black>=23.3.0
|
| 27 |
+
isort>=5.12.0
|
| 28 |
+
pylint>=2.17.0
|
| 29 |
+
pytest>=7.3.1
|
scripts/data_collector.py
ADDED
|
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import requests
|
| 2 |
+
from bs4 import BeautifulSoup
|
| 3 |
+
import time
|
| 4 |
+
import random
|
| 5 |
+
import json
|
| 6 |
+
from pathlib import Path
|
| 7 |
+
import logging
|
| 8 |
+
from urllib.parse import urljoin
|
| 9 |
+
|
| 10 |
+
# Configure logging
|
| 11 |
+
logging.basicConfig(
|
| 12 |
+
level=logging.INFO,
|
| 13 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
| 14 |
+
)
|
| 15 |
+
logger = logging.getLogger(__name__)
|
| 16 |
+
|
| 17 |
+
class BengaliDataCollector:
    """Collects raw Bengali text from Bengali Wikipedia and Prothom Alo.

    Each source is scraped into its own JSON file under ``data/raw``;
    :meth:`collect` then merges them into ``processed_data.json`` with a
    uniform ``{text, source, url}`` record shape.
    """

    def __init__(self):
        # Browser-like User-Agent so the target sites serve normal HTML.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1, timeout=30):
        """Make an HTTP GET with rate limiting, retries and backoff.

        Args:
            url: Target URL.
            retries: Maximum number of attempts before re-raising.
            delay: Base delay in seconds; jitter is added per request and
                the inter-attempt wait grows linearly with the attempt.
            timeout: Per-request timeout in seconds. The original call
                had no timeout, so a stalled server could hang the
                collector indefinitely.

        Returns:
            The successful ``requests.Response``.

        Raises:
            requests.RequestException: If every attempt fails.
        """
        for attempt in range(retries):
            try:
                time.sleep(delay + random.random())  # Rate limiting with jitter
                response = requests.get(url, headers=self.headers, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                time.sleep(delay * (attempt + 1))  # Backoff grows with attempt

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia.

        Follows up to 50 article links from the main page, saves them to
        ``wikipedia_data.json`` and returns the number of articles saved
        (0 on total failure — errors are logged, not raised).
        """
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")

        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get main content and featured articles
            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []

            if content_div:
                # Extract article links; ':' filters out namespace pages
                # (File:, Template:, ...) which are not article content.
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:  # Limit to first 50 articles
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            # One bad article must not abort the whole crawl.
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            # Save Wikipedia data
            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)

            return len(articles)
        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {str(e)}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo.

        Visits a fixed set of category pages, follows up to 10 links per
        category, saves results to ``prothomalo_data.json`` and returns
        the number of articles saved.
        """
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []

        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")

            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find article links; only follow those whose URL contains
                # the category slug, as a cheap same-section filter.
                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:  # Limit to 10 articles per category
                    article_url = urljoin(base_url, link['href'])
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {str(e)}")

        # Save Prothom Alo data
        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        return len(articles)

    def collect(self):
        """Collect from all sources, then merge into one processed file."""
        logger.info("Starting data collection")

        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")

        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")

        # Combine and process the collected data
        self.process_collected_data()

        logger.info("Data collection completed")

    def process_collected_data(self):
        """Merge the per-source JSON files into ``processed_data.json``.

        Raises:
            Exception: Re-raises any failure (missing files, bad JSON)
                after logging it, so callers see collection as failed.
        """
        try:
            # Read collected data
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)

            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)

            # Combine into a uniform record shape for downstream training.
            processed_data = []

            # Process Wikipedia articles
            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })

            # Process news articles
            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })

            # Save processed data
            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_data)} articles")

        except Exception as e:
            logger.error(f"Failed to process collected data: {str(e)}")
            raise
|
| 183 |
+
|
| 184 |
+
if __name__ == "__main__":
    # Run a full collection pass when executed as a script.
    BengaliDataCollector().collect()
|
scripts/model_evaluator.py
ADDED
|
@@ -0,0 +1,216 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import logging
|
| 4 |
+
import torch
|
| 5 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 6 |
+
import numpy as np
|
| 7 |
+
from typing import List, Dict, Any
|
| 8 |
+
from tqdm import tqdm
|
| 9 |
+
import pandas as pd
|
| 10 |
+
from rouge_score import rouge_scorer
|
| 11 |
+
from sacrebleu.metrics import BLEU
|
| 12 |
+
import wandb
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
logging.basicConfig(
|
| 16 |
+
level=logging.INFO,
|
| 17 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
| 18 |
+
)
|
| 19 |
+
logger = logging.getLogger(__name__)
|
| 20 |
+
|
| 21 |
+
class ModelEvaluator:
    """Evaluates the fine-tuned Bengali code LLM on a fixed prompt suite.

    Generates a response per test prompt, scores it with BLEU and ROUGE
    against a reference answer, saves per-prompt results (JSON) and
    per-type averages (CSV), and mirrors the metrics to Weights & Biases.
    """

    def __init__(self):
        # Where the trained model lives and where evaluation artifacts go.
        self.model_dir = Path('outputs/model/final')
        self.output_dir = Path('outputs/evaluation')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Test prompts for different scenarios
        self.test_prompts = [
            # Programming task prompts
            {
                "type": "code_generation",
                "prompt": "একটি পাইথন ফাংশন লিখুন যা একটি সংখ্যার ফ্যাক্টরিয়াল বের করে।",
                "expected": """def factorial(n):
    if n == 0 or n == 1:
        return 1
    return n * factorial(n - 1)"""
            },
            {
                "type": "code_explanation",
                "prompt": "নিচের কোডটি ব্যাখ্যা করুন:\ndef bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]",
                "expected": "এই কোডটি বাবল সর্ট অ্যালগরিদম বাস্তবায়ন করে। এটি একটি অ্যারেকে ক্রমানুসারে সাজায়।"
            },
            {
                "type": "error_fix",
                "prompt": "এই কোডে ভুল আছে, ঠিক করুন:\ndef calculate_sum(numbers)\n    total = 0\n    for num in numbers\n        total += num\n    return total",
                "expected": """def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total"""
            },
            # Algorithm explanation prompts
            {
                "type": "algorithm_explanation",
                "prompt": "বাইনারি সার্চ অ্যালগরিদম কীভাবে কাজ করে সেটি ব্যাখ্যা করুন।",
                "expected": "বাইনারি সার্চ একটি দক্ষ অ্যালগরিদম যা সর্টেড অ্যারেতে একটি এলিমেন্ট খোঁজে। এটি প্রতিবার অ্যারের মধ্যবর্তী এলিমেন্ট চেক করে এবং সার্চ স্পেস অর্ধেক করে কমিয়ে ফেলে।"
            }
        ]

        # Evaluation metrics
        self.bleu = BLEU()
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``self.model_dir``.

        Uses bfloat16 on GPU, float32 on CPU; moves the model to CUDA
        when available. Returns a ``(model, tokenizer)`` pair.
        """
        logger.info("Loading model and tokenizer")

        tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        model = AutoModelForCausalLM.from_pretrained(
            self.model_dir,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        if torch.cuda.is_available():
            model = model.to('cuda')

        return model, tokenizer

    def generate_response(self, model, tokenizer, prompt: str, max_length: int = 512) -> str:
        """Generate a completion for *prompt*; returns "" on failure.

        *max_length* now bounds the number of NEW tokens. The original
        passed it as ``max_length`` (prompt + completion combined), which
        silently truncated or suppressed output for long prompts.
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            # Causal LM tokenizers often define no pad token; fall back
            # to EOS so generate() does not receive pad_token_id=None.
            pad_id = tokenizer.pad_token_id
            if pad_id is None:
                pad_id = tokenizer.eos_token_id

            # Generate with better parameters for code generation
            outputs = model.generate(
                **inputs,
                max_new_tokens=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=pad_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
            )

            # Decode only the generated suffix. The original did
            # response.replace(prompt, ""), which breaks whenever the
            # round-tripped prompt text differs from the input string.
            prompt_len = inputs['input_ids'].shape[1]
            response = tokenizer.decode(outputs[0][prompt_len:], skip_special_tokens=True)
            return response.strip()

        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            return ""

    def calculate_metrics(self, generated: str, expected: str) -> Dict[str, float]:
        """Score *generated* against *expected*.

        Returns BLEU (0..1) plus ROUGE-1/2/L F-measures; all zeros if
        scoring fails (e.g. empty generation).
        """
        try:
            # Calculate BLEU score (sacrebleu reports 0..100; normalize)
            bleu_score = self.bleu.corpus_score(
                [generated],
                [[expected]]
            ).score / 100.0

            # Calculate ROUGE scores
            rouge_scores = self.rouge_scorer.score(generated, expected)

            return {
                'bleu': bleu_score,
                'rouge1_f': rouge_scores['rouge1'].fmeasure,
                'rouge2_f': rouge_scores['rouge2'].fmeasure,
                'rougeL_f': rouge_scores['rougeL'].fmeasure
            }
        except Exception as e:
            logger.error(f"Error calculating metrics: {str(e)}")
            return {
                'bleu': 0.0,
                'rouge1_f': 0.0,
                'rouge2_f': 0.0,
                'rougeL_f': 0.0
            }

    def evaluate(self):
        """Run the full evaluation and return average metrics by type.

        Side effects: writes ``evaluation_results.json`` and
        ``average_metrics.csv`` under ``self.output_dir`` and logs to
        wandb. Re-raises any failure after logging it.
        """
        try:
            # Initialize wandb for tracking
            wandb.init(project="bengali-code-llm", name="model-evaluation")

            # Load model and tokenizer
            model, tokenizer = self.load_model_and_tokenizer()

            # Store evaluation results
            results = []

            # Evaluate on test prompts
            for prompt_data in tqdm(self.test_prompts, desc="Evaluating prompts"):
                prompt_type = prompt_data["type"]
                prompt = prompt_data["prompt"]
                expected = prompt_data["expected"]

                # Generate response
                generated = self.generate_response(model, tokenizer, prompt)

                # Calculate metrics
                metrics = self.calculate_metrics(generated, expected)

                # Store result
                result = {
                    "type": prompt_type,
                    "prompt": prompt,
                    "generated": generated,
                    "expected": expected,
                    **metrics
                }
                results.append(result)

                # Log to wandb
                wandb.log({
                    f"{prompt_type}_bleu": metrics['bleu'],
                    f"{prompt_type}_rouge1": metrics['rouge1_f'],
                    f"{prompt_type}_rouge2": metrics['rouge2_f'],
                    f"{prompt_type}_rougeL": metrics['rougeL_f']
                })

            # Calculate average metrics by type
            df = pd.DataFrame(results)
            avg_metrics = df.groupby('type')[['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']].mean()

            # Save detailed results
            results_path = self.output_dir / 'evaluation_results.json'
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Save average metrics
            metrics_path = self.output_dir / 'average_metrics.csv'
            avg_metrics.to_csv(metrics_path)

            # Log final averages to wandb
            wandb.log({
                "avg_bleu": df['bleu'].mean(),
                "avg_rouge1": df['rouge1_f'].mean(),
                "avg_rouge2": df['rouge2_f'].mean(),
                "avg_rougeL": df['rougeL_f'].mean()
            })

            # Close wandb
            wandb.finish()

            logger.info(f"Evaluation completed. Results saved to {self.output_dir}")

            # Return average metrics
            return avg_metrics.to_dict()

        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on the error path.
            if wandb.run is not None:
                wandb.finish()
|
| 213 |
+
|
| 214 |
+
if __name__ == "__main__":
    # Run a full evaluation pass when executed as a script.
    ModelEvaluator().evaluate()
|
scripts/model_trainer.py
ADDED
|
@@ -0,0 +1,232 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import logging
|
| 4 |
+
import torch
|
| 5 |
+
from torch.utils.data import Dataset, DataLoader
|
| 6 |
+
from transformers import (
|
| 7 |
+
AutoModelForCausalLM,
|
| 8 |
+
AutoTokenizer,
|
| 9 |
+
TrainingArguments,
|
| 10 |
+
Trainer,
|
| 11 |
+
DataCollatorForLanguageModeling
|
| 12 |
+
)
|
| 13 |
+
import wandb
|
| 14 |
+
import numpy as np
|
| 15 |
+
from datasets import load_dataset
|
| 16 |
+
from typing import Dict, List, Any
|
| 17 |
+
|
| 18 |
+
# Configure logging
|
| 19 |
+
logging.basicConfig(
|
| 20 |
+
level=logging.INFO,
|
| 21 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
| 22 |
+
)
|
| 23 |
+
logger = logging.getLogger(__name__)
|
| 24 |
+
|
| 25 |
+
class BengaliCodeDataset(Dataset):
    """Torch dataset over pre-processed Bengali/code examples.

    Each JSON record is expected to carry a ``'text'`` field; items are
    tokenized lazily to ``max_length`` with padding and truncation.
    """

    def __init__(self, data_path: Path, tokenizer, max_length: int = 2048):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load all processed examples into memory once up front.
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        logger.info(f"Loaded {len(self.data)} examples from {data_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        sample_text = self.data[idx]['text']

        # Tokenize to a fixed-length batch of size 1 (tensors are [1, L]).
        encodings = self.tokenizer(
            sample_text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        input_ids = encodings.input_ids[0]
        # For causal language modeling the labels mirror the inputs; the
        # Trainer shifts them internally.
        return {
            'input_ids': input_ids,
            'attention_mask': encodings.attention_mask[0],
            'labels': input_ids.clone(),
        }
|
| 63 |
+
|
| 64 |
+
class ModelTrainer:
    """Fine-tunes the TinyLlama base model on the Bengali/code corpus.

    Pipeline: W&B setup -> model/tokenizer preparation -> 90/10 data
    split -> HuggingFace ``Trainer`` run -> final model + tokenizer
    written to ``outputs/model/final``.
    """

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.tokenizer_dir = Path('outputs/tokenizer')
        self.output_dir = Path('outputs/model')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Training configuration
        self.model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
        self.max_length = 2048
        self.batch_size = 4
        self.gradient_accumulation_steps = 4  # effective batch size = 4 * 4
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        self.warmup_steps = 100
        self.save_steps = 1000
        self.eval_steps = 500

    def setup_wandb(self):
        """Initialize the Weights & Biases run with key hyperparameters."""
        wandb.init(
            project="bengali-code-llm",
            name="tinyllama-bengali-code",
            config={
                "model_name": self.model_name,
                "max_length": self.max_length,
                "batch_size": self.batch_size,
                "learning_rate": self.learning_rate,
                "num_epochs": self.num_train_epochs,
            },
        )

    def prepare_model_and_tokenizer(self):
        """Load the custom tokenizer and base model; align embeddings.

        Returns:
            Tuple of (model, tokenizer) ready for training.
        """
        logger.info("Loading tokenizer and model")

        # Load the custom tokenizer produced by tokenizer_trainer.py.
        tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_dir,
            model_max_length=self.max_length,
        )
        # padding='max_length' in the dataset and the data collator both
        # require a pad token; fall back to EOS if the tokenizer has none.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # Load the base model (bf16 on GPU, fp32 on CPU).
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
        )

        # Resize token embeddings to match the custom tokenizer's vocab.
        model.resize_token_embeddings(len(tokenizer))

        return model, tokenizer

    def create_datasets(self, tokenizer):
        """Split processed data 90/10 and build train/validation datasets.

        Side effects: writes ``train.json`` and ``validation.json`` next to
        the processed data so the split is reproducible/inspectable.
        """
        logger.info("Creating datasets")

        data_path = self.data_dir / 'processed_data.json'

        with open(data_path, 'r', encoding='utf-8') as f:
            all_data = json.load(f)

        # Deterministic shuffle so the split is reproducible across runs.
        np.random.seed(42)
        np.random.shuffle(all_data)

        split_idx = int(len(all_data) * 0.9)  # 90% train, 10% validation
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        train_path = self.data_dir / 'train.json'
        val_path = self.data_dir / 'validation.json'

        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)

        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        train_dataset = BengaliCodeDataset(train_path, tokenizer, self.max_length)
        val_dataset = BengaliCodeDataset(val_path, tokenizer, self.max_length)

        return train_dataset, val_dataset

    def create_training_arguments(self):
        """Build TrainingArguments; mixed precision matches the model dtype."""
        return TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            eval_steps=self.eval_steps,
            save_strategy="steps",
            save_steps=self.save_steps,
            learning_rate=self.learning_rate,
            warmup_steps=self.warmup_steps,
            weight_decay=0.01,
            logging_dir=str(self.output_dir / 'logs'),
            logging_steps=100,
            report_to="wandb",
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            # The model is loaded in bfloat16 on CUDA, so use bf16 mixed
            # precision. The original passed fp16=True, which conflicts
            # with bf16 weights and can fail with "attempting to unscale
            # FP16 gradients" inside the Trainer.
            bf16=torch.cuda.is_available(),
            remove_unused_columns=False,
        )

    def train(self):
        """Run the full fine-tuning pipeline end to end.

        Raises:
            Exception: re-raised after logging if any stage fails; wandb
            is closed in all cases.
        """
        try:
            # Initialize experiment tracking.
            self.setup_wandb()

            # Prepare model and tokenizer.
            model, tokenizer = self.prepare_model_and_tokenizer()

            # Build train/validation datasets.
            train_dataset, val_dataset = self.create_datasets(tokenizer)

            training_args = self.create_training_arguments()

            # Causal LM collator (mlm=False -> no masked-token objective).
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False,
            )

            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer,
            )

            logger.info("Starting model training")
            trainer.train()

            # Save the final model and matching tokenizer together.
            trainer.save_model(str(self.output_dir / 'final'))
            tokenizer.save_pretrained(str(self.output_dir / 'final'))

            wandb.finish()

            logger.info("Model training completed successfully")

        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on failure.
            if wandb.run is not None:
                wandb.finish()
|
| 229 |
+
|
| 230 |
+
if __name__ == "__main__":
    # Kick off fine-tuning when executed directly.
    ModelTrainer().train()
|
scripts/tokenizer_trainer.py
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import json
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import sentencepiece as spm
|
| 4 |
+
import logging
|
| 5 |
+
from typing import List, Dict
|
| 6 |
+
import shutil
|
| 7 |
+
|
| 8 |
+
# Configure logging
|
| 9 |
+
logging.basicConfig(
|
| 10 |
+
level=logging.INFO,
|
| 11 |
+
format='%(asctime)s - %(levelname)s - %(message)s'
|
| 12 |
+
)
|
| 13 |
+
logger = logging.getLogger(__name__)
|
| 14 |
+
|
| 15 |
+
class TokenizerTrainer:
    """Trains a SentencePiece unigram tokenizer on the processed corpus
    and emits HuggingFace-compatible config files alongside it."""

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.output_dir = Path('outputs/tokenizer')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Tokenizer configuration
        self.vocab_size = 32000
        self.character_coverage = 0.9999  # near-total coverage for Bengali script
        self.model_type = "unigram"
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",  # Common Bengali numbers
            "def", "class", "return", "if", "else", "for", "while",  # Code keywords
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""',  # Code comments
        ]

    def prepare_training_data(self) -> str:
        """Write one sentence per line to a temp file; return its path.

        Raises:
            FileNotFoundError: if the processed data file is missing.
        """
        logger.info("Preparing training data for tokenizer")

        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                # Split on the Bengali full stop so each line is one sentence.
                for sentence in item['text'].split('।'):
                    sentence = sentence.strip()
                    if sentence:  # skip empty fragments
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

    def _build_trainer_args(self, train_file: str, model_prefix: str) -> str:
        """Build the SentencePiece trainer argument string.

        SentencePiece expects ``--key=value`` tokens. The original code
        space-separated keys and values and mixed int values into
        ``str.join``, which raised ``TypeError`` before training started.
        """
        params = {
            "--input": train_file,
            "--model_prefix": model_prefix,
            "--vocab_size": self.vocab_size,
            "--character_coverage": self.character_coverage,
            "--model_type": self.model_type,
            "--pad_id": 0,
            "--unk_id": 1,
            "--bos_id": 2,
            "--eos_id": 3,
            "--user_defined_symbols": ",".join(self.special_tokens),
            "--max_sentence_length": 4192,
            "--input_sentence_size": 5000000,
            "--shuffle_input_sentence": "true",
            "--normalization_rule_name": "identity",  # preserve original text
        }
        return " ".join(f"{key}={value}" for key, value in params.items())

    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer from the prepared text file."""
        logger.info("Starting tokenizer training")

        model_prefix = self.output_dir / "bengali_code"

        try:
            spm.SentencePieceTrainer.train(
                self._build_trainer_args(train_file, str(model_prefix))
            )
            logger.info("Tokenizer training completed successfully")

            # Create config files for HuggingFace compatibility.
            self.create_huggingface_files(model_prefix)

        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

    def create_huggingface_files(self, model_prefix: Path):
        """Write tokenizer_config.json and special_tokens_map.json so the
        tokenizer can be loaded with HuggingFace ``AutoTokenizer``."""
        logger.info("Creating HuggingFace compatibility files")

        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
            "model_type": self.model_type,
            "vocab_size": self.vocab_size,
        }

        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
        }

        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

    def train(self):
        """Run the full tokenizer-training pipeline.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            train_file = self.prepare_training_data()

            self.train_tokenizer(train_file)

            # Clean up the temporary sentence file.
            if Path(train_file).exists():
                Path(train_file).unlink()

            logger.info("Tokenizer training pipeline completed successfully")

        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise
|
| 157 |
+
|
| 158 |
+
if __name__ == "__main__":
    # Train the tokenizer when executed directly.
    TokenizerTrainer().train()
|
special_tokens_map.json
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
{
|
| 3 |
+
"additional_special_tokens": [
|
| 4 |
+
"<|im_start|>",
|
| 5 |
+
"<|im_end|>",
|
| 6 |
+
"<|object_ref_start|>",
|
| 7 |
+
"<|object_ref_end|>",
|
| 8 |
+
"<|box_start|>",
|
| 9 |
+
"<|box_end|>",
|
| 10 |
+
"<|quad_start|>",
|
| 11 |
+
"<|quad_end|>",
|
| 12 |
+
"<|vision_start|>",
|
| 13 |
+
"<|vision_end|>",
|
| 14 |
+
"<|vision_pad|>",
|
| 15 |
+
"<|image_pad|>",
|
| 16 |
+
"<|video_pad|>"
|
| 17 |
+
],
|
| 18 |
+
"eos_token": {
|
| 19 |
+
"content": "<|im_end|>",
|
| 20 |
+
"lstrip": false,
|
| 21 |
+
"normalized": false,
|
| 22 |
+
"rstrip": false,
|
| 23 |
+
"single_word": false
|
| 24 |
+
},
|
| 25 |
+
"pad_token": {
|
| 26 |
+
"content": "<|endoftext|>",
|
| 27 |
+
"lstrip": false,
|
| 28 |
+
"normalized": false,
|
| 29 |
+
"rstrip": false,
|
| 30 |
+
"single_word": false
|
| 31 |
+
}
|
| 32 |
+
}
|
start.sh
ADDED
|
@@ -0,0 +1,438 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
# === SYSTEM PROMPT ===
|
| 4 |
+
# This script builds a Bengali language model using a multi-agent system with human-in-the-loop (HIL) capabilities.
|
| 5 |
+
# Advanced Features:
|
| 6 |
+
# - Real-Time Streaming: Displays a colorful, dynamic status dashboard in the terminal.
|
| 7 |
+
# - Robust Error Handling: Validates setup, API calls, and file operations with detailed logging.
|
| 8 |
+
# - Modern Interface: Uses ANSI colors, progress bars, and a boxed header for a polished look.
|
| 9 |
+
# - Loop and Iteration: Monitors execution, retries on failure, and ensures task completion.
|
| 10 |
+
# - Code Execution: Executes Python and Node.js code locally for preprocessing and evaluation.
|
| 11 |
+
# - Tools: Provides Python and Node.js REPLs for file operations and analysis.
|
| 12 |
+
# - Time Travel: Logs actions with timestamps for debugging and auditing.
|
| 13 |
+
# - Subgraph Support: Encapsulates tasks (data collection, preprocessing, training, evaluation) as reusable nodes.
|
| 14 |
+
# - Memory: Persists state across agent interactions using a key-value store.
|
| 15 |
+
# - API Integrations: Uses Together, Cohere, and Gemini APIs (Together as primary for text generation).
|
| 16 |
+
# - File Operations: Creates, edits, and validates files with error checking.
|
| 17 |
+
# - Output: Saves the model to /storage/BA73-022B/bd/bd-model-genaretions/model.pt.
|
| 18 |
+
|
| 19 |
+
# === CONFIGURATION ===
# All paths live under PROJECT_DIR so the pipeline is relocatable.
PROJECT_DIR="/storage/BA73-022B/bd/bd-model-genaretions" # Updated as per user request
LOG_FILE="$PROJECT_DIR/logs/actions.log"
MEMORY_FILE="$PROJECT_DIR/memory.txt"
REQUESTS_DIR="$PROJECT_DIR/requests"
RESPONSES_DIR="$PROJECT_DIR/responses"
DATA_DIR="$PROJECT_DIR/data"
STATUS_DIR="$PROJECT_DIR/status"

# API Keys: prefer values from the environment; the literals below are a
# last-resort fallback only. SECURITY: hard-coded credentials committed to
# version control should be rotated and removed.
TOGETHER_API_KEY="${TOGETHER_API_KEY:-07f08ca73c50496a3406ff621912254a67370d576822f1921f77eed47e649545}"
COHERE_API_KEY="${COHERE_API_KEY:-rvpLjkuzZPsoHGeIxqQxttTTIt4IxGUS5FOINU4L}"
GEMINI_API_KEY="${GEMINI_API_KEY:-AIzaSyAQNxQU0WnegEnMfP6LCwkVw-PUtR11qaI}"
|
| 32 |
+
|
| 33 |
+
# === SETUP ===
# Create the working directory tree and the log/memory files; abort early
# if the filesystem is not writable.
echo "Initializing project directories..."
for dir in "$PROJECT_DIR" "$DATA_DIR" "$REQUESTS_DIR" "$RESPONSES_DIR" "$PROJECT_DIR/logs" "$STATUS_DIR"; do
    if ! mkdir -p "$dir"; then
        echo -e "\033[1;31mError: Failed to create directory $dir\033[0m"
        exit 1
    fi
done

touch "$LOG_FILE" "$MEMORY_FILE"
if [ ! -f "$LOG_FILE" ] || [ ! -f "$MEMORY_FILE" ]; then
    echo -e "\033[1;31mError: Failed to create log or memory file\033[0m"
    exit 1
fi
echo "[$(date)] Starting Bengali language model generation" >> "$LOG_FILE"
|
| 49 |
+
|
| 50 |
+
# === UTILITY FUNCTIONS ===
|
| 51 |
+
# Memory Management
|
| 52 |
+
# Persist a key=value pair, replacing any previous value for the key.
function set_memory {
    local key="$1"
    local value="$2"
    # grep -v exits non-zero when it selects no lines (e.g. empty memory
    # file); do not chain mv on grep's status or the temp file leaks and
    # the rewrite is silently skipped.
    grep -v "^$key=" "$MEMORY_FILE" > "$MEMORY_FILE.tmp" || true
    mv "$MEMORY_FILE.tmp" "$MEMORY_FILE"
    echo "$key=$value" >> "$MEMORY_FILE"
}
|
| 58 |
+
|
| 59 |
+
# Look up a key in the memory store; print its value, or "false" if unset.
function get_memory {
    local key="$1"
    local value
    # -f2- keeps everything after the first '=' so values that themselves
    # contain '=' are no longer truncated (the original used -f2).
    value=$(grep "^$key=" "$MEMORY_FILE" | cut -d'=' -f2-)
    echo "${value:-false}"
}
|
| 64 |
+
|
| 65 |
+
# Logging (Time Travel)
|
| 66 |
+
# Append a timestamped audit entry for the given agent ("time travel" log).
function log_action {
    local agent="$1" message="$2"
    printf '[%s] [Agent %s] %s\n' "$(date)" "$agent" "$message" >> "$LOG_FILE"
}
|
| 71 |
+
|
| 72 |
+
# Status Updates
|
| 73 |
+
# Publish an agent's current status for the dashboard to display.
function set_status {
    local agent="$1" state="$2"
    printf '%s\n' "$state" > "$STATUS_DIR/agent$agent.status"
}
|
| 78 |
+
|
| 79 |
+
# === TOOL CALLING FUNCTIONS ===
|
| 80 |
+
# Execute a Python one-liner locally; print its stdout, log its stderr.
# Returns the interpreter's exit code on failure.
function run_python {
    local code="$1"
    log_action "Tool" "Running Python code: $code"
    local output exit_code
    # Assign separately from 'local' so $? reflects python3, not the
    # always-successful 'local' builtin (ShellCheck SC2155) — the original
    # could never observe a failing script here.
    output=$(python3 -c "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Python execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
|
| 91 |
+
|
| 92 |
+
# Execute a Node.js snippet locally; print its stdout, log its stderr.
# Returns node's exit code on failure.
function run_node {
    local code="$1"
    log_action "Tool" "Running Node.js code: $code"
    local output exit_code
    # Assign separately from 'local' so $? reflects node, not the
    # always-successful 'local' builtin (ShellCheck SC2155).
    output=$(node -e "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Node.js execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
|
| 103 |
+
|
| 104 |
+
# === API CALLING FUNCTIONS ===
|
| 105 |
+
# Request a completion from the Together API.
# The payload is built with jq (already used by this script) so quotes or
# newlines in the prompt cannot break — or inject into — the JSON body.
function call_together_api {
    local prompt="$1"
    local payload
    payload=$(jq -n --arg p "$prompt" \
        '{prompt: $p, model: "some_model", max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.together.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOGETHER_API_KEY" \
        -d "$payload"
}
|
| 112 |
+
|
| 113 |
+
# Request a generation from the Cohere API.
# Payload built with jq so arbitrary prompt text stays valid JSON.
function call_cohere_api {
    local prompt="$1"
    local payload
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.cohere.ai/generate" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $COHERE_API_KEY" \
        -d "$payload"
}
|
| 120 |
+
|
| 121 |
+
# Request a completion from the Gemini API.
# Payload built with jq so arbitrary prompt text stays valid JSON.
function call_gemini_api {
    local prompt="$1"
    local payload
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, model: "some_model"}')
    curl -s -m 10 -X POST "https://api.gemini.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $GEMINI_API_KEY" \
        -d "$payload"
}
|
| 128 |
+
|
| 129 |
+
# Generate text from the named provider with up to 3 retries.
# Usage: generate_text <prompt> <together|cohere|gemini>
# Prints the generated text on success; returns 1 after exhausting retries.
function generate_text {
    local prompt="$1" api="$2"
    local max_attempts=3 attempt response text
    for ((attempt = 1; attempt <= max_attempts; attempt++)); do
        case "$api" in
            together)
                response=$(call_together_api "$prompt")
                text=$(echo "$response" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            cohere)
                response=$(call_cohere_api "$prompt")
                text=$(echo "$response" | jq -r '.generations[0].text' 2>/dev/null)
                ;;
            gemini)
                response=$(call_gemini_api "$prompt")
                text=$(echo "$response" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            *)
                text="Unknown API"
                ;;
        esac
        # Accept only a non-empty, non-null result that is not an error blob.
        if [ -n "$text" ] && [ "$text" != "null" ] && [[ ! "$text" =~ "Error" ]]; then
            echo "$text"
            return 0
        fi
        log_action "API" "Attempt $attempt failed for $api API, retrying..."
        sleep 2
    done
    log_action "API" "Failed to generate text with $api after $max_attempts attempts"
    return 1
}
|
| 162 |
+
|
| 163 |
+
# === HUMAN-IN-THE-LOOP REQUEST FUNCTION ===
|
| 164 |
+
# Human-in-the-loop gate: publish a request file and block until a human
# drops the matching response file; print the response.
function request_human_input {
    local agent="$1" request="$2"
    local response_file="$RESPONSES_DIR/agent$agent.txt"
    log_action "$agent" "Requesting human input: $request"
    set_status "$agent" "Waiting for human input"
    echo "$request" > "$REQUESTS_DIR/agent$agent.txt"
    # Poll once per second for the response file.
    until [ -f "$response_file" ]; do
        sleep 1
    done
    local response
    response=$(cat "$response_file")
    rm "$response_file"
    log_action "$agent" "Received human response: $response"
    set_status "$agent" "Processing human input"
    echo "$response"
}
|
| 179 |
+
|
| 180 |
+
# === SUBGRAPH FUNCTIONS ===
|
| 181 |
+
# Subgraph node 1: fetch Bengali sample text via the API and persist it.
function collect_data {
    set_status 1 "Generating Bengali text via API"
    local prompt="Generate a sample of Bengali text for language model training."
    local text
    # Assign separately from 'local' so $? reflects generate_text, not the
    # always-successful 'local' builtin (ShellCheck SC2155) — the original
    # could never detect an API failure here.
    text=$(generate_text "$prompt" "together")
    if [ $? -eq 0 ]; then
        set_status 1 "Saving data to file"
        echo "$text" > "$DATA_DIR/bengali_text.txt"
        if [ $? -ne 0 ]; then
            set_status 1 "Error: Failed to save data"
            log_action 1 "Failed to write to $DATA_DIR/bengali_text.txt"
            return 1
        fi
        log_action 1 "Data saved to $DATA_DIR/bengali_text.txt"
        set_memory "data_collected" "true"
    else
        set_status 1 "API error"
        log_action 1 "Failed to collect data due to API error"
        return 1
    fi
    set_status 1 "Data collection completed"
}
|
| 202 |
+
|
| 203 |
+
# Subgraph node 2: analyze the collected text, then gate on human review
# (approve / edit / reject).
function preprocess_data {
    set_status 2 "Waiting for data collection"
    while [ "$(get_memory 'data_collected')" != "true" ]; do
        sleep 1
    done
    set_status 2 "Analyzing data"
    if [ ! -f "$DATA_DIR/bengali_text.txt" ]; then
        set_status 2 "Error: Data file missing"
        log_action 2 "Error: No data file found at $DATA_DIR/bengali_text.txt"
        return 1
    fi
    local output
    # Declare before assigning so $? is run_python's status, not 'local''s
    # (ShellCheck SC2155).
    output=$(run_python "with open('$DATA_DIR/bengali_text.txt', 'r') as f: text = f.read(); print(f'Text length: {len(text)} characters')")
    if [ $? -ne 0 ]; then
        set_status 2 "Error: Analysis failed"
        log_action 2 "Preprocessing analysis failed"
        return 1
    fi
    log_action 2 "Analysis result: $output"
    set_status 2 "Awaiting human review"
    local response
    response=$(request_human_input 2 "Review the Bengali text in $DATA_DIR/bengali_text.txt (approve/reject/edit)")
    case "$response" in
        approve)
            set_status 2 "Saving preprocessed data"
            echo "Data preprocessed" > "$DATA_DIR/preprocessed_text.txt"
            log_action 2 "Preprocessing approved, saved to $DATA_DIR/preprocessed_text.txt"
            set_memory "data_preprocessed" "true"
            ;;
        edit)
            set_status 2 "Editing data"
            log_action 2 "Human requested edit; applying transformation"
            # The original one-liner chained a second 'with' after a
            # semicolon, which is a Python SyntaxError; pathlib keeps the
            # whole transform on one valid line.
            run_python "from pathlib import Path; text = Path('$DATA_DIR/bengali_text.txt').read_text(); Path('$DATA_DIR/preprocessed_text.txt').write_text(text.upper())"
            if [ $? -eq 0 ]; then
                set_memory "data_preprocessed" "true"
            else
                set_status 2 "Error: Edit failed"
                return 1
            fi
            ;;
        *)
            set_status 2 "Preprocessing rejected"
            log_action 2 "Preprocessing rejected by human"
            return 1
            ;;
    esac
}
|
| 248 |
+
|
| 249 |
+
# Subgraph node 3: simulate training and persist the model artifact.
function train_model {
    set_status 3 "Waiting for preprocessing"
    until [ "$(get_memory 'data_preprocessed')" = "true" ]; do
        sleep 1
    done
    set_status 3 "Training model"
    if [ ! -f "$DATA_DIR/preprocessed_text.txt" ]; then
        set_status 3 "Error: Preprocessed data missing"
        log_action 3 "Error: No preprocessed data found at $DATA_DIR/preprocessed_text.txt"
        return 1
    fi
    echo "Training Bengali model..."
    sleep 2 # Simulate training
    if ! echo "Model trained" > "$PROJECT_DIR/model.pt"; then
        set_status 3 "Error: Failed to save model"
        log_action 3 "Failed to save model to $PROJECT_DIR/model.pt"
        return 1
    fi
    log_action 3 "Model saved to $PROJECT_DIR/model.pt"
    set_memory "model_trained" "true"
    set_status 3 "Training completed"
}
|
| 272 |
+
|
| 273 |
+
function evaluate_model {
    # Stage 4: wait for a trained model, run a (simulated) evaluation, and
    # hand the result to a human reviewer for approve/fix/reject.
    set_status 4 "Waiting for model training"
    while [ "$(get_memory 'model_trained')" != "true" ]; do
        sleep 1
    done
    set_status 4 "Evaluating model"
    if [ ! -f "$PROJECT_DIR/model.pt" ]; then
        set_status 4 "Error: Model file missing"
        log_action 4 "Error: No model file found at $PROJECT_DIR/model.pt"
        return 1
    fi
    # FIX: declare and assign separately. With `local output=$(cmd)` the
    # subsequent `$?` reports the status of `local` (always 0), silently
    # masking a run_python failure.
    local output
    output=$(run_python "print('Simulated accuracy: 85%')")
    if [ $? -ne 0 ]; then
        set_status 4 "Error: Evaluation failed"
        log_action 4 "Evaluation failed"
        return 1
    fi
    set_status 4 "Awaiting human review"
    # Same declare/assign split so a request_human_input failure is not hidden.
    local response
    response=$(request_human_input 4 "Review model performance: $output (approve/reject/fix)")
    case "$response" in
        approve)
            log_action 4 "Evaluation approved"
            set_memory "evaluation_completed" "true"
            ;;
        fix)
            set_status 4 "Fixing model"
            log_action 4 "Human requested fix; simulating correction"
            echo "Fixed model" > "$PROJECT_DIR/model.pt"
            set_memory "evaluation_completed" "true"
            ;;
        *)
            # Any response other than approve/fix counts as a rejection.
            set_status 4 "Evaluation rejected"
            log_action 4 "Evaluation rejected by human"
            return 1
            ;;
    esac
    set_status 4 "Evaluation completed"
}
|
| 311 |
+
|
| 312 |
+
# === AGENT FUNCTIONS ===
function agent1 {
    # Stage 1 driver: gather raw data, retrying indefinitely until it succeeds.
    set_status 1 "Starting data collection"
    while ! collect_data; do
        set_status 1 "Retrying data collection"
        log_action 1 "Retrying data collection after failure"
        sleep 2
    done
    set_status 1 "Data collection completed"
    set_memory "agent1_completed" "true"
}
|
| 323 |
+
|
| 324 |
+
function agent2 {
    # Stage 2 driver: preprocess the collected data, retrying until it succeeds.
    set_status 2 "Starting preprocessing"
    while ! preprocess_data; do
        set_status 2 "Retrying preprocessing"
        log_action 2 "Retrying preprocessing after failure"
        sleep 2
    done
    set_status 2 "Preprocessing completed"
    set_memory "agent2_completed" "true"
}
|
| 334 |
+
|
| 335 |
+
function agent3 {
    # Stage 3 driver: train the model, retrying until it succeeds.
    set_status 3 "Starting training"
    while ! train_model; do
        set_status 3 "Retrying training"
        log_action 3 "Retrying training after failure"
        sleep 2
    done
    set_status 3 "Training completed"
    set_memory "agent3_completed" "true"
}
|
| 345 |
+
|
| 346 |
+
function agent4 {
    # Stage 4 driver: evaluate the model, retrying until it succeeds.
    set_status 4 "Starting evaluation"
    while ! evaluate_model; do
        set_status 4 "Retrying evaluation"
        log_action 4 "Retrying evaluation after failure"
        sleep 2
    done
    set_status 4 "Evaluation completed"
    set_memory "agent4_completed" "true"
}
|
| 356 |
+
|
| 357 |
+
# === STATUS DISPLAY ===
function display_status {
    # Render the live dashboard: a banner, one colored line per agent, and an
    # overall progress figure derived from the agentN_completed memory flags.
    echo -e "\033[1;34m┌─────────────────────── STATUS DASHBOARD ──────────────────────┐\033[0m"
    echo -e "\033[1;34m│ Bengali Language Model Generation - $(date +%H:%M:%S) │\033[0m"
    echo -e "\033[1;34m└───────────────────────────────────────────────────────────────┘\033[0m"
    local completed=0
    local agent status color
    for agent in 1 2 3 4; do
        status="Not started"
        # Prefer the on-disk status line when the agent has written one.
        [ -f "$STATUS_DIR/agent$agent.status" ] && status=$(cat "$STATUS_DIR/agent$agent.status")
        # A set completion flag overrides whatever the status file says.
        if [ "$(get_memory "agent${agent}_completed")" == "true" ]; then
            status="Completed"
            completed=$((completed + 1))
        fi
        case $agent in
            1) color="\033[1;32m" ;; # Green
            2) color="\033[1;33m" ;; # Yellow
            3) color="\033[1;34m" ;; # Blue
            4) color="\033[1;35m" ;; # Magenta
        esac
        echo -e "${color}Agent $agent: $status\033[0m"
    done
    local progress=$((completed * 25)) # 25% per agent
    echo -e "\033[1;36mProgress: [$completed/4] ${progress}%\033[0m"
}
|
| 383 |
+
|
| 384 |
+
# === HUMAN-IN-THE-LOOP HANDLER ===
function hil_handler {
    # Foreground loop: refresh the dashboard, relay human input to agents via
    # the request/response files, and return once every agent has finished.
    local req_file agent_id request human_input
    while true; do
        clear
        display_status
        if [ "$(get_memory 'agent1_completed')" == "true" ] && \
           [ "$(get_memory 'agent2_completed')" == "true" ] && \
           [ "$(get_memory 'agent3_completed')" == "true" ] && \
           [ "$(get_memory 'agent4_completed')" == "true" ]; then
            log_action "HIL" "All agents completed successfully"
            echo -e "\033[1;32m✓ All agents completed! Model generation successful.\033[0m"
            break
        fi
        # Service every pending request file, prompting the operator for each.
        for req_file in "$REQUESTS_DIR"/*; do
            [ -f "$req_file" ] || continue  # skips the literal glob when dir is empty
            agent_id=$(basename "$req_file" .txt | sed 's/agent//')
            request=$(cat "$req_file")
            echo -e "\n\033[1;33mAgent $agent_id requests your input:\033[0m $request"
            echo -e "\033[1;33mEnter response (e.g., approve/reject/edit/fix):\033[0m"
            read -r human_input
            # Leave the request file in place on empty input so the operator
            # is re-prompted on the next pass.
            if [ -z "$human_input" ]; then
                echo -e "\033[1;31mError: Input cannot be empty. Try again.\033[0m"
                continue
            fi
            echo "$human_input" > "$RESPONSES_DIR/agent$agent_id.txt"
            rm "$req_file"
        done
        sleep 1
    done
}
|
| 415 |
+
|
| 416 |
+
# === CLEANUP ON EXIT ===
function cleanup {
    # Signal handler (SIGINT/SIGTERM): discard any pending human-in-the-loop
    # request/response files, record the interruption, and exit non-zero.
    echo -e "\033[1;31mScript interrupted. Cleaning up...\033[0m"
    rm -f "$REQUESTS_DIR"/* "$RESPONSES_DIR"/* 2>/dev/null
    log_action "Main" "Script terminated by user"
    exit 1
}
trap cleanup INT TERM
|
| 424 |
+
|
| 425 |
+
# === MAIN EXECUTION ===
echo -e "\033[1;32mStarting Bengali language model generation...\033[0m"
log_action "Main" "Script execution started"

# Launch the four pipeline agents concurrently; they coordinate through the
# shared memory flags and block on one another's completion markers.
agent1 &
agent2 &
agent3 &
agent4 &

# The human-in-the-loop handler runs in the foreground and returns once all
# four completion flags are set.
hil_handler

# FIX: reap the background agents so the script does not exit while a child
# is still flushing its final status/log writes (the original never waited).
wait

echo -e "\033[1;32mProcess completed successfully!\033[0m"
echo "Model saved at: $PROJECT_DIR/model.pt"
echo "Detailed logs available at: $LOG_FILE"
|
tokenizer.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21bff87aabfb69a9aafc1c1c6d1b60bbf3138e2e2b7545924f62b3c5b3c3d587
|
| 3 |
+
size 16
|
tokenizer_config.json
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_bos_token": false,
|
| 3 |
+
"add_prefix_space": false,
|
| 4 |
+
"added_tokens_decoder": {
|
| 5 |
+
"151643": {
|
| 6 |
+
"content": "<|endoftext|>",
|
| 7 |
+
"lstrip": false,
|
| 8 |
+
"normalized": false,
|
| 9 |
+
"rstrip": false,
|
| 10 |
+
"single_word": false,
|
| 11 |
+
"special": true
|
| 12 |
+
},
|
| 13 |
+
"151644": {
|
| 14 |
+
"content": "<|im_start|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": false,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false,
|
| 19 |
+
"special": true
|
| 20 |
+
},
|
| 21 |
+
"151645": {
|
| 22 |
+
"content": "<|im_end|>",
|
| 23 |
+
"lstrip": false,
|
| 24 |
+
"normalized": false,
|
| 25 |
+
"rstrip": false,
|
| 26 |
+
"single_word": false,
|
| 27 |
+
"special": true
|
| 28 |
+
},
|
| 29 |
+
"151646": {
|
| 30 |
+
"content": "<|object_ref_start|>",
|
| 31 |
+
"lstrip": false,
|
| 32 |
+
"normalized": false,
|
| 33 |
+
"rstrip": false,
|
| 34 |
+
"single_word": false,
|
| 35 |
+
"special": true
|
| 36 |
+
},
|
| 37 |
+
"151647": {
|
| 38 |
+
"content": "<|object_ref_end|>",
|
| 39 |
+
"lstrip": false,
|
| 40 |
+
"normalized": false,
|
| 41 |
+
"rstrip": false,
|
| 42 |
+
"single_word": false,
|
| 43 |
+
"special": true
|
| 44 |
+
},
|
| 45 |
+
"151648": {
|
| 46 |
+
"content": "<|box_start|>",
|
| 47 |
+
"lstrip": false,
|
| 48 |
+
"normalized": false,
|
| 49 |
+
"rstrip": false,
|
| 50 |
+
"single_word": false,
|
| 51 |
+
"special": true
|
| 52 |
+
},
|
| 53 |
+
"151649": {
|
| 54 |
+
"content": "<|box_end|>",
|
| 55 |
+
"lstrip": false,
|
| 56 |
+
"normalized": false,
|
| 57 |
+
"rstrip": false,
|
| 58 |
+
"single_word": false,
|
| 59 |
+
"special": true
|
| 60 |
+
},
|
| 61 |
+
"151650": {
|
| 62 |
+
"content": "<|quad_start|>",
|
| 63 |
+
"lstrip": false,
|
| 64 |
+
"normalized": false,
|
| 65 |
+
"rstrip": false,
|
| 66 |
+
"single_word": false,
|
| 67 |
+
"special": true
|
| 68 |
+
},
|
| 69 |
+
"151651": {
|
| 70 |
+
"content": "<|quad_end|>",
|
| 71 |
+
"lstrip": false,
|
| 72 |
+
"normalized": false,
|
| 73 |
+
"rstrip": false,
|
| 74 |
+
"single_word": false,
|
| 75 |
+
"special": true
|
| 76 |
+
},
|
| 77 |
+
"151652": {
|
| 78 |
+
"content": "<|vision_start|>",
|
| 79 |
+
"lstrip": false,
|
| 80 |
+
"normalized": false,
|
| 81 |
+
"rstrip": false,
|
| 82 |
+
"single_word": false,
|
| 83 |
+
"special": true
|
| 84 |
+
},
|
| 85 |
+
"151653": {
|
| 86 |
+
"content": "<|vision_end|>",
|
| 87 |
+
"lstrip": false,
|
| 88 |
+
"normalized": false,
|
| 89 |
+
"rstrip": false,
|
| 90 |
+
"single_word": false,
|
| 91 |
+
"special": true
|
| 92 |
+
},
|
| 93 |
+
"151654": {
|
| 94 |
+
"content": "<|vision_pad|>",
|
| 95 |
+
"lstrip": false,
|
| 96 |
+
"normalized": false,
|
| 97 |
+
"rstrip": false,
|
| 98 |
+
"single_word": false,
|
| 99 |
+
"special": true
|
| 100 |
+
},
|
| 101 |
+
"151655": {
|
| 102 |
+
"content": "<|image_pad|>",
|
| 103 |
+
"lstrip": false,
|
| 104 |
+
"normalized": false,
|
| 105 |
+
"rstrip": false,
|
| 106 |
+
"single_word": false,
|
| 107 |
+
"special": true
|
| 108 |
+
},
|
| 109 |
+
"151656": {
|
| 110 |
+
"content": "<|video_pad|>",
|
| 111 |
+
"lstrip": false,
|
| 112 |
+
"normalized": false,
|
| 113 |
+
"rstrip": false,
|
| 114 |
+
"single_word": false,
|
| 115 |
+
"special": true
|
| 116 |
+
},
|
| 117 |
+
"151657": {
|
| 118 |
+
"content": "<tool_call>",
|
| 119 |
+
"lstrip": false,
|
| 120 |
+
"normalized": false,
|
| 121 |
+
"rstrip": false,
|
| 122 |
+
"single_word": false,
|
| 123 |
+
"special": false
|
| 124 |
+
},
|
| 125 |
+
"151658": {
|
| 126 |
+
"content": "</tool_call>",
|
| 127 |
+
"lstrip": false,
|
| 128 |
+
"normalized": false,
|
| 129 |
+
"rstrip": false,
|
| 130 |
+
"single_word": false,
|
| 131 |
+
"special": false
|
| 132 |
+
},
|
| 133 |
+
"151659": {
|
| 134 |
+
"content": "<|fim_prefix|>",
|
| 135 |
+
"lstrip": false,
|
| 136 |
+
"normalized": false,
|
| 137 |
+
"rstrip": false,
|
| 138 |
+
"single_word": false,
|
| 139 |
+
"special": false
|
| 140 |
+
},
|
| 141 |
+
"151660": {
|
| 142 |
+
"content": "<|fim_middle|>",
|
| 143 |
+
"lstrip": false,
|
| 144 |
+
"normalized": false,
|
| 145 |
+
"rstrip": false,
|
| 146 |
+
"single_word": false,
|
| 147 |
+
"special": false
|
| 148 |
+
},
|
| 149 |
+
"151661": {
|
| 150 |
+
"content": "<|fim_suffix|>",
|
| 151 |
+
"lstrip": false,
|
| 152 |
+
"normalized": false,
|
| 153 |
+
"rstrip": false,
|
| 154 |
+
"single_word": false,
|
| 155 |
+
"special": false
|
| 156 |
+
},
|
| 157 |
+
"151662": {
|
| 158 |
+
"content": "<|fim_pad|>",
|
| 159 |
+
"lstrip": false,
|
| 160 |
+
"normalized": false,
|
| 161 |
+
"rstrip": false,
|
| 162 |
+
"single_word": false,
|
| 163 |
+
"special": false
|
| 164 |
+
},
|
| 165 |
+
"151663": {
|
| 166 |
+
"content": "<|repo_name|>",
|
| 167 |
+
"lstrip": false,
|
| 168 |
+
"normalized": false,
|
| 169 |
+
"rstrip": false,
|
| 170 |
+
"single_word": false,
|
| 171 |
+
"special": false
|
| 172 |
+
},
|
| 173 |
+
"151664": {
|
| 174 |
+
"content": "<|file_sep|>",
|
| 175 |
+
"lstrip": false,
|
| 176 |
+
"normalized": false,
|
| 177 |
+
"rstrip": false,
|
| 178 |
+
"single_word": false,
|
| 179 |
+
"special": false
|
| 180 |
+
}
|
| 181 |
+
},
|
| 182 |
+
"additional_special_tokens": [
|
| 183 |
+
"<|im_start|>",
|
| 184 |
+
"<|im_end|>",
|
| 185 |
+
"<|object_ref_start|>",
|
| 186 |
+
"<|object_ref_end|>",
|
| 187 |
+
"<|box_start|>",
|
| 188 |
+
"<|box_end|>",
|
| 189 |
+
"<|quad_start|>",
|
| 190 |
+
"<|quad_end|>",
|
| 191 |
+
"<|vision_start|>",
|
| 192 |
+
"<|vision_end|>",
|
| 193 |
+
"<|vision_pad|>",
|
| 194 |
+
"<|image_pad|>",
|
| 195 |
+
"<|video_pad|>"
|
| 196 |
+
],
|
| 197 |
+
"bos_token": null,
|
| 198 |
+
"chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. 
The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
|
| 199 |
+
"clean_up_tokenization_spaces": false,
|
| 200 |
+
"eos_token": "<|im_end|>",
|
| 201 |
+
"errors": "replace",
|
| 202 |
+
"model_max_length": 131072,
|
| 203 |
+
"pad_token": "<|endoftext|>",
|
| 204 |
+
"padding_side": "right",
|
| 205 |
+
"split_special_tokens": false,
|
| 206 |
+
"tokenizer_class": "Qwen2Tokenizer",
|
| 207 |
+
"unk_token": null
|
| 208 |
+
}
|
tools.json
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"name": "Project Management Tool",
|
| 3 |
+
"description": "A tool for managing project context, automating tasks, and facilitating human-in-the-loop interactions.",
|
| 4 |
+
"version": "1.0.0",
|
| 5 |
+
"author": "Your Name",
|
| 6 |
+
"license": "MIT",
|
| 7 |
+
"configurations": {
|
| 8 |
+
"projectDirectory": "/path/to/your/project",
|
| 9 |
+
"logFile": "/path/to/your/logs/actions.log",
|
| 10 |
+
"memoryFile": "/path/to/your/memory.txt",
|
| 11 |
+
"requestsDirectory": "/path/to/your/requests",
|
| 12 |
+
"responsesDirectory": "/path/to/your/responses",
|
| 13 |
+
"dataDirectory": "/path/to/your/data",
|
| 14 |
+
"statusDirectory": "/path/to/your/status"
|
| 15 |
+
},
|
| 16 |
+
"apiKeys": {
|
| 17 |
+
"togetherAPI": "your_together_api_key",
|
| 18 |
+
"cohereAPI": "your_cohere_api_key",
|
| 19 |
+
"geminiAPI": "your_gemini_api_key"
|
| 20 |
+
},
|
| 21 |
+
"features": {
|
| 22 |
+
"automaticContextGathering": {
|
| 23 |
+
"description": "Automatically reads related files, explores project structure, analyzes patterns, and maps dependencies.",
|
| 24 |
+
"enabled": true
|
| 25 |
+
},
|
| 26 |
+
"humanInTheLoop": {
|
| 27 |
+
"description": "Facilitates human input for reviewing and approving tasks.",
|
| 28 |
+
"enabled": true
|
| 29 |
+
},
|
| 30 |
+
"subgraphSupport": {
|
| 31 |
+
"description": "Encapsulates tasks as reusable nodes for better project management.",
|
| 32 |
+
"enabled": true
|
| 33 |
+
},
|
| 34 |
+
"memoryManagement": {
|
| 35 |
+
"description": "Persists state across agent interactions using a key-value store.",
|
| 36 |
+
"enabled": true
|
| 37 |
+
},
|
| 38 |
+
"apiIntegrations": {
|
| 39 |
+
"description": "Integrates with Together, Cohere, and Gemini APIs for text generation.",
|
| 40 |
+
"enabled": true
|
| 41 |
+
},
|
| 42 |
+
"fileOperations": {
|
| 43 |
+
"description": "Creates, edits, and validates files with error checking.",
|
| 44 |
+
"enabled": true
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"agents": [
|
| 48 |
+
{
|
| 49 |
+
"id": 1,
|
| 50 |
+
"name": "Data Collection Agent",
|
| 51 |
+
"description": "Collects and saves data for the project.",
|
| 52 |
+
"tasks": ["collect_data"]
|
| 53 |
+
},
|
| 54 |
+
{
|
| 55 |
+
"id": 2,
|
| 56 |
+
"name": "Data Preprocessing Agent",
|
| 57 |
+
"description": "Preprocesses the collected data.",
|
| 58 |
+
"tasks": ["preprocess_data"]
|
| 59 |
+
},
|
| 60 |
+
{
|
| 61 |
+
"id": 3,
|
| 62 |
+
"name": "Model Training Agent",
|
| 63 |
+
"description": "Trains the model using the preprocessed data.",
|
| 64 |
+
"tasks": ["train_model"]
|
| 65 |
+
},
|
| 66 |
+
{
|
| 67 |
+
"id": 4,
|
| 68 |
+
"name": "Model Evaluation Agent",
|
| 69 |
+
"description": "Evaluates the trained model.",
|
| 70 |
+
"tasks": ["evaluate_model"]
|
| 71 |
+
}
|
| 72 |
+
],
|
| 73 |
+
"tasks": {
|
| 74 |
+
"collect_data": {
|
| 75 |
+
"description": "Generates and saves Bengali text via API.",
|
| 76 |
+
"script": "collect_data.sh"
|
| 77 |
+
},
|
| 78 |
+
"preprocess_data": {
|
| 79 |
+
"description": "Analyzes and preprocesses the collected data.",
|
| 80 |
+
"script": "preprocess_data.sh"
|
| 81 |
+
},
|
| 82 |
+
"train_model": {
|
| 83 |
+
"description": "Trains the model using the preprocessed data.",
|
| 84 |
+
"script": "train_model.sh"
|
| 85 |
+
},
|
| 86 |
+
"evaluate_model": {
|
| 87 |
+
"description": "Evaluates the trained model.",
|
| 88 |
+
"script": "evaluate_model.sh"
|
| 89 |
+
}
|
| 90 |
+
},
|
| 91 |
+
"scripts": {
|
| 92 |
+
"collect_data.sh": "path/to/collect_data.sh",
|
| 93 |
+
"preprocess_data.sh": "path/to/preprocess_data.sh",
|
| 94 |
+
"train_model.sh": "path/to/train_model.sh",
|
| 95 |
+
"evaluate_model.sh": "path/to/evaluate_model.sh"
|
| 96 |
+
}
|
| 97 |
+
}
|
train.py
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
def train(epochs, delay=2.0):
    """Run a simulated training loop, printing per-epoch progress.

    Args:
        epochs: Number of epochs to simulate; 0 or negative does nothing.
        delay: Seconds to sleep per epoch to simulate work. Defaults to 2.0,
            matching the previously hard-coded pause; pass 0 for fast runs.
    """
    for epoch in range(epochs):
        print("Epoch %d/%d training..." % (epoch + 1, epochs))
        time.sleep(delay)  # simulate the cost of one training epoch
        # Reported accuracy grows linearly from 0.80 by 0.01 per epoch.
        print("Epoch %d complete, accuracy: %.2f" % (epoch + 1, 0.8 + epoch * 0.01))
|
| 9 |
+
|
| 10 |
+
if __name__ == "__main__":
    # CLI entry point: parse the epoch count and launch the simulated run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--epochs", type=int, default=1)
    train(cli.parse_args().epochs)
|