likhonsheikhdev committed on
Commit
cce70aa
·
verified ·
1 Parent(s): ca40970

Upload 28 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,43 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ assets/ball.gif filter=lfs diff=lfs merge=lfs -text
38
+ assets/benchmark.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/count.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/diamond.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/param-aime2024.jpeg filter=lfs diff=lfs merge=lfs -text
42
+ assets/param-lcb.jpeg filter=lfs diff=lfs merge=lfs -text
43
+ assets/writing.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/train_model.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Train Bengali-Code LLM Model
2
+
3
+ on:
4
+ schedule:
5
+ - cron: '0 0 * * *' # Run daily at midnight
6
+ workflow_dispatch: # Allow manual triggers
7
+
8
+ jobs:
9
+ train:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: '3.10'
18
+
19
+ - name: Install dependencies
20
+ run: |
21
+ python -m pip install --upgrade pip
22
+ pip install transformers datasets sentencepiece accelerate torch wandb
23
+
24
+ - name: Data Collection
25
+ run: python scripts/data_collector.py
26
+ env:
27
+ HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
28
+
29
+ - name: Train Tokenizer
30
+ run: python scripts/tokenizer_trainer.py
31
+
32
+ - name: Train Model
33
+ run: python scripts/model_trainer.py
34
+ env:
35
+ WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
36
+
37
+ - name: Evaluate Model
38
+ run: python scripts/model_evaluator.py
39
+
40
+ - name: Upload Model Artifacts
41
+ uses: actions/upload-artifact@v3
42
+ with:
43
+ name: model-weights
44
+ path: outputs/models/
.vscode/settings.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.inlineSuggest.enabled": true,
3
+ "editor.quickSuggestions": {
4
+ "other": "inline",
5
+ "comments": true,
6
+ "strings": true
7
+ },
8
+ "editor.quickSuggestionsDelay": 100
9
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Bengali-Code LLM Project Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bengali-Code LLM Training Pipeline
2
+
3
+ A comprehensive pipeline for training a Bengali language model specialized in code understanding and generation. The model is fine-tuned on Bengali programming tutorials, documentation, and code examples.
4
+
5
+ ## 🌟 Features
6
+
7
+ - Automated data collection from Bengali Wikipedia and Prothom Alo
8
+ - Custom tokenizer training with SentencePiece for Bengali text and code
9
+ - Model fine-tuning using TinyLlama base model
10
+ - Comprehensive evaluation suite for Bengali code generation
11
+ - GitHub Actions workflow for automated training
12
+ - Weights & Biases integration for experiment tracking
13
+
14
+ ## 📋 Requirements
15
+
16
+ - Python 3.10 or higher
17
+ - CUDA-capable GPU (recommended)
18
+ - 16GB+ RAM
19
+ - Internet connection for data collection
20
+
21
+ ## 🚀 Quick Start
22
+
23
+ 1. Clone the repository:
24
+ ```bash
25
+ git clone https://github.com/yourusername/bengali-code-llm.git
26
+ cd bengali-code-llm
27
+ ```
28
+
29
+ 2. Install dependencies:
30
+ ```bash
31
+ pip install -r requirements.txt
32
+ ```
33
+
34
+ 3. Set up environment variables:
35
+ ```bash
36
+ export HUGGINGFACE_TOKEN="your_token_here"
37
+ export WANDB_API_KEY="your_wandb_key_here"
38
+ ```
39
+
40
+ 4. Run the complete pipeline:
41
+ ```bash
42
+ # Collect data
43
+ python scripts/data_collector.py
44
+
45
+ # Train tokenizer
46
+ python scripts/tokenizer_trainer.py
47
+
48
+ # Train model
49
+ python scripts/model_trainer.py
50
+
51
+ # Evaluate model
52
+ python scripts/model_evaluator.py
53
+ ```
54
+
55
+ ## 🏗️ Pipeline Components
56
+
57
+ ### Data Collection (`scripts/data_collector.py`)
58
+ - Scrapes Bengali text from Wikipedia and Prothom Alo
59
+ - Implements rate limiting and error handling
60
+ - Outputs processed data in JSON format
61
+
62
+ ### Tokenizer Training (`scripts/tokenizer_trainer.py`)
63
+ - Uses SentencePiece for tokenizer training
64
+ - Custom vocabulary with Bengali and code tokens
65
+ - Generates HuggingFace-compatible tokenizer files
66
+
67
+ ### Model Training (`scripts/model_trainer.py`)
68
+ - Fine-tunes TinyLlama model
69
+ - Implements efficient training with gradient accumulation
70
+ - Supports mixed precision training
71
+ - Integrates with Weights & Biases for tracking
72
+
73
+ ### Model Evaluation (`scripts/model_evaluator.py`)
74
+ - Comprehensive evaluation suite
75
+ - Tests code generation capabilities
76
+ - Measures BLEU and ROUGE scores
77
+ - Generates detailed evaluation reports
78
+
79
+ ## 📊 Training Metrics
80
+
81
+ The training progress can be monitored through Weights & Biases:
82
+ - Loss curves
83
+ - Evaluation metrics
84
+ - Generated samples
85
+ - Resource utilization
86
+
87
+ ## 🔄 GitHub Actions Workflow
88
+
89
+ The repository includes an automated training pipeline that:
90
+ - Runs daily to incorporate new data
91
+ - Executes the complete training pipeline
92
+ - Uploads model artifacts
93
+ - Can be triggered manually
94
+
95
+ ## 📁 Directory Structure
96
+
97
+ ```
98
+ bengali-code-llm/
99
+ ├── .github/
100
+ │ └── workflows/
101
+ │ └── train_model.yml
102
+ ├── scripts/
103
+ │ ├── data_collector.py
104
+ │ ├── tokenizer_trainer.py
105
+ │ ├── model_trainer.py
106
+ │ └── model_evaluator.py
107
+ ├── data/
108
+ │ └── raw/
109
+ ├── outputs/
110
+ │ ├── tokenizer/
111
+ │ ├── model/
112
+ │ └── evaluation/
113
+ ├── requirements.txt
114
+ └── README.md
115
+ ```
116
+
117
+ ## 🎯 Model Performance
118
+
119
+ The model is evaluated on various tasks:
120
+ - Code generation in Bengali
121
+ - Code explanation and documentation
122
+ - Error detection and correction
123
+ - Algorithm explanation
124
+
125
+ ## 📜 License
126
+
127
+ This project is licensed under the MIT License - see the LICENSE file for details.
128
+
129
+ ## 🤝 Contributing
130
+
131
+ Contributions are welcome! Please feel free to submit issues and pull requests.
132
+
133
+ ## 📧 Contact
134
+
135
+ For questions and feedback, please open an issue in the repository.
136
+
137
+ ## 🙏 Acknowledgments
138
+
139
+ - TinyLlama team for the base model
140
+ - HuggingFace for the Transformers library
141
+ - Weights & Biases for experiment tracking
agent.ps1 ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $API_KEY = $env:GROQ_API_KEY  # SECURITY: never commit API keys; the previously hard-coded key is leaked and must be revoked
3
+ $MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
4
+ $AGENT_COUNT = 2
5
+
6
+ $WORKDIR = Join-Path $PSScriptRoot "ai-agent"
7
+ $LOGDIR = Join-Path $WORKDIR "outputs\logs"
8
+ $PROMPT_FILE = Join-Path $WORKDIR "system_prompt.mdx"
9
+ $TASK_FILE = Join-Path $WORKDIR "task_context.md"
10
+
11
+ # Create directories
12
+ New-Item -ItemType Directory -Force -Path $LOGDIR | Out-Null
13
+ New-Item -ItemType Directory -Force -Path $WORKDIR | Out-Null
14
+
15
+ # Initialize prompt file if missing
16
+ if (-not (Test-Path $PROMPT_FILE)) {
17
+ $initialPrompt = '<Plan>' + [Environment]::NewLine
18
+ $initialPrompt += 'You are AI coding agents focused on building a Bengali code + NLP LLM.' + [Environment]::NewLine
19
+ $initialPrompt += 'Output commands inside <Actions> blocks, analyses inside <Task> blocks.' + [Environment]::NewLine
20
+ $initialPrompt += 'After command execution, output results inside <TaskResult> blocks.' + [Environment]::NewLine
21
+ $initialPrompt += '</Plan>' + [Environment]::NewLine + [Environment]::NewLine
22
+ $initialPrompt += '<Actions>' + [Environment]::NewLine
23
+ $initialPrompt += 'echo "Starting initial training setup..."' + [Environment]::NewLine
24
+ $initialPrompt += '# Dummy start command for training' + [Environment]::NewLine
25
+ $initialPrompt += 'echo "Training started."' + [Environment]::NewLine
26
+ $initialPrompt += '</Actions>' + [Environment]::NewLine + [Environment]::NewLine
27
+ $initialPrompt += '<Task>' + [Environment]::NewLine
28
+ $initialPrompt += 'Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.' + [Environment]::NewLine
29
+ $initialPrompt += '</Task>'
30
+
31
+ Set-Content -Path $PROMPT_FILE -Value $initialPrompt
32
+ }
33
+
34
+ # Initialize task file if missing
35
+ if (-not (Test-Path $TASK_FILE)) {
36
+ "" | Set-Content $TASK_FILE
37
+ }
38
+
39
+ # Copy training script if missing
40
+ $TRAIN_SCRIPT = Join-Path $WORKDIR "train.py"
41
+ if (-not (Test-Path $TRAIN_SCRIPT)) {
42
+ Copy-Item -Path (Join-Path $PSScriptRoot "train.py") -Destination $TRAIN_SCRIPT
43
+ }
44
+
45
+ # Function to call Groq API with streaming
46
+ function Invoke-GroqAPI {
47
+ param (
48
+ [string]$Prompt,
49
+ [string]$AgentId
50
+ )
51
+
52
+ $headers = @{
53
+ "Authorization" = "Bearer " + $API_KEY
54
+ "Content-Type" = "application/json"
55
+ }
56
+
57
+ $body = @{
58
+ model = $MODEL
59
+ messages = @(
60
+ @{
61
+ role = "system"
62
+ content = $Prompt
63
+ }
64
+ )
65
+ temperature = 1
66
+ max_completion_tokens = 1024
67
+ top_p = 1
68
+ stream = $false  # Invoke-RestMethod cannot consume SSE streams; request one complete response
69
+ } | ConvertTo-Json
70
+
71
+ try {
72
+ $apiUrl = "https://api.groq.com/openai/v1/chat/completions"
73
+ $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"
74
+
75
+ # Non-streaming responses carry the text in message.content (delta exists only in SSE chunks)
76
+ $fullResponse = ""
77
+ foreach ($chunk in $response.choices[0].message.content) {
78
+ if ($null -ne $chunk) {
79
+ $fullResponse += $chunk
80
+ Write-Host ("🤖 Agent " + $AgentId + ": " + $chunk) -NoNewline
81
+ }
82
+ }
83
+ Write-Host ""
84
+ return $fullResponse
85
+ }
86
+ catch {
87
+ Write-Host "❌ Error calling Groq API: $_" -ForegroundColor Red
88
+ return $null
89
+ }
90
+ }
91
+
92
+ # Function to extract and run actions
93
+ function Invoke-Actions {
94
+ param (
95
+ [string]$Response,
96
+ [string]$AgentId
97
+ )
98
+
99
+ if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
100
+ $actions = $matches[1].Trim()
101
+ if ($actions) {
102
+ Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
103
+ $actionScriptName = "run_actions_" + $AgentId + ".ps1"
104
+ $actionScript = Join-Path $WORKDIR $actionScriptName
105
+ $actions | Set-Content $actionScript
106
+
107
+ $logFileName = "actions_agent_" + $AgentId + ".log"
108
+ $logFile = Join-Path $LOGDIR $logFileName
109
+ & $actionScript *>&1 | Tee-Object -Path $logFile
110
+ }
111
+ }
112
+ else {
113
+ Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
114
+ $logFileName = "actions_agent_" + $AgentId + ".log"
115
+ "" | Set-Content (Join-Path $LOGDIR $logFileName)
116
+ }
117
+ }
118
+
119
+ # Function to append task result
120
+ function Add-TaskResult {
121
+ param (
122
+ [string]$AgentId
123
+ )
124
+
125
+ $logFileName = "actions_agent_" + $AgentId + ".log"
126
+ $logFile = Join-Path $LOGDIR $logFileName
127
+ if (Test-Path $logFile) {
128
+ $result = Get-Content $logFile -Tail 50 | Out-String
129
+ $taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
130
+ $taskResult += $result
131
+ $taskResult += '</TaskResult>'
132
+
133
+ Add-Content -Path $TASK_FILE -Value $taskResult
134
+ Write-Host ("✍️ Agent " + $AgentId + " appended <TaskResult>.")
135
+ }
136
+ }
137
+
138
+ # Main loop with multi-agent coordination
139
+ Write-Host "🚀 Starting multi-agent AI loop with $AGENT_COUNT agents..."
140
+
141
+ $stopLoop = $false
142
+ while (-not $stopLoop) {
143
+ $promptCombined = Get-Content $PROMPT_FILE, $TASK_FILE | Out-String
144
+
145
+ # Create array to hold jobs
146
+ $jobs = @()
147
+
148
+ # Start agents in parallel
149
+ 1..$AGENT_COUNT | ForEach-Object {
150
+ $agentId = $_
151
+ $workdir = $WORKDIR
152
+ $logdir = $LOGDIR
153
+ $apiKey = $API_KEY
154
+ $model = $MODEL
155
+
156
+ $jobs += Start-Job -ScriptBlock {
157
+ param($promptCombined, $agentId, $workdir, $logdir, $apiKey, $model)
158
+
159
+ # Recreate functions in job scope
160
+ function Invoke-GroqAPI {
161
+ param($Prompt, $AgentId)
162
+ $headers = @{
163
+ "Authorization" = "Bearer " + $apiKey
164
+ "Content-Type" = "application/json"
165
+ }
166
+
167
+ $body = @{
168
+ model = $model
169
+ messages = @(
170
+ @{
171
+ role = "system"
172
+ content = $Prompt
173
+ }
174
+ )
175
+ temperature = 1
176
+ max_completion_tokens = 1024
177
+ top_p = 1
178
+ stream = $false  # Invoke-RestMethod cannot consume SSE streams; request one complete response
179
+ } | ConvertTo-Json
180
+
181
+ try {
182
+ # SECURITY: removed hosts-file rewrite that pinned api.groq.com to hard-coded IP 104.198.40.119
183
+ # via System32\drivers\etc\hosts — bypassing DNS invites MITM and requires admin rights.
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+ # Configure TLS
193
+ [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
194
+ # SECURITY: removed ServerCertificateValidationCallback = {$true}; never disable certificate validation
195
+
196
+ # Make request
197
+ $headers = @{
198
+ "Authorization" = "Bearer $apiKey"
199
+ "Content-Type" = "application/json"
200
+ }
201
+
202
+
203
+ $apiUrl = "https://api.groq.com/openai/v1/chat/completions"  # call the real endpoint directly
204
+ $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"
205
+
206
+ $fullResponse = ""
207
+ foreach ($chunk in $response.choices[0].message.content) {  # non-streaming: message.content, not delta
208
+ if ($null -ne $chunk) {
209
+ $fullResponse += $chunk
210
+ Write-Host ("🤖 Agent " + $AgentId + ": " + $chunk) -NoNewline
211
+ }
212
+ }
213
+ Write-Host ""
214
+ return $fullResponse
215
+ }
216
+ catch {
217
+ Write-Host ("❌ Error calling Groq API: " + $_.Exception.Message) -ForegroundColor Red
218
+ # Stop the loop on API errors
219
+ return "<Done>"
220
+ }
221
+ }
222
+
223
+ function Invoke-Actions {
224
+ param($Response, $AgentId)
225
+ if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
226
+ $actions = $matches[1].Trim()
227
+ if ($actions) {
228
+ Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
229
+ $actionScriptName = "run_actions_" + $AgentId + ".ps1"
230
+ $actionScript = Join-Path $workdir $actionScriptName
231
+ $actions | Set-Content $actionScript
232
+
233
+ $logFileName = "actions_agent_" + $AgentId + ".log"
234
+ $logFile = Join-Path $logdir $logFileName
235
+ & $actionScript *>&1 | Tee-Object -Path $logFile
236
+ }
237
+ }
238
+ else {
239
+ Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
240
+ $logFileName = "actions_agent_" + $AgentId + ".log"
241
+ "" | Set-Content (Join-Path $logdir $logFileName)
242
+ }
243
+ }
244
+
245
+ function Add-TaskResult {
246
+ param($AgentId)
247
+ $logFile = Join-Path $logdir ('actions_agent_' + $AgentId + '.log')
248
+ if (Test-Path $logFile) {
249
+ $result = Get-Content $logFile -Tail 50 | Out-String
250
+ $taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
251
+ $taskResult += $result
252
+ $taskResult += '</TaskResult>'
253
+
254
+ Add-Content -Path (Join-Path $workdir 'task_context.md') -Value $taskResult
255
+ Write-Host ('✍️ Agent ' + $AgentId + ' appended <TaskResult>.')
256
+ }
257
+ }
258
+
259
+ Write-Host ("🤖 Agent " + $agentId + " sending prompt to Groq API...")
260
+ $response = Invoke-GroqAPI -Prompt $promptCombined -AgentId $agentId
261
+
262
+ if ($response) {
263
+ $responseFileName = "agent_" + $agentId + "_response.txt"
264
+ $response | Set-Content (Join-Path $logdir $responseFileName)
265
+
266
+ Invoke-Actions -Response $response -AgentId $agentId
267
+ Add-TaskResult -AgentId $agentId
268
+
269
+ # Check for completion
270
+ if ($response -match '<Done>') {
271
+ Write-Host ("✅ Agent " + $agentId + " indicated completion.")
272
+ return $true
273
+ }
274
+ }
275
+ return $false
276
+ } -ArgumentList $promptCombined, $agentId, $workdir, $logdir, $apiKey, $model
277
+ }
278
+
279
+ # Wait for all jobs and get results
280
+ $results = $jobs | Wait-Job | Receive-Job
281
+ $jobs | Remove-Job
282
+
283
+ # Check if any agent indicated completion
284
+ if ($results -contains $true) {
285
+ Write-Host "🚀 Stopping AI loop as <Done> was detected."
286
+ $stopLoop = $true
287
+ }
288
+
289
+ Start-Sleep -Seconds 2
290
+ }
291
+
292
+ Write-Host "🎉 All agents completed."
agent.sh ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Configuration
4
+ PROJECT_DIR="$HOME/bd-model-generations"
5
+ STATUS_DIR="$PROJECT_DIR/status"
6
+ LOG_FILE="$PROJECT_DIR/logs/actions.log"
7
+
8
+ # Ensure directories exist
9
+ mkdir -p "$STATUS_DIR" "$PROJECT_DIR/logs"
10
+
11
+ # Log function for errors
12
+ log_error() {
13
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >> "$LOG_FILE"
14
+ }
15
+
16
+ # Live status display function
17
+ display_status() {
18
+ while true; do
19
+ clear
20
+ echo -e "\033[1;34m=== Live Agent Status (Bengali Language Model Generation) ===\033[0m"
21
+ echo -e "\033[1;36mTime: $(date '+%H:%M:%S')\033[0m"
22
+ echo ""
23
+
24
+ # Data Collector Status
25
+ if [ -f "$STATUS_DIR/data_collector.status" ]; then
26
+ echo -e "\033[1;32mData Collector:\033[0m $(cat "$STATUS_DIR/data_collector.status")"
27
+ else
28
+ echo -e "\033[1;32mData Collector:\033[0m Not started or completed"
29
+ fi
30
+
31
+ # Model Trainer Status
32
+ if [ -f "$STATUS_DIR/model_trainer.status" ]; then
33
+ echo -e "\033[1;33mModel Trainer:\033[0m $(cat "$STATUS_DIR/model_trainer.status")"
34
+ else
35
+ echo -e "\033[1;33mModel Trainer:\033[0m Not started or completed"
36
+ fi
37
+
38
+ # Model Evaluator Status
39
+ if [ -f "$STATUS_DIR/model_evaluator.status" ]; then
40
+ echo -e "\033[1;31mModel Evaluator:\033[0m $(cat "$STATUS_DIR/model_evaluator.status")"
41
+ else
42
+ echo -e "\033[1;31mModel Evaluator:\033[0m Not started or completed"
43
+ fi
44
+
45
+ # Check if all agents are done
46
+ if [ ! -f "$STATUS_DIR/data_collector.status" ] && \
47
+ [ ! -f "$STATUS_DIR/model_trainer.status" ] && \
48
+ [ ! -f "$STATUS_DIR/model_evaluator.status" ]; then
49
+ echo ""
50
+ echo -e "\033[1;34mAll agents have completed their tasks.\033[0m"
51
+ break
52
+ fi
53
+ sleep 2
54
+ done
55
+ }
56
+
57
+ # Main process
58
+ echo "Starting Bengali language model generation..." | tee -a "$LOG_FILE"
59
+
60
+ # Launch agents in background
61
+ for agent in data_collector model_trainer model_evaluator; do
62
+ if [ -f "$PROJECT_DIR/$agent.sh" ]; then
63
+ echo "Starting $agent..." | tee -a "$LOG_FILE"
64
+ ( bash "$PROJECT_DIR/$agent.sh" >> "$LOG_FILE" 2>&1 || log_error "$agent failed to execute" ) &  # '&' required: without it agents run sequentially and the live status display is useless
65
+ else
66
+ log_error "$agent.sh not found in $PROJECT_DIR"
67
+ fi
68
+ done
69
+
70
+ # Display live status
71
+ display_status
72
+
73
+ echo "Process completed. Check logs in $LOG_FILE for details." | tee -a "$LOG_FILE"
ai-agent/system_prompt.mdx ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <Plan>
2
+ You are AI coding agents focused on building a Bengali code + NLP LLM.
3
+ Output commands inside <Actions> blocks, analyses inside <Task> blocks.
4
+ After command execution, output results inside <TaskResult> blocks.
5
+ </Plan>
6
+
7
+ <Actions>
8
+ echo "Starting initial training setup..."
9
+ # Dummy start command for training
10
+ echo "Training started."
11
+ </Actions>
12
+
13
+ <Task>
14
+ Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.
15
+ </Task>
ai-agent/task_context.md ADDED
@@ -0,0 +1 @@
 
 
1
+
ai-agent/train.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import time
4
+
5
+ def train(epochs):
6
+ for epoch in range(epochs):
7
+ print("Epoch %d/%d training..." % (epoch+1, epochs))
8
+ time.sleep(2)
9
+ print("Epoch %d complete, accuracy: %.2f" % (epoch+1, 0.8 + epoch*0.01))
10
+
11
+ if __name__ == "__main__":
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--epochs", type=int, default=1)
14
+ args = parser.parse_args()
15
+ train(args.epochs)
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
+ "attention_dropout": 0.0,
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 5120,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 27648,
8
+ "max_position_embeddings": 131072,
9
+ "max_window_layers": 64,
10
+ "model_type": "qwen2",
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 64,
13
+ "num_key_value_heads": 8,
14
+ "rms_norm_eps": 1e-05,
15
+ "rope_scaling": null,
16
+ "rope_theta": 1000000.0,
17
+ "sliding_window": null,
18
+ "tie_word_embeddings": false,
19
+ "torch_dtype": "bfloat16",
20
+ "transformers_version": "4.46.0",
21
+ "use_cache": false,
22
+ "use_sliding_window": false,
23
+ "vocab_size": 152064
24
+ }
data_collector.ps1 ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\data_collector.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing data collection...'
18
+ Write-Log 'Data collector started' 'INFO'
19
+
20
+ # Simulated data collection progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Connecting to data sources...'; Duration = 2 },
23
+ @{ Status = 'Fetching Bengali text corpus...'; Duration = 3 },
24
+ @{ Status = 'Processing raw data...'; Duration = 2 },
25
+ @{ Status = 'Cleaning and normalizing text...'; Duration = 2 },
26
+ @{ Status = 'Preparing training dataset...'; Duration = 1 }
27
+ )
28
+
29
+ foreach ($step in $progressSteps) {
30
+ Set-Content -Path $StatusFile -Value $step.Status
31
+ Write-Log $step.Status 'INFO'
32
+ Start-Sleep -Seconds $step.Duration
33
+ }
34
+
35
+ # Final status update
36
+ Set-Content -Path $StatusFile -Value 'Data collection completed successfully'
37
+ Write-Log 'Data collection completed' 'SUCCESS'
38
+ Start-Sleep -Seconds 1
39
+
40
+ } catch {
41
+ Write-Log "Error in data collection: $_" 'ERROR'
42
+ Set-Content -Path $StatusFile -Value 'Error: Data collection failed'
43
+ Start-Sleep -Seconds 1
44
+ } finally {
45
+ # Cleanup status file
46
+ if (Test-Path $StatusFile) {
47
+ Remove-Item -Path $StatusFile
48
+ }
49
+ }
data_collector.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ STATUS_FILE="$HOME/bd-model-generations/status/data_collector.status"
3
+ LOG_FILE="$HOME/bd-model-generations/logs/actions.log"
4
+
5
+ echo "Collecting data..." > "$STATUS_FILE"
6
+ # Simulate data collection (replace with actual logic)
7
+ sleep 5
8
+ echo "Data collection complete." > "$STATUS_FILE"
9
+ sleep 1
10
+ rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
model_evaluator.ps1 ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\model_evaluator.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing model evaluation...'
18
+ Write-Log 'Model evaluator started' 'INFO'
19
+
20
+ # Simulated evaluation progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Loading test dataset...'; Duration = 2 },
23
+ @{ Status = 'Computing accuracy metrics...'; Duration = 3 },
24
+ @{ Status = 'Analyzing model performance...'; Duration = 2 },
25
+ @{ Status = 'Generating confusion matrix...'; Duration = 2 },
26
+ @{ Status = 'Creating evaluation report...'; Duration = 1 }
27
+ )
28
+
29
+ foreach ($step in $progressSteps) {
30
+ Set-Content -Path $StatusFile -Value $step.Status
31
+ Write-Log $step.Status 'INFO'
32
+ Start-Sleep -Seconds $step.Duration
33
+ }
34
+
35
+ # Final status update
36
+ Set-Content -Path $StatusFile -Value 'Model evaluation completed successfully'
37
+ Write-Log 'Model evaluation completed' 'SUCCESS'
38
+ Start-Sleep -Seconds 1
39
+
40
+ } catch {
41
+ Write-Log "Error in model evaluation: $_" 'ERROR'
42
+ Set-Content -Path $StatusFile -Value 'Error: Model evaluation failed'
43
+ Start-Sleep -Seconds 1
44
+ } finally {
45
+ # Cleanup status file
46
+ if (Test-Path $StatusFile) {
47
+ Remove-Item -Path $StatusFile
48
+ }
49
+ }
model_evaluator.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ STATUS_FILE="$HOME/bd-model-generations/status/model_evaluator.status"
3
+ LOG_FILE="$HOME/bd-model-generations/logs/actions.log"
4
+
5
+ echo "Evaluating model..." > "$STATUS_FILE"
6
+ # Simulate model evaluation (replace with actual logic)
7
+ sleep 5
8
+ echo "Model evaluation complete." > "$STATUS_FILE"
9
+ sleep 1
10
+ rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
model_trainer.ps1 ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\model_trainer.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing model training...'
18
+ Write-Log 'Model trainer started' 'INFO'
19
+
20
+ # Simulated training progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Loading training dataset...'; Duration = 2 },
23
+ @{ Status = 'Initializing model architecture...'; Duration = 2 },
24
+ @{ Status = 'Training Epoch 1/5...'; Duration = 3 },
25
+ @{ Status = 'Training Epoch 2/5...'; Duration = 3 },
26
+ @{ Status = 'Training Epoch 3/5...'; Duration = 3 },
27
+ @{ Status = 'Training Epoch 4/5...'; Duration = 3 },
28
+ @{ Status = 'Training Epoch 5/5...'; Duration = 3 },
29
+ @{ Status = 'Saving model checkpoints...'; Duration = 1 }
30
+ )
31
+
32
+ foreach ($step in $progressSteps) {
33
+ Set-Content -Path $StatusFile -Value $step.Status
34
+ Write-Log $step.Status 'INFO'
35
+ Start-Sleep -Seconds $step.Duration
36
+ }
37
+
38
+ # Final status update
39
+ Set-Content -Path $StatusFile -Value 'Model training completed successfully'
40
+ Write-Log 'Model training completed' 'SUCCESS'
41
+ Start-Sleep -Seconds 1
42
+
43
+ } catch {
44
+ Write-Log "Error in model training: $_" 'ERROR'
45
+ Set-Content -Path $StatusFile -Value 'Error: Model training failed'
46
+ Start-Sleep -Seconds 1
47
+ } finally {
48
+ # Cleanup status file
49
+ if (Test-Path $StatusFile) {
50
+ Remove-Item -Path $StatusFile
51
+ }
52
+ }
model_trainer.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Simulated model-training worker: publishes progress through a status file
# and records failures in the shared action log.
STATUS_FILE="$HOME/bd-model-generations/status/model_trainer.status"
LOG_FILE="$HOME/bd-model-generations/logs/actions.log"

# Fix: create the status and log directories up front; the redirections
# below fail with "No such file or directory" on a fresh checkout otherwise.
mkdir -p "$(dirname "$STATUS_FILE")" "$(dirname "$LOG_FILE")"

echo "Training model..." > "$STATUS_FILE"
# Simulate model training (replace with actual logic)
sleep 5
echo "Model training complete." > "$STATUS_FILE"
sleep 1
rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ datasets>=2.12.0
5
+ sentencepiece>=0.1.99
6
+ accelerate>=0.20.0
7
+ wandb>=0.15.0
8
+
9
+ # Data collection and processing
10
+ requests>=2.31.0
11
+ beautifulsoup4>=4.12.0
12
+ tqdm>=4.65.0
13
+
14
+ # Evaluation metrics
15
+ rouge-score>=0.1.2
16
+ sacrebleu>=2.3.1
17
+ pandas>=2.0.0
18
+ numpy>=1.24.0
19
+
20
+ # Utilities
21
+ # NOTE: pathlib, logging and typing are part of the Python 3 standard
22
+ # library. The PyPI "pathlib" backport (and the logging/typing pins below)
23
+ # target Python 2 and can shadow the stdlib, so they are intentionally
+ # commented out:
+ # pathlib>=1.0.1
+ # logging>=0.5.1.2
+ # typing>=3.7.4.3
24
+
25
+ # Development tools
26
+ black>=23.3.0
27
+ isort>=5.12.0
28
+ pylint>=2.17.0
29
+ pytest>=7.3.1
scripts/data_collector.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from bs4 import BeautifulSoup
import time
import random
import json
from pathlib import Path
import logging
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliDataCollector:
    """Collects Bengali text from Wikipedia and Prothom Alo into data/raw.

    Each scraper writes its own raw JSON file; ``collect()`` runs all
    scrapers and then merges them into ``processed_data.json``.
    """

    def __init__(self):
        # Browser-like UA so the news site serves regular HTML.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1, timeout=30):
        """Make an HTTP GET with rate limiting, retries and a timeout.

        Args:
            url: Target URL.
            retries: Number of attempts before giving up.
            delay: Base delay in seconds, used both for pre-request rate
                limiting and for backoff between attempts.
            timeout: Per-request timeout in seconds (new parameter with a
                default, so existing callers are unaffected).

        Returns:
            The successful ``requests.Response``.

        Raises:
            requests.RequestException: if every attempt fails.
        """
        for attempt in range(retries):
            try:
                time.sleep(delay + random.random())  # Rate limiting with jitter
                # Fix: the original call had no timeout, so a single stalled
                # server could hang the collector indefinitely.
                response = requests.get(url, headers=self.headers, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                time.sleep(delay * (attempt + 1))  # Linear backoff between attempts

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia.

        Follows up to 50 article links from the Bengali main page, stores
        the results in ``wikipedia_data.json`` and returns the number of
        articles collected (0 on total failure).
        """
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")

        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get main content and featured articles
            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []

            if content_div:
                # Extract article links; ':' filters out namespaced pages
                # (File:, Template:, ...) which are not prose articles.
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:  # Limit to first 50 articles
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            # Best-effort: one bad article must not abort the crawl.
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            # Save Wikipedia data
            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)

            return len(articles)
        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {str(e)}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo.

        Visits a fixed set of category pages, follows up to 10 article
        links per category, stores results in ``prothomalo_data.json`` and
        returns the number of articles collected.
        """
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []

        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")

            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find article links
                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:  # Limit to 10 articles per category
                    article_url = urljoin(base_url, link['href'])
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            # NOTE(review): 'story-content' is assumed to be the
                            # site's article-body class — verify against live HTML.
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {str(e)}")

        # Save Prothom Alo data
        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        return len(articles)

    def collect(self):
        """Main method to collect data from all sources."""
        logger.info("Starting data collection")

        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")

        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")

        # Combine and process the collected data
        self.process_collected_data()

        logger.info("Data collection completed")

    def process_collected_data(self):
        """Merge the per-source raw JSON files into ``processed_data.json``.

        Raises:
            Exception: re-raised after logging if reading/merging fails
                (e.g. a scraper never produced its output file).
        """
        try:
            # Read collected data
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)

            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)

            # Combine and format data into a single flat schema
            processed_data = []

            # Process Wikipedia articles
            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })

            # Process news articles
            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })

            # Save processed data
            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_data)} articles")

        except Exception as e:
            logger.error(f"Failed to process collected data: {str(e)}")
            raise


if __name__ == "__main__":
    collector = BengaliDataCollector()
    collector.collect()
scripts/model_evaluator.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import logging
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from typing import List, Dict, Any
from tqdm import tqdm
import pandas as pd
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import wandb

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ModelEvaluator:
    """Evaluates the fine-tuned Bengali code model against fixed
    prompt/reference pairs using BLEU and ROUGE, logging to W&B."""

    def __init__(self):
        self.model_dir = Path('outputs/model/final')
        self.output_dir = Path('outputs/evaluation')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Test prompts for different scenarios; "expected" holds the
        # reference text used for BLEU/ROUGE scoring.
        self.test_prompts = [
            # Programming task prompts
            {
                "type": "code_generation",
                "prompt": "একটি পাইথন ফাংশন লিখুন যা একটি সংখ্যার ফ্যাক্টরিয়াল বের করে।",
                "expected": """def factorial(n):
    if n == 0 or n == 1:
        return 1
    return n * factorial(n - 1)"""
            },
            {
                "type": "code_explanation",
                "prompt": "নিচের কোডটি ব্যাখ্যা করুন:\ndef bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]",
                "expected": "এই কোডটি বাবল সর্ট অ্যালগরিদম বাস্তবায়ন করে। এটি একটি অ্যারেকে ক্রমানুসারে সাজায়।"
            },
            {
                "type": "error_fix",
                "prompt": "এই কোডে ভুল আছে, ঠিক করুন:\ndef calculate_sum(numbers)\n    total = 0\n    for num in numbers\n        total += num\n    return total",
                "expected": """def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total"""
            },
            # Algorithm explanation prompts
            {
                "type": "algorithm_explanation",
                "prompt": "বাইনারি সার্চ অ্যালগরিদম কীভাবে কাজ করে সেটি ব্যাখ্যা করুন।",
                "expected": "বাইনারি সার্চ একটি দক্ষ অ্যালগরিদম যা সর্টেড অ্যারেতে একটি এলিমেন্ট খোঁজে। এটি প্রতিবার অ্যারের মধ্যবর্তী এলিমেন্ট চেক করে এবং সার্চ স্পেস অর্ধেক করে কমিয়ে ফেলে।"
            }
        ]

        # Evaluation metrics
        self.bleu = BLEU()
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``self.model_dir``."""
        logger.info("Loading model and tokenizer")

        tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        # Fix: generation below passes pad_token_id, but causal-LM
        # tokenizers frequently ship without a pad token; fall back to EOS
        # so padding and generation do not break on a None pad id.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_dir,
            trust_remote_code=True,
            # bf16 only when a GPU is present; fp32 keeps CPU inference safe
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        if torch.cuda.is_available():
            model = model.to('cuda')

        return model, tokenizer

    def generate_response(self, model, tokenizer, prompt: str, max_length: int = 512) -> str:
        """Generate a model response for *prompt*.

        Returns the decoded continuation with the prompt text removed, or
        an empty string if generation fails.
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            # Sampling parameters tuned for code generation (moderate
            # temperature, nucleus sampling, mild repetition penalty).
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
            )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Causal LMs echo the prompt; strip it so only the answer remains.
            return response.replace(prompt, "").strip()

        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            return ""

    def calculate_metrics(self, generated: str, expected: str) -> Dict[str, float]:
        """Compute BLEU and ROUGE F-measures for one generated/reference pair.

        Returns all-zero metrics (rather than raising) if scoring fails, so
        one bad sample cannot abort the evaluation loop.
        """
        try:
            # BLEU: sacrebleu reports 0-100, normalize to 0-1
            bleu_score = self.bleu.corpus_score(
                [generated],
                [[expected]]
            ).score / 100.0

            # Calculate ROUGE scores
            rouge_scores = self.rouge_scorer.score(generated, expected)

            return {
                'bleu': bleu_score,
                'rouge1_f': rouge_scores['rouge1'].fmeasure,
                'rouge2_f': rouge_scores['rouge2'].fmeasure,
                'rougeL_f': rouge_scores['rougeL'].fmeasure
            }
        except Exception as e:
            logger.error(f"Error calculating metrics: {str(e)}")
            return {
                'bleu': 0.0,
                'rouge1_f': 0.0,
                'rouge2_f': 0.0,
                'rougeL_f': 0.0
            }

    def evaluate(self):
        """Run the full evaluation.

        Generates a response for every test prompt, scores it, writes
        per-sample results (JSON) and per-type averages (CSV) under
        ``self.output_dir``, and returns the averages as a dict.

        Raises:
            Exception: re-raised after logging if evaluation fails.
        """
        try:
            # Initialize wandb for tracking
            wandb.init(project="bengali-code-llm", name="model-evaluation")

            # Load model and tokenizer
            model, tokenizer = self.load_model_and_tokenizer()

            # Store evaluation results
            results = []

            # Evaluate on test prompts
            for prompt_data in tqdm(self.test_prompts, desc="Evaluating prompts"):
                prompt_type = prompt_data["type"]
                prompt = prompt_data["prompt"]
                expected = prompt_data["expected"]

                # Generate response
                generated = self.generate_response(model, tokenizer, prompt)

                # Calculate metrics
                metrics = self.calculate_metrics(generated, expected)

                # Store result
                result = {
                    "type": prompt_type,
                    "prompt": prompt,
                    "generated": generated,
                    "expected": expected,
                    **metrics
                }
                results.append(result)

                # Log to wandb
                wandb.log({
                    f"{prompt_type}_bleu": metrics['bleu'],
                    f"{prompt_type}_rouge1": metrics['rouge1_f'],
                    f"{prompt_type}_rouge2": metrics['rouge2_f'],
                    f"{prompt_type}_rougeL": metrics['rougeL_f']
                })

            # Calculate average metrics by prompt type
            df = pd.DataFrame(results)
            avg_metrics = df.groupby('type')[['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']].mean()

            # Save detailed results
            results_path = self.output_dir / 'evaluation_results.json'
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Save average metrics
            metrics_path = self.output_dir / 'average_metrics.csv'
            avg_metrics.to_csv(metrics_path)

            # Log final averages to wandb
            wandb.log({
                "avg_bleu": df['bleu'].mean(),
                "avg_rouge1": df['rouge1_f'].mean(),
                "avg_rouge2": df['rouge2_f'].mean(),
                "avg_rougeL": df['rougeL_f'].mean()
            })

            # Close wandb
            wandb.finish()

            logger.info(f"Evaluation completed. Results saved to {self.output_dir}")

            # Return average metrics
            return avg_metrics.to_dict()

        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on failure
            if wandb.run is not None:
                wandb.finish()


if __name__ == "__main__":
    evaluator = ModelEvaluator()
    evaluator.evaluate()
scripts/model_trainer.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import logging
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import wandb
import numpy as np
from datasets import load_dataset
from typing import Dict, List, Any

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliCodeDataset(Dataset):
    """Causal-LM dataset over the processed JSON corpus.

    Each item is tokenized to a fixed ``max_length`` with padding; labels
    mirror the input ids except that padding positions are masked out.
    """

    def __init__(self, data_path: Path, tokenizer, max_length: int = 2048):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load the processed data (list of {"text": ...} records)
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        logger.info(f"Loaded {len(self.data)} examples from {data_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        item = self.data[idx]
        text = item['text']

        # Tokenize the text to a fixed-length, padded tensor
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Labels start as a copy of input_ids (causal language modeling)
        labels = encodings.input_ids.clone()

        attention_mask = encodings.attention_mask

        # Fix: mask padding positions with -100 so Hugging Face's
        # cross-entropy ignores them. Without this, every max_length-padded
        # sequence trained the model to predict pad tokens for most of its
        # positions.
        labels[attention_mask == 0] = -100

        return {
            'input_ids': encodings.input_ids[0],
            'attention_mask': attention_mask[0],
            'labels': labels[0]
        }


class ModelTrainer:
    """Fine-tunes a TinyLlama base model on the Bengali code corpus with
    the custom tokenizer, tracking runs in Weights & Biases."""

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.tokenizer_dir = Path('outputs/tokenizer')
        self.output_dir = Path('outputs/model')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Training configuration
        self.model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
        self.max_length = 2048
        self.batch_size = 4
        self.gradient_accumulation_steps = 4
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        self.warmup_steps = 100
        self.save_steps = 1000
        self.eval_steps = 500

    def setup_wandb(self):
        """Initialize Weights & Biases tracking with the run hyperparameters."""
        wandb.init(
            project="bengali-code-llm",
            name="tinyllama-bengali-code",
            config={
                "model_name": self.model_name,
                "max_length": self.max_length,
                "batch_size": self.batch_size,
                "learning_rate": self.learning_rate,
                "num_epochs": self.num_train_epochs
            }
        )

    def prepare_model_and_tokenizer(self):
        """Load the custom tokenizer and base model; resize embeddings to
        match the tokenizer's vocabulary."""
        logger.info("Loading tokenizer and model")

        # Load the custom tokenizer trained by tokenizer_trainer.py
        tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_dir,
            model_max_length=self.max_length
        )

        # Load the base model
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        # Resize token embeddings to match our tokenizer's vocab
        model.resize_token_embeddings(len(tokenizer))

        return model, tokenizer

    def create_datasets(self, tokenizer):
        """Split the processed corpus 90/10, persist the splits, and return
        (train_dataset, val_dataset)."""
        logger.info("Creating datasets")

        # Load the processed data
        data_path = self.data_dir / 'processed_data.json'

        # Split data into train and validation
        with open(data_path, 'r', encoding='utf-8') as f:
            all_data = json.load(f)

        # Fixed seed keeps the split reproducible across runs
        np.random.seed(42)
        np.random.shuffle(all_data)

        split_idx = int(len(all_data) * 0.9)  # 90% train, 10% validation
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        # Save split data so the exact split can be audited later
        train_path = self.data_dir / 'train.json'
        val_path = self.data_dir / 'validation.json'

        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)

        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        # Create datasets
        train_dataset = BengaliCodeDataset(train_path, tokenizer, self.max_length)
        val_dataset = BengaliCodeDataset(val_path, tokenizer, self.max_length)

        return train_dataset, val_dataset

    def create_training_arguments(self):
        """Create TrainingArguments for the Trainer."""
        return TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            eval_steps=self.eval_steps,
            save_strategy="steps",
            save_steps=self.save_steps,
            learning_rate=self.learning_rate,
            warmup_steps=self.warmup_steps,
            weight_decay=0.01,
            logging_dir=str(self.output_dir / 'logs'),
            logging_steps=100,
            report_to="wandb",
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False
        )

    def train(self):
        """Run the full training pipeline and save the final model/tokenizer
        under ``outputs/model/final``.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            # Initialize wandb
            self.setup_wandb()

            # Prepare model and tokenizer
            model, tokenizer = self.prepare_model_and_tokenizer()

            # Create datasets
            train_dataset, val_dataset = self.create_datasets(tokenizer)

            # Create training arguments
            training_args = self.create_training_arguments()

            # Create data collator (mlm=False: causal language modeling)
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False
            )

            # Initialize trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer
            )

            # Train the model
            logger.info("Starting model training")
            trainer.train()

            # Save the final model and tokenizer together so the evaluator
            # can load both from one directory
            trainer.save_model(str(self.output_dir / 'final'))
            tokenizer.save_pretrained(str(self.output_dir / 'final'))

            # Close wandb
            wandb.finish()

            logger.info("Model training completed successfully")

        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on failure
            if wandb.run is not None:
                wandb.finish()


if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
scripts/tokenizer_trainer.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import sentencepiece as spm
import logging
from typing import List, Dict
import shutil

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class TokenizerTrainer:
    """Trains a SentencePiece tokenizer over the processed Bengali corpus
    and writes Hugging Face compatibility files next to the model."""

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.output_dir = Path('outputs/tokenizer')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Tokenizer configuration
        self.vocab_size = 32000
        self.character_coverage = 0.9999
        self.model_type = "unigram"
        # Symbols guaranteed to survive as single tokens
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",  # Common Bengali numbers
            "def", "class", "return", "if", "else", "for", "while",  # Code keywords
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""'  # Code comments
        ]

    def prepare_training_data(self) -> str:
        """Flatten the processed corpus to one sentence per line.

        Returns the path (as str) of the temporary training text file.

        Raises:
            FileNotFoundError: if the processed corpus is missing.
        """
        logger.info("Preparing training data for tokenizer")

        # Load processed data
        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        # Create temporary file for training
        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                text = item['text']
                # Write one sentence per line
                sentences = text.split('।')  # Split on Bengali full stop
                for sentence in sentences:
                    sentence = sentence.strip()
                    if sentence:  # Skip empty sentences
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer on *train_file*.

        Raises:
            Exception: re-raised after logging if training fails.
        """
        logger.info("Starting tokenizer training")

        # Prepare model prefix
        model_prefix = self.output_dir / "bengali_code"

        # Training parameters (values may be int/str; they are stringified
        # when the command line is assembled below)
        params = {
            "input": train_file,
            "model_prefix": str(model_prefix),
            "vocab_size": self.vocab_size,
            "character_coverage": self.character_coverage,
            "model_type": self.model_type,
            "pad_id": 0,
            "unk_id": 1,
            "bos_id": 2,
            "eos_id": 3,
            "user_defined_symbols": ",".join(self.special_tokens),
            "max_sentence_length": 4192,
            "input_sentence_size": 5000000,
            "shuffle_input_sentence": "true",
            "normalization_rule_name": "identity"  # Preserve original text
        }

        # Fix: the original appended keys and raw values into a list and
        # crashed in " ".join() on the int values (TypeError); SentencePiece
        # also expects "--key=value" pairs, not space-separated key/value
        # tokens, so the command would have been rejected anyway.
        arg_string = " ".join(f"--{key}={value}" for key, value in params.items())

        try:
            # Train the tokenizer
            spm.SentencePieceTrainer.train(arg_string)
            logger.info("Tokenizer training completed successfully")

            # Create config files for HuggingFace compatibility
            self.create_huggingface_files(model_prefix)

        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

    def create_huggingface_files(self, model_prefix: Path):
        """Write tokenizer_config.json and special_tokens_map.json so the
        trained model can be loaded via Hugging Face AutoTokenizer."""
        logger.info("Creating HuggingFace compatibility files")

        # Create tokenizer config
        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
            "model_type": self.model_type,
            "vocab_size": self.vocab_size
        }

        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        # Create special tokens map
        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>"
        }

        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

    def train(self):
        """Run the full pipeline: prepare data, train, clean up.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            # Prepare training data
            train_file = self.prepare_training_data()

            # Train tokenizer
            self.train_tokenizer(train_file)

            # Clean up temporary files
            if Path(train_file).exists():
                Path(train_file).unlink()

            logger.info("Tokenizer training pipeline completed successfully")

        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise


if __name__ == "__main__":
    trainer = TokenizerTrainer()
    trainer.train()
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "additional_special_tokens": [
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<|object_ref_start|>",
7
+ "<|object_ref_end|>",
8
+ "<|box_start|>",
9
+ "<|box_end|>",
10
+ "<|quad_start|>",
11
+ "<|quad_end|>",
12
+ "<|vision_start|>",
13
+ "<|vision_end|>",
14
+ "<|vision_pad|>",
15
+ "<|image_pad|>",
16
+ "<|video_pad|>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|im_end|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
start.sh ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # === SYSTEM PROMPT ===
4
+ # This script builds a Bengali language model using a multi-agent system with human-in-the-loop (HIL) capabilities.
5
+ # Advanced Features:
6
+ # - Real-Time Streaming: Displays a colorful, dynamic status dashboard in the terminal.
7
+ # - Robust Error Handling: Validates setup, API calls, and file operations with detailed logging.
8
+ # - Modern Interface: Uses ANSI colors, progress bars, and a boxed header for a polished look.
9
+ # - Loop and Iteration: Monitors execution, retries on failure, and ensures task completion.
10
+ # - Code Execution: Executes Python and Node.js code locally for preprocessing and evaluation.
11
+ # - Tools: Provides Python and Node.js REPLs for file operations and analysis.
12
+ # - Time Travel: Logs actions with timestamps for debugging and auditing.
13
+ # - Subgraph Support: Encapsulates tasks (data collection, preprocessing, training, evaluation) as reusable nodes.
14
+ # - Memory: Persists state across agent interactions using a key-value store.
15
+ # - API Integrations: Uses Together, Cohere, and Gemini APIs (Together as primary for text generation).
16
+ # - File Operations: Creates, edits, and validates files with error checking.
17
+ # - Output: Saves the model to /storage/BA73-022B/bd/bd-model-genaretions/model.pt.
18
+
19
# === CONFIGURATION ===
PROJECT_DIR="/storage/BA73-022B/bd/bd-model-genaretions" # Updated as per user request
LOG_FILE="$PROJECT_DIR/logs/actions.log"
MEMORY_FILE="$PROJECT_DIR/memory.txt"
REQUESTS_DIR="$PROJECT_DIR/requests"
RESPONSES_DIR="$PROJECT_DIR/responses"
DATA_DIR="$PROJECT_DIR/data"
STATUS_DIR="$PROJECT_DIR/status"

# API Keys — read from the environment; never commit secrets to source control.
# SECURITY: keys were previously hard-coded here and are therefore exposed in
# version-control history; they must be rotated with each provider.
TOGETHER_API_KEY="${TOGETHER_API_KEY:-}"
COHERE_API_KEY="${COHERE_API_KEY:-}"
GEMINI_API_KEY="${GEMINI_API_KEY:-}"
32
+
33
# === SETUP ===
# Create the project directory tree and the log/memory files, failing fast
# with a red error message if any of them cannot be created.
echo "Initializing project directories..."
for dir in "$PROJECT_DIR" "$DATA_DIR" "$REQUESTS_DIR" "$RESPONSES_DIR" "$PROJECT_DIR/logs" "$STATUS_DIR"; do
    if ! mkdir -p "$dir"; then
        echo -e "\033[1;31mError: Failed to create directory $dir\033[0m"
        exit 1
    fi
done

touch "$LOG_FILE" "$MEMORY_FILE"
if [ ! -f "$LOG_FILE" ] || [ ! -f "$MEMORY_FILE" ]; then
    echo -e "\033[1;31mError: Failed to create log or memory file\033[0m"
    exit 1
fi
echo "[$(date)] Starting Bengali language model generation" >> "$LOG_FILE"
49
+
50
+ # === UTILITY FUNCTIONS ===
51
+ # Memory Management
52
# Persist a key=value pair, replacing any existing entry for the same key.
function set_memory {
    local key="$1"
    local value="$2"
    # NOTE: grep -v exits non-zero when it emits no lines (e.g. the memory
    # file is empty or only contains this key), so chaining mv with '&&'
    # silently skipped the rewrite and left duplicate keys behind.
    grep -v "^$key=" "$MEMORY_FILE" > "$MEMORY_FILE.tmp" || true
    mv "$MEMORY_FILE.tmp" "$MEMORY_FILE"
    echo "$key=$value" >> "$MEMORY_FILE"
}
58
+
59
# Look up a key in the memory file; prints its value, or "false" when unset.
function get_memory {
    local key="$1"
    local value
    # -f2- keeps the full value even when it contains '=' (plain -f2
    # truncated such values); head -n1 guards against duplicate entries.
    value=$(grep "^$key=" "$MEMORY_FILE" | head -n 1 | cut -d'=' -f2-)
    echo "${value:-false}"
}
64
+
65
+ # Logging (Time Travel)
66
# Append a timestamped, agent-tagged audit entry to the shared log
# ("time travel" record for debugging).
function log_action {
    local who="$1"
    local what="$2"
    echo "[$(date)] [Agent $who] $what" >> "$LOG_FILE"
}
71
+
72
+ # Status Updates
73
# Publish an agent's current status line for the dashboard to display.
function set_status {
    local id="$1"
    local message="$2"
    echo "$message" > "$STATUS_DIR/agent$id.status"
}
78
+
79
+ # === TOOL CALLING FUNCTIONS ===
80
# Execute a Python snippet locally; stderr goes to the action log.
# Prints the snippet's stdout on success; returns python3's exit code on failure.
function run_python {
    local code="$1"
    log_action "Tool" "Running Python code: $code"
    local output exit_code
    # Declare and assign separately: with 'local output=$(cmd)', $? is the
    # exit status of the 'local' builtin (always 0), so interpreter failures
    # were never detected.
    output=$(python3 -c "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Python execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
91
+
92
# Execute a Node.js snippet locally; stderr goes to the action log.
# Prints the snippet's stdout on success; returns node's exit code on failure.
function run_node {
    local code="$1"
    log_action "Tool" "Running Node.js code: $code"
    local output exit_code
    # Declare and assign separately: 'local output=$(cmd)' makes $? the exit
    # status of 'local' (always 0), masking execution failures.
    output=$(node -e "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Node.js execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
103
+
104
+ # === API CALLING FUNCTIONS ===
105
# POST a completion request to the Together API; prints the raw JSON response.
function call_together_api {
    local prompt="$1"
    local payload
    # Build the payload with jq so quotes/newlines in the prompt are properly
    # JSON-escaped; naive string interpolation produced invalid JSON. jq is
    # already a hard dependency (response parsing in generate_text).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, model: "some_model", max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.together.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOGETHER_API_KEY" \
        -d "$payload"
}
112
+
113
# POST a generation request to the Cohere API; prints the raw JSON response.
function call_cohere_api {
    local prompt="$1"
    local payload
    # jq-built payload keeps the prompt properly JSON-escaped (see
    # call_together_api for rationale).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.cohere.ai/generate" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $COHERE_API_KEY" \
        -d "$payload"
}
120
+
121
# POST a completion request to the Gemini endpoint; prints the raw JSON response.
# NOTE(review): the URL "api.gemini.ai" does not match Google's published
# Gemini endpoint — verify against the provider's documentation.
function call_gemini_api {
    local prompt="$1"
    local payload
    # jq-built payload keeps the prompt properly JSON-escaped (see
    # call_together_api for rationale).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, model: "some_model"}')
    curl -s -m 10 -X POST "https://api.gemini.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $GEMINI_API_KEY" \
        -d "$payload"
}
128
+
129
# Ask the selected backend (together/cohere/gemini) for text, retrying up to
# three times. Prints the generated text and returns 0 on success; returns 1
# after all attempts fail.
function generate_text {
    local prompt="$1"
    local backend="$2"
    local max_tries=3
    local try raw out
    for ((try = 1; try <= max_tries; try++)); do
        case "$backend" in
            together)
                raw=$(call_together_api "$prompt")
                out=$(echo "$raw" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            cohere)
                raw=$(call_cohere_api "$prompt")
                out=$(echo "$raw" | jq -r '.generations[0].text' 2>/dev/null)
                ;;
            gemini)
                raw=$(call_gemini_api "$prompt")
                out=$(echo "$raw" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            *)
                out="Unknown API"
                ;;
        esac
        # Accept only a non-empty, non-null answer that doesn't mention Error.
        if [ -n "$out" ] && [ "$out" != "null" ] && [[ ! "$out" =~ "Error" ]]; then
            echo "$out"
            return 0
        fi
        log_action "API" "Attempt $try failed for $backend API, retrying..."
        sleep 2
    done
    log_action "API" "Failed to generate text with $backend after $max_tries attempts"
    return 1
}
162
+
163
+ # === HUMAN-IN-THE-LOOP REQUEST FUNCTION ===
164
# Write a question into the requests mailbox and block until the operator
# drops an answer into the responses mailbox; echoes the answer.
function request_human_input {
    local agent="$1"
    local question="$2"
    log_action "$agent" "Requesting human input: $question"
    set_status "$agent" "Waiting for human input"
    echo "$question" > "$REQUESTS_DIR/agent$agent.txt"
    local reply_file="$RESPONSES_DIR/agent$agent.txt"
    until [ -f "$reply_file" ]; do
        sleep 1
    done
    local reply
    reply=$(cat "$reply_file")
    rm "$reply_file"
    log_action "$agent" "Received human response: $reply"
    set_status "$agent" "Processing human input"
    echo "$reply"
}
179
+
180
+ # === SUBGRAPH FUNCTIONS ===
181
# Subgraph node: generate a Bengali text sample via the Together API and save
# it to the data directory. Sets memory key data_collected=true on success;
# returns 1 on API or write failure.
function collect_data {
    set_status 1 "Generating Bengali text via API"
    local prompt="Generate a sample of Bengali text for language model training."
    local text
    # Declare and assign separately: 'local text=$(cmd)' makes $? the exit
    # status of 'local' (always 0), so the API-error branch below was
    # unreachable in the original.
    text=$(generate_text "$prompt" "together")
    if [ $? -ne 0 ]; then
        set_status 1 "API error"
        log_action 1 "Failed to collect data due to API error"
        return 1
    fi
    set_status 1 "Saving data to file"
    if ! echo "$text" > "$DATA_DIR/bengali_text.txt"; then
        set_status 1 "Error: Failed to save data"
        log_action 1 "Failed to write to $DATA_DIR/bengali_text.txt"
        return 1
    fi
    log_action 1 "Data saved to $DATA_DIR/bengali_text.txt"
    set_memory "data_collected" "true"
    set_status 1 "Data collection completed"
}
202
+
203
# Subgraph node: wait for collected data, analyze it, and route the result
# through human review (approve/edit/reject). Sets data_preprocessed=true on
# success; returns 1 on failure or rejection.
function preprocess_data {
    set_status 2 "Waiting for data collection"
    while [ "$(get_memory 'data_collected')" != "true" ]; do
        sleep 1
    done
    set_status 2 "Analyzing data"
    if [ ! -f "$DATA_DIR/bengali_text.txt" ]; then
        set_status 2 "Error: Data file missing"
        log_action 2 "Error: No data file found at $DATA_DIR/bengali_text.txt"
        return 1
    fi
    local output
    # Assign separately from 'local' so $? reflects run_python's exit status,
    # not the 'local' builtin's (always 0) — the failure branch below was
    # otherwise unreachable.
    output=$(run_python "with open('$DATA_DIR/bengali_text.txt', 'r') as f: text = f.read(); print(f'Text length: {len(text)} characters')")
    if [ $? -ne 0 ]; then
        set_status 2 "Error: Analysis failed"
        log_action 2 "Preprocessing analysis failed"
        return 1
    fi
    log_action 2 "Analysis result: $output"
    set_status 2 "Awaiting human review"
    local response
    response=$(request_human_input 2 "Review the Bengali text in $DATA_DIR/bengali_text.txt (approve/reject/edit)")
    case "$response" in
        approve)
            set_status 2 "Saving preprocessed data"
            echo "Data preprocessed" > "$DATA_DIR/preprocessed_text.txt"
            log_action 2 "Preprocessing approved, saved to $DATA_DIR/preprocessed_text.txt"
            set_memory "data_preprocessed" "true"
            ;;
        edit)
            set_status 2 "Editing data"
            log_action 2 "Human requested edit; applying transformation"
            # The original one-liner placed a second 'with' statement after a
            # semicolon, which is a Python SyntaxError; use expression
            # statements instead so the snippet actually runs.
            run_python "text = open('$DATA_DIR/bengali_text.txt', 'r').read(); open('$DATA_DIR/preprocessed_text.txt', 'w').write(text.upper())"
            if [ $? -eq 0 ]; then
                set_memory "data_preprocessed" "true"
            else
                set_status 2 "Error: Edit failed"
                return 1
            fi
            ;;
        *)
            set_status 2 "Preprocessing rejected"
            log_action 2 "Preprocessing rejected by human"
            return 1
            ;;
    esac
}
248
+
249
# Subgraph node: once preprocessing is done, simulate a training run and write
# the model artifact. Sets model_trained=true on success; returns 1 otherwise.
function train_model {
    set_status 3 "Waiting for preprocessing"
    while [ "$(get_memory 'data_preprocessed')" != "true" ]; do
        sleep 1
    done
    set_status 3 "Training model"
    if [ ! -f "$DATA_DIR/preprocessed_text.txt" ]; then
        set_status 3 "Error: Preprocessed data missing"
        log_action 3 "Error: No preprocessed data found at $DATA_DIR/preprocessed_text.txt"
        return 1
    fi
    echo "Training Bengali model..."
    sleep 2 # Simulate training
    if ! echo "Model trained" > "$PROJECT_DIR/model.pt"; then
        set_status 3 "Error: Failed to save model"
        log_action 3 "Failed to save model to $PROJECT_DIR/model.pt"
        return 1
    fi
    log_action 3 "Model saved to $PROJECT_DIR/model.pt"
    set_memory "model_trained" "true"
    set_status 3 "Training completed"
}
272
+
273
# Subgraph node: evaluate the trained model and route the verdict through
# human review (approve/fix/reject). Sets evaluation_completed=true on
# success; returns 1 on failure or rejection.
function evaluate_model {
    set_status 4 "Waiting for model training"
    while [ "$(get_memory 'model_trained')" != "true" ]; do
        sleep 1
    done
    set_status 4 "Evaluating model"
    if [ ! -f "$PROJECT_DIR/model.pt" ]; then
        set_status 4 "Error: Model file missing"
        log_action 4 "Error: No model file found at $PROJECT_DIR/model.pt"
        return 1
    fi
    local output
    # Declare and assign separately: 'local output=$(cmd)' makes $? the exit
    # status of 'local' (always 0), so evaluation failures were never caught.
    output=$(run_python "print('Simulated accuracy: 85%')")
    if [ $? -ne 0 ]; then
        set_status 4 "Error: Evaluation failed"
        log_action 4 "Evaluation failed"
        return 1
    fi
    set_status 4 "Awaiting human review"
    local response
    response=$(request_human_input 4 "Review model performance: $output (approve/reject/fix)")
    case "$response" in
        approve)
            log_action 4 "Evaluation approved"
            set_memory "evaluation_completed" "true"
            ;;
        fix)
            set_status 4 "Fixing model"
            log_action 4 "Human requested fix; simulating correction"
            echo "Fixed model" > "$PROJECT_DIR/model.pt"
            set_memory "evaluation_completed" "true"
            ;;
        *)
            set_status 4 "Evaluation rejected"
            log_action 4 "Evaluation rejected by human"
            return 1
            ;;
    esac
    set_status 4 "Evaluation completed"
}
311
+
312
+ # === AGENT FUNCTIONS ===
313
# Agent 1: retry data collection until it succeeds, then mark completion.
function agent1 {
    set_status 1 "Starting data collection"
    while ! collect_data; do
        set_status 1 "Retrying data collection"
        log_action 1 "Retrying data collection after failure"
        sleep 2
    done
    set_status 1 "Data collection completed"
    set_memory "agent1_completed" "true"
}
323
+
324
# Agent 2: retry preprocessing until it succeeds, then mark completion.
function agent2 {
    set_status 2 "Starting preprocessing"
    while ! preprocess_data; do
        set_status 2 "Retrying preprocessing"
        log_action 2 "Retrying preprocessing after failure"
        sleep 2
    done
    set_status 2 "Preprocessing completed"
    set_memory "agent2_completed" "true"
}
334
+
335
# Agent 3: retry training until it succeeds, then mark completion.
function agent3 {
    set_status 3 "Starting training"
    while ! train_model; do
        set_status 3 "Retrying training"
        log_action 3 "Retrying training after failure"
        sleep 2
    done
    set_status 3 "Training completed"
    set_memory "agent3_completed" "true"
}
345
+
346
# Agent 4: retry evaluation until it succeeds, then mark completion.
function agent4 {
    set_status 4 "Starting evaluation"
    while ! evaluate_model; do
        set_status 4 "Retrying evaluation"
        log_action 4 "Retrying evaluation after failure"
        sleep 2
    done
    set_status 4 "Evaluation completed"
    set_memory "agent4_completed" "true"
}
356
+
357
+ # === STATUS DISPLAY ===
358
# Render the terminal dashboard: boxed header, one colored status line per
# agent, and an overall progress figure (25% per completed agent).
function display_status {
    echo -e "\033[1;34m┌─────────────────────── STATUS DASHBOARD ──────────────────────┐\033[0m"
    echo -e "\033[1;34m│ Bengali Language Model Generation - $(date +%H:%M:%S) │\033[0m"
    echo -e "\033[1;34m└───────────────────────────────────────────────────────────────┘\033[0m"
    local done_count=0
    local id state color
    for id in 1 2 3 4; do
        state="Not started"
        if [ -f "$STATUS_DIR/agent$id.status" ]; then
            state=$(cat "$STATUS_DIR/agent$id.status")
        fi
        if [ "$(get_memory "agent${id}_completed")" == "true" ]; then
            state="Completed"
            ((done_count++))
        fi
        case $id in
            1) color="\033[1;32m" ;; # Green
            2) color="\033[1;33m" ;; # Yellow
            3) color="\033[1;34m" ;; # Blue
            4) color="\033[1;35m" ;; # Magenta
        esac
        echo -e "${color}Agent $id: $state\033[0m"
    done
    echo -e "\033[1;36mProgress: [$done_count/4] $((done_count * 25))%\033[0m"
}
383
+
384
+ # === HUMAN-IN-THE-LOOP HANDLER ===
385
# Foreground loop: repaint the dashboard, relay HIL requests between the
# background agents and the human operator, and exit once all four agents
# report completion via the memory store.
function hil_handler {
    while true; do
        clear
        display_status
        if [ "$(get_memory 'agent1_completed')" == "true" ] && \
           [ "$(get_memory 'agent2_completed')" == "true" ] && \
           [ "$(get_memory 'agent3_completed')" == "true" ] && \
           [ "$(get_memory 'agent4_completed')" == "true" ]; then
            log_action "HIL" "All agents completed successfully"
            echo -e "\033[1;32m✓ All agents completed! Model generation successful.\033[0m"
            break
        fi
        local pending who question answer
        for pending in "$REQUESTS_DIR"/*; do
            # Skip the literal glob when the mailbox is empty.
            [ -f "$pending" ] || continue
            who=$(basename "$pending" .txt | sed 's/agent//')
            question=$(cat "$pending")
            echo -e "\n\033[1;33mAgent $who requests your input:\033[0m $question"
            echo -e "\033[1;33mEnter response (e.g., approve/reject/edit/fix):\033[0m"
            read -r answer
            if [ -z "$answer" ]; then
                echo -e "\033[1;31mError: Input cannot be empty. Try again.\033[0m"
                continue
            fi
            echo "$answer" > "$RESPONSES_DIR/agent$who.txt"
            rm "$pending"
        done
        sleep 1
    done
}
415
+
416
+ # === CLEANUP ON EXIT ===
417
# === CLEANUP ON EXIT ===
# On SIGINT/SIGTERM: clear any in-flight HIL mailboxes, record the
# interruption in the audit log, and exit with a failure status.
function cleanup {
    echo -e "\033[1;31mScript interrupted. Cleaning up...\033[0m"
    rm -f "$REQUESTS_DIR"/* "$RESPONSES_DIR"/* 2>/dev/null
    log_action "Main" "Script terminated by user"
    exit 1
}
trap cleanup INT TERM
424
+
425
# === MAIN EXECUTION ===
echo -e "\033[1;32mStarting Bengali language model generation...\033[0m"
log_action "Main" "Script execution started"

# Launch the four pipeline agents concurrently; they coordinate through the
# memory file while the foreground HIL handler drives the dashboard and
# relays human input.
agent1 &
agent2 &
agent3 &
agent4 &

hil_handler

echo -e "\033[1;32mProcess completed successfully!\033[0m"
echo "Model saved at: $PROJECT_DIR/model.pt"
echo "Detailed logs available at: $LOG_FILE"
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21bff87aabfb69a9aafc1c1c6d1b60bbf3138e2e2b7545924f62b3c5b3c3d587
3
+ size 16
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. 
The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
tools.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Project Management Tool",
3
+ "description": "A tool for managing project context, automating tasks, and facilitating human-in-the-loop interactions.",
4
+ "version": "1.0.0",
5
+ "author": "Your Name",
6
+ "license": "MIT",
7
+ "configurations": {
8
+ "projectDirectory": "/path/to/your/project",
9
+ "logFile": "/path/to/your/logs/actions.log",
10
+ "memoryFile": "/path/to/your/memory.txt",
11
+ "requestsDirectory": "/path/to/your/requests",
12
+ "responsesDirectory": "/path/to/your/responses",
13
+ "dataDirectory": "/path/to/your/data",
14
+ "statusDirectory": "/path/to/your/status"
15
+ },
16
+ "apiKeys": {
17
+ "togetherAPI": "your_together_api_key",
18
+ "cohereAPI": "your_cohere_api_key",
19
+ "geminiAPI": "your_gemini_api_key"
20
+ },
21
+ "features": {
22
+ "automaticContextGathering": {
23
+ "description": "Automatically reads related files, explores project structure, analyzes patterns, and maps dependencies.",
24
+ "enabled": true
25
+ },
26
+ "humanInTheLoop": {
27
+ "description": "Facilitates human input for reviewing and approving tasks.",
28
+ "enabled": true
29
+ },
30
+ "subgraphSupport": {
31
+ "description": "Encapsulates tasks as reusable nodes for better project management.",
32
+ "enabled": true
33
+ },
34
+ "memoryManagement": {
35
+ "description": "Persists state across agent interactions using a key-value store.",
36
+ "enabled": true
37
+ },
38
+ "apiIntegrations": {
39
+ "description": "Integrates with Together, Cohere, and Gemini APIs for text generation.",
40
+ "enabled": true
41
+ },
42
+ "fileOperations": {
43
+ "description": "Creates, edits, and validates files with error checking.",
44
+ "enabled": true
45
+ }
46
+ },
47
+ "agents": [
48
+ {
49
+ "id": 1,
50
+ "name": "Data Collection Agent",
51
+ "description": "Collects and saves data for the project.",
52
+ "tasks": ["collect_data"]
53
+ },
54
+ {
55
+ "id": 2,
56
+ "name": "Data Preprocessing Agent",
57
+ "description": "Preprocesses the collected data.",
58
+ "tasks": ["preprocess_data"]
59
+ },
60
+ {
61
+ "id": 3,
62
+ "name": "Model Training Agent",
63
+ "description": "Trains the model using the preprocessed data.",
64
+ "tasks": ["train_model"]
65
+ },
66
+ {
67
+ "id": 4,
68
+ "name": "Model Evaluation Agent",
69
+ "description": "Evaluates the trained model.",
70
+ "tasks": ["evaluate_model"]
71
+ }
72
+ ],
73
+ "tasks": {
74
+ "collect_data": {
75
+ "description": "Generates and saves Bengali text via API.",
76
+ "script": "collect_data.sh"
77
+ },
78
+ "preprocess_data": {
79
+ "description": "Analyzes and preprocesses the collected data.",
80
+ "script": "preprocess_data.sh"
81
+ },
82
+ "train_model": {
83
+ "description": "Trains the model using the preprocessed data.",
84
+ "script": "train_model.sh"
85
+ },
86
+ "evaluate_model": {
87
+ "description": "Evaluates the trained model.",
88
+ "script": "evaluate_model.sh"
89
+ }
90
+ },
91
+ "scripts": {
92
+ "collect_data.sh": "path/to/collect_data.sh",
93
+ "preprocess_data.sh": "path/to/preprocess_data.sh",
94
+ "train_model.sh": "path/to/train_model.sh",
95
+ "evaluate_model.sh": "path/to/evaluate_model.sh"
96
+ }
97
+ }
train.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import time
3
+
4
def train(epochs, delay=2.0):
    """Simulate a training run, printing per-epoch progress and accuracy.

    Args:
        epochs: Number of epochs to simulate.
        delay: Seconds to pause per epoch. Defaults to 2.0 to preserve the
            original behavior; pass 0 to run instantly (e.g. in tests).
    """
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs} training...")
        time.sleep(delay)
        # Simulated accuracy grows linearly by 0.01 per epoch from 0.80.
        print(f"Epoch {epoch + 1} complete, accuracy: {0.8 + epoch * 0.01:.2f}")
9
+
10
if __name__ == "__main__":
    # CLI entry point: run the simulated training for --epochs epochs.
    cli = argparse.ArgumentParser()
    cli.add_argument("--epochs", type=int, default=1)
    opts = cli.parse_args()
    train(opts.epochs)