likhonsheikhdev committed on
Commit
cce70aa
·
verified ·
1 Parent(s): ca40970

Upload 28 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,43 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
+ assets/ball.gif filter=lfs diff=lfs merge=lfs -text
38
+ assets/benchmark.png filter=lfs diff=lfs merge=lfs -text
39
+ assets/count.png filter=lfs diff=lfs merge=lfs -text
40
+ assets/diamond.png filter=lfs diff=lfs merge=lfs -text
41
+ assets/param-aime2024.jpeg filter=lfs diff=lfs merge=lfs -text
42
+ assets/param-lcb.jpeg filter=lfs diff=lfs merge=lfs -text
43
+ assets/writing.png filter=lfs diff=lfs merge=lfs -text
.github/workflows/train_model.yml ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: Train Bengali-Code LLM Model
2
+
3
+ on:
4
+ schedule:
5
+ - cron: '0 0 * * *' # Run daily at midnight
6
+ workflow_dispatch: # Allow manual triggers
7
+
8
+ jobs:
9
+ train:
10
+ runs-on: ubuntu-latest
11
+ steps:
12
+ - uses: actions/checkout@v4
13
+
14
+ - name: Set up Python
15
+ uses: actions/setup-python@v5
16
+ with:
17
+ python-version: '3.10'
18
+
19
+ - name: Install dependencies
20
+ run: |
21
+ python -m pip install --upgrade pip
22
+ pip install transformers datasets sentencepiece accelerate torch wandb
23
+
24
+ - name: Data Collection
25
+ run: python scripts/data_collector.py
26
+ env:
27
+ HUGGINGFACE_TOKEN: ${{ secrets.HUGGINGFACE_TOKEN }}
28
+
29
+ - name: Train Tokenizer
30
+ run: python scripts/tokenizer_trainer.py
31
+
32
+ - name: Train Model
33
+ run: python scripts/model_trainer.py
34
+ env:
35
+ WANDB_API_KEY: ${{ secrets.WANDB_API_KEY }}
36
+
37
+ - name: Evaluate Model
38
+ run: python scripts/model_evaluator.py
39
+
40
+ - name: Upload Model Artifacts
41
+ uses: actions/upload-artifact@v3
42
+ with:
43
+ name: model-weights
44
+ path: outputs/models/
.vscode/settings.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "editor.inlineSuggest.enabled": true,
3
+ "editor.quickSuggestions": {
4
+ "other": "inline",
5
+ "comments": true,
6
+ "strings": true
7
+ },
8
+ "editor.quickSuggestionsDelay": 100
9
+ }
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2025 Bengali-Code LLM Project Contributors
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Bengali-Code LLM Training Pipeline
2
+
3
+ A comprehensive pipeline for training a Bengali language model specialized in code understanding and generation. The model is fine-tuned on Bengali programming tutorials, documentation, and code examples.
4
+
5
+ ## 🌟 Features
6
+
7
+ - Automated data collection from Bengali Wikipedia and Prothom Alo
8
+ - Custom tokenizer training with SentencePiece for Bengali text and code
9
+ - Model fine-tuning using TinyLlama base model
10
+ - Comprehensive evaluation suite for Bengali code generation
11
+ - GitHub Actions workflow for automated training
12
+ - Weights & Biases integration for experiment tracking
13
+
14
+ ## 📋 Requirements
15
+
16
+ - Python 3.10 or higher
17
+ - CUDA-capable GPU (recommended)
18
+ - 16GB+ RAM
19
+ - Internet connection for data collection
20
+
21
+ ## 🚀 Quick Start
22
+
23
+ 1. Clone the repository:
24
+ ```bash
25
+ git clone https://github.com/yourusername/bengali-code-llm.git
26
+ cd bengali-code-llm
27
+ ```
28
+
29
+ 2. Install dependencies:
30
+ ```bash
31
+ pip install -r requirements.txt
32
+ ```
33
+
34
+ 3. Set up environment variables:
35
+ ```bash
36
+ export HUGGINGFACE_TOKEN="your_token_here"
37
+ export WANDB_API_KEY="your_wandb_key_here"
38
+ ```
39
+
40
+ 4. Run the complete pipeline:
41
+ ```bash
42
+ # Collect data
43
+ python scripts/data_collector.py
44
+
45
+ # Train tokenizer
46
+ python scripts/tokenizer_trainer.py
47
+
48
+ # Train model
49
+ python scripts/model_trainer.py
50
+
51
+ # Evaluate model
52
+ python scripts/model_evaluator.py
53
+ ```
54
+
55
+ ## 🏗️ Pipeline Components
56
+
57
+ ### Data Collection (`scripts/data_collector.py`)
58
+ - Scrapes Bengali text from Wikipedia and Prothom Alo
59
+ - Implements rate limiting and error handling
60
+ - Outputs processed data in JSON format
61
+
62
+ ### Tokenizer Training (`scripts/tokenizer_trainer.py`)
63
+ - Uses SentencePiece for tokenizer training
64
+ - Custom vocabulary with Bengali and code tokens
65
+ - Generates HuggingFace-compatible tokenizer files
66
+
67
+ ### Model Training (`scripts/model_trainer.py`)
68
+ - Fine-tunes TinyLlama model
69
+ - Implements efficient training with gradient accumulation
70
+ - Supports mixed precision training
71
+ - Integrates with Weights & Biases for tracking
72
+
73
+ ### Model Evaluation (`scripts/model_evaluator.py`)
74
+ - Comprehensive evaluation suite
75
+ - Tests code generation capabilities
76
+ - Measures BLEU and ROUGE scores
77
+ - Generates detailed evaluation reports
78
+
79
+ ## 📊 Training Metrics
80
+
81
+ The training progress can be monitored through Weights & Biases:
82
+ - Loss curves
83
+ - Evaluation metrics
84
+ - Generated samples
85
+ - Resource utilization
86
+
87
+ ## 🔄 GitHub Actions Workflow
88
+
89
+ The repository includes an automated training pipeline that:
90
+ - Runs daily to incorporate new data
91
+ - Executes the complete training pipeline
92
+ - Uploads model artifacts
93
+ - Can be triggered manually
94
+
95
+ ## 📁 Directory Structure
96
+
97
+ ```
98
+ bengali-code-llm/
99
+ ├── .github/
100
+ │ └── workflows/
101
+ │ └── train_model.yml
102
+ ├── scripts/
103
+ │ ├── data_collector.py
104
+ │ ├── tokenizer_trainer.py
105
+ │ ├── model_trainer.py
106
+ │ └── model_evaluator.py
107
+ ├── data/
108
+ │ └── raw/
109
+ ├── outputs/
110
+ │ ├── tokenizer/
111
+ │ ├── model/
112
+ │ └── evaluation/
113
+ ├── requirements.txt
114
+ └── README.md
115
+ ```
116
+
117
+ ## 🎯 Model Performance
118
+
119
+ The model is evaluated on various tasks:
120
+ - Code generation in Bengali
121
+ - Code explanation and documentation
122
+ - Error detection and correction
123
+ - Algorithm explanation
124
+
125
+ ## 📜 License
126
+
127
+ This project is licensed under the MIT License - see the LICENSE file for details.
128
+
129
+ ## 🤝 Contributing
130
+
131
+ Contributions are welcome! Please feel free to submit issues and pull requests.
132
+
133
+ ## 📧 Contact
134
+
135
+ For questions and feedback, please open an issue in the repository.
136
+
137
+ ## 🙏 Acknowledgments
138
+
139
+ - TinyLlama team for the base model
140
+ - HuggingFace for the Transformers library
141
+ - Weights & Biases for experiment tracking
agent.ps1 ADDED
@@ -0,0 +1,292 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $API_KEY = $env:GROQ_API_KEY  # SECURITY: never commit API keys; the previously hard-coded key is leaked and must be revoked
3
+ $MODEL = "meta-llama/llama-4-scout-17b-16e-instruct"
4
+ $AGENT_COUNT = 2
5
+
6
+ $WORKDIR = Join-Path $PSScriptRoot "ai-agent"
7
+ $LOGDIR = Join-Path $WORKDIR "outputs\logs"
8
+ $PROMPT_FILE = Join-Path $WORKDIR "system_prompt.mdx"
9
+ $TASK_FILE = Join-Path $WORKDIR "task_context.md"
10
+
11
+ # Create directories
12
+ New-Item -ItemType Directory -Force -Path $LOGDIR | Out-Null
13
+ New-Item -ItemType Directory -Force -Path $WORKDIR | Out-Null
14
+
15
+ # Initialize prompt file if missing
16
+ if (-not (Test-Path $PROMPT_FILE)) {
17
+ $initialPrompt = '<Plan>' + [Environment]::NewLine
18
+ $initialPrompt += 'You are AI coding agents focused on building a Bengali code + NLP LLM.' + [Environment]::NewLine
19
+ $initialPrompt += 'Output commands inside <Actions> blocks, analyses inside <Task> blocks.' + [Environment]::NewLine
20
+ $initialPrompt += 'After command execution, output results inside <TaskResult> blocks.' + [Environment]::NewLine
21
+ $initialPrompt += '</Plan>' + [Environment]::NewLine + [Environment]::NewLine
22
+ $initialPrompt += '<Actions>' + [Environment]::NewLine
23
+ $initialPrompt += 'echo "Starting initial training setup..."' + [Environment]::NewLine
24
+ $initialPrompt += '# Dummy start command for training' + [Environment]::NewLine
25
+ $initialPrompt += 'echo "Training started."' + [Environment]::NewLine
26
+ $initialPrompt += '</Actions>' + [Environment]::NewLine + [Environment]::NewLine
27
+ $initialPrompt += '<Task>' + [Environment]::NewLine
28
+ $initialPrompt += 'Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.' + [Environment]::NewLine
29
+ $initialPrompt += '</Task>'
30
+
31
+ Set-Content -Path $PROMPT_FILE -Value $initialPrompt
32
+ }
33
+
34
+ # Initialize task file if missing
35
+ if (-not (Test-Path $TASK_FILE)) {
36
+ "" | Set-Content $TASK_FILE
37
+ }
38
+
39
+ # Copy training script if missing
40
+ $TRAIN_SCRIPT = Join-Path $WORKDIR "train.py"
41
+ if (-not (Test-Path $TRAIN_SCRIPT)) {
42
+ Copy-Item -Path (Join-Path $PSScriptRoot "train.py") -Destination $TRAIN_SCRIPT
43
+ }
44
+
45
+ # Function to call Groq API with streaming
46
+ function Invoke-GroqAPI {
47
+ param (
48
+ [string]$Prompt,
49
+ [string]$AgentId
50
+ )
51
+
52
+ $headers = @{
53
+ "Authorization" = "Bearer " + $API_KEY
54
+ "Content-Type" = "application/json"
55
+ }
56
+
57
+ $body = @{
58
+ model = $MODEL
59
+ messages = @(
60
+ @{
61
+ role = "system"
62
+ content = $Prompt
63
+ }
64
+ )
65
+ temperature = 1
66
+ max_completion_tokens = 1024
67
+ top_p = 1
68
+ stream = $false  # Invoke-RestMethod cannot consume SSE streams; request one complete response
69
+ } | ConvertTo-Json
70
+
71
+ try {
72
+ $apiUrl = "https://api.groq.com/openai/v1/chat/completions"
73
+ $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"
74
+
75
+ # Non-streaming responses carry the text in message.content (delta exists only in SSE chunks)
76
+ $fullResponse = ""
77
+ foreach ($chunk in $response.choices[0].message.content) {
78
+ if ($null -ne $chunk) {
79
+ $fullResponse += $chunk
80
+ Write-Host ("🤖 Agent " + $AgentId + ": " + $chunk) -NoNewline
81
+ }
82
+ }
83
+ Write-Host ""
84
+ return $fullResponse
85
+ }
86
+ catch {
87
+ Write-Host "❌ Error calling Groq API: $_" -ForegroundColor Red
88
+ return $null
89
+ }
90
+ }
91
+
92
+ # Function to extract and run actions
93
+ function Invoke-Actions {
94
+ param (
95
+ [string]$Response,
96
+ [string]$AgentId
97
+ )
98
+
99
+ if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
100
+ $actions = $matches[1].Trim()
101
+ if ($actions) {
102
+ Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
103
+ $actionScriptName = "run_actions_" + $AgentId + ".ps1"
104
+ $actionScript = Join-Path $WORKDIR $actionScriptName
105
+ $actions | Set-Content $actionScript
106
+
107
+ $logFileName = "actions_agent_" + $AgentId + ".log"
108
+ $logFile = Join-Path $LOGDIR $logFileName
109
+ & $actionScript *>&1 | Tee-Object -Path $logFile
110
+ }
111
+ }
112
+ else {
113
+ Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
114
+ $logFileName = "actions_agent_" + $AgentId + ".log"
115
+ "" | Set-Content (Join-Path $LOGDIR $logFileName)
116
+ }
117
+ }
118
+
119
+ # Function to append task result
120
+ function Add-TaskResult {
121
+ param (
122
+ [string]$AgentId
123
+ )
124
+
125
+ $logFileName = "actions_agent_" + $AgentId + ".log"
126
+ $logFile = Join-Path $LOGDIR $logFileName
127
+ if (Test-Path $logFile) {
128
+ $result = Get-Content $logFile -Tail 50 | Out-String
129
+ $taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
130
+ $taskResult += $result
131
+ $taskResult += '</TaskResult>'
132
+
133
+ Add-Content -Path $TASK_FILE -Value $taskResult
134
+ Write-Host ("✍️ Agent " + $AgentId + " appended <TaskResult>.")
135
+ }
136
+ }
137
+
138
+ # Main loop with multi-agent coordination
139
+ Write-Host "🚀 Starting multi-agent AI loop with $AGENT_COUNT agents..."
140
+
141
+ $stopLoop = $false
142
+ while (-not $stopLoop) {
143
+ $promptCombined = Get-Content $PROMPT_FILE, $TASK_FILE | Out-String
144
+
145
+ # Create array to hold jobs
146
+ $jobs = @()
147
+
148
+ # Start agents in parallel
149
+ 1..$AGENT_COUNT | ForEach-Object {
150
+ $agentId = $_
151
+ $workdir = $WORKDIR
152
+ $logdir = $LOGDIR
153
+ $apiKey = $API_KEY
154
+ $model = $MODEL
155
+
156
+ $jobs += Start-Job -ScriptBlock {
157
+ param($promptCombined, $agentId, $workdir, $logdir, $apiKey, $model)
158
+
159
+ # Recreate functions in job scope
160
+ function Invoke-GroqAPI {
161
+ param($Prompt, $AgentId)
162
+ $headers = @{
163
+ "Authorization" = "Bearer " + $apiKey
164
+ "Content-Type" = "application/json"
165
+ }
166
+
167
+ $body = @{
168
+ model = $model
169
+ messages = @(
170
+ @{
171
+ role = "system"
172
+ content = $Prompt
173
+ }
174
+ )
175
+ temperature = 1
176
+ max_completion_tokens = 1024
177
+ top_p = 1
178
+ stream = $false  # Invoke-RestMethod cannot consume SSE streams; request one complete response
179
+ } | ConvertTo-Json
180
+
181
+ try {
182
+ # SECURITY: removed hosts-file rewrite that pinned api.groq.com to hard-coded IP 104.198.40.119
183
+ # via System32\drivers\etc\hosts — bypassing DNS invites MITM and requires admin rights.
184
+
185
+
186
+
187
+
188
+
189
+
190
+
191
+
192
+ # Configure TLS
193
+ [Net.ServicePointManager]::SecurityProtocol = [Net.SecurityProtocolType]::Tls12
194
+ # SECURITY: removed ServerCertificateValidationCallback = {$true}; never disable certificate validation
195
+
196
+ # Make request
197
+ $headers = @{
198
+ "Authorization" = "Bearer $apiKey"
199
+ "Content-Type" = "application/json"
200
+ }
201
+
202
+
203
+ $apiUrl = "https://api.groq.com/openai/v1/chat/completions"  # call the real endpoint directly
204
+ $response = Invoke-RestMethod -Uri $apiUrl -Method Post -Headers $headers -Body $body -ContentType "application/json"
205
+
206
+ $fullResponse = ""
207
+ foreach ($chunk in $response.choices[0].message.content) {  # non-streaming: message.content, not delta
208
+ if ($null -ne $chunk) {
209
+ $fullResponse += $chunk
210
+ Write-Host ("🤖 Agent " + $AgentId + ": " + $chunk) -NoNewline
211
+ }
212
+ }
213
+ Write-Host ""
214
+ return $fullResponse
215
+ }
216
+ catch {
217
+ Write-Host ("❌ Error calling Groq API: " + $_.Exception.Message) -ForegroundColor Red
218
+ # Stop the loop on API errors
219
+ return "<Done>"
220
+ }
221
+ }
222
+
223
+ function Invoke-Actions {
224
+ param($Response, $AgentId)
225
+ if ($Response -match '(?s)<Actions>(.*?)</Actions>') {
226
+ $actions = $matches[1].Trim()
227
+ if ($actions) {
228
+ Write-Host ("⚡ Agent " + $AgentId + " executing <Actions>...")
229
+ $actionScriptName = "run_actions_" + $AgentId + ".ps1"
230
+ $actionScript = Join-Path $workdir $actionScriptName
231
+ $actions | Set-Content $actionScript
232
+
233
+ $logFileName = "actions_agent_" + $AgentId + ".log"
234
+ $logFile = Join-Path $logdir $logFileName
235
+ & $actionScript *>&1 | Tee-Object -Path $logFile
236
+ }
237
+ }
238
+ else {
239
+ Write-Host ("ℹ️ Agent " + $AgentId + " found no <Actions>.")
240
+ $logFileName = "actions_agent_" + $AgentId + ".log"
241
+ "" | Set-Content (Join-Path $logdir $logFileName)
242
+ }
243
+ }
244
+
245
+ function Add-TaskResult {
246
+ param($AgentId)
247
+ $logFile = Join-Path $logdir ('actions_agent_' + $AgentId + '.log')
248
+ if (Test-Path $logFile) {
249
+ $result = Get-Content $logFile -Tail 50 | Out-String
250
+ $taskResult = [Environment]::NewLine + '<TaskResult>' + [Environment]::NewLine
251
+ $taskResult += $result
252
+ $taskResult += '</TaskResult>'
253
+
254
+ Add-Content -Path (Join-Path $workdir 'task_context.md') -Value $taskResult
255
+ Write-Host ('✍️ Agent ' + $AgentId + ' appended <TaskResult>.')
256
+ }
257
+ }
258
+
259
+ Write-Host ("🤖 Agent " + $agentId + " sending prompt to Groq API...")
260
+ $response = Invoke-GroqAPI -Prompt $promptCombined -AgentId $agentId
261
+
262
+ if ($response) {
263
+ $responseFileName = "agent_" + $agentId + "_response.txt"
264
+ $response | Set-Content (Join-Path $logdir $responseFileName)
265
+
266
+ Invoke-Actions -Response $response -AgentId $agentId
267
+ Add-TaskResult -AgentId $agentId
268
+
269
+ # Check for completion
270
+ if ($response -match '<Done>') {
271
+ Write-Host ("✅ Agent " + $agentId + " indicated completion.")
272
+ return $true
273
+ }
274
+ }
275
+ return $false
276
+ } -ArgumentList $promptCombined, $agentId, $workdir, $logdir, $apiKey, $model
277
+ }
278
+
279
+ # Wait for all jobs and get results
280
+ $results = $jobs | Wait-Job | Receive-Job
281
+ $jobs | Remove-Job
282
+
283
+ # Check if any agent indicated completion
284
+ if ($results -contains $true) {
285
+ Write-Host "🚀 Stopping AI loop as <Done> was detected."
286
+ $stopLoop = $true
287
+ }
288
+
289
+ Start-Sleep -Seconds 2
290
+ }
291
+
292
+ Write-Host "🎉 All agents completed."
agent.sh ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # Configuration
4
+ PROJECT_DIR="$HOME/bd-model-generations"
5
+ STATUS_DIR="$PROJECT_DIR/status"
6
+ LOG_FILE="$PROJECT_DIR/logs/actions.log"
7
+
8
+ # Ensure directories exist
9
+ mkdir -p "$STATUS_DIR" "$PROJECT_DIR/logs"
10
+
11
+ # Log function for errors
12
+ log_error() {
13
+ echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: $1" >> "$LOG_FILE"
14
+ }
15
+
16
+ # Live status display function
17
+ display_status() {
18
+ while true; do
19
+ clear
20
+ echo -e "\033[1;34m=== Live Agent Status (Bengali Language Model Generation) ===\033[0m"
21
+ echo -e "\033[1;36mTime: $(date '+%H:%M:%S')\033[0m"
22
+ echo ""
23
+
24
+ # Data Collector Status
25
+ if [ -f "$STATUS_DIR/data_collector.status" ]; then
26
+ echo -e "\033[1;32mData Collector:\033[0m $(cat "$STATUS_DIR/data_collector.status")"
27
+ else
28
+ echo -e "\033[1;32mData Collector:\033[0m Not started or completed"
29
+ fi
30
+
31
+ # Model Trainer Status
32
+ if [ -f "$STATUS_DIR/model_trainer.status" ]; then
33
+ echo -e "\033[1;33mModel Trainer:\033[0m $(cat "$STATUS_DIR/model_trainer.status")"
34
+ else
35
+ echo -e "\033[1;33mModel Trainer:\033[0m Not started or completed"
36
+ fi
37
+
38
+ # Model Evaluator Status
39
+ if [ -f "$STATUS_DIR/model_evaluator.status" ]; then
40
+ echo -e "\033[1;31mModel Evaluator:\033[0m $(cat "$STATUS_DIR/model_evaluator.status")"
41
+ else
42
+ echo -e "\033[1;31mModel Evaluator:\033[0m Not started or completed"
43
+ fi
44
+
45
+ # Check if all agents are done
46
+ if [ ! -f "$STATUS_DIR/data_collector.status" ] && \
47
+ [ ! -f "$STATUS_DIR/model_trainer.status" ] && \
48
+ [ ! -f "$STATUS_DIR/model_evaluator.status" ]; then
49
+ echo ""
50
+ echo -e "\033[1;34mAll agents have completed their tasks.\033[0m"
51
+ break
52
+ fi
53
+ sleep 2
54
+ done
55
+ }
56
+
57
+ # Main process
58
+ echo "Starting Bengali language model generation..." | tee -a "$LOG_FILE"
59
+
60
+ # Launch agents in background
61
+ for agent in data_collector model_trainer model_evaluator; do
62
+ if [ -f "$PROJECT_DIR/$agent.sh" ]; then
63
+ echo "Starting $agent..." | tee -a "$LOG_FILE"
64
+ ( bash "$PROJECT_DIR/$agent.sh" >> "$LOG_FILE" 2>&1 || log_error "$agent failed to execute" ) &  # '&' required: without it agents run sequentially and the live status display is useless
65
+ else
66
+ log_error "$agent.sh not found in $PROJECT_DIR"
67
+ fi
68
+ done
69
+
70
+ # Display live status
71
+ display_status
72
+
73
+ echo "Process completed. Check logs in $LOG_FILE for details." | tee -a "$LOG_FILE"
ai-agent/system_prompt.mdx ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <Plan>
2
+ You are AI coding agents focused on building a Bengali code + NLP LLM.
3
+ Output commands inside <Actions> blocks, analyses inside <Task> blocks.
4
+ After command execution, output results inside <TaskResult> blocks.
5
+ </Plan>
6
+
7
+ <Actions>
8
+ echo "Starting initial training setup..."
9
+ # Dummy start command for training
10
+ echo "Training started."
11
+ </Actions>
12
+
13
+ <Task>
14
+ Review output and plan next steps to create a Bengali LLM focused on code + Bangla NLP.
15
+ </Task>
ai-agent/task_context.md ADDED
@@ -0,0 +1 @@
 
 
1
+
ai-agent/train.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import argparse
3
+ import time
4
+
5
+ def train(epochs):
6
+ for epoch in range(epochs):
7
+ print("Epoch %d/%d training..." % (epoch+1, epochs))
8
+ time.sleep(2)
9
+ print("Epoch %d complete, accuracy: %.2f" % (epoch+1, 0.8 + epoch*0.01))
10
+
11
+ if __name__ == "__main__":
12
+ parser = argparse.ArgumentParser()
13
+ parser.add_argument("--epochs", type=int, default=1)
14
+ args = parser.parse_args()
15
+ train(args.epochs)
config.json ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
+ "attention_dropout": 0.0,
2
+ "bos_token_id": 151643,
3
+ "eos_token_id": 151643,
4
+ "hidden_act": "silu",
5
+ "hidden_size": 5120,
6
+ "initializer_range": 0.02,
7
+ "intermediate_size": 27648,
8
+ "max_position_embeddings": 131072,
9
+ "max_window_layers": 64,
10
+ "model_type": "qwen2",
11
+ "num_attention_heads": 40,
12
+ "num_hidden_layers": 64,
13
+ "num_key_value_heads": 8,
14
+ "rms_norm_eps": 1e-05,
15
+ "rope_scaling": null,
16
+ "rope_theta": 1000000.0,
17
+ "sliding_window": null,
18
+ "tie_word_embeddings": false,
19
+ "torch_dtype": "bfloat16",
20
+ "transformers_version": "4.46.0",
21
+ "use_cache": false,
22
+ "use_sliding_window": false,
23
+ "vocab_size": 152064
24
+ }
data_collector.ps1 ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\data_collector.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing data collection...'
18
+ Write-Log 'Data collector started' 'INFO'
19
+
20
+ # Simulated data collection progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Connecting to data sources...'; Duration = 2 },
23
+ @{ Status = 'Fetching Bengali text corpus...'; Duration = 3 },
24
+ @{ Status = 'Processing raw data...'; Duration = 2 },
25
+ @{ Status = 'Cleaning and normalizing text...'; Duration = 2 },
26
+ @{ Status = 'Preparing training dataset...'; Duration = 1 }
27
+ )
28
+
29
+ foreach ($step in $progressSteps) {
30
+ Set-Content -Path $StatusFile -Value $step.Status
31
+ Write-Log $step.Status 'INFO'
32
+ Start-Sleep -Seconds $step.Duration
33
+ }
34
+
35
+ # Final status update
36
+ Set-Content -Path $StatusFile -Value 'Data collection completed successfully'
37
+ Write-Log 'Data collection completed' 'SUCCESS'
38
+ Start-Sleep -Seconds 1
39
+
40
+ } catch {
41
+ Write-Log "Error in data collection: $_" 'ERROR'
42
+ Set-Content -Path $StatusFile -Value 'Error: Data collection failed'
43
+ Start-Sleep -Seconds 1
44
+ } finally {
45
+ # Cleanup status file
46
+ if (Test-Path $StatusFile) {
47
+ Remove-Item -Path $StatusFile
48
+ }
49
+ }
data_collector.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ STATUS_FILE="$HOME/bd-model-generations/status/data_collector.status"
3
+ LOG_FILE="$HOME/bd-model-generations/logs/actions.log"
4
+
5
+ echo "Collecting data..." > "$STATUS_FILE"
6
+ # Simulate data collection (replace with actual logic)
7
+ sleep 5
8
+ echo "Data collection complete." > "$STATUS_FILE"
9
+ sleep 1
10
+ rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
model_evaluator.ps1 ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\model_evaluator.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing model evaluation...'
18
+ Write-Log 'Model evaluator started' 'INFO'
19
+
20
+ # Simulated evaluation progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Loading test dataset...'; Duration = 2 },
23
+ @{ Status = 'Computing accuracy metrics...'; Duration = 3 },
24
+ @{ Status = 'Analyzing model performance...'; Duration = 2 },
25
+ @{ Status = 'Generating confusion matrix...'; Duration = 2 },
26
+ @{ Status = 'Creating evaluation report...'; Duration = 1 }
27
+ )
28
+
29
+ foreach ($step in $progressSteps) {
30
+ Set-Content -Path $StatusFile -Value $step.Status
31
+ Write-Log $step.Status 'INFO'
32
+ Start-Sleep -Seconds $step.Duration
33
+ }
34
+
35
+ # Final status update
36
+ Set-Content -Path $StatusFile -Value 'Model evaluation completed successfully'
37
+ Write-Log 'Model evaluation completed' 'SUCCESS'
38
+ Start-Sleep -Seconds 1
39
+
40
+ } catch {
41
+ Write-Log "Error in model evaluation: $_" 'ERROR'
42
+ Set-Content -Path $StatusFile -Value 'Error: Model evaluation failed'
43
+ Start-Sleep -Seconds 1
44
+ } finally {
45
+ # Cleanup status file
46
+ if (Test-Path $StatusFile) {
47
+ Remove-Item -Path $StatusFile
48
+ }
49
+ }
model_evaluator.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+ STATUS_FILE="$HOME/bd-model-generations/status/model_evaluator.status"
3
+ LOG_FILE="$HOME/bd-model-generations/logs/actions.log"
4
+
5
+ echo "Evaluating model..." > "$STATUS_FILE"
6
+ # Simulate model evaluation (replace with actual logic)
7
+ sleep 5
8
+ echo "Model evaluation complete." > "$STATUS_FILE"
9
+ sleep 1
10
+ rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
model_trainer.ps1 ADDED
@@ -0,0 +1,52 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Configuration
2
+ $ProjectDir = Join-Path $env:USERPROFILE 'bd-model-generations'
3
+ $StatusFile = Join-Path $ProjectDir 'status\model_trainer.status'
4
+ $LogFile = Join-Path $ProjectDir 'logs\actions.log'
5
+
6
+ function Write-Log {
7
+ param([string]$Message, [string]$Type = 'INFO')
8
+ $timestamp = (Get-Date).ToString('yyyyMMdd_HHmmss')
9
+ Add-Content -Path $LogFile -Value "[$timestamp] $Type`: $Message"
10
+ }
11
+
12
+ # Ensure status directory exists
13
+ New-Item -ItemType Directory -Force -Path (Split-Path $StatusFile) | Out-Null
14
+
15
+ try {
16
+ # Initialize status
17
+ Set-Content -Path $StatusFile -Value 'Initializing model training...'
18
+ Write-Log 'Model trainer started' 'INFO'
19
+
20
+ # Simulated training progress (replace with actual logic)
21
+ $progressSteps = @(
22
+ @{ Status = 'Loading training dataset...'; Duration = 2 },
23
+ @{ Status = 'Initializing model architecture...'; Duration = 2 },
24
+ @{ Status = 'Training Epoch 1/5...'; Duration = 3 },
25
+ @{ Status = 'Training Epoch 2/5...'; Duration = 3 },
26
+ @{ Status = 'Training Epoch 3/5...'; Duration = 3 },
27
+ @{ Status = 'Training Epoch 4/5...'; Duration = 3 },
28
+ @{ Status = 'Training Epoch 5/5...'; Duration = 3 },
29
+ @{ Status = 'Saving model checkpoints...'; Duration = 1 }
30
+ )
31
+
32
+ foreach ($step in $progressSteps) {
33
+ Set-Content -Path $StatusFile -Value $step.Status
34
+ Write-Log $step.Status 'INFO'
35
+ Start-Sleep -Seconds $step.Duration
36
+ }
37
+
38
+ # Final status update
39
+ Set-Content -Path $StatusFile -Value 'Model training completed successfully'
40
+ Write-Log 'Model training completed' 'SUCCESS'
41
+ Start-Sleep -Seconds 1
42
+
43
+ } catch {
44
+ Write-Log "Error in model training: $_" 'ERROR'
45
+ Set-Content -Path $StatusFile -Value 'Error: Model training failed'
46
+ Start-Sleep -Seconds 1
47
+ } finally {
48
+ # Cleanup status file
49
+ if (Test-Path $StatusFile) {
50
+ Remove-Item -Path $StatusFile
51
+ }
52
+ }
model_trainer.sh ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Simulated model-training worker: publishes progress through a status file
# and records failures in the shared action log.
STATUS_FILE="$HOME/bd-model-generations/status/model_trainer.status"
LOG_FILE="$HOME/bd-model-generations/logs/actions.log"

# Fix: create the status and log directories up front; the redirections
# below fail with "No such file or directory" on a fresh checkout otherwise.
mkdir -p "$(dirname "$STATUS_FILE")" "$(dirname "$LOG_FILE")"

echo "Training model..." > "$STATUS_FILE"
# Simulate model training (replace with actual logic)
sleep 5
echo "Model training complete." > "$STATUS_FILE"
sleep 1
rm -f "$STATUS_FILE" || echo "[$(date '+%Y-%m-%d %H:%M:%S')] ERROR: Failed to remove $STATUS_FILE" >> "$LOG_FILE"
requirements.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Core dependencies
2
+ torch>=2.0.0
3
+ transformers>=4.30.0
4
+ datasets>=2.12.0
5
+ sentencepiece>=0.1.99
6
+ accelerate>=0.20.0
7
+ wandb>=0.15.0
8
+
9
+ # Data collection and processing
10
+ requests>=2.31.0
11
+ beautifulsoup4>=4.12.0
12
+ tqdm>=4.65.0
13
+
14
+ # Evaluation metrics
15
+ rouge-score>=0.1.2
16
+ sacrebleu>=2.3.1
17
+ pandas>=2.0.0
18
+ numpy>=1.24.0
19
+
20
+ # Utilities
21
+ # NOTE: pathlib, logging and typing are part of the Python 3 standard
22
+ # library. The PyPI "pathlib" backport (and the logging/typing pins below)
23
+ # target Python 2 and can shadow the stdlib, so they are intentionally
+ # commented out:
+ # pathlib>=1.0.1
+ # logging>=0.5.1.2
+ # typing>=3.7.4.3
24
+
25
+ # Development tools
26
+ black>=23.3.0
27
+ isort>=5.12.0
28
+ pylint>=2.17.0
29
+ pytest>=7.3.1
scripts/data_collector.py ADDED
@@ -0,0 +1,186 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import requests
from bs4 import BeautifulSoup
import time
import random
import json
from pathlib import Path
import logging
from urllib.parse import urljoin

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliDataCollector:
    """Collects Bengali text from Wikipedia and Prothom Alo into data/raw.

    Each scraper writes its own raw JSON file; ``collect()`` runs all
    scrapers and then merges them into ``processed_data.json``.
    """

    def __init__(self):
        # Browser-like UA so the news site serves regular HTML.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        self.output_dir = Path('data/raw')
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def make_request(self, url, retries=3, delay=1, timeout=30):
        """Make an HTTP GET with rate limiting, retries and a timeout.

        Args:
            url: Target URL.
            retries: Number of attempts before giving up.
            delay: Base delay in seconds, used both for pre-request rate
                limiting and for backoff between attempts.
            timeout: Per-request timeout in seconds (new parameter with a
                default, so existing callers are unaffected).

        Returns:
            The successful ``requests.Response``.

        Raises:
            requests.RequestException: if every attempt fails.
        """
        for attempt in range(retries):
            try:
                time.sleep(delay + random.random())  # Rate limiting with jitter
                # Fix: the original call had no timeout, so a single stalled
                # server could hang the collector indefinitely.
                response = requests.get(url, headers=self.headers, timeout=timeout)
                response.raise_for_status()
                return response
            except requests.RequestException as e:
                logger.warning(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
                if attempt == retries - 1:
                    logger.error(f"Failed to fetch {url} after {retries} attempts")
                    raise
                time.sleep(delay * (attempt + 1))  # Linear backoff between attempts

    def scrape_wikipedia(self):
        """Scrape Bengali text from Wikipedia.

        Follows up to 50 article links from the Bengali main page, stores
        the results in ``wikipedia_data.json`` and returns the number of
        articles collected (0 on total failure).
        """
        url = "https://bn.wikipedia.org/wiki/প্রধান_পাতা"
        logger.info(f"Scraping Wikipedia: {url}")

        try:
            response = self.make_request(url)
            soup = BeautifulSoup(response.content, 'html.parser')

            # Get main content and featured articles
            content_div = soup.find('div', {'id': 'mw-content-text'})
            articles = []

            if content_div:
                # Extract article links; ':' filters out namespaced pages
                # (File:, Template:, ...) which are not prose articles.
                article_links = content_div.find_all('a', href=True)
                for link in article_links[:50]:  # Limit to first 50 articles
                    if link['href'].startswith('/wiki/') and ':' not in link['href']:
                        article_url = urljoin('https://bn.wikipedia.org', link['href'])
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            article_content = article_soup.find('div', {'id': 'mw-content-text'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            # Best-effort: one bad article must not abort the crawl.
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            # Save Wikipedia data
            with open(self.output_dir / 'wikipedia_data.json', 'w', encoding='utf-8') as f:
                json.dump(articles, f, ensure_ascii=False, indent=2)

            return len(articles)
        except Exception as e:
            logger.error(f"Failed to scrape Wikipedia: {str(e)}")
            return 0

    def scrape_prothom_alo(self):
        """Scrape Bengali text from Prothom Alo.

        Visits a fixed set of category pages, follows up to 10 article
        links per category, stores results in ``prothomalo_data.json`` and
        returns the number of articles collected.
        """
        base_url = "https://www.prothomalo.com"
        categories = ['bangladesh', 'international', 'opinion', 'science-technology']
        articles = []

        for category in categories:
            url = f"{base_url}/{category}"
            logger.info(f"Scraping Prothom Alo category: {category}")

            try:
                response = self.make_request(url)
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find article links
                article_links = soup.find_all('a', href=True)
                for link in article_links[:10]:  # Limit to 10 articles per category
                    article_url = urljoin(base_url, link['href'])
                    if category in article_url:
                        try:
                            article_response = self.make_request(article_url)
                            article_soup = BeautifulSoup(article_response.content, 'html.parser')

                            # Extract article content
                            # NOTE(review): 'story-content' is assumed to be the
                            # site's article-body class — verify against live HTML.
                            article_content = article_soup.find('div', {'class': 'story-content'})
                            if article_content:
                                text = article_content.get_text(separator='\n', strip=True)
                                articles.append({
                                    'url': article_url,
                                    'category': category,
                                    'content': text
                                })
                                logger.info(f"Successfully scraped article: {article_url}")
                        except Exception as e:
                            logger.error(f"Failed to scrape article {article_url}: {str(e)}")

            except Exception as e:
                logger.error(f"Failed to scrape category {category}: {str(e)}")

        # Save Prothom Alo data
        with open(self.output_dir / 'prothomalo_data.json', 'w', encoding='utf-8') as f:
            json.dump(articles, f, ensure_ascii=False, indent=2)

        return len(articles)

    def collect(self):
        """Main method to collect data from all sources."""
        logger.info("Starting data collection")

        wiki_count = self.scrape_wikipedia()
        logger.info(f"Collected {wiki_count} articles from Wikipedia")

        prothomalo_count = self.scrape_prothom_alo()
        logger.info(f"Collected {prothomalo_count} articles from Prothom Alo")

        # Combine and process the collected data
        self.process_collected_data()

        logger.info("Data collection completed")

    def process_collected_data(self):
        """Merge the per-source raw JSON files into ``processed_data.json``.

        Raises:
            Exception: re-raised after logging if reading/merging fails
                (e.g. a scraper never produced its output file).
        """
        try:
            # Read collected data
            with open(self.output_dir / 'wikipedia_data.json', 'r', encoding='utf-8') as f:
                wiki_data = json.load(f)

            with open(self.output_dir / 'prothomalo_data.json', 'r', encoding='utf-8') as f:
                news_data = json.load(f)

            # Combine and format data into a single flat schema
            processed_data = []

            # Process Wikipedia articles
            for article in wiki_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'wikipedia',
                    'url': article['url']
                })

            # Process news articles
            for article in news_data:
                processed_data.append({
                    'text': article['content'],
                    'source': 'prothomalo',
                    'category': article.get('category', ''),
                    'url': article['url']
                })

            # Save processed data
            with open(self.output_dir / 'processed_data.json', 'w', encoding='utf-8') as f:
                json.dump(processed_data, f, ensure_ascii=False, indent=2)

            logger.info(f"Successfully processed {len(processed_data)} articles")

        except Exception as e:
            logger.error(f"Failed to process collected data: {str(e)}")
            raise


if __name__ == "__main__":
    collector = BengaliDataCollector()
    collector.collect()
scripts/model_evaluator.py ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import logging
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
import numpy as np
from typing import List, Dict, Any
from tqdm import tqdm
import pandas as pd
from rouge_score import rouge_scorer
from sacrebleu.metrics import BLEU
import wandb

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class ModelEvaluator:
    """Evaluates the fine-tuned Bengali code model against fixed
    prompt/reference pairs using BLEU and ROUGE, logging to W&B."""

    def __init__(self):
        self.model_dir = Path('outputs/model/final')
        self.output_dir = Path('outputs/evaluation')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Test prompts for different scenarios; "expected" holds the
        # reference text used for BLEU/ROUGE scoring.
        self.test_prompts = [
            # Programming task prompts
            {
                "type": "code_generation",
                "prompt": "একটি পাইথন ফাংশন লিখুন যা একটি সংখ্যার ফ্যাক্টরিয়াল বের করে।",
                "expected": """def factorial(n):
    if n == 0 or n == 1:
        return 1
    return n * factorial(n - 1)"""
            },
            {
                "type": "code_explanation",
                "prompt": "নিচের কোডটি ব্যাখ্যা করুন:\ndef bubble_sort(arr):\n    n = len(arr)\n    for i in range(n):\n        for j in range(0, n-i-1):\n            if arr[j] > arr[j+1]:\n                arr[j], arr[j+1] = arr[j+1], arr[j]",
                "expected": "এই কোডটি বাবল সর্ট অ্যালগরিদম বাস্তবায়ন করে। এটি একটি অ্যারেকে ক্রমানুসারে সাজায়।"
            },
            {
                "type": "error_fix",
                "prompt": "এই কোডে ভুল আছে, ঠিক করুন:\ndef calculate_sum(numbers)\n    total = 0\n    for num in numbers\n        total += num\n    return total",
                "expected": """def calculate_sum(numbers):
    total = 0
    for num in numbers:
        total += num
    return total"""
            },
            # Algorithm explanation prompts
            {
                "type": "algorithm_explanation",
                "prompt": "বাইনারি সার্চ অ্যালগরিদম কীভাবে কাজ করে সেটি ব্যাখ্যা করুন।",
                "expected": "বাইনারি সার্চ একটি দক্ষ অ্যালগরিদম যা সর্টেড অ্যারেতে একটি এলিমেন্ট খোঁজে। এটি প্রতিবার অ্যারের মধ্যবর্তী এলিমেন্ট চেক করে এবং সার্চ স্পেস অর্ধেক করে কমিয়ে ফেলে।"
            }
        ]

        # Evaluation metrics
        self.bleu = BLEU()
        self.rouge_scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)

    def load_model_and_tokenizer(self):
        """Load the trained model and tokenizer from ``self.model_dir``."""
        logger.info("Loading model and tokenizer")

        tokenizer = AutoTokenizer.from_pretrained(self.model_dir)
        # Fix: generation below passes pad_token_id, but causal-LM
        # tokenizers frequently ship without a pad token; fall back to EOS
        # so padding and generation do not break on a None pad id.
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            self.model_dir,
            trust_remote_code=True,
            # bf16 only when a GPU is present; fp32 keeps CPU inference safe
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        if torch.cuda.is_available():
            model = model.to('cuda')

        return model, tokenizer

    def generate_response(self, model, tokenizer, prompt: str, max_length: int = 512) -> str:
        """Generate a model response for *prompt*.

        Returns the decoded continuation with the prompt text removed, or
        an empty string if generation fails.
        """
        try:
            inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)

            if torch.cuda.is_available():
                inputs = {k: v.to('cuda') for k, v in inputs.items()}

            # Sampling parameters tuned for code generation (moderate
            # temperature, nucleus sampling, mild repetition penalty).
            outputs = model.generate(
                **inputs,
                max_length=max_length,
                num_return_sequences=1,
                temperature=0.7,
                top_p=0.95,
                do_sample=True,
                pad_token_id=tokenizer.pad_token_id,
                eos_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
            )

            response = tokenizer.decode(outputs[0], skip_special_tokens=True)
            # Causal LMs echo the prompt; strip it so only the answer remains.
            return response.replace(prompt, "").strip()

        except Exception as e:
            logger.error(f"Error generating response: {str(e)}")
            return ""

    def calculate_metrics(self, generated: str, expected: str) -> Dict[str, float]:
        """Compute BLEU and ROUGE F-measures for one generated/reference pair.

        Returns all-zero metrics (rather than raising) if scoring fails, so
        one bad sample cannot abort the evaluation loop.
        """
        try:
            # BLEU: sacrebleu reports 0-100, normalize to 0-1
            bleu_score = self.bleu.corpus_score(
                [generated],
                [[expected]]
            ).score / 100.0

            # Calculate ROUGE scores
            rouge_scores = self.rouge_scorer.score(generated, expected)

            return {
                'bleu': bleu_score,
                'rouge1_f': rouge_scores['rouge1'].fmeasure,
                'rouge2_f': rouge_scores['rouge2'].fmeasure,
                'rougeL_f': rouge_scores['rougeL'].fmeasure
            }
        except Exception as e:
            logger.error(f"Error calculating metrics: {str(e)}")
            return {
                'bleu': 0.0,
                'rouge1_f': 0.0,
                'rouge2_f': 0.0,
                'rougeL_f': 0.0
            }

    def evaluate(self):
        """Run the full evaluation.

        Generates a response for every test prompt, scores it, writes
        per-sample results (JSON) and per-type averages (CSV) under
        ``self.output_dir``, and returns the averages as a dict.

        Raises:
            Exception: re-raised after logging if evaluation fails.
        """
        try:
            # Initialize wandb for tracking
            wandb.init(project="bengali-code-llm", name="model-evaluation")

            # Load model and tokenizer
            model, tokenizer = self.load_model_and_tokenizer()

            # Store evaluation results
            results = []

            # Evaluate on test prompts
            for prompt_data in tqdm(self.test_prompts, desc="Evaluating prompts"):
                prompt_type = prompt_data["type"]
                prompt = prompt_data["prompt"]
                expected = prompt_data["expected"]

                # Generate response
                generated = self.generate_response(model, tokenizer, prompt)

                # Calculate metrics
                metrics = self.calculate_metrics(generated, expected)

                # Store result
                result = {
                    "type": prompt_type,
                    "prompt": prompt,
                    "generated": generated,
                    "expected": expected,
                    **metrics
                }
                results.append(result)

                # Log to wandb
                wandb.log({
                    f"{prompt_type}_bleu": metrics['bleu'],
                    f"{prompt_type}_rouge1": metrics['rouge1_f'],
                    f"{prompt_type}_rouge2": metrics['rouge2_f'],
                    f"{prompt_type}_rougeL": metrics['rougeL_f']
                })

            # Calculate average metrics by prompt type
            df = pd.DataFrame(results)
            avg_metrics = df.groupby('type')[['bleu', 'rouge1_f', 'rouge2_f', 'rougeL_f']].mean()

            # Save detailed results
            results_path = self.output_dir / 'evaluation_results.json'
            with open(results_path, 'w', encoding='utf-8') as f:
                json.dump(results, f, ensure_ascii=False, indent=2)

            # Save average metrics
            metrics_path = self.output_dir / 'average_metrics.csv'
            avg_metrics.to_csv(metrics_path)

            # Log final averages to wandb
            wandb.log({
                "avg_bleu": df['bleu'].mean(),
                "avg_rouge1": df['rouge1_f'].mean(),
                "avg_rouge2": df['rouge2_f'].mean(),
                "avg_rougeL": df['rougeL_f'].mean()
            })

            # Close wandb
            wandb.finish()

            logger.info(f"Evaluation completed. Results saved to {self.output_dir}")

            # Return average metrics
            return avg_metrics.to_dict()

        except Exception as e:
            logger.error(f"Evaluation failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on failure
            if wandb.run is not None:
                wandb.finish()


if __name__ == "__main__":
    evaluator = ModelEvaluator()
    evaluator.evaluate()
scripts/model_trainer.py ADDED
@@ -0,0 +1,232 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import logging
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
import wandb
import numpy as np
from datasets import load_dataset
from typing import Dict, List, Any

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class BengaliCodeDataset(Dataset):
    """Causal-LM dataset over the processed JSON corpus.

    Each item is tokenized to a fixed ``max_length`` with padding; labels
    mirror the input ids except that padding positions are masked out.
    """

    def __init__(self, data_path: Path, tokenizer, max_length: int = 2048):
        self.tokenizer = tokenizer
        self.max_length = max_length

        # Load the processed data (list of {"text": ...} records)
        with open(data_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)

        logger.info(f"Loaded {len(self.data)} examples from {data_path}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx) -> Dict[str, torch.Tensor]:
        item = self.data[idx]
        text = item['text']

        # Tokenize the text to a fixed-length, padded tensor
        encodings = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # Labels start as a copy of input_ids (causal language modeling)
        labels = encodings.input_ids.clone()

        attention_mask = encodings.attention_mask

        # Fix: mask padding positions with -100 so Hugging Face's
        # cross-entropy ignores them. Without this, every max_length-padded
        # sequence trained the model to predict pad tokens for most of its
        # positions.
        labels[attention_mask == 0] = -100

        return {
            'input_ids': encodings.input_ids[0],
            'attention_mask': attention_mask[0],
            'labels': labels[0]
        }


class ModelTrainer:
    """Fine-tunes a TinyLlama base model on the Bengali code corpus with
    the custom tokenizer, tracking runs in Weights & Biases."""

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.tokenizer_dir = Path('outputs/tokenizer')
        self.output_dir = Path('outputs/model')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Training configuration
        self.model_name = "TinyLlama/TinyLlama-1.1B-intermediate-step-1431k-3T"
        self.max_length = 2048
        self.batch_size = 4
        self.gradient_accumulation_steps = 4
        self.learning_rate = 2e-5
        self.num_train_epochs = 3
        self.warmup_steps = 100
        self.save_steps = 1000
        self.eval_steps = 500

    def setup_wandb(self):
        """Initialize Weights & Biases tracking with the run hyperparameters."""
        wandb.init(
            project="bengali-code-llm",
            name="tinyllama-bengali-code",
            config={
                "model_name": self.model_name,
                "max_length": self.max_length,
                "batch_size": self.batch_size,
                "learning_rate": self.learning_rate,
                "num_epochs": self.num_train_epochs
            }
        )

    def prepare_model_and_tokenizer(self):
        """Load the custom tokenizer and base model; resize embeddings to
        match the tokenizer's vocabulary."""
        logger.info("Loading tokenizer and model")

        # Load the custom tokenizer trained by tokenizer_trainer.py
        tokenizer = AutoTokenizer.from_pretrained(
            self.tokenizer_dir,
            model_max_length=self.max_length
        )

        # Load the base model
        model = AutoModelForCausalLM.from_pretrained(
            self.model_name,
            trust_remote_code=True,
            torch_dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32
        )

        # Resize token embeddings to match our tokenizer's vocab
        model.resize_token_embeddings(len(tokenizer))

        return model, tokenizer

    def create_datasets(self, tokenizer):
        """Split the processed corpus 90/10, persist the splits, and return
        (train_dataset, val_dataset)."""
        logger.info("Creating datasets")

        # Load the processed data
        data_path = self.data_dir / 'processed_data.json'

        # Split data into train and validation
        with open(data_path, 'r', encoding='utf-8') as f:
            all_data = json.load(f)

        # Fixed seed keeps the split reproducible across runs
        np.random.seed(42)
        np.random.shuffle(all_data)

        split_idx = int(len(all_data) * 0.9)  # 90% train, 10% validation
        train_data = all_data[:split_idx]
        val_data = all_data[split_idx:]

        # Save split data so the exact split can be audited later
        train_path = self.data_dir / 'train.json'
        val_path = self.data_dir / 'validation.json'

        with open(train_path, 'w', encoding='utf-8') as f:
            json.dump(train_data, f, ensure_ascii=False, indent=2)

        with open(val_path, 'w', encoding='utf-8') as f:
            json.dump(val_data, f, ensure_ascii=False, indent=2)

        # Create datasets
        train_dataset = BengaliCodeDataset(train_path, tokenizer, self.max_length)
        val_dataset = BengaliCodeDataset(val_path, tokenizer, self.max_length)

        return train_dataset, val_dataset

    def create_training_arguments(self):
        """Create TrainingArguments for the Trainer."""
        return TrainingArguments(
            output_dir=str(self.output_dir),
            num_train_epochs=self.num_train_epochs,
            per_device_train_batch_size=self.batch_size,
            per_device_eval_batch_size=self.batch_size,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            evaluation_strategy="steps",
            eval_steps=self.eval_steps,
            save_strategy="steps",
            save_steps=self.save_steps,
            learning_rate=self.learning_rate,
            warmup_steps=self.warmup_steps,
            weight_decay=0.01,
            logging_dir=str(self.output_dir / 'logs'),
            logging_steps=100,
            report_to="wandb",
            save_total_limit=3,
            load_best_model_at_end=True,
            metric_for_best_model="eval_loss",
            greater_is_better=False,
            fp16=torch.cuda.is_available(),
            remove_unused_columns=False
        )

    def train(self):
        """Run the full training pipeline and save the final model/tokenizer
        under ``outputs/model/final``.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            # Initialize wandb
            self.setup_wandb()

            # Prepare model and tokenizer
            model, tokenizer = self.prepare_model_and_tokenizer()

            # Create datasets
            train_dataset, val_dataset = self.create_datasets(tokenizer)

            # Create training arguments
            training_args = self.create_training_arguments()

            # Create data collator (mlm=False: causal language modeling)
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False
            )

            # Initialize trainer
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                tokenizer=tokenizer
            )

            # Train the model
            logger.info("Starting model training")
            trainer.train()

            # Save the final model and tokenizer together so the evaluator
            # can load both from one directory
            trainer.save_model(str(self.output_dir / 'final'))
            tokenizer.save_pretrained(str(self.output_dir / 'final'))

            # Close wandb
            wandb.finish()

            logger.info("Model training completed successfully")

        except Exception as e:
            logger.error(f"Model training failed: {str(e)}")
            raise
        finally:
            # Ensure wandb is properly closed even on failure
            if wandb.run is not None:
                wandb.finish()


if __name__ == "__main__":
    trainer = ModelTrainer()
    trainer.train()
scripts/tokenizer_trainer.py ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
from pathlib import Path
import sentencepiece as spm
import logging
from typing import List, Dict
import shutil

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)


class TokenizerTrainer:
    """Trains a SentencePiece tokenizer over the processed Bengali corpus
    and writes Hugging Face compatibility files next to the model."""

    def __init__(self):
        self.data_dir = Path('data/raw')
        self.output_dir = Path('outputs/tokenizer')
        self.output_dir.mkdir(parents=True, exist_ok=True)

        # Tokenizer configuration
        self.vocab_size = 32000
        self.character_coverage = 0.9999
        self.model_type = "unigram"
        # Symbols guaranteed to survive as single tokens
        self.special_tokens = [
            "[PAD]", "[UNK]", "[CLS]", "[SEP]", "[MASK]",
            "<s>", "</s>", "<pad>", "<unk>", "<mask>",
            "২০", "১০", "৫০", "১৫", "২৫",  # Common Bengali numbers
            "def", "class", "return", "if", "else", "for", "while",  # Code keywords
            "print", "input", "import", "from", "try", "except",
            "#", "//", "/*", "*/", "'''", '"""'  # Code comments
        ]

    def prepare_training_data(self) -> str:
        """Flatten the processed corpus to one sentence per line.

        Returns the path (as str) of the temporary training text file.

        Raises:
            FileNotFoundError: if the processed corpus is missing.
        """
        logger.info("Preparing training data for tokenizer")

        # Load processed data
        try:
            with open(self.data_dir / 'processed_data.json', 'r', encoding='utf-8') as f:
                data = json.load(f)
        except FileNotFoundError:
            logger.error("Processed data file not found. Run data collection first.")
            raise

        # Create temporary file for training
        train_file = self.output_dir / 'train.txt'
        with open(train_file, 'w', encoding='utf-8') as f:
            for item in data:
                text = item['text']
                # Write one sentence per line
                sentences = text.split('।')  # Split on Bengali full stop
                for sentence in sentences:
                    sentence = sentence.strip()
                    if sentence:  # Skip empty sentences
                        f.write(sentence + '\n')

        logger.info("Training data prepared successfully")
        return str(train_file)

    def train_tokenizer(self, train_file: str):
        """Train the SentencePiece tokenizer on *train_file*.

        Raises:
            Exception: re-raised after logging if training fails.
        """
        logger.info("Starting tokenizer training")

        # Prepare model prefix
        model_prefix = self.output_dir / "bengali_code"

        # Training parameters (values may be int/str; they are stringified
        # when the command line is assembled below)
        params = {
            "input": train_file,
            "model_prefix": str(model_prefix),
            "vocab_size": self.vocab_size,
            "character_coverage": self.character_coverage,
            "model_type": self.model_type,
            "pad_id": 0,
            "unk_id": 1,
            "bos_id": 2,
            "eos_id": 3,
            "user_defined_symbols": ",".join(self.special_tokens),
            "max_sentence_length": 4192,
            "input_sentence_size": 5000000,
            "shuffle_input_sentence": "true",
            "normalization_rule_name": "identity"  # Preserve original text
        }

        # Fix: the original appended keys and raw values into a list and
        # crashed in " ".join() on the int values (TypeError); SentencePiece
        # also expects "--key=value" pairs, not space-separated key/value
        # tokens, so the command would have been rejected anyway.
        arg_string = " ".join(f"--{key}={value}" for key, value in params.items())

        try:
            # Train the tokenizer
            spm.SentencePieceTrainer.train(arg_string)
            logger.info("Tokenizer training completed successfully")

            # Create config files for HuggingFace compatibility
            self.create_huggingface_files(model_prefix)

        except Exception as e:
            logger.error(f"Failed to train tokenizer: {str(e)}")
            raise

    def create_huggingface_files(self, model_prefix: Path):
        """Write tokenizer_config.json and special_tokens_map.json so the
        trained model can be loaded via Hugging Face AutoTokenizer."""
        logger.info("Creating HuggingFace compatibility files")

        # Create tokenizer config
        tokenizer_config = {
            "model_max_length": 2048,
            "padding_side": "right",
            "truncation_side": "right",
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>",
            "model_type": self.model_type,
            "vocab_size": self.vocab_size
        }

        with open(self.output_dir / "tokenizer_config.json", 'w', encoding='utf-8') as f:
            json.dump(tokenizer_config, f, ensure_ascii=False, indent=2)

        # Create special tokens map
        special_tokens_map = {
            "bos_token": "<s>",
            "eos_token": "</s>",
            "unk_token": "<unk>",
            "pad_token": "<pad>",
            "mask_token": "<mask>"
        }

        with open(self.output_dir / "special_tokens_map.json", 'w', encoding='utf-8') as f:
            json.dump(special_tokens_map, f, ensure_ascii=False, indent=2)

        logger.info("HuggingFace compatibility files created successfully")

    def train(self):
        """Run the full pipeline: prepare data, train, clean up.

        Raises:
            Exception: re-raised after logging if any stage fails.
        """
        try:
            # Prepare training data
            train_file = self.prepare_training_data()

            # Train tokenizer
            self.train_tokenizer(train_file)

            # Clean up temporary files
            if Path(train_file).exists():
                Path(train_file).unlink()

            logger.info("Tokenizer training pipeline completed successfully")

        except Exception as e:
            logger.error(f"Tokenizer training pipeline failed: {str(e)}")
            raise


if __name__ == "__main__":
    trainer = TokenizerTrainer()
    trainer.train()
special_tokens_map.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ {
3
+ "additional_special_tokens": [
4
+ "<|im_start|>",
5
+ "<|im_end|>",
6
+ "<|object_ref_start|>",
7
+ "<|object_ref_end|>",
8
+ "<|box_start|>",
9
+ "<|box_end|>",
10
+ "<|quad_start|>",
11
+ "<|quad_end|>",
12
+ "<|vision_start|>",
13
+ "<|vision_end|>",
14
+ "<|vision_pad|>",
15
+ "<|image_pad|>",
16
+ "<|video_pad|>"
17
+ ],
18
+ "eos_token": {
19
+ "content": "<|im_end|>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<|endoftext|>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ }
32
+ }
start.sh ADDED
@@ -0,0 +1,438 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/bin/bash
2
+
3
+ # === SYSTEM PROMPT ===
4
+ # This script builds a Bengali language model using a multi-agent system with human-in-the-loop (HIL) capabilities.
5
+ # Advanced Features:
6
+ # - Real-Time Streaming: Displays a colorful, dynamic status dashboard in the terminal.
7
+ # - Robust Error Handling: Validates setup, API calls, and file operations with detailed logging.
8
+ # - Modern Interface: Uses ANSI colors, progress bars, and a boxed header for a polished look.
9
+ # - Loop and Iteration: Monitors execution, retries on failure, and ensures task completion.
10
+ # - Code Execution: Executes Python and Node.js code locally for preprocessing and evaluation.
11
+ # - Tools: Provides Python and Node.js REPLs for file operations and analysis.
12
+ # - Time Travel: Logs actions with timestamps for debugging and auditing.
13
+ # - Subgraph Support: Encapsulates tasks (data collection, preprocessing, training, evaluation) as reusable nodes.
14
+ # - Memory: Persists state across agent interactions using a key-value store.
15
+ # - API Integrations: Uses Together, Cohere, and Gemini APIs (Together as primary for text generation).
16
+ # - File Operations: Creates, edits, and validates files with error checking.
17
+ # - Output: Saves the model to /storage/BA73-022B/bd/bd-model-genaretions/model.pt.
18
+
19
# === CONFIGURATION ===
PROJECT_DIR="/storage/BA73-022B/bd/bd-model-genaretions" # Updated as per user request
LOG_FILE="$PROJECT_DIR/logs/actions.log"
MEMORY_FILE="$PROJECT_DIR/memory.txt"
REQUESTS_DIR="$PROJECT_DIR/requests"
RESPONSES_DIR="$PROJECT_DIR/responses"
DATA_DIR="$PROJECT_DIR/data"
STATUS_DIR="$PROJECT_DIR/status"

# API Keys — read from the environment; never commit secrets to source control.
# SECURITY: keys were previously hard-coded here and are therefore exposed in
# version-control history; they must be rotated with each provider.
TOGETHER_API_KEY="${TOGETHER_API_KEY:-}"
COHERE_API_KEY="${COHERE_API_KEY:-}"
GEMINI_API_KEY="${GEMINI_API_KEY:-}"
32
+
33
# === SETUP ===
# Create the project directory tree and the log/memory files, failing fast
# with a red error message if any of them cannot be created.
echo "Initializing project directories..."
for dir in "$PROJECT_DIR" "$DATA_DIR" "$REQUESTS_DIR" "$RESPONSES_DIR" "$PROJECT_DIR/logs" "$STATUS_DIR"; do
    if ! mkdir -p "$dir"; then
        echo -e "\033[1;31mError: Failed to create directory $dir\033[0m"
        exit 1
    fi
done

touch "$LOG_FILE" "$MEMORY_FILE"
if [ ! -f "$LOG_FILE" ] || [ ! -f "$MEMORY_FILE" ]; then
    echo -e "\033[1;31mError: Failed to create log or memory file\033[0m"
    exit 1
fi
echo "[$(date)] Starting Bengali language model generation" >> "$LOG_FILE"
49
+
50
+ # === UTILITY FUNCTIONS ===
51
+ # Memory Management
52
# Persist a key=value pair, replacing any existing entry for the same key.
function set_memory {
    local key="$1"
    local value="$2"
    # NOTE: grep -v exits non-zero when it emits no lines (e.g. the memory
    # file is empty or only contains this key), so chaining mv with '&&'
    # silently skipped the rewrite and left duplicate keys behind.
    grep -v "^$key=" "$MEMORY_FILE" > "$MEMORY_FILE.tmp" || true
    mv "$MEMORY_FILE.tmp" "$MEMORY_FILE"
    echo "$key=$value" >> "$MEMORY_FILE"
}
58
+
59
# Look up a key in the memory file; prints its value, or "false" when unset.
function get_memory {
    local key="$1"
    local value
    # -f2- keeps the full value even when it contains '=' (plain -f2
    # truncated such values); head -n1 guards against duplicate entries.
    value=$(grep "^$key=" "$MEMORY_FILE" | head -n 1 | cut -d'=' -f2-)
    echo "${value:-false}"
}
64
+
65
+ # Logging (Time Travel)
66
# Append a timestamped, agent-tagged audit entry to the shared log
# ("time travel" record for debugging).
function log_action {
    local who="$1"
    local what="$2"
    echo "[$(date)] [Agent $who] $what" >> "$LOG_FILE"
}
71
+
72
+ # Status Updates
73
# Publish an agent's current status line for the dashboard to display.
function set_status {
    local id="$1"
    local message="$2"
    echo "$message" > "$STATUS_DIR/agent$id.status"
}
78
+
79
+ # === TOOL CALLING FUNCTIONS ===
80
# Execute a Python snippet locally; stderr goes to the action log.
# Prints the snippet's stdout on success; returns python3's exit code on failure.
function run_python {
    local code="$1"
    log_action "Tool" "Running Python code: $code"
    local output exit_code
    # Declare and assign separately: with 'local output=$(cmd)', $? is the
    # exit status of the 'local' builtin (always 0), so interpreter failures
    # were never detected.
    output=$(python3 -c "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Python execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
91
+
92
# Execute a Node.js snippet locally; stderr goes to the action log.
# Prints the snippet's stdout on success; returns node's exit code on failure.
function run_node {
    local code="$1"
    log_action "Tool" "Running Node.js code: $code"
    local output exit_code
    # Declare and assign separately: 'local output=$(cmd)' makes $? the exit
    # status of 'local' (always 0), masking execution failures.
    output=$(node -e "$code" 2>> "$LOG_FILE")
    exit_code=$?
    if [ $exit_code -ne 0 ]; then
        log_action "Tool" "Node.js execution failed with exit code $exit_code"
        return $exit_code
    fi
    echo "$output"
}
103
+
104
+ # === API CALLING FUNCTIONS ===
105
# POST a completion request to the Together API; prints the raw JSON response.
function call_together_api {
    local prompt="$1"
    local payload
    # Build the payload with jq so quotes/newlines in the prompt are properly
    # JSON-escaped; naive string interpolation produced invalid JSON. jq is
    # already a hard dependency (response parsing in generate_text).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, model: "some_model", max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.together.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $TOGETHER_API_KEY" \
        -d "$payload"
}
112
+
113
# POST a generation request to the Cohere API; prints the raw JSON response.
function call_cohere_api {
    local prompt="$1"
    local payload
    # jq-built payload keeps the prompt properly JSON-escaped (see
    # call_together_api for rationale).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, max_tokens: 100}')
    curl -s -m 10 -X POST "https://api.cohere.ai/generate" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $COHERE_API_KEY" \
        -d "$payload"
}
120
+
121
# POST a completion request to the Gemini endpoint; prints the raw JSON response.
# NOTE(review): the URL "api.gemini.ai" does not match Google's published
# Gemini endpoint — verify against the provider's documentation.
function call_gemini_api {
    local prompt="$1"
    local payload
    # jq-built payload keeps the prompt properly JSON-escaped (see
    # call_together_api for rationale).
    payload=$(jq -n --arg p "$prompt" '{prompt: $p, model: "some_model"}')
    curl -s -m 10 -X POST "https://api.gemini.ai/v1/completions" \
        -H "Content-Type: application/json" \
        -H "Authorization: Bearer $GEMINI_API_KEY" \
        -d "$payload"
}
128
+
129
# Ask the selected backend (together/cohere/gemini) for text, retrying up to
# three times. Prints the generated text and returns 0 on success; returns 1
# after all attempts fail.
function generate_text {
    local prompt="$1"
    local backend="$2"
    local max_tries=3
    local try raw out
    for ((try = 1; try <= max_tries; try++)); do
        case "$backend" in
            together)
                raw=$(call_together_api "$prompt")
                out=$(echo "$raw" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            cohere)
                raw=$(call_cohere_api "$prompt")
                out=$(echo "$raw" | jq -r '.generations[0].text' 2>/dev/null)
                ;;
            gemini)
                raw=$(call_gemini_api "$prompt")
                out=$(echo "$raw" | jq -r '.choices[0].text' 2>/dev/null)
                ;;
            *)
                out="Unknown API"
                ;;
        esac
        # Accept only a non-empty, non-null answer that doesn't mention Error.
        if [ -n "$out" ] && [ "$out" != "null" ] && [[ ! "$out" =~ "Error" ]]; then
            echo "$out"
            return 0
        fi
        log_action "API" "Attempt $try failed for $backend API, retrying..."
        sleep 2
    done
    log_action "API" "Failed to generate text with $backend after $max_tries attempts"
    return 1
}
162
+
163
+ # === HUMAN-IN-THE-LOOP REQUEST FUNCTION ===
164
# Write a question into the requests mailbox and block until the operator
# drops an answer into the responses mailbox; echoes the answer.
function request_human_input {
    local agent="$1"
    local question="$2"
    log_action "$agent" "Requesting human input: $question"
    set_status "$agent" "Waiting for human input"
    echo "$question" > "$REQUESTS_DIR/agent$agent.txt"
    local reply_file="$RESPONSES_DIR/agent$agent.txt"
    until [ -f "$reply_file" ]; do
        sleep 1
    done
    local reply
    reply=$(cat "$reply_file")
    rm "$reply_file"
    log_action "$agent" "Received human response: $reply"
    set_status "$agent" "Processing human input"
    echo "$reply"
}
179
+
180
+ # === SUBGRAPH FUNCTIONS ===
181
# Subgraph node: generate a Bengali text sample via the Together API and save
# it to the data directory. Sets memory key data_collected=true on success;
# returns 1 on API or write failure.
function collect_data {
    set_status 1 "Generating Bengali text via API"
    local prompt="Generate a sample of Bengali text for language model training."
    local text
    # Declare and assign separately: 'local text=$(cmd)' makes $? the exit
    # status of 'local' (always 0), so the API-error branch below was
    # unreachable in the original.
    text=$(generate_text "$prompt" "together")
    if [ $? -ne 0 ]; then
        set_status 1 "API error"
        log_action 1 "Failed to collect data due to API error"
        return 1
    fi
    set_status 1 "Saving data to file"
    if ! echo "$text" > "$DATA_DIR/bengali_text.txt"; then
        set_status 1 "Error: Failed to save data"
        log_action 1 "Failed to write to $DATA_DIR/bengali_text.txt"
        return 1
    fi
    log_action 1 "Data saved to $DATA_DIR/bengali_text.txt"
    set_memory "data_collected" "true"
    set_status 1 "Data collection completed"
}
202
+
203
# Subgraph node: wait for collected data, analyze it, and route the result
# through human review (approve/edit/reject). Sets data_preprocessed=true on
# success; returns 1 on failure or rejection.
function preprocess_data {
    set_status 2 "Waiting for data collection"
    while [ "$(get_memory 'data_collected')" != "true" ]; do
        sleep 1
    done
    set_status 2 "Analyzing data"
    if [ ! -f "$DATA_DIR/bengali_text.txt" ]; then
        set_status 2 "Error: Data file missing"
        log_action 2 "Error: No data file found at $DATA_DIR/bengali_text.txt"
        return 1
    fi
    local output
    # Assign separately from 'local' so $? reflects run_python's exit status,
    # not the 'local' builtin's (always 0) — the failure branch below was
    # otherwise unreachable.
    output=$(run_python "with open('$DATA_DIR/bengali_text.txt', 'r') as f: text = f.read(); print(f'Text length: {len(text)} characters')")
    if [ $? -ne 0 ]; then
        set_status 2 "Error: Analysis failed"
        log_action 2 "Preprocessing analysis failed"
        return 1
    fi
    log_action 2 "Analysis result: $output"
    set_status 2 "Awaiting human review"
    local response
    response=$(request_human_input 2 "Review the Bengali text in $DATA_DIR/bengali_text.txt (approve/reject/edit)")
    case "$response" in
        approve)
            set_status 2 "Saving preprocessed data"
            echo "Data preprocessed" > "$DATA_DIR/preprocessed_text.txt"
            log_action 2 "Preprocessing approved, saved to $DATA_DIR/preprocessed_text.txt"
            set_memory "data_preprocessed" "true"
            ;;
        edit)
            set_status 2 "Editing data"
            log_action 2 "Human requested edit; applying transformation"
            # The original one-liner placed a second 'with' statement after a
            # semicolon, which is a Python SyntaxError; use expression
            # statements instead so the snippet actually runs.
            run_python "text = open('$DATA_DIR/bengali_text.txt', 'r').read(); open('$DATA_DIR/preprocessed_text.txt', 'w').write(text.upper())"
            if [ $? -eq 0 ]; then
                set_memory "data_preprocessed" "true"
            else
                set_status 2 "Error: Edit failed"
                return 1
            fi
            ;;
        *)
            set_status 2 "Preprocessing rejected"
            log_action 2 "Preprocessing rejected by human"
            return 1
            ;;
    esac
}
248
+
249
# Subgraph node: once preprocessing is done, simulate a training run and write
# the model artifact. Sets model_trained=true on success; returns 1 otherwise.
function train_model {
    set_status 3 "Waiting for preprocessing"
    while [ "$(get_memory 'data_preprocessed')" != "true" ]; do
        sleep 1
    done
    set_status 3 "Training model"
    if [ ! -f "$DATA_DIR/preprocessed_text.txt" ]; then
        set_status 3 "Error: Preprocessed data missing"
        log_action 3 "Error: No preprocessed data found at $DATA_DIR/preprocessed_text.txt"
        return 1
    fi
    echo "Training Bengali model..."
    sleep 2 # Simulate training
    if ! echo "Model trained" > "$PROJECT_DIR/model.pt"; then
        set_status 3 "Error: Failed to save model"
        log_action 3 "Failed to save model to $PROJECT_DIR/model.pt"
        return 1
    fi
    log_action 3 "Model saved to $PROJECT_DIR/model.pt"
    set_memory "model_trained" "true"
    set_status 3 "Training completed"
}
272
+
273
# Subgraph node: evaluate the trained model and route the verdict through
# human review (approve/fix/reject). Sets evaluation_completed=true on
# success; returns 1 on failure or rejection.
function evaluate_model {
    set_status 4 "Waiting for model training"
    while [ "$(get_memory 'model_trained')" != "true" ]; do
        sleep 1
    done
    set_status 4 "Evaluating model"
    if [ ! -f "$PROJECT_DIR/model.pt" ]; then
        set_status 4 "Error: Model file missing"
        log_action 4 "Error: No model file found at $PROJECT_DIR/model.pt"
        return 1
    fi
    local output
    # Declare and assign separately: 'local output=$(cmd)' makes $? the exit
    # status of 'local' (always 0), so evaluation failures were never caught.
    output=$(run_python "print('Simulated accuracy: 85%')")
    if [ $? -ne 0 ]; then
        set_status 4 "Error: Evaluation failed"
        log_action 4 "Evaluation failed"
        return 1
    fi
    set_status 4 "Awaiting human review"
    local response
    response=$(request_human_input 4 "Review model performance: $output (approve/reject/fix)")
    case "$response" in
        approve)
            log_action 4 "Evaluation approved"
            set_memory "evaluation_completed" "true"
            ;;
        fix)
            set_status 4 "Fixing model"
            log_action 4 "Human requested fix; simulating correction"
            echo "Fixed model" > "$PROJECT_DIR/model.pt"
            set_memory "evaluation_completed" "true"
            ;;
        *)
            set_status 4 "Evaluation rejected"
            log_action 4 "Evaluation rejected by human"
            return 1
            ;;
    esac
    set_status 4 "Evaluation completed"
}
311
+
312
+ # === AGENT FUNCTIONS ===
313
# Agent 1: retry data collection until it succeeds, then mark completion.
function agent1 {
    set_status 1 "Starting data collection"
    while ! collect_data; do
        set_status 1 "Retrying data collection"
        log_action 1 "Retrying data collection after failure"
        sleep 2
    done
    set_status 1 "Data collection completed"
    set_memory "agent1_completed" "true"
}
323
+
324
# Agent 2: retry preprocessing until it succeeds, then mark completion.
function agent2 {
    set_status 2 "Starting preprocessing"
    while ! preprocess_data; do
        set_status 2 "Retrying preprocessing"
        log_action 2 "Retrying preprocessing after failure"
        sleep 2
    done
    set_status 2 "Preprocessing completed"
    set_memory "agent2_completed" "true"
}
334
+
335
# Agent 3: retry training until it succeeds, then mark completion.
function agent3 {
    set_status 3 "Starting training"
    while ! train_model; do
        set_status 3 "Retrying training"
        log_action 3 "Retrying training after failure"
        sleep 2
    done
    set_status 3 "Training completed"
    set_memory "agent3_completed" "true"
}
345
+
346
# Agent 4: retry evaluation until it succeeds, then mark completion.
function agent4 {
    set_status 4 "Starting evaluation"
    while ! evaluate_model; do
        set_status 4 "Retrying evaluation"
        log_action 4 "Retrying evaluation after failure"
        sleep 2
    done
    set_status 4 "Evaluation completed"
    set_memory "agent4_completed" "true"
}
356
+
357
+ # === STATUS DISPLAY ===
358
# Render the terminal dashboard: boxed header, one colored status line per
# agent, and an overall progress figure (25% per completed agent).
function display_status {
    echo -e "\033[1;34m┌─────────────────────── STATUS DASHBOARD ──────────────────────┐\033[0m"
    echo -e "\033[1;34m│ Bengali Language Model Generation - $(date +%H:%M:%S) │\033[0m"
    echo -e "\033[1;34m└───────────────────────────────────────────────────────────────┘\033[0m"
    local done_count=0
    local id state color
    for id in 1 2 3 4; do
        state="Not started"
        if [ -f "$STATUS_DIR/agent$id.status" ]; then
            state=$(cat "$STATUS_DIR/agent$id.status")
        fi
        if [ "$(get_memory "agent${id}_completed")" == "true" ]; then
            state="Completed"
            ((done_count++))
        fi
        case $id in
            1) color="\033[1;32m" ;; # Green
            2) color="\033[1;33m" ;; # Yellow
            3) color="\033[1;34m" ;; # Blue
            4) color="\033[1;35m" ;; # Magenta
        esac
        echo -e "${color}Agent $id: $state\033[0m"
    done
    echo -e "\033[1;36mProgress: [$done_count/4] $((done_count * 25))%\033[0m"
}
383
+
384
+ # === HUMAN-IN-THE-LOOP HANDLER ===
385
# Foreground loop: repaint the dashboard, relay HIL requests between the
# background agents and the human operator, and exit once all four agents
# report completion via the memory store.
function hil_handler {
    while true; do
        clear
        display_status
        if [ "$(get_memory 'agent1_completed')" == "true" ] && \
           [ "$(get_memory 'agent2_completed')" == "true" ] && \
           [ "$(get_memory 'agent3_completed')" == "true" ] && \
           [ "$(get_memory 'agent4_completed')" == "true" ]; then
            log_action "HIL" "All agents completed successfully"
            echo -e "\033[1;32m✓ All agents completed! Model generation successful.\033[0m"
            break
        fi
        local pending who question answer
        for pending in "$REQUESTS_DIR"/*; do
            # Skip the literal glob when the mailbox is empty.
            [ -f "$pending" ] || continue
            who=$(basename "$pending" .txt | sed 's/agent//')
            question=$(cat "$pending")
            echo -e "\n\033[1;33mAgent $who requests your input:\033[0m $question"
            echo -e "\033[1;33mEnter response (e.g., approve/reject/edit/fix):\033[0m"
            read -r answer
            if [ -z "$answer" ]; then
                echo -e "\033[1;31mError: Input cannot be empty. Try again.\033[0m"
                continue
            fi
            echo "$answer" > "$RESPONSES_DIR/agent$who.txt"
            rm "$pending"
        done
        sleep 1
    done
}
415
+
416
+ # === CLEANUP ON EXIT ===
417
# === CLEANUP ON EXIT ===
# On SIGINT/SIGTERM: clear any in-flight HIL mailboxes, record the
# interruption in the audit log, and exit with a failure status.
function cleanup {
    echo -e "\033[1;31mScript interrupted. Cleaning up...\033[0m"
    rm -f "$REQUESTS_DIR"/* "$RESPONSES_DIR"/* 2>/dev/null
    log_action "Main" "Script terminated by user"
    exit 1
}
trap cleanup INT TERM
424
+
425
# === MAIN EXECUTION ===
echo -e "\033[1;32mStarting Bengali language model generation...\033[0m"
log_action "Main" "Script execution started"

# Launch the four pipeline agents concurrently; they coordinate through the
# memory file while the foreground HIL handler drives the dashboard and
# relays human input.
agent1 &
agent2 &
agent3 &
agent4 &

hil_handler

echo -e "\033[1;32mProcess completed successfully!\033[0m"
echo "Model saved at: $PROJECT_DIR/model.pt"
echo "Detailed logs available at: $LOG_FILE"
tokenizer.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:21bff87aabfb69a9aafc1c1c6d1b60bbf3138e2e2b7545924f62b3c5b3c3d587
3
+ size 16
tokenizer_config.json ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_prefix_space": false,
4
+ "added_tokens_decoder": {
5
+ "151643": {
6
+ "content": "<|endoftext|>",
7
+ "lstrip": false,
8
+ "normalized": false,
9
+ "rstrip": false,
10
+ "single_word": false,
11
+ "special": true
12
+ },
13
+ "151644": {
14
+ "content": "<|im_start|>",
15
+ "lstrip": false,
16
+ "normalized": false,
17
+ "rstrip": false,
18
+ "single_word": false,
19
+ "special": true
20
+ },
21
+ "151645": {
22
+ "content": "<|im_end|>",
23
+ "lstrip": false,
24
+ "normalized": false,
25
+ "rstrip": false,
26
+ "single_word": false,
27
+ "special": true
28
+ },
29
+ "151646": {
30
+ "content": "<|object_ref_start|>",
31
+ "lstrip": false,
32
+ "normalized": false,
33
+ "rstrip": false,
34
+ "single_word": false,
35
+ "special": true
36
+ },
37
+ "151647": {
38
+ "content": "<|object_ref_end|>",
39
+ "lstrip": false,
40
+ "normalized": false,
41
+ "rstrip": false,
42
+ "single_word": false,
43
+ "special": true
44
+ },
45
+ "151648": {
46
+ "content": "<|box_start|>",
47
+ "lstrip": false,
48
+ "normalized": false,
49
+ "rstrip": false,
50
+ "single_word": false,
51
+ "special": true
52
+ },
53
+ "151649": {
54
+ "content": "<|box_end|>",
55
+ "lstrip": false,
56
+ "normalized": false,
57
+ "rstrip": false,
58
+ "single_word": false,
59
+ "special": true
60
+ },
61
+ "151650": {
62
+ "content": "<|quad_start|>",
63
+ "lstrip": false,
64
+ "normalized": false,
65
+ "rstrip": false,
66
+ "single_word": false,
67
+ "special": true
68
+ },
69
+ "151651": {
70
+ "content": "<|quad_end|>",
71
+ "lstrip": false,
72
+ "normalized": false,
73
+ "rstrip": false,
74
+ "single_word": false,
75
+ "special": true
76
+ },
77
+ "151652": {
78
+ "content": "<|vision_start|>",
79
+ "lstrip": false,
80
+ "normalized": false,
81
+ "rstrip": false,
82
+ "single_word": false,
83
+ "special": true
84
+ },
85
+ "151653": {
86
+ "content": "<|vision_end|>",
87
+ "lstrip": false,
88
+ "normalized": false,
89
+ "rstrip": false,
90
+ "single_word": false,
91
+ "special": true
92
+ },
93
+ "151654": {
94
+ "content": "<|vision_pad|>",
95
+ "lstrip": false,
96
+ "normalized": false,
97
+ "rstrip": false,
98
+ "single_word": false,
99
+ "special": true
100
+ },
101
+ "151655": {
102
+ "content": "<|image_pad|>",
103
+ "lstrip": false,
104
+ "normalized": false,
105
+ "rstrip": false,
106
+ "single_word": false,
107
+ "special": true
108
+ },
109
+ "151656": {
110
+ "content": "<|video_pad|>",
111
+ "lstrip": false,
112
+ "normalized": false,
113
+ "rstrip": false,
114
+ "single_word": false,
115
+ "special": true
116
+ },
117
+ "151657": {
118
+ "content": "<tool_call>",
119
+ "lstrip": false,
120
+ "normalized": false,
121
+ "rstrip": false,
122
+ "single_word": false,
123
+ "special": false
124
+ },
125
+ "151658": {
126
+ "content": "</tool_call>",
127
+ "lstrip": false,
128
+ "normalized": false,
129
+ "rstrip": false,
130
+ "single_word": false,
131
+ "special": false
132
+ },
133
+ "151659": {
134
+ "content": "<|fim_prefix|>",
135
+ "lstrip": false,
136
+ "normalized": false,
137
+ "rstrip": false,
138
+ "single_word": false,
139
+ "special": false
140
+ },
141
+ "151660": {
142
+ "content": "<|fim_middle|>",
143
+ "lstrip": false,
144
+ "normalized": false,
145
+ "rstrip": false,
146
+ "single_word": false,
147
+ "special": false
148
+ },
149
+ "151661": {
150
+ "content": "<|fim_suffix|>",
151
+ "lstrip": false,
152
+ "normalized": false,
153
+ "rstrip": false,
154
+ "single_word": false,
155
+ "special": false
156
+ },
157
+ "151662": {
158
+ "content": "<|fim_pad|>",
159
+ "lstrip": false,
160
+ "normalized": false,
161
+ "rstrip": false,
162
+ "single_word": false,
163
+ "special": false
164
+ },
165
+ "151663": {
166
+ "content": "<|repo_name|>",
167
+ "lstrip": false,
168
+ "normalized": false,
169
+ "rstrip": false,
170
+ "single_word": false,
171
+ "special": false
172
+ },
173
+ "151664": {
174
+ "content": "<|file_sep|>",
175
+ "lstrip": false,
176
+ "normalized": false,
177
+ "rstrip": false,
178
+ "single_word": false,
179
+ "special": false
180
+ }
181
+ },
182
+ "additional_special_tokens": [
183
+ "<|im_start|>",
184
+ "<|im_end|>",
185
+ "<|object_ref_start|>",
186
+ "<|object_ref_end|>",
187
+ "<|box_start|>",
188
+ "<|box_end|>",
189
+ "<|quad_start|>",
190
+ "<|quad_end|>",
191
+ "<|vision_start|>",
192
+ "<|vision_end|>",
193
+ "<|vision_pad|>",
194
+ "<|image_pad|>",
195
+ "<|video_pad|>"
196
+ ],
197
+ "bos_token": null,
198
+ "chat_template": "{%- if tools %}\n {{- '<|im_start|>system\\n' }}\n {%- if messages[0]['role'] == 'system' %}\n {{- messages[0]['content'] }}\n {%- else %}\n {{- 'You are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.' }}\n {%- endif %}\n {{- \"\\n\\n# Tools\\n\\nYou may call one or more functions to assist with the user query.\\n\\nYou are provided with function signatures within <tools></tools> XML tags:\\n<tools>\" }}\n {%- for tool in tools %}\n {{- \"\\n\" }}\n {{- tool | tojson }}\n {%- endfor %}\n {{- \"\\n</tools>\\n\\nFor each function call, return a json object with function name and arguments within <tool_call></tool_call> XML tags:\\n<tool_call>\\n{\\\"name\\\": <function-name>, \\\"arguments\\\": <args-json-object>}\\n</tool_call><|im_end|>\\n\" }}\n{%- else %}\n {%- if messages[0]['role'] == 'system' %}\n {{- '<|im_start|>system\\n' + messages[0]['content'] + '<|im_end|>\\n' }}\n {%- else %}\n {{- '<|im_start|>system\\nYou are a helpful assistant. To answer the user\\'s question, you first think about the reasoning process and then provide the user with the answer. 
The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think> <answer> answer here </answer>.<|im_end|>\\n' }}\n {%- endif %}\n{%- endif %}\n{%- for message in messages %}\n {%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}\n {{- '<|im_start|>' + message.role + '\\n' + message.content + '<|im_end|>' + '\\n' }}\n {%- elif message.role == \"assistant\" %}\n {{- '<|im_start|>' + message.role }}\n {%- if message.content %}\n {{- '\\n' + message.content }}\n {%- endif %}\n {%- for tool_call in message.tool_calls %}\n {%- if tool_call.function is defined %}\n {%- set tool_call = tool_call.function %}\n {%- endif %}\n {{- '\\n<tool_call>\\n{\"name\": \"' }}\n {{- tool_call.name }}\n {{- '\", \"arguments\": ' }}\n {{- tool_call.arguments | tojson }}\n {{- '}\\n</tool_call>' }}\n {%- endfor %}\n {{- '<|im_end|>\\n' }}\n {%- elif message.role == \"tool\" %}\n {%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}\n {{- '<|im_start|>user' }}\n {%- endif %}\n {{- '\\n<tool_response>\\n' }}\n {{- message.content }}\n {{- '\\n</tool_response>' }}\n {%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}\n {{- '<|im_end|>\\n' }}\n {%- endif %}\n {%- endif %}\n{%- endfor %}\n{%- if add_generation_prompt %}\n {{- '<|im_start|>assistant\\n' }}\n{%- endif %}\n",
199
+ "clean_up_tokenization_spaces": false,
200
+ "eos_token": "<|im_end|>",
201
+ "errors": "replace",
202
+ "model_max_length": 131072,
203
+ "pad_token": "<|endoftext|>",
204
+ "padding_side": "right",
205
+ "split_special_tokens": false,
206
+ "tokenizer_class": "Qwen2Tokenizer",
207
+ "unk_token": null
208
+ }
tools.json ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "name": "Project Management Tool",
3
+ "description": "A tool for managing project context, automating tasks, and facilitating human-in-the-loop interactions.",
4
+ "version": "1.0.0",
5
+ "author": "Your Name",
6
+ "license": "MIT",
7
+ "configurations": {
8
+ "projectDirectory": "/path/to/your/project",
9
+ "logFile": "/path/to/your/logs/actions.log",
10
+ "memoryFile": "/path/to/your/memory.txt",
11
+ "requestsDirectory": "/path/to/your/requests",
12
+ "responsesDirectory": "/path/to/your/responses",
13
+ "dataDirectory": "/path/to/your/data",
14
+ "statusDirectory": "/path/to/your/status"
15
+ },
16
+ "apiKeys": {
17
+ "togetherAPI": "your_together_api_key",
18
+ "cohereAPI": "your_cohere_api_key",
19
+ "geminiAPI": "your_gemini_api_key"
20
+ },
21
+ "features": {
22
+ "automaticContextGathering": {
23
+ "description": "Automatically reads related files, explores project structure, analyzes patterns, and maps dependencies.",
24
+ "enabled": true
25
+ },
26
+ "humanInTheLoop": {
27
+ "description": "Facilitates human input for reviewing and approving tasks.",
28
+ "enabled": true
29
+ },
30
+ "subgraphSupport": {
31
+ "description": "Encapsulates tasks as reusable nodes for better project management.",
32
+ "enabled": true
33
+ },
34
+ "memoryManagement": {
35
+ "description": "Persists state across agent interactions using a key-value store.",
36
+ "enabled": true
37
+ },
38
+ "apiIntegrations": {
39
+ "description": "Integrates with Together, Cohere, and Gemini APIs for text generation.",
40
+ "enabled": true
41
+ },
42
+ "fileOperations": {
43
+ "description": "Creates, edits, and validates files with error checking.",
44
+ "enabled": true
45
+ }
46
+ },
47
+ "agents": [
48
+ {
49
+ "id": 1,
50
+ "name": "Data Collection Agent",
51
+ "description": "Collects and saves data for the project.",
52
+ "tasks": ["collect_data"]
53
+ },
54
+ {
55
+ "id": 2,
56
+ "name": "Data Preprocessing Agent",
57
+ "description": "Preprocesses the collected data.",
58
+ "tasks": ["preprocess_data"]
59
+ },
60
+ {
61
+ "id": 3,
62
+ "name": "Model Training Agent",
63
+ "description": "Trains the model using the preprocessed data.",
64
+ "tasks": ["train_model"]
65
+ },
66
+ {
67
+ "id": 4,
68
+ "name": "Model Evaluation Agent",
69
+ "description": "Evaluates the trained model.",
70
+ "tasks": ["evaluate_model"]
71
+ }
72
+ ],
73
+ "tasks": {
74
+ "collect_data": {
75
+ "description": "Generates and saves Bengali text via API.",
76
+ "script": "collect_data.sh"
77
+ },
78
+ "preprocess_data": {
79
+ "description": "Analyzes and preprocesses the collected data.",
80
+ "script": "preprocess_data.sh"
81
+ },
82
+ "train_model": {
83
+ "description": "Trains the model using the preprocessed data.",
84
+ "script": "train_model.sh"
85
+ },
86
+ "evaluate_model": {
87
+ "description": "Evaluates the trained model.",
88
+ "script": "evaluate_model.sh"
89
+ }
90
+ },
91
+ "scripts": {
92
+ "collect_data.sh": "path/to/collect_data.sh",
93
+ "preprocess_data.sh": "path/to/preprocess_data.sh",
94
+ "train_model.sh": "path/to/train_model.sh",
95
+ "evaluate_model.sh": "path/to/evaluate_model.sh"
96
+ }
97
+ }
train.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import time
3
+
4
def train(epochs, delay=2.0):
    """Simulate a training run, printing per-epoch progress and accuracy.

    Args:
        epochs: Number of epochs to simulate.
        delay: Seconds to pause per epoch. Defaults to 2.0 to preserve the
            original behavior; pass 0 to run instantly (e.g. in tests).
    """
    for epoch in range(epochs):
        print(f"Epoch {epoch + 1}/{epochs} training...")
        time.sleep(delay)
        # Simulated accuracy grows linearly by 0.01 per epoch from 0.80.
        print(f"Epoch {epoch + 1} complete, accuracy: {0.8 + epoch * 0.01:.2f}")
9
+
10
if __name__ == "__main__":
    # CLI entry point: run the simulated training for --epochs epochs.
    cli = argparse.ArgumentParser()
    cli.add_argument("--epochs", type=int, default=1)
    opts = cli.parse_args()
    train(opts.epochs)