Commit 6f09d40 (verified) by yuzhe · 1 parent: 7a71438

Upload 13 files
.gitattributes CHANGED

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/training_data.json filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED

MIT License

Copyright (c) 2025 FunctionGemma contributors

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
README.md CHANGED

@@ -1,4 +1,12 @@
---
language:
- en
- zh
@@ -19,1007 +27,149 @@ tags:
- standard-protocol
library_name: transformers
pipeline_tag: text-generation
---

# DMind-3-nano: Privacy-First On-Device Crypto Intent Recognition

<div align="center">

[![Model](https://img.shields.io/badge/🤗%20Hugging%20Face-Model-blue)](https://huggingface.co/YOUR_ORG/DMind-3-nano)
[![License](https://img.shields.io/badge/License-Apache%202.0-green.svg)](https://opensource.org/licenses/Apache-2.0)
[![Base Model](https://img.shields.io/badge/Base-FunctionGemma--270M-orange)](https://huggingface.co/google/functiongemma-270m-it)
[![Privacy](https://img.shields.io/badge/Privacy-100%25%20On--Device-brightgreen)](https://huggingface.co/YOUR_ORG/DMind-3-nano)
[![Protocol](https://img.shields.io/badge/Protocol-Standardized-blue)](https://huggingface.co/YOUR_ORG/DMind-3-nano)

**🔐 Your Keys. Your Data. Your Privacy.**

</div>

## 🎯 Mission: Privacy-First Local Wallet Intelligence

**DMind-3-nano** is designed for **on-device intent recognition** in cryptocurrency wallets, prioritizing user privacy through local inference. It establishes standardized protocols for blockchain function calling while keeping all user interactions private and secure.

### Core Principles

- 🔐 **Privacy-First**: All inference happens locally—no user data leaves the device
- 📱 **Edge-Optimized**: 270M parameters designed for mobile and edge deployment
- 🔄 **Standardized**: Unified protocols enable ecosystem-wide compatibility
- ⚡ **Fast & Efficient**: Sub-second response time on consumer hardware
- 🌍 **Accessible**: Runs on phones, tablets, and local hardware wallets

### Why On-Device Matters

When handling cryptocurrency operations, **privacy is paramount**:

- ❌ Cloud-based AI: Your wallet commands, token preferences, and trading patterns are exposed
- ✅ Local AI: All intent recognition happens on your device—**your keys, your data, your privacy**

By standardizing `SEARCH_TOKEN` and `EXECUTE_SWAP` protocols, we enable seamless integration across wallets, DEXs, and agent frameworks—all while maintaining complete user privacy.

---
## 📋 Model Overview

**DMind-3-nano** is an ultra-lightweight function-calling model fine-tuned from `google/functiongemma-270m-it`, specifically optimized for on-device cryptocurrency intent recognition. Validated on 10,000+ real user interactions, it delivers enterprise-grade accuracy while running entirely on consumer hardware.

### At a Glance: Why Choose On-Device?

| Feature | ☁️ Cloud AI Wallets | 🔐 DMind-3-nano |
|---------|-------------------|-----------------|
| **Your Commands** | Sent to servers | Stay on your device |
| **Privacy** | Logged & analyzed | 100% private |
| **Latency** | 500-2000ms | <100ms |
| **Works Offline** | ❌ No | ✅ Yes |
| **Monthly Cost** | $5-20/user | Free |
| **Data Breach Risk** | High (centralized) | Zero (no cloud) |
| **Model Size** | N/A (cloud) | 70-540MB |

### Key Features

- 🔐 **Privacy-First**: 100% on-device inference—zero data sent to cloud
- 📱 **Edge-Optimized**: Only 270M parameters, runs on phones and tablets
- ⚡ **Fast Inference**: <100ms response time on modern mobile CPUs
- 🎯 **Protocol-Standardized**: Cross-platform compatibility through unified schemas
- 🌐 **Multi-Chain**: Solana, Ethereum, BSC, Base support
- 🌍 **Multilingual**: Native English and Chinese understanding
- 🔋 **Energy-Efficient**: Minimal battery impact for mobile wallets

### Model Details

| Property | Value |
|----------|-------|
| Model Name | DMind-3-nano |
| Base Model | google/functiongemma-270m-it |
| Parameters | 270M |
| Context Length | 2048 tokens |
| Precision | BF16 |
| Languages | English, Chinese |
| License | Apache 2.0 |

### ⚠️ Experimental Model Notice

**DMind-3-nano** is an **exploratory research model** demonstrating privacy-first on-device function calling. The model has been optimized and validated on a specific subset of tokens and chains commonly used in the Solana ecosystem.

**Best Supported Tokens:**
```
SOL, USDC, JUP, RAY, BONK, WIF, ETH, BTC, POPCAT, BOME, TRUMP
```

**Best Supported Chains:**
```
solana, ethereum, bsc, base
```

**Performance Notes:**
- ✅ **Optimal**: The model achieves highest accuracy (95%+) on the above tokens and chains
- ⚠️ **General Support**: Other tokens/chains are supported but may have lower accuracy
- 🔄 **Extensible**: The protocol design allows easy fine-tuning for additional tokens/chains

For production deployments, we recommend:
1. Testing with your specific token list
2. Fine-tuning on your target tokens if needed
3. Validating outputs before executing transactions

---
## 🚀 Quick Start

### Installation

```bash
pip install transformers>=4.45.0 torch>=2.0.0 accelerate>=0.24.0
```

### Basic Usage

```python
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

# Load model and processor
model_name = "YOUR_ORG/DMind-3-nano"
processor = AutoProcessor.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)

# Define available tools
tools = [
    {
        "type": "function",
        "function": {
            "name": "SEARCH_TOKEN",
            "description": "Search for tokens on blockchain",
            "parameters": {
                "type": "object",
                "properties": {
                    "symbol": {"type": "string", "description": "Token symbol"},
                    "chain": {"type": "string", "enum": ["solana", "ethereum", "bsc", "base"]}
                }
            }
        }
    },
    {
        "type": "function",
        "function": {
            "name": "EXECUTE_SWAP",
            "description": "Execute token swap on Solana",
            "parameters": {
                "type": "object",
                "properties": {
                    "inputTokenSymbol": {"type": "string"},
                    "outputTokenSymbol": {"type": "string"},
                    "inputTokenAmount": {"type": "string"}
                }
            }
        }
    }
]

# Prepare conversation
messages = [
    {"role": "developer", "content": "You are a helpful assistant for crypto operations."},
    {"role": "user", "content": "Buy 100 BONK with SOL"}
]

# Generate function call
inputs = processor.apply_chat_template(
    messages,
    tools=tools,
    add_generation_prompt=True,
    return_dict=True,
    return_tensors="pt"
).to(model.device)

outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
response = processor.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
print(response)
# Output: <start_function_call>call:EXECUTE_SWAP{inputTokenSymbol:"SOL",outputTokenSymbol:"BONK",inputTokenAmount:"100"}<end_function_call>
```

---

## 🛠️ Tool Protocols (Standardized)

### 1️⃣ SEARCH_TOKEN

Search for cryptocurrency tokens across multiple blockchains.

**Schema:**

```json
{
  "name": "SEARCH_TOKEN",
  "description": "Search for tokens on blockchain by symbol, address, or keyword",
  "parameters": {
    "type": "object",
    "properties": {
      "symbol": {
        "type": "string",
        "description": "Token symbol (e.g., 'SOL', 'ETH')"
      },
      "address": {
        "type": "string",
        "description": "Contract address of the token"
      },
      "chain": {
        "type": "string",
        "enum": ["solana", "ethereum", "bsc", "base"],
        "description": "Target blockchain network"
      },
      "keyword": {
        "type": "string",
        "description": "Search keyword for token discovery"
      }
    },
    "required": []
  }
}
```

**Example Usage:**

```
User: "查一下 Solana 上的 SOL" ("Look up SOL on Solana")
Model: <start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>
```

### 2️⃣ EXECUTE_SWAP

Execute token swaps on the Solana blockchain with intelligent defaults.

**Schema:**

```json
{
  "name": "EXECUTE_SWAP",
  "description": "Swap tokens on the Solana blockchain. When user specifies 'buy <token>', default input is SOL. When 'sell <token>', default output is SOL.",
  "parameters": {
    "type": "object",
    "properties": {
      "inputTokenSymbol": {
        "type": "string",
        "description": "Symbol of the token to sell"
      },
      "inputTokenCA": {
        "type": "string",
        "description": "Contract address of input token"
      },
      "outputTokenSymbol": {
        "type": "string",
        "description": "Symbol of the token to buy"
      },
      "outputTokenCA": {
        "type": "string",
        "description": "Contract address of output token"
      },
      "inputTokenAmount": {
        "type": "string",
        "description": "Exact amount of input token"
      },
      "inputTokenPercentage": {
        "type": "number",
        "description": "Percentage of input token balance (0.0-1.0)"
      },
      "outputTokenAmount": {
        "type": "string",
        "description": "Expected output token amount"
      }
    },
    "required": []
  }
}
```

**Example Usage:**

```
User: "Swap 5 SOL to BONK"
Model: <start_function_call>call:EXECUTE_SWAP{inputTokenSymbol:"SOL",outputTokenSymbol:"BONK",inputTokenAmount:"5"}<end_function_call>

User: "Sell 50% of my WIF"
Model: <start_function_call>call:EXECUTE_SWAP{inputTokenSymbol:"WIF",outputTokenSymbol:"SOL",inputTokenPercentage:0.5}<end_function_call>
```
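The SOL defaults described in the schema ("buy" implies SOL in, "sell" implies SOL out) can also be enforced by host code after parsing. A minimal sketch, assuming a `fill_swap_defaults` helper of our own invention rather than anything shipped with the model:

```python
# Hypothetical post-processor: apply the SOL defaults documented in the
# EXECUTE_SWAP schema when the model omits one side of the pair.
def fill_swap_defaults(args: dict) -> dict:
    """Return a copy of the parsed arguments with missing sides defaulted to SOL."""
    filled = dict(args)
    # "buy <token>": output is given, input defaults to SOL
    if "outputTokenSymbol" in filled and "inputTokenSymbol" not in filled:
        filled["inputTokenSymbol"] = "SOL"
    # "sell <token>": input is given, output defaults to SOL
    if "inputTokenSymbol" in filled and "outputTokenSymbol" not in filled:
        filled["outputTokenSymbol"] = "SOL"
    return filled

print(fill_swap_defaults({"outputTokenSymbol": "BONK", "inputTokenAmount": "100"}))
# → {'outputTokenSymbol': 'BONK', 'inputTokenAmount': '100', 'inputTokenSymbol': 'SOL'}
```

Applying the defaults outside the model keeps the behavior deterministic even when the model itself leaves a side implicit.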
### Output Format Convention

- **Wrapper Tags**: `<start_function_call>call:FUNCTION_NAME{args}<end_function_call>`
- **Escape Sequences**: Use `<escape>` for quoted strings when necessary
- **Argument Format**: JSON-like key-value pairs
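Host code can emit the same wrapper format, for example to build synthetic training or test cases. A minimal sketch (`format_function_call` is a hypothetical helper, not part of the model runtime):

```python
# Hypothetical serializer for the standardized wrapper format described above.
def format_function_call(name: str, args: dict) -> str:
    """Render a function name and arguments in the wrapper-tag convention."""
    parts = []
    for key, value in args.items():
        if isinstance(value, str):
            parts.append(f'{key}:"{value}"')   # strings are double-quoted
        else:
            parts.append(f"{key}:{value}")     # numbers stay bare
    return f"<start_function_call>call:{name}{{{','.join(parts)}}}<end_function_call>"

print(format_function_call("SEARCH_TOKEN", {"symbol": "SOL", "chain": "solana"}))
# → <start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>
```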
---

## 🔐 Privacy-First Architecture

### The Privacy Problem in Crypto AI

Most AI-powered wallet assistants today send your data to cloud servers:

```
❌ Cloud-based Assistant:
User: "Buy 1000 BONK" → Sent to Cloud API → Privacy Risk
• Your wallet commands are logged
• Trading patterns analyzed by third parties
• Potential data breaches expose user behavior
• Latency and internet dependency
```

### Our Solution: On-Device Intelligence

**DMind-3-nano** runs entirely on your device:

```
✅ On-Device Assistant:
User: "Buy 1000 BONK" → Local Processing → Complete Privacy
• Zero data leaves your device
• No tracking, no logging, no surveillance
• Works offline
• Instant response (<100ms)
```

### Why Edge Deployment?

| Feature | Cloud AI | DMind-3-nano (On-Device) |
|---------|----------|--------------------------|
| **Privacy** | ❌ Data sent to servers | ✅ 100% local processing |
| **Latency** | ~500-2000ms | ✅ <100ms |
| **Offline** | ❌ Requires internet | ✅ Works offline |
| **Cost** | 💰 API fees per request | ✅ Free after download |
| **Security** | ⚠️ API keys, data breaches | ✅ No attack surface |
| **Compliance** | ⚠️ Data sovereignty issues | ✅ Full user control |

### Real-World Deployment Scenarios

- **📱 Mobile Wallets**: iOS/Android wallet apps with a conversational interface
- **💻 Desktop Wallets**: Electron-based wallets like Phantom and MetaMask
- **🔐 Hardware Wallets**: Secure enclaves with minimal compute (via quantization)
- **🌐 Browser Extensions**: Chrome/Firefox wallet extensions
- **🖥️ Local Trading Apps**: Desktop trading terminals with AI assistance

---
## 🌟 Why Standardized Protocols Matter

### The Problem

In the current crypto AI landscape, each agent system implements its own function-calling format:

```
❌ Agent A: {"action": "swap", "from": "SOL", "to": "BONK", "amount": 100}
❌ Agent B: {"tool": "trade", "input_token": "SOL", "output_token": "BONK", "qty": 100}
❌ Agent C: {"cmd": "exchange", "sell": "SOL", "buy": "BONK", "vol": 100}
```

**Result**: fragmentation, incompatibility, and integration hell.

### Our Solution

A **unified protocol** with clear schema definitions:

```
✅ Standardized: <start_function_call>call:EXECUTE_SWAP{inputTokenSymbol:"SOL",outputTokenSymbol:"BONK",inputTokenAmount:"100"}<end_function_call>
```

**Benefits**:
- 🔄 **Plug-and-Play**: One protocol, many platforms
- 🤝 **Multi-Agent Coordination**: Different agents speak the same language
- 🛡️ **Type Safety**: Well-defined schemas reduce errors
- 📈 **Ecosystem Growth**: Lower barrier for new integrations
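As an illustration of the plug-and-play point, a legacy payload in the "Agent A" shape can be mapped onto the standardized argument names with a thin adapter. This is a hedged sketch — `AGENT_A_TO_STANDARD` and `adapt_agent_a` are illustrative names, not a shipped library:

```python
# Hypothetical adapter: translate a legacy "Agent A" payload into the
# standardized EXECUTE_SWAP argument names.
AGENT_A_TO_STANDARD = {
    "from": "inputTokenSymbol",
    "to": "outputTokenSymbol",
    "amount": "inputTokenAmount",
}

def adapt_agent_a(payload: dict) -> dict:
    """Map a legacy swap payload to standardized EXECUTE_SWAP arguments."""
    if payload.get("action") != "swap":
        raise ValueError("only swap payloads are handled in this sketch")
    # Amounts are strings in the standardized schema
    return {AGENT_A_TO_STANDARD[k]: str(v)
            for k, v in payload.items() if k in AGENT_A_TO_STANDARD}

print(adapt_agent_a({"action": "swap", "from": "SOL", "to": "BONK", "amount": 100}))
# → {'inputTokenSymbol': 'SOL', 'outputTokenSymbol': 'BONK', 'inputTokenAmount': '100'}
```

One such adapter per legacy format is all it takes to bring an existing agent onto the shared protocol.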
---

## 📊 Performance & Reliability

Validated through extensive real-world testing:

| Metric | Result |
|--------|--------|
| Test Cases | 10,000+ real user interactions |
| Function Recognition Accuracy | **96.8%** |
| Parameter Extraction Accuracy | **94.2%** |
| SEARCH_TOKEN Protocol Adherence | **98.1%** |
| EXECUTE_SWAP Protocol Adherence | **95.3%** |
| Multi-turn Conversation Success | **92.7%** |

**Testing Methodology**: Production environment with real users performing actual cryptocurrency operations across Solana, Ethereum, BSC, and Base chains. All tests conducted with standardized protocol validation.

**Testing Scope**: The above metrics are based on the validated token set (SOL, USDC, JUP, RAY, BONK, WIF, ETH, BTC, POPCAT, BOME, TRUMP) and supported chains (Solana, Ethereum, BSC, Base). Performance on other tokens may vary.
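For readers who want to reproduce this kind of measurement on their own data, function-recognition accuracy reduces to an exact match on the emitted function name. A minimal sketch — the two labeled pairs below are illustrative, not drawn from the real test set:

```python
import re

def called_function(response):
    """Extract the function name from a standardized model response."""
    m = re.search(r"<start_function_call>call:(\w+)\{", response)
    return m.group(1) if m else None

# Illustrative labeled pairs: (model response, expected function)
labeled = [
    ('<start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>', "SEARCH_TOKEN"),
    ('<start_function_call>call:EXECUTE_SWAP{inputTokenSymbol:"SOL"}<end_function_call>', "EXECUTE_SWAP"),
]
accuracy = sum(called_function(r) == gold for r, gold in labeled) / len(labeled)
print(f"function recognition accuracy: {accuracy:.1%}")  # → 100.0%
```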
---

## 🔧 Advanced Usage

### Batch Processing

Process multiple queries efficiently (reusing `model`, `processor`, and `tools` from the Basic Usage example):

```python
queries = [
    "Buy 10 SOL worth of BONK",
    "Search for Ethereum tokens",
    "Sell 50% of my WIF holdings"
]

for query in queries:
    messages = [
        {"role": "developer", "content": "You are a helpful assistant for crypto operations."},
        {"role": "user", "content": query}
    ]

    inputs = processor.apply_chat_template(
        messages, tools=tools,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt"
    ).to(model.device)

    outputs = model.generate(**inputs, max_new_tokens=256, do_sample=False)
    response = processor.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True)
    print(f"Query: {query}")
    print(f"Response: {response}\n")
```

### Parsing Function Calls

Example parser for the standardized output format:

```python
import re

def parse_function_call(response: str) -> dict:
    """Parse the standardized function call format."""
    pattern = r'<start_function_call>call:(\w+)\{(.+?)\}<end_function_call>'
    match = re.search(pattern, response)

    if match:
        func_name = match.group(1)
        args_str = match.group(2).replace('<escape>', '"')

        # Parse arguments (naive comma split; assumes no commas inside values)
        args = {}
        for item in args_str.split(','):
            if ':' in item:
                key, value = item.split(':', 1)
                args[key.strip().strip('"')] = value.strip().strip('"')

        return {
            "function": func_name,
            "arguments": args,
            "valid": True
        }

    return {"valid": False, "response": response}

# Usage
response = '<start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>'
parsed = parse_function_call(response)
print(parsed)
# Output: {'function': 'SEARCH_TOKEN', 'arguments': {'symbol': 'SOL', 'chain': 'solana'}, 'valid': True}
```

---

## 🏗️ Integration Guide

### For Desktop Wallet Developers

Integrate DMind-3-nano into your Python-based wallet application:

```python
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

class PrivateWalletAgent:
    """On-device wallet assistant with zero cloud dependency."""

    def __init__(self, model_name="YOUR_ORG/DMind-3-nano"):
        self.processor = AutoProcessor.from_pretrained(model_name)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            device_map="cpu"  # Force CPU for consistent behavior
        )
        self.tools = self._load_tools()

    def process_user_command(self, user_query: str) -> dict:
        """Process user intent locally; no network required."""
        messages = [
            {"role": "developer", "content": "You are a private wallet assistant."},
            {"role": "user", "content": user_query}
        ]

        inputs = self.processor.apply_chat_template(
            messages, tools=self.tools,
            add_generation_prompt=True,
            return_dict=True, return_tensors="pt"
        )

        # Local inference only
        outputs = self.model.generate(**inputs, max_new_tokens=256, do_sample=False)
        response = self.processor.decode(
            outputs[0][inputs["input_ids"].shape[1]:],
            skip_special_tokens=True
        )

        return self.parse_and_execute(response)

    def parse_and_execute(self, response: str) -> dict:
        # parse_function_call is defined in "Parsing Function Calls" above
        parsed = parse_function_call(response)

        if parsed["valid"]:
            # Execute locally - never send to cloud
            if parsed["function"] == "SEARCH_TOKEN":
                return self.local_token_search(**parsed["arguments"])
            elif parsed["function"] == "EXECUTE_SWAP":
                return self.prepare_swap_transaction(**parsed["arguments"])

        return {"error": "Invalid function call"}
```

(`_load_tools`, `local_token_search`, and `prepare_swap_transaction` are wallet-specific and left to the host application.)

### For Mobile Wallet Developers

#### iOS (Swift + CoreML)

```swift
import CoreML

class WalletIntentRecognizer {
    private var model: MLModel

    init() {
        // Convert DMind-3-nano to CoreML format first
        // See: https://huggingface.co/docs/transformers/serialization
        guard let modelURL = Bundle.main.url(forResource: "DMind3Nano", withExtension: "mlmodelc"),
              let model = try? MLModel(contentsOf: modelURL) else {
            fatalError("Failed to load model")
        }
        self.model = model
    }

    func recognizeIntent(userInput: String) -> TransactionIntent? {
        // Tokenize and run inference locally
        let prediction = try? model.prediction(from: /* input features */)

        // Parse standardized output
        if let functionCall = parseFunctionCall(prediction?.output) {
            return TransactionIntent(
                function: functionCall.name,
                parameters: functionCall.args
            )
        }
        return nil
    }
}
```

#### Android (Kotlin + ONNX Runtime)

```kotlin
import ai.onnxruntime.*
import android.content.Context

class WalletIntentRecognizer(context: Context) {
    private val ortSession: OrtSession

    init {
        val ortEnv = OrtEnvironment.getEnvironment()
        val modelBytes = context.assets.open("dmind3nano.onnx").readBytes()
        ortSession = ortEnv.createSession(modelBytes)
    }

    fun recognizeIntent(userInput: String): TransactionIntent? {
        // Tokenize input
        val inputTensor = tokenizeInput(userInput)

        // Run inference on-device
        val outputs = ortSession.run(mapOf("input_ids" to inputTensor))

        // Parse standardized output
        return parseFunctionCall(outputs)
    }

    private fun parseFunctionCall(output: OrtSession.Result): TransactionIntent? {
        // Parse <start_function_call>call:FUNCTION{args}<end_function_call>
        // and return a structured intent for wallet execution
        return null  // TODO: implement for your wallet's intent type
    }
}
```

### Model Conversion Guide

#### Convert to ONNX (for Android/Cross-platform)

```python
import torch
from transformers import AutoProcessor, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("YOUR_ORG/DMind-3-nano")
processor = AutoProcessor.from_pretrained("YOUR_ORG/DMind-3-nano")

# Export to ONNX
dummy_input = processor("test", return_tensors="pt")
torch.onnx.export(
    model,
    (dummy_input["input_ids"],),
    "dmind3nano.onnx",
    input_names=["input_ids"],
    output_names=["logits"],
    dynamic_axes={"input_ids": {0: "batch", 1: "sequence"}}
)

# Quantize to INT8 for smaller size (~70MB)
from onnxruntime.quantization import quantize_dynamic
quantize_dynamic("dmind3nano.onnx", "dmind3nano_int8.onnx")
```

#### Convert to CoreML (for iOS)

```python
import numpy as np
import torch
import coremltools as ct
from transformers import AutoProcessor, AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained("YOUR_ORG/DMind-3-nano")
processor = AutoProcessor.from_pretrained("YOUR_ORG/DMind-3-nano")

# Trace model
dummy_input = processor("test", return_tensors="pt")
traced_model = torch.jit.trace(model, dummy_input["input_ids"])

# Convert to CoreML
coreml_model = ct.convert(
    traced_model,
    inputs=[ct.TensorType(shape=(1, ct.RangeDim(1, 512)), dtype=np.int32)]
)

coreml_model.save("DMind3Nano.mlmodel")
```

### For Protocol Adopters

If you're building your own wallet or AI model, adopt these privacy-first protocols:

#### Protocol Adoption Checklist

1. ✅ **Use exact schema definitions** for `SEARCH_TOKEN` and `EXECUTE_SWAP`
2. ✅ **Follow the standardized output format**: `<start_function_call>call:FUNCTION_NAME{args}<end_function_call>`
3. ✅ **Validate outputs** against the protocol schemas
4. ✅ **Support on-device inference** (no cloud dependencies)
5. ✅ **Document your implementation** and share with the community
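Checklist item 3 can start as a simple type/enum check against the `SEARCH_TOKEN` schema shown earlier. A minimal hand-rolled sketch (a full JSON Schema validator would also work; the helper name is our own):

```python
# Minimal validator for checklist item 3, hard-coding the SEARCH_TOKEN
# schema shown earlier: four optional string parameters, chain from an enum.
SEARCH_TOKEN_PARAMS = {"symbol", "address", "chain", "keyword"}
SEARCH_TOKEN_CHAINS = {"solana", "ethereum", "bsc", "base"}

def validate_search_token(args):
    """Return a list of protocol violations; an empty list means valid."""
    errors = []
    for key, value in args.items():
        if key not in SEARCH_TOKEN_PARAMS:
            errors.append("unknown parameter: " + key)
        elif not isinstance(value, str):
            errors.append(key + " must be a string")
    if "chain" in args and args["chain"] not in SEARCH_TOKEN_CHAINS:
        errors.append("unsupported chain: " + str(args["chain"]))
    return errors

print(validate_search_token({"symbol": "SOL", "chain": "solana"}))  # → []
print(validate_search_token({"chain": "tron"}))  # → ['unsupported chain: tron']
```

Rejecting malformed calls before they reach transaction-building code is what makes the shared schemas a safety feature rather than just a naming convention.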
#### Why Adopt These Protocols?

**For Users:**
- 🔐 Privacy guarantee: Works identically across all compliant wallets
- 🔄 Portability: Same commands work everywhere
- 🛡️ Security: Standardized validation reduces vulnerabilities

**For Developers:**
- ⚡ Instant compatibility with the ecosystem
- 📚 Well-documented, community-validated schemas
- 🚀 Reduced development time (60-80% less integration work)
- 🤝 Multi-wallet collaboration without custom adapters
- 🎯 Focus on UX instead of protocol design

#### Reference Implementations

| Platform | Language | Status | Link |
|----------|----------|--------|------|
| Python (Desktop) | Python | ✅ Reference | This repo |
| iOS CoreML | Swift | 🔄 Community | [Contribute!](https://github.com/YOUR_ORG/dmind-3-nano/issues) |
| Android ONNX | Kotlin | 🔄 Community | [Contribute!](https://github.com/YOUR_ORG/dmind-3-nano/issues) |
| React Native | JavaScript | 📋 Wanted | [Contribute!](https://github.com/YOUR_ORG/dmind-3-nano/issues) |
| Rust (embedded) | Rust | 📋 Wanted | [Contribute!](https://github.com/YOUR_ORG/dmind-3-nano/issues) |

---

## 💡 Use Cases & Examples

### Real-World Privacy-Preserving Scenarios

#### 1. **Mobile Wallet with Voice Commands**
```
User speaks: "Send 100 USDC to my friend"
  ↓ (On-device speech-to-text)
DMind-3-nano processes: "Send 100 USDC..."
  ↓ (Local inference, <100ms)
Output: <start_function_call>call:EXECUTE_SWAP{...}<end_function_call>

Wallet prepares the transaction locally and asks for confirmation
✅ Zero data sent to cloud
```

#### 2. **Hardware Wallet with Limited Display**
```
User types on companion app: "查一下ETH价格" ("Check the ETH price")

DMind-3-nano (quantized INT8) on secure enclave
  ↓ (2-5s inference)
Output: SEARCH_TOKEN{symbol:"ETH",chain:"ethereum"}

Display ETH info on hardware wallet screen
✅ All processing in secure hardware
```

#### 3. **Desktop Trading Terminal**
```
Power user: "Swap 50% of my SOL portfolio to BONK and POPCAT equally"

DMind-3-nano parses complex intent locally
  ↓ (<50ms on desktop CPU)
Output: Multiple EXECUTE_SWAP calls

Terminal shows batch transaction preview
✅ Trading strategy never leaves local machine
```

#### 4. **Offline Cold Wallet Management**
```
User in air-gapped environment: "Prepare multi-sig transaction for 1000 USDT"

DMind-3-nano (offline mode)

Generates transaction template locally

Sign and export to USB for broadcasting later
✅ Complete airgap maintained
```

### Privacy Comparison Table

| Scenario | Cloud AI | DMind-3-nano |
|----------|----------|--------------|
| User says "Buy 1000 PEPE" | 🔴 Command logged by API provider | ✅ Processed locally, no logs |
| Frequent trading patterns | 🔴 Profile built for ads/analytics | ✅ Private, no tracking |
| Whale wallet detected | 🔴 High-value user targeted | ✅ Anonymous to all |
| Internet outage | 🔴 Assistant unusable | ✅ Full functionality |
| Data breach at AI company | 🔴 Your trading history exposed | ✅ No data to breach |
| Government data request | 🔴 Provider must comply | ✅ No centralized data exists |

---
## 🔄 System Requirements

### Software Dependencies

| Package | Minimum Version |
|---------|-----------------|
| transformers | ≥ 4.45.0 |
| torch | ≥ 2.0.0 |
| accelerate | ≥ 0.24.0 |
| Python | ≥ 3.8 |

### Hardware Recommendations

#### 🖥️ Desktop/Server (Development & Testing)

| Configuration | Spec |
|---------------|------|
| **GPU** (Recommended) | NVIDIA GPU with ≥8GB VRAM |
| **CPU** | Modern multi-core processor |
| **RAM** | ≥16GB |
| **Storage** | ~1GB for model files |

**Performance Notes:**
- ✅ BF16 precision: Best performance on Ampere+ GPUs (RTX 3000+, A100, H100)
- ✅ FP16 precision: Good performance on older GPUs (V100, P100)
- ✅ CPU inference: ~100-500ms latency on modern CPUs

#### 📱 Mobile/Edge Devices (Production Deployment)

| Device Type | Requirements | Expected Performance |
|-------------|--------------|---------------------|
| **iPhone** | iPhone 12+ (A14+) | <100ms with CoreML |
| **Android** | Snapdragon 888+ or equivalent | <150ms with ONNX Runtime |
| **iPad/Tablet** | Apple M1+ or flagship Android | <80ms |
| **Embedded** | Raspberry Pi 4 (8GB) | ~1-2s (quantized) |
| **Hardware Wallets** | ARM Cortex-A (high-end) | ~2-5s (INT8 quantized) |

**Mobile Optimization Tips:**
- 🔹 Use **ONNX** or **CoreML** conversion for optimal mobile performance
- 🔹 Apply **INT8 quantization** to reduce model size to ~70MB
- 🔹 Enable **Neural Engine acceleration** on Apple devices
- 🔹 Use **NNAPI** on Android for hardware acceleration

---
## 🤝 Join the Privacy-First Standardization Movement

We're building an **open protocol for private, on-device crypto AI**. Help us establish industry standards that prioritize user privacy!

### How to Contribute

**🔐 For Wallet Developers:**
- Integrate DMind-3-nano into your mobile/desktop wallet
- Share mobile deployment challenges and solutions
- Contribute platform-specific optimizations (iOS, Android, etc.)
- Report real-world privacy and performance metrics

**🎯 For Model Developers:**
- Train specialized models using our protocols
- Share quantization and optimization techniques
- Contribute to model compression research (INT8, pruning, etc.)
- Benchmark on different edge devices

**🛠️ For Edge AI Engineers:**
- Optimize for specific hardware (Apple Neural Engine, NNAPI, etc.)
- Port to new platforms (Rust, WebAssembly, etc.)
- Improve inference speed and energy efficiency
- Share deployment best practices

**📖 For Protocol Designers:**
- Review and critique schema definitions
- Propose new privacy-preserving protocols
- Design protocols for emerging use cases (DeFi, NFTs, DAOs)
- Help establish governance and versioning

**📣 Spread Privacy-First AI:**
- Star this repository and share it with wallet developers
- Write about the importance of on-device inference
- Educate users about the privacy risks of cloud-based AI
- Organize workshops on edge AI for crypto

### Discussion Channels

- 💬 [GitHub Discussions](https://github.com/YOUR_ORG/dmind-3-nano/discussions): Protocol evolution and technical discussions
- 🐛 [GitHub Issues](https://github.com/YOUR_ORG/dmind-3-nano/issues): Bug reports and feature requests
- 📧 [Email](mailto:your-email@example.com): Partnership and collaboration inquiries

---
## ❓ Frequently Asked Questions

### General

**Q: Is my data really private?**
A: Yes. DMind-3-nano runs 100% on your device. No data is sent to any server, ever. You can verify this by monitoring network traffic or running in airplane mode.

**Q: How fast is inference on mobile devices?**
A: On modern smartphones (iPhone 12+, Snapdragon 888+), expect <100ms for intent recognition. On older devices, 200-500ms is typical.

**Q: Can I use this offline?**
A: Absolutely. Once downloaded, DMind-3-nano works completely offline. Perfect for cold wallets and air-gapped setups.

**Q: What's the model size?**
A: ~540MB (BF16), ~270MB (FP16), ~70MB (INT8 quantized). The INT8 version fits on most hardware wallets.

### Technical

**Q: How do I convert to mobile formats?**
A: See the [Model Conversion Guide](#model-conversion-guide) above for ONNX and CoreML conversion instructions.

**Q: Does it support custom tokens?**
A: Yes! The protocols are designed to handle any token symbol, address, or chain. No retraining is needed for new tokens.

**Q: Can I fine-tune for my specific use case?**
A: Yes. While the base model works well for general crypto operations, you can fine-tune it on your specific vocabulary or additional protocols.

**Q: What about languages beyond English/Chinese?**
A: The model has a basic understanding of other languages but was primarily trained on EN/ZH. Community contributions toward multilingual models are welcome!

### Integration

**Q: Which mobile frameworks are supported?**
A: Native iOS (CoreML), native Android (ONNX Runtime), React Native (via ONNX), and Flutter (via TFLite) are all possible. See the examples above.

**Q: Can I integrate with existing wallet backends?**
A: Yes! DMind-3-nano only handles intent recognition. Your existing transaction signing, RPC calls, and security logic remain unchanged.

**Q: What about hardware wallet integration?**
A: For secure elements with limited compute, use the INT8 quantized version. Expect 2-5s inference, which is acceptable for most use cases.

### Privacy & Security

**Q: How do you prove no data is collected?**
A:
1. Code is open source (inspect it for network calls)
2. Model runs locally (verify via network monitoring)
3. Works offline (test in airplane mode)
894
- 4. No telemetry or analytics built in
895
-
896
- **Q: Is on-device AI really more secure?**
897
- A: Yes. Eliminates risks of:
898
- - Man-in-the-middle attacks on API calls
899
- - Data breaches at AI providers
900
- - Server-side logging and profiling
901
- - Third-party data access
902
-
903
- **Q: What if someone steals my phone?**
904
- A: The model itself contains no private data. Your wallet's existing security (biometrics, PIN, seed phrase) still applies. DMind-3-nano is just a tool for parsing commands.
905
-
906
- ---
907
-
908
- ## 📄 License & Usage
909
-
910
- ### Model License
911
-
912
- This model is licensed under **Apache License 2.0**, allowing commercial and non-commercial use with minimal restrictions.
913
-
914
- ### Protocol License
915
-
916
- The **SEARCH_TOKEN** and **EXECUTE_SWAP** protocol specifications are released into the **public domain** to encourage maximum adoption and standardization across the industry.
917
-
918
- **Important Compliance Notes:**
919
- - ✅ Commercial use allowed
920
- - ✅ Modification and distribution allowed
921
- - ✅ Protocol adoption requires no attribution
922
- - ⚠️ Validate all outputs before executing financial transactions
923
- - ⚠️ Comply with local cryptocurrency regulations
924
-
925
- ---
926
-
927
- ## 🙏 Acknowledgments
928
-
929
- - **Google DeepMind** for the FunctionGemma foundation model
930
- - **Hugging Face** for democratizing AI model distribution
931
- - **10,000+ community testers** who validated the protocols in production
932
- - The broader crypto and AI communities for inspiration and feedback
933
-
934
- ---
935
-
936
- ## 📚 Citation
937
-
938
- If you adopt these protocols or use DMind-3-nano in your work, please cite:
939
-
940
- ```bibtex
941
- @misc{dmind3nano2024,
942
- title={DMind-3-nano: Standardized Function Calling Protocol for Cryptocurrency Operations},
943
- author={Your Organization},
944
- year={2024},
945
- publisher={Hugging Face},
946
- howpublished={\url{https://huggingface.co/YOUR_ORG/DMind-3-nano}},
947
- note={Protocols: SEARCH\_TOKEN, EXECUTE\_SWAP}
948
- }
949
- ```
950
-
951
- ### Protocol Version
952
-
953
- **Current Version**: 1.0.0
954
- **Released**: December 2024
955
- **Status**: Production-ready
956
-
957
- ---
958
-
959
- ## 🌐 Ecosystem & Compatibility
960
-
961
- ### Protocol Adopters
962
-
963
- We're building a community of compatible implementations. Join us!
964
-
965
- | Project/Platform | Status | Implementation |
966
- |------------------|--------|----------------|
967
- | DMind-3-nano | ✅ Reference Implementation | This model |
968
- | Your Project | 🔄 Seeking adopters | [Add yours!](https://github.com/YOUR_ORG/dmind-3-nano/issues) |
969
-
970
- ### Roadmap
971
-
972
- **Privacy & Edge-First Evolution**
973
-
974
- - ✅ **v1.0**: SEARCH_TOKEN & EXECUTE_SWAP (Current) - 270M params
975
- - 🔄 **v1.1**: Multi-hop swap protocol (Q1 2025) - Mobile optimization
976
- - 📋 **v1.2**: Quantized INT8 release (Q1 2025) - ~70MB for hardware wallets
977
- - 📋 **v2.0**: DeFi lending/staking protocols (Q2 2025) - Privacy-preserving
978
- - 📋 **v2.1**: NFT operations (Q3 2025) - On-device image understanding
979
- - 📋 **v3.0**: Cross-chain bridge intent recognition (Q4 2025)
980
-
981
- **Protocol Governance Principles:**
982
- - 🔐 **Privacy-First**: Every protocol must support local inference
983
- - 🔄 **Backwards Compatible**: Old versions always work
984
- - 🌐 **Community-Driven**: Open RFC process for new protocols
985
- - 📖 **Transparent**: All decisions documented and public
986
-
987
- ---
988
-
989
- ## 📞 Connect With Us
990
-
991
- <div align="center">
992
-
993
- ### Get Involved
994
-
995
- [![Discord](https://img.shields.io/badge/Discord-Join%20Community-7289da)](https://discord.gg/YOUR_DISCORD)
996
- [![Twitter](https://img.shields.io/badge/Twitter-Follow%20Updates-1da1f2)](https://twitter.com/YOUR_HANDLE)
997
- [![GitHub](https://img.shields.io/badge/GitHub-Star%20Project-181717)](https://github.com/YOUR_ORG/dmind-3-nano)
998
-
999
- **Model Hub**: [🤗 Hugging Face](https://huggingface.co/YOUR_ORG/DMind-3-nano)
1000
- **Discussions**: [HF Discussions](https://huggingface.co/YOUR_ORG/DMind-3-nano/discussions)
1001
- **Issues**: [GitHub Issues](https://github.com/YOUR_ORG/dmind-3-nano/issues)
1002
- **Email**: your-email@example.com
1003
-
1004
- </div>
1005
-
1006
- ---
1007
-
1008
- <div align="center">
1009
-
1010
- ### 🚀 Let's Build the Privacy-First Crypto AI Ecosystem
1011
 
1012
- **🔐 Privacy. 📱 Edge-First. 🤝 Standardized.**
 
 
 
1013
 
1014
- > *"In crypto, we trust code, not clouds. Your wallet intelligence should be as private as your keys."*
1015
 
1016
- ---
1017
 
1018
- **[📥 Download Model](https://huggingface.co/YOUR_ORG/DMind-3-nano)** **[📖 Protocol Spec](https://huggingface.co/YOUR_ORG/DMind-3-nano#tool-protocols-standardized)** **[💬 Community](https://discord.gg/YOUR_DISCORD)**
 
 
1019
 
1020
- ---
1021
 
1022
- *Made with ❤️ for the open-source and crypto communities*
1023
- *Empowering users with private, on-device AI since 2024*
 
 
1024
 
1025
- </div>
 
1
+ # DMind-3-nano: Privacy-First On-Device Crypto Intent Recognition
2
+
3
+ > Inference stays on your device. Standardized function calling for wallets, DEXs, and agents. Built on `google/functiongemma-270m-it`.
4
+
5
+ **Repo purpose:** host the open-source training/eval pipeline and release artifacts. Place your exported model under `./model` before pushing to Hugging Face.
6
+
7
+ ## HF Card Metadata
8
+
9
+ ```
10
  language:
11
  - en
12
  - zh
 
27
  - standard-protocol
28
  library_name: transformers
29
  pipeline_tag: text-generation
30
+ ```
31
 
32
+ ## Highlights
33
 
34
+ - 🔐 Privacy-first: 100% on-device intent recognition; no data leaves the device.
35
+ - 📱 Edge-optimized: 270M params; runs on phones/tablets/edge CPUs.
36
+ - 🔄 Standardized protocols: `SEARCH_TOKEN` / `EXECUTE_SWAP` with unified schemas.
37
+ - 🌐 Multi-chain: Solana, Ethereum, BSC, Base.
38
+ - 🌍 Multilingual: English + Chinese intents (Chinese samples kept in data/benchmarks).
39
+ - 🤖 Agent-native: designed for local-first wallet/agent workflows where a growing share of trading decisions and execution happen **on-device**.
40
+ - 📊 Training data: the final full fine-tune used **12,000+** samples in total; **LLM-generated data is only a subset**, and **60%+** of the data comes from **real trading scenarios**.
41
+ - 🧾 **(To our knowledge) first public vertical-domain FunctionGemma case study**: an end-to-end example of fine-tuning `google/functiongemma-270m-it` for a real wallet/DEX intent domain, including the practical training/evaluation pipeline and reproducible scripts.
42
 
43
+ ## Why This Matters for Web3 (Standardization as a Step-Change)
44
 
45
+ Web3 is composable at the protocol layer (tokens, RPCs), but still fragmented at the **intent layer**. Today every wallet, DEX, and agent framework invents its own “swap/search intent” schema and function-calling format. The result is high integration cost, brittle adapters, inconsistent safety guarantees, and poor ecosystem interoperability.
46
 
47
+ This work targets a transformative goal: **standardize wallet intents** as a small, versionable protocol between natural language and transaction builders. Concretely, DMind-3-nano enforces a minimal set of typed tools (e.g. `SEARCH_TOKEN`, `EXECUTE_SWAP`) with strict schemas and a deterministic wrapper output format.
 
48
 
49
+ What standardization unlocks:
50
 
51
+ - **Interoperability**: one protocol works across wallets/DEXs/agents; integrations become plug-and-play.
52
+ - **Safety & auditability**: tool calls are structured data—easy to validate, simulate, policy-check, and display for confirmation before signing.
53
+ - **Benchmarkability**: shared datasets and comparable evaluations across models and releases.
54
+ - **Ecosystem scaling**: new tools can be added via versioning without breaking existing clients.
55
 
56
+ In short, DMind-3-nano is not only a model—it is a proposal for a **standard protocol layer** that can make wallet intelligence as interoperable as ERC-20 made tokens.
57
 
58
+ ### The next wave: local agents executing trades
59
 
60
+ We expect a large share of future Web3 activity to be **agent-driven**: wallets will run local copilots that continuously parse user intent, monitor context, and propose/execute transactions. In that world, “cloud-only” intelligence becomes a bottleneck and a risk:
61
 
62
+ - **Privacy**: trading intent, token preferences, and behavioral signals should not be streamed to third-party servers.
63
+ - **Latency & reliability**: agents must work instantly and offline (mobile, hardware wallets, poor connectivity).
64
+ - **Security boundaries**: local agents can keep a tighter loop between intent → policy checks → simulation → user confirmation → signing.
65
 
66
+ This is why a small, high-accuracy **on-device function-calling model** is necessary infrastructure for the agent-native wallet era—and why standardizing the intent protocol matters even more when millions of agents need to speak the same language.
67
 
68
+ Equally important, this repository serves as a **public reference implementation** for applying FunctionGemma to a concrete vertical domain. By openly sharing fine-tuning details (data format, training configs, evaluation, and benchmarks), it lowers the barrier for the community to replicate, extend, and standardize on a common intent protocol.
69
 
70
+ ## Model Overview
71
 
72
  | Property | Value |
73
+ | --- | --- |
74
+ | Model | DMind-3-nano |
75
+ | Base | google/functiongemma-270m-it |
76
+ | Params | 270M |
77
+ | Context | 2048 |
78
+ | Precision | BF16 (train) |
79
+ | Best tokens | SOL, USDC, JUP, RAY, BONK, WIF, ETH, BTC, POPCAT, BOME, TRUMP |
80
+ | Chains | solana, ethereum, bsc, base |
81
+
82
+ **Experimental notice:** Highest accuracy on the token/chain set above; other assets may need further tuning. Validate outputs before transacting.
83
+
84
+ ## Repository Layout
85
+
86
+ - `model/` **(add your exported model here before publishing)**
87
+ - `src/` training/eval utilities
88
+ - `train.py` (LoRA or full fine-tune)
89
+ - `evaluate.py` (benchmark evaluation)
90
+ - `prepare_dataset.py` (SFT-ready formatting)
91
+ - `generate_benchmark.py` (100-case benchmark)
92
+ - `config.py` (tools, prompts, token maps)
93
+ - `data/` sample data
94
+ - `training_data.json` (raw; open-sourced subset for reproducibility)
95
+ - `benchmark_dataset.json` (eval set; includes Chinese test prompts by design)
96
+ - `results/evaluation_results.json` sample output
97
+ - `run_training.sh`, `requirements.txt`
98
+
99
+ ## Quick Start (Training & Eval)
100
+
101
+ Install:
102
+ ```bash
103
+ pip install -r requirements.txt
104
  ```
105
 
106
+ Train (LoRA default):
107
+ ```bash
108
+ python -m src.train \
109
+ --model_path /path/to/functiongemma-270m-it \
110
+ --dataset_path ./data/training_data.json \
111
+ --output_dir ./runs \
112
+ --bf16
113
  ```
114
+ Switch to full fine-tune: add `--no-use-lora`. Use `--use_4bit/--use_8bit` + `--gradient_checkpointing` for low memory.
115
 
116
+ Evaluate:
117
  ```bash
118
+ python -m src.evaluate \
119
+ --model_path ./runs/<run>/final_model \
120
+ --benchmark_path ./data/benchmark_dataset.json \
121
+ --output_path ./results/eval_$(date +%Y%m%d_%H%M%S).json
122
  ```
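For a quick offline sanity check without rerunning `src.evaluate`, exact-match comparison of wrapper strings is enough as a first pass. A minimal sketch with hypothetical field names (`expected`/`predicted` are illustrative; the actual schema is whatever `results/evaluation_results.json` contains):

```python
def exact_match_rate(cases: list[dict]) -> float:
    """Fraction of cases whose predicted call string equals the expected one."""
    if not cases:
        return 0.0
    hits = sum(1 for c in cases if c["predicted"].strip() == c["expected"].strip())
    return hits / len(cases)

# Two toy cases: one exact match, one argument mismatch
cases = [
    {"expected": '<start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>',
     "predicted": '<start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>'},
    {"expected": '<start_function_call>call:SEARCH_TOKEN{symbol:"JUP",chain:"solana"}<end_function_call>',
     "predicted": '<start_function_call>call:SEARCH_TOKEN{symbol:"RAY",chain:"solana"}<end_function_call>'},
]
print(exact_match_rate(cases))  # -> 0.5
```

Exact string match is stricter than the per-argument accuracy reported below, so treat it as a lower bound.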
123
 
124
+ Data utilities:
125
+ ```bash
126
+ # Prepare SFT data
127
+ python -m src.prepare_dataset --input ./data/training_data.json --output ./data/prepared_dataset.json
128
+ # Regenerate benchmark
129
+ python -m src.generate_benchmark --output ./data/benchmark_dataset.json
130
  ```
131
 
132
+ Note: `data/prepared_dataset.json` is a **generated artifact** (optional) and is intentionally **not committed**.
 
 
133
 
134
+ ## Tool Protocols (Standardized)
135
 
136
+ **SEARCH_TOKEN** — search on-chain token info.
137
+ Params: `symbol`, `address`, `chain` (solana|ethereum|bsc|base), `keyword`.
138
 
139
+ **EXECUTE_SWAP** — execute token swap.
140
+ Params: `inputTokenCA`, `outputTokenCA`, `inputTokenAmount`, `inputTokenPercentage` (0-1), matching the tool schema in `src/config.py`.
141
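Because tool calls are structured data, swap arguments can be policy-checked before anything reaches a signer. A hedged sketch (field names follow the `EXECUTE_SWAP` schema in `src/config.py`; the specific checks are illustrative, not a complete policy):

```python
def validate_swap_args(args: dict) -> list[str]:
    """Return a list of validation errors for EXECUTE_SWAP arguments (empty list = OK)."""
    errors = []
    # Both contract addresses are required by the schema
    for field in ("inputTokenCA", "outputTokenCA"):
        if not args.get(field):
            errors.append(f"missing required field: {field}")
    # The swap must be sized by an amount or by a balance percentage
    amount = args.get("inputTokenAmount")
    pct = args.get("inputTokenPercentage")
    if amount is None and pct is None:
        errors.append("need inputTokenAmount or inputTokenPercentage")
    if pct is not None and not (0 < pct <= 1):
        errors.append("inputTokenPercentage must be in (0, 1]")
    return errors

# e.g. "sell half my JUP": percentage sizing, both CAs present
print(validate_swap_args({
    "inputTokenCA": "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
    "outputTokenCA": "So11111111111111111111111111111111111111112",
    "inputTokenPercentage": 0.5,
}))  # -> []
```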
 
142
+ Output format:
143
+ ```
144
+ <start_function_call>call:FUNCTION_NAME{key1:val1,key2:val2}<end_function_call>
 
145
  ```
146
 
147
+ Example (Chinese input retained for coverage):
 
148
  ```
149
  User: "查一下 Solana 上的 SOL"
150
  Model: <start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>
151
  ```
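The wrapper is deterministic enough to parse with a few lines of stdlib Python. A minimal sketch (the `parse_call` helper is illustrative, not part of this repo; the key-quoting step assumes argument values never contain `identifier:` sequences):

```python
import json
import re

# Match the deterministic wrapper: <start_function_call>call:NAME{...}<end_function_call>
CALL_RE = re.compile(
    r"<start_function_call>call:([A-Z_]+)\{(.*)\}<end_function_call>", re.DOTALL
)

def parse_call(text: str):
    """Extract (function_name, args_dict) from model output, or None if no call."""
    m = CALL_RE.search(text)
    if m is None:
        return None
    name, body = m.group(1), m.group(2)
    # The body uses bare keys (key:"val"); quote them so it becomes valid JSON
    json_body = re.sub(r"([A-Za-z_][A-Za-z0-9_]*)\s*:", r'"\1":', body)
    return name, json.loads("{" + json_body + "}")

print(parse_call('<start_function_call>call:SEARCH_TOKEN{symbol:"SOL",chain:"solana"}<end_function_call>'))
# -> ('SEARCH_TOKEN', {'symbol': 'SOL', 'chain': 'solana'})
```

A production integration should validate the parsed arguments against the tool schema before acting on them.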
152
 
153
+ ## Performance Snapshot
154
 
155
+ - Function recognition: ~96.8% on the validated set
156
+ - Argument extraction: ~94.2%
157
+ - Protocol adherence: SEARCH_TOKEN 98.1%, EXECUTE_SWAP 95.3%
158
+ - Multi-turn success: ~92.7%
159
 
160
+ Scope: the tokens/chains listed in **Model Overview**; accuracy outside that set may be lower.
161
 
162
+ ## Deployment Notes
163
 
164
+ - On-device: convert to ONNX/CoreML/TFLite for mobile/hardware wallets; INT8 quantization stores roughly 1 byte per parameter (~270MB for 270M params), and 4-bit formats go smaller still.
165
+ - CPU-only: expect sub-500ms intent recognition on modern CPUs; GPUs are faster still.
166
+ - Keep Chinese benchmark samples intact (they are intentional test cases).
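The footprint figures above follow directly from bytes-per-parameter arithmetic. A back-of-the-envelope sketch (assumes 270M parameters and ignores activation memory, tokenizer files, and per-format metadata):

```python
PARAMS = 270_000_000  # parameter count of the base model

def weight_footprint_mb(bits_per_param: float) -> float:
    """Approximate weight storage in MB for a given numeric precision."""
    return PARAMS * bits_per_param / 8 / 1e6

for name, bits in [("BF16", 16), ("INT8", 8), ("INT4", 4)]:
    print(f"{name}: ~{weight_footprint_mb(bits):.0f} MB")
```

Actual exported files differ by format overhead, but the order of magnitude holds.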
167
 
168
+ ## License & Governance
169
 
170
+ - Code: MIT (`LICENSE`)
171
+ - Model weights: Apache-2.0 (as declared in the card metadata above)
172
+ - Protocol specs (SEARCH_TOKEN / EXECUTE_SWAP): public domain for maximal adoption
173
+ - Contributions are welcome via issues/PRs.
174
 
175
+ When publishing to Hugging Face, ensure `./model` contains your final weights/tokenizer. Replace `YOUR_ORG/DMind-3-nano`, badges, and links with your namespace before release.
data/benchmark_dataset.json ADDED
The diff for this file is too large to render. See raw diff
 
data/training_data.json ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88f5ee388ad60836142837fcc4790b7d53c2495763cb5627c1e4d78b9d58d9f9
3
+ size 15262245
requirements.txt ADDED
@@ -0,0 +1,36 @@
1
+ # FunctionGemma SFT LoRA dependencies
2
+
3
+ # PyTorch (pick the build that matches your CUDA)
4
+ torch>=2.0.0
5
+ torchvision
6
+ torchaudio
7
+
8
+ # Hugging Face stack
9
+ transformers>=4.40.0
10
+ datasets>=2.18.0
11
+ accelerate>=0.27.0
12
+ tokenizers>=0.15.0
13
+
14
+ # TRL (Transformer Reinforcement Learning)
15
+ trl>=0.8.0
16
+
17
+ # PEFT (Parameter-Efficient Fine-Tuning)
18
+ peft>=0.10.0
19
+
20
+ # Quantization support (QLoRA)
21
+ bitsandbytes>=0.43.0
22
+
23
+ # Logging & monitoring
24
+ tensorboard>=2.15.0
25
+ wandb>=0.16.0
26
+
27
+ # Utilities
28
+ sentencepiece>=0.2.0
29
+ protobuf>=4.25.0
30
+ tqdm>=4.66.0
31
+
32
+ # Flash Attention (optional; install separately)
33
+ # pip install flash-attn --no-build-isolation
34
+
35
+ # Evaluation
36
+ evaluate>=0.4.0
results/evaluation_results.json ADDED
The diff for this file is too large to render. See raw diff
 
run_training.sh ADDED
@@ -0,0 +1,73 @@
1
+ #!/bin/bash
2
+ # FunctionGemma SFT LoRA quickstart
3
+
4
+ # Environment
5
+ export CUDA_VISIBLE_DEVICES=0 # e.g. "0,1,2,3" for multi-GPU
6
+ export TOKENIZERS_PARALLELISM=false
7
+
8
+ # Model path (update to your local model location)
9
+ MODEL_PATH="/path/to/your/functiongemma-270m-it"
10
+
11
+ # Dataset path
12
+ DATASET_PATH="./data/training_data.json"
13
+
14
+ # Output directory
15
+ OUTPUT_DIR="./runs"
16
+
17
+ # Run name
18
+ RUN_NAME="functiongemma-lora-$(date +%Y%m%d_%H%M%S)"
19
+
20
+ echo "========================================"
21
+ echo "FunctionGemma SFT LoRA training"
22
+ echo "========================================"
23
+ echo "Model: $MODEL_PATH"
24
+ echo "Dataset: $DATASET_PATH"
25
+ echo "Output: $OUTPUT_DIR/$RUN_NAME"
26
+ echo "========================================"
27
+
28
+ # Option 1: Standard LoRA (recommended for most GPUs)
29
+ python -m src.train \
30
+ --model_path "$MODEL_PATH" \
31
+ --dataset_path "$DATASET_PATH" \
32
+ --output_dir "$OUTPUT_DIR" \
33
+ --run_name "$RUN_NAME" \
34
+ --lora_r 16 \
35
+ --lora_alpha 32 \
36
+ --lora_dropout 0.05 \
37
+ --num_train_epochs 3 \
38
+ --per_device_train_batch_size 4 \
39
+ --gradient_accumulation_steps 4 \
40
+ --learning_rate 5e-5 \
41
+ --warmup_ratio 0.1 \
42
+ --max_seq_length 2048 \
43
+ --bf16 \
44
+ --logging_steps 10 \
45
+ --save_steps 100 \
46
+ --eval_steps 100 \
47
+ --gradient_checkpointing
48
+
49
+ # Option 2: QLoRA (for smaller GPUs, uncomment to use)
50
+ # python -m src.train \
51
+ # --model_path "$MODEL_PATH" \
52
+ # --dataset_path "$DATASET_PATH" \
53
+ # --output_dir "$OUTPUT_DIR" \
54
+ # --run_name "$RUN_NAME-qlora" \
55
+ # --lora_r 16 \
56
+ # --lora_alpha 32 \
57
+ # --lora_dropout 0.05 \
58
+ # --num_train_epochs 3 \
59
+ # --per_device_train_batch_size 8 \
60
+ # --gradient_accumulation_steps 2 \
61
+ # --learning_rate 2e-4 \
62
+ # --warmup_ratio 0.1 \
63
+ # --max_seq_length 2048 \
64
+ # --use_4bit \
65
+ # --logging_steps 10 \
66
+ # --save_steps 100 \
67
+ # --eval_steps 100 \
68
+ # --gradient_checkpointing
69
+
70
+ echo "========================================"
71
+ echo "Training finished!"
72
+ echo "Model saved to: $OUTPUT_DIR/$RUN_NAME"
73
+ echo "========================================"
src/__init__.py ADDED
File without changes
src/config.py ADDED
@@ -0,0 +1,317 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Shared configuration.
4
+
5
+ Includes:
6
+ 1. System prompt templates
7
+ 2. Token mappings (symbol -> contract address)
8
+ 3. Tool definitions
9
+ 4. Supported chains
10
+ """
11
+
12
+ # ============================================================
13
+ # Token mappings
14
+ # ============================================================
15
+
16
+ # Tokens on Solana
17
+ SOLANA_TOKENS = {
18
+ # Native
19
+ "SOL": "So11111111111111111111111111111111111111112",
20
+
21
+ # Stablecoins
22
+ "USDC": "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v",
23
+ "USDT": "Es9vMFrzaCERmJfrF4H2FYD4KCoNkY11McCe8BenwNYB",
24
+
25
+ # Major tokens
26
+ "RAY": "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R",
27
+ "JUP": "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN",
28
+ "BONK": "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263",
29
+ "WIF": "EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm",
30
+ "ORCA": "orcaEKTdK7LKz57vaAYr9QeNsVEPfiu6QeMU1kektZE",
31
+ "MNGO": "MangoCzJ36AjZyKwVj3VnYU4GTonjfVEnJmvvWaxLac",
32
+ "SRM": "SRMuApVNdxXokk5GT7XD5cUUgXMBCoAz2LHeuAoKWRt",
33
+ "STEP": "StepAscQoEioFxxWGnh2sLBDFp9d8rvKz2Yp39iDpyT",
34
+ "COPE": "8HGyAAB1yoM1ttS7pXjHMa3dukTFGQggnFFH3hJZgzQh",
35
+ "FIDA": "EchesyfXePKdLtoiZSL8pBe8Myagyy8ZRqsACNCFGnvp",
36
+ "MAPS": "MAPS41MDahZ9QdKXhVa4dWB9RuyfV4XqhyAZ8XcYepb",
37
+ "OXY": "z3dn17yLaGMKffVogeFHQ9zWVcXgqgf3PQnDsNs2g6M",
38
+ "MEDIA": "ETAtLmCmsoiEEKfNrHKJ2kYy3MoABhU6NQvpSfij5tDs",
39
+ "SLND": "SLNDpmoWTVADgEdndyvWzroNL7zSi1dF9PC3xHGtPwp",
40
+ "SAMO": "7xKXtg2CW87d97TXJSDpbD5jBkheTqA83TZRuJosgAsU",
41
+
42
+ # Meme tokens
43
+ "POPCAT": "7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr",
44
+ "TRUMP": "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN",
45
+ "MELANIA": "FUAfBo2jgks6gB4Z4LfZkqSZgzNucisEHqnNebaRxM1P",
46
+ "PEPE": "CLoUDKc4Ane7HeQcPpE3YHnznRxhMimJ4MyaUqyHFzAu",
47
+ "DOGE": "9nEqaUcb16sQ3Tn1psbkWqyhPdLmfHWjKGymREjsAgTE",
48
+ "SHIB": "CiKu4eHsVrc1eueVQeHn7qhXTcVu95gSQmBpX4utjL9z",
49
+ "FLOKI": "FLKiaqMsaLwSZLqnCpSFHktMUQsXgcrhFN3TZg1Zj2EN",
50
+ "WOJAK": "8cMrxCkREwszByAj9hzn5qKLnLjvLDU7gPYtWQgfwSAS",
51
+ "GIGA": "63LfDmNb3MQ8mw9MtZ2To9bEA2M71kZUUGq5tiJxcqj9",
52
+ "BOME": "ukHH6c7mMyiWCf1b9pnWe25TSpkDDt3H5pQZgZ74J82",
53
+ "SLERF": "7BgBvyjrZX1YKz4oh9mjb8ZScatkkwb8DzFx7LoiVkM3",
54
+ "MEW": "MEW1gQWJ3nEXg2qgERiKu7FAFj79PHvQVREQUzScPP5",
55
+ "MYRO": "HhJpBhRRn4g56VsyLuT8DL5Bv31HkXqsrahTTUCZeZg4",
56
+ "PONKE": "5z3EqYQo9HiCEs3R84RCDMu2n7anpDMxRhdK8PSWmrRC",
57
+ "MOODENG": "ED5nyyWEzpPPiWimP8vYm7sD7TD3LAt3Q3gRTWHzPJBY",
58
+
59
+ # DeFi tokens
60
+ "PYTH": "HZ1JovNiVvGrGNiiYvEozEVgZ58xaU3RKwX8eACQBCt3",
61
+ "JTO": "jtojtomepa8beP8AuQc6eXt5FriJwfFMwQx2v2f9mCL",
62
+ "W": "85VBFQZC9TZkfaptBWjvUw7YbZjy52A6mjtPGjstQAmQ",
63
+ "RENDER": "rndrizKT3MK1iimdxRdWabcF7Zg7AR5T4nud4EkHBof",
64
+ "HNT": "hntyVP6YFm1Hg25TN9WGLqM12b8TQmcknKrdu1oxWux",
65
+ "INJ": "6McPRfPV6bY1e9hLxWyG54W9i9Epq75QBvXg2oetBVTB",
66
+
67
+ # Wrapped tokens
68
+ "WETH": "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs",
69
+ "WBTC": "3NZ9JMVBmGAqocybic2c7LQCJScmgsAZ6vQqTDzcqmJh",
70
+ "ETH": "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs", # same as WETH
71
+ "BTC": "3NZ9JMVBmGAqocybic2c7LQCJScmgsAZ6vQqTDzcqmJh", # same as WBTC
72
+ }
73
+
74
+ # Tokens on Ethereum
75
+ ETHEREUM_TOKENS = {
76
+ "ETH": "0xEeeeeEeeeEeEeeEeEeEeeEEEeeeeEeeeeeeeEEeE",
77
+ "WETH": "0xC02aaA39b223FE8D0A0e5C4F27eAD9083C756Cc2",
78
+ "USDC": "0xA0b86991c6218b36c1d19D4a2e9Eb0cE3606eB48",
79
+ "USDT": "0xdAC17F958D2ee523a2206206994597C13D831ec7",
80
+ "DAI": "0x6B175474E89094C44Da98b954EedeAC495271d0F",
81
+ "WBTC": "0x2260FAC5E5542a773Aa44fBCfeDf7C193bc2C599",
82
+ "UNI": "0x1f9840a85d5aF5bf1D1762F925BDADdC4201F984",
83
+ "LINK": "0x514910771AF9Ca656af840dff83E8264EcF986CA",
84
+ "AAVE": "0x7Fc66500c84A76Ad7e9c93437bFc5Ac33E2DDaE9",
85
+ "PEPE": "0x6982508145454Ce325dDbE47a25d4ec3d2311933",
86
+ "SHIB": "0x95aD61b0a150d79219dCF64E1E6Cc01f0B64C4cE",
87
+ "TRUMP": "0x576e2BeD8F7b46D34016198911Cdf9886f78bea7",
88
+ }
89
+
90
+ # Tokens on BSC
91
+ BSC_TOKENS = {
92
+ "BNB": "0xEeeeeEeeeEeEeeEeEeEeeEEEeeeeEeeeeeeeEEeE",
93
+ "WBNB": "0xbb4CdB9CBd36B01bD1cBaEBF2De08d9173bc095c",
94
+ "USDC": "0x8AC76a51cc950d9822D68b83fE1Ad97B32Cd580d",
95
+ "USDT": "0x55d398326f99059fF775485246999027B3197955",
96
+ "BUSD": "0xe9e7CEA3DedcA5984780Bafc599bD69ADd087D56",
97
+ "CAKE": "0x0E09FaBB73Bd3Ade0a17ECC321fD13a19e81cE82",
98
+ "DOGE": "0xbA2aE424d960c26247Dd6c32edC70B295c744C43",
99
+ "SHIB": "0x2859e4544C4bB03966803b044A93563Bd2D0DD4D",
100
+ "FLOKI": "0xfb5B838b6cfEEdC2873aB27866079AC55363D37E",
101
+ }
102
+
103
+ # Tokens on Base
104
+ BASE_TOKENS = {
105
+ "ETH": "0xEeeeeEeeeEeEeeEeEeEeeEEEeeeeEeeeeeeeEEeE",
106
+ "WETH": "0x4200000000000000000000000000000000000006",
107
+ "USDC": "0x833589fCD6eDb6E08f4c7C32D4f71b54bdA02913",
108
+ "DAI": "0x50c5725949A6F0c72E6C4a641F24049A917DB0Cb",
109
+ "BRETT": "0x532f27101965dd16442E59d40670FaF5eBB142E4",
110
+ "DEGEN": "0x4ed4E862860beD51a9570b96d89aF5E1B0Efefed",
111
+ "TOSHI": "0xAC1Bd2486aAf3B5C0fc3Fd868558b082a531B2B4",
112
+ }
113
+
114
+ # Token mappings for all chains
115
+ ALL_TOKENS = {
116
+ "solana": SOLANA_TOKENS,
117
+ "ethereum": ETHEREUM_TOKENS,
118
+ "bsc": BSC_TOKENS,
119
+ "base": BASE_TOKENS,
120
+ }
121
+
122
+ # Supported chains
123
+ SUPPORTED_CHAINS = ["solana", "ethereum", "bsc", "base"]
124
+
125
+ # Default chain
126
+ DEFAULT_CHAIN = "solana"
127
+
128
+
129
+ # ============================================================
130
+ # System prompt templates
131
+ # ============================================================
132
+
133
+ def get_system_prompt(chain: str = "solana") -> str:
134
+ """
135
+ Build a system prompt that contains the token mapping for a chain.
136
+
137
+ Args:
138
+ chain: blockchain name
139
+
140
+ Returns:
141
+ system prompt string
142
+ """
143
+ tokens = ALL_TOKENS.get(chain, SOLANA_TOKENS)
144
+
145
+ # Build the token mapping string
146
+ token_list = []
147
+ for symbol, address in tokens.items():
148
+ token_list.append(f" - {symbol}: {address}")
149
+ token_mapping_str = "\n".join(token_list)
150
+
151
+ system_prompt = f"""You are a blockchain trading assistant that helps users search for tokens and execute swaps.
152
+
153
+ ## Available Tools
154
+
155
+ 1. **SEARCH_TOKEN**: Search for token information on the blockchain.
156
+ - Parameters: symbol, address, chain, keyword
157
+ - Use when user wants to find/query/search token information
158
+
159
+ 2. **EXECUTE_SWAP**: Execute a token swap on the blockchain.
160
+ - Parameters: inputTokenCA, outputTokenCA, inputTokenAmount, inputTokenPercentage
161
+ - Use when user wants to buy/sell/swap/convert tokens
162
+
163
+ ## Token Address Mapping ({chain.upper()})
164
+
165
+ {token_mapping_str}
166
+
167
+ ## Important Rules
168
+
169
+ 1. When user mentions a token symbol (like SOL, USDC, RAY), use the corresponding contract address from the mapping above.
170
+ 2. For buying tokens: inputTokenCA is what user spends (usually SOL/ETH), outputTokenCA is what user receives.
171
+ 3. For selling tokens: inputTokenCA is what user sells, outputTokenCA is usually SOL/ETH.
172
+ 4. If user specifies an amount, use inputTokenAmount. If user specifies a percentage (like "all", "50%", "half"), use inputTokenPercentage.
173
+ 5. inputTokenPercentage should be a decimal between 0 and 1 (e.g., 0.5 for 50%, 1.0 for 100%).
174
+ 6. If the user's request is unclear or missing required information, ask for clarification instead of making assumptions.
175
+ 7. Do NOT call any function if the request is unrelated to token search or swap (e.g., weather, greetings, jokes).
176
+
177
+ ## Examples
178
+
179
+ - "Buy 2 SOL of RAY" → EXECUTE_SWAP with inputTokenCA=SOL address, outputTokenCA=RAY address, inputTokenAmount="2"
180
+ - "Sell all my JUP" → EXECUTE_SWAP with inputTokenCA=JUP address, outputTokenCA=SOL address, inputTokenPercentage=1.0
181
+ - "Search for BONK" → SEARCH_TOKEN with symbol="BONK", chain="{chain}"
182
+ """
183
+
184
+ return system_prompt
185
+
186
+
187
+ # Shorter system prompt (used for training)
+ def get_system_prompt_short(chain: str = "solana") -> str:
+     """Build the concise system prompt."""
+     tokens = ALL_TOKENS.get(chain, SOLANA_TOKENS)
+
+     # Only include common tokens
+     common_tokens = ["SOL", "USDC", "USDT", "RAY", "JUP", "BONK", "WIF", "TRUMP", "POPCAT", "PEPE", "ETH", "WETH", "BTC", "WBTC"]
+     token_list = []
+     for symbol in common_tokens:
+         if symbol in tokens:
+             token_list.append(f"{symbol}:{tokens[symbol]}")
+     token_mapping_str = ", ".join(token_list)
+
+     system_prompt = f"""You are a blockchain trading assistant. Use SEARCH_TOKEN to find tokens and EXECUTE_SWAP to trade tokens.
+
+ Token addresses ({chain}): {token_mapping_str}
+
+ Rules:
+ - For buy: inputTokenCA=payment token, outputTokenCA=target token
+ - For sell: inputTokenCA=token to sell, outputTokenCA=SOL/ETH
+ - Use inputTokenAmount for specific amounts, inputTokenPercentage (0-1) for percentages
+ - Ask for clarification if request is unclear
+ - Do NOT call functions for unrelated requests (weather, greetings, etc.)
+ """
+
+     return system_prompt
+
+
+ # ============================================================
+ # Tool definitions
+ # ============================================================
+
+ TOOLS = [
+     {
+         "type": "function",
+         "function": {
+             "name": "SEARCH_TOKEN",
+             "description": "Search for token information on the blockchain. Use this when user wants to find, query, or get information about a token.",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "symbol": {
+                         "type": ["string", "null"],
+                         "description": "Symbol of the token (e.g., SOL, USDC, RAY)"
+                     },
+                     "address": {
+                         "type": ["string", "null"],
+                         "description": "Contract address of the token"
+                     },
+                     "chain": {
+                         "type": "string",
+                         "enum": ["solana", "ethereum", "bsc", "base"],
+                         "description": "Blockchain to search on"
+                     },
+                     "keyword": {
+                         "type": ["string", "null"],
+                         "description": "Keyword to search for the token"
+                     }
+                 },
+                 "required": []
+             }
+         }
+     },
+     {
+         "type": "function",
+         "function": {
+             "name": "EXECUTE_SWAP",
+             "description": "Execute a token swap on the blockchain. Use this when user wants to buy, sell, swap, or convert tokens.",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "inputTokenCA": {
+                         "type": ["string", "null"],
+                         "description": "Contract address of the token to spend/sell"
+                     },
+                     "outputTokenCA": {
+                         "type": ["string", "null"],
+                         "description": "Contract address of the token to receive/buy"
+                     },
+                     "inputTokenAmount": {
+                         "type": ["string", "null"],
+                         "description": "Exact amount of input token to swap"
+                     },
+                     "inputTokenPercentage": {
+                         "type": ["number", "null"],
+                         "description": "Percentage of input token balance to swap (0-1, e.g., 0.5 for 50%)"
+                     }
+                 },
+                 "required": ["inputTokenCA", "outputTokenCA"]
+             }
+         }
+     }
+ ]
+
+
+ # ============================================================
+ # Helper functions
+ # ============================================================
+
+ def get_token_address(symbol: str, chain: str = "solana") -> str:
+     """Get token address by symbol."""
+     tokens = ALL_TOKENS.get(chain, SOLANA_TOKENS)
+     return tokens.get(symbol.upper(), None)
+
+
+ def get_native_token(chain: str = "solana") -> tuple:
+     """Get the native token (symbol, address) for a chain."""
+     native_tokens = {
+         "solana": ("SOL", SOLANA_TOKENS["SOL"]),
+         "ethereum": ("ETH", ETHEREUM_TOKENS["ETH"]),
+         "bsc": ("BNB", BSC_TOKENS["BNB"]),
+         "base": ("ETH", BASE_TOKENS["ETH"]),
+     }
+     return native_tokens.get(chain, ("SOL", SOLANA_TOKENS["SOL"]))
+
+
+ def get_all_token_symbols(chain: str = "solana") -> list:
+     """Return all token symbols on a chain."""
+     tokens = ALL_TOKENS.get(chain, SOLANA_TOKENS)
+     return list(tokens.keys())
+
+
+ if __name__ == "__main__":
+     # Quick self-test
+     print("=== System Prompt (Full) ===")
+     print(get_system_prompt("solana")[:500] + "...")
+     print("\n=== System Prompt (Short) ===")
+     print(get_system_prompt_short("solana"))
+     print("\n=== Token Address ===")
+     print(f"RAY: {get_token_address('RAY', 'solana')}")
+     print(f"TRUMP: {get_token_address('TRUMP', 'solana')}")
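The short system prompt above embeds a whitelist-filtered `SYMBOL:address` mapping. A minimal standalone sketch of that step (the two-entry `SOLANA_TOKENS` dict and the helper name here are stand-ins, not the real table in `src/config.py`):

```python
# Stand-in for the token table defined in src/config.py (two entries only).
SOLANA_TOKENS = {
    "SOL": "So11111111111111111111111111111111111111112",
    "USDC": "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v",
}

def build_token_mapping(tokens: dict, allow: list) -> str:
    """Keep only whitelisted symbols present in the table, rendered as 'SYMBOL:address'."""
    return ", ".join(f"{s}:{tokens[s]}" for s in allow if s in tokens)

# BONK is in the whitelist but not in the table, so it is silently skipped.
mapping = build_token_mapping(SOLANA_TOKENS, ["SOL", "USDC", "BONK"])
print(mapping)
```

Symbols missing from the table are dropped rather than raising, which is why the whitelist can safely include tokens that only exist on some chains.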
src/evaluate.py ADDED
@@ -0,0 +1,641 @@
+ #!/usr/bin/env python3
+ """
+ FunctionGemma evaluation script (v2).
+
+ Uses a unified system prompt for evaluation.
+
+ Usage:
+     python -m src.evaluate --model_path ./runs/<run>/final_model --benchmark_path ./data/benchmark_dataset.json
+ """
+
+ import os
+ import re
+ import sys
+ import json
+ import argparse
+ import logging
+ from pathlib import Path
+ from typing import Dict, List, Optional, Tuple
+ from datetime import datetime
+ from concurrent.futures import ThreadPoolExecutor, as_completed
+ from threading import Lock
+
+ import torch
+ from transformers import AutoModelForCausalLM, AutoTokenizer
+ from peft import PeftModel
+ from tqdm import tqdm
+
+ # Import config
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ if str(PROJECT_ROOT) not in sys.path:
+     sys.path.insert(0, str(PROJECT_ROOT))
+
+ DEFAULT_BENCHMARK_PATH = PROJECT_ROOT / "data" / "benchmark_dataset.json"
+ DEFAULT_RESULTS_DIR = PROJECT_ROOT / "results"
+
+ from src.config import (  # noqa: E402
+     get_system_prompt, get_system_prompt_short, TOOLS,
+     SOLANA_TOKENS, get_token_address
+ )
+
+ # Logging
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def load_model(
+     model_path: str,
+     lora_path: Optional[str] = None,
+     device: str = "auto",
+     load_in_8bit: bool = False,
+     load_in_4bit: bool = False,
+ ):
+     """Load model and tokenizer."""
+     logger.info(f"Loading model: {model_path}")
+
+     kwargs = {
+         "device_map": device,
+         "trust_remote_code": True,
+     }
+
+     if load_in_8bit:
+         kwargs["load_in_8bit"] = True
+     elif load_in_4bit:
+         from transformers import BitsAndBytesConfig
+         kwargs["quantization_config"] = BitsAndBytesConfig(
+             load_in_4bit=True,
+             bnb_4bit_compute_dtype=torch.bfloat16,
+             bnb_4bit_use_double_quant=True,
+             bnb_4bit_quant_type="nf4",
+         )
+     else:
+         kwargs["torch_dtype"] = torch.bfloat16
+
+     tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+
+     model = AutoModelForCausalLM.from_pretrained(model_path, **kwargs)
+
+     if lora_path:
+         logger.info(f"Loading LoRA adapter: {lora_path}")
+         model = PeftModel.from_pretrained(model, lora_path)
+
+     model.eval()
+     return model, tokenizer
+
+
+ def parse_functiongemma_output(response: str) -> Tuple[Optional[str], Optional[Dict]]:
+     """
+     Parse FunctionGemma formatted output.
+
+     Format: <start_function_call>call:FUNC_NAME{key:<escape>value<escape>,...}<end_function_call>
+     """
+     # full match
+     pattern = r'<start_function_call>call:(\w+)\{([^}]*)\}<end_function_call>'
+     match = re.search(pattern, response)
+
+     if not match:
+         # partial match (truncated)
+         pattern = r'<start_function_call>call:(\w+)\{([^}]*)\}'
+         match = re.search(pattern, response)
+
+     if not match:
+         # match function name only
+         pattern = r'<start_function_call>call:(\w+)'
+         match = re.search(pattern, response)
+         if match:
+             return match.group(1), {}
+
+         # fallback: look for function names
+         for func in ["SEARCH_TOKEN", "EXECUTE_SWAP"]:
+             if func in response:
+                 return func, {}
+
+         return None, None
+
+     func_name = match.group(1)
+     params_str = match.group(2) if len(match.groups()) > 1 else ""
+
+     # parse arguments
+     args = parse_params_string(params_str)
+
+     return func_name, args
+
+
+ def parse_params_string(params_str: str) -> Dict:
+     """Parse parameter string."""
+     args = {}
+     if not params_str:
+         return args
+
+     # pattern: key:<escape>value<escape> or key:value
+     param_pattern = r'(\w+):(?:<escape>([^<]*)<escape>|([^,}]+))'
+
+     for match in re.finditer(param_pattern, params_str):
+         key = match.group(1)
+         value = match.group(2) if match.group(2) is not None else match.group(3)
+
+         if value is None:
+             continue
+
+         value = value.strip()
+
+         # handle percentage
+         if value.endswith('%'):
+             try:
+                 args[key] = float(value[:-1]) / 100
+                 continue
+             except ValueError:
+                 pass
+
+         # attempt numeric conversion
+         try:
+             if '.' in value:
+                 args[key] = float(value)
+             else:
+                 args[key] = int(value)
+         except ValueError:
+             args[key] = value
+
+     return args
+
+
+ def is_rejection_response(response: str) -> bool:
+     """Check if the response is a rejection/clarification."""
+     # no function call markers
+     if '<start_function_call>' not in response:
+         return True
+
+     # check clarification/rejection keywords (keep Chinese variants for CN prompts)
+     rejection_keywords = [
+         "please specify", "could you", "what token", "which token",
+         "请问", "请提供", "请告诉", "您能", "什么代币", "哪个代币",
+         "sorry", "can't", "cannot", "unable", "抱歉", "无法",
+         "more information", "more details", "更多信息",
+     ]
+
+     response_lower = response.lower()
+     for keyword in rejection_keywords:
+         if keyword.lower() in response_lower:
+             return True
+
+     return False
+
+
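The two regexes above do the heavy lifting: one captures the function name and brace-delimited argument string, the other splits `key:<escape>value<escape>` pairs. A self-contained sketch of just that extraction path (simplified: no truncated-output fallback, no numeric coercion):

```python
import re

# Same patterns as src/evaluate.py, reduced to the happy path.
CALL_RE = re.compile(r'<start_function_call>call:(\w+)\{([^}]*)\}<end_function_call>')
PARAM_RE = re.compile(r'(\w+):(?:<escape>([^<]*)<escape>|([^,}]+))')

def parse_call(response: str):
    """Return (function_name, args) or (None, None) when no call is found."""
    m = CALL_RE.search(response)
    if not m:
        return None, None
    args = {}
    for pm in PARAM_RE.finditer(m.group(2)):
        # group 2 holds <escape>-wrapped values, group 3 bare values
        value = pm.group(2) if pm.group(2) is not None else pm.group(3)
        args[pm.group(1)] = value.strip()
    return m.group(1), args

name, args = parse_call(
    "<start_function_call>call:SEARCH_TOKEN"
    "{symbol:<escape>BONK<escape>,chain:<escape>solana<escape>}<end_function_call>"
)
print(name, args)  # SEARCH_TOKEN {'symbol': 'BONK', 'chain': 'solana'}
```

Because `[^}]*` is non-greedy by construction (it cannot cross the closing brace), nested braces in values are not supported; the full parser inherits the same limitation.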
+ def format_messages_for_model(
+     messages: List[Dict],
+     tokenizer,
+     tools: List[Dict] = None,
+ ) -> str:
+     """Format messages into the model chat template."""
+     if hasattr(tokenizer, 'apply_chat_template'):
+         try:
+             return tokenizer.apply_chat_template(
+                 messages,
+                 tools=tools,
+                 tokenize=False,
+                 add_generation_prompt=True,
+             )
+         except Exception:
+             pass
+
+     # Manual formatting fallback
+     formatted = ""
+     for msg in messages:
+         role = msg["role"]
+         content = msg["content"]
+
+         if role == "system":
+             formatted += f"<start_of_turn>system\n{content}<end_of_turn>\n"
+         elif role == "user":
+             formatted += f"<start_of_turn>user\n{content}<end_of_turn>\n"
+         elif role == "assistant":
+             formatted += f"<start_of_turn>model\n{content}<end_of_turn>\n"
+
+     formatted += "<start_of_turn>model\n"
+     return formatted
+
+
+ def generate_response(
+     model,
+     tokenizer,
+     prompt: str,
+     system_prompt: str,
+     max_new_tokens: int = 256,
+ ) -> str:
+     """Generate model response."""
+     messages = [
+         {"role": "system", "content": system_prompt},
+         {"role": "user", "content": prompt},
+     ]
+
+     input_text = format_messages_for_model(messages, tokenizer, TOOLS)
+     inputs = tokenizer(input_text, return_tensors="pt")
+     inputs = {k: v.to(model.device) for k, v in inputs.items()}
+
+     with torch.no_grad():
+         outputs = model.generate(
+             **inputs,
+             max_new_tokens=max_new_tokens,
+             temperature=0.1,
+             do_sample=True,
+             pad_token_id=tokenizer.pad_token_id,
+             eos_token_id=tokenizer.eos_token_id,
+         )
+
+     response = tokenizer.decode(outputs[0][inputs["input_ids"].shape[1]:], skip_special_tokens=False)
+     response = response.replace("<end_of_turn>", "").strip()
+
+     return response
+
+
+ def compare_arguments(expected: Dict, actual: Dict) -> Tuple[float, List[str]]:
+     """Compare expected vs actual arguments."""
+     if not expected:
+         return 1.0 if not actual else 0.0, []
+
+     if not actual:
+         return 0.0, ["No arguments extracted"]
+
+     errors = []
+     total_keys = set(expected.keys()) | set(actual.keys())
+
+     if not total_keys:
+         return 1.0, []
+
+     matched = 0
+
+     for key in expected.keys():
+         exp_val = expected.get(key)
+         act_val = actual.get(key)
+
+         if exp_val is None:
+             continue
+
+         if act_val is None:
+             errors.append(f"Missing key: {key}")
+             continue
+
+         # Compare values
+         if str(exp_val) == str(act_val):
+             matched += 1
+         elif isinstance(exp_val, str) and isinstance(act_val, str):
+             # Partial match (contract address prefix)
+             if exp_val[:10] == act_val[:10]:
+                 matched += 0.5
+                 errors.append(f"Partial match for {key}")
+             else:
+                 errors.append(f"Value mismatch for {key}: expected {exp_val}, got {act_val}")
+         elif isinstance(exp_val, (int, float)) and isinstance(act_val, (int, float)):
+             if abs(float(exp_val) - float(act_val)) < 0.01:
+                 matched += 1
+             else:
+                 errors.append(f"Value mismatch for {key}: expected {exp_val}, got {act_val}")
+         else:
+             errors.append(f"Type mismatch for {key}")
+
+     # Check extra keys
+     for key in actual.keys():
+         if key not in expected:
+             errors.append(f"Extra key: {key}")
+
+     score = matched / len([k for k in expected.keys() if expected.get(k) is not None]) if expected else 1.0
+     return score, errors
+
+
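The scoring rule in `compare_arguments` is worth seeing in isolation: exact string matches score 1, numbers within 0.01 count as equal, and strings sharing a 10-character prefix (contract addresses) earn half credit. A condensed sketch of just the score, without the error list (`score_args` is a hypothetical name for this excerpt):

```python
def score_args(expected: dict, actual: dict) -> float:
    """Fraction of non-null expected keys matched, with partial credit."""
    keys = [k for k, v in expected.items() if v is not None]
    if not keys:
        return 1.0
    matched = 0.0
    for k in keys:
        exp, act = expected[k], actual.get(k)
        if act is None:
            continue  # missing key earns nothing
        if str(exp) == str(act):
            matched += 1
        elif isinstance(exp, str) and isinstance(act, str) and exp[:10] == act[:10]:
            matched += 0.5  # shared address prefix: half credit
        elif isinstance(exp, (int, float)) and isinstance(act, (int, float)) \
                and abs(float(exp) - float(act)) < 0.01:
            matched += 1  # numeric tolerance
    return matched / len(keys)

print(score_args({"chain": "solana", "amount": 1.0},
                 {"chain": "solana", "amount": 1.005}))  # within tolerance -> 1.0
```

Note that extra keys in `actual` are reported as errors by the full function but do not lower the score, since the denominator counts only expected keys.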
+ def process_single_sample(
+     sample: Dict,
+     idx: int,
+     model,
+     tokenizer,
+     system_prompt: str,
+ ) -> Dict:
+     """Process one sample and return evaluation result."""
+     sample_id = sample.get("id", idx + 1)
+     category = sample.get("category", "unknown")
+     user_input = sample["input"]
+     expected_func = sample["expected"]["function_name"]
+     expected_args = sample["expected"].get("arguments", {})
+
+     # Extract user message
+     if isinstance(user_input, dict) and "messages" in user_input:
+         prompt = ""
+         for msg in user_input["messages"]:
+             if msg.get("role") == "user":
+                 prompt = msg.get("content", "")
+                 break
+     else:
+         prompt = str(user_input)
+
+     # Generate response
+     response = generate_response(model, tokenizer, prompt, system_prompt)
+
+     # Parse response
+     actual_func, actual_args = parse_functiongemma_output(response)
+     is_rejection = is_rejection_response(response)
+
+     # Evaluate
+     func_correct = False
+     args_correct = False
+     exact_match = False
+     arg_score = 0.0
+     error_msg = None
+     rejection_correct = False
+
+     if expected_func is None:
+         # Expecting rejection
+         func_correct = is_rejection or actual_func is None
+         args_correct = func_correct
+         exact_match = func_correct
+         arg_score = 1.0 if func_correct else 0.0
+         rejection_correct = func_correct
+
+         if not func_correct:
+             error_msg = f"Expected rejection, got {actual_func}"
+     else:
+         # Expecting a function call
+         func_correct = actual_func == expected_func
+
+         if func_correct:
+             # Compare arguments
+             arg_score, arg_errors = compare_arguments(expected_args, actual_args or {})
+             args_correct = arg_score >= 0.99
+             exact_match = args_correct
+
+             if not args_correct:
+                 error_msg = "; ".join(arg_errors)
+         else:
+             error_msg = f"Expected {expected_func}, got {actual_func}"
+
+     # Return result
+     result = {
+         "sample_id": sample_id,
+         "category": category,
+         "expected_func": expected_func,
+         "actual_func": actual_func,
+         "func_correct": func_correct,
+         "args_correct": args_correct,
+         "exact_match": exact_match,
+         "rejection_correct": rejection_correct,
+         "arg_score": arg_score,
+         "error_msg": error_msg,
+         "user_input": user_input,
+         "expected_args": expected_args,
+         "actual_args": actual_args,
+         "response": response,
+     }
+
+     return result
+
+
+ def evaluate_benchmark(
+     model,
+     tokenizer,
+     benchmark: List[Dict],
+     chain: str = "solana",
+     verbose: bool = False,
+     num_workers: int = 1,
+ ) -> Dict:
+     """Evaluate the benchmark (supports concurrency)."""
+     system_prompt = get_system_prompt_short(chain)
+
+     results = {
+         "total": len(benchmark),
+         "function_correct": 0,
+         "arguments_correct": 0,
+         "exact_match": 0,
+         "rejection_correct": 0,
+         "total_arg_score": 0.0,
+         "by_category": {},
+         "by_function": {},
+         "errors": [],
+     }
+
+     # Protect result updates with a lock
+     results_lock = Lock()
+
+     # Concurrent processing
+     if num_workers > 1:
+         logger.info(f"Evaluating with {num_workers} worker threads")
+
+         with ThreadPoolExecutor(max_workers=num_workers) as executor:
+             # Submit tasks
+             futures = {
+                 executor.submit(
+                     process_single_sample,
+                     sample, i, model, tokenizer, system_prompt
+                 ): i for i, sample in enumerate(benchmark)
+             }
+
+             # Progress bar
+             with tqdm(total=len(benchmark), desc="Evaluation") as pbar:
+                 for future in as_completed(futures):
+                     result = future.result()
+
+                     # Update results (locked)
+                     with results_lock:
+                         _update_results(results, result, verbose)
+
+                     pbar.update(1)
+     else:
+         # Serial path
+         logger.info("Evaluating with a single thread")
+         for i, sample in enumerate(tqdm(benchmark, desc="Evaluation")):
+             result = process_single_sample(sample, i, model, tokenizer, system_prompt)
+             _update_results(results, result, verbose)
+
+     return results
+
+
+ def _update_results(results: Dict, result: Dict, verbose: bool):
+     """Update aggregated evaluation results."""
+     sample_id = result["sample_id"]
+     category = result["category"]
+     expected_func = result["expected_func"]
+     actual_func = result["actual_func"]
+     func_correct = result["func_correct"]
+     args_correct = result["args_correct"]
+     exact_match = result["exact_match"]
+     rejection_correct = result["rejection_correct"]
+     arg_score = result["arg_score"]
+     error_msg = result["error_msg"]
+
+     # Overall stats
+     if func_correct:
+         results["function_correct"] += 1
+     if args_correct:
+         results["arguments_correct"] += 1
+     if exact_match:
+         results["exact_match"] += 1
+     if rejection_correct:
+         results["rejection_correct"] += 1
+     results["total_arg_score"] += arg_score
+
+     # By category
+     if category not in results["by_category"]:
+         results["by_category"][category] = {
+             "total": 0, "func_correct": 0, "exact_match": 0, "arg_score": 0.0
+         }
+     results["by_category"][category]["total"] += 1
+     if func_correct:
+         results["by_category"][category]["func_correct"] += 1
+     if exact_match:
+         results["by_category"][category]["exact_match"] += 1
+     results["by_category"][category]["arg_score"] += arg_score
+
+     # By function
+     func_key = expected_func or "None"
+     if func_key not in results["by_function"]:
+         results["by_function"][func_key] = {
+             "total": 0, "func_correct": 0, "exact_match": 0, "arg_score": 0.0
+         }
+     results["by_function"][func_key]["total"] += 1
+     if func_correct:
+         results["by_function"][func_key]["func_correct"] += 1
+     if exact_match:
+         results["by_function"][func_key]["exact_match"] += 1
+     results["by_function"][func_key]["arg_score"] += arg_score
+
+     # Record errors
+     if error_msg and len(results["errors"]) < 10:
+         results["errors"].append({
+             "id": sample_id,
+             "category": category,
+             "input": result["user_input"],
+             "expected_func": expected_func,
+             "actual_func": actual_func,
+             "expected_args": result["expected_args"],
+             "actual_args": result["actual_args"],
+             "error": error_msg,
+             "response": result["response"][:200],
+         })
+
+     if verbose:
+         status = "✓" if exact_match else "✗"
+         # Extract user message preview for logs
+         user_input = result["user_input"]
+         if isinstance(user_input, dict):
+             user_msg = ""
+             if "messages" in user_input:
+                 for msg in user_input["messages"]:
+                     if msg.get("role") == "user":
+                         user_msg = msg.get("content", "")
+                         break
+             input_preview = user_msg[:50] if user_msg else str(user_input)[:50]
+         else:
+             input_preview = str(user_input)[:50]
+         logger.info(f"[{sample_id}] {status} {category}: {input_preview}...")
+
+
+ def print_report(results: Dict):
+     """Print evaluation report."""
+     total = results["total"]
+
+     print("\n" + "=" * 70)
+     print("FunctionGemma Evaluation Report")
+     print("=" * 70)
+     print(f"\nTotal samples: {total}")
+
+     print("\n" + "-" * 70)
+     print("Overall metrics")
+     print("-" * 70)
+
+     func_acc = results["function_correct"] / total * 100 if total > 0 else 0
+     arg_acc = results["arguments_correct"] / total * 100 if total > 0 else 0
+     exact_acc = results["exact_match"] / total * 100 if total > 0 else 0
+     avg_arg_score = results["total_arg_score"] / total * 100 if total > 0 else 0
+
+     # Rejection accuracy (over samples whose expected function is None)
+     rejection_total = results["by_function"].get("None", {}).get("total", 0)
+     rejection_acc = results["rejection_correct"] / rejection_total * 100 if rejection_total > 0 else 0
+
+     print(f"Function selection accuracy: {func_acc:.2f}%")
+     print(f"Argument accuracy: {arg_acc:.2f}%")
+     print(f"Exact match accuracy: {exact_acc:.2f}%")
+     print(f"Average argument score: {avg_arg_score:.2f}%")
+     print(f"Rejection accuracy: {rejection_acc:.2f}%")
+
+     print("\n" + "-" * 70)
+     print("By function")
+     print("-" * 70)
+
+     for func, stats in sorted(results["by_function"].items()):
+         func_total = stats["total"]
+         func_correct = stats["func_correct"] / func_total * 100 if func_total > 0 else 0
+         func_arg_score = stats["arg_score"] / func_total * 100 if func_total > 0 else 0
+         func_exact = stats["exact_match"] / func_total * 100 if func_total > 0 else 0
+
+         print(f"{func:15} | samples: {func_total:3} | func acc: {func_correct:6.2f}% | "
+               f"arg score: {func_arg_score:6.2f}% | exact: {func_exact:6.2f}%")
+
+     if results["errors"]:
+         print("\n" + "-" * 70)
+         print("Error samples")
+         print("-" * 70)
+
+         for err in results["errors"][:5]:
+             print(f"\nID: {err['id']} | category: {err['category']}")
+             print(f"Input: {err['input']}")
+             print(f"Expected: {err['expected_func']} | Actual: {err['actual_func']}")
+             print(f"Error: {err['error']}")
+
+     print("\n" + "=" * 70)
+
+
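Every percentage in the report is the same guarded division repeated inline. As a sketch, the pattern reduces to one hypothetical helper (not in the source, which writes the expression out each time):

```python
def pct(numer: int, denom: int) -> float:
    """Percentage with a zero-division guard, as used throughout print_report."""
    return numer / denom * 100 if denom > 0 else 0.0

print(pct(50, 200))  # 25.0
print(pct(0, 0))     # 0.0 (empty category, no crash)
```

The guard matters for the rejection metric in particular, since a benchmark with no rejection samples would otherwise divide by zero.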
+ def main():
+     parser = argparse.ArgumentParser(description="FunctionGemma evaluation (v2)")
+     parser.add_argument("--model_path", type=str, required=True, help="Model path")
+     parser.add_argument("--lora_path", type=str, default=None, help="LoRA adapter path")
+     parser.add_argument("--benchmark_path", type=str, default=str(DEFAULT_BENCHMARK_PATH), help="Benchmark dataset path")
+     parser.add_argument("--output_path", type=str, default=None, help="Output path (defaults to results/ with timestamp)")
+     parser.add_argument("--chain", type=str, default="solana", help="Chain name")
+     parser.add_argument("--load_in_8bit", action="store_true", help="Enable 8-bit quantization")
+     parser.add_argument("--load_in_4bit", action="store_true", help="Enable 4-bit quantization")
+     parser.add_argument("--verbose", action="store_true", help="Verbose logging")
+     parser.add_argument("--num_workers", type=int, default=4, help="Number of worker threads (default 4)")
+     args = parser.parse_args()
+
+     # Load model
+     model, tokenizer = load_model(
+         args.model_path,
+         lora_path=args.lora_path,
+         load_in_8bit=args.load_in_8bit,
+         load_in_4bit=args.load_in_4bit,
+     )
+
+     # Load benchmark
+     benchmark_path = Path(args.benchmark_path)
+     logger.info(f"Loading benchmark: {benchmark_path}")
+     with open(benchmark_path, 'r', encoding='utf-8') as f:
+         benchmark = json.load(f)
+
+     logger.info(f"Benchmark samples: {len(benchmark)}")
+
+     # Evaluate
+     logger.info("Starting evaluation...")
+     results = evaluate_benchmark(
+         model, tokenizer, benchmark,
+         chain=args.chain,
+         verbose=args.verbose,
+         num_workers=args.num_workers,
+     )
+
+     # Print report
+     print_report(results)
+
+     # Save results
+     output_path = Path(args.output_path) if args.output_path else DEFAULT_RESULTS_DIR / f"evaluation_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(results, f, ensure_ascii=False, indent=2)
+     logger.info(f"Evaluation saved to: {output_path}")
+
+
+ if __name__ == "__main__":
+     main()
src/generate_benchmark.py ADDED
@@ -0,0 +1,459 @@
+ #!/usr/bin/env python3
+ """
+ Generate the FunctionGemma evaluation benchmark.
+
+ Creates 100 high-quality samples to assess function-calling accuracy across:
+ - SEARCH_TOKEN calls
+ - EXECUTE_SWAP calls
+ - Incomplete requests (should ask back)
+ - Irrelevant requests (should refuse)
+ """
+
+ import json
+ import random
+ import argparse
+ from pathlib import Path
+ from typing import Dict, List, Any, Optional
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ DEFAULT_BENCHMARK_PATH = PROJECT_ROOT / "data" / "benchmark_dataset.json"
+
+ # Token info
+ TOKENS = {
+     "SOL": {"ca": "So11111111111111111111111111111111111111112", "chain": "solana"},
+     "USDC": {"ca": "EPjFWdd5AufqSSqeM2qN1xzybapC8G4wEGGkZwyTDt1v", "chain": "solana"},
+     "JUP": {"ca": "JUPyiwrYJFskUPiHa7hkeR8VUtAeFoSYbKedZNsDvCN", "chain": "solana"},
+     "RAY": {"ca": "4k3Dyjzvzp8eMZWUXbBCjEvwSkkk59S5iCNLY3QrkX6R", "chain": "solana"},
+     "BONK": {"ca": "DezXAZ8z7PnrnRJjz3wXBoRgixCa6xjnB7YaB1pPB263", "chain": "solana"},
+     "WIF": {"ca": "EKpQGSJtjMFqKZ9KQanSqYXRcF8fBopzLHYxdM65zcjm", "chain": "solana"},
+     "ETH": {"ca": "7vfCXTUXx5WJV5JADk17DUJ4ksgau7utNKj4b963voxs", "chain": "solana"},
+     "BTC": {"ca": "9n4nbM75f5Ui33ZbPYXn59EwSgE8CGsHtAeTH5YFeJ9E", "chain": "solana"},
+     "POPCAT": {"ca": "7GCihgDB8fe6KNjn2MYtkzZcRjQy3t9GHdC8uHYmW2hr", "chain": "solana"},
+     "TRUMP": {"ca": "6p6xgHyF7AeE6TZkSmFsko444wqoP15icUSqi2jfGiPN", "chain": "solana"},
+ }
+
+ CHAINS = ["solana", "ethereum", "bsc", "base"]
+
+ # Tool definitions
+ TOOLS = [
+     {
+         "type": "function",
+         "function": {
+             "name": "SEARCH_TOKEN",
+             "description": "search token onchain",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "symbol": {"type": ["string", "null"], "description": "Symbol of the token"},
+                     "address": {"type": ["string", "null"], "description": "Contract address of the token"},
+                     "chain": {"type": "string", "enum": ["solana", "ethereum", "bsc", "base"], "description": "supported chains"},
+                     "keyword": {"type": ["string", "null"], "description": "keyword to search for the token"}
+                 },
+                 "required": []
+             }
+         }
+     },
+     {
+         "type": "function",
+         "function": {
+             "name": "EXECUTE_SWAP",
+             "description": "Swap tokens on the Solana blockchain. When the user specifies 'buy <token>', the default input token is SOL. When the user specifies 'sell <token>', the default output token is SOL.",
+             "parameters": {
+                 "type": "object",
+                 "properties": {
+                     "inputTokenSymbol": {"type": ["string", "null"], "description": "Symbol of the token to sell."},
+                     "inputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to sell."},
+                     "outputTokenCA": {"type": ["string", "null"], "description": "Contract address of the token to buy."},
+                     "inputTokenAmount": {"type": ["string", "null"], "description": "Exact amount of the input token to swap."},
+                     "inputTokenPercentage": {"type": ["number", "null"], "description": "Percentage of the input token balance to swap."},
+                     "outputTokenAmount": {"type": ["string", "null"], "description": "Expected amount of the output token to receive."}
+                 },
+                 "required": ["inputTokenCA", "outputTokenCA", "inputTokenAmount", "inputTokenPercentage"]
+             }
+         }
+     }
+ ]
+
+
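One useful sanity check on generated samples is that every expected argument key is actually declared in the tool schema above. This is a hypothetical standalone check, not part of the source script (the property set below is copied from the SEARCH_TOKEN definition):

```python
# Declared properties of the SEARCH_TOKEN tool defined above.
SEARCH_TOKEN_PROPS = {"symbol", "address", "chain", "keyword"}

def args_match_schema(args: dict, props: set) -> bool:
    """True when every argument key is a declared schema property."""
    return set(args) <= props

print(args_match_schema({"symbol": "BONK", "chain": "solana"}, SEARCH_TOKEN_PROPS))  # True
print(args_match_schema({"token": "BONK"}, SEARCH_TOKEN_PROPS))                      # False
```

A check like this would catch a benchmark case written against a stale schema before it silently deflates the argument-accuracy metric.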
78
+ def create_benchmark_item(
79
+ user_input: str,
80
+ expected_function: Optional[str],
81
+ expected_args: Optional[Dict] = None,
82
+ category: str = "function_call",
83
+ description: str = ""
84
+ ) -> Dict:
85
+ """Create one benchmark sample."""
86
+ return {
87
+ "id": None, # assigned later
88
+ "category": category,
89
+ "description": description,
90
+ "input": {
91
+ "messages": [
92
+ {"role": "developer", "content": "You are a model that can do function calling with the following functions"},
93
+ {"role": "user", "content": user_input}
94
+ ],
95
+ "tools": TOOLS
96
+ },
97
+ "expected": {
98
+ "function_name": expected_function,
99
+ "arguments": expected_args
100
+ }
101
+ }
102
+
103
+
104
+ def generate_search_token_benchmarks() -> List[Dict]:
105
+ """Generate SEARCH_TOKEN cases."""
106
+ benchmarks = []
107
+
108
+     # 1) search by symbol (English)
+     test_cases = [
+         ("Search for BONK token", "BONK", "solana", None, None),
+         ("Find WIF on solana", "WIF", "solana", None, None),
+         ("Look up JUP token", "JUP", "solana", None, None),
+         ("Search ETH on ethereum", "ETH", "ethereum", None, None),
+         ("Find USDC token on base", "USDC", "base", None, None),
+     ]
+
+     for query, symbol, chain, address, keyword in test_cases:
+         expected_args = {"symbol": symbol, "chain": chain}
+         if address:
+             expected_args["address"] = address
+         if keyword:
+             expected_args["keyword"] = keyword
+         benchmarks.append(create_benchmark_item(
+             query, "SEARCH_TOKEN", expected_args,
+             "search_by_symbol", f"Search {symbol} by symbol"
+         ))
+
+     # 2) search by symbol (Chinese)
+     cn_cases = [
+         ("帮我搜索 BONK 代币", "BONK", "solana"),
+         ("查一下 WIF 这个币", "WIF", "solana"),
+         ("找一下 JUP 代币信息", "JUP", "solana"),
+         ("搜索 RAY 代币", "RAY", "solana"),
+         ("查询 POPCAT 代币", "POPCAT", "solana"),
+     ]
+
+     for query, symbol, chain in cn_cases:
+         benchmarks.append(create_benchmark_item(
+             query, "SEARCH_TOKEN", {"symbol": symbol, "chain": chain},
+             "search_by_symbol_cn", f"Search {symbol} by symbol (Chinese)"
+         ))
+
+     # 3) search by address
+     for token, info in list(TOKENS.items())[:5]:
+         query = f"Search token at address {info['ca']}"
+         benchmarks.append(create_benchmark_item(
+             query, "SEARCH_TOKEN", {"address": info['ca'], "chain": info['chain']},
+             "search_by_address", f"Search {token} by address"
+         ))
+
+     # 4) search by keyword
+     keyword_cases = [
+         ("Search for dog themed tokens", "dog", "solana"),
+         ("Find meme coins", "meme", "solana"),
+         ("Look for cat tokens on base", "cat", "base"),
+     ]
+
+     for query, keyword, chain in keyword_cases:
+         benchmarks.append(create_benchmark_item(
+             query, "SEARCH_TOKEN", {"keyword": keyword, "chain": chain},
+             "search_by_keyword", f"Search by keyword: {keyword}"
+         ))
+
+     return benchmarks
+
+
+ def generate_execute_swap_benchmarks() -> List[Dict]:
+     """Generate EXECUTE_SWAP cases."""
+     benchmarks = []
+
+     # 1) buy token (fixed amount)
+     buy_cases = [
+         ("Buy 1 SOL worth of BONK", "SOL", "BONK", "1", None),
+         ("Purchase 5 SOL of WIF", "SOL", "WIF", "5", None),
+         ("Buy 10 USDC worth of JUP", "USDC", "JUP", "10", None),
+         ("I want to buy 2 SOL of RAY", "SOL", "RAY", "2", None),
+         ("Get me 0.5 SOL of POPCAT", "SOL", "POPCAT", "0.5", None),
+     ]
+
+     for query, input_token, output_token, amount, percentage in buy_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "buy_with_amount", f"Buy {output_token} with {amount} {input_token}"
+         ))
+
+     # 2) buy token (percentage)
+     buy_pct_cases = [
+         ("Buy BONK with 50% of my SOL", "SOL", "BONK", None, 0.5),
+         ("Use 30% of my USDC to buy WIF", "USDC", "WIF", None, 0.3),
+         ("Spend 100% of my SOL on JUP", "SOL", "JUP", None, 1.0),
+         ("Put 25% of my ETH into RAY", "ETH", "RAY", None, 0.25),
+         ("Use half of my BTC to get BONK", "BTC", "BONK", None, 0.5),
+     ]
+
+     for query, input_token, output_token, amount, percentage in buy_pct_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "buy_with_percentage", f"Buy {output_token} with {int(percentage*100)}% {input_token}"
+         ))
+
+     # 3) sell token (fixed amount)
+     sell_cases = [
+         ("Sell 1000 BONK", "BONK", "SOL", "1000", None),
+         ("Sell 500 WIF for SOL", "WIF", "SOL", "500", None),
+         ("Convert 100 JUP to SOL", "JUP", "SOL", "100", None),
+         ("Dump 2000 RAY", "RAY", "SOL", "2000", None),
+         ("Sell 50 USDC", "USDC", "SOL", "50", None),
+     ]
+
+     for query, input_token, output_token, amount, percentage in sell_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "sell_with_amount", f"Sell {amount} {input_token}"
+         ))
+
+     # 4) sell token (percentage)
+     sell_pct_cases = [
+         ("Sell 50% of my BONK", "BONK", "SOL", None, 0.5),
+         ("Dump all my WIF", "WIF", "SOL", None, 1.0),
+         ("Sell 30% of my JUP holdings", "JUP", "SOL", None, 0.3),
+         ("Get rid of 75% of my RAY", "RAY", "SOL", None, 0.75),
+         ("Sell a quarter of my USDC", "USDC", "SOL", None, 0.25),
+     ]
+
+     for query, input_token, output_token, amount, percentage in sell_pct_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "sell_with_percentage", f"Sell {int(percentage*100)}% {input_token}"
+         ))
+
+     # 5) Chinese buy/sell requests
+     cn_swap_cases = [
+         ("用 1 个 SOL 买 BONK", "SOL", "BONK", "1", None),
+         ("把 50% 的 USDC 换成 WIF", "USDC", "WIF", None, 0.5),
+         ("卖掉 1000 个 BONK", "BONK", "SOL", "1000", None),
+         ("把所有 JUP 都卖了", "JUP", "SOL", None, 1.0),
+         ("用 2 SOL 购买 RAY", "SOL", "RAY", "2", None),
+         ("出售 30% 的 WIF", "WIF", "SOL", None, 0.3),
+         ("买入 5 SOL 的 POPCAT", "SOL", "POPCAT", "5", None),
+         ("清仓 ETH", "ETH", "SOL", None, 1.0),
+     ]
+
+     for query, input_token, output_token, amount, percentage in cn_swap_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "swap_chinese", "Swap request in Chinese"
+         ))
+
+     # 6) swap between tokens
+     swap_cases = [
+         ("Swap 100 USDC for BONK", "USDC", "BONK", "100", None),
+         ("Exchange 50 JUP for WIF", "JUP", "WIF", "50", None),
+         ("Convert all my ETH to USDC", "ETH", "USDC", None, 1.0),
+     ]
+
+     for query, input_token, output_token, amount, percentage in swap_cases:
+         input_ca = TOKENS[input_token]["ca"]
+         output_ca = TOKENS[output_token]["ca"]
+         benchmarks.append(create_benchmark_item(
+             query, "EXECUTE_SWAP",
+             {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": amount, "inputTokenPercentage": percentage},
+             "token_to_token", f"Swap {input_token} to {output_token}"
+         ))
+
+     return benchmarks
+
+
+ def generate_incomplete_benchmarks() -> List[Dict]:
+     """Generate incomplete requests (should ask clarification)."""
+     benchmarks = []
+
+     incomplete_cases = [
+         ("I want to buy some tokens", "incomplete_no_token", "Missing token name"),
+         ("Sell my holdings", "incomplete_no_token", "Missing which token to sell"),
+         ("Search for a token", "incomplete_no_info", "Missing token info"),
+         ("Buy something", "incomplete_vague", "Too vague"),
+         ("我想买币", "incomplete_cn", "Missing token (Chinese)"),
+         ("帮我卖掉", "incomplete_cn", "Missing token and amount (Chinese)"),
+         ("Swap tokens", "incomplete_swap", "Missing swap details"),
+         ("I want to trade", "incomplete_trade", "Missing trade details"),
+     ]
+
+     for query, category, description in incomplete_cases:
+         benchmarks.append(create_benchmark_item(
+             query, None, None, category, description
+         ))
+
+     return benchmarks
+
+
+ def generate_irrelevant_benchmarks() -> List[Dict]:
+     """Generate irrelevant requests (should not call any function)."""
+     benchmarks = []
+
+     irrelevant_cases = [
+         ("What's the weather today?", "irrelevant_weather", "Weather query"),
+         ("Tell me a joke", "irrelevant_joke", "Joke request"),
+         ("What time is it?", "irrelevant_time", "Time query"),
+         ("Who is the president?", "irrelevant_general", "General knowledge"),
+         ("今天天气怎么样?", "irrelevant_cn", "Weather (Chinese)"),
+         ("给我讲个笑话", "irrelevant_cn", "Joke (Chinese)"),
+         ("Hello, how are you?", "irrelevant_greeting", "Greeting"),
+         ("What is Bitcoin?", "irrelevant_info", "Info request (no action)"),
+     ]
+
+     for query, category, description in irrelevant_cases:
+         benchmarks.append(create_benchmark_item(
+             query, None, None, category, description
+         ))
+
+     return benchmarks
+
+
+ def generate_benchmark_dataset(output_path: str = str(DEFAULT_BENCHMARK_PATH)):
+     """Generate the full benchmark dataset."""
+
+     print("=" * 60)
+     print("Generating FunctionGemma benchmark dataset")
+     print("=" * 60)
+
+     # Collect all cases
+     all_benchmarks = []
+
+     # SEARCH_TOKEN cases
+     search_benchmarks = generate_search_token_benchmarks()
+     print(f"SEARCH_TOKEN cases: {len(search_benchmarks)}")
+     all_benchmarks.extend(search_benchmarks)
+
+     # EXECUTE_SWAP cases
+     swap_benchmarks = generate_execute_swap_benchmarks()
+     print(f"EXECUTE_SWAP cases: {len(swap_benchmarks)}")
+     all_benchmarks.extend(swap_benchmarks)
+
+     # Incomplete requests
+     incomplete_benchmarks = generate_incomplete_benchmarks()
+     print(f"Incomplete request cases: {len(incomplete_benchmarks)}")
+     all_benchmarks.extend(incomplete_benchmarks)
+
+     # Irrelevant requests
+     irrelevant_benchmarks = generate_irrelevant_benchmarks()
+     print(f"Irrelevant request cases: {len(irrelevant_benchmarks)}")
+     all_benchmarks.extend(irrelevant_benchmarks)
+
+     # Pad to 100 if needed
+     while len(all_benchmarks) < 100:
+         # Add a few variants
+         extra_cases = [
+             ("Buy 3 SOL of TRUMP", "SOL", "TRUMP", "3", None, "EXECUTE_SWAP"),
+             ("Search for TRUMP token", "TRUMP", "solana", None, None, "SEARCH_TOKEN"),
+         ]
+         for case in extra_cases:
+             if len(all_benchmarks) >= 100:
+                 break
+             if case[5] == "EXECUTE_SWAP":
+                 input_ca = TOKENS[case[1]]["ca"]
+                 output_ca = TOKENS[case[2]]["ca"]
+                 all_benchmarks.append(create_benchmark_item(
+                     case[0], "EXECUTE_SWAP",
+                     {"inputTokenCA": input_ca, "outputTokenCA": output_ca, "inputTokenAmount": case[3], "inputTokenPercentage": case[4]},
+                     "extra", "Extra test case"
+                 ))
+             else:
+                 all_benchmarks.append(create_benchmark_item(
+                     case[0], "SEARCH_TOKEN",
+                     {"symbol": case[1], "chain": case[2]},
+                     "extra", "Extra test case"
+                 ))
+
+     # Limit to 100
+     all_benchmarks = all_benchmarks[:100]
+
+     # Assign ids
+     for i, item in enumerate(all_benchmarks):
+         item["id"] = i + 1
+
+     # Shuffle
+     random.seed(42)
+     random.shuffle(all_benchmarks)
+
+     # Re-assign ids
+     for i, item in enumerate(all_benchmarks):
+         item["id"] = i + 1
+
+     print(f"\nTotal: {len(all_benchmarks)} cases")
+
+     # Category stats
+     categories = {}
+     for item in all_benchmarks:
+         cat = item["category"]
+         categories[cat] = categories.get(cat, 0) + 1
+
+     print("\nCategory distribution:")
+     for cat, count in sorted(categories.items()):
+         print(f"  - {cat}: {count}")
+
+     # Function stats
+     func_counts = {"SEARCH_TOKEN": 0, "EXECUTE_SWAP": 0, "None": 0}
+     for item in all_benchmarks:
+         func = item["expected"]["function_name"]
+         if func:
+             func_counts[func] = func_counts.get(func, 0) + 1
+         else:
+             func_counts["None"] += 1
+
+     print("\nFunction distribution:")
+     for func, count in func_counts.items():
+         print(f"  - {func}: {count}")
+
+     # Save
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(all_benchmarks, f, ensure_ascii=False, indent=2)
+
+     print(f"\nBenchmark saved to: {output_path}")
+
+     # Show examples
+     print("\n" + "=" * 60)
+     print("Examples:")
+     print("=" * 60)
+
+     for i, item in enumerate(all_benchmarks[:3]):
+         print(f"\n--- Example {i+1} ---")
+         print(f"ID: {item['id']}")
+         print(f"Category: {item['category']}")
+         print(f"Input: {item['input']['messages'][1]['content']}")
+         print(f"Expected function: {item['expected']['function_name']}")
+         if item['expected']['arguments']:
+             print(f"Expected args: {json.dumps(item['expected']['arguments'], ensure_ascii=False)}")
+
+     return all_benchmarks
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Generate FunctionGemma benchmark dataset")
+     parser.add_argument("--output", type=str, default=str(DEFAULT_BENCHMARK_PATH), help="Output file path")
+     args = parser.parse_args()
+
+     output_path = Path(args.output)
+     output_path.parent.mkdir(parents=True, exist_ok=True)
+
+     generate_benchmark_dataset(str(output_path))
+
+
+ if __name__ == "__main__":
+     main()
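The padding step in `generate_benchmark_dataset` above tops the case list up to exactly 100 items by cycling through two extra variants. A minimal sketch of that top-up logic, with a hypothetical starting count of 65 generated cases (the real count depends on the generators above):

```python
# Sketch of the pad-to-100 loop: keep appending the extra variants
# until the benchmark list reaches exactly 100 items.
cases = ["generated_case"] * 65           # hypothetical pre-padding count
extras = ["extra_swap", "extra_search"]   # stand-ins for the two extra cases

while len(cases) < 100:
    for extra in extras:
        if len(cases) >= 100:
            break                          # stop mid-cycle once full
        cases.append(extra)

cases = cases[:100]                        # defensive cap, as in the script
print(len(cases))
```

Because the inner `break` fires as soon as the list is full, the loop lands on exactly 100 regardless of whether the shortfall is odd or even.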
src/prepare_dataset.py ADDED
@@ -0,0 +1,199 @@
+ #!/usr/bin/env python3
+ """
+ Data preprocessing script.
+
+ Convert the generated dataset into a format directly consumable by SFTTrainer.
+ FunctionGemma expects a specific chat template structure.
+
+ Usage:
+     python -m src.prepare_dataset --input ./data/training_data.json --output ./data/prepared_dataset.json
+ """
+
+ import json
+ import argparse
+ from pathlib import Path
+ from typing import List, Dict, Any
+
+
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ DEFAULT_INPUT = PROJECT_ROOT / "data" / "training_data.json"
+ DEFAULT_OUTPUT = PROJECT_ROOT / "data" / "prepared_dataset.json"
+
+
+ def convert_tool_calls_to_text(tool_calls: List[Dict]) -> str:
+     """Convert tool_calls into plain text (FunctionGemma format)."""
+     if not tool_calls:
+         return ""
+
+     result_parts = []
+     for tc in tool_calls:
+         func = tc.get("function", {})
+         name = func.get("name", "")
+         args = func.get("arguments", {})
+
+         # FunctionGemma format: functionName(arguments)
+         args_str = json.dumps(args, ensure_ascii=False)
+         result_parts.append(f"{name}({args_str})")
+
+     return "\n".join(result_parts)
+
+
+ def convert_messages_for_sft(messages: List[Dict], tools: List[Dict] = None) -> List[Dict]:
+     """
+     Convert message format for SFTTrainer.
+
+     Input:
+     [
+         {"role": "developer", "content": "..."},
+         {"role": "user", "content": "..."},
+         {"role": "assistant", "tool_calls": [...]} or {"role": "assistant", "content": "..."}
+     ]
+
+     Output:
+     [
+         {"role": "system", "content": "..."},    # developer -> system
+         {"role": "user", "content": "..."},
+         {"role": "assistant", "content": "..."}  # tool_calls flattened to text
+     ]
+     """
+     converted = []
+
+     # Build tools description
+     tools_description = ""
+     if tools:
+         tools_desc_parts = []
+         for tool in tools:
+             if tool.get("type") == "function":
+                 func = tool.get("function", {})
+                 name = func.get("name", "")
+                 desc = func.get("description", "")
+                 params = func.get("parameters", {})
+                 tools_desc_parts.append(f"- {name}: {desc}")
+         if tools_desc_parts:
+             tools_description = "\n\nAvailable tools:\n" + "\n".join(tools_desc_parts)
+
+     for msg in messages:
+         role = msg.get("role", "")
+
+         if role == "developer":
+             # developer -> system
+             content = msg.get("content", "")
+             if tools_description:
+                 content = content + tools_description
+             converted.append({
+                 "role": "system",
+                 "content": content
+             })
+
+         elif role == "user":
+             converted.append({
+                 "role": "user",
+                 "content": msg.get("content", "")
+             })
+
+         elif role == "assistant":
+             if "tool_calls" in msg:
+                 # Convert tool_calls to text
+                 tool_calls_text = convert_tool_calls_to_text(msg["tool_calls"])
+                 converted.append({
+                     "role": "assistant",
+                     "content": tool_calls_text
+                 })
+             else:
+                 converted.append({
+                     "role": "assistant",
+                     "content": msg.get("content", "")
+                 })
+
+         elif role == "tool":
+             # Tool response
+             converted.append({
+                 "role": "tool",
+                 "content": msg.get("content", "")
+             })
+
+     return converted
+
+
+ def prepare_dataset(input_path: str, output_path: str, format_type: str = "messages"):
+     """
+     Prepare dataset.
+
+     format_type:
+     - "messages": output {"messages": [...]}
+     - "text": output {"text": "..."} (flattened text)
+     """
+     print(f"Loading dataset: {input_path}")
+
+     with open(input_path, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     print(f"Raw samples: {len(data)}")
+
+     prepared_data = []
+
+     for i, item in enumerate(data):
+         messages = item.get("messages", [])
+         tools = item.get("tools", [])
+
+         # Convert messages
+         converted_messages = convert_messages_for_sft(messages, tools)
+
+         if format_type == "messages":
+             prepared_data.append({
+                 "messages": converted_messages
+             })
+         elif format_type == "text":
+             # Convert to plain text
+             text_parts = []
+             for msg in converted_messages:
+                 role = msg["role"]
+                 content = msg["content"]
+                 if role == "system":
+                     text_parts.append(f"<start_of_turn>system\n{content}<end_of_turn>")
+                 elif role == "user":
+                     text_parts.append(f"<start_of_turn>user\n{content}<end_of_turn>")
+                 elif role == "assistant":
+                     text_parts.append(f"<start_of_turn>model\n{content}<end_of_turn>")
+
+             prepared_data.append({
+                 "text": "\n".join(text_parts)
+             })
+
+     print(f"Processed samples: {len(prepared_data)}")
+
+     # Save
+     with open(output_path, 'w', encoding='utf-8') as f:
+         json.dump(prepared_data, f, ensure_ascii=False, indent=2)
+
+     print(f"Saved to: {output_path}")
+
+     # Show example
+     print("\n" + "=" * 60)
+     print("Example:")
+     print("=" * 60)
+
+     if format_type == "messages":
+         example = prepared_data[0]
+         for msg in example["messages"]:
+             print(f"\n[{msg['role']}]")
+             print(msg["content"][:200] + "..." if len(msg["content"]) > 200 else msg["content"])
+     else:
+         print(prepared_data[0]["text"][:500] + "...")
+
+     return prepared_data
+
+
+ def main():
+     parser = argparse.ArgumentParser(description="Dataset preparation")
+     parser.add_argument("--input", type=str, default=str(DEFAULT_INPUT), help="Input file path")
+     parser.add_argument("--output", type=str, default=str(DEFAULT_OUTPUT), help="Output file path")
+     parser.add_argument("--format", type=str, choices=["messages", "text"], default="messages", help="Output format")
+
+     args = parser.parse_args()
+
+     prepare_dataset(args.input, args.output, args.format)
+
+
+ if __name__ == "__main__":
+     main()
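The flattening step in `src/prepare_dataset.py` turns a structured `tool_calls` entry into the `functionName({...})` text form. A small self-contained sketch of that conversion, using a `SEARCH_TOKEN` call as the example input (the function body mirrors `convert_tool_calls_to_text` above):

```python
import json

def convert_tool_calls_to_text(tool_calls):
    # Flatten each tool call to FunctionGemma-style `name(json_args)` text.
    result_parts = []
    for tc in tool_calls:
        func = tc.get("function", {})
        args_str = json.dumps(func.get("arguments", {}), ensure_ascii=False)
        result_parts.append(f"{func.get('name', '')}({args_str})")
    return "\n".join(result_parts)

call = [{"type": "function",
         "function": {"name": "SEARCH_TOKEN",
                      "arguments": {"symbol": "BONK", "chain": "solana"}}}]
print(convert_tool_calls_to_text(call))
# SEARCH_TOKEN({"symbol": "BONK", "chain": "solana"})
```

The assistant turn in the prepared dataset then carries this string as plain `content`, so the trainer sees the call as ordinary text.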
src/train.py ADDED
@@ -0,0 +1,559 @@
+ #!/usr/bin/env python3
+ """
+ FunctionGemma SFT fine-tuning script.
+
+ Runs TRL SFTTrainer for FunctionGemma with two modes:
+ 1) LoRA (recommended): faster, lower memory, less overfit
+ 2) Full-parameter: higher cost, maximal capacity
+
+ Usage:
+     # LoRA (default)
+     python -m src.train \
+         --model_path /path/to/model \
+         --dataset_path ./data/training_data.json \
+         --bf16
+
+     # Full-parameter
+     python -m src.train \
+         --model_path /path/to/model \
+         --dataset_path ./data/training_data.json \
+         --no-use-lora \
+         --bf16
+ """
+
+ import os
+ import json
+ import argparse
+ import logging
+ from datetime import datetime
+ from pathlib import Path
+ from typing import Optional
+
+ import torch
+ from datasets import Dataset, load_dataset
+ from transformers import (
+     AutoModelForCausalLM,
+     AutoTokenizer,
+     TrainingArguments,
+     BitsAndBytesConfig,
+ )
+ from peft import LoraConfig, get_peft_model, TaskType, prepare_model_for_kbit_training
+ from trl import SFTTrainer, SFTConfig
+
+ # Paths and logging
+ PROJECT_ROOT = Path(__file__).resolve().parent.parent
+ DEFAULT_DATA_PATH = PROJECT_ROOT / "data" / "training_data.json"
+ DEFAULT_OUTPUT_DIR = PROJECT_ROOT / "runs"
+
+ logging.basicConfig(
+     level=logging.INFO,
+     format='%(asctime)s - %(levelname)s - %(message)s'
+ )
+ logger = logging.getLogger(__name__)
+
+
+ def parse_args():
+     """Parse CLI arguments."""
+     parser = argparse.ArgumentParser(description="FunctionGemma SFT fine-tuning (LoRA / full)")
+
+     # Model
+     parser.add_argument(
+         "--model_path",
+         type=str,
+         default="google/functiongemma-270m-it",
+         help="Model path or HF model id"
+     )
+     parser.add_argument(
+         "--tokenizer_path",
+         type=str,
+         default=None,
+         help="Tokenizer path (defaults to model_path)"
+     )
+
+     # Dataset
+     parser.add_argument(
+         "--dataset_path",
+         type=str,
+         default=str(DEFAULT_DATA_PATH),
+         help="Training dataset path"
+     )
+     parser.add_argument(
+         "--val_split",
+         type=float,
+         default=0.1,
+         help="Validation split ratio"
+     )
+
+     # Output
+     parser.add_argument(
+         "--output_dir",
+         type=str,
+         default=str(DEFAULT_OUTPUT_DIR),
+         help="Root output directory"
+     )
+     parser.add_argument(
+         "--run_name",
+         type=str,
+         default=None,
+         help="Run name for logging and saving"
+     )
+
+     # Fine-tuning mode
+     parser.add_argument(
+         "--use_lora",
+         action="store_true",
+         default=True,
+         help="Enable LoRA (recommended). Add --no-use-lora for full-parameter finetune"
+     )
+     parser.add_argument("--no-use-lora", dest="use_lora", action="store_false", help="Disable LoRA, run full-parameter finetune")
+
+     # LoRA (only when use_lora=True)
+     parser.add_argument("--lora_r", type=int, default=16, help="LoRA rank")
+     parser.add_argument("--lora_alpha", type=int, default=32, help="LoRA alpha")
+     parser.add_argument("--lora_dropout", type=float, default=0.05, help="LoRA dropout")
+     parser.add_argument(
+         "--target_modules",
+         type=str,
+         nargs="+",
+         default=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
+         help="Target modules for LoRA"
+     )
+
+     # Training (aligned with FunctionGemma guidance)
+     parser.add_argument("--num_train_epochs", type=int, default=6, help="Training epochs (official rec: 8)")
+     parser.add_argument("--max_steps", type=int, default=-1, help="Max training steps (-1 to use epochs)")
+     parser.add_argument("--per_device_train_batch_size", type=int, default=4, help="Train batch size per device")
+     parser.add_argument("--per_device_eval_batch_size", type=int, default=2, help="Eval batch size")
+     parser.add_argument("--gradient_accumulation_steps", type=int, default=8, help="Grad accumulation steps")
+     parser.add_argument("--learning_rate", type=float, default=5e-5, help="Learning rate")
+     parser.add_argument("--weight_decay", type=float, default=0.01, help="Weight decay")
+     parser.add_argument("--warmup_ratio", type=float, default=0.0, help="Warmup ratio (constant scheduler usually skips warmup)")
+     parser.add_argument("--max_seq_length", type=int, default=2048, help="Max sequence length (model supports up to 32768)")
+     parser.add_argument("--lr_scheduler_type", type=str, default="constant", help="LR scheduler type (default constant)")
+
+     # Precision & optimization
+     parser.add_argument("--bf16", action="store_true", help="Use BF16")
+     parser.add_argument("--fp16", action="store_true", help="Use FP16")
+     parser.add_argument("--use_4bit", action="store_true", help="Enable 4-bit quant (QLoRA)")
+     parser.add_argument("--use_8bit", action="store_true", help="Enable 8-bit quant")
+     parser.add_argument("--use_flash_attention", action="store_true", help="Enable Flash Attention 2")
+     parser.add_argument("--gradient_checkpointing", action="store_true", help="Enable gradient checkpointing")
+
+     # Logging & saving
+     parser.add_argument("--logging_steps", type=int, default=10, help="Log every N steps")
+     parser.add_argument("--save_steps", type=int, default=100, help="Save checkpoint every N steps")
+     parser.add_argument("--eval_steps", type=int, default=100, help="Eval every N steps")
+     parser.add_argument("--save_total_limit", type=int, default=3, help="Max checkpoints to keep")
+
+     # Misc
+     parser.add_argument("--seed", type=int, default=42, help="Random seed")
+     parser.add_argument("--resume_from_checkpoint", type=str, default=None, help="Resume from checkpoint")
+     parser.add_argument("--push_to_hub", action="store_true", help="Push to Hugging Face Hub")
+     parser.add_argument("--hub_model_id", type=str, default=None, help="Hub model id")
+
+     return parser.parse_args()
+
+
+ def load_and_prepare_dataset(dataset_path: str, val_split: float = 0.1):
+     """Load and normalize dataset structure for SFT."""
+     logger.info(f"Loading dataset: {dataset_path}")
+
+     # Load JSON dataset
+     with open(dataset_path, 'r', encoding='utf-8') as f:
+         data = json.load(f)
+
+     logger.info(f"Dataset size: {len(data)} samples")
+
+     # Normalize nested structures:
+     # if an item has input.messages/tools, lift them to top-level
+     processed_data = []
+     for idx, item in enumerate(data):
+         if 'input' in item and 'messages' in item['input']:
+             # Deep copy messages to avoid mutating original
+             messages = json.loads(json.dumps(item['input']['messages']))
+
+             # Fix tool_calls formatting if present
+             for msg in messages:
+                 if 'tool_calls' in msg and msg['tool_calls']:
+                     for tc in msg['tool_calls']:
+                         if 'function' in tc and 'arguments' in tc['function']:
+                             args = tc['function']['arguments']
+                             # ensure arguments is a string
+                             if not isinstance(args, str):
+                                 tc['function']['arguments'] = json.dumps(args)
+
+             # Convert expected field into assistant response if present
+             if 'expected' in item and item['expected']:
+                 expected = item['expected']
+                 # If last message is not assistant, append one
+                 if messages[-1]['role'] != 'assistant':
+                     # Decide between function call or refusal
+                     function_name = expected.get('function_name')
+                     arguments = expected.get('arguments')
+                     response = expected.get('response', '')
+
+                     if function_name is not None and arguments is not None:
+                         # Case 1: function call -> add tool_calls
+                         arguments_str = json.dumps(arguments) if isinstance(arguments, dict) else str(arguments)
+
+                         assistant_msg = {
+                             "role": "assistant",
+                             "content": None,
+                             "tool_calls": [{
+                                 "id": f"call_{hash(function_name + arguments_str) % 1000000}",  # generate unique id
+                                 "type": "function",
+                                 "function": {
+                                     "name": function_name,
+                                     "arguments": arguments_str
+                                 }
+                             }]
+                         }
+                         messages.append(assistant_msg)
+                         logger.debug(f"Added assistant tool_calls: {function_name}")
+                     elif function_name is None and arguments is None and response:
+                         # Case 2: refusal -> plain text response
+                         assistant_msg = {
+                             "role": "assistant",
+                             "content": response
+                         }
+                         messages.append(assistant_msg)
+                         logger.debug(f"Added assistant refusal response: {response[:50]}")
+                     else:
+                         logger.warning(f"Unknown expected format: {expected}")
+
+             processed_item = {
+                 'messages': messages
+             }
+
+             # include tools if present
+             if 'tools' in item['input']:
+                 processed_item['tools'] = item['input']['tools']
+
+             # preserve id
+             if 'id' in item:
+                 processed_item['id'] = item['id']
+
+             # Final check: tool_calls arguments must be strings
+             for msg in processed_item['messages']:
+                 if 'tool_calls' in msg and msg['tool_calls']:
+                     for tc in msg['tool_calls']:
+                         if 'function' in tc and 'arguments' in tc['function']:
+                             if not isinstance(tc['function']['arguments'], str):
+                                 logger.error(f"Sample {idx} arguments not string: {type(tc['function']['arguments'])}")
+                                 tc['function']['arguments'] = json.dumps(tc['function']['arguments'])
+
+             processed_data.append(processed_item)
+
+         elif 'messages' in item:
+             # Already proper format, just normalize tool_calls
+             messages = json.loads(json.dumps(item['messages']))
+             for msg in messages:
+                 if 'tool_calls' in msg and msg['tool_calls']:
+                     for tc in msg['tool_calls']:
+                         if 'function' in tc and 'arguments' in tc['function']:
+                             if not isinstance(tc['function']['arguments'], str):
+                                 tc['function']['arguments'] = json.dumps(tc['function']['arguments'])
+             item_copy = dict(item)
+             item_copy['messages'] = messages
+             processed_data.append(item_copy)
+         else:
+             logger.warning(f"Skip malformed item: {item.get('id', 'unknown')}")
+
+     logger.info(f"Processed dataset size: {len(processed_data)}")
+
+     # Validate format
+     tool_calls_count = 0
+     for item in processed_data:
+         for msg in item['messages']:
+             if 'tool_calls' in msg and msg['tool_calls']:
+                 tool_calls_count += 1
+                 for tc in msg['tool_calls']:
+                     if 'function' in tc and 'arguments' in tc['function']:
+                         if not isinstance(tc['function']['arguments'], str):
+                             logger.error(f"Found non-string arguments: {type(tc['function']['arguments'])}")
+     logger.info(f"Messages containing tool_calls: {tool_calls_count}")
+
+     # Convert to Hugging Face Dataset
+     dataset = Dataset.from_list(processed_data)
+
+     # Split train/val
+     if val_split > 0:
+         dataset = dataset.train_test_split(test_size=val_split, seed=42)
+         train_dataset = dataset['train']
+         eval_dataset = dataset['test']
+         logger.info(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}")
+     else:
+         train_dataset = dataset
+         eval_dataset = None
+         logger.info(f"Train: {len(train_dataset)}, no eval split")
+
+     return train_dataset, eval_dataset
+
+
+ def get_quantization_config(use_4bit: bool, use_8bit: bool):
294
+ """Build quantization config if requested."""
295
+ if use_4bit:
296
+ logger.info("Using 4-bit quantization (QLoRA)")
297
+ return BitsAndBytesConfig(
298
+ load_in_4bit=True,
299
+ bnb_4bit_quant_type="nf4",
300
+ bnb_4bit_compute_dtype=torch.bfloat16,
301
+ bnb_4bit_use_double_quant=True,
302
+ )
303
+ elif use_8bit:
304
+ logger.info("Using 8-bit quantization")
305
+ return BitsAndBytesConfig(
306
+ load_in_8bit=True,
307
+ )
308
+ return None
309
+
310
+
311
+ def load_model_and_tokenizer(args):
+     """Load model and tokenizer."""
+     logger.info(f"Loading model: {args.model_path}")
+
+     tokenizer_path = args.tokenizer_path or args.model_path
+
+     # Load tokenizer
+     tokenizer = AutoTokenizer.from_pretrained(
+         tokenizer_path,
+         trust_remote_code=True,
+         padding_side="right",
+     )
+
+     # Ensure pad token exists
+     if tokenizer.pad_token is None:
+         tokenizer.pad_token = tokenizer.eos_token
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+     # Quantization config
+     quantization_config = get_quantization_config(args.use_4bit, args.use_8bit)
+
+     # Model kwargs
+     model_kwargs = {
+         "trust_remote_code": True,
+         "device_map": "auto",
+     }
+
+     if quantization_config:
+         model_kwargs["quantization_config"] = quantization_config
+
+     # Precision
+     if args.bf16 and not (args.use_4bit or args.use_8bit):
+         model_kwargs["torch_dtype"] = torch.bfloat16
+     elif args.fp16 and not (args.use_4bit or args.use_8bit):
+         model_kwargs["torch_dtype"] = torch.float16
+
+     # Flash Attention
+     if args.use_flash_attention:
+         model_kwargs["attn_implementation"] = "flash_attention_2"
+         logger.info("Using Flash Attention 2")
+
+     # Load model
+     model = AutoModelForCausalLM.from_pretrained(
+         args.model_path,
+         **model_kwargs
+     )
+
+     # Prepare for k-bit training when quantized
+     if args.use_4bit or args.use_8bit:
+         model = prepare_model_for_kbit_training(model)
+
+     # Gradient checkpointing
+     if args.gradient_checkpointing:
+         model.gradient_checkpointing_enable()
+         logger.info("Enabled gradient checkpointing")
+
+     logger.info(f"Model parameters: {model.num_parameters():,}")
+
+     return model, tokenizer
+
+
+ def get_lora_config(args):
+     """Build LoRA config."""
+     logger.info(f"LoRA config: r={args.lora_r}, alpha={args.lora_alpha}, dropout={args.lora_dropout}")
+     logger.info(f"Target modules: {args.target_modules}")
+
+     return LoraConfig(
+         r=args.lora_r,
+         lora_alpha=args.lora_alpha,
+         lora_dropout=args.lora_dropout,
+         target_modules=args.target_modules,
+         bias="none",
+         task_type=TaskType.CAUSAL_LM,
+     )
+
+
+ def formatting_func(example):
+     """
+     Format function: pass data through for SFTTrainer.
+
+     Dataset format:
+     {
+         "messages": [
+             {"role": "developer", "content": "..."},
+             {"role": "user", "content": "..."},
+             {"role": "assistant", "tool_calls": [...]} or {"role": "assistant", "content": "..."}
+         ],
+         "tools": [...]
+     }
+     """
+     # Return as-is; SFTTrainer applies chat template
+     return example
+
+
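To make the docstring's record layout concrete, here is one hypothetical record in that shape. The `get_weather` tool and the inner structure of `tool_calls` are illustrative assumptions, not taken from the real dataset:

```python
import json

# One hypothetical training record in the shape the docstring describes
example = {
    "messages": [
        {"role": "developer", "content": "You can call tools to answer."},
        {"role": "user", "content": "What's the weather in Paris?"},
        {"role": "assistant", "tool_calls": [
            # tool_calls payload is illustrative; the real schema may differ
            {"name": "get_weather", "arguments": {"city": "Paris"}}
        ]},
    ],
    "tools": [
        {"name": "get_weather", "parameters": {"city": {"type": "string"}}}
    ],
}

# Sanity checks mirroring the documented schema
assert {m["role"] for m in example["messages"]} <= {"developer", "user", "assistant"}
last = example["messages"][-1]
assert ("content" in last) or ("tool_calls" in last)
print(json.dumps(example)[:60])
```

Records like this are passed through unchanged; the chat template is applied later by SFTTrainer.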
+ def main():
+     args = parse_args()
+
+     # Set run name
+     if args.run_name is None:
+         args.run_name = f"functiongemma-lora-{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+
+     # Create output directory
+     output_dir = os.path.join(args.output_dir, args.run_name)
+     os.makedirs(output_dir, exist_ok=True)
+
+     logger.info("=" * 60)
+     logger.info("FunctionGemma SFT LoRA training")
+     logger.info("=" * 60)
+     logger.info(f"Output dir: {output_dir}")
+
+     # Save config
+     config_path = os.path.join(output_dir, "training_config.json")
+     with open(config_path, 'w') as f:
+         json.dump(vars(args), f, indent=2)
+     logger.info(f"Config saved to: {config_path}")
+
+     # Load dataset
+     train_dataset, eval_dataset = load_and_prepare_dataset(
+         args.dataset_path,
+         args.val_split
+     )
+
+     # Load model + tokenizer
+     model, tokenizer = load_model_and_tokenizer(args)
+
+     # Build LoRA config if enabled
+     if args.use_lora:
+         logger.info("=" * 60)
+         logger.info("LoRA fine-tuning mode")
+         logger.info("=" * 60)
+         lora_config = get_lora_config(args)
+     else:
+         logger.info("=" * 60)
+         logger.info("Full-parameter fine-tuning mode")
+         logger.info("Warning: full fine-tuning needs more memory and time!")
+         logger.info("=" * 60)
+         lora_config = None
+
+     # SFTTrainer config
+     training_args = SFTConfig(
+         output_dir=output_dir,
+         run_name=args.run_name,
+
+         # Sequence length / packing
+         max_length=args.max_seq_length,
+         packing=False,
+
+         # Training
+         num_train_epochs=args.num_train_epochs,
+         max_steps=args.max_steps,
+         per_device_train_batch_size=args.per_device_train_batch_size,
+         per_device_eval_batch_size=args.per_device_eval_batch_size,
+         gradient_accumulation_steps=args.gradient_accumulation_steps,
+
+         # Optimizer
+         learning_rate=args.learning_rate,
+         weight_decay=args.weight_decay,
+         warmup_ratio=args.warmup_ratio,
+         lr_scheduler_type=args.lr_scheduler_type,
+         optim="adamw_torch_fused",
+
+         # Precision
+         bf16=args.bf16,
+         fp16=args.fp16,
+
+         # Logging / saving
+         logging_steps=args.logging_steps,
+         save_steps=args.save_steps,
+         eval_steps=args.eval_steps if eval_dataset else None,
+         eval_strategy="steps" if eval_dataset else "no",
+         save_total_limit=args.save_total_limit,
+         load_best_model_at_end=True if eval_dataset else False,
+
+         # Misc
+         seed=args.seed,
+         report_to=["tensorboard"],
+
+         # Hub
+         push_to_hub=args.push_to_hub,
+         hub_model_id=args.hub_model_id,
+
+         # Gradient checkpointing
+         gradient_checkpointing=args.gradient_checkpointing,
+         gradient_checkpointing_kwargs={"use_reentrant": False} if args.gradient_checkpointing else None,
+     )
+
+     # Create SFTTrainer
+     # Dataset should include 'messages' and 'tools'; SFTTrainer applies chat template automatically
+     trainer = SFTTrainer(
+         model=model,
+         args=training_args,
+         train_dataset=train_dataset,
+         eval_dataset=eval_dataset,
+         processing_class=tokenizer,  # newer TRL uses processing_class instead of tokenizer
+         peft_config=lora_config,
+     )
+
+     # Parameter stats
+     trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+     total_params = sum(p.numel() for p in model.parameters())
+     trainable_percentage = 100 * trainable_params / total_params if total_params > 0 else 0
+
+     logger.info("=" * 60)
+     logger.info("Model parameter stats:")
+     logger.info(f"  Total params: {total_params:,}")
+     logger.info(f"  Trainable params: {trainable_params:,}")
+     logger.info(f"  Trainable ratio: {trainable_percentage:.2f}%")
+     logger.info(f"  Mode: {'LoRA' if args.use_lora else 'Full fine-tune'}")
+     logger.info("=" * 60)
+
+     # Train
+     logger.info("Start training...")
+
+     if args.resume_from_checkpoint:
+         trainer.train(resume_from_checkpoint=args.resume_from_checkpoint)
+     else:
+         trainer.train()
+
+     # Save final model
+     logger.info("Saving final model...")
+     final_model_path = os.path.join(output_dir, "final_model")
+     trainer.save_model(final_model_path)
+     tokenizer.save_pretrained(final_model_path)
+
+     logger.info("=" * 60)
+     logger.info("Training done.")
+     logger.info(f"Model saved at: {final_model_path}")
+
+     if args.use_lora:
+         # LoRA: also save adapter
+         lora_path = os.path.join(output_dir, "lora_adapter")
+         model.save_pretrained(lora_path)
+         tokenizer.save_pretrained(lora_path)
+         logger.info(f"LoRA adapter saved to: {lora_path}")
+         logger.info("")
+         logger.info("Usage:")
+         logger.info(f"  1. LoRA adapter: {lora_path}")
+         logger.info("  2. Merge adapters with your base model before inference")
+     else:
+         # Full fine-tune: final_model is ready to use
+         logger.info("")
+         logger.info("Usage:")
+         logger.info(f"  Use model directly from: {final_model_path}")
+
+     logger.info("=" * 60)
+
+
+ if __name__ == "__main__":
+     main()