wheel llama cpp was added
Browse files- .gitattributes +1 -0
- .gitignore +1 -1
- BUILD_INSTRUCTIONS.md +0 -89
- Dockerfile +9 -8
- GRAMMAR_CHANGES.md +0 -100
- app.py +373 -224
- config.py +10 -7
- requirements.txt +0 -2
- test.ipynb +0 -24
- wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
*.whl filter=lfs diff=lfs merge=lfs -text
|
.gitignore
CHANGED
|
@@ -15,7 +15,6 @@ lib64/
|
|
| 15 |
parts/
|
| 16 |
sdist/
|
| 17 |
var/
|
| 18 |
-
wheels/
|
| 19 |
*.egg-info/
|
| 20 |
.installed.cfg
|
| 21 |
*.egg
|
|
@@ -69,3 +68,4 @@ temp/
|
|
| 69 |
# Test files
|
| 70 |
test*
|
| 71 |
test.ipynb
|
|
|
|
|
|
| 15 |
parts/
|
| 16 |
sdist/
|
| 17 |
var/
|
|
|
|
| 18 |
*.egg-info/
|
| 19 |
.installed.cfg
|
| 20 |
*.egg
|
|
|
|
| 68 |
# Test files
|
| 69 |
test*
|
| 70 |
test.ipynb
|
| 71 |
+
logs.txt
|
BUILD_INSTRUCTIONS.md
DELETED
|
@@ -1,89 +0,0 @@
|
|
| 1 |
-
# Инструкции по сборке Docker образа с предзагруженной моделью
|
| 2 |
-
|
| 3 |
-
## Обзор изменений
|
| 4 |
-
|
| 5 |
-
Dockerfile был модифицирован для предварительной загрузки модели Hugging Face во время сборки образа. Это обеспечивает:
|
| 6 |
-
|
| 7 |
-
- ✅ Быстрое развертывание (модель уже в контейнере)
|
| 8 |
-
- ✅ Надежность (нет зависимости от сети при запуске)
|
| 9 |
-
- ✅ Консистентность (фиксированная версия модели)
|
| 10 |
-
|
| 11 |
-
## Сборка образа
|
| 12 |
-
|
| 13 |
-
### Базовая сборка (для публичных моделей):
|
| 14 |
-
|
| 15 |
-
```bash
|
| 16 |
-
docker build -t llm-structured-output .
|
| 17 |
-
```
|
| 18 |
-
|
| 19 |
-
### Сборка с токеном Hugging Face (для приватных моделей):
|
| 20 |
-
|
| 21 |
-
```bash
|
| 22 |
-
docker build --build-arg HUGGINGFACE_TOKEN=your_token_here -t llm-structured-output .
|
| 23 |
-
```
|
| 24 |
-
|
| 25 |
-
Или через переменную окружения:
|
| 26 |
-
|
| 27 |
-
```bash
|
| 28 |
-
export HUGGINGFACE_TOKEN=your_token_here
|
| 29 |
-
docker build -t llm-structured-output .
|
| 30 |
-
```
|
| 31 |
-
|
| 32 |
-
## Запуск контейнера
|
| 33 |
-
|
| 34 |
-
```bash
|
| 35 |
-
docker run -p 7860:7860 llm-structured-output
|
| 36 |
-
```
|
| 37 |
-
|
| 38 |
-
Приложение будет доступно по адресу: http://localhost:7860
|
| 39 |
-
|
| 40 |
-
## Запуск через docker-compose
|
| 41 |
-
|
| 42 |
-
```bash
|
| 43 |
-
docker-compose up --build
|
| 44 |
-
```
|
| 45 |
-
|
| 46 |
-
## Важные изменения
|
| 47 |
-
|
| 48 |
-
### 1. Dockerfile
|
| 49 |
-
- Добавлен `git-lfs` для работы с большими файлами
|
| 50 |
-
- Добавлена переменная `DOCKER_CONTAINER=true`
|
| 51 |
-
- Добавлен этап предварительной загрузки модели
|
| 52 |
-
- Модель скачивается во время сборки образа
|
| 53 |
-
|
| 54 |
-
### 2. app.py
|
| 55 |
-
- Добавлена проверка на Docker окружение
|
| 56 |
-
- Если модель не найдена в Docker контейнере, выбрасывается ошибка
|
| 57 |
-
- Логика загрузки модели оптимизирована для работы с предзагруженными моделями
|
| 58 |
-
|
| 59 |
-
## Размер образа
|
| 60 |
-
|
| 61 |
-
Образ будет больше из-за включенной модели, но это компенсируется:
|
| 62 |
-
- Быстрым запуском контейнера
|
| 63 |
-
- Отсутствием сетевых зависимостей
|
| 64 |
-
- Возможностью кэширования слоев Docker
|
| 65 |
-
|
| 66 |
-
## Настройка модели
|
| 67 |
-
|
| 68 |
-
Для изменения модели отредактируйте `config.py`:
|
| 69 |
-
|
| 70 |
-
```python
|
| 71 |
-
MODEL_REPO: str = "your-repo/your-model"
|
| 72 |
-
MODEL_FILENAME: str = "your-model.gguf"
|
| 73 |
-
```
|
| 74 |
-
|
| 75 |
-
Затем пересоберите образ.
|
| 76 |
-
|
| 77 |
-
## Отладка
|
| 78 |
-
|
| 79 |
-
Для проверки наличия модели в контейнере:
|
| 80 |
-
|
| 81 |
-
```bash
|
| 82 |
-
docker run -it llm-structured-output ls -la /app/models/
|
| 83 |
-
```
|
| 84 |
-
|
| 85 |
-
Для проверки логов сборки:
|
| 86 |
-
|
| 87 |
-
```bash
|
| 88 |
-
docker build --no-cache -t llm-structured-output .
|
| 89 |
-
```
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Dockerfile
CHANGED
|
@@ -4,14 +4,17 @@ FROM python:3.10-slim
|
|
| 4 |
# Set working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
-
# Install system dependencies required for runtime and
|
| 8 |
RUN apt-get update && apt-get install -y \
|
| 9 |
wget \
|
| 10 |
curl \
|
| 11 |
git \
|
| 12 |
git-lfs \
|
|
|
|
|
|
|
| 13 |
libopenblas-dev \
|
| 14 |
libssl-dev \
|
|
|
|
| 15 |
&& rm -rf /var/lib/apt/lists/*
|
| 16 |
|
| 17 |
# Initialize git-lfs
|
|
@@ -26,7 +29,9 @@ ENV DOCKER_CONTAINER=true
|
|
| 26 |
# Create models directory
|
| 27 |
RUN mkdir -p /app/models
|
| 28 |
|
| 29 |
-
|
|
|
|
|
|
|
| 30 |
|
| 31 |
# Copy requirements first for better Docker layer caching
|
| 32 |
COPY requirements.txt .
|
|
@@ -42,11 +47,7 @@ RUN python -c "import os; from huggingface_hub import hf_hub_download; from conf
|
|
| 42 |
|
| 43 |
# Verify model file exists after build
|
| 44 |
RUN ls -la /app/models/ && \
|
| 45 |
-
[ -
|
| 46 |
-
|
| 47 |
-
# Copy and install llama-cpp-python from local wheel
|
| 48 |
-
COPY wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl /tmp/
|
| 49 |
-
RUN pip install /tmp/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
|
| 50 |
|
| 51 |
# Copy application files
|
| 52 |
COPY . .
|
|
@@ -62,5 +63,5 @@ USER user
|
|
| 62 |
EXPOSE 7860
|
| 63 |
|
| 64 |
# Set entrypoint and default command
|
| 65 |
-
ENTRYPOINT ["./entrypoint.sh"]
|
| 66 |
CMD ["python", "main.py", "--mode", "gradio"]
|
|
|
|
| 4 |
# Set working directory
|
| 5 |
WORKDIR /app
|
| 6 |
|
| 7 |
+
# Install system dependencies required for runtime and compilation
|
| 8 |
RUN apt-get update && apt-get install -y \
|
| 9 |
wget \
|
| 10 |
curl \
|
| 11 |
git \
|
| 12 |
git-lfs \
|
| 13 |
+
build-essential \
|
| 14 |
+
cmake \
|
| 15 |
libopenblas-dev \
|
| 16 |
libssl-dev \
|
| 17 |
+
libgomp1 \
|
| 18 |
&& rm -rf /var/lib/apt/lists/*
|
| 19 |
|
| 20 |
# Initialize git-lfs
|
|
|
|
| 29 |
# Create models directory
|
| 30 |
RUN mkdir -p /app/models
|
| 31 |
|
| 32 |
+
# Copy and install llama-cpp-python from local wheel
|
| 33 |
+
COPY wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl /tmp/
|
| 34 |
+
RUN pip install /tmp/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
|
| 35 |
|
| 36 |
# Copy requirements first for better Docker layer caching
|
| 37 |
COPY requirements.txt .
|
|
|
|
| 47 |
|
| 48 |
# Verify model file exists after build
|
| 49 |
RUN ls -la /app/models/ && \
|
| 50 |
+
[ -n "$(ls /app/models/*.gguf 2>/dev/null)" ] || (echo "No .gguf model file found!" && exit 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# Copy application files
|
| 53 |
COPY . .
|
|
|
|
| 63 |
EXPOSE 7860
|
| 64 |
|
| 65 |
# Set entrypoint and default command
|
| 66 |
+
# ENTRYPOINT ["./entrypoint.sh"]
|
| 67 |
CMD ["python", "main.py", "--mode", "gradio"]
|
GRAMMAR_CHANGES.md
DELETED
|
@@ -1,100 +0,0 @@
|
|
| 1 |
-
# 🔗 Grammar Support Implementation
|
| 2 |
-
|
| 3 |
-
## 📋 Summary
|
| 4 |
-
|
| 5 |
-
Successfully integrated **Grammar-based Structured Output (GBNF)** support from the source project `/Users/ivan/Documents/Proging/free_llm_huggingface/free_llm_structure_output` into the current Docker project.
|
| 6 |
-
|
| 7 |
-
## 🔧 Changes Made
|
| 8 |
-
|
| 9 |
-
### 1. Core Grammar Implementation (`app.py`)
|
| 10 |
-
- ✅ Added `LlamaGrammar` import from `llama_cpp`
|
| 11 |
-
- ✅ Implemented `_json_schema_to_gbnf()` function for JSON Schema → GBNF conversion
|
| 12 |
-
- ✅ Added `use_grammar` parameter to `generate_structured_response()` method
|
| 13 |
-
- ✅ Enhanced generation logic with dual modes:
|
| 14 |
-
- **Grammar Mode**: Uses GBNF constraints for strict JSON enforcement
|
| 15 |
-
- **Schema Guidance Mode**: Uses prompt-based schema guidance
|
| 16 |
-
- ✅ Added `test_grammar_generation()` function for testing
|
| 17 |
-
- ✅ Updated `process_request()` to handle grammar parameter
|
| 18 |
-
|
| 19 |
-
### 2. Gradio Interface Enhancement
|
| 20 |
-
- ✅ Added "🔗 Use Grammar (GBNF) Mode" checkbox
|
| 21 |
-
- ✅ Updated submit button handler to pass grammar parameter
|
| 22 |
-
- ✅ Enhanced model information section with grammar features description
|
| 23 |
-
|
| 24 |
-
### 3. REST API Updates (`api.py`)
|
| 25 |
-
- ✅ Added `use_grammar: bool = True` to `StructuredOutputRequest` model
|
| 26 |
-
- ✅ Updated `/generate` endpoint to support grammar parameter
|
| 27 |
-
- ✅ Updated `/generate_with_file` endpoint with `use_grammar` form field
|
| 28 |
-
- ✅ Enhanced API documentation
|
| 29 |
-
|
| 30 |
-
### 4. Documentation Updates
|
| 31 |
-
- ✅ Updated `README.md` with comprehensive Grammar Mode section
|
| 32 |
-
- ✅ Added feature tags: `grammar`, `gbnf`
|
| 33 |
-
- ✅ Included usage examples for all interfaces
|
| 34 |
-
- ✅ Added mode comparison table
|
| 35 |
-
- ✅ Listed supported schema features
|
| 36 |
-
|
| 37 |
-
### 5. Testing
|
| 38 |
-
- ✅ Created `test_grammar_standalone.py` for validation
|
| 39 |
-
- ✅ Successfully tested grammar generation with multiple schema types:
|
| 40 |
-
- Simple objects with required/optional properties
|
| 41 |
-
- Nested objects with arrays
|
| 42 |
-
- String enums support
|
| 43 |
-
|
| 44 |
-
## 🎯 Key Features Added
|
| 45 |
-
|
| 46 |
-
### Grammar Mode Benefits:
|
| 47 |
-
- **100% valid JSON** - No parsing errors
|
| 48 |
-
- **Schema compliance** - Guaranteed structure adherence
|
| 49 |
-
- **Consistent output** - Reliable format every time
|
| 50 |
-
- **Better performance** - Fewer retry attempts needed
|
| 51 |
-
|
| 52 |
-
### Supported Schema Features:
|
| 53 |
-
- ✅ Objects with required/optional properties
|
| 54 |
-
- ✅ Arrays with typed items
|
| 55 |
-
- ✅ String enums
|
| 56 |
-
- ✅ Numbers and integers
|
| 57 |
-
- ✅ Booleans
|
| 58 |
-
- ✅ Nested objects and arrays
|
| 59 |
-
- ⚠️ Complex conditionals (simplified)
|
| 60 |
-
|
| 61 |
-
## 🎛️ Usage Examples
|
| 62 |
-
|
| 63 |
-
### Gradio Interface:
|
| 64 |
-
- Toggle the "🔗 Use Grammar (GBNF) Mode" checkbox (enabled by default)
|
| 65 |
-
|
| 66 |
-
### REST API:
|
| 67 |
-
```json
|
| 68 |
-
{
|
| 69 |
-
"prompt": "Analyze this data...",
|
| 70 |
-
"json_schema": {
|
| 71 |
-
"type": "object",
|
| 72 |
-
"properties": {
|
| 73 |
-
"result": {"type": "string"},
|
| 74 |
-
"confidence": {"type": "number"}
|
| 75 |
-
}
|
| 76 |
-
},
|
| 77 |
-
"use_grammar": true
|
| 78 |
-
}
|
| 79 |
-
```
|
| 80 |
-
|
| 81 |
-
### Python API:
|
| 82 |
-
```python
|
| 83 |
-
result = llm_client.generate_structured_response(
|
| 84 |
-
prompt="Your prompt",
|
| 85 |
-
json_schema=schema,
|
| 86 |
-
use_grammar=True # Enable grammar mode
|
| 87 |
-
)
|
| 88 |
-
```
|
| 89 |
-
|
| 90 |
-
## 🔍 Validation
|
| 91 |
-
|
| 92 |
-
All grammar generation functionality has been tested and validated:
|
| 93 |
-
- ✅ Grammar generation from JSON schemas works correctly
|
| 94 |
-
- ✅ GBNF output format is valid
|
| 95 |
-
- ✅ Enum support is functional
|
| 96 |
-
- ✅ Nested structures are handled properly
|
| 97 |
-
|
| 98 |
-
## 🚀 Ready for Production
|
| 99 |
-
|
| 100 |
-
The implementation is complete and ready for use in Docker environments. Grammar mode provides more reliable structured output generation while maintaining backward compatibility with the existing schema guidance approach.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
app.py
CHANGED
|
@@ -1,3 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import json
|
| 2 |
import os
|
| 3 |
import gradio as gr
|
|
@@ -9,7 +15,7 @@ from config import Config
|
|
| 9 |
|
| 10 |
# Try to import llama_cpp with fallback
|
| 11 |
try:
|
| 12 |
-
from llama_cpp import Llama, LlamaGrammar
|
| 13 |
LLAMA_CPP_AVAILABLE = True
|
| 14 |
except ImportError as e:
|
| 15 |
print(f"Warning: llama-cpp-python not available: {e}")
|
|
@@ -27,9 +33,14 @@ except ImportError as e:
|
|
| 27 |
hf_hub_download = None
|
| 28 |
|
| 29 |
# Setup logging
|
| 30 |
-
logging.
|
|
|
|
| 31 |
logger = logging.getLogger(__name__)
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
class StructuredOutputRequest(BaseModel):
|
| 34 |
prompt: str
|
| 35 |
image: Optional[str] = None # base64 encoded image
|
|
@@ -144,14 +155,19 @@ class LLMClient:
|
|
| 144 |
lora_base=None,
|
| 145 |
lora_path=None,
|
| 146 |
seed=Config.SEED,
|
| 147 |
-
verbose=
|
| 148 |
)
|
|
|
|
|
|
|
| 149 |
|
| 150 |
logger.info("Model successfully loaded and initialized")
|
| 151 |
|
| 152 |
# Test model with a simple prompt to verify it's working
|
|
|
|
| 153 |
logger.info("Testing model with simple prompt...")
|
| 154 |
-
|
|
|
|
|
|
|
| 155 |
logger.info("Model test successful")
|
| 156 |
|
| 157 |
except Exception as e:
|
|
@@ -175,11 +191,13 @@ class LLMClient:
|
|
| 175 |
|
| 176 |
def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
|
| 177 |
"""
|
| 178 |
-
Format prompt for structured output generation
|
| 179 |
"""
|
| 180 |
schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
|
| 181 |
|
| 182 |
-
|
|
|
|
|
|
|
| 183 |
|
| 184 |
Please respond in strict accordance with the following JSON schema:
|
| 185 |
|
|
@@ -187,139 +205,72 @@ Please respond in strict accordance with the following JSON schema:
|
|
| 187 |
{schema_str}
|
| 188 |
```
|
| 189 |
|
| 190 |
-
Return ONLY valid JSON without additional comments or explanations
|
|
|
|
|
|
|
| 191 |
|
| 192 |
return formatted_prompt
|
| 193 |
-
|
| 194 |
-
def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str:
|
| 195 |
-
"""Convert JSON schema to GBNF (Backus-Naur Form) grammar for structured output"""
|
| 196 |
-
rules = []
|
| 197 |
-
rule_names = set() # Track rule names to avoid duplicates
|
| 198 |
-
|
| 199 |
-
def add_rule(name: str, definition: str):
|
| 200 |
-
if name not in rule_names:
|
| 201 |
-
rules.append(f"{name} ::= {definition}")
|
| 202 |
-
rule_names.add(name)
|
| 203 |
|
| 204 |
-
def
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
return "string"
|
| 208 |
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
|
| 214 |
-
required = schema_part.get("required", [])
|
| 215 |
-
|
| 216 |
-
if not properties:
|
| 217 |
-
add_rule(type_name, '"{" ws "}"')
|
| 218 |
-
return type_name
|
| 219 |
-
|
| 220 |
-
# Separate required and optional parts
|
| 221 |
-
required_parts = []
|
| 222 |
-
optional_parts = []
|
| 223 |
-
|
| 224 |
-
for prop_name, prop_schema in properties.items():
|
| 225 |
-
prop_type_name = f"{type_name}_{prop_name}"
|
| 226 |
-
prop_type = process_type(prop_schema, prop_type_name)
|
| 227 |
-
prop_def = f'"\\"" "{prop_name}" "\\"" ws ":" ws {prop_type}'
|
| 228 |
-
|
| 229 |
-
if prop_name in required:
|
| 230 |
-
required_parts.append(prop_def)
|
| 231 |
-
else:
|
| 232 |
-
optional_parts.append(prop_def)
|
| 233 |
-
|
| 234 |
-
# Build object structure - simplified approach
|
| 235 |
-
if not required_parts and not optional_parts:
|
| 236 |
-
object_def = '"{" ws "}"'
|
| 237 |
-
else:
|
| 238 |
-
# For simplicity, create a fixed structure based on required fields only
|
| 239 |
-
# and treat optional fields as always present but with optional values
|
| 240 |
-
if not required_parts:
|
| 241 |
-
# Only optional fields - make the whole object optional content
|
| 242 |
-
if len(optional_parts) == 1:
|
| 243 |
-
object_def = f'"{" ws ({optional_parts[0]})? ws "}"'
|
| 244 |
-
else:
|
| 245 |
-
comma_separated = ' ws "," ws '.join(optional_parts)
|
| 246 |
-
object_def = f'"{" ws ({comma_separated})? ws "}"'
|
| 247 |
-
else:
|
| 248 |
-
# Has required fields
|
| 249 |
-
all_parts = required_parts.copy()
|
| 250 |
-
|
| 251 |
-
# Add optional parts as truly optional (with optional commas)
|
| 252 |
-
for opt_part in optional_parts:
|
| 253 |
-
all_parts.append(f'(ws "," ws {opt_part})?')
|
| 254 |
-
|
| 255 |
-
if len(all_parts) == 1:
|
| 256 |
-
object_def = f'"{" ws {all_parts[0]} ws "}"'
|
| 257 |
-
else:
|
| 258 |
-
# Join required parts with commas, optional parts are already with optional commas
|
| 259 |
-
required_with_commas = ' ws "," ws '.join(required_parts)
|
| 260 |
-
optional_with_commas = ' '.join([f'(ws "," ws {opt})?' for opt in optional_parts])
|
| 261 |
-
|
| 262 |
-
if optional_with_commas:
|
| 263 |
-
object_def = f'"{{" ws {required_with_commas} {optional_with_commas} ws "}}"'
|
| 264 |
-
else:
|
| 265 |
-
object_def = f'"{{" ws {required_with_commas} ws "}}"'
|
| 266 |
-
|
| 267 |
-
add_rule(type_name, object_def)
|
| 268 |
-
return type_name
|
| 269 |
-
|
| 270 |
-
elif schema_type == "array":
|
| 271 |
-
# Handle array type
|
| 272 |
-
items_schema = schema_part.get("items", {})
|
| 273 |
-
items_type_name = f"{type_name}_items"
|
| 274 |
-
item_type = process_type(items_schema, f"{type_name}_item")
|
| 275 |
-
|
| 276 |
-
# Create array items rule
|
| 277 |
-
add_rule(items_type_name, f"{item_type} (ws \",\" ws {item_type})*")
|
| 278 |
-
add_rule(type_name, f'"[" ws ({items_type_name})? ws "]"')
|
| 279 |
-
return type_name
|
| 280 |
-
|
| 281 |
-
elif schema_type == "string":
|
| 282 |
-
# Handle string type with enum support
|
| 283 |
-
if "enum" in schema_part:
|
| 284 |
-
enum_values = schema_part["enum"]
|
| 285 |
-
enum_options = ' | '.join([f'"\\"" "{val}" "\\""' for val in enum_values])
|
| 286 |
-
add_rule(type_name, enum_options)
|
| 287 |
-
return type_name
|
| 288 |
-
else:
|
| 289 |
-
return "string"
|
| 290 |
-
|
| 291 |
-
elif schema_type == "number" or schema_type == "integer":
|
| 292 |
-
return "number"
|
| 293 |
|
| 294 |
-
|
| 295 |
-
|
| 296 |
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
if
|
| 318 |
-
|
| 319 |
-
|
| 320 |
-
|
| 321 |
-
|
| 322 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 323 |
def generate_structured_response(self,
|
| 324 |
prompt: str,
|
| 325 |
json_schema: Union[str, Dict[str, Any]],
|
|
@@ -360,17 +311,21 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
|
|
| 360 |
generation_params = {
|
| 361 |
"max_tokens": Config.MAX_NEW_TOKENS,
|
| 362 |
"temperature": Config.TEMPERATURE,
|
|
|
|
|
|
|
|
|
|
| 363 |
"echo": False
|
| 364 |
}
|
| 365 |
|
| 366 |
# Add grammar or stop tokens based on mode
|
| 367 |
if use_grammar and grammar is not None:
|
| 368 |
generation_params["grammar"] = grammar
|
| 369 |
-
# For grammar mode, use a simpler prompt
|
| 370 |
-
simple_prompt = f"
|
| 371 |
response = self.llm(simple_prompt, **generation_params)
|
| 372 |
else:
|
| 373 |
-
|
|
|
|
| 374 |
response = self.llm(formatted_prompt, **generation_params)
|
| 375 |
|
| 376 |
# Extract generated text
|
|
@@ -385,11 +340,7 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
|
|
| 385 |
if json_start != -1 and json_end > json_start:
|
| 386 |
json_str = generated_text[json_start:json_end]
|
| 387 |
parsed_response = json.loads(json_str)
|
| 388 |
-
return
|
| 389 |
-
"success": True,
|
| 390 |
-
"data": parsed_response,
|
| 391 |
-
"raw_response": generated_text
|
| 392 |
-
}
|
| 393 |
else:
|
| 394 |
return {
|
| 395 |
"error": "Could not find JSON in model response",
|
|
@@ -408,6 +359,99 @@ def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str
|
|
| 408 |
"error": f"Generation error: {str(e)}"
|
| 409 |
}
|
| 410 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 411 |
def test_grammar_generation(json_schema_str: str) -> Dict[str, Any]:
|
| 412 |
"""
|
| 413 |
Test grammar generation without running the full model
|
|
@@ -457,6 +501,43 @@ def process_request(prompt: str,
|
|
| 457 |
result = llm_client.generate_structured_response(prompt, json_schema, image, use_grammar)
|
| 458 |
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 459 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 460 |
# Examples for demonstration
|
| 461 |
example_schema = """{
|
| 462 |
"type": "object",
|
|
@@ -502,89 +583,12 @@ def create_gradio_interface():
|
|
| 502 |
else:
|
| 503 |
gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
|
| 504 |
|
| 505 |
-
with gr.
|
| 506 |
-
with gr.
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
value=example_prompt
|
| 512 |
-
)
|
| 513 |
-
|
| 514 |
-
image_input = gr.Image(
|
| 515 |
-
label="Image (optional, for multimodal models)",
|
| 516 |
-
type="pil"
|
| 517 |
-
)
|
| 518 |
-
|
| 519 |
-
schema_input = gr.Textbox(
|
| 520 |
-
label="JSON schema for response structure",
|
| 521 |
-
placeholder="Enter JSON schema...",
|
| 522 |
-
lines=15,
|
| 523 |
-
value=example_schema
|
| 524 |
-
)
|
| 525 |
-
|
| 526 |
-
grammar_checkbox = gr.Checkbox(
|
| 527 |
-
label="🔗 Use Grammar (GBNF) Mode",
|
| 528 |
-
value=True,
|
| 529 |
-
info="Enable grammar-based structured output for more precise JSON generation"
|
| 530 |
-
)
|
| 531 |
-
|
| 532 |
-
submit_btn = gr.Button("Generate Response", variant="primary")
|
| 533 |
-
|
| 534 |
-
with gr.Column():
|
| 535 |
-
output = gr.Textbox(
|
| 536 |
-
label="Structured Response",
|
| 537 |
-
lines=20,
|
| 538 |
-
interactive=False
|
| 539 |
-
)
|
| 540 |
-
|
| 541 |
-
submit_btn.click(
|
| 542 |
-
fn=process_request,
|
| 543 |
-
inputs=[prompt_input, schema_input, image_input, grammar_checkbox],
|
| 544 |
-
outputs=output
|
| 545 |
-
)
|
| 546 |
-
|
| 547 |
-
# Examples
|
| 548 |
-
gr.Markdown("## 📋 Usage Examples")
|
| 549 |
-
|
| 550 |
-
examples = gr.Examples(
|
| 551 |
-
examples=[
|
| 552 |
-
[
|
| 553 |
-
"Describe today's weather in New York",
|
| 554 |
-
"""{
|
| 555 |
-
"type": "object",
|
| 556 |
-
"properties": {
|
| 557 |
-
"temperature": {"type": "number"},
|
| 558 |
-
"description": {"type": "string"},
|
| 559 |
-
"humidity": {"type": "number"}
|
| 560 |
-
}
|
| 561 |
-
}""",
|
| 562 |
-
None
|
| 563 |
-
],
|
| 564 |
-
[
|
| 565 |
-
"Create a Python learning plan for one month",
|
| 566 |
-
"""{
|
| 567 |
-
"type": "object",
|
| 568 |
-
"properties": {
|
| 569 |
-
"weeks": {
|
| 570 |
-
"type": "array",
|
| 571 |
-
"items": {
|
| 572 |
-
"type": "object",
|
| 573 |
-
"properties": {
|
| 574 |
-
"week_number": {"type": "integer"},
|
| 575 |
-
"topics": {"type": "array", "items": {"type": "string"}},
|
| 576 |
-
"practice_hours": {"type": "number"}
|
| 577 |
-
}
|
| 578 |
-
}
|
| 579 |
-
},
|
| 580 |
-
"total_hours": {"type": "number"}
|
| 581 |
-
}
|
| 582 |
-
}""",
|
| 583 |
-
None
|
| 584 |
-
]
|
| 585 |
-
],
|
| 586 |
-
inputs=[prompt_input, schema_input, image_input]
|
| 587 |
-
)
|
| 588 |
|
| 589 |
# Model information
|
| 590 |
gr.Markdown(f"""
|
|
@@ -612,10 +616,155 @@ def create_gradio_interface():
|
|
| 612 |
- Strict enforcement of JSON structure during generation
|
| 613 |
- Support for objects, arrays, strings, numbers, booleans, and enums
|
| 614 |
- Improved consistency and reliability of structured outputs
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
""")
|
| 616 |
|
| 617 |
return demo
|
| 618 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
if __name__ == "__main__":
|
| 620 |
# Create and launch Gradio interface
|
| 621 |
demo = create_gradio_interface()
|
|
@@ -623,5 +772,5 @@ if __name__ == "__main__":
|
|
| 623 |
server_name=Config.HOST,
|
| 624 |
server_port=Config.GRADIO_PORT,
|
| 625 |
share=False,
|
| 626 |
-
debug=
|
| 627 |
)
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ.setdefault("OMP_NUM_THREADS", "1")
|
| 3 |
+
os.environ.setdefault("OPENBLAS_NUM_THREADS", "1")
|
| 4 |
+
os.environ.setdefault("MKL_NUM_THREADS", "1")
|
| 5 |
+
os.environ.setdefault("NUMEXPR_NUM_THREADS", "1")
|
| 6 |
+
|
| 7 |
import json
|
| 8 |
import os
|
| 9 |
import gradio as gr
|
|
|
|
| 15 |
|
| 16 |
# Try to import llama_cpp with fallback
|
| 17 |
try:
|
| 18 |
+
from llama_cpp import Llama, LlamaGrammar, LlamaRAMCache
|
| 19 |
LLAMA_CPP_AVAILABLE = True
|
| 20 |
except ImportError as e:
|
| 21 |
print(f"Warning: llama-cpp-python not available: {e}")
|
|
|
|
| 33 |
hf_hub_download = None
|
| 34 |
|
| 35 |
# Setup logging
|
| 36 |
+
log_level = getattr(logging, Config.LOG_LEVEL.upper())
|
| 37 |
+
logging.basicConfig(level=log_level)
|
| 38 |
logger = logging.getLogger(__name__)
|
| 39 |
|
| 40 |
+
# Reduce llama-cpp-python verbosity
|
| 41 |
+
llama_logger = logging.getLogger('llama_cpp')
|
| 42 |
+
llama_logger.setLevel(logging.WARNING)
|
| 43 |
+
|
| 44 |
class StructuredOutputRequest(BaseModel):
|
| 45 |
prompt: str
|
| 46 |
image: Optional[str] = None # base64 encoded image
|
|
|
|
| 155 |
lora_base=None,
|
| 156 |
lora_path=None,
|
| 157 |
seed=Config.SEED,
|
| 158 |
+
verbose=False # Disable verbose to reduce log noise
|
| 159 |
)
|
| 160 |
+
# cache = LlamaRAMCache()
|
| 161 |
+
# self.llm.set_cache(cache)
|
| 162 |
|
| 163 |
logger.info("Model successfully loaded and initialized")
|
| 164 |
|
| 165 |
# Test model with a simple prompt to verify it's working
|
| 166 |
+
from time import time
|
| 167 |
logger.info("Testing model with simple prompt...")
|
| 168 |
+
start_time = time()
|
| 169 |
+
test_response = self.llm("Hello", max_tokens=1, temperature=1.0, top_k=64, top_p=0.95, min_p=0.0)
|
| 170 |
+
logger.info(f"Model test time: {time() - start_time:.2f} seconds, response: {test_response}")
|
| 171 |
logger.info("Model test successful")
|
| 172 |
|
| 173 |
except Exception as e:
|
|
|
|
| 191 |
|
| 192 |
def _format_prompt_with_schema(self, prompt: str, json_schema: Dict[str, Any]) -> str:
|
| 193 |
"""
|
| 194 |
+
Format prompt for structured output generation using Gemma chat format
|
| 195 |
"""
|
| 196 |
schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
|
| 197 |
|
| 198 |
+
# Use Gemma chat format with proper tokens
|
| 199 |
+
formatted_prompt = f"""<bos><start_of_turn>user
|
| 200 |
+
{prompt}
|
| 201 |
|
| 202 |
Please respond in strict accordance with the following JSON schema:
|
| 203 |
|
|
|
|
| 205 |
{schema_str}
|
| 206 |
```
|
| 207 |
|
| 208 |
+
Return ONLY valid JSON without additional comments or explanations.<end_of_turn>
|
| 209 |
+
<start_of_turn>model
|
| 210 |
+
"""
|
| 211 |
|
| 212 |
return formatted_prompt
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 213 |
|
| 214 |
+
def _format_gemma_chat(self, messages: list) -> str:
|
| 215 |
+
"""
|
| 216 |
+
Format messages in Gemma chat format
|
|
|
|
| 217 |
|
| 218 |
+
Args:
|
| 219 |
+
messages: List of dicts with 'role' and 'content' keys
|
| 220 |
+
role can be 'user' or 'model'
|
| 221 |
+
"""
|
| 222 |
+
formatted_parts = ["<bos>"]
|
| 223 |
|
| 224 |
+
for message in messages:
|
| 225 |
+
role = message.get('role', 'user')
|
| 226 |
+
content = message.get('content', '')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
|
| 228 |
+
if role not in ['user', 'model']:
|
| 229 |
+
role = 'user' # fallback to user role
|
| 230 |
|
| 231 |
+
formatted_parts.append(f"<start_of_turn>{role}")
|
| 232 |
+
formatted_parts.append(content)
|
| 233 |
+
formatted_parts.append("<end_of_turn>")
|
| 234 |
+
|
| 235 |
+
# Add start of model response
|
| 236 |
+
formatted_parts.append("<start_of_turn>model")
|
| 237 |
+
|
| 238 |
+
return "\n".join(formatted_parts)
|
| 239 |
+
|
| 240 |
+
def generate_chat_response(self, messages: list, max_tokens: int = None) -> str:
|
| 241 |
+
"""
|
| 242 |
+
Generate response using Gemma chat format
|
| 243 |
+
|
| 244 |
+
Args:
|
| 245 |
+
messages: List of message dicts with 'role' and 'content' keys
|
| 246 |
+
max_tokens: Maximum tokens for generation
|
| 247 |
+
|
| 248 |
+
Returns:
|
| 249 |
+
Generated response text
|
| 250 |
+
"""
|
| 251 |
+
if not messages:
|
| 252 |
+
raise ValueError("Messages list cannot be empty")
|
| 253 |
+
|
| 254 |
+
# Format messages using Gemma chat format
|
| 255 |
+
formatted_prompt = self._format_gemma_chat(messages)
|
| 256 |
+
|
| 257 |
+
# Set generation parameters
|
| 258 |
+
generation_params = {
|
| 259 |
+
"max_tokens": max_tokens or Config.MAX_NEW_TOKENS,
|
| 260 |
+
"temperature": Config.TEMPERATURE,
|
| 261 |
+
"top_k": 64,
|
| 262 |
+
"top_p": 0.95,
|
| 263 |
+
"min_p": 0.0,
|
| 264 |
+
"echo": False,
|
| 265 |
+
"stop": ["<end_of_turn>", "<start_of_turn>", "<bos>"]
|
| 266 |
+
}
|
| 267 |
+
|
| 268 |
+
# Generate response
|
| 269 |
+
response = self.llm(formatted_prompt, **generation_params)
|
| 270 |
+
generated_text = response['choices'][0]['text'].strip()
|
| 271 |
+
|
| 272 |
+
return generated_text
|
| 273 |
+
|
| 274 |
def generate_structured_response(self,
|
| 275 |
prompt: str,
|
| 276 |
json_schema: Union[str, Dict[str, Any]],
|
|
|
|
| 311 |
generation_params = {
|
| 312 |
"max_tokens": Config.MAX_NEW_TOKENS,
|
| 313 |
"temperature": Config.TEMPERATURE,
|
| 314 |
+
"top_k": 64,
|
| 315 |
+
"top_p": 0.95,
|
| 316 |
+
"min_p": 0.0,
|
| 317 |
"echo": False
|
| 318 |
}
|
| 319 |
|
| 320 |
# Add grammar or stop tokens based on mode
|
| 321 |
if use_grammar and grammar is not None:
|
| 322 |
generation_params["grammar"] = grammar
|
| 323 |
+
# For grammar mode, use a simpler prompt in Gemma format
|
| 324 |
+
simple_prompt = f"<bos><start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
|
| 325 |
response = self.llm(simple_prompt, **generation_params)
|
| 326 |
else:
|
| 327 |
+
# Update stop tokens for Gemma format
|
| 328 |
+
generation_params["stop"] = ["<end_of_turn>", "<start_of_turn>", "<bos>"]
|
| 329 |
response = self.llm(formatted_prompt, **generation_params)
|
| 330 |
|
| 331 |
# Extract generated text
|
|
|
|
| 340 |
if json_start != -1 and json_end > json_start:
|
| 341 |
json_str = generated_text[json_start:json_end]
|
| 342 |
parsed_response = json.loads(json_str)
|
| 343 |
+
return parsed_response
|
|
|
|
|
|
|
|
|
|
|
|
|
| 344 |
else:
|
| 345 |
return {
|
| 346 |
"error": "Could not find JSON in model response",
|
|
|
|
| 359 |
"error": f"Generation error: {str(e)}"
|
| 360 |
}
|
| 361 |
|
| 362 |
+
def _json_schema_to_gbnf(schema: Dict[str, Any], root_name: str = "root") -> str:
|
| 363 |
+
"""Convert JSON schema to GBNF (Backus-Naur Form) grammar for structured output"""
|
| 364 |
+
rules = {} # Use dict to maintain order and avoid duplicates
|
| 365 |
+
|
| 366 |
+
def add_rule(name: str, definition: str):
|
| 367 |
+
if name not in rules:
|
| 368 |
+
rules[name] = f"{name} ::= {definition}"
|
| 369 |
+
|
| 370 |
+
def process_type(schema_part: Dict[str, Any], type_name: str = "value") -> str:
|
| 371 |
+
if "type" not in schema_part:
|
| 372 |
+
# Handle anyOf, oneOf, allOf cases - simplified to string for now
|
| 373 |
+
return "string"
|
| 374 |
+
|
| 375 |
+
schema_type = schema_part["type"]
|
| 376 |
+
|
| 377 |
+
if schema_type == "object":
|
| 378 |
+
# Handle object type
|
| 379 |
+
properties = schema_part.get("properties", {})
|
| 380 |
+
required = schema_part.get("required", [])
|
| 381 |
+
|
| 382 |
+
if not properties:
|
| 383 |
+
add_rule(type_name, '"{" ws "}"')
|
| 384 |
+
return type_name
|
| 385 |
+
|
| 386 |
+
# Build object properties
|
| 387 |
+
property_rules = []
|
| 388 |
+
|
| 389 |
+
for prop_name, prop_schema in properties.items():
|
| 390 |
+
prop_type_name = f"{type_name}_{prop_name}"
|
| 391 |
+
prop_type = process_type(prop_schema, prop_type_name)
|
| 392 |
+
property_rules.append(f'"\\"" "{prop_name}" "\\"" ws ":" ws {prop_type}')
|
| 393 |
+
|
| 394 |
+
# Create a simplified object structure with all properties as required
|
| 395 |
+
# This avoids complex optional field handling that can cause parsing issues
|
| 396 |
+
if len(property_rules) == 1:
|
| 397 |
+
object_def = f'"{{" ws {property_rules[0]} ws "}}"'
|
| 398 |
+
else:
|
| 399 |
+
properties_joined = ' ws "," ws '.join(property_rules)
|
| 400 |
+
object_def = f'"{{" ws {properties_joined} ws "}}"'
|
| 401 |
+
|
| 402 |
+
add_rule(type_name, object_def)
|
| 403 |
+
return type_name
|
| 404 |
+
|
| 405 |
+
elif schema_type == "array":
|
| 406 |
+
# Handle array type
|
| 407 |
+
items_schema = schema_part.get("items", {})
|
| 408 |
+
items_type_name = f"{type_name}_items"
|
| 409 |
+
item_type = process_type(items_schema, f"{type_name}_item")
|
| 410 |
+
|
| 411 |
+
# Create array items rule
|
| 412 |
+
add_rule(items_type_name, f"{item_type} (ws \",\" ws {item_type})*")
|
| 413 |
+
add_rule(type_name, f'"[" ws ({items_type_name})? ws "]"')
|
| 414 |
+
return type_name
|
| 415 |
+
|
| 416 |
+
elif schema_type == "string":
|
| 417 |
+
# Handle string type with enum support
|
| 418 |
+
if "enum" in schema_part:
|
| 419 |
+
enum_values = schema_part["enum"]
|
| 420 |
+
enum_options = ' | '.join([f'"\\"" "{val}" "\\""' for val in enum_values])
|
| 421 |
+
add_rule(type_name, enum_options)
|
| 422 |
+
return type_name
|
| 423 |
+
else:
|
| 424 |
+
return "string"
|
| 425 |
+
|
| 426 |
+
elif schema_type == "number" or schema_type == "integer":
|
| 427 |
+
return "number"
|
| 428 |
+
|
| 429 |
+
elif schema_type == "boolean":
|
| 430 |
+
return "boolean"
|
| 431 |
+
|
| 432 |
+
else:
|
| 433 |
+
return "string" # fallback
|
| 434 |
+
|
| 435 |
+
# First add basic GBNF rules for primitives to ensure they come first
|
| 436 |
+
basic_rules_data = [
|
| 437 |
+
('ws', '[ \\t\\n]*'),
|
| 438 |
+
('string', '"\\"" char* "\\""'),
|
| 439 |
+
('char', '[^"\\\\] | "\\\\" (["\\\\bfnrt] | "u" hex hex hex hex)'),
|
| 440 |
+
('hex', '[0-9a-fA-F]'),
|
| 441 |
+
('number', '"-"? ("0" | [1-9] [0-9]*) ("." [0-9]+)? ([eE] [+-]? [0-9]+)?'),
|
| 442 |
+
('boolean', '"true" | "false"'),
|
| 443 |
+
('null', '"null"')
|
| 444 |
+
]
|
| 445 |
+
|
| 446 |
+
for rule_name, rule_def in basic_rules_data:
|
| 447 |
+
add_rule(rule_name, rule_def)
|
| 448 |
+
|
| 449 |
+
# Process root schema to build all custom rules
|
| 450 |
+
process_type(schema, root_name)
|
| 451 |
+
|
| 452 |
+
# Return rules in the order they were added
|
| 453 |
+
return "\n".join(rules.values())
|
| 454 |
+
|
| 455 |
def test_grammar_generation(json_schema_str: str) -> Dict[str, Any]:
|
| 456 |
"""
|
| 457 |
Test grammar generation without running the full model
|
|
|
|
| 501 |
result = llm_client.generate_structured_response(prompt, json_schema, image, use_grammar)
|
| 502 |
return json.dumps(result, ensure_ascii=False, indent=2)
|
| 503 |
|
| 504 |
+
def test_gemma_chat(messages_text: str) -> str:
    """
    Run a small conversation through the Gemma chat path and report the result.

    Each input line is read as "role: message"; lines without a colon or with a
    role other than user/model are ignored. If nothing usable remains, a built-in
    three-message example is used. The returned text contains the formatted
    prompt followed by the generated response, or an "Error: ..." message.
    """
    if llm_client is None:
        return "Error: LLM client not initialized"

    try:
        conversation = []
        for raw_line in messages_text.strip().split('\n'):
            speaker, colon, text = raw_line.partition(':')
            if not colon:
                continue
            speaker = speaker.strip().lower()
            if speaker in ('user', 'model'):
                conversation.append({"role": speaker, "content": text.strip()})

        if not conversation:
            # Nothing valid was supplied: fall back to the default example
            conversation = [
                {"role": "user", "content": "Hello!"},
                {"role": "model", "content": "Hey there!"},
                {"role": "user", "content": "What is 1+1?"}
            ]

        # The formatted prompt is built separately so it can be shown to the user
        prompt_preview = llm_client._format_gemma_chat(conversation)
        reply = llm_client.generate_chat_response(conversation, max_tokens=100)

        return f"Formatted prompt:\n{prompt_preview}\n\nGenerated response:\n{reply}"

    except Exception as e:
        return f"Error: {str(e)}"
|
| 540 |
+
|
| 541 |
# Examples for demonstration
|
| 542 |
example_schema = """{
|
| 543 |
"type": "object",
|
|
|
|
| 583 |
else:
|
| 584 |
gr.Markdown("✅ **Status**: Model successfully loaded and ready to work")
|
| 585 |
|
| 586 |
+
with gr.Tabs():
|
| 587 |
+
with gr.TabItem("🔧 Structured Output"):
|
| 588 |
+
create_structured_output_tab()
|
| 589 |
+
|
| 590 |
+
with gr.TabItem("💬 Gemma Chat Format"):
|
| 591 |
+
create_gemma_chat_tab()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 592 |
|
| 593 |
# Model information
|
| 594 |
gr.Markdown(f"""
|
|
|
|
| 616 |
- Strict enforcement of JSON structure during generation
|
| 617 |
- Support for objects, arrays, strings, numbers, booleans, and enums
|
| 618 |
- Improved consistency and reliability of structured outputs
|
| 619 |
+
|
| 620 |
+
📝 **Gemma Format Features**:
|
| 621 |
+
- Uses proper Gemma chat tokens: `<bos>`, `<start_of_turn>`, `<end_of_turn>`
|
| 622 |
+
- Supports multi-turn conversations with user/model roles
|
| 623 |
+
- Compatible with Gemma model's expected input format
|
| 624 |
+
- Improved response quality with proper token structure
|
| 625 |
""")
|
| 626 |
|
| 627 |
return demo
|
| 628 |
|
| 629 |
+
def create_structured_output_tab():
    """Build the structured-output tab: request form, result box and examples.

    Components are created in the same order as before (prompt, image, schema,
    grammar switch, button, output, examples heading, examples), because the
    creation order determines the rendered layout.
    """
    # JSON schemas used by the canned examples below the form
    weather_schema = """{
"type": "object",
"properties": {
"temperature": {"type": "number"},
"description": {"type": "string"},
"humidity": {"type": "number"}
}
}"""
    learning_plan_schema = """{
"type": "object",
"properties": {
"weeks": {
"type": "array",
"items": {
"type": "object",
"properties": {
"week_number": {"type": "integer"},
"topics": {"type": "array", "items": {"type": "string"}},
"practice_hours": {"type": "number"}
}
}
},
"total_hours": {"type": "number"}
}
}"""

    with gr.Row():
        # Left column: everything the user fills in
        with gr.Column():
            prompt_box = gr.Textbox(
                label="Prompt for model",
                placeholder="Enter your request...",
                lines=5,
                value=example_prompt,
            )
            image_box = gr.Image(
                label="Image (optional, for multimodal models)",
                type="pil",
            )
            schema_box = gr.Textbox(
                label="JSON schema for response structure",
                placeholder="Enter JSON schema...",
                lines=15,
                value=example_schema,
            )
            grammar_toggle = gr.Checkbox(
                label="🔗 Use Grammar (GBNF) Mode",
                value=True,
                info="Enable grammar-based structured output for more precise JSON generation",
            )
            run_button = gr.Button("Generate Response", variant="primary")

        # Right column: read-only result
        with gr.Column():
            result_box = gr.Textbox(
                label="Structured Response",
                lines=20,
                interactive=False,
            )

    run_button.click(
        fn=process_request,
        inputs=[prompt_box, schema_box, image_box, grammar_toggle],
        outputs=result_box,
    )

    # Examples
    gr.Markdown("## 📋 Usage Examples")

    example_rows = [
        ["Describe today's weather in New York", weather_schema, None],
        ["Create a Python learning plan for one month", learning_plan_schema, None],
    ]
    gr.Examples(
        examples=example_rows,
        inputs=[prompt_box, schema_box, image_box],
    )
|
| 714 |
+
|
| 715 |
+
def create_gemma_chat_tab():
    """Build the tab that demonstrates the Gemma chat format.

    Shows a conversation input, a button wired to `test_gemma_chat`, a
    read-only output box, and a Markdown explanation of the format.
    """
    sample_conversation = "user: Hello!\nmodel: Hey there!\nuser: What is 1+1?"
    format_notes = """
### 📝 Format Explanation

The Gemma chat format uses special tokens to structure conversations:
- `<bos>` - Beginning of sequence
- `<start_of_turn>user` - Start user message
- `<end_of_turn>` - End current message
- `<start_of_turn>model` - Start model response

**Example structure:**
```
<bos><start_of_turn>user
Hello!<end_of_turn>
<start_of_turn>model
Hey there!<end_of_turn>
<start_of_turn>user
What is 1+1?<end_of_turn>
<start_of_turn>model
```

This format is now used for both structured output and regular chat generation.
"""

    gr.Markdown("## 💬 Gemma Chat Format Demo")
    gr.Markdown("This tab demonstrates the Gemma chat format with `<bos>`, `<start_of_turn>`, and `<end_of_turn>` tokens.")

    with gr.Row():
        # Left column: conversation text and trigger button
        with gr.Column():
            conversation_box = gr.Textbox(
                label="Conversation Messages (format: role: message per line)",
                placeholder=sample_conversation,
                lines=8,
                value=sample_conversation,
            )
            run_button = gr.Button("Test Gemma Format", variant="primary")

        # Right column: formatted prompt plus model reply
        with gr.Column():
            result_box = gr.Textbox(
                label="Formatted Prompt and Response",
                lines=15,
                interactive=False,
            )

    run_button.click(
        fn=test_gemma_chat,
        inputs=conversation_box,
        outputs=result_box,
    )

    # Example explanation
    gr.Markdown(format_notes)
|
| 767 |
+
|
| 768 |
if __name__ == "__main__":
|
| 769 |
# Create and launch Gradio interface
|
| 770 |
demo = create_gradio_interface()
|
|
|
|
| 772 |
server_name=Config.HOST,
|
| 773 |
server_port=Config.GRADIO_PORT,
|
| 774 |
share=False,
|
| 775 |
+
debug=False
|
| 776 |
)
|
config.py
CHANGED
|
@@ -5,19 +5,19 @@ class Config:
|
|
| 5 |
"""Application configuration for working with local GGUF models"""
|
| 6 |
|
| 7 |
# Model settings - using Hugging Face downloaded model
|
| 8 |
-
MODEL_REPO
|
| 9 |
-
MODEL_FILENAME
|
| 10 |
-
MODEL_PATH
|
| 11 |
HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
|
| 12 |
|
| 13 |
# Model loading settings - optimized for Docker container
|
| 14 |
-
N_CTX: int = int(os.getenv("N_CTX", "
|
| 15 |
N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0")) # CPU-only for Docker by default
|
| 16 |
-
N_THREADS: int = int(os.getenv("N_THREADS", "
|
| 17 |
N_BATCH: int = int(os.getenv("N_BATCH", "512")) # Smaller batch size for Docker
|
| 18 |
USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true" # Disabled for Docker
|
| 19 |
USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true" # Keep memory mapping
|
| 20 |
-
F16_KV: bool = os.getenv("F16_KV", "
|
| 21 |
SEED: int = int(os.getenv("SEED", "42")) # Random seed for reproducibility
|
| 22 |
|
| 23 |
# Server settings - Docker compatible
|
|
@@ -25,9 +25,12 @@ class Config:
|
|
| 25 |
GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860")) # Standard HuggingFace Spaces port
|
| 26 |
API_PORT: int = int(os.getenv("API_PORT", "8000"))
|
| 27 |
|
|
|
|
|
|
|
|
|
|
| 28 |
# Generation settings - optimized for Docker
|
| 29 |
MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256")) # Reduced for faster response
|
| 30 |
-
TEMPERATURE: float =
|
| 31 |
|
| 32 |
# File upload settings
|
| 33 |
MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760")) # 10MB
|
|
|
|
| 5 |
"""Application configuration for working with local GGUF models"""
|
| 6 |
|
| 7 |
# Model settings - using Hugging Face downloaded model
|
| 8 |
+
MODEL_REPO = "unsloth/gemma-3-270m-it-GGUF"
|
| 9 |
+
MODEL_FILENAME = "gemma-3-270m-it-Q8_0.gguf"
|
| 10 |
+
MODEL_PATH = f"/app/models/{MODEL_FILENAME}"
|
| 11 |
HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
|
| 12 |
|
| 13 |
# Model loading settings - optimized for Docker container
|
| 14 |
+
N_CTX: int = int(os.getenv("N_CTX", "1024")) # Reduced context window for Docker
|
| 15 |
N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "0")) # CPU-only for Docker by default
|
| 16 |
+
N_THREADS: int = int(os.getenv("N_THREADS", "2")) # Conservative thread count
|
| 17 |
N_BATCH: int = int(os.getenv("N_BATCH", "512")) # Smaller batch size for Docker
|
| 18 |
USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true" # Disabled for Docker
|
| 19 |
USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true" # Keep memory mapping
|
| 20 |
+
F16_KV: bool = os.getenv("F16_KV", "false").lower() == "true" # Use 16-bit keys and values
|
| 21 |
SEED: int = int(os.getenv("SEED", "42")) # Random seed for reproducibility
|
| 22 |
|
| 23 |
# Server settings - Docker compatible
|
|
|
|
| 25 |
GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860")) # Standard HuggingFace Spaces port
|
| 26 |
API_PORT: int = int(os.getenv("API_PORT", "8000"))
|
| 27 |
|
| 28 |
+
# Logging settings
|
| 29 |
+
LOG_LEVEL: str = os.getenv("LOG_LEVEL", "INFO") # INFO, WARNING, ERROR, DEBUG
|
| 30 |
+
|
| 31 |
# Generation settings - optimized for Docker
|
| 32 |
MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "256")) # Reduced for faster response
|
| 33 |
+
TEMPERATURE: float = 1.0
|
| 34 |
|
| 35 |
# File upload settings
|
| 36 |
MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760")) # 10MB
|
requirements.txt
CHANGED
|
@@ -1,6 +1,4 @@
|
|
| 1 |
huggingface_hub==0.25.2
|
| 2 |
-
# Core ML dependencies - updated for compatibility with gemma-3n-E4B model
|
| 3 |
-
# https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.2/llama_cpp_python-0.3.2-cp310-cp310-linux_x86_64.whl
|
| 4 |
|
| 5 |
# Web interface
|
| 6 |
gradio==4.44.1
|
|
|
|
| 1 |
huggingface_hub==0.25.2
|
|
|
|
|
|
|
| 2 |
|
| 3 |
# Web interface
|
| 4 |
gradio==4.44.1
|
test.ipynb
DELETED
|
@@ -1,24 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"cells": [],
|
| 3 |
-
"metadata": {
|
| 4 |
-
"kernelspec": {
|
| 5 |
-
"display_name": "py310",
|
| 6 |
-
"language": "python",
|
| 7 |
-
"name": "python3"
|
| 8 |
-
},
|
| 9 |
-
"language_info": {
|
| 10 |
-
"codemirror_mode": {
|
| 11 |
-
"name": "ipython",
|
| 12 |
-
"version": 3
|
| 13 |
-
},
|
| 14 |
-
"file_extension": ".py",
|
| 15 |
-
"mimetype": "text/x-python",
|
| 16 |
-
"name": "python",
|
| 17 |
-
"nbconvert_exporter": "python",
|
| 18 |
-
"pygments_lexer": "ipython3",
|
| 19 |
-
"version": "3.10.18"
|
| 20 |
-
}
|
| 21 |
-
},
|
| 22 |
-
"nbformat": 4,
|
| 23 |
-
"nbformat_minor": 5
|
| 24 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
wheels/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73ff502f10b7d2c985879796fc80ea212a71a9114bf26b90b7bd70c2842ba967
|
| 3 |
+
size 4259580
|