lyangas committed
Commit 824d8df · 1 Parent(s): ba2ded2

init repo

Files changed (6)
  1. .env.example +29 -0
  2. .gitignore +70 -0
  3. README.md +214 -7
  4. app.py +422 -0
  5. config.py +62 -0
  6. requirements.txt +18 -0
.env.example ADDED
@@ -0,0 +1,29 @@
+ # Example environment configuration for HF Spaces
+ # Copy this file to .env and modify as needed
+
+ # Model Configuration
+ MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+ MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+ MODEL_PATH=./models/gemma-3n-E4B-it-Q8_0.gguf
+ HUGGINGFACE_TOKEN=
+
+ # GPU Optimization Settings (for HF Spaces with GPU)
+ N_CTX=8192
+ N_GPU_LAYERS=-1
+ N_THREADS=8
+ N_BATCH=1024
+ USE_MLOCK=false
+ USE_MMAP=true
+ F16_KV=true
+ SEED=42
+
+ # Server Settings
+ HOST=0.0.0.0
+ GRADIO_PORT=7860
+
+ # Generation Settings
+ MAX_NEW_TOKENS=512
+ TEMPERATURE=0.1
+
+ # File Upload Settings
+ MAX_FILE_SIZE=10485760
.gitignore ADDED
@@ -0,0 +1,70 @@
+ # Python
+ __pycache__/
+ *.py[cod]
+ *$py.class
+ *.so
+ .Python
+ build/
+ develop-eggs/
+ dist/
+ downloads/
+ eggs/
+ .eggs/
+ lib/
+ lib64/
+ parts/
+ sdist/
+ var/
+ wheels/
+ *.egg-info/
+ .installed.cfg
+ *.egg
+ MANIFEST
+
+ # Virtual environments
+ .env
+ .venv
+ env/
+ venv/
+ ENV/
+ env.bak/
+ venv.bak/
+
+ # IDE
+ .vscode/
+ .idea/
+ *.swp
+ *.swo
+ *~
+
+ # Models
+ models/
+ *.gguf
+ *.bin
+ *.safetensors
+
+ # Logs
+ *.log
+ logs/
+
+ # OS
+ .DS_Store
+ .DS_Store?
+ ._*
+ .Spotlight-V100
+ .Trashes
+ ehthumbs.db
+ Thumbs.db
+
+ # Jupyter
+ .ipynb_checkpoints/
+
+ # Gradio
+ flagged/
+ gradio_cached_examples/
+
+ # Temporary files
+ tmp/
+ temp/
+ *.tmp
+ *.temp
README.md CHANGED
@@ -1,13 +1,220 @@
  ---
- title: Free Llm Structure Output
- emoji: 😻
- colorFrom: green
- colorTo: red
+ title: LLM Structured Output
+ emoji: 🤖
+ colorFrom: blue
+ colorTo: purple
  sdk: gradio
- sdk_version: 5.43.1
+ sdk_version: 4.44.1
  app_file: app.py
  pinned: false
- license: gemma
+ license: mit
+ hardware: t4-small
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # 🤖 LLM Structured Output - Hugging Face Spaces
+
+ An application for generating structured responses with local GGUF models via llama-cpp-python, optimized to run on Hugging Face Spaces with GPU support.
+
+ ## ✨ Features
+
+ - 🚀 **GPU acceleration**: Optimized for GPUs on HF Spaces using the `@spaces.GPU` decorator
+ - 📊 **Structured output**: Generates responses that conform to a JSON schema
+ - 🎯 **High accuracy**: Uses local GGUF models
+ - 🎨 **Convenient interface**: Modern Gradio UI
+ - 🔧 **Flexible configuration**: Supports different models and parameters
+ - ⚡ **Smart resource management**: GPU sessions are allocated for 120 seconds per request
+
+ ## 🚀 Quick Start
+
+ ### Deploying to Hugging Face Spaces
+
+ 1. Create a new Space on Hugging Face
+ 2. Choose the Space type: **Gradio**
+ 3. Choose the hardware: **GPU** (T4 or higher)
+ 4. Upload the project files
+ 5. The Space starts automatically
+
+ ### Running Locally
+
+ ```bash
+ # Clone the repository
+ git clone <your-repo>
+ cd free_llm_structure_output
+
+ # Install dependencies
+ pip install -r requirements.txt
+
+ # Run the application
+ python app.py
+ ```
+
+ ## 📋 Project Structure
+
+ ```
+ free_llm_structure_output/
+ ├── app.py               # Main Gradio application
+ ├── config.py            # Configuration for HF Spaces
+ ├── requirements.txt     # Python dependencies
+ └── README.md            # Documentation
+ ```
+
+ ## ⚙️ Configuration
+
+ The main parameters are set via environment variables or in `config.py` (see the override sketch at the end of this section):
+
+ ### Model settings
+ - `MODEL_REPO`: Model repository on Hugging Face (default: lmstudio-community/gemma-3n-E4B-it-text-GGUF)
+ - `MODEL_FILENAME`: Model file name (default: gemma-3n-E4B-it-Q8_0.gguf)
+ - `HUGGINGFACE_TOKEN`: HF token for private models
+
+ ### GPU optimization
+ - `N_GPU_LAYERS`: Number of layers offloaded to the GPU (-1 for all)
+ - `N_CTX`: Context size (8192 for GPU)
+ - `N_BATCH`: Batch size (1024 for GPU)
+ - `N_THREADS`: Number of threads (8 for HF Spaces)
+
+ ### Generation
+ - `MAX_NEW_TOKENS`: Maximum response length (512)
+ - `TEMPERATURE`: Generation temperature (0.1)
+
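+ Because `config.py` reads these variables via `os.getenv()` when it is first imported, they can also be overridden programmatically. A minimal sketch (the values are illustrative, not recommendations):
+
+ ```python
+ import os
+
+ # Set overrides BEFORE importing Config; config.py evaluates its defaults at import time.
+ # (A local .env file, if present, takes precedence for the keys it defines.)
+ os.environ["N_CTX"] = "4096"
+ os.environ["TEMPERATURE"] = "0.2"
+
+ from config import Config
+ print(Config.N_CTX, Config.TEMPERATURE)  # -> 4096 0.2
+ ```
+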
+ ## 🎯 Usage
+
+ ### Basic example
+
+ 1. **Enter a prompt**: Describe what you want to analyze
+ 2. **Provide a JSON schema**: Define the structure of the response
+ 3. **Click "Generate Response"**: Receive a structured response
+
+ ### Example JSON schema
+
+ ```json
+ {
+   "type": "object",
+   "properties": {
+     "summary": {
+       "type": "string",
+       "description": "Brief summary"
+     },
+     "sentiment": {
+       "type": "string",
+       "enum": ["positive", "negative", "neutral"],
+       "description": "Emotional tone"
+     },
+     "confidence": {
+       "type": "number",
+       "minimum": 0,
+       "maximum": 1,
+       "description": "Confidence level"
+     }
+   },
+   "required": ["summary", "sentiment"]
+ }
+ ```
+
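+ The app embeds the schema in the prompt, then pulls the first JSON object out of the model's raw text. A minimal sketch of that parsing step, mirroring the logic in `app.py`:
+
+ ```python
+ import json
+
+ def extract_json(raw: str) -> dict:
+     """Extract the first {...} block from raw model output, as app.py does."""
+     start, end = raw.find("{"), raw.rfind("}") + 1
+     if start == -1 or end <= start:
+         raise ValueError("no JSON object found in model output")
+     return json.loads(raw[start:end])
+
+ print(extract_json('Sure! {"summary": "ok", "sentiment": "positive"} Done.'))
+ ```
+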
+ ## 🔧 Advanced Settings
+
+ ### Environment variables for HF Spaces
+
+ Create a `.env` file in the Space settings, or set the variables directly:
+
+ ```env
+ # Model
+ MODEL_REPO=lmstudio-community/gemma-3n-E4B-it-text-GGUF
+ MODEL_FILENAME=gemma-3n-E4B-it-Q8_0.gguf
+ HUGGINGFACE_TOKEN=your_token_here
+
+ # GPU settings
+ N_GPU_LAYERS=-1
+ N_CTX=8192
+ N_BATCH=1024
+ N_THREADS=8
+
+ # Generation
+ MAX_NEW_TOKENS=512
+ TEMPERATURE=0.1
+ ```
+
+ ### Using other models
+
+ Any GGUF model from the Hugging Face Hub is supported:
+
+ ```python
+ # In config.py or via environment variables
+ MODEL_REPO = "microsoft/Phi-3-mini-4k-instruct-gguf"
+ MODEL_FILENAME = "Phi-3-mini-4k-instruct-q4.gguf"
+ ```
+
+ ## 📊 Performance
+
+ ### Recommended HF Spaces configurations
+
+ | Model size | GPU  | N_CTX | N_BATCH | N_GPU_LAYERS |
+ |------------|------|-------|---------|--------------|
+ | 3B-7B      | T4   | 4096  | 512     | -1           |
+ | 7B-13B     | A10G | 8192  | 1024    | -1           |
+ | 13B+       | A100 | 16384 | 2048    | -1           |
+
+ ### Speed optimization
+
+ - Use quantized models (Q4_0, Q8_0)
+ - Tune `N_BATCH` to fit your GPU memory
+ - Set `N_GPU_LAYERS=-1` for full GPU offloading (see the sketch below)
+
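+ These settings map onto the `Llama(...)` constructor call in `app.py`. A trimmed sketch using the T4 row from the table above (assumes the model file has already been downloaded):
+
+ ```python
+ from llama_cpp import Llama
+
+ # Trimmed version of the constructor call in app.py, with T4-sized values.
+ llm = Llama(
+     model_path="./models/gemma-3n-E4B-it-Q8_0.gguf",
+     n_ctx=4096,       # context window
+     n_batch=512,      # prompt-processing batch size
+     n_gpu_layers=-1,  # offload all layers to the GPU
+ )
+ ```
+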
+ ## 🛠️ Debugging
+
+ ### Model loading problems
+
+ 1. Check that the model is available on the HF Hub (see the snippet below)
+ 2. Make sure `HUGGINGFACE_TOKEN` is correct
+ 3. Check the amount of GPU memory
+ 4. Switch to a less resource-hungry model
+
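+ To test model access outside the app, you can run the same download call `app.py` makes at startup (a token is only needed for gated or private repositories):
+
+ ```python
+ from huggingface_hub import hf_hub_download
+
+ # Same call app.py performs at startup; raises if the repo or file is unavailable.
+ path = hf_hub_download(
+     repo_id="lmstudio-community/gemma-3n-E4B-it-text-GGUF",
+     filename="gemma-3n-E4B-it-Q8_0.gguf",
+     local_dir="./models",
+ )
+ print(path)
+ ```
+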
+ ### Logs
+
+ Enable verbose logging:
+
+ ```python
+ import logging
+ logging.basicConfig(level=logging.DEBUG)
+ ```
+
+ ## 🎨 Usage Examples
+
+ The schemas below are compact shorthand; an expanded version is shown at the end of this section.
+
+ ### Text analysis
+ ```
+ Prompt: "Analyze the review: 'Great product, highly recommended!'"
+ Schema: {"sentiment": "string", "rating": "number", "keywords": "array"}
+ ```
+
+ ### Data extraction
+ ```
+ Prompt: "Extract company information from the text"
+ Schema: {"name": "string", "industry": "string", "employees": "number"}
+ ```
+
+ ### Structure generation
+ ```
+ Prompt: "Create a Python learning plan"
+ Schema: {"weeks": "array", "topics": "array", "hours": "number"}
+ ```
+
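+ Written out as full JSON Schema, the first shorthand example would look roughly like this (field types assumed from the shorthand):
+
+ ```python
+ import json
+
+ # Expanded JSON Schema for the "Text analysis" shorthand above.
+ schema = {
+     "type": "object",
+     "properties": {
+         "sentiment": {"type": "string"},
+         "rating": {"type": "number"},
+         "keywords": {"type": "array", "items": {"type": "string"}},
+     },
+     "required": ["sentiment"],
+ }
+ print(json.dumps(schema, indent=2))
+ ```
+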
+ ## 📄 License
+
+ MIT License
+
+ ## 🤝 Support
+
+ - 🐛 Bug reports: open an Issue
+ - 💡 Suggestions: start a Discussion
+ - 📧 Direct contact: via Hugging Face
+
+ ## 🔗 Useful Links
+
+ - [Hugging Face Spaces Documentation](https://huggingface.co/docs/hub/spaces)
+ - [Gradio Documentation](https://gradio.app/docs/)
+ - [llama-cpp-python](https://github.com/abetlen/llama-cpp-python)
+ - [GGUF Models Hub](https://huggingface.co/models?library=gguf)
+
+ ---
+
+ ⭐ **Like the project?** Star it and share it with your colleagues!
app.py ADDED
@@ -0,0 +1,422 @@
+ import spaces
+ import os
+ import json
+ import subprocess
+ from llama_cpp import Llama
+ import gradio as gr
+ from huggingface_hub import hf_hub_download
+ from typing import Optional, Dict, Any, Union
+ from PIL import Image
+ from pydantic import BaseModel
+ import logging
+ from config import Config
+
+ # Setup logging
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # Get Hugging Face token
+ huggingface_token = os.getenv("HUGGINGFACE_TOKEN")
+
+ # Download model if needed
+ def download_model_if_needed():
+     """Download model from Hugging Face if it doesn't exist locally"""
+     model_path = Config.get_model_path()
+
+     if os.path.exists(model_path):
+         logger.info(f"Model already exists at: {model_path}")
+         return model_path
+
+     # Check alternative locations for HF Spaces
+     alternative_paths = [
+         f"./models/{Config.MODEL_FILENAME}",
+         f"/tmp/models/{Config.MODEL_FILENAME}",
+         f"./{Config.MODEL_FILENAME}"
+     ]
+
+     for alt_path in alternative_paths:
+         if os.path.exists(alt_path):
+             logger.info(f"Found model at alternative location: {alt_path}")
+             return alt_path
+
+     logger.info(f"Downloading model {Config.MODEL_REPO}/{Config.MODEL_FILENAME}...")
+
+     # Create models directory if it doesn't exist
+     models_dir = Config.get_models_dir()
+     os.makedirs(models_dir, exist_ok=True)
+
+     try:
+         # Download model
+         model_path = hf_hub_download(
+             repo_id=Config.MODEL_REPO,
+             filename=Config.MODEL_FILENAME,
+             local_dir=models_dir,
+             token=huggingface_token if huggingface_token else None
+         )
+
+         logger.info(f"Model downloaded to: {model_path}")
+         return model_path
+     except Exception as e:
+         logger.error(f"Failed to download model: {e}")
+         raise
+
+ # Download model at startup
+ try:
+     download_model_if_needed()
+ except Exception as e:
+     logger.error(f"Error downloading model: {e}")
+
+ # Global variables for model management
+ llm = None
+ llm_model = None
+
+ class StructuredOutputRequest(BaseModel):
+     prompt: str
+     image: Optional[str] = None  # base64 encoded image
+     json_schema: Dict[str, Any]
+
+ def _validate_json_schema(schema: str) -> Dict[str, Any]:
+     """Validate and parse JSON schema"""
+     try:
+         parsed_schema = json.loads(schema)
+         return parsed_schema
+     except json.JSONDecodeError as e:
+         raise ValueError(f"Invalid JSON schema: {e}")
+
+ def _format_prompt_with_schema(prompt: str, json_schema: Dict[str, Any]) -> str:
+     """Format prompt for structured output generation"""
+     schema_str = json.dumps(json_schema, ensure_ascii=False, indent=2)
+
+     formatted_prompt = f"""User: {prompt}
+
+ Please respond in strict accordance with the following JSON schema:
+
+ ```json
+ {schema_str}
+ ```
+
+ Return ONLY valid JSON without additional comments or explanations."""
+
+     return formatted_prompt
+
+ @spaces.GPU(duration=120, concurrency_limit=1)
+ def generate_structured_response(
+     prompt: str,
+     json_schema_str: str,
+     image: Optional[Image.Image] = None,
+     model: str = Config.MODEL_FILENAME,
+     max_tokens: int = Config.MAX_NEW_TOKENS,
+     temperature: float = Config.TEMPERATURE,
+     top_p: float = 0.9,
+     top_k: int = 40,
+     repeat_penalty: float = 1.1,
+ ) -> Dict[str, Any]:
+     """
+     Generate structured response from local GGUF model with GPU acceleration
+     """
+     global llm
+     global llm_model
+
+     try:
+         # Load or reload model if needed
+         if llm is None or llm_model != model:
+             logger.info(f"Loading model: {model}")
+
+             # Find model path
+             model_path = Config.get_model_path()
+             if not os.path.exists(model_path):
+                 # Try alternative paths
+                 alternative_paths = [
+                     f"./models/{model}",
+                     f"/tmp/models/{model}",
+                     f"./{model}"
+                 ]
+
+                 for alt_path in alternative_paths:
+                     if os.path.exists(alt_path):
+                         model_path = alt_path
+                         break
+                 else:  # for-else: runs only if no alternative path matched
+                     raise FileNotFoundError(f"Model file not found: {model}")
+
+             # Initialize Llama model with GPU optimization
+             llm = Llama(
+                 model_path=model_path,
+                 n_ctx=Config.N_CTX,
+                 n_batch=Config.N_BATCH,
+                 n_gpu_layers=Config.N_GPU_LAYERS,  # use all GPU layers
+                 use_mlock=Config.USE_MLOCK,
+                 use_mmap=Config.USE_MMAP,
+                 vocab_only=False,
+                 f16_kv=Config.F16_KV,
+                 logits_all=False,
+                 embedding=False,
+                 n_threads=Config.N_THREADS,
+                 last_n_tokens_size=128,
+                 lora_base=None,
+                 lora_path=None,
+                 seed=Config.SEED,
+                 verbose=True,
+                 main_gpu=0,  # use first GPU
+                 tensor_split=None,
+                 rope_scaling_type=None,
+                 rope_freq_base=0.0,
+                 rope_freq_scale=0.0,
+             )
+
+             llm_model = model
+             logger.info("Model successfully loaded with GPU acceleration")
+
+         # Validate and parse JSON schema
+         try:
+             parsed_schema = _validate_json_schema(json_schema_str)
+         except Exception as e:
+             return {
+                 "error": f"Schema validation error: {str(e)}",
+                 "raw_response": ""
+             }
+
+         # Format prompt
+         formatted_prompt = _format_prompt_with_schema(prompt, parsed_schema)
+
+         # Warning about images (not supported in this implementation)
+         if image is not None:
+             logger.warning("Image processing is not supported with this local model")
+
+         # Generate response with GPU optimization
+         logger.info("Generating response with GPU acceleration...")
+
+         response = llm(
+             formatted_prompt,
+             max_tokens=max_tokens,
+             temperature=temperature,
+             # caution: the "\n\n" stop sequence also halts generation at the
+             # first blank line the model emits
+             stop=["User:", "\n\n", "Assistant:", "Human:"],
+             echo=False,
+             top_p=top_p,
+             top_k=top_k,
+             repeat_penalty=repeat_penalty,
+             presence_penalty=0.0,
+             frequency_penalty=0.0,
+         )
+
+         # Extract generated text
+         generated_text = response['choices'][0]['text']
+
+         # Attempt to parse JSON response
+         try:
+             # Find JSON in response
+             json_start = generated_text.find('{')
+             json_end = generated_text.rfind('}') + 1
+
+             if json_start != -1 and json_end > json_start:
+                 json_str = generated_text[json_start:json_end]
+                 parsed_response = json.loads(json_str)
+                 return {
+                     "success": True,
+                     "data": parsed_response,
+                     "raw_response": generated_text
+                 }
+             else:
+                 return {
+                     "error": "Could not find JSON in model response",
+                     "raw_response": generated_text
+                 }
+
+         except json.JSONDecodeError as e:
+             return {
+                 "error": f"JSON parsing error: {e}",
+                 "raw_response": generated_text
+             }
+
+     except Exception as e:
+         logger.error(f"Unexpected error: {e}")
+         return {
+             "error": f"Generation error: {str(e)}"
+         }
+
+ def process_request(prompt: str,
+                     json_schema: str,
+                     image: Optional[Image.Image] = None) -> str:
+     """
+     Process request through Gradio interface
+     """
+     if not prompt.strip():
+         return json.dumps({"error": "Prompt cannot be empty"}, ensure_ascii=False, indent=2)
+
+     if not json_schema.strip():
+         return json.dumps({"error": "JSON schema cannot be empty"}, ensure_ascii=False, indent=2)
+
+     result = generate_structured_response(prompt, json_schema, image)
+     return json.dumps(result, ensure_ascii=False, indent=2)
+
+ # Examples for demonstration
+ example_schema = """{
+   "type": "object",
+   "properties": {
+     "summary": {
+       "type": "string",
+       "description": "Brief summary of the response"
+     },
+     "sentiment": {
+       "type": "string",
+       "enum": ["positive", "negative", "neutral"],
+       "description": "Emotional tone"
+     },
+     "confidence": {
+       "type": "number",
+       "minimum": 0,
+       "maximum": 1,
+       "description": "Confidence level in the response"
+     },
+     "keywords": {
+       "type": "array",
+       "items": {
+         "type": "string"
+       },
+       "description": "Key words"
+     }
+   },
+   "required": ["summary", "sentiment", "confidence"]
+ }"""
+
+ example_prompt = "Analyze the following text and provide a structured assessment: 'The company's new product received enthusiastic user reviews. Sales exceeded all expectations by 150%.'"
+
+ def create_gradio_interface():
+     """Create Gradio interface optimized for HF Spaces"""
+
+     with gr.Blocks(title="LLM Structured Output - HF Spaces", theme=gr.themes.Soft()) as demo:
+         gr.Markdown("# 🤖 LLM with Structured Output")
+         gr.Markdown("✨ **Running on Hugging Face Spaces with GPU acceleration**")
+         gr.Markdown(f"🚀 Model: **{Config.MODEL_REPO}/{Config.MODEL_FILENAME}**")
+         gr.Markdown("✅ **Status**: Model ready with GPU acceleration via @spaces.GPU decorator")
+
+         with gr.Row():
+             with gr.Column():
+                 prompt_input = gr.Textbox(
+                     label="Prompt for model",
+                     placeholder="Enter your request...",
+                     lines=5,
+                     value=example_prompt
+                 )
+
+                 image_input = gr.Image(
+                     label="Image (optional, for multimodal models)",
+                     type="pil"
+                 )
+
+                 schema_input = gr.Textbox(
+                     label="JSON schema for response structure",
+                     placeholder="Enter JSON schema...",
+                     lines=15,
+                     value=example_schema
+                 )
+
+                 submit_btn = gr.Button("🚀 Generate Response", variant="primary", size="lg")
+
+             with gr.Column():
+                 output = gr.Textbox(
+                     label="Structured Response",
+                     lines=20,
+                     interactive=False
+                 )
+
+         submit_btn.click(
+             fn=process_request,
+             inputs=[prompt_input, schema_input, image_input],
+             outputs=output
+         )
+
+         # Examples
+         gr.Markdown("## 📋 Usage Examples")
+
+         examples = gr.Examples(
+             examples=[
+                 [
+                     "Describe today's weather in New York",
+                     """{
+   "type": "object",
+   "properties": {
+     "temperature": {"type": "number"},
+     "description": {"type": "string"},
+     "humidity": {"type": "number"}
+   }
+ }""",
+                     None
+                 ],
+                 [
+                     "Create a Python learning plan for one month",
+                     """{
+   "type": "object",
+   "properties": {
+     "weeks": {
+       "type": "array",
+       "items": {
+         "type": "object",
+         "properties": {
+           "week_number": {"type": "integer"},
+           "topics": {"type": "array", "items": {"type": "string"}},
+           "practice_hours": {"type": "number"}
+         }
+       }
+     },
+     "total_hours": {"type": "number"}
+   }
+ }""",
+                     None
+                 ],
+                 [
+                     "Analyze this business proposal and extract key metrics",
+                     """{
+   "type": "object",
+   "properties": {
+     "feasibility_score": {"type": "number", "minimum": 0, "maximum": 10},
+     "risk_factors": {"type": "array", "items": {"type": "string"}},
+     "investment_required": {"type": "number"},
+     "expected_roi": {"type": "number"},
+     "timeline_months": {"type": "integer"}
+   },
+   "required": ["feasibility_score", "risk_factors"]
+ }""",
+                     None
+                 ]
+             ],
+             inputs=[prompt_input, schema_input, image_input]
+         )
+
+         # Model information
+         gr.Markdown(f"""
+ ## ℹ️ Model Information
+
+ - **Model**: {Config.MODEL_REPO}/{Config.MODEL_FILENAME}
+ - **Local path**: {Config.MODEL_PATH}
+ - **Context window**: {Config.N_CTX} tokens
+ - **Batch size**: {Config.N_BATCH}
+ - **GPU layers**: {Config.N_GPU_LAYERS if Config.N_GPU_LAYERS >= 0 else "All (GPU accelerated)"}
+ - **CPU threads**: {Config.N_THREADS}
+ - **Maximum response length**: {Config.MAX_NEW_TOKENS} tokens
+ - **Temperature**: {Config.TEMPERATURE}
+ - **Memory lock**: {"Enabled" if Config.USE_MLOCK else "Disabled"}
+ - **Memory mapping**: {"Enabled" if Config.USE_MMAP else "Disabled"}
+ - **GPU Acceleration**: Enabled via @spaces.GPU decorator (120 seconds duration)
+
+ 💡 **Tips**:
+ - Use clear and specific JSON schemas for better results
+ - The model is optimized for GPU acceleration on Hugging Face Spaces
+ - Structured output helps ensure consistent API responses
+ - GPU sessions are allocated for 120 seconds per request
+
+ 🎯 **Perfect for**: API response generation, data extraction, content analysis, and structured data creation
+ """)
+
+     return demo
+
+ if __name__ == "__main__":
+     # Create and launch Gradio interface for HF Spaces
+     demo = create_gradio_interface()
+     demo.launch(
+         server_name=Config.HOST,
+         server_port=Config.GRADIO_PORT,
+         share=False,
+         debug=False,  # disabled for production
+         show_error=True
+     )
config.py ADDED
@@ -0,0 +1,62 @@
+ import os
+ from typing import Optional
+
+ def _load_env_file(env_file: str = ".env") -> None:
+     """Load KEY=VALUE pairs from a .env file into os.environ.
+
+     Called below BEFORE the Config class body runs, so the os.getenv()
+     defaults in Config pick up values from .env. Simple parser: comments
+     and blank lines are skipped; quotes around values are kept verbatim.
+     """
+     if os.path.exists(env_file):
+         with open(env_file, 'r') as f:
+             for line in f:
+                 line = line.strip()
+                 if line and not line.startswith('#') and '=' in line:
+                     key, value = line.split('=', 1)
+                     os.environ[key.strip()] = value.strip()
+
+ # Automatically load from .env on import, before Config evaluates its defaults
+ _load_env_file()
+
+ class Config:
+     """Application configuration for Hugging Face Spaces with GPU support"""
+
+     # Model settings - optimized for HF Spaces with GPU
+     MODEL_REPO: str = os.getenv("MODEL_REPO", "lmstudio-community/gemma-3n-E4B-it-text-GGUF")
+     MODEL_FILENAME: str = os.getenv("MODEL_FILENAME", "gemma-3n-E4B-it-Q8_0.gguf")
+     MODEL_PATH: str = os.getenv("MODEL_PATH", "./models/gemma-3n-E4B-it-Q8_0.gguf")
+     HUGGINGFACE_TOKEN: str = os.getenv("HUGGINGFACE_TOKEN", "")
+
+     # Model loading settings - optimized for HF Spaces GPU
+     N_CTX: int = int(os.getenv("N_CTX", "8192"))  # larger context for GPU
+     N_GPU_LAYERS: int = int(os.getenv("N_GPU_LAYERS", "-1"))  # use all GPU layers
+     N_THREADS: int = int(os.getenv("N_THREADS", "8"))  # more threads for HF GPU
+     N_BATCH: int = int(os.getenv("N_BATCH", "1024"))  # larger batch for GPU
+     USE_MLOCK: bool = os.getenv("USE_MLOCK", "false").lower() == "true"  # keep disabled
+     USE_MMAP: bool = os.getenv("USE_MMAP", "true").lower() == "true"  # keep memory mapping
+     F16_KV: bool = os.getenv("F16_KV", "true").lower() == "true"  # use 16-bit keys and values
+     SEED: int = int(os.getenv("SEED", "42"))  # random seed for reproducibility
+
+     # Server settings - HF Spaces compatible
+     HOST: str = os.getenv("HOST", "0.0.0.0")
+     GRADIO_PORT: int = int(os.getenv("GRADIO_PORT", "7860"))  # standard Hugging Face Spaces port
+
+     # Generation settings - optimized for GPU performance
+     MAX_NEW_TOKENS: int = int(os.getenv("MAX_NEW_TOKENS", "512"))  # increased for GPU
+     TEMPERATURE: float = float(os.getenv("TEMPERATURE", "0.1"))
+
+     # File upload settings
+     MAX_FILE_SIZE: int = int(os.getenv("MAX_FILE_SIZE", "10485760"))  # 10 MB
+     ALLOWED_IMAGE_EXTENSIONS: set = {".jpg", ".jpeg", ".png", ".gif", ".bmp", ".webp"}
+
+     @classmethod
+     def is_model_available(cls) -> bool:
+         """Check if local model file exists"""
+         return os.path.exists(cls.MODEL_PATH)
+
+     @classmethod
+     def get_model_path(cls) -> str:
+         """Get absolute path to model file"""
+         return os.path.abspath(cls.MODEL_PATH)
+
+     @classmethod
+     def get_models_dir(cls) -> str:
+         """Get models directory path"""
+         return os.path.dirname(cls.MODEL_PATH)
+
+     @classmethod
+     def load_from_env_file(cls, env_file: str = ".env") -> None:
+         """Reload a .env file (kept for compatibility).
+
+         Note: the class attributes above are evaluated once at import time,
+         so calling this later only affects subsequent os.getenv() lookups.
+         """
+         _load_env_file(env_file)
requirements.txt ADDED
@@ -0,0 +1,18 @@
+ # Core dependencies for Hugging Face Spaces with GPU support
+ huggingface_hub==0.25.2
+ spaces
+
+ # GPU-optimized llama-cpp-python (prebuilt CUDA 12.4 wheel for Python 3.10)
+ # llama-cpp-python>=0.3.4
+ https://github.com/abetlen/llama-cpp-python/releases/download/v0.3.16-cu124/llama_cpp_python-0.3.16-cp310-cp310-linux_x86_64.whl
+
+ # Web interface
+ gradio==4.44.1
+
+ # Data processing
+ pillow>=9.0.0,<11.0.0
+ pydantic==2.10.6
+ numpy>=1.24.0,<2.0.0
+
+ # HTTP requests
+ requests>=2.28.0