# AI Backend Deploy
# Deploy Lightweight AI Backend (2026-02-23 19:37:44)
# Revision: d39e477
"""
Configuration file for easy parameter tuning
This file contains all adjustable parameters for the AI backend
"""
# ====== MODEL CONFIGURATION ======
# Maps each task name to its Hugging Face model id, architecture family,
# and approximate parameter count (informational only).
MODELS = {
    "chat": {
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        # Normalized grouping: 1_100_000_000 == 1.1B (was 1100_000_000,
        # same value but misleading to read).
        "params": 1_100_000_000,
    },
    # Code generation reuses the chat model so only one causal LM is
    # ever resident in memory.
    "code": {
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        "params": 1_100_000_000,
    },
    "summarization": {
        "name": "google/flan-t5-small",
        "type": "seq2seq",
        "params": 170_000_000,  # ~170M, fastest option
    },
}
# ====== GENERATION PARAMETERS ======
# Per-task generation settings. "chat" and "code" share the same schema
# (token limits + sampling knobs); "summarization" and "image" carry
# task-specific bounds instead.
GENERATION_CONFIG = {
    "chat": {
        "max_tokens_limit": 200,      # hard cap — keeps CPU inference stable
        "default_max_tokens": 150,    # value pre-filled for the user
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
    },
    "code": {
        "max_tokens_limit": 300,
        "default_max_tokens": 256,
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.3,   # cooler sampling for more deterministic code
        "top_p": 0.95,
        "top_k": 40,
    },
    "summarization": {
        "max_length": 150,
        "min_length": 20,
        "max_input_length": 1000,     # inputs are truncated here to avoid OOM
    },
    "image": {
        "max_width": 256,
        "max_height": 256,
        "default_width": 256,
        "default_height": 256,
        "min_width": 128,
        "min_height": 128,
    },
}
# ====== RESOURCE CONFIGURATION ======
# Hardware/runtime budget for model loading and inference.
RESOURCE_CONFIG = {
    "device": "cpu",
    "cpu_threads": 4,                 # torch thread count
    "torch_dtype": "float32",         # float32 on CPU (bfloat16 not used here)
    "low_cpu_mem_usage": True,        # lower peak RAM while loading weights
    "max_memory_percent": 85,         # stay under 85% of available RAM
}
# ====== PERFORMANCE CONFIGURATION ======
# Throughput and lifecycle knobs: request queueing, timeouts, and
# memory housekeeping between requests.
PERFORMANCE_CONFIG = {
    # -- queue --
    "queue_max_size": 10,        # requests held before rejecting new ones
    "concurrency_limit": 2,      # requests processed in parallel
    # -- timeouts (seconds) --
    "model_load_timeout": 120,
    "request_timeout": 60,
    # -- memory housekeeping --
    "gc_collect_freq": 1,        # run GC after every request
    "model_unload_after": 300,   # evict models idle longer than this (seconds)
}
# ====== GRADIO CONFIGURATION ======
# Keyword arguments intended for the Gradio server launch.
GRADIO_CONFIG = {
    "server_name": "0.0.0.0",    # bind all interfaces (required for hosted Spaces)
    "server_port": 7860,
    "share": False,
    "show_error": True,          # surface tracebacks in the UI
    "analytics_enabled": False,
    "max_threads": 40,
}
# ====== UI CONFIGURATION ======
# Front-end presentation settings.
UI_CONFIG = {
    "title": "🤖 Lightweight AI Backend",
    "description": "Multi-Model API for Hugging Face Free CPU Tier",
    "theme": "soft",             # one of: default, soft, monochrome, glass
    # NOTE(review): "show_api_docs" also appears in FEATURES below —
    # confirm which one the app actually reads.
    "show_api_docs": True,
}
# ====== LOGGING CONFIGURATION ======
LOGGING_CONFIG = {
    "log_level": "INFO",         # DEBUG | INFO | WARNING | ERROR
    "log_to_file": False,        # when True, also write to "log_file"
    "log_file": "app.log",
}
# ====== ADVANCED OPTIONS ======
# Experimental switches and error-handling policy. All experimental
# options default off; they are not recommended on the CPU tier.
ADVANCED_CONFIG = {
    "use_quantization": False,        # experimental; not recommended on CPU
    "use_torch_compile": False,       # experimental; not stable on CPU yet
    "cache_embeddings": False,        # trades memory for repeated-output speed
    "enable_attention_cache": True,   # KV cache during generation
    # retry policy
    "max_retries": 2,
    "retry_delay": 1,                 # seconds between attempts
}
# ====== ALTERNATIVE MODELS (For experimentation) ======
# Drop-in candidates per task; swap into MODELS to try them.
ALTERNATIVE_MODELS = {
    "chat_alternatives": [
        "microsoft/phi-1",                        # needs > 4GB RAM
        "stabilityai/stablelm-zephyr-3b",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",     # RECOMMENDED
    ],
    "code_alternatives": [
        "Salesforce/codet5-small",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",     # RECOMMENDED
    ],
    "summarization_alternatives": [
        "google/flan-t5-small",                   # RECOMMENDED (170M)
        "google/flan-t5-base",                    # larger (250M)
        "facebook/bart-large-cnn",                # heavier (406M)
    ],
}
# ====== FEATURE FLAGS ======
FEATURES = {
"enable_chat": True,
"enable_code": True,
"enable_summarization": True,
"enable_image_generation": True,
# API exposure
"expose_api": True,
"api_auth_required": False, # Require auth for API calls
# UI tabs
"show_about_tab": True,
"show_api_docs": True,
}
# ====== DEPLOYMENT NOTES ======
"""
OPTIMIZATION TIPS FOR FREE CPU TIER:
1. Token Limits:
- Keep max_tokens_limit at 200-300 for stability
- Process smaller batches (1-2 requests at a time)
2. Memory Management:
- Queue handling prevents concurrent large loads
- Models lazy-load only when needed
- Garbage collection runs after each request
3. Model Selection:
- TinyLlama: 1.1B params, good quality/speed tradeoff
- FLAN-T5-small: 170M params, fastest summarization
- Procedural images: No ML overhead
4. If Space Crashes:
- Reduce max_tokens limits further
- Lower concurrency_limit to 1
- Disable image generation tab
5. Expected Performance:
- First request: 8-15s (includes model loading)
- Subsequent requests: 2-8s per request
- Total RAM: ~1.5-2GB
- Idle memory: ~500MB-1GB
"""