# NOTE(review): the three lines below were Hugging Face Spaces page residue
# ("Spaces: Sleeping") picked up by a scrape — kept as a comment so the file
# remains valid Python.
| """ | |
| Configuration file for easy parameter tuning | |
| This file contains all adjustable parameters for the AI backend | |
| """ | |
# ====== MODEL CONFIGURATION ======
# Model names and paths.
# Maps each task to its Hugging Face model id, loader family
# ("causal-lm" vs "seq2seq"), and approximate parameter count.
MODELS = {
    "chat": {
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        # PEP 515 thousands grouping (was 1100_000_000 — same value, 1.1B).
        "params": 1_100_000_000,
    },
    "code": {
        # Reuses the chat model; see ALTERNATIVE_MODELS for other options.
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        "params": 1_100_000_000,
    },
    "summarization": {
        "name": "google/flan-t5-small",
        "type": "seq2seq",
        "params": 170_000_000,
    },
}
# ====== GENERATION PARAMETERS ======
# Per-task sampling and length settings. Hard limits keep CPU-only
# inference responsive; "default_*" values are what the UI pre-selects.
GENERATION_CONFIG = {
    "chat": {
        "max_tokens_limit": 200,     # hard cap for CPU stability
        "default_max_tokens": 150,   # initial value offered to the user
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
    },
    "code": {
        "max_tokens_limit": 300,
        "default_max_tokens": 256,
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.3,  # cooler sampling for more deterministic code
        "top_p": 0.95,
        "top_k": 40,
    },
    "summarization": {
        "max_length": 150,
        "min_length": 20,
        "max_input_length": 1000,    # truncate long inputs to avoid OOM
    },
    "image": {
        "max_width": 256,
        "max_height": 256,
        "default_width": 256,
        "default_height": 256,
        "min_width": 128,
        "min_height": 128,
    },
}
# ====== RESOURCE CONFIGURATION ======
# Hardware/runtime settings for a CPU-only deployment.
RESOURCE_CONFIG = {
    "device": "cpu",
    "cpu_threads": 4,            # torch thread count
    "torch_dtype": "float32",    # float32 on CPU (not bfloat16)
    "low_cpu_mem_usage": True,   # enable memory-friendly model loading
    "max_memory_percent": 85,    # stay under 85% of available RAM
}
# ====== PERFORMANCE CONFIGURATION ======
# Request queueing, timeouts, and memory-management knobs.
PERFORMANCE_CONFIG = {
    # Queue settings
    "queue_max_size": 10,        # maximum queued requests
    "concurrency_limit": 2,      # maximum simultaneous requests
    # Timeouts (seconds)
    "model_load_timeout": 120,   # allowed time to load a model
    "request_timeout": 60,       # allowed time per generation
    # Memory management
    "gc_collect_freq": 1,        # run garbage collection every N requests
    "model_unload_after": 300,   # evict models idle longer than N seconds
}
# ====== GRADIO CONFIGURATION ======
# Keyword arguments forwarded to the Gradio app/launch step.
GRADIO_CONFIG = {
    "server_name": "0.0.0.0",    # bind all interfaces (required on Spaces)
    "server_port": 7860,
    "share": False,
    "show_error": True,
    "analytics_enabled": False,
    "max_threads": 40,
}
# ====== UI CONFIGURATION ======
# Branding and presentation settings for the web UI.
UI_CONFIG = {
    "title": "🤖 Lightweight AI Backend",
    "description": "Multi-Model API for Hugging Face Free CPU Tier",
    "theme": "soft",             # options: default, soft, monochrome, glass
    "show_api_docs": True,
}
# ====== LOGGING CONFIGURATION ======
# Where and how verbosely the app logs.
LOGGING_CONFIG = {
    "log_level": "INFO",         # one of: DEBUG, INFO, WARNING, ERROR
    "log_to_file": False,        # stdout only by default
    "log_file": "app.log",       # used only when log_to_file is True
}
# ====== ADVANCED OPTIONS ======
# Experimental switches and error-handling policy.
ADVANCED_CONFIG = {
    # Model quantization (experimental)
    "use_quantization": False,       # not recommended for CPU
    # Model compilation (experimental)
    "use_torch_compile": False,      # not stable on CPU yet
    # Cache settings
    "cache_embeddings": False,       # cache model outputs (memory tradeoff)
    "enable_attention_cache": True,  # enable KV cache
    # Error handling
    "max_retries": 2,
    "retry_delay": 1,                # seconds between retries
}
# ====== ALTERNATIVE MODELS (For experimentation) ======
# Candidate model ids per task for manual swapping into MODELS.
ALTERNATIVE_MODELS = {
    "chat_alternatives": [
        "microsoft/phi-1",                       # requires > 4GB RAM
        "stabilityai/stablelm-zephyr-3b",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",    # RECOMMENDED
    ],
    "code_alternatives": [
        "Salesforce/codet5-small",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",    # RECOMMENDED
    ],
    "summarization_alternatives": [
        "google/flan-t5-small",                  # RECOMMENDED (170M)
        "google/flan-t5-base",                   # larger (250M)
        "facebook/bart-large-cnn",               # heavier (406M)
    ],
}
# ====== FEATURE FLAGS ======
# Toggle individual capabilities, API exposure, and UI tabs.
FEATURES = {
    "enable_chat": True,
    "enable_code": True,
    "enable_summarization": True,
    "enable_image_generation": True,
    # API exposure
    "expose_api": True,
    "api_auth_required": False,  # require auth for API calls
    # UI tabs
    "show_about_tab": True,
    "show_api_docs": True,
}
# ====== DEPLOYMENT NOTES ======
"""
OPTIMIZATION TIPS FOR FREE CPU TIER:

1. Token Limits:
   - Keep max_tokens_limit at 200-300 for stability
   - Process smaller batches (1-2 requests at a time)

2. Memory Management:
   - Queue handling prevents concurrent large loads
   - Models lazy-load only when needed
   - Garbage collection runs after each request

3. Model Selection:
   - TinyLlama: 1.1B params, good quality/speed tradeoff
   - FLAN-T5-small: 170M params, fastest summarization
   - Procedural images: no ML overhead

4. If the Space Crashes:
   - Reduce max_tokens limits further
   - Lower concurrency_limit to 1
   - Disable the image generation tab

5. Expected Performance:
   - First request: 8-15s (includes model loading)
   - Subsequent requests: 2-8s per request
   - Total RAM: ~1.5-2GB
   - Idle memory: ~500MB-1GB
"""