# AI Backend Deploy
# Deploy Lightweight AI Backend (2026-02-23 19:37:44)
# Revision: d39e477
"""
Configuration file for easy parameter tuning
This file contains all adjustable parameters for the AI backend
"""
# ====== MODEL CONFIGURATION ======
# Maps each task name to its Hugging Face model id, architecture family,
# and approximate parameter count (informational only).
MODELS = {
    "chat": {
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        # Normalized grouping: 1_100_000_000 == 1.1B (was 1100_000_000,
        # same value but misleading to read).
        "params": 1_100_000_000,
    },
    # Code generation reuses the chat model so only one causal LM is
    # ever resident in memory.
    "code": {
        "name": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
        "type": "causal-lm",
        "params": 1_100_000_000,
    },
    "summarization": {
        "name": "google/flan-t5-small",
        "type": "seq2seq",
        "params": 170_000_000,  # ~170M, fastest option
    },
}
# ====== GENERATION PARAMETERS ======
# Per-task generation settings. "chat" and "code" share the same schema
# (token limits + sampling knobs); "summarization" and "image" carry
# task-specific bounds instead.
GENERATION_CONFIG = {
    "chat": {
        "max_tokens_limit": 200,      # hard cap — keeps CPU inference stable
        "default_max_tokens": 150,    # value pre-filled for the user
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.7,
        "top_p": 0.9,
        "top_k": 50,
    },
    "code": {
        "max_tokens_limit": 300,
        "default_max_tokens": 256,
        "min_temperature": 0.1,
        "max_temperature": 1.0,
        "default_temperature": 0.3,   # cooler sampling for more deterministic code
        "top_p": 0.95,
        "top_k": 40,
    },
    "summarization": {
        "max_length": 150,
        "min_length": 20,
        "max_input_length": 1000,     # inputs are truncated here to avoid OOM
    },
    "image": {
        "max_width": 256,
        "max_height": 256,
        "default_width": 256,
        "default_height": 256,
        "min_width": 128,
        "min_height": 128,
    },
}
# ====== RESOURCE CONFIGURATION ======
# Hardware/runtime budget for model loading and inference.
RESOURCE_CONFIG = {
    "device": "cpu",
    "cpu_threads": 4,                 # torch thread count
    "torch_dtype": "float32",         # float32 on CPU (bfloat16 not used here)
    "low_cpu_mem_usage": True,        # lower peak RAM while loading weights
    "max_memory_percent": 85,         # stay under 85% of available RAM
}
# ====== PERFORMANCE CONFIGURATION ======
# Throughput and lifecycle knobs: request queueing, timeouts, and
# memory housekeeping between requests.
PERFORMANCE_CONFIG = {
    # -- queue --
    "queue_max_size": 10,        # requests held before rejecting new ones
    "concurrency_limit": 2,      # requests processed in parallel
    # -- timeouts (seconds) --
    "model_load_timeout": 120,
    "request_timeout": 60,
    # -- memory housekeeping --
    "gc_collect_freq": 1,        # run GC after every request
    "model_unload_after": 300,   # evict models idle longer than this (seconds)
}
# ====== GRADIO CONFIGURATION ======
# Keyword arguments intended for the Gradio server launch.
GRADIO_CONFIG = {
    "server_name": "0.0.0.0",    # bind all interfaces (required for hosted Spaces)
    "server_port": 7860,
    "share": False,
    "show_error": True,          # surface tracebacks in the UI
    "analytics_enabled": False,
    "max_threads": 40,
}
# ====== UI CONFIGURATION ======
# Front-end presentation settings.
UI_CONFIG = {
    "title": "🤖 Lightweight AI Backend",
    "description": "Multi-Model API for Hugging Face Free CPU Tier",
    "theme": "soft",             # one of: default, soft, monochrome, glass
    # NOTE(review): "show_api_docs" also appears in FEATURES below —
    # confirm which one the app actually reads.
    "show_api_docs": True,
}
# ====== LOGGING CONFIGURATION ======
LOGGING_CONFIG = {
    "log_level": "INFO",         # DEBUG | INFO | WARNING | ERROR
    "log_to_file": False,        # when True, also write to "log_file"
    "log_file": "app.log",
}
# ====== ADVANCED OPTIONS ======
# Experimental switches and error-handling policy. All experimental
# options default off; they are not recommended on the CPU tier.
ADVANCED_CONFIG = {
    "use_quantization": False,        # experimental; not recommended on CPU
    "use_torch_compile": False,       # experimental; not stable on CPU yet
    "cache_embeddings": False,        # trades memory for repeated-output speed
    "enable_attention_cache": True,   # KV cache during generation
    # retry policy
    "max_retries": 2,
    "retry_delay": 1,                 # seconds between attempts
}
# ====== ALTERNATIVE MODELS (For experimentation) ======
# Drop-in candidates per task; swap into MODELS to try them.
ALTERNATIVE_MODELS = {
    "chat_alternatives": [
        "microsoft/phi-1",                        # needs > 4GB RAM
        "stabilityai/stablelm-zephyr-3b",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",     # RECOMMENDED
    ],
    "code_alternatives": [
        "Salesforce/codet5-small",
        "TinyLlama/TinyLlama-1.1B-Chat-v1.0",     # RECOMMENDED
    ],
    "summarization_alternatives": [
        "google/flan-t5-small",                   # RECOMMENDED (170M)
        "google/flan-t5-base",                    # larger (250M)
        "facebook/bart-large-cnn",                # heavier (406M)
    ],
}
# ====== FEATURE FLAGS ======
FEATURES = {
"enable_chat": True,
"enable_code": True,
"enable_summarization": True,
"enable_image_generation": True,
# API exposure
"expose_api": True,
"api_auth_required": False, # Require auth for API calls
# UI tabs
"show_about_tab": True,
"show_api_docs": True,
}
# ====== DEPLOYMENT NOTES ======
"""
OPTIMIZATION TIPS FOR FREE CPU TIER:
1. Token Limits:
- Keep max_tokens_limit at 200-300 for stability
- Process smaller batches (1-2 requests at a time)
2. Memory Management:
- Queue handling prevents concurrent large loads
- Models lazy-load only when needed
- Garbage collection runs after each request
3. Model Selection:
- TinyLlama: 1.1B params, good quality/speed tradeoff
- FLAN-T5-small: 170M params, fastest summarization
- Procedural images: No ML overhead
4. If Space Crashes:
- Reduce max_tokens limits further
- Lower concurrency_limit to 1
- Disable image generation tab
5. Expected Performance:
- First request: 8-15s (includes model loading)
- Subsequent requests: 2-8s per request
- Total RAM: ~1.5-2GB
- Idle memory: ~500MB-1GB
"""