# guardrails-final / config.py
# Provenance (Hugging Face Spaces file header), kept as comments so this file parses:
#   commit b45369f by zazaman —
#   "Optimize translation: reduce max_tokens and context_size, add no-display-prompt flag"
# config.py
import os

# === API KEYS ===
# The Gemini key MUST come from the environment. For Hugging Face Spaces, set
# it in the Space settings under "Repository secrets".
# SECURITY: never hard-code a real key as the fallback here — this file is
# committed, so a literal key both leaks the secret and makes the
# missing-configuration warning below unreachable.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "")
if not GEMINI_API_KEY:
    print("⚠️ WARNING: GEMINI_API_KEY environment variable not set!")
    print(" Please set your Gemini API key in the environment variables.")
# === INPUT GUARDRAILS CONFIGURATION ===
# Prompt-injection detection handled by a locally hosted fine-tuned classifier.
AI_DETECTION_MODE = {
    "enabled": True,
    "attack_llm_provider": "finetuned_guard",
    "attack_llm_config": {
        # Personal fine-tuned model; runs locally, so no further settings are needed.
        "model_name": "zazaman/fmb",
    },
}
# === NON-ENGLISH TRANSLATION ===
# Prompts detected as non-English are translated to English with a small Qwen
# GGUF model (run through a pre-built llama.cpp binary) before being handed to
# the English-only ModernBERT classifier. The pre-quantized unsloth GGUF files
# need no bitsandbytes, so this also works on Hugging Face Spaces.
NON_ENGLISH_TRANSLATOR = {
    "enabled": True,
    "provider": "qwen_translator",
    "config": {
        # Pre-quantized GGUF weights published by unsloth. Other files in the
        # repo: Qwen3-0.6B-Q2_K.gguf (~200MB, lowest quality),
        # Qwen3-0.6B-Q2_K_L.gguf (~300MB, better than Q2_K),
        # Qwen3-0.6B-IQ4_NL.gguf (~300MB, better quality),
        # Qwen3-0.6B-BF16.gguf (full precision).
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        # IQ4_XS (~250MB) balances size and quality; Q4_K_M does not exist in
        # this repo.
        "model_file": "Qwen3-0.6B-IQ4_XS.gguf",
        # Sampling tuned for faithful translation — a low temperature keeps
        # output consistent and literal.
        "temperature": 0.3,
        "top_p": 0.9,
        "top_k": 40,
        # Translations are short, so a small generation budget and context
        # window keep inference fast.
        "max_tokens": 128,
        "context_size": 256,
        # 0 = auto-detect CPU threads (uses every available core).
        "n_threads": 0,
        # 0 = CPU only; with a GPU try ~20-35 layers depending on VRAM for a
        # large speedup.
        "n_gpu_layers": 0,
        # Prompt-processing batch size; smaller is faster for short prompts.
        "n_batch": 256,
    },
}
# === PERFORMANCE OPTIMIZATION SETTINGS ===
# Flags read elsewhere to trim memory usage and startup time.
PERFORMANCE_OPTIMIZATIONS = {
    "shared_models": True,     # share model instances instead of per-guard copies
    "lazy_loading": True,      # defer model loading until first use
    "disable_warnings": True,  # silence verbose library warnings
    "cpu_optimization": True,  # tune inference for CPU
}
# === LLM CONFIGURATION ===
# Active response-generating backend. One of: "gemini", "ollama", "lmstudio",
# "manual".
LLM_PROVIDER = "gemini"

# Per-provider settings; only the entry matching LLM_PROVIDER is used.
LLM_CONFIG = {
    # Hosted Gemini API. Extend with generation settings (temperature, top_p,
    # ...) as needed.
    "gemini": {"model": "gemini-2.5-flash"},
    # Local Ollama server; add further Ollama-specific settings here.
    "ollama": {"model": "llama3", "host": "http://localhost:11434"},
    # Local LM Studio server.
    "lmstudio": {
        "model": "qwen2.5-0.5b-instruct",
        "host": "http://localhost:1234",
        "temperature": 0.7,
        "max_tokens": 2000,
    },
    # Manual client for output testing — no extra settings required.
    "manual": {},
}
# === SYSTEM PROMPT ===
# The system prompt is used by all LLM providers that support it
# NOTE(review): SYSTEM_PROMPT is currently a blank placeholder (a single
# space), while the full customer-support prompt is parked in SYSTEM_PROMPTZ
# (note the trailing "Z"). This looks like a deliberate on/off toggle —
# confirm which variable the providers actually read before changing either.
SYSTEM_PROMPT = """ """
# Parked/inactive copy of the full Alfredo's Pizza Cafe support prompt.
SYSTEM_PROMPTZ = """You are a customer support chatbot for Alfredo's Pizza Cafe. Your responses should be based solely on the provided information.
Here are your instructions:
### Role and Behavior
- You are a friendly and helpful customer support representative for Alfredo's Pizza Cafe.
- Only answer questions related to Alfredo's Pizza Cafe's menu, account management on the website, delivery times, and other directly relevant topics.
- Do not discuss other pizza chains or restaurants.
- Do not answer questions about topics unrelated to Alfredo's Pizza Cafe or its services.
### Knowledge Limitations:
- Only use information provided in the knowledge base above.
- If a question cannot be answered using the information in the knowledge base, politely state that you don't have that information and offer to connect the user with a human representative.
- Do not make up or infer information that is not explicitly stated in the knowledge base.
### Communication Style:
- Be friendly, professional, and helpful.
- Use clear, concise language.
- Ask clarifying questions if needed to better assist the customer.
- Express empathy when appropriate (e.g., if a customer has a complaint).
### Limitations:
- You cannot make, modify, or cancel orders directly. Direct customers to the website or phone number for order management.
- You cannot process payments or access customer account information.
- For complex issues or complaints, offer to connect the customer with a human representative.
### Sample Responses:
- "Thank you for contacting Alfredo's Pizza Cafe! How can I help you today?"
- "I'd be happy to help you with information about our menu/delivery/website."
- "I don't have access to specific account information, but I can direct you to..."
- "For order modifications, please visit our website or call us directly at..."
- "I'd be glad to connect you with one of our team members who can help with that specific issue."
Remember: Stay in character as an Alfredo's Pizza Cafe representative, be helpful within your limitations, and always maintain a friendly, professional tone."""
# === OUTPUT GUARDRAILS CONFIGURATION ===
# Applied to the LLM's response AFTER generation, before it reaches the user.
OUTPUT_GUARDRAILS_CONFIG = {
    # PII scrubber: anonymizes the listed entity types found in model output.
    "pii_output_guard": {
        "enabled": True,
        "on_output": True,
        "anonymize_entities": [
            "PHONE_NUMBER",
            "EMAIL_ADDRESS",
            "CREDIT_CARD",
            "US_SSN",
            "IP_ADDRESS",
            "US_BANK_NUMBER",
            "IN_AADHAAR",
        ],
    },
    # Register additional output guardrails here as needed.
}
# === ATTACHMENT GUARDRAILS CONFIGURATION ===
# Uploaded files are screened by these guardrails before their content is
# forwarded to the LLM. Each file type gets its own chunking granularity,
# classifier confidence threshold, and size cap.
ATTACHMENT_GUARDRAILS_CONFIG = {
    # Plain-text uploads.
    "txt_guardrail": {
        "enabled": True,
        "chunk_size": 500,             # tokens per chunk
        "confidence_threshold": 0.75,
        "max_file_size_mb": 10,
    },
    # PDFs: larger chunks and a slightly stricter threshold.
    "pdf_guardrail": {
        "enabled": True,
        "chunk_size": 800,             # tokens per chunk
        "confidence_threshold": 0.80,
        "max_file_size_mb": 50,
    },
    # Word documents.
    "docx_guardrail": {
        "enabled": True,
        "chunk_size": 600,             # tokens per chunk
        "confidence_threshold": 0.75,
        "max_file_size_mb": 25,
    },
}