# config.py import os # === API KEYS === # It's recommended to set your API key as an environment variable for security. # For Hugging Face Spaces, set this in the Spaces settings under "Repository secrets" GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "AIzaSyB3nxr2n_eI-2aXQUm__5OU5sM2pH9cc0k") if not GEMINI_API_KEY: print("⚠️ WARNING: GEMINI_API_KEY environment variable not set!") print(" Please set your Gemini API key in the environment variables.") # === INPUT GUARDRAILS CONFIGURATION === # Fine-tuned model for input guardrails (prompt injection detection) AI_DETECTION_MODE = { "enabled": True, "attack_llm_provider": "finetuned_guard", "attack_llm_config": { "model_name": "zazaman/fmb", # Your personal finetuned model # Fine-tuned model runs locally - no additional configuration needed } } # === NON-ENGLISH TRANSLATION === # When a prompt is detected as non-English, translate it to English using Qwen, # then pass the translated text to the ModernBERT classifier. # This allows the English-only classifier to work with multilingual inputs. # Uses pre-quantized GGUF models from unsloth - no bitsandbytes needed. Works on Hugging Face Spaces. NON_ENGLISH_TRANSLATOR = { "enabled": True, "provider": "qwen_translator", # Translation client using GGUF models via pre-built llama.cpp binary "config": { # GGUF model repository and file from unsloth (pre-quantized) "repo_id": "unsloth/Qwen3-0.6B-GGUF", # Available files in the repo: Qwen3-0.6B-Q2_K.gguf, Qwen3-0.6B-Q2_K_L.gguf, # Qwen3-0.6B-IQ4_XS.gguf, Qwen3-0.6B-IQ4_NL.gguf, Qwen3-0.6B-BF16.gguf # Q2_K is smallest (~200MB) but lower quality # IQ4_XS is small (~250MB) with good quality # IQ4_NL is medium (~300MB) with better quality # Q2_K_L is larger (~300MB) with better quality than Q2_K "model_file": "Qwen3-0.6B-IQ4_XS.gguf", # Good balance of size and quality (Q4_K_M doesn't exist in repo) # Inference options tuned for accurate translation "temperature": 0.3, # Lower temperature for more accurate, consistent translations "top_p": 0.9, "top_k": 40, # Max tokens to generate (reduced for faster inference - translation is usually short) "max_tokens": 128, # Context window size (reduced for faster inference - translation doesn't need large context) "context_size": 256, # CPU threads for inference (use more threads for faster inference) # Set to 0 to auto-detect, or specify number of CPU cores "n_threads": 0, # 0 = auto-detect (uses all available cores) # GPU layers (0 = CPU only, set to >0 if GPU available for much faster inference) # For GPU: try 20-35 layers depending on VRAM "n_gpu_layers": 0, # Batch size for prompt processing (smaller = faster for short prompts) "n_batch": 256 } } # === PERFORMANCE OPTIMIZATION SETTINGS === # These settings help reduce memory usage and startup time PERFORMANCE_OPTIMIZATIONS = { "shared_models": True, # Use shared model instances to reduce memory "lazy_loading": True, # Load models only when needed "disable_warnings": True, # Disable verbose library warnings "cpu_optimization": True, # Optimize for CPU inference } # === LLM CONFIGURATION === # Choose which LLM provider to use LLM_PROVIDER = "gemini" # Can be "gemini", "ollama", "lmstudio", "manual" LLM_CONFIG = { "gemini": { "model": "gemini-2.5-flash", # You can add other generation settings here, e.g., temperature, top_p }, "ollama": { "model": "llama3", "host": "http://localhost:11434", # Add other Ollama-specific settings here }, "lmstudio": { "model": "qwen2.5-0.5b-instruct", "host": "http://localhost:1234", "temperature": 0.7, "max_tokens": 2000, }, "manual": { # Manual client for output testing - no additional config needed }, } # === SYSTEM PROMPT === # The system prompt is used by all LLM providers that support it SYSTEM_PROMPT = """ """ SYSTEM_PROMPTZ = """You are a customer support chatbot for Alfredo's Pizza Cafe. Your responses should be based solely on the provided information. Here are your instructions: ### Role and Behavior - You are a friendly and helpful customer support representative for Alfredo's Pizza Cafe. - Only answer questions related to Alfredo's Pizza Cafe's menu, account management on the website, delivery times, and other directly relevant topics. - Do not discuss other pizza chains or restaurants. - Do not answer questions about topics unrelated to Alfredo's Pizza Cafe or its services. ### Knowledge Limitations: - Only use information provided in the knowledge base above. - If a question cannot be answered using the information in the knowledge base, politely state that you don't have that information and offer to connect the user with a human representative. - Do not make up or infer information that is not explicitly stated in the knowledge base. ### Communication Style: - Be friendly, professional, and helpful. - Use clear, concise language. - Ask clarifying questions if needed to better assist the customer. - Express empathy when appropriate (e.g., if a customer has a complaint). ### Limitations: - You cannot make, modify, or cancel orders directly. Direct customers to the website or phone number for order management. - You cannot process payments or access customer account information. - For complex issues or complaints, offer to connect the customer with a human representative. ### Sample Responses: - "Thank you for contacting Alfredo's Pizza Cafe! How can I help you today?" - "I'd be happy to help you with information about our menu/delivery/website." - "I don't have access to specific account information, but I can direct you to..." - "For order modifications, please visit our website or call us directly at..." - "I'd be glad to connect you with one of our team members who can help with that specific issue." Remember: Stay in character as an Alfredo's Pizza Cafe representative, be helpful within your limitations, and always maintain a friendly, professional tone.""" # === OUTPUT GUARDRAILS CONFIGURATION === # These are processed AFTER the LLM generates a response OUTPUT_GUARDRAILS_CONFIG = { "pii_output_guard": { "enabled": True, "on_output": True, "anonymize_entities": ["PHONE_NUMBER", "EMAIL_ADDRESS", "CREDIT_CARD", "US_SSN", "IP_ADDRESS", "US_BANK_NUMBER", "IN_AADHAAR"] } # Add other output guardrails here as needed } # === ATTACHMENT GUARDRAILS CONFIGURATION === # These process uploaded files before they're sent to the LLM ATTACHMENT_GUARDRAILS_CONFIG = { "txt_guardrail": { "enabled": True, "chunk_size": 500, # tokens per chunk "confidence_threshold": 0.75, "max_file_size_mb": 10, }, "pdf_guardrail": { "enabled": True, "chunk_size": 800, # tokens per chunk for PDFs "confidence_threshold": 0.80, # Slightly higher threshold for PDFs "max_file_size_mb": 50, }, "docx_guardrail": { "enabled": True, "chunk_size": 600, # tokens per chunk for Word docs "confidence_threshold": 0.75, "max_file_size_mb": 25, } }