File size: 7,383 Bytes
b5386e2
 
 
a2e1879
b5386e2
a2e1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c26a471
a2e1879
 
 
 
 
 
 
 
 
 
 
 
 
 
b45369f
 
a2e1879
b45369f
a2e1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5386e2
a2e1879
b5386e2
 
 
 
 
 
 
 
 
 
 
a2e1879
 
 
 
 
 
 
 
 
b5386e2
 
a2e1879
 
 
 
b5386e2
 
 
 
 
 
 
 
 
 
 
 
 
 
a2e1879
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5386e2
 
a2e1879
 
 
 
 
 
 
 
 
 
 
 
 
b5386e2
a2e1879
b5386e2
a2e1879
 
 
b5386e2
a2e1879
 
 
 
 
 
b5386e2
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
# config.py
import os

# === API KEYS ===
# It's recommended to set your API key as an environment variable for security.
# For Hugging Face Spaces, set this in the Spaces settings under "Repository secrets"
# SECURITY FIX: a real API key was previously hard-coded here as the fallback
# value. A secret committed to source control must be treated as compromised —
# revoke and rotate it in the provider console. With no fallback, the warning
# below actually fires when the environment variable is missing (previously it
# was dead code, since the fallback made GEMINI_API_KEY always truthy).
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY")

if not GEMINI_API_KEY:
    print("⚠️  WARNING: GEMINI_API_KEY environment variable not set!")
    print("   Please set your Gemini API key in the environment variables.")

# === INPUT GUARDRAILS CONFIGURATION ===
# Prompt-injection detection on incoming user input, backed by a locally
# hosted fine-tuned classifier model.
AI_DETECTION_MODE = {
    "enabled": True,                           # master switch for input guarding
    "attack_llm_provider": "finetuned_guard",  # provider id resolved by the guard loader
    "attack_llm_config": {
        # Personal fine-tuned model; it runs locally, so no endpoint or
        # credential configuration is needed beyond the model name.
        "model_name": "zazaman/fmb",
    },
}

# === NON-ENGLISH TRANSLATION ===
# Prompts detected as non-English are first translated to English with a small
# Qwen model, and the translation is then handed to the ModernBERT classifier.
# This lets the English-only classifier cover multilingual inputs.
# Weights are pre-quantized GGUF files from unsloth served through a pre-built
# llama.cpp binary — no bitsandbytes needed, so it works on Hugging Face Spaces.
NON_ENGLISH_TRANSLATOR = {
    "enabled": True,
    # Translation client id: GGUF models via the pre-built llama.cpp binary.
    "provider": "qwen_translator",
    "config": {
        # Pre-quantized GGUF repository (unsloth).
        "repo_id": "unsloth/Qwen3-0.6B-GGUF",
        # Other files available in that repo, roughly by size/quality:
        #   Qwen3-0.6B-Q2_K.gguf    (~200MB, smallest, lowest quality)
        #   Qwen3-0.6B-Q2_K_L.gguf  (~300MB, better than Q2_K)
        #   Qwen3-0.6B-IQ4_XS.gguf  (~250MB, small with good quality)
        #   Qwen3-0.6B-IQ4_NL.gguf  (~300MB, medium with better quality)
        #   Qwen3-0.6B-BF16.gguf    (full precision)
        # IQ4_XS is the chosen size/quality balance (Q4_K_M is not in the repo).
        "model_file": "Qwen3-0.6B-IQ4_XS.gguf",
        # Sampling kept conservative so translations stay accurate and consistent.
        "temperature": 0.3,
        "top_p": 0.9,
        "top_k": 40,
        # Translations are short, so a small generation budget keeps latency low.
        "max_tokens": 128,
        # Small context window — translation does not need a large context.
        "context_size": 256,
        # CPU threads for inference; 0 = auto-detect (uses all available cores).
        "n_threads": 0,
        # GPU offload layers; 0 = CPU only. With a GPU, try 20-35 depending on VRAM.
        "n_gpu_layers": 0,
        # Prompt-processing batch size; smaller is faster for short prompts.
        "n_batch": 256,
    },
}

# === PERFORMANCE OPTIMIZATION SETTINGS ===
# Feature flags that trade flexibility for lower memory use and faster startup.
PERFORMANCE_OPTIMIZATIONS = {
    "shared_models": True,     # share model instances instead of duplicating them
    "lazy_loading": True,      # defer model loading until first use
    "disable_warnings": True,  # silence verbose library warnings
    "cpu_optimization": True,  # tune inference for CPU-only hosts
}

# === LLM CONFIGURATION ===
# Selects the active response-generation backend.
LLM_PROVIDER = "gemini"  # one of: "gemini", "ollama", "lmstudio", "manual"

# Per-provider settings; only the entry matching LLM_PROVIDER is used.
LLM_CONFIG = {
    "gemini": {
        # Other generation options (temperature, top_p, ...) may be added here.
        "model": "gemini-2.5-flash",
    },
    "ollama": {
        "model": "llama3",
        "host": "http://localhost:11434",
        # Further Ollama-specific settings go here.
    },
    "lmstudio": {
        "model": "qwen2.5-0.5b-instruct",
        "host": "http://localhost:1234",
        "temperature": 0.7,
        "max_tokens": 2000,
    },
    # Manual client for output testing — needs no configuration.
    "manual": {},
}

# === SYSTEM PROMPT ===
# The system prompt is used by all LLM providers that support it
# NOTE(review): SYSTEM_PROMPT is currently a single space, which effectively
# disables the system prompt, while the full production prompt sits unused in
# SYSTEM_PROMPTZ below. This looks like a testing/debugging leftover — confirm
# with the team whether SYSTEM_PROMPT should hold the SYSTEM_PROMPTZ text.
SYSTEM_PROMPT = """ """
# Full Alfredo's Pizza Cafe support prompt, parked under an alternate name.
# Presumably not read by any provider unless referenced elsewhere — verify
# against callers before deleting or renaming.
SYSTEM_PROMPTZ = """You are a customer support chatbot for Alfredo's Pizza Cafe. Your responses should be based solely on the provided information.

Here are your instructions:

### Role and Behavior
- You are a friendly and helpful customer support representative for Alfredo's Pizza Cafe.
- Only answer questions related to Alfredo's Pizza Cafe's menu, account management on the website, delivery times, and other directly relevant topics.
- Do not discuss other pizza chains or restaurants.
- Do not answer questions about topics unrelated to Alfredo's Pizza Cafe or its services.

### Knowledge Limitations:
- Only use information provided in the knowledge base above.
- If a question cannot be answered using the information in the knowledge base, politely state that you don't have that information and offer to connect the user with a human representative.
- Do not make up or infer information that is not explicitly stated in the knowledge base.

### Communication Style:
- Be friendly, professional, and helpful.
- Use clear, concise language.
- Ask clarifying questions if needed to better assist the customer.
- Express empathy when appropriate (e.g., if a customer has a complaint).

### Limitations:
- You cannot make, modify, or cancel orders directly. Direct customers to the website or phone number for order management.
- You cannot process payments or access customer account information.
- For complex issues or complaints, offer to connect the customer with a human representative.

### Sample Responses:
- "Thank you for contacting Alfredo's Pizza Cafe! How can I help you today?"
- "I'd be happy to help you with information about our menu/delivery/website."
- "I don't have access to specific account information, but I can direct you to..."
- "For order modifications, please visit our website or call us directly at..."
- "I'd be glad to connect you with one of our team members who can help with that specific issue."

Remember: Stay in character as an Alfredo's Pizza Cafe representative, be helpful within your limitations, and always maintain a friendly, professional tone."""

# === OUTPUT GUARDRAILS CONFIGURATION ===
# Applied to the LLM response AFTER generation, before it reaches the user.
OUTPUT_GUARDRAILS_CONFIG = {
    "pii_output_guard": {
        "enabled": True,
        "on_output": True,
        # PII entity labels to anonymize when found in model output.
        "anonymize_entities": [
            "PHONE_NUMBER",
            "EMAIL_ADDRESS",
            "CREDIT_CARD",
            "US_SSN",
            "IP_ADDRESS",
            "US_BANK_NUMBER",
            "IN_AADHAAR",
        ],
    },
    # Register additional output guardrails here as needed.
}

# === ATTACHMENT GUARDRAILS CONFIGURATION ===
# Screens uploaded files before their content is forwarded to the LLM.
# For each file type: chunk_size is in tokens, confidence_threshold is the
# score cutoff used by the guardrail's classifier (semantics defined by the
# guardrail implementation), and max_file_size_mb caps accepted uploads.
ATTACHMENT_GUARDRAILS_CONFIG = {
    "txt_guardrail": {
        "enabled": True,
        "chunk_size": 500,
        "confidence_threshold": 0.75,
        "max_file_size_mb": 10,
    },
    "pdf_guardrail": {
        "enabled": True,
        "chunk_size": 800,
        "confidence_threshold": 0.80,  # slightly stricter than plain text
        "max_file_size_mb": 50,
    },
    "docx_guardrail": {
        "enabled": True,
        "chunk_size": 600,
        "confidence_threshold": 0.75,
        "max_file_size_mb": 25,
    },
}