Spaces:
Sleeping
Sleeping
Update system model to Gemma 3 1B Instruct and humanize responses
Browse files
- Dockerfile +3 -3
- Modelfile +1 -1
- config.py +1 -1
- llm/inference.py +9 -10
- llm/model_loader.py +2 -2
- rag/prompt_builder.py +16 -17
- rag/rag_pipeline.py +58 -20
Dockerfile
CHANGED
|
@@ -30,9 +30,9 @@ COPY requirements.txt .
|
|
| 30 |
RUN pip install --no-cache-dir --upgrade pip && \
|
| 31 |
pip install --no-cache-dir -r requirements.txt
|
| 32 |
|
| 33 |
-
# Download
|
| 34 |
-
RUN curl -L -o models/
|
| 35 |
-
"https://huggingface.co/
|
| 36 |
|
| 37 |
# Copy the rest of the application files
|
| 38 |
COPY --chown=user:user . .
|
|
|
|
| 30 |
RUN pip install --no-cache-dir --upgrade pip && \
|
| 31 |
pip install --no-cache-dir -r requirements.txt
|
| 32 |
|
| 33 |
+
# Download Gemma 3 1B Instruct GGUF model during build
|
| 34 |
+
RUN curl -L -o models/google_gemma-3-1b-it-Q4_K_M.gguf \
|
| 35 |
+
"https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF/resolve/main/google_gemma-3-1b-it-Q4_K_M.gguf"
|
| 36 |
|
| 37 |
# Copy the rest of the application files
|
| 38 |
COPY --chown=user:user . .
|
Modelfile
CHANGED
|
@@ -1 +1 @@
|
|
| 1 |
-
FROM ./models/
|
|
|
|
| 1 |
+
FROM ./models/google_gemma-3-1b-it-Q4_K_M.gguf
|
config.py
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
-
MODEL_PATH = "models/
|
| 4 |
VECTOR_DB_PATH = "vector_store/faiss_index"
|
| 5 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 6 |
CHUNK_SIZE = 500
|
|
|
|
| 1 |
import os
|
| 2 |
|
| 3 |
+
MODEL_PATH = "models/google_gemma-3-1b-it-Q4_K_M.gguf"
|
| 4 |
VECTOR_DB_PATH = "vector_store/faiss_index"
|
| 5 |
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
|
| 6 |
CHUNK_SIZE = 500
|
llm/inference.py
CHANGED
|
@@ -8,7 +8,7 @@ from llm.model_loader import get_llm
|
|
| 8 |
from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
|
| 9 |
|
| 10 |
OLLAMA_API_URL = "http://localhost:11434"
|
| 11 |
-
OLLAMA_MODEL_NAME = "
|
| 12 |
|
| 13 |
_llm_lock = threading.Lock()
|
| 14 |
_ollama_ready = False
|
|
@@ -118,7 +118,7 @@ def _generate_response_ollama(prompt: str, max_tokens: int = None) -> str:
|
|
| 118 |
"options": {
|
| 119 |
"num_predict": max_tokens or MAX_TOKENS,
|
| 120 |
"temperature": TEMPERATURE,
|
| 121 |
-
"stop": ["Question:", "<
|
| 122 |
}
|
| 123 |
}
|
| 124 |
|
|
@@ -146,7 +146,7 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
|
|
| 146 |
prompt,
|
| 147 |
max_tokens=max_tokens or MAX_TOKENS,
|
| 148 |
temperature=TEMPERATURE,
|
| 149 |
-
stop=["Question:", "<
|
| 150 |
)
|
| 151 |
text = output["choices"][0]["text"]
|
| 152 |
return text.strip()
|
|
@@ -158,8 +158,8 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
|
|
| 158 |
status_code=503,
|
| 159 |
detail=(
|
| 160 |
"LLM model file not found. "
|
| 161 |
-
"Download a GGUF model and place it at 'models/
|
| 162 |
-
"Recommended: https://huggingface.co/
|
| 163 |
)
|
| 164 |
)
|
| 165 |
|
|
@@ -317,11 +317,10 @@ def translate_to_english(text: str) -> str:
|
|
| 317 |
|
| 318 |
# 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
|
| 319 |
prompt = (
|
| 320 |
-
"<
|
| 321 |
-
"Translate Hinglish/Hindi to English. Reply ONLY with translation.\n"
|
| 322 |
-
"<
|
| 323 |
-
|
| 324 |
-
"<|im_start|>assistant\n"
|
| 325 |
)
|
| 326 |
try:
|
| 327 |
# Limit translation to 40 tokens since a single query is very short
|
|
|
|
| 8 |
from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
|
| 9 |
|
| 10 |
OLLAMA_API_URL = "http://localhost:11434"
|
| 11 |
+
OLLAMA_MODEL_NAME = "gemma3-local"
|
| 12 |
|
| 13 |
_llm_lock = threading.Lock()
|
| 14 |
_ollama_ready = False
|
|
|
|
| 118 |
"options": {
|
| 119 |
"num_predict": max_tokens or MAX_TOKENS,
|
| 120 |
"temperature": TEMPERATURE,
|
| 121 |
+
"stop": ["Question:", "<end_of_turn>", "<eos>"]
|
| 122 |
}
|
| 123 |
}
|
| 124 |
|
|
|
|
| 146 |
prompt,
|
| 147 |
max_tokens=max_tokens or MAX_TOKENS,
|
| 148 |
temperature=TEMPERATURE,
|
| 149 |
+
stop=["Question:", "<end_of_turn>", "<eos>"]
|
| 150 |
)
|
| 151 |
text = output["choices"][0]["text"]
|
| 152 |
return text.strip()
|
|
|
|
| 158 |
status_code=503,
|
| 159 |
detail=(
|
| 160 |
"LLM model file not found. "
|
| 161 |
+
"Download a GGUF model and place it at 'models/google_gemma-3-1b-it-Q4_K_M.gguf'.\n"
|
| 162 |
+
"Recommended: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF"
|
| 163 |
)
|
| 164 |
)
|
| 165 |
|
|
|
|
| 317 |
|
| 318 |
# 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
|
| 319 |
prompt = (
|
| 320 |
+
"<start_of_turn>user\n"
|
| 321 |
+
"Translate Hinglish/Hindi to English. Reply ONLY with translation.\n\n"
|
| 322 |
+
f"{text}<end_of_turn>\n"
|
| 323 |
+
"<start_of_turn>model\n"
|
|
|
|
| 324 |
)
|
| 325 |
try:
|
| 326 |
# Limit translation to 40 tokens since a single query is very short
|
llm/model_loader.py
CHANGED
|
@@ -31,8 +31,8 @@ def get_llm() -> Llama:
|
|
| 31 |
raise FileNotFoundError(
|
| 32 |
f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
|
| 33 |
f" Download a GGUF model and place it at: {MODEL_PATH}\n"
|
| 34 |
-
f" Recommended:
|
| 35 |
-
f" URL: https://huggingface.co/
|
| 36 |
)
|
| 37 |
try:
|
| 38 |
cpu_count = os.cpu_count()
|
|
|
|
| 31 |
raise FileNotFoundError(
|
| 32 |
f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
|
| 33 |
f" Download a GGUF model and place it at: {MODEL_PATH}\n"
|
| 34 |
+
f" Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
|
| 35 |
+
f" URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
|
| 36 |
)
|
| 37 |
try:
|
| 38 |
cpu_count = os.cpu_count()
|
rag/prompt_builder.py
CHANGED
|
@@ -52,33 +52,32 @@ def build_prompt(context: str, question: str, language: str = "english"):
|
|
| 52 |
|
| 53 |
if language == "hindi":
|
| 54 |
system_prompt = (
|
| 55 |
-
"आप एक स
|
| 56 |
-
"केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
|
| 57 |
"यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
|
| 58 |
)
|
| 59 |
elif language == "hinglish":
|
| 60 |
system_prompt = (
|
| 61 |
-
"You are a helpful AI assistant. Niche diye gaye context
|
| 62 |
-
"Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
|
| 63 |
-
"e.g., 'Nervous candidate ko handle karne ke liye relax karne ko bolein.'\n"
|
| 64 |
"Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
|
| 65 |
)
|
| 66 |
else:
|
| 67 |
system_prompt = (
|
| 68 |
-
"You are a helpful AI assistant. Answer the user's question
|
| 69 |
-
"
|
| 70 |
"If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
|
| 71 |
)
|
| 72 |
|
| 73 |
-
prompt = f"""<
|
| 74 |
-
{system_prompt}
|
| 75 |
-
|
| 76 |
Context:
|
| 77 |
{formatted_context}
|
| 78 |
|
| 79 |
Question:
|
| 80 |
-
{question}<
|
| 81 |
-
<
|
| 82 |
Answer: """
|
| 83 |
return prompt
|
| 84 |
|
|
@@ -96,10 +95,10 @@ def build_greeting_prompt(question: str, language: str = "english"):
|
|
| 96 |
f"{lang_instruction}"
|
| 97 |
)
|
| 98 |
|
| 99 |
-
prompt = f"""<
|
| 100 |
-
{system_prompt}
|
| 101 |
-
|
| 102 |
-
{question}<
|
| 103 |
-
<
|
| 104 |
"""
|
| 105 |
return prompt
|
|
|
|
| 52 |
|
| 53 |
if language == "hindi":
|
| 54 |
system_prompt = (
|
| 55 |
+
"आप एक मददगार और मिलनसार एआई सहायक हैं। नीचे दिए गए संदर्भ (context) के आधार पर उपयोगकर्ता के प्रश्न का उत्तर एक गर्मजोशी भरे, प्राकृतिक और बातचीत के लहजे में दें (3 वाक्यों से कम)।\n"
|
| 56 |
+
"कच्चे पाठ को कॉपी-पेस्ट करने के बजाय उसे स्वाभाविक रूप से समझाएं। केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
|
| 57 |
"यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
|
| 58 |
)
|
| 59 |
elif language == "hinglish":
|
| 60 |
system_prompt = (
|
| 61 |
+
"You are a helpful and friendly AI assistant. Niche diye gaye context ke base par user ke question ka answer ek natural, warm and conversational tone me do (under 3 sentences).\n"
|
| 62 |
+
"Raw text ko copy-paste karne ke bajaye natural language me explain karo. Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
|
|
|
|
| 63 |
"Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
|
| 64 |
)
|
| 65 |
else:
|
| 66 |
system_prompt = (
|
| 67 |
+
"You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based on the XML qa blocks in the context. "
|
| 68 |
+
"Rephrase the information to sound humanized and conversational, rather than copy-pasting raw text. Keep your answer brief and under 3 sentences.\n"
|
| 69 |
"If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
|
| 70 |
)
|
| 71 |
|
| 72 |
+
prompt = f"""<start_of_turn>user
|
| 73 |
+
{system_prompt}
|
| 74 |
+
|
| 75 |
Context:
|
| 76 |
{formatted_context}
|
| 77 |
|
| 78 |
Question:
|
| 79 |
+
{question}<end_of_turn>
|
| 80 |
+
<start_of_turn>model
|
| 81 |
Answer: """
|
| 82 |
return prompt
|
| 83 |
|
|
|
|
| 95 |
f"{lang_instruction}"
|
| 96 |
)
|
| 97 |
|
| 98 |
+
prompt = f"""<start_of_turn>user
|
| 99 |
+
{system_prompt}
|
| 100 |
+
|
| 101 |
+
{question}<end_of_turn>
|
| 102 |
+
<start_of_turn>model
|
| 103 |
"""
|
| 104 |
return prompt
|
rag/rag_pipeline.py
CHANGED
|
@@ -192,29 +192,29 @@ def restructure_query(query: str, language: str) -> str:
|
|
| 192 |
"""
|
| 193 |
if language == "english":
|
| 194 |
prompt = (
|
| 195 |
-
"<
|
| 196 |
"You are a precise grammar correction and query restructuring tool. "
|
| 197 |
"Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
|
| 198 |
"Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
|
| 199 |
"Example:\n"
|
| 200 |
"Query: why company want to hire me?\n"
|
| 201 |
-
"Correction: Why should the company hire me?\n"
|
| 202 |
-
"<
|
| 203 |
-
|
| 204 |
-
"
|
| 205 |
)
|
| 206 |
else: # hindi
|
| 207 |
prompt = (
|
| 208 |
-
"<
|
| 209 |
"You are a precise grammar correction and query restructuring tool for Hindi. "
|
| 210 |
"Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
|
| 211 |
"Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
|
| 212 |
"Example:\n"
|
| 213 |
"Query: कंपनी hire क्यों करे मुझे?\n"
|
| 214 |
-
"Correction: कंपनी मुझे नौकरी क्यों दे?\n"
|
| 215 |
-
"<
|
| 216 |
-
|
| 217 |
-
"
|
| 218 |
)
|
| 219 |
try:
|
| 220 |
from llm.inference import generate_response
|
|
@@ -228,6 +228,45 @@ def restructure_query(query: str, language: str) -> str:
|
|
| 228 |
return query
|
| 229 |
|
| 230 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 231 |
def generate_rag_response(question: str):
|
| 232 |
# 1. Detect query language
|
| 233 |
lang = detect_language(question)
|
|
@@ -289,28 +328,27 @@ def generate_rag_response(question: str):
|
|
| 289 |
break
|
| 290 |
|
| 291 |
if matching_qa:
|
| 292 |
-
#
|
| 293 |
-
|
| 294 |
-
response = matching_qa["answer"]
|
| 295 |
else:
|
| 296 |
# Fall back to using the LLM with the formatted context XML
|
| 297 |
from rag.prompt_builder import format_context_as_xml
|
| 298 |
formatted_context = format_context_as_xml(context)
|
| 299 |
|
| 300 |
system_prompt = (
|
| 301 |
-
"You are a helpful AI assistant. Answer the user's question
|
| 302 |
-
"Keep your answer brief, factual, and under 3 sentences.
|
| 303 |
"If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
|
| 304 |
)
|
| 305 |
-
prompt = f"""<
|
| 306 |
-
{system_prompt}
|
| 307 |
-
|
| 308 |
Context:
|
| 309 |
{formatted_context}
|
| 310 |
|
| 311 |
Question:
|
| 312 |
-
{search_query}<
|
| 313 |
-
<
|
| 314 |
"""
|
| 315 |
response = generate_response(prompt)
|
| 316 |
response = _clean_response(response)
|
|
|
|
| 192 |
"""
|
| 193 |
if language == "english":
|
| 194 |
prompt = (
|
| 195 |
+
"<start_of_turn>user\n"
|
| 196 |
"You are a precise grammar correction and query restructuring tool. "
|
| 197 |
"Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
|
| 198 |
"Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
|
| 199 |
"Example:\n"
|
| 200 |
"Query: why company want to hire me?\n"
|
| 201 |
+
"Correction: Why should the company hire me?\n\n"
|
| 202 |
+
f"Query: {query}<end_of_turn>\n"
|
| 203 |
+
"<start_of_turn>model\n"
|
| 204 |
+
"Correction: "
|
| 205 |
)
|
| 206 |
else: # hindi
|
| 207 |
prompt = (
|
| 208 |
+
"<start_of_turn>user\n"
|
| 209 |
"You are a precise grammar correction and query restructuring tool for Hindi. "
|
| 210 |
"Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
|
| 211 |
"Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
|
| 212 |
"Example:\n"
|
| 213 |
"Query: कंपनी hire क्यों करे मुझे?\n"
|
| 214 |
+
"Correction: कंपनी मुझे नौकरी क्यों दे?\n\n"
|
| 215 |
+
f"Query: {query}<end_of_turn>\n"
|
| 216 |
+
"<start_of_turn>model\n"
|
| 217 |
+
"Correction: "
|
| 218 |
)
|
| 219 |
try:
|
| 220 |
from llm.inference import generate_response
|
|
|
|
| 228 |
return query
|
| 229 |
|
| 230 |
|
| 231 |
+
def humanize_raw_response(raw_answer: str, question: str, language: str) -> str:
|
| 232 |
+
"""
|
| 233 |
+
Rephrase a raw exact-match answer using the LLM to make it sound natural and conversational.
|
| 234 |
+
"""
|
| 235 |
+
if language == "hindi":
|
| 236 |
+
prompt = (
|
| 237 |
+
"<start_of_turn>user\n"
|
| 238 |
+
"आप एक मददगार और मिलनसार एआई सहायक हैं। निम्नलिखित प्रश्न और उसके कच्चे उत्तर (raw answer) को एक स्वाभाविक, गर्मजोशी भरे और मानवीय उत्तर में बदलें। उत्तर संक्षिप्त (1-2 वाक्य) होना चाहिए।\n"
|
| 239 |
+
"केवल हिंदी (Devanagari script) में ही उत्तर दें।\n\n"
|
| 240 |
+
f"प्रश्न: {question}\n"
|
| 241 |
+
f"कच्चा उत्तर: {raw_answer}<end_of_turn>\n"
|
| 242 |
+
"<start_of_turn>model\n"
|
| 243 |
+
)
|
| 244 |
+
elif language == "hinglish":
|
| 245 |
+
prompt = (
|
| 246 |
+
"<start_of_turn>user\n"
|
| 247 |
+
"You are a helpful and friendly AI assistant. Rephrase the following raw answer to sound natural, warm, and conversational in Hinglish based on the user's question. Keep it concise (1-2 sentences).\n"
|
| 248 |
+
"Answer ONLY in Hinglish (Hindi language written in English/Latin script).\n\n"
|
| 249 |
+
f"Question: {question}\n"
|
| 250 |
+
f"Raw Answer: {raw_answer}<end_of_turn>\n"
|
| 251 |
+
"<start_of_turn>model\n"
|
| 252 |
+
)
|
| 253 |
+
else: # english
|
| 254 |
+
prompt = (
|
| 255 |
+
"<start_of_turn>user\n"
|
| 256 |
+
"You are a helpful and friendly AI assistant. Rephrase the following raw answer to make it sound natural, warm, conversational, and human-like based on the user's question. Keep it concise (1-2 sentences).\n\n"
|
| 257 |
+
f"Question: {question}\n"
|
| 258 |
+
f"Raw Answer: {raw_answer}<end_of_turn>\n"
|
| 259 |
+
"<start_of_turn>model\n"
|
| 260 |
+
)
|
| 261 |
+
try:
|
| 262 |
+
from llm.inference import generate_response
|
| 263 |
+
response = generate_response(prompt, max_tokens=100)
|
| 264 |
+
return response.strip()
|
| 265 |
+
except Exception as e:
|
| 266 |
+
print(f"[WARNING] Rephrasing failed: {e}. Falling back to raw answer.")
|
| 267 |
+
return raw_answer
|
| 268 |
+
|
| 269 |
+
|
| 270 |
def generate_rag_response(question: str):
|
| 271 |
# 1. Detect query language
|
| 272 |
lang = detect_language(question)
|
|
|
|
| 328 |
break
|
| 329 |
|
| 330 |
if matching_qa:
|
| 331 |
+
# Rephrase the raw answer using LLM to make it sound natural and humanized
|
| 332 |
+
response = humanize_raw_response(matching_qa["answer"], question, lang)
|
|
|
|
| 333 |
else:
|
| 334 |
# Fall back to using the LLM with the formatted context XML
|
| 335 |
from rag.prompt_builder import format_context_as_xml
|
| 336 |
formatted_context = format_context_as_xml(context)
|
| 337 |
|
| 338 |
system_prompt = (
|
| 339 |
+
"You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based ONLY on the provided context. "
|
| 340 |
+
"Keep your answer brief, factual, and under 3 sentences. Rephrase the context to sound human-like rather than copy-pasting.\n"
|
| 341 |
"If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
|
| 342 |
)
|
| 343 |
+
prompt = f"""<start_of_turn>user
|
| 344 |
+
{system_prompt}
|
| 345 |
+
|
| 346 |
Context:
|
| 347 |
{formatted_context}
|
| 348 |
|
| 349 |
Question:
|
| 350 |
+
{search_query}<end_of_turn>
|
| 351 |
+
<start_of_turn>model
|
| 352 |
"""
|
| 353 |
response = generate_response(prompt)
|
| 354 |
response = _clean_response(response)
|