khubchand committed on
Commit
9eed65c
·
1 Parent(s): 069b0f0

Update system model to Gemma 3 1B Instruct and humanize responses

Browse files
Files changed (7) hide show
  1. Dockerfile +3 -3
  2. Modelfile +1 -1
  3. config.py +1 -1
  4. llm/inference.py +9 -10
  5. llm/model_loader.py +2 -2
  6. rag/prompt_builder.py +16 -17
  7. rag/rag_pipeline.py +58 -20
Dockerfile CHANGED
@@ -30,9 +30,9 @@ COPY requirements.txt .
30
  RUN pip install --no-cache-dir --upgrade pip && \
31
  pip install --no-cache-dir -r requirements.txt
32
 
33
- # Download Qwen2.5-0.5B-Instruct GGUF model during build
34
- RUN curl -L -o models/qwen2.5-0.5b-instruct-q4_k_m.gguf \
35
- "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
36
 
37
  # Copy the rest of the application files
38
  COPY --chown=user:user . .
 
30
  RUN pip install --no-cache-dir --upgrade pip && \
31
  pip install --no-cache-dir -r requirements.txt
32
 
33
+ # Download Gemma 3 1B Instruct GGUF model during build
34
+ RUN curl -L -o models/google_gemma-3-1b-it-Q4_K_M.gguf \
35
+ "https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF/resolve/main/google_gemma-3-1b-it-Q4_K_M.gguf"
36
 
37
  # Copy the rest of the application files
38
  COPY --chown=user:user . .
Modelfile CHANGED
@@ -1 +1 @@
1
- FROM ./models/qwen2.5-0.5b-instruct-q4_k_m.gguf
 
1
+ FROM ./models/google_gemma-3-1b-it-Q4_K_M.gguf
config.py CHANGED
@@ -1,6 +1,6 @@
1
  import os
2
 
3
- MODEL_PATH = "models/qwen2.5-0.5b-instruct-q4_k_m.gguf"
4
  VECTOR_DB_PATH = "vector_store/faiss_index"
5
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
6
  CHUNK_SIZE = 500
 
1
  import os
2
 
3
+ MODEL_PATH = "models/google_gemma-3-1b-it-Q4_K_M.gguf"
4
  VECTOR_DB_PATH = "vector_store/faiss_index"
5
  EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
6
  CHUNK_SIZE = 500
llm/inference.py CHANGED
@@ -8,7 +8,7 @@ from llm.model_loader import get_llm
8
  from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
9
 
10
  OLLAMA_API_URL = "http://localhost:11434"
11
- OLLAMA_MODEL_NAME = "qwen-local"
12
 
13
  _llm_lock = threading.Lock()
14
  _ollama_ready = False
@@ -118,7 +118,7 @@ def _generate_response_ollama(prompt: str, max_tokens: int = None) -> str:
118
  "options": {
119
  "num_predict": max_tokens or MAX_TOKENS,
120
  "temperature": TEMPERATURE,
121
- "stop": ["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
122
  }
123
  }
124
 
@@ -146,7 +146,7 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
146
  prompt,
147
  max_tokens=max_tokens or MAX_TOKENS,
148
  temperature=TEMPERATURE,
149
- stop=["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
150
  )
151
  text = output["choices"][0]["text"]
152
  return text.strip()
@@ -158,8 +158,8 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
158
  status_code=503,
159
  detail=(
160
  "LLM model file not found. "
161
- "Download a GGUF model and place it at 'models/Phi-3-mini-4k-instruct-q4.gguf'.\n"
162
- "Recommended: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf"
163
  )
164
  )
165
 
@@ -317,11 +317,10 @@ def translate_to_english(text: str) -> str:
317
 
318
  # 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
319
  prompt = (
320
- "<|im_start|>system\n"
321
- "Translate Hinglish/Hindi to English. Reply ONLY with translation.\n"
322
- "<|im_end|>\n"
323
- f"<|im_start|>user\n{text}<|im_end|>\n"
324
- "<|im_start|>assistant\n"
325
  )
326
  try:
327
  # Limit translation to 40 tokens since a single query is very short
 
8
  from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
9
 
10
  OLLAMA_API_URL = "http://localhost:11434"
11
+ OLLAMA_MODEL_NAME = "gemma3-local"
12
 
13
  _llm_lock = threading.Lock()
14
  _ollama_ready = False
 
118
  "options": {
119
  "num_predict": max_tokens or MAX_TOKENS,
120
  "temperature": TEMPERATURE,
121
+ "stop": ["Question:", "<end_of_turn>", "<eos>"]
122
  }
123
  }
124
 
 
146
  prompt,
147
  max_tokens=max_tokens or MAX_TOKENS,
148
  temperature=TEMPERATURE,
149
+ stop=["Question:", "<end_of_turn>", "<eos>"]
150
  )
151
  text = output["choices"][0]["text"]
152
  return text.strip()
 
158
  status_code=503,
159
  detail=(
160
  "LLM model file not found. "
161
+ "Download a GGUF model and place it at 'models/google_gemma-3-1b-it-Q4_K_M.gguf'.\n"
162
+ "Recommended: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF"
163
  )
164
  )
165
 
 
317
 
318
  # 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
319
  prompt = (
320
+ "<start_of_turn>user\n"
321
+ "Translate Hinglish/Hindi to English. Reply ONLY with translation.\n\n"
322
+ f"{text}<end_of_turn>\n"
323
+ "<start_of_turn>model\n"
 
324
  )
325
  try:
326
  # Limit translation to 40 tokens since a single query is very short
llm/model_loader.py CHANGED
@@ -31,8 +31,8 @@ def get_llm() -> Llama:
31
  raise FileNotFoundError(
32
  f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
33
  f" Download a GGUF model and place it at: {MODEL_PATH}\n"
34
- f" Recommended: Qwen2.5 0.5B Instruct (Q4_K_M)\n"
35
- f" URL: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF\n"
36
  )
37
  try:
38
  cpu_count = os.cpu_count()
 
31
  raise FileNotFoundError(
32
  f"\n\n Model file not found: {os.path.abspath(MODEL_PATH)}\n"
33
  f" Download a GGUF model and place it at: {MODEL_PATH}\n"
34
+ f" Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
35
+ f" URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
36
  )
37
  try:
38
  cpu_count = os.cpu_count()
rag/prompt_builder.py CHANGED
@@ -52,33 +52,32 @@ def build_prompt(context: str, question: str, language: str = "english"):
52
 
53
  if language == "hindi":
54
  system_prompt = (
55
- "आप एक सहायक एआई सहायक हैं। नीचे दिए गए संदर्भ (context) से संक्षिप्त उत्तर दें (3 वाक्यों से कम)।\n"
56
- "केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
57
  "यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
58
  )
59
  elif language == "hinglish":
60
  system_prompt = (
61
- "You are a helpful AI assistant. Niche diye gaye context se simple and brief answer do (under 3 sentences).\n"
62
- "Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
63
- "e.g., 'Nervous candidate ko handle karne ke liye relax karne ko bolein.'\n"
64
  "Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
65
  )
66
  else:
67
  system_prompt = (
68
- "You are a helpful AI assistant. Answer the user's question directly based on the XML qa blocks in the context. "
69
- "Find the matching question and return its exact answer. Do not add meta-commentary or extra text.\n"
70
  "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
71
  )
72
 
73
- prompt = f"""<|im_start|>system
74
- {system_prompt}<|im_end|>
75
- <|im_start|>user
76
  Context:
77
  {formatted_context}
78
 
79
  Question:
80
- {question}<|im_end|>
81
- <|im_start|>assistant
82
  Answer: """
83
  return prompt
84
 
@@ -96,10 +95,10 @@ def build_greeting_prompt(question: str, language: str = "english"):
96
  f"{lang_instruction}"
97
  )
98
 
99
- prompt = f"""<|im_start|>system
100
- {system_prompt}<|im_end|>
101
- <|im_start|>user
102
- {question}<|im_end|>
103
- <|im_start|>assistant
104
  """
105
  return prompt
 
52
 
53
  if language == "hindi":
54
  system_prompt = (
55
+ "आप एक मददगार और मिलनसार एआई सहायक हैं। नीचे दिए गए संदर्भ (context) के आधार पर उपयोगकर्ता के प्रश्न का उत्तर एक गर्मजोशी भरे, प्राकृतिक और बातचीत के लहजे में दें (3 वाक्यों से कम)।\n"
56
+ "कच्चे पाठ को कॉपी-पेस्ट करने के बजाय उसे स्वाभाविक रूप से समझाएं। केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
57
  "यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
58
  )
59
  elif language == "hinglish":
60
  system_prompt = (
61
+ "You are a helpful and friendly AI assistant. Niche diye gaye context ke base par user ke question ka answer ek natural, warm and conversational tone me do (under 3 sentences).\n"
62
+ "Raw text ko copy-paste karne ke bajaye natural language me explain karo. Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
 
63
  "Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
64
  )
65
  else:
66
  system_prompt = (
67
+ "You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based on the XML qa blocks in the context. "
68
+ "Rephrase the information to sound humanized and conversational, rather than copy-pasting raw text. Keep your answer brief and under 3 sentences.\n"
69
  "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
70
  )
71
 
72
+ prompt = f"""<start_of_turn>user
73
+ {system_prompt}
74
+
75
  Context:
76
  {formatted_context}
77
 
78
  Question:
79
+ {question}<end_of_turn>
80
+ <start_of_turn>model
81
  Answer: """
82
  return prompt
83
 
 
95
  f"{lang_instruction}"
96
  )
97
 
98
+ prompt = f"""<start_of_turn>user
99
+ {system_prompt}
100
+
101
+ {question}<end_of_turn>
102
+ <start_of_turn>model
103
  """
104
  return prompt
rag/rag_pipeline.py CHANGED
@@ -192,29 +192,29 @@ def restructure_query(query: str, language: str) -> str:
192
  """
193
  if language == "english":
194
  prompt = (
195
- "<|im_start|>system\n"
196
  "You are a precise grammar correction and query restructuring tool. "
197
  "Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
198
  "Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
199
  "Example:\n"
200
  "Query: why company want to hire me?\n"
201
- "Correction: Why should the company hire me?\n"
202
- "<|im_end|>\n"
203
- f"<|im_start|>user\nQuery: {query}<|im_end|>\n"
204
- "<|im_start|>assistant\nCorrection: "
205
  )
206
  else: # hindi
207
  prompt = (
208
- "<|im_start|>system\n"
209
  "You are a precise grammar correction and query restructuring tool for Hindi. "
210
  "Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
211
  "Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
212
  "Example:\n"
213
  "Query: कंपनी hire क्यों करे मुझे?\n"
214
- "Correction: कंपनी मुझे नौकरी क्यों दे?\n"
215
- "<|im_end|>\n"
216
- f"<|im_start|>user\nQuery: {query}<|im_end|>\n"
217
- "<|im_start|>assistant\nCorrection: "
218
  )
219
  try:
220
  from llm.inference import generate_response
@@ -228,6 +228,45 @@ def restructure_query(query: str, language: str) -> str:
228
  return query
229
 
230
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  def generate_rag_response(question: str):
232
  # 1. Detect query language
233
  lang = detect_language(question)
@@ -289,28 +328,27 @@ def generate_rag_response(question: str):
289
  break
290
 
291
  if matching_qa:
292
- # Directly extract the answer string for exact Q&A matches.
293
- # This completely bypasses LLM latency and hallucination risks.
294
- response = matching_qa["answer"]
295
  else:
296
  # Fall back to using the LLM with the formatted context XML
297
  from rag.prompt_builder import format_context_as_xml
298
  formatted_context = format_context_as_xml(context)
299
 
300
  system_prompt = (
301
- "You are a helpful AI assistant. Answer the user's question directly based ONLY on the provided context. "
302
- "Keep your answer brief, factual, and under 3 sentences. Do not add meta-commentary or repeat the context.\n"
303
  "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
304
  )
305
- prompt = f"""<|im_start|>system
306
- {system_prompt}<|im_end|>
307
- <|im_start|>user
308
  Context:
309
  {formatted_context}
310
 
311
  Question:
312
- {search_query}<|im_end|>
313
- <|im_start|>assistant
314
  """
315
  response = generate_response(prompt)
316
  response = _clean_response(response)
 
192
  """
193
  if language == "english":
194
  prompt = (
195
+ "<start_of_turn>user\n"
196
  "You are a precise grammar correction and query restructuring tool. "
197
  "Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
198
  "Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
199
  "Example:\n"
200
  "Query: why company want to hire me?\n"
201
+ "Correction: Why should the company hire me?\n\n"
202
+ f"Query: {query}<end_of_turn>\n"
203
+ "<start_of_turn>model\n"
204
+ "Correction: "
205
  )
206
  else: # hindi
207
  prompt = (
208
+ "<start_of_turn>user\n"
209
  "You are a precise grammar correction and query restructuring tool for Hindi. "
210
  "Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
211
  "Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
212
  "Example:\n"
213
  "Query: कंपनी hire क्यों करे मुझे?\n"
214
+ "Correction: कंपनी मुझे नौकरी क्यों दे?\n\n"
215
+ f"Query: {query}<end_of_turn>\n"
216
+ "<start_of_turn>model\n"
217
+ "Correction: "
218
  )
219
  try:
220
  from llm.inference import generate_response
 
228
  return query
229
 
230
 
231
+ def humanize_raw_response(raw_answer: str, question: str, language: str) -> str:
232
+ """
233
+ Rephrase a raw exact-match answer using the LLM to make it sound natural and conversational.
234
+ """
235
+ if language == "hindi":
236
+ prompt = (
237
+ "<start_of_turn>user\n"
238
+ "आप एक मददगार और मिलनसार एआई सहायक हैं। निम्नलिखित प्रश्न और उसके कच्चे उत्तर (raw answer) को एक स्वाभाविक, गर्मजोशी भरे और मानवीय उत्तर में बदलें। उत्तर संक्षिप्त (1-2 वाक्य) होना चाहिए।\n"
239
+ "केवल हिंदी (Devanagari script) में ही उत्तर दें।\n\n"
240
+ f"प्रश्न: {question}\n"
241
+ f"कच्चा उत्तर: {raw_answer}<end_of_turn>\n"
242
+ "<start_of_turn>model\n"
243
+ )
244
+ elif language == "hinglish":
245
+ prompt = (
246
+ "<start_of_turn>user\n"
247
+ "You are a helpful and friendly AI assistant. Rephrase the following raw answer to sound natural, warm, and conversational in Hinglish based on the user's question. Keep it concise (1-2 sentences).\n"
248
+ "Answer ONLY in Hinglish (Hindi language written in English/Latin script).\n\n"
249
+ f"Question: {question}\n"
250
+ f"Raw Answer: {raw_answer}<end_of_turn>\n"
251
+ "<start_of_turn>model\n"
252
+ )
253
+ else: # english
254
+ prompt = (
255
+ "<start_of_turn>user\n"
256
+ "You are a helpful and friendly AI assistant. Rephrase the following raw answer to make it sound natural, warm, conversational, and human-like based on the user's question. Keep it concise (1-2 sentences).\n\n"
257
+ f"Question: {question}\n"
258
+ f"Raw Answer: {raw_answer}<end_of_turn>\n"
259
+ "<start_of_turn>model\n"
260
+ )
261
+ try:
262
+ from llm.inference import generate_response
263
+ response = generate_response(prompt, max_tokens=100)
264
+ return response.strip()
265
+ except Exception as e:
266
+ print(f"[WARNING] Rephrasing failed: {e}. Falling back to raw answer.")
267
+ return raw_answer
268
+
269
+
270
  def generate_rag_response(question: str):
271
  # 1. Detect query language
272
  lang = detect_language(question)
 
328
  break
329
 
330
  if matching_qa:
331
+ # Rephrase the raw answer using LLM to make it sound natural and humanized
332
+ response = humanize_raw_response(matching_qa["answer"], question, lang)
 
333
  else:
334
  # Fall back to using the LLM with the formatted context XML
335
  from rag.prompt_builder import format_context_as_xml
336
  formatted_context = format_context_as_xml(context)
337
 
338
  system_prompt = (
339
+ "You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based ONLY on the provided context. "
340
+ "Keep your answer brief, factual, and under 3 sentences. Rephrase the context to sound human-like rather than copy-pasting.\n"
341
  "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
342
  )
343
+ prompt = f"""<start_of_turn>user
344
+ {system_prompt}
345
+
346
  Context:
347
  {formatted_context}
348
 
349
  Question:
350
+ {search_query}<end_of_turn>
351
+ <start_of_turn>model
352
  """
353
  response = generate_response(prompt)
354
  response = _clean_response(response)