Spaces:

khubchand
/

ai-assistant-engine

Sleeping

App Files Files Community

khubchand committed 17 days ago

Commit

9eed65c

1 Parent(s): 069b0f0

Update system model to Gemma 3 1B Instruct and humanize responses

Browse files

Files changed (7) hide show

Dockerfile +3 -3
Modelfile +1 -1
config.py +1 -1
llm/inference.py +9 -10
llm/model_loader.py +2 -2
rag/prompt_builder.py +16 -17
rag/rag_pipeline.py +58 -20

Dockerfile CHANGED Viewed

@@ -30,9 +30,9 @@ COPY requirements.txt .
 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
-# Download Qwen2.5-0.5B-Instruct GGUF model during build
-RUN curl -L -o models/qwen2.5-0.5b-instruct-q4_k_m.gguf \
-    "https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF/resolve/main/qwen2.5-0.5b-instruct-q4_k_m.gguf"
 # Copy the rest of the application files
 COPY --chown=user:user . .

 RUN pip install --no-cache-dir --upgrade pip && \
     pip install --no-cache-dir -r requirements.txt
+# Download Gemma 3 1B Instruct GGUF model during build
+RUN curl -L -o models/google_gemma-3-1b-it-Q4_K_M.gguf \
+    "https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF/resolve/main/google_gemma-3-1b-it-Q4_K_M.gguf"
 # Copy the rest of the application files
 COPY --chown=user:user . .

Modelfile CHANGED Viewed

	@@ -1 +1 @@
1	- FROM ./models/qwen2.5-0.5b-instruct-q4_k_m.gguf


1	+ FROM ./models/google_gemma-3-1b-it-Q4_K_M.gguf

config.py CHANGED Viewed

@@ -1,6 +1,6 @@
 import os
-MODEL_PATH = "models/qwen2.5-0.5b-instruct-q4_k_m.gguf"
 VECTOR_DB_PATH = "vector_store/faiss_index"
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 CHUNK_SIZE = 500

 import os
+MODEL_PATH = "models/google_gemma-3-1b-it-Q4_K_M.gguf"
 VECTOR_DB_PATH = "vector_store/faiss_index"
 EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
 CHUNK_SIZE = 500

llm/inference.py CHANGED Viewed

@@ -8,7 +8,7 @@ from llm.model_loader import get_llm
 from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
 OLLAMA_API_URL = "http://localhost:11434"
-OLLAMA_MODEL_NAME = "qwen-local"
 _llm_lock = threading.Lock()
 _ollama_ready = False
@@ -118,7 +118,7 @@ def _generate_response_ollama(prompt: str, max_tokens: int = None) -> str:
         "options": {
             "num_predict": max_tokens or MAX_TOKENS,
             "temperature": TEMPERATURE,
-            "stop": ["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
         }
     }
@@ -146,7 +146,7 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
                 prompt,
                 max_tokens=max_tokens or MAX_TOKENS,
                 temperature=TEMPERATURE,
-                stop=["Question:", "<|im_end|>", "<|im_start|>", "<|endoftext|>", "<|end_of_text|>"]
             )
         text = output["choices"][0]["text"]
         return text.strip()
@@ -158,8 +158,8 @@ def generate_response(prompt: str, max_tokens: int = None) -> str:
                 status_code=503,
                 detail=(
                     "LLM model file not found. "
-                    "Download a GGUF model and place it at 'models/Phi-3-mini-4k-instruct-q4.gguf'.\n"
-                    "Recommended: https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf"
                 )
             )
@@ -317,11 +317,10 @@ def translate_to_english(text: str) -> str:
     # 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
     prompt = (
-        "<|im_start|>system\n"
-        "Translate Hinglish/Hindi to English. Reply ONLY with translation.\n"
-        "<|im_end|>\n"
-        f"<|im_start|>user\n{text}<|im_end|>\n"
-        "<|im_start|>assistant\n"
     )
     try:
         # Limit translation to 40 tokens since a single query is very short

 from config import MAX_TOKENS, TEMPERATURE, USE_OLLAMA
 OLLAMA_API_URL = "http://localhost:11434"
+OLLAMA_MODEL_NAME = "gemma3-local"
 _llm_lock = threading.Lock()
 _ollama_ready = False
         "options": {
             "num_predict": max_tokens or MAX_TOKENS,
             "temperature": TEMPERATURE,
+            "stop": ["Question:", "<end_of_turn>", "<eos>"]
         }
     }
                 prompt,
                 max_tokens=max_tokens or MAX_TOKENS,
                 temperature=TEMPERATURE,
+                stop=["Question:", "<end_of_turn>", "<eos>"]
             )
         text = output["choices"][0]["text"]
         return text.strip()
                 status_code=503,
                 detail=(
                     "LLM model file not found. "
+                    "Download a GGUF model and place it at 'models/google_gemma-3-1b-it-Q4_K_M.gguf'.\n"
+                    "Recommended: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF"
                 )
             )
     # 2. Local LLM Fallback (uses a short concise template to minimize prefill latency)
     prompt = (
+        "<start_of_turn>user\n"
+        "Translate Hinglish/Hindi to English. Reply ONLY with translation.\n\n"
+        f"{text}<end_of_turn>\n"
+        "<start_of_turn>model\n"
     )
     try:
         # Limit translation to 40 tokens since a single query is very short

llm/model_loader.py CHANGED Viewed

@@ -31,8 +31,8 @@ def get_llm() -> Llama:
             raise FileNotFoundError(
                 f"\n\n  Model file not found: {os.path.abspath(MODEL_PATH)}\n"
                 f"  Download a GGUF model and place it at:  {MODEL_PATH}\n"
-                f"  Recommended: Qwen2.5 0.5B Instruct (Q4_K_M)\n"
-                f"  URL: https://huggingface.co/Qwen/Qwen2.5-0.5B-Instruct-GGUF\n"
             )
         try:
             cpu_count = os.cpu_count()

             raise FileNotFoundError(
                 f"\n\n  Model file not found: {os.path.abspath(MODEL_PATH)}\n"
                 f"  Download a GGUF model and place it at:  {MODEL_PATH}\n"
+                f"  Recommended: Gemma 3 1B Instruct (Q4_K_M)\n"
+                f"  URL: https://huggingface.co/bartowski/google_gemma-3-1b-it-GGUF\n"
             )
         try:
             cpu_count = os.cpu_count()

rag/prompt_builder.py CHANGED Viewed

@@ -52,33 +52,32 @@ def build_prompt(context: str, question: str, language: str = "english"):
     if language == "hindi":
         system_prompt = (
-            "आप एक सहायक एआई सहायक हैं। नीचे दिए गए संदर्भ (context) से सीधे और संक्षिप्त उत्तर दें (3 वाक्यों से कम)।\n"
-            "केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
             "यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
         )
     elif language == "hinglish":
         system_prompt = (
-            "You are a helpful AI assistant. Niche diye gaye context se simple and brief answer do (under 3 sentences).\n"
-            "Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
-            "e.g., 'Nervous candidate ko handle karne ke liye relax karne ko bolein.'\n"
             "Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
         )
     else:
         system_prompt = (
-            "You are a helpful AI assistant. Answer the user's question directly based on the XML qa blocks in the context. "
-            "Find the matching question and return its exact answer. Do not add meta-commentary or extra text.\n"
             "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
         )
-    prompt = f"""<|im_start|>system
-{system_prompt}<|im_end|>
-<|im_start|>user
 Context:
 {formatted_context}
 Question:
-{question}<|im_end|>
-<|im_start|>assistant
 Answer: """
     return prompt
@@ -96,10 +95,10 @@ def build_greeting_prompt(question: str, language: str = "english"):
         f"{lang_instruction}"
     )
-    prompt = f"""<|im_start|>system
-{system_prompt}<|im_end|>
-<|im_start|>user
-{question}<|im_end|>
-<|im_start|>assistant
 """
     return prompt

     if language == "hindi":
         system_prompt = (
+            "आप एक मददगार और मिलनसार एआई सहायक हैं। नीचे दिए गए संदर्भ (context) के आधार पर उपयोगकर्ता के प्रश्न का उत्तर एक गर्मजोशी भरे, प्राकृतिक और बातचीत के लहजे में दें (3 वाक्यों से कम)।\n"
+            "कच्चे पाठ को कॉपी-पेस्ट करने के बजाय उसे स्वाभाविक रूप से समझाएं। केवल हिंदी (Devanagari script) में ही उत्तर दें। अंग्रेजी का प्रयोग न करें।\n"
             "यदि संदर्भ में उत्तर उपलब्ध नहीं है, तो उत्तर दें: 'माफ़ कीजिए, मैं इस संदर्भ में आपकी मदद नहीं कर सकता।'"
         )
     elif language == "hinglish":
         system_prompt = (
+            "You are a helpful and friendly AI assistant. Niche diye gaye context ke base par user ke question ka answer ek natural, warm and conversational tone me do (under 3 sentences).\n"
+            "Raw text ko copy-paste karne ke bajaye natural language me explain karo. Answer ONLY in Hinglish (Hindi language written in English script). Do not write pure English.\n"
             "Agar context me answer nahi hai, toh response do: 'Sorry, main is context me help nahi kar sakta.'"
         )
     else:
         system_prompt = (
+            "You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based on the XML qa blocks in the context. "
+            "Rephrase the information to sound humanized and conversational, rather than copy-pasting raw text. Keep your answer brief and under 3 sentences.\n"
             "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
         )
+    prompt = f"""<start_of_turn>user
+{system_prompt}
 Context:
 {formatted_context}
 Question:
+{question}<end_of_turn>
+<start_of_turn>model
 Answer: """
     return prompt
         f"{lang_instruction}"
     )
+    prompt = f"""<start_of_turn>user
+{system_prompt}
+{question}<end_of_turn>
+<start_of_turn>model
 """
     return prompt

rag/rag_pipeline.py CHANGED Viewed

@@ -192,29 +192,29 @@ def restructure_query(query: str, language: str) -> str:
     """
     if language == "english":
         prompt = (
-            "<|im_start|>system\n"
             "You are a precise grammar correction and query restructuring tool. "
             "Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
             "Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
             "Example:\n"
             "Query: why company want to hire me?\n"
-            "Correction: Why should the company hire me?\n"
-            "<|im_end|>\n"
-            f"<|im_start|>user\nQuery: {query}<|im_end|>\n"
-            "<|im_start|>assistant\nCorrection: "
         )
     else:  # hindi
         prompt = (
-            "<|im_start|>system\n"
             "You are a precise grammar correction and query restructuring tool for Hindi. "
             "Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
             "Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
             "Example:\n"
             "Query: कंपनी hire क्यों करे मुझे?\n"
-            "Correction: कंपनी मुझे नौकरी क्यों दे?\n"
-            "<|im_end|>\n"
-            f"<|im_start|>user\nQuery: {query}<|im_end|>\n"
-            "<|im_start|>assistant\nCorrection: "
         )
     try:
         from llm.inference import generate_response
@@ -228,6 +228,45 @@ def restructure_query(query: str, language: str) -> str:
     return query
 def generate_rag_response(question: str):
     # 1. Detect query language
     lang = detect_language(question)
@@ -289,28 +328,27 @@ def generate_rag_response(question: str):
             break
     if matching_qa:
-        # Directly extract the answer string for exact Q&A matches.
-        # This completely bypasses LLM latency and hallucination risks.
-        response = matching_qa["answer"]
     else:
         # Fall back to using the LLM with the formatted context XML
         from rag.prompt_builder import format_context_as_xml
         formatted_context = format_context_as_xml(context)
         system_prompt = (
-            "You are a helpful AI assistant. Answer the user's question directly based ONLY on the provided context. "
-            "Keep your answer brief, factual, and under 3 sentences. Do not add meta-commentary or repeat the context.\n"
             "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
         )
-        prompt = f"""<|im_start|>system
-{system_prompt}<|im_end|>
-<|im_start|>user
 Context:
 {formatted_context}
 Question:
-{search_query}<|im_end|>
-<|im_start|>assistant
 """
         response = generate_response(prompt)
         response = _clean_response(response)

     """
     if language == "english":
         prompt = (
+            "<start_of_turn>user\n"
             "You are a precise grammar correction and query restructuring tool. "
             "Correct the grammar of the user's search query and restructure it to be direct, formal, and optimal for database search. "
             "Respond ONLY with the corrected query. Do not add meta-commentary, explanations, or quotes.\n"
             "Example:\n"
             "Query: why company want to hire me?\n"
+            "Correction: Why should the company hire me?\n\n"
+            f"Query: {query}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+            "Correction: "
         )
     else:  # hindi
         prompt = (
+            "<start_of_turn>user\n"
             "You are a precise grammar correction and query restructuring tool for Hindi. "
             "Correct the grammar of the user's Hindi query and restructure it to be formal and direct in Devanagari script. "
             "Respond ONLY with the corrected Devanagari Hindi text. Do not add English translation, meta-commentary, or quotes.\n"
             "Example:\n"
             "Query: कंपनी hire क्यों करे मुझे?\n"
+            "Correction: कंपनी मुझे नौकरी क्यों दे?\n\n"
+            f"Query: {query}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+            "Correction: "
         )
     try:
         from llm.inference import generate_response
     return query
+def humanize_raw_response(raw_answer: str, question: str, language: str) -> str:
+    """
+    Rephrase a raw exact-match answer using the LLM to make it sound natural and conversational.
+    Builds a single-turn prompt wrapped in <start_of_turn>/<end_of_turn> markers, with the
+    instructions, the question and the raw answer all placed in the user turn.
+    Args:
+        raw_answer: The stored answer text to rephrase; also the fallback return value.
+        question: The user's question, included in the prompt alongside the raw answer.
+        language: "hindi" or "hinglish" select language-specific instructions; any other
+            value uses the English instructions.
+    Returns:
+        The LLM output with surrounding whitespace stripped, or raw_answer unchanged if
+        importing or calling generate_response raises an exception.
+    """
+    if language == "hindi":
+        # Hindi instructions: ask for a short (1-2 sentence) reply in Devanagari script only.
+        prompt = (
+            "<start_of_turn>user\n"
+            "आप एक मददगार और मिलनसार एआई सहायक हैं। निम्नलिखित प्रश्न और उसके कच्चे उत्तर (raw answer) को एक स्वाभाविक, गर्मजोशी भरे और मानवीय उत्तर में बदलें। उत्तर संक्षिप्त (1-2 वाक्य) होना चाहिए।\n"
+            "केवल हिंदी (Devanagari script) में ही उत्तर दें।\n\n"
+            f"प्रश्न: {question}\n"
+            f"कच्चा उत्तर: {raw_answer}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+        )
+    elif language == "hinglish":
+        # Hinglish instructions: Hindi written in Latin script, 1-2 sentences.
+        prompt = (
+            "<start_of_turn>user\n"
+            "You are a helpful and friendly AI assistant. Rephrase the following raw answer to sound natural, warm, and conversational in Hinglish based on the user's question. Keep it concise (1-2 sentences).\n"
+            "Answer ONLY in Hinglish (Hindi language written in English/Latin script).\n\n"
+            f"Question: {question}\n"
+            f"Raw Answer: {raw_answer}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+        )
+    else:  # english
+        # Default branch: any language value other than "hindi"/"hinglish" lands here.
+        prompt = (
+            "<start_of_turn>user\n"
+            "You are a helpful and friendly AI assistant. Rephrase the following raw answer to make it sound natural, warm, conversational, and human-like based on the user's question. Keep it concise (1-2 sentences).\n\n"
+            f"Question: {question}\n"
+            f"Raw Answer: {raw_answer}<end_of_turn>\n"
+            "<start_of_turn>model\n"
+        )
+    try:
+        # The import sits inside the try, so an import failure also falls back to the raw answer.
+        # NOTE(review): presumably imported lazily to avoid an import cycle or load cost - confirm.
+        from llm.inference import generate_response
+        # Generation is capped at 100 tokens, matching the 1-2 sentence instruction above.
+        response = generate_response(prompt, max_tokens=100)
+        # NOTE(review): an empty or whitespace-only LLM reply is returned as "" rather than
+        # falling back to raw_answer - confirm whether that is intended.
+        return response.strip()
+    except Exception as e:
+        # Best-effort step: any failure is reported on stdout and the raw answer is returned.
+        print(f"[WARNING] Rephrasing failed: {e}. Falling back to raw answer.")
+        return raw_answer
 def generate_rag_response(question: str):
     # 1. Detect query language
     lang = detect_language(question)
             break
     if matching_qa:
+        # Rephrase the raw answer using LLM to make it sound natural and humanized
+        response = humanize_raw_response(matching_qa["answer"], question, lang)
     else:
         # Fall back to using the LLM with the formatted context XML
         from rag.prompt_builder import format_context_as_xml
         formatted_context = format_context_as_xml(context)
         system_prompt = (
+            "You are a helpful and friendly AI assistant. Answer the user's question in a warm, natural, and conversational tone based ONLY on the provided context. "
+            "Keep your answer brief, factual, and under 3 sentences. Rephrase the context to sound human-like rather than copy-pasting.\n"
             "If the answer is not available in the context, respond exactly with: 'Sorry, I can't help in this context.'"
         )
+        prompt = f"""<start_of_turn>user
+{system_prompt}
 Context:
 {formatted_context}
 Question:
+{search_query}<end_of_turn>
+<start_of_turn>model
 """
         response = generate_response(prompt)
         response = _clean_response(response)