# hr-policy / app.py — LMI Chatbot (FastAPI + LangChain agent)
import os
import nltk
import logging
import json
from typing import List, Optional, Dict, Any
import re
import random
# Initialize logger early for setup messages
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# --- Robust NLTK 'punkt' setup ---
# Tokenizer data lives in a writable app-local directory; default NLTK
# paths are typically read-only inside the container image.
nltk_data_dir = "/app/nltk_data"
try:
    os.makedirs(nltk_data_dir, exist_ok=True)
except OSError as e:
    # Non-fatal here: the find/download sequence below surfaces a hard failure.
    logger.error(f"Could not create NLTK data directory {nltk_data_dir}: {e}")
if nltk_data_dir not in nltk.data.path:
    nltk.data.path.append(nltk_data_dir)
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    logger.warning(f"'punkt' resource not found by NLTK. Attempting to download to {nltk_data_dir}...")
    try:
        nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
        logger.info(f"'punkt' downloaded successfully to {nltk_data_dir}.")
        # Re-check so a silently failed download still raises below.
        nltk.data.find('tokenizers/punkt')
    except Exception as e:
        logger.error(f"Failed to download 'punkt': {e}", exc_info=True)
        raise RuntimeError(f"NLTK 'punkt' resource is missing and download failed: {e}")
# --- End of NLTK 'punkt' setup ---
# Point HuggingFace/cache/temp directories at writable app-local paths.
# NOTE(review): these must be set before the langchain/transformers imports
# that follow, so the libraries pick them up at import time.
os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/app/cache")
os.environ["XDG_CACHE_HOME"] = os.environ.get("XDG_CACHE_HOME", "/app/cache")
os.environ["TMPDIR"] = os.environ.get("TMPDIR", "/app/tmp")
os.makedirs(os.environ["HF_HOME"], exist_ok=True)
os.makedirs(os.environ["TMPDIR"], exist_ok=True)
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_core.output_parsers import StrOutputParser
from langchain_community.retrievers import BM25Retriever
from langchain.retrievers import EnsembleRetriever
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.documents import Document
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.agents import AgentExecutor, create_react_agent
from langchain_core.tools import Tool, BaseTool
from langchain import hub
from langchain.memory import ConversationBufferWindowMemory
from langchain_core.messages import AIMessage, HumanMessage
# FastAPI application instance; all endpoints below are registered on it.
app = FastAPI(title="LMI Chatbot with Conversational Learning")
# --- Pydantic Models ---
class Question(BaseModel):
    """Request body for /ask: the user's message plus a conversation id."""
    text: str
    # Clients that do not track sessions share one default conversation memory.
    session_id: Optional[str] = "default_session"
class Feedback(BaseModel):
    """Explicit feedback payload pairing a question with its original and
    corrected answers.

    NOTE(review): not referenced by any endpoint visible in this file —
    corrections flow through SaveCorrectionTool instead; confirm whether
    this model is still needed.
    """
    question: str
    original_answer: str
    corrected_answer: str
    category: Optional[str] = None
# --- File Paths ---
# Primary Q/A dataset plus the append-only file of user-supplied corrections
# (both JSONL; re-ingested into the LMI knowledge base at startup).
JSONL_DATASET_PATH = "cleaned-dataset.jsonl"
USER_CORRECTIONS_PATH = "user_corrections.jsonl"
# List of LMI-specific PDF paths
LMI_PDF_PATHS = [
    "LMI-HUMAN-RESOURCE-POLICY.pdf",
    "LMI-L&D-POLICY1.pdf",
    "Dividend-Distribution-Policy.pdf",
    "LH-Operations-Manual.pdf",
    "Broker-Policy.pdf",
    "LMI-Procurement-Policy-Procedures-Rev.pdf",
    "INTERNAL-AUDIT-CHARTER.pdf",
    "EXCO-TERMS-OF-REFERENCE-VERSION-2022.pdf",
    "Intercompany-Accounting-Policy.pdf",
    "Corporate-Social-Responsibility.pdf",
]
# List of national law / external regulation PDFs
NATIONAL_LAW_PDF_PATHS = [
    "Labour-Act-2003.pdf",
]
# --- Predefined Conversational Snippets ---
# Exact-match small talk, answered in /ask without invoking the agent.
GREETINGS = ["hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening", "howdy", "yo", "sup"]
GREETING_RESPONSES = [
    "Hello there! I'm the LMI Chatbot. How can I assist you today?",
    "Hi! I'm the LMI Chatbot, here to help with your questions about LMI Holdings. What's on your mind?",
]
THANKS_PHRASES = ["thank you", "thanks", "thank u", "thx", "appreciate it", "much appreciated"]
THANKS_RESPONSES = [
    "You're welcome!", "Happy to help!", "No problem at all!",
]
# --- Global Components (initialized later) ---
# Assigned by load_and_setup_rag_components_and_agent() at startup.
query_rewrite_llm = None
agent_executor = None
# Per-session rolling chat memories, keyed by session_id.
conversation_memories: Dict[str, ConversationBufferWindowMemory] = {}
# --- Helper Functions and Tool Logic ---
def get_session_memory(session_id: str) -> ConversationBufferWindowMemory:
    """Return the rolling chat memory for *session_id*, creating it on first use."""
    memory = conversation_memories.get(session_id)
    if memory is None:
        # Keep only the last 5 exchanges per session.
        memory = ConversationBufferWindowMemory(
            k=5, memory_key="chat_history", return_messages=True
        )
        conversation_memories[session_id] = memory
        logger.info(f"Created new memory for session_id: {session_id}")
    return memory
def load_jsonl_data(file_path: str, source_name: str) -> List[Document]:
    """Parse a JSONL Q/A file into Documents tagged with *source_name*.

    Each line must provide an 'instruction'/'question' key and a
    'response'/'answer' key; lines missing either, or containing invalid
    JSON, are skipped (invalid JSON is logged with its line number).
    """
    documents: List[Document] = []
    if not os.path.exists(file_path):
        logger.warning(f"{source_name} file ({file_path}) not found.")
        return documents
    with open(file_path, "r", encoding="utf-8") as handle:
        for index, raw_line in enumerate(handle):
            try:
                record = json.loads(raw_line.strip())
            except json.JSONDecodeError:
                logger.warning(f"Skipping invalid JSON in {file_path} at line {index + 1}")
                continue
            instruction = record.get('instruction', record.get('question', ''))
            response = record.get('response', record.get('answer', ''))
            if instruction and response:
                documents.append(Document(
                    page_content=f"Instruction: {instruction}\nResponse: {response}",
                    metadata={"source": source_name, "original_instruction": instruction},
                ))
    return documents
def create_retriever_from_documents(docs: List[Document], embeddings) -> Optional[EnsembleRetriever]:
    """Build a hybrid (dense FAISS + sparse BM25) retriever over *docs*.

    Duplicate page contents are dropped (first occurrence wins) so repeated
    chunks do not over-weight either retriever. Returns None when there are
    no documents to index.
    """
    if not docs:
        return None
    # Order-preserving de-duplication by page content. (The previous
    # one-liner relied on set.add() returning None inside a comprehension —
    # a side effect that is easy to misread.)
    seen_content = set()
    unique_docs = []
    for doc in docs:
        if doc.page_content not in seen_content:
            seen_content.add(doc.page_content)
            unique_docs.append(doc)
    vectorstore = FAISS.from_documents(unique_docs, embeddings, distance_strategy="COSINE")
    faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    bm25_retriever = BM25Retriever.from_documents(unique_docs)
    bm25_retriever.k = 5
    # Equal weighting between semantic (FAISS) and keyword (BM25) relevance.
    return EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
# --- Correction Saving Logic with Grammar Cleaning ---
def save_user_correction(original_question: str, user_correction: str) -> str:
    """Clean a user-supplied correction and append it to the corrections file.

    The correction is optionally grammar-cleaned by the rewriting LLM, then
    written as a JSONL entry that load_jsonl_data() re-ingests on the next
    startup. Returns a human-readable status string for the agent.
    """
    global query_rewrite_llm
    if not original_question or not user_correction:
        return "Error: Both original question and correction are required."
    cleaned_correction = user_correction
    if query_rewrite_llm:
        try:
            clean_prompt = PromptTemplate.from_template(
                "The following text is a user's correction. Please correct any grammatical or spelling errors to make it a clear, professional statement, but do not change the core facts or meaning.\n\nUser's correction: '{correction_text}'\n\nCorrected statement:"
            )
            cleaner_chain = LLMChain(llm=query_rewrite_llm, prompt=clean_prompt)
            response = cleaner_chain.invoke({"correction_text": user_correction})
            # Fall back to the raw correction if the model returns nothing
            # usable — an empty cleaned string must never replace real input.
            cleaned_correction = response.get('text', user_correction).strip() or user_correction
            logger.info(f"Cleaned user correction from '{user_correction}' to '{cleaned_correction}'")
        except Exception as e:
            logger.error(f"Could not clean correction text due to error: {e}")
            cleaned_correction = user_correction
    try:
        correction_entry = {
            "instruction": original_question,
            "response": cleaned_correction,
            "feedback_category": "inline_correction"
        }
        # Append-only JSONL so concurrent corrections never clobber the file.
        with open(USER_CORRECTIONS_PATH, "a", encoding="utf-8") as f:
            f.write(json.dumps(correction_entry) + "\n")
        logger.info(f"Saved inline correction: Q: {original_question} | A: {cleaned_correction}")
        return "Correction saved successfully."
    except Exception as e:
        logger.error(f"Error saving inline correction: {e}", exc_info=True)
        return "Failed to save correction."
# --- Agent Tools Definition ---
class CorrectionToolInput(BaseModel):
    """Argument schema for SaveCorrectionTool (parsed from the agent's tool call)."""
    original_question: str = Field(description="The user's original question that received an incorrect answer. This must be retrieved from the conversation history.")
    corrected_answer: str = Field(description="The new, correct information provided by the user in their latest message.")
class SaveCorrectionTool(BaseTool):
    """Agent tool that persists a user's inline correction to JSONL storage."""
    # Annotated as str so these overrides remain valid pydantic field
    # declarations under a pydantic-v2-based BaseTool as well as v1.
    name: str = "SaveCorrectionTool"
    description: str = "Use this tool ONLY when the user is correcting an answer you previously gave. You MUST identify the original question from the chat history and the user's new message as the corrected answer."
    args_schema: type[BaseModel] = CorrectionToolInput
    def _run(self, original_question: str, corrected_answer: str) -> str:
        """Synchronous entry point; delegates to save_user_correction()."""
        return save_user_correction(original_question, corrected_answer)
    async def _arun(self, original_question: str, corrected_answer: str) -> str:
        """Async entry point; the underlying save is synchronous file I/O."""
        return self._run(original_question, corrected_answer)
def load_and_setup_rag_components_and_agent():
    """Build every runtime component: LLMs, embeddings, retrievers, tools, agent.

    Side effects: assigns the module globals `agent_executor` and
    `query_rewrite_llm`. Raises ValueError when the API key is missing or
    when no knowledge base could be built at all.
    """
    global agent_executor, query_rewrite_llm
    logger.info("Loading main LLM model (for tools and agent)...")
    google_api_key = os.getenv("GOOGLE_API_KEY")
    if not google_api_key: raise ValueError("GOOGLE_API_KEY environment variable not found!")
    # temperature=0.2: mostly deterministic agent reasoning with slight variety.
    llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key, temperature=0.2)
    # The rewriter/cleaner LLM is fully deterministic (temperature=0.0); a
    # failure here is tolerated — dependent features fall back to raw text.
    if query_rewrite_llm is None:
        try:
            query_rewrite_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key, temperature=0.0)
            logger.info("Query rewriting/cleaning LLM initialized successfully.")
        except Exception as e:
            logger.error(f"Failed to initialize query rewriting/cleaning LLM: {e}")
            query_rewrite_llm = None
    logger.info("Loading HuggingFace embeddings model...")
    embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # --- Load LMI Specific Documents ---
    lmi_docs = []
    for pdf_path in LMI_PDF_PATHS:
        if os.path.exists(pdf_path):
            try:
                loader = PyPDFLoader(pdf_path)
                pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
                # Tag each chunk with its source file for traceability.
                for page in pages: page.metadata["source"] = os.path.basename(pdf_path)
                lmi_docs.extend(pages)
            except Exception as e: logger.error(f"Error processing LMI PDF {pdf_path}: {e}")
        else: logger.warning(f"LMI PDF file ({pdf_path}) not found.")
    # JSONL datasets (including prior user corrections) share the LMI index.
    lmi_docs.extend(load_jsonl_data(JSONL_DATASET_PATH, "main_dataset"))
    lmi_docs.extend(load_jsonl_data(USER_CORRECTIONS_PATH, "user_corrections"))
    # --- Load National Law / External Regulation Documents ---
    national_law_docs = []
    for pdf_path in NATIONAL_LAW_PDF_PATHS:
        if os.path.exists(pdf_path):
            try:
                loader = PyPDFLoader(pdf_path)
                pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
                for page in pages: page.metadata["source"] = os.path.basename(pdf_path)
                national_law_docs.extend(pages)
            except Exception as e: logger.error(f"Error processing National Law PDF {pdf_path}: {e}")
        else: logger.warning(f"National Law PDF file ({pdf_path}) not found.")
    # Two independent knowledge bases: company policy vs. national law.
    lmi_retriever = create_retriever_from_documents(lmi_docs, embeddings)
    national_law_retriever = create_retriever_from_documents(national_law_docs, embeddings)
    if not lmi_retriever and not national_law_retriever: raise ValueError("No documents were loaded to create any knowledge base.")
    # Strict grounding prompt: the tool must answer from context only and emit
    # the sentinel below when it cannot, so /ask can rewrite that reply.
    rag_tool_prompt = PromptTemplate.from_template(
        "You are an assistant answering based on the provided context.\n"
        "Use ONLY the '### Context' to answer the '### Question'.\n"
        "If the context is empty or irrelevant, you MUST state: 'NO_INFORMATION_FOUND_IN_CONTEXT'.\n"
        "### Context:\n{context}\n\n### Question:\n{question}\n\n### Answer:"
    )
    rag_chain_for_tool = LLMChain(llm=llm, prompt=rag_tool_prompt, output_parser=StrOutputParser())
    def run_rag_retrieval_tool(query: str, retriever) -> str:
        # Shared tool body: retrieve, concatenate context, ask the RAG chain.
        if not retriever: return "This knowledge source is not available."
        try:
            context_docs = retriever.invoke(query)
            if not context_docs: return "NO_INFORMATION_FOUND_IN_CONTEXT"
            context_text = "\n\n---\n\n".join([doc.page_content for doc in context_docs])
            return rag_chain_for_tool.invoke({"context": context_text, "question": query})
        except Exception as e: return "ERROR_IN_TOOL_EXECUTION"
    # Re-introducing the Google Search tool
    def run_google_search_tool(query: str) -> str:
        """Simulates a Google search for comparative or external work-related information."""
        # NOTE(review): this is a stub — it never performs a real search.
        logger.info(f"GoogleSearchTool received query: {query}")
        return f"Simulated Google Search: For your query '{query}', external public information would be retrieved here. For now, let's assume relevant details were found for comparison."
    # Tool order matters only for readability; selection is prompt-driven.
    tools = [SaveCorrectionTool()]
    if lmi_retriever:
        tools.append(Tool(
            name="LMICompanyPolicyTool",
            func=lambda query: run_rag_retrieval_tool(query, lmi_retriever),
            description="Use for questions about LMI Holdings' internal policies (e.g. HR, L&D, Finance, Operations, Procurement, Social Responsibility), procedures, and user-corrected info. This should be your first choice for any question about LMI.",
        ))
    if national_law_retriever:
        tools.append(Tool(
            name="GhanaNationalLawTool",
            func=lambda query: run_rag_retrieval_tool(query, national_law_retriever),
            description="Use for questions about Ghana's national laws and regulations, like the Labour Act. Check this before using Google Search for legal questions.",
        ))
    tools.append(Tool(
        name="GoogleSearch",
        func=run_google_search_tool,
        description="Use this tool as a last resort for general work-related questions ONLY IF you cannot find an answer using LMICompanyPolicyTool or GhanaNationalLawTool. Also use this for questions that explicitly ask to compare LMI with other companies.",
    ))
    # --- Agent Prompt Customization ---
    # Pull the base prompt from the hub
    prompt_from_hub = hub.pull("hwchase17/react-chat")
    # Extract the template from the hub prompt to modify it
    # This assumes the prompt is a ChatPromptTemplate with messages
    if hasattr(prompt_from_hub, 'messages'):
        # Find the system message template to prepend our instructions
        system_message_found = False
        for message_prompt in prompt_from_hub.messages:
            # NOTE(review): assumes each message template exposes a `.role`
            # attribute — not all prompt message types do, and
            # hwchase17/react-chat is commonly a plain PromptTemplate (in
            # which case the else-branch below runs). Confirm against the
            # installed langchain version.
            if 'system' in message_prompt.role:
                # Add our custom rules to the beginning of the existing system message
                original_template = message_prompt.prompt.template
                custom_prefix = (
                    "You are a helpful and friendly assistant named LMI Chatbot.\n"
                    "Always answer the user's question directly and politely. Even if the answer is in the chat history, do not refer to past turns by saying 'As I mentioned before...' or 'I already answered that.' Treat each query as a new request for information.\n"
                    "When formatting your final answer, do not use Markdown for emphasis (like using asterisks for bold). Instead, state the information clearly in plain text.\n\n"
                )
                message_prompt.prompt.template = custom_prefix + original_template
                system_message_found = True
                break
        if not system_message_found:
            logger.warning("Could not find a system message in the hub prompt to modify. Agent might not follow custom rules.")
        agent_prompt = prompt_from_hub
    else:
        # Fallback if the prompt structure is not as expected
        logger.warning("Hub prompt structure not recognized. Using default prompt without custom rules.")
        agent_prompt = prompt_from_hub
    agent = create_react_agent(llm, tools, agent_prompt)
    # handle_parsing_errors lets the agent recover from malformed LLM output;
    # max_iterations bounds runaway tool loops.
    agent_executor = AgentExecutor(
        agent=agent, tools=tools, verbose=True,
        handle_parsing_errors=True, max_iterations=7
    )
    logger.info(f"Agent Executor initialized with {len(tools)} tools.")
# --- System Startup ---
# Build all RAG/agent components at import time so the first request is fast.
# On failure, agent_executor stays None and /ask responds with HTTP 503.
logger.info("Initializing system components...")
try:
    load_and_setup_rag_components_and_agent()
except Exception as e:
    logger.critical(f"Failed to initialize system: {e}", exc_info=True)
    agent_executor = None
def rewrite_query(question: str) -> str:
    """Rephrase a raw user question into a clearer canonical query.

    First applies a rule-based expansion ("the act" -> "the Ghana Labour
    Act"), then asks the rewriting LLM for a canonical phrasing. Returns the
    original question when the LLM is unavailable or makes no change.

    NOTE(review): not currently called from /ask (the raw text goes straight
    to the agent) — kept for potential pre-processing use.
    """
    global query_rewrite_llm
    if not query_rewrite_llm:
        logger.warning("Query rewriting LLM not available. Using original query.")
        return question
    if "the act" in question.lower() and "labour" not in question.lower():
        # Case-insensitive substitution: the guard above matches "The Act"
        # etc., so the replacement must too (str.replace would miss them).
        question = re.sub(r"the act", "the Ghana Labour Act", question, flags=re.IGNORECASE)
        logger.info(f"Rule-based addition before LLM rewrite: '{question}'")
    rewrite_prompt_template = """Rephrase the user's question into a clear, canonical question.
Examples:
User: "what does the act say about overtime?" -> Rephrased: "What does the Ghana Labour Act 2003 say about overtime?"
User: "how many days off can I get" -> Rephrased: "What is the LMI Holdings policy on annual leave entitlement?"
User: "{user_input}" -> Rephrased:"""
    rewrite_prompt = PromptTemplate.from_template(rewrite_prompt_template)
    rewriter_chain = LLMChain(llm=query_rewrite_llm, prompt=rewrite_prompt)
    response = rewriter_chain.invoke({"user_input": question})
    rephrased_question = response.get('text', question).strip()
    if rephrased_question.lower() != question.lower():
        logger.info(f"LLM rephrased query: '{question}' -> '{rephrased_question}'")
        return rephrased_question
    return question
@app.post("/ask")
async def ask(question_data: Question) -> Dict[str, Any]:
    """Main chat endpoint: short-circuit small talk, otherwise run the agent."""
    logger.info(f"Received question: {question_data.text} for session: {question_data.session_id}")
    # Normalize to lowercase alphanumerics/spaces so "Hello!!" matches "hello".
    normalized_check_text = ''.join(c for c in question_data.text.lower() if c.isalnum() or c.isspace()).strip()
    response = None
    if normalized_check_text in GREETINGS: response = random.choice(GREETING_RESPONSES)
    elif normalized_check_text in THANKS_PHRASES: response = random.choice(THANKS_RESPONSES)
    if response:
        # Canned small talk still goes into memory so the agent sees a
        # coherent history on later turns.
        memory = get_session_memory(question_data.session_id)
        memory.chat_memory.add_user_message(question_data.text)
        memory.chat_memory.add_ai_message(response)
        return {"answer": response, "type": "conversational"}
    if not agent_executor:
        # Startup failed (see module-level init); surface as service unavailable.
        raise HTTPException(status_code=503, detail="LMI Chatbot is not available.")
    try:
        memory = get_session_memory(question_data.session_id)
        # Raw text is passed (not rewrite_query) so the agent can see
        # correction cues such as "No, actually ..." verbatim.
        agent_input = {
            "input": question_data.text,
            "chat_history": memory.chat_memory.messages
        }
        response_payload = await agent_executor.ainvoke(agent_input)
        final_answer = response_payload.get("output", "I apologize, I encountered an issue.")
        # Replace the RAG tools' sentinel with a user-friendly message.
        if "NO_INFORMATION_FOUND_IN_CONTEXT" in final_answer:
            final_answer = "I'm not sure about that, but I'd be happy to help if you provide more details!"
            logger.info("Overriding 'not found' to user-friendly message.")
        memory.chat_memory.add_user_message(question_data.text)
        memory.chat_memory.add_ai_message(final_answer)
        return {"answer": final_answer, "type": "agent_response"}
    except Exception as e:
        logger.error(f"Error in /ask endpoint for session {question_data.session_id}: {str(e)}", exc_info=True)
        raise HTTPException(status_code=500, detail="I'm sorry, I encountered an unexpected issue.")
@app.get("/")
async def root():
    """Health-check endpoint confirming the service is up."""
    status_message = "LMI Chatbot (Agent Mode with Conversational Learning) is running."
    return {"message": status_message}
# Previous version kept below (commented out) for quick rollback if problems arise.
# import os
# import nltk
# import logging
# import json
# from typing import List, Optional, Dict, Any
# import re
# import random
# # Initialize logger early for setup messages
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# logger = logging.getLogger(__name__)
# # --- Robust NLTK 'punkt' setup ---
# nltk_data_dir = "/app/nltk_data"
# try:
# os.makedirs(nltk_data_dir, exist_ok=True)
# except OSError as e:
# logger.error(f"Could not create NLTK data directory {nltk_data_dir}: {e}")
# if nltk_data_dir not in nltk.data.path:
# nltk.data.path.append(nltk_data_dir)
# try:
# nltk.data.find('tokenizers/punkt')
# except LookupError:
# logger.warning(f"'punkt' resource not found by NLTK. Attempting to download to {nltk_data_dir}...")
# try:
# nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
# logger.info(f"'punkt' downloaded successfully to {nltk_data_dir}.")
# nltk.data.find('tokenizers/punkt')
# except Exception as e:
# logger.error(f"Failed to download 'punkt': {e}", exc_info=True)
# raise RuntimeError(f"NLTK 'punkt' resource is missing and download failed: {e}")
# # --- End of NLTK 'punkt' setup ---
# os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/app/cache")
# os.environ["XDG_CACHE_HOME"] = os.environ.get("XDG_CACHE_HOME", "/app/cache")
# os.environ["TMPDIR"] = os.environ.get("TMPDIR", "/app/tmp")
# os.makedirs(os.environ["HF_HOME"], exist_ok=True)
# os.makedirs(os.environ["TMPDIR"], exist_ok=True)
# from fastapi import FastAPI, HTTPException
# from pydantic import BaseModel, Field
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.vectorstores import FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from langchain_core.output_parsers import StrOutputParser
# from langchain_community.retrievers import BM25Retriever
# from langchain.retrievers import EnsembleRetriever
# from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain_core.documents import Document
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.agents import AgentExecutor, create_react_agent
# from langchain_core.tools import Tool, BaseTool
# from langchain import hub
# from langchain.memory import ConversationBufferWindowMemory
# from langchain_core.messages import AIMessage, HumanMessage
# app = FastAPI(title="LMI Chatbot with Conversational Learning")
# # --- Pydantic Models ---
# class Question(BaseModel):
# text: str
# session_id: Optional[str] = "default_session"
# class Feedback(BaseModel):
# question: str
# original_answer: str
# corrected_answer: str
# category: Optional[str] = None
# # --- File Paths ---
# JSONL_DATASET_PATH = "cleaned-dataset.jsonl"
# USER_CORRECTIONS_PATH = "user_corrections.jsonl"
# LMI_PDF_PATHS = [
# "LMI-HUMAN-RESOURCE-POLICY.pdf",
# "LMI-L&D-POLICY1.pdf",
# "Dividend-Distribution-Policy.pdf",
# "LH-Operations-Manual.pdf",
# "Broker-Policy.pdf",
# "LMI-Procurement-Policy-Procedures-Rev.pdf",
# "INTERNAL-AUDIT-CHARTER.pdf",
# "EXCO-TERMS-OF-REFERENCE-VERSION-2022.pdf",
# "Intercompany-Accounting-Policy.pdf",
# "Corporate-Social-Responsibility.pdf"
# ]
# LABOUR_ACT_PDF_PATH = ["Labour-Act-2003.pdf",
# "13th-Month-Payment.pdf"
# ]
# # --- Predefined Conversational Snippets ---
# GREETINGS = ["hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening", "howdy", "yo", "sup"]
# GREETING_RESPONSES = [
# "Hello there! I'm the LMI Chatbot. How can I assist you today?",
# "Hi! I'm the LMI Chatbot, here to help with your questions about LMI Holdings. What's on your mind?",
# "Hey! I'm the LMI Chatbot. How can I help you?",
# ]
# THANKS_PHRASES = ["thank you", "thanks", "thank u", "thx", "appreciate it", "much appreciated"]
# THANKS_RESPONSES = [
# "You're welcome!", "Happy to help!", "No problem at all!", "Glad I could assist!",
# ]
# # --- Global Components (initialized later) ---
# query_rewrite_llm = None
# agent_executor = None
# conversation_memories: Dict[str, ConversationBufferWindowMemory] = {}
# # --- Helper Functions and Tool Logic ---
# def get_session_memory(session_id: str) -> ConversationBufferWindowMemory:
# if session_id not in conversation_memories:
# conversation_memories[session_id] = ConversationBufferWindowMemory(
# k=5, memory_key="chat_history", return_messages=True
# )
# logger.info(f"Created new memory for session_id: {session_id}")
# return conversation_memories[session_id]
# def load_jsonl_data(file_path: str, source_name: str) -> List[Document]:
# docs = []
# if os.path.exists(file_path):
# with open(file_path, "r", encoding="utf-8") as f:
# for line_num, line in enumerate(f):
# try:
# data = json.loads(line.strip())
# instruction = data.get('instruction', data.get('question', ''))
# response = data.get('response', data.get('answer', ''))
# if not instruction or not response: continue
# content = f"Instruction: {instruction}\nResponse: {response}"
# metadata = {"source": source_name, "original_instruction": instruction}
# docs.append(Document(page_content=content, metadata=metadata))
# except json.JSONDecodeError:
# logger.warning(f"Skipping invalid JSON in {file_path} at line {line_num + 1}")
# else:
# logger.warning(f"{source_name} file ({file_path}) not found.")
# return docs
# def create_retriever_from_documents(docs: List[Document], embeddings) -> Optional[EnsembleRetriever]:
# if not docs: return None
# seen_content = set()
# unique_docs = [doc for doc in docs if doc.page_content not in seen_content and not seen_content.add(doc.page_content)]
# vectorstore = FAISS.from_documents(unique_docs, embeddings, distance_strategy="COSINE")
# faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# bm25_retriever = BM25Retriever.from_documents(unique_docs)
# bm25_retriever.k = 5
# return EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
# # --- Correction Saving Logic with Grammar Cleaning ---
# def save_user_correction(original_question: str, user_correction: str) -> str:
# """Cleans the user's correction and saves it to the corrections file."""
# global query_rewrite_llm
# if not original_question or not user_correction:
# return "Error: Both original question and correction are required."
# cleaned_correction = user_correction
# if query_rewrite_llm:
# try:
# # Use the query rewriter LLM to clean up grammar
# clean_prompt = PromptTemplate.from_template(
# "The following text is a user's correction. Please correct any grammatical or spelling errors to make it a clear, professional statement, but do not change the core facts or meaning.\n\nUser's correction: '{correction_text}'\n\nCorrected statement:"
# )
# cleaner_chain = LLMChain(llm=query_rewrite_llm, prompt=clean_prompt)
# response = cleaner_chain.invoke({"correction_text": user_correction})
# cleaned_correction = response.get('text', user_correction).strip()
# logger.info(f"Cleaned user correction from '{user_correction}' to '{cleaned_correction}'")
# except Exception as e:
# logger.error(f"Could not clean correction text due to error: {e}")
# # Fallback to the original correction if cleaning fails
# cleaned_correction = user_correction
# try:
# correction_entry = {
# "instruction": original_question,
# "response": cleaned_correction,
# "feedback_category": "inline_correction"
# }
# with open(USER_CORRECTIONS_PATH, "a", encoding="utf-8") as f:
# f.write(json.dumps(correction_entry) + "\n")
# logger.info(f"Saved inline correction: Q: {original_question} | A: {cleaned_correction}")
# return "Correction saved successfully."
# except Exception as e:
# logger.error(f"Error saving inline correction: {e}", exc_info=True)
# return "Failed to save correction."
# # --- Agent Tools Definition ---
# class CorrectionToolInput(BaseModel):
# original_question: str = Field(description="The user's original question that received an incorrect answer. This must be retrieved from the conversation history.")
# corrected_answer: str = Field(description="The new, correct information provided by the user in their latest message.")
# class SaveCorrectionTool(BaseTool):
# name = "SaveCorrectionTool"
# description = "Use this tool ONLY when the user is correcting an answer you previously gave. You MUST identify the original question from the chat history and the user's new message as the corrected answer."
# args_schema: type[BaseModel] = CorrectionToolInput
# def _run(self, original_question: str, corrected_answer: str) -> str:
# return save_user_correction(original_question, corrected_answer)
# async def _arun(self, original_question: str, corrected_answer: str) -> str:
# # For async compatibility
# return self._run(original_question, corrected_answer)
# def load_and_setup_rag_components_and_agent():
# global agent_executor, query_rewrite_llm
# logger.info("Loading main LLM model (for tools and agent)...")
# google_api_key = os.getenv("GOOGLE_API_KEY")
# if not google_api_key: raise ValueError("GOOGLE_API_KEY environment variable not found!")
# llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key, temperature=0.2)
# # Initialize the query rewriter LLM here so it's available for the correction tool
# if query_rewrite_llm is None:
# try:
# query_rewrite_llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key, temperature=0.0)
# logger.info("Query rewriting/cleaning LLM initialized successfully.")
# except Exception as e:
# logger.error(f"Failed to initialize query rewriting/cleaning LLM: {e}")
# query_rewrite_llm = None
# logger.info("Loading HuggingFace embeddings model...")
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# lmi_docs, labour_act_docs = [], []
# for pdf_path in LMI_PDF_PATHS:
# if os.path.exists(pdf_path):
# try:
# loader = PyPDFLoader(pdf_path)
# pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
# for page in pages: page.metadata["source"] = os.path.basename(pdf_path)
# lmi_docs.extend(pages)
# except Exception as e: logger.error(f"Error processing LMI PDF {pdf_path}: {e}")
# else: logger.warning(f"LMI PDF file ({pdf_path}) not found.")
# lmi_docs.extend(load_jsonl_data(JSONL_DATASET_PATH, "main_dataset"))
# lmi_docs.extend(load_jsonl_data(USER_CORRECTIONS_PATH, "user_corrections"))
# if os.path.exists(LABOUR_ACT_PDF_PATH):
# try:
# loader = PyPDFLoader(LABOUR_ACT_PDF_PATH)
# pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
# for page in pages: page.metadata["source"] = os.path.basename(LABOUR_ACT_PDF_PATH)
# labour_act_docs.extend(pages)
# except Exception as e: logger.error(f"Error processing Labour Act PDF: {e}")
# else: logger.warning(f"Labour Act PDF file ({LABOUR_ACT_PDF_PATH}) not found.")
# lmi_retriever = create_retriever_from_documents(lmi_docs, embeddings)
# labour_act_retriever = create_retriever_from_documents(labour_act_docs, embeddings)
# if not lmi_retriever and not labour_act_retriever: raise ValueError("No documents loaded.")
# rag_tool_prompt = PromptTemplate.from_template(
# "You are an assistant answering based on the provided context.\n"
# "Use ONLY the '### Context' to answer the '### Question'.\n"
# "If the context is empty or irrelevant, you MUST state: 'NO_INFORMATION_FOUND_IN_CONTEXT'.\n"
# "### Context:\n{context}\n\n### Question:\n{question}\n\n### Answer:"
# )
# rag_chain_for_tool = LLMChain(llm=llm, prompt=rag_tool_prompt, output_parser=StrOutputParser())
# def run_rag_retrieval_tool(query: str, retriever) -> str:
# if not retriever: return "This knowledge source is not available."
# try:
# context_docs = retriever.invoke(query)
# if not context_docs: return "NO_INFORMATION_FOUND_IN_CONTEXT"
# context_text = "\n\n---\n\n".join([doc.page_content for doc in context_docs])
# return rag_chain_for_tool.invoke({"context": context_text, "question": query})
# except Exception as e: return "ERROR_IN_TOOL_EXECUTION"
# tools = [SaveCorrectionTool()]
# if lmi_retriever:
# tools.append(Tool(
# name="LMICompanyPolicyTool",
# func=lambda query: run_rag_retrieval_tool(query, lmi_retriever),
# description="Use for questions about LMI Holdings' internal policies (HR, L&D), procedures, and user-corrected info. This is your primary source for LMI-specific data.",
# ))
# if labour_act_retriever:
# tools.append(Tool(
# name="GhanaLabourActTool",
# func=lambda query: run_rag_retrieval_tool(query, labour_act_retriever),
# description="Use for questions about Ghana's national labour laws, employment regulations, and worker rights as defined in the Labour Act, 2003.",
# ))
# agent_prompt = hub.pull("hwchase17/react-chat")
# agent = create_react_agent(llm, tools, agent_prompt)
# agent_executor = AgentExecutor(
# agent=agent, tools=tools, verbose=True,
# handle_parsing_errors=True, max_iterations=7
# )
# logger.info(f"Agent Executor initialized with {len(tools)} tools.")
# # --- System Startup ---
# logger.info("Initializing system components...")
# try:
# load_and_setup_rag_components_and_agent()
# except Exception as e:
# logger.critical(f"Failed to initialize system: {e}", exc_info=True)
# agent_executor = None
# def rewrite_query(question: str) -> str:
# global query_rewrite_llm
# if not query_rewrite_llm:
# logger.warning("Query rewriting LLM not available. Using original query.")
# return question
# if "the act" in question.lower() and "labour" not in question.lower():
# question = question.replace("the act", "the Ghana Labour Act")
# logger.info(f"Rule-based addition before LLM rewrite: '{question}'")
# rewrite_prompt_template = """Rephrase the user's question into a clear, canonical question.
# Examples:
# User: "what does the act say about overtime?" -> Rephrased: "What does the Ghana Labour Act 2003 say about overtime?"
# User: "how many days off can I get" -> Rephrased: "What is the LMI Holdings policy on annual leave entitlement?"
# User: "{user_input}" -> Rephrased:"""
# rewrite_prompt = PromptTemplate.from_template(rewrite_prompt_template)
# rewriter_chain = LLMChain(llm=query_rewrite_llm, prompt=rewrite_prompt)
# response = rewriter_chain.invoke({"user_input": question})
# rephrased_question = response.get('text', question).strip()
# if rephrased_question.lower() != question.lower():
# logger.info(f"LLM rephrased query: '{question}' -> '{rephrased_question}'")
# return rephrased_question
# return question
# @app.post("/ask")
# async def ask(question_data: Question) -> Dict[str, Any]:
# logger.info(f"Received question: {question_data.text} for session: {question_data.session_id}")
# normalized_check_text = ''.join(c for c in question_data.text.lower() if c.isalnum() or c.isspace()).strip()
# response = None
# if normalized_check_text in GREETINGS: response = random.choice(GREETING_RESPONSES)
# elif normalized_check_text in THANKS_PHRASES: response = random.choice(THANKS_RESPONSES)
# if response:
# memory = get_session_memory(question_data.session_id)
# memory.chat_memory.add_user_message(question_data.text)
# memory.chat_memory.add_ai_message(response)
# return {"answer": response, "type": "conversational"}
# if not agent_executor:
# raise HTTPException(status_code=503, detail="LMI Chatbot is not available.")
# try:
# # Don't rewrite potential corrections. The agent needs to see the raw text.
# # We can let the agent decide if a rewrite is needed, or do it after intent classification.
# # For now, we pass the original text. The agent's LLM is powerful enough to understand it.
# # rewritten_question_text = rewrite_query(question_data.text)
# memory = get_session_memory(question_data.session_id)
# agent_input = {
# "input": question_data.text, # Pass original text to let agent see correction cues like "No..."
# "chat_history": memory.chat_memory.messages
# }
# response_payload = await agent_executor.ainvoke(agent_input)
# final_answer = response_payload.get("output", "I apologize, I encountered an issue.")
# # If the agent used the correction tool, the tool itself provides the confirmation.
# # If it found no info, we rephrase.
# if "NO_INFORMATION_FOUND_IN_CONTEXT" in final_answer:
# final_answer = "I'm not sure about that, but I'd be happy to help if you provide more details!"
# logger.info("Overriding 'not found' to user-friendly message.")
# memory.chat_memory.add_user_message(question_data.text)
# memory.chat_memory.add_ai_message(final_answer)
# return {"answer": final_answer, "type": "agent_response"}
# except Exception as e:
# logger.error(f"Error in /ask endpoint for session {question_data.session_id}: {str(e)}", exc_info=True)
# raise HTTPException(status_code=500, detail="I'm sorry, I encountered an unexpected issue.")
# @app.get("/")
# async def root():
# return {"message": "LMI Chatbot (Agent Mode with Conversational Learning) is running."}
# NOTE: we will come back in case of any issues ("yawa") — the commented-out legacy implementation below is kept for reference
# import os
# import nltk
# import logging
# import json
# import numpy as np # Not used directly, but often a dependency for ML libraries
# from sklearn.cluster import KMeans # Not used directly in this version
# from typing import List, Optional, Dict, Any
# import re
# import random
# # Initialize logger early for setup messages
# logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
# logger = logging.getLogger(__name__)
# # --- Robust NLTK 'punkt' setup ---
# nltk_data_dir = "/app/nltk_data"
# try:
# os.makedirs(nltk_data_dir, exist_ok=True)
# except OSError as e:
# logger.error(f"Could not create NLTK data directory {nltk_data_dir}: {e}")
# if nltk_data_dir not in nltk.data.path:
# nltk.data.path.append(nltk_data_dir)
# try:
# nltk.data.find('tokenizers/punkt')
# except LookupError:
# logger.warning(f"'punkt' resource not found by NLTK. Attempting to download to {nltk_data_dir}...")
# try:
# nltk.download('punkt', download_dir=nltk_data_dir, quiet=True)
# logger.info(f"'punkt' downloaded successfully to {nltk_data_dir}.")
# nltk.data.find('tokenizers/punkt')
# except Exception as e:
# logger.error(f"Failed to download 'punkt': {e}", exc_info=True)
# raise RuntimeError(f"NLTK 'punkt' resource is missing and download failed: {e}")
# # --- End of NLTK 'punkt' setup ---
# os.environ["HF_HOME"] = os.environ.get("HF_HOME", "/app/cache")
# os.environ["XDG_CACHE_HOME"] = os.environ.get("XDG_CACHE_HOME", "/app/cache")
# os.environ["TMPDIR"] = os.environ.get("TMPDIR", "/app/tmp")
# os.makedirs(os.environ["HF_HOME"], exist_ok=True)
# os.makedirs(os.environ["TMPDIR"], exist_ok=True)
# from fastapi import FastAPI, HTTPException
# from pydantic import BaseModel
# from langchain_community.document_loaders import PyPDFLoader
# from langchain_community.vectorstores import FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings
# from langchain.prompts import PromptTemplate
# from langchain.chains import LLMChain
# from langchain_core.output_parsers import StrOutputParser
# from langchain_community.retrievers import BM25Retriever
# from langchain.retrievers import EnsembleRetriever
# from langchain_google_genai import ChatGoogleGenerativeAI
# from langchain_core.documents import Document
# from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.agents import AgentExecutor, create_react_agent
# from langchain_core.tools import Tool
# from langchain import hub
# from langchain.memory import ConversationBufferWindowMemory
# from langchain_core.messages import AIMessage, HumanMessage
# app = FastAPI(title="LMI Chatbot with Agent and Specialized Tools")
# # --- Pydantic Models ---
# class Question(BaseModel):
# text: str
# session_id: Optional[str] = "default_session"
# class Feedback(BaseModel):
# question: str
# original_answer: str
# corrected_answer: str
# category: Optional[str] = None
# # --- File Paths ---
# JSONL_DATASET_PATH = "cleaned-dataset.jsonl"
# USER_CORRECTIONS_PATH = "user_corrections.jsonl"
# # List of LMI-specific PDF paths
# LMI_PDF_PATHS = [
# "LMI-HUMAN-RESOURCE-POLICY.pdf",
# "LMI-L&D-POLICY1.pdf",
# ]
# # Path for the Labour Act PDF
# LABOUR_ACT_PDF_PATH = "Labour-Act-2003.pdf"
# # --- Predefined Conversational Snippets ---
# GREETINGS = ["hello", "hi", "hey", "greetings", "good morning", "good afternoon", "good evening", "howdy", "yo", "sup"]
# GREETING_RESPONSES = [
# "Hello there! I'm the LMI Chatbot. How can I assist you today?",
# "Hi! I'm the LMI Chatbot, here to help with your questions about LMI Holdings. What's on your mind?",
# "Hey! I'm the LMI Chatbot. How can I help you?",
# "Greetings! I'm the LMI Chatbot. Ask me anything about LMI Holdings.",
# ]
# THANKS_PHRASES = ["thank you", "thanks", "thank u", "thx", "appreciate it", "much appreciated"]
# THANKS_RESPONSES = [
# "You're welcome!", "Happy to help!", "No problem at all!", "Glad I could assist!", "Anytime!",
# ]
# # --- Global Components (initialized later) ---
# query_rewrite_llm = None
# agent_executor = None
# conversation_memories: Dict[str, ConversationBufferWindowMemory] = {}
# # --- Helper Functions ---
# def get_session_memory(session_id: str) -> ConversationBufferWindowMemory:
# """Retrieves or creates a memory buffer for a given session ID."""
# if session_id not in conversation_memories:
# conversation_memories[session_id] = ConversationBufferWindowMemory(
# k=5, memory_key="chat_history", return_messages=True
# )
# logger.info(f"Created new memory for session_id: {session_id}")
# return conversation_memories[session_id]
# def load_jsonl_data(file_path: str, source_name: str) -> List[Document]:
# docs = []
# if os.path.exists(file_path):
# with open(file_path, "r", encoding="utf-8") as f:
# for line_num, line in enumerate(f):
# try:
# data = json.loads(line.strip())
# instruction = data.get('instruction', data.get('question', ''))
# response = data.get('response', data.get('answer', ''))
# if not instruction or not response:
# logger.warning(f"Skipping line in {file_path}: missing instruction or response.")
# continue
# content = f"Instruction: {instruction}\nResponse: {response}"
# metadata = {"source": source_name, "original_instruction": instruction}
# docs.append(Document(page_content=content, metadata=metadata))
# except json.JSONDecodeError:
# logger.warning(f"Skipping invalid JSON in {file_path} at line {line_num + 1}")
# else:
# logger.warning(f"{source_name} file ({file_path}) not found.")
# return docs
# def create_retriever_from_documents(docs: List[Document], embeddings) -> Optional[EnsembleRetriever]:
# """Creates an EnsembleRetriever from a list of documents."""
# if not docs:
# return None
# seen_content = set()
# unique_docs = []
# for doc in docs:
# if doc.page_content not in seen_content:
# unique_docs.append(doc)
# seen_content.add(doc.page_content)
# logger.info(f"Building vector store for {len(unique_docs)} unique documents...")
# vectorstore = FAISS.from_documents(unique_docs, embeddings, distance_strategy="COSINE")
# faiss_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
# logger.info(f"Building BM25 retriever for {len(unique_docs)} unique documents...")
# bm25_retriever = BM25Retriever.from_documents(unique_docs)
# bm25_retriever.k = 5
# return EnsembleRetriever(retrievers=[faiss_retriever, bm25_retriever], weights=[0.5, 0.5])
# # *** ADDED MISSING FUNCTION DEFINITION HERE ***
# def initialize_query_rewrite_llm():
# """Initializes a separate LLM instance for query rewriting if not already done."""
# global query_rewrite_llm
# if query_rewrite_llm is None:
# try:
# logger.info("Initializing LLM for query rewriting...")
# google_api_key = os.getenv("GOOGLE_API_KEY")
# if not google_api_key:
# logger.error("GOOGLE_API_KEY not found for query_rewrite_llm.")
# return False
# query_rewrite_llm = ChatGoogleGenerativeAI(
# model="gemini-1.5-flash-latest",
# google_api_key=google_api_key,
# temperature=0.0,
# max_output_tokens=150
# )
# logger.info("Query rewriting LLM initialized successfully.")
# return True
# except Exception as e:
# logger.error(f"Failed to initialize query rewriting LLM: {e}", exc_info=True)
# query_rewrite_llm = None
# return False
# return True
# def load_and_setup_rag_components_and_agent():
# global agent_executor
# logger.info("Loading main LLM model (for tools and agent)...")
# google_api_key = os.getenv("GOOGLE_API_KEY")
# if not google_api_key:
# raise ValueError("GOOGLE_API_KEY environment variable not found!")
# llm = ChatGoogleGenerativeAI(model="gemini-2.0-flash", google_api_key=google_api_key, temperature=0.2)
# logger.info("Loading HuggingFace embeddings model...")
# embeddings = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
# # --- Load LMI Specific Documents ---
# lmi_docs = []
# for pdf_path in LMI_PDF_PATHS:
# if os.path.exists(pdf_path):
# try:
# loader = PyPDFLoader(pdf_path)
# pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
# for page in pages: page.metadata["source"] = os.path.basename(pdf_path)
# lmi_docs.extend(pages)
# logger.info(f"Loaded and split {len(pages)} chunks from {pdf_path}.")
# except Exception as e:
# logger.error(f"Error processing LMI PDF {pdf_path}: {e}")
# else:
# logger.warning(f"LMI PDF file ({pdf_path}) not found.")
# lmi_docs.extend(load_jsonl_data(JSONL_DATASET_PATH, "main_dataset"))
# lmi_docs.extend(load_jsonl_data(USER_CORRECTIONS_PATH, "user_corrections"))
# # --- Load Labour Act Documents ---
# labour_act_docs = []
# if os.path.exists(LABOUR_ACT_PDF_PATH):
# try:
# loader = PyPDFLoader(LABOUR_ACT_PDF_PATH)
# pages = loader.load_and_split(RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200))
# for page in pages: page.metadata["source"] = os.path.basename(LABOUR_ACT_PDF_PATH)
# labour_act_docs.extend(pages)
# logger.info(f"Loaded and split {len(pages)} chunks from {LABOUR_ACT_PDF_PATH}.")
# except Exception as e:
# logger.error(f"Error processing Labour Act PDF: {e}")
# else:
# logger.warning(f"Labour Act PDF file ({LABOUR_ACT_PDF_PATH}) not found.")
# # --- Create Specialized Retrievers ---
# lmi_retriever = create_retriever_from_documents(lmi_docs, embeddings)
# labour_act_retriever = create_retriever_from_documents(labour_act_docs, embeddings)
# if not lmi_retriever and not labour_act_retriever:
# raise ValueError("Cannot build RAG system: No documents loaded at all.")
# # --- RAG Chain for Synthesizing Answers ---
# rag_tool_prompt_template = """You are an assistant answering questions based on the provided context.
# Use ONLY the information from the '### Context' to answer the '### Question'.
# If the context is empty or does not contain a relevant answer, you MUST state: 'NO_INFORMATION_FOUND_IN_CONTEXT'. Do not apologize or add extra phrases.
# ### Context:
# {context}
# ### Question:
# {question}
# ### Answer:"""
# rag_tool_prompt = PromptTemplate.from_template(rag_tool_prompt_template)
# rag_chain_for_tool = LLMChain(llm=llm, prompt=rag_tool_prompt, output_parser=StrOutputParser())
# # --- Define Tool Logic ---
# def run_rag_retrieval_tool(query: str, retriever) -> str:
# """Generic RAG tool logic using a provided retriever."""
# if not retriever:
# return "This knowledge source is not available."
# try:
# context_docs = retriever.invoke(query)
# if not context_docs:
# return "NO_INFORMATION_FOUND_IN_CONTEXT"
# context_text = "\n\n---\n\n".join([doc.page_content for doc in context_docs])
# return rag_chain_for_tool.invoke({"context": context_text, "question": query})
# except Exception as e:
# logger.error(f"Error in RAG retrieval tool execution: {e}")
# return "ERROR_IN_TOOL_EXECUTION"
# # --- Define Specialized Tools ---
# tools = []
# if lmi_retriever:
# tools.append(Tool(
# name="LMICompanyPolicyTool",
# func=lambda query: run_rag_retrieval_tool(query, lmi_retriever),
# description="Use this tool for questions about LMI Holdings' internal policies (like HR or L&D), procedures, company overview, mission, and user-corrected information. This is your primary source for LMI-specific internal data.",
# ))
# if labour_act_retriever:
# tools.append(Tool(
# name="GhanaLabourActTool",
# func=lambda query: run_rag_retrieval_tool(query, labour_act_retriever),
# description="Use this tool specifically for questions about Ghana's national labour laws, employment regulations, worker rights, and employer obligations as defined in the Labour Act, 2003.",
# ))
# tools.append(Tool(
# name="GoogleSearch",
# func=lambda query: f"Simulated Google Search: Information about '{query}' that compares LMI with other companies or provides general public data would be retrieved here.",
# description="Use this tool *only* when asked to compare LMI with other companies, or for general public information not about LMI's internal policies or Ghana's Labour Act.",
# ))
# # --- Agent Setup ---
# agent_prompt = hub.pull("hwchase17/react-chat")
# agent = create_react_agent(llm, tools, agent_prompt)
# agent_executor = AgentExecutor(
# agent=agent, tools=tools, verbose=True,
# handle_parsing_errors=True, max_iterations=7
# )
# logger.info(f"Agent Executor initialized successfully with {len(tools)} tools.")
# logger.info("Initializing system components...")
# try:
# load_and_setup_rag_components_and_agent()
# initialize_query_rewrite_llm()
# except Exception as e:
# logger.critical(f"Failed to initialize system: {e}", exc_info=True)
# agent_executor = None
# def rewrite_query(question: str) -> str:
# """Rewrites user queries for better retrieval, adding context if needed."""
# q_lower = question.lower()
# if initialize_query_rewrite_llm() and query_rewrite_llm:
# logger.info(f"Attempting LLM-based rewrite for: '{question}'")
# if "the act" in q_lower and "labour" not in q_lower:
# question = question.replace("the act", "the Ghana Labour Act")
# logger.info(f"Rule-based addition before LLM rewrite: '{question}'")
# rewrite_prompt_template = """Your task is to rephrase the user's question into a clear, concise, and canonical question. Add context if necessary.
# Examples:
# User question: "what does the act say about overtime?" -> Rephrased: "What does the Ghana Labour Act 2003 say about overtime?"
# User question: "how many days off can I get" -> Rephrased: "What is the LMI Holdings policy on annual leave entitlement?"
# User question: "{user_input}" -> Rephrased:"""
# rewrite_prompt = PromptTemplate.from_template(rewrite_prompt_template)
# rewriter_chain = LLMChain(llm=query_rewrite_llm, prompt=rewrite_prompt)
# response = rewriter_chain.invoke({"user_input": question})
# rephrased_question = response.get('text', question).strip()
# if rephrased_question.lower() != question.lower():
# logger.info(f"LLM rephrased query: '{question}' -> '{rephrased_question}'")
# return rephrased_question
# return question
# logger.warning(f"LLM for query rewriting not available. Using original query: '{question}'")
# return question
# @app.post("/ask")
# async def ask(question_data: Question) -> Dict[str, Any]:
# logger.info(f"Received question via /ask endpoint: {question_data.text} for session: {question_data.session_id}")
# normalized_check_text = ''.join(c for c in question_data.text.lower() if c.isalnum() or c.isspace()).strip()
# if normalized_check_text in GREETINGS:
# response = random.choice(GREETING_RESPONSES)
# elif normalized_check_text in THANKS_PHRASES:
# response = random.choice(THANKS_RESPONSES)
# else:
# response = None
# if response:
# logger.info(f"Handling as conversational snippet: {response}")
# memory = get_session_memory(question_data.session_id)
# memory.chat_memory.add_user_message(question_data.text)
# memory.chat_memory.add_ai_message(response)
# return {"answer": response, "type": "conversational", "session_id": question_data.session_id}
# if not agent_executor:
# raise HTTPException(status_code=503, detail="LMI Chatbot is not available. Please try again later.")
# try:
# rewritten_question_text = rewrite_query(question_data.text)
# memory = get_session_memory(question_data.session_id)
# agent_input = {
# "input": rewritten_question_text,
# "chat_history": memory.chat_memory.messages
# }
# response_payload = await agent_executor.ainvoke(agent_input)
# final_answer = response_payload.get("output", "I apologize, I encountered an issue processing your request.")
# if "NO_INFORMATION_FOUND_IN_CONTEXT" in final_answer or "ERROR_IN_TOOL_EXECUTION" in final_answer:
# final_answer = "I'm not sure about that, but I'd be happy to help if you provide more details or clarify your question!"
# logger.info("Overriding 'not found' or 'error' to user-friendly message.")
# memory.chat_memory.add_user_message(question_data.text)
# memory.chat_memory.add_ai_message(final_answer)
# return {"answer": final_answer, "type": "agent_response", "session_id": question_data.session_id}
# except Exception as e:
# logger.error(f"Error in /ask endpoint for session {question_data.session_id}: {str(e)}", exc_info=True)
# raise HTTPException(status_code=500, detail="I'm sorry, I encountered an unexpected issue.")
# @app.post("/feedback")
# async def handle_feedback(feedback_data: Feedback):
# logger.info(f"Received feedback: {feedback_data.dict()}")
# try:
# correction_entry = {
# "instruction": feedback_data.question,
# "response": feedback_data.corrected_answer,
# "original_bot_answer": feedback_data.original_answer,
# "feedback_category": feedback_data.category or "user_corrected"
# }
# with open(USER_CORRECTIONS_PATH, "a", encoding="utf-8") as f:
# f.write(json.dumps(correction_entry) + "\n")
# logger.info(f"Feedback successfully saved to {USER_CORRECTIONS_PATH}.")
# return {"message": "Feedback received and saved. It will be incorporated after the next system update/restart."}
# except Exception as e:
# logger.error(f"Error saving feedback: {e}", exc_info=True)
# raise HTTPException(status_code=500, detail=f"Could not save feedback: {e}")
# @app.get("/")
# async def root():
# logger.info("Root endpoint '/' accessed.")
# if not agent_executor:
# return {"message": "LMI Chatbot is starting up or encountered an issue. System may not be fully available."}
# return {"message": "LMI Chatbot (Agent Mode with Specialized Tools) is running and ready to answer questions!"}