# load llm
from dotenv import load_dotenv
import os

load_dotenv()

from langchain.chat_models import init_chat_model

llm = init_chat_model("gpt-5-nano", model_provider="openai", api_key=os.environ["OPENAI_API_KEY"])
print("LLM Init.")

# load retriever
from azure.storage.blob import BlobServiceClient
from langchain_community.vectorstores import FAISS


def load_from_azure(container_name, local_dir="./index"):
    connection_string = os.environ["AZURE_CONN_STR"]
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)
    os.makedirs(local_dir, exist_ok=True)
    # Download all files in the container (index.faiss and index.pkl)
    for blob in container_client.list_blobs():
        download_file_path = os.path.join(local_dir, blob.name)
        with open(download_file_path, "wb") as file:
            file.write(container_client.download_blob(blob).readall())


# Download files from Azure
print("start download faiss")
load_from_azure("blobcontaineravatarbot")
print("ok.")

# Load into FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings  # deprecated
from langchain_huggingface import HuggingFaceEmbeddings

print("load embeddings")
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    # multi_process=True,
    model_kwargs={"device": "cuda"},  # use CUDA for faster embeddings on NVIDIA GPUs
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)

print("load vector store")
# We serialized this index ourselves, so deserializing its pickle file is safe here
vectorstore = FAISS.load_local("./index", embedding_model, allow_dangerous_deserialization=True)

# Include a rate limiter
from collections import defaultdict
from datetime import datetime, timedelta


class RateLimiter:
    def __init__(self, max_requests=10, window_minutes=60):
        self.max_requests = max_requests
        self.window = timedelta(minutes=window_minutes)
        self.requests = defaultdict(list)

    def is_allowed(self, identifier):
        now = datetime.now()
        # Drop requests that have aged out of the window
        self.requests[identifier] = [
            req_time for req_time in self.requests[identifier]
            if now - req_time < self.window
        ]
        if len(self.requests[identifier]) < self.max_requests:
            self.requests[identifier].append(now)
            return True
        return False

    def get_remaining(self, identifier):
        now = datetime.now()
        self.requests[identifier] = [
            req_time for req_time in self.requests[identifier]
            if now - req_time < self.window
        ]
        return self.max_requests - len(self.requests[identifier])


print("Rate Limit init.")
limiter = RateLimiter(max_requests=10, window_minutes=60)
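# A quick self-test of the limiter (illustrative addition, not part of the app's
# logic; "selftest" is a throwaway identifier rather than a real client IP, so
# no user's quota is consumed):
assert limiter.is_allowed("selftest")
print(f"limiter self-test ok: {limiter.get_remaining('selftest')} requests left")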
# setup chatbot
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
import gradio as gr


def predict(message, history, request: gr.Request):
    # Get client IP and check rate limit
    client_ip = request.client.host
    if not limiter.is_allowed(client_ip):
        remaining_time = "an hour"  # you could compute the exact wait time if needed
        return f"**Rate limit exceeded.** You've used your 10 requests per hour. Please try again in {remaining_time}."

    # Safeguard
    TRIAGE_PROMPT_TEMPLATE = """You are a safeguard assistant making sure the user only asks for information related to Rémi Cazelles's projects, work and education.
If the question is not related to these subjects, or if the request is harmful, flag the user by answering '*** FLAGGED ***'; otherwise simply answer '*** OK ***'."""
    messages = [SystemMessage(content=TRIAGE_PROMPT_TEMPLATE), HumanMessage(content=message)]
    safe_gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", "RAG-Bot", "safeguard", "V1"],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "message": message,
            },
        },
    )
    if "*** OK ***" not in safe_gpt_response.content:
        return "This app can only answer questions about Rémi Cazelles's projects, work and education."
    print("passed the safeguard")

    # Build conversation history
    history_langchain_format = []
    for msg in history:
        if msg["role"] == "user":
            history_langchain_format.append(HumanMessage(content=msg["content"]))
        elif msg["role"] == "assistant":
            history_langchain_format.append(AIMessage(content=msg["content"]))

    # Retrieve relevant documents for the current message
    relevant_docs = vectorstore.similarity_search(message, k=3)

    # Build context from retrieved documents
    context = "\nExtracted documents:\n" + "\n".join(
        f"Document {i}: Content: {doc.page_content}\n\n---"
        for i, doc in enumerate(relevant_docs)
    )

    # RAG tool
    RAG_PROMPT_TEMPLATE = """Using the information contained in the context, give a comprehensive answer to the question.
Respond only to the question asked; the response should be concise and relevant to the question.
Provide the source URL and date of the context document when relevant.
If the answer cannot be deduced from the context, do not give an answer."""

    # Create the prompt with system message, context, and conversation history
    messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
    messages.extend(history_langchain_format)
    combined_message = f"Context: {context}\n\nQuestion: {message}"
    messages.append(HumanMessage(content=combined_message))

    # Get response with tracking metadata
    print("GPT about to answer")
    gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", "RAG-Bot", "V1"],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "num_retrieved_docs": len(relevant_docs),
            },
        },
    )

    source_context = "\nSources:\n" + "\n".join(
        f"{doc.metadata['source'].split('/')[-1]} ({doc.metadata.get('date')})\n---"
        for doc in relevant_docs
    )
    print(gpt_response.content)
    print(source_context)
    return gpt_response.content + "\n\n" + source_context


# setup tracking
os.environ["LANGSMITH_PROJECT"] = "Testing_POC"
os.environ["LANGSMITH_TRACING"] = "true"
# LANGSMITH_API_KEY is already loaded from .env by load_dotenv() above

# launch gradio app
iface = gr.ChatInterface(
    predict,
    api_name="chat",
)
print("Launch ...")
iface.launch(share=True)
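# Optional smoke test of the retrieval step (illustrative sketch, left commented
# out because iface.launch() above blocks; the query string is only an example):
# docs = vectorstore.similarity_search("What are Rémi Cazelles's projects?", k=3)
# for doc in docs:
#     print(doc.metadata.get("source"), "->", doc.page_content[:80])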