# ---------------------------------------------------------------------------
# RAG chatbot answering questions about Rémi Cazelles's projects, work and
# education. Pipeline: init LLM -> download FAISS index from Azure ->
# load embeddings/vector store -> Gradio chat UI with a safeguard prompt,
# per-IP rate limiting, and retrieved-source citations.
# ---------------------------------------------------------------------------

# load llm
from dotenv import load_dotenv
import os

load_dotenv()

from langchain.chat_models import init_chat_model

llm = init_chat_model(
    "gpt-5-nano",
    model_provider="openai",
    api_key=os.environ["OPENAI_API_KEY"],
)
print("LLM Init.")

# load retriever
from azure.storage.blob import BlobServiceClient
from langchain_community.vectorstores import FAISS


def load_from_azure(container_name, local_dir="./index"):
    """Download every blob of an Azure container into a local directory.

    Used to fetch the serialized FAISS index files (index.faiss / index.pkl).

    Args:
        container_name: name of the Azure blob container.
        local_dir: destination directory, created if missing.
    """
    connection_string = os.environ["AZURE_CONN_STR"]
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    os.makedirs(local_dir, exist_ok=True)

    # Download all files in the container (index.faiss and index.pkl)
    for blob in container_client.list_blobs():
        download_file_path = os.path.join(local_dir, blob.name)
        with open(download_file_path, "wb") as file:
            file.write(container_client.download_blob(blob).readall())


# Download files from Azure
print("start download faiss")
load_from_azure("blobcontaineravatarbot")
print("ok.")

# Load into FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings  # deprecated
from langchain_huggingface import HuggingFaceEmbeddings

print("load embeddings")
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    # multi_process=True,
    model_kwargs={"device": "cpu"},  # use cuda for faster embeddings on NVIDIA GPUs
    encode_kwargs={"normalize_embeddings": True},  # set True for cosine similarity
)

print("load vector store")
vectorstore = FAISS.load_local(
    "./index", embedding_model, allow_dangerous_deserialization=True
)

# Include a rate limiter
from collections import defaultdict
from datetime import datetime, timedelta


class RateLimiter:
    """Sliding-window rate limiter keyed by an arbitrary identifier (e.g. client IP)."""

    def __init__(self, max_requests=10, window_minutes=60):
        self.max_requests = max_requests
        self.window = timedelta(minutes=window_minutes)
        # identifier -> list of request timestamps inside the current window
        self.requests = defaultdict(list)

    def _prune(self, identifier, now):
        # Drop timestamps older than the window so counts reflect recent use only.
        # (Factored out: was duplicated in is_allowed and get_remaining.)
        self.requests[identifier] = [
            req_time
            for req_time in self.requests[identifier]
            if now - req_time < self.window
        ]

    def is_allowed(self, identifier):
        """Record one request for *identifier*; return False when over the limit."""
        now = datetime.now()
        self._prune(identifier, now)
        if len(self.requests[identifier]) < self.max_requests:
            self.requests[identifier].append(now)
            return True
        return False

    def get_remaining(self, identifier):
        """Return how many requests *identifier* still has in the current window."""
        now = datetime.now()
        self._prune(identifier, now)
        return self.max_requests - len(self.requests[identifier])


print("Rate Limit init.")
limiter = RateLimiter(max_requests=10, window_minutes=60)


# helper func
def format_source(doc):
    """Format a citation string from a document's metadata["source"] path.

    Handles GitHub API URLs, internet pages and uploaded files (pdf).

    Args:
        doc: a langchain Document.

    Returns:
        str: formatted source from the langchain Document.
    """
    source = doc.metadata["source"]
    if "api.github" in source:
        # Strip the "/blob/..." suffix and the "api." host prefix -> repo URL.
        return source.split("/blob")[0].replace("api.", "")
    elif "https://" in source:
        return source
    elif "data" in source:
        # BUG FIX: key was garbled as "pagpage_labele", and the f-string
        # divided page_label by total_page instead of rendering "label/total".
        page_label = doc.metadata["page_label"]
        total_page = doc.metadata["total_page"]
        return f"{source.split('/')[-1]} page({page_label}/{total_page})"
    # Fallback: return the raw source instead of silently returning None.
    return source


# setup chatbot
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
import gradio as gr


def predict(message, history, request: gr.Request):
    """Gradio chat handler: rate-limit, safeguard, then RAG-answer the message.

    Args:
        message: the user's latest input string.
        history: prior turns as [{"role": ..., "content": ...}] dicts.
        request: Gradio request object, used for the client IP.

    Returns:
        str: the assistant reply (answer + sources, or a refusal/limit notice).
    """
    # Get client IP and check rate limit
    client_ip = request.client.host
    if not limiter.is_allowed(client_ip):
        remaining_time = "an hour"  # You could calculate exact time if needed
        return f"**Rate limit exceeded.** You've used your 10 requests per hour. Please try again in {remaining_time}."

    # Safeguard: a first LLM pass flags off-topic or harmful requests.
    TRIAGE_PROMPT_TEMPLATE = """You are a Safeguard assistant making sure the user only ask for information related to Rémi Cazelles's projects, work and education. 
Here are general information you can use to answer: If the question is not related to this subjects, or if the request is harmfull you should flag the user by answering '*** FLAGGED ***' """
    messages = [SystemMessage(content=TRIAGE_PROMPT_TEMPLATE)]
    messages.append(HumanMessage(content=message))
    safe_gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", "RAG-Bot", "safeguard", "V1"],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "message": message,
            },
        },
    )
    if "*** FLAGGED ***" in safe_gpt_response.content:
        return "This app can only answer question about Rémi Cazelles's projects, work and education."
    print("passed the safeguard")

    WELCOME_TEXT = "This bot allows you finding informations related to Rémi Cazelles's projects, work and education"
    if not history:
        # Gradio expects a list of dicts with keys "role" and "content"
        history = [{"role": "assistant", "content": WELCOME_TEXT}]

    # Build conversation history in LangChain message form
    history_langchain_format = []
    for msg in history:
        if msg["role"] == "user":
            history_langchain_format.append(HumanMessage(content=msg["content"]))
        elif msg["role"] == "assistant":
            history_langchain_format.append(AIMessage(content=msg["content"]))

    # Retrieve relevant documents for the current message
    relevant_docs = vectorstore.similarity_search(message, k=5)

    # Build context from retrieved documents
    context = "\nExtracted documents:\n" + "\n".join(
        [
            f"Content document {i}: {doc.page_content}\n\n---"
            for i, doc in enumerate(relevant_docs)
        ]
    )

    # RAG tool
    RAG_PROMPT_TEMPLATE = """You will be asked information related to Rémi Cazelles's specific projects, work and education. Using the information contained in the context, provide a comprehensive answer to the question. Respond to the question asked with enought details, response should be precise and relevant to the question. 
"""

    # Create the prompt with system message, context, and conversation history
    messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
    messages.append(AIMessage(content=WELCOME_TEXT))
    messages.extend(history_langchain_format)
    combined_message = f"Context: {context}\n\nQuestion: {message}"
    messages.append(HumanMessage(content=combined_message))

    # Get response with tracking metadata
    print("GPT about to answer")
    gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", "RAG-Bot", "V1", "Host_on_HF"],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "num_retrieved_docs": len(relevant_docs),
            },
        },
    )
    messages.append(AIMessage(content=gpt_response.content))

    try:
        # BUG FIX: removed stray unbalanced ")" after the formatted source.
        raw_source_lines = [
            f"{i+1} : {format_source(doc)}\n---" for i, doc in enumerate(relevant_docs)
        ]
        # Deduplicate while preserving first-seen order.
        seen = set()
        unique_source_lines = []
        for line in raw_source_lines:
            if line not in seen:
                seen.add(line)
                unique_source_lines.append(line)
        source_context = "\nSources:" + "\n".join(unique_source_lines)
    except Exception:
        # Narrowed from a bare except; sources are best-effort decoration.
        source_context = "Issue extracting source"
    messages.append(AIMessage(content=source_context))

    print(gpt_response.content)
    print(source_context)
    return f"{gpt_response.content} {source_context}"


# setup tracking
os.environ["LANGSMITH_PROJECT"] = "Testing_POC"
os.environ["LANGSMITH_TRACING"] = "true"
# Self-assignment kept: reading the key fails fast if it is missing from the env.
os.environ["LANGSMITH_API_KEY"] = os.environ["LANGSMITH_API_KEY"]

# launch gradio app
iface = gr.ChatInterface(
    predict,
    type="messages",  # predict() reads history as role/content dicts
    api_name="chat",
)
print("Launch ...")
iface.launch(share=True)