File size: 8,246 Bytes
66ec1a1
 
390d59b
92766e6
 
 
66ec1a1
 
92766e6
66ec1a1
92766e6
 
7db25b1
92766e6
cacbee6
66ec1a1
390d59b
729ce6a
 
 
66ec1a1
729ce6a
 
 
 
 
 
 
 
 
 
 
 
 
 
771f5a7
cacbee6
a8cfaee
cacbee6
771f5a7
619c95d
 
cacbee6
ccbc197
 
 
835cd89
ccbc197
 
cacbee6
ccbc197
66ec1a1
390d59b
f655d4a
 
3bcd138
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
cacbee6
6d64ea5
66ec1a1
08b224c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
aee8e6f
08b224c
390d59b
92766e6
 
39a330c
66ec1a1
 
771f5a7
66ec1a1
6d64ea5
 
 
 
 
 
 
92766e6
 
fe48b8f
ba23ddf
92766e6
 
 
 
 
 
 
 
 
 
 
 
66ec1a1
637030a
ba23ddf
92766e6
 
66ec1a1
fe48b8f
92766e6
66ec1a1
 
 
 
 
 
92766e6
 
 
581b36a
92766e6
 
 
581b36a
92766e6
 
59b71ca
 
 
92766e6
fe48b8f
581b36a
2fc1f43
92766e6
 
 
 
 
 
 
 
 
 
 
 
 
 
581b36a
92766e6
 
 
 
 
 
6272ec0
b8c450d
 
24a01d1
581b36a
 
 
24a01d1
 
08b224c
fe48b8f
b8c450d
92766e6
 
 
92ce23f
92766e6
 
390d59b
92766e6
 
 
 
390d59b
92766e6
aebf207
59b71ca
66ec1a1
 
aebf207
 
 
 
e30f62f
 
66ec1a1
aebf207
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226


# load llm
import os

from dotenv import load_dotenv

# Pull OPENAI_API_KEY (and any other secrets) from a local .env file.
load_dotenv()

from langchain.chat_models import init_chat_model

# Module-level chat model reused by every request handler below.
llm = init_chat_model(
    "gpt-5-nano",
    model_provider="openai",
    temperature=1,
    api_key=os.environ["OPENAI_API_KEY"],
)
print("LLM Init.")

# load retreiver
import os
from azure.storage.blob import BlobServiceClient
from langchain_community.vectorstores import FAISS

def load_from_azure(container_name, local_dir="./index"):
    """Download every blob of an Azure Blob Storage container into a local dir.

    Used to fetch the FAISS index files (index.faiss and index.pkl) before the
    vector store is loaded.

    Args:
        container_name: Name of the Azure container to download from.
        local_dir: Local target directory, created if missing.
    Raises:
        KeyError: if the AZURE_CONN_STR environment variable is not set.
    """
    connection_string = os.environ["AZURE_CONN_STR"]
    blob_service_client = BlobServiceClient.from_connection_string(connection_string)
    container_client = blob_service_client.get_container_client(container_name)

    os.makedirs(local_dir, exist_ok=True)

    # Download all files in the container (index.faiss and index.pkl)
    for blob in container_client.list_blobs():
        download_file_path = os.path.join(local_dir, blob.name)
        # Bug fix: blob names may contain "/" (virtual folders); create the
        # parent directory first, otherwise open(..., "wb") raises
        # FileNotFoundError for nested blobs.
        os.makedirs(os.path.dirname(download_file_path) or ".", exist_ok=True)
        with open(download_file_path, "wb") as file:
            file.write(container_client.download_blob(blob).readall())

# Download files from Azure
print("start download faiss")
load_from_azure("avatarvectordb-container")
print("ok.")

# Load into FAISS
# from langchain_community.embeddings import HuggingFaceEmbeddings # deprecated
from langchain_huggingface import HuggingFaceEmbeddings

print("load embeddings")
# e5-base-v2 sentence embeddings on CPU; vectors are normalized so the FAISS
# similarity behaves like cosine similarity.
embedding_model = HuggingFaceEmbeddings(
    model_name="intfloat/e5-base-v2",
    model_kwargs={"device": "cpu"},  # switch to "cuda" for faster embeddings on NVIDIA GPUs
    encode_kwargs={"normalize_embeddings": True},
)

print("load vector store")
# allow_dangerous_deserialization: the pickled index comes from our own
# upload job (trusted source), so unpickling it here is acceptable.
vectorstore = FAISS.load_local("./index", embedding_model, allow_dangerous_deserialization=True)

# Include a rate limiter
from collections import defaultdict
from datetime import datetime, timedelta
class RateLimiter:
    """Sliding-window, per-identifier in-memory rate limiter.

    Keeps the timestamps of accepted requests per identifier (e.g. a client
    IP) and allows at most ``max_requests`` within the trailing
    ``window_minutes`` window. Not thread-safe; adequate for a
    single-process Gradio app.
    """

    def __init__(self, max_requests=10, window_minutes=60):
        self.max_requests = max_requests
        self.window = timedelta(minutes=window_minutes)
        # identifier -> list of datetimes of accepted requests
        self.requests = defaultdict(list)

    def _prune(self, identifier, now):
        """Drop timestamps older than the window; return the surviving list.

        Factored out because is_allowed and get_remaining previously
        duplicated this exact cleanup verbatim.
        """
        kept = [
            req_time for req_time in self.requests[identifier]
            if now - req_time < self.window
        ]
        self.requests[identifier] = kept
        return kept

    def is_allowed(self, identifier):
        """Record a request for identifier if quota remains; return True/False."""
        now = datetime.now()
        recent = self._prune(identifier, now)
        if len(recent) < self.max_requests:
            # `recent` is the same list object stored in self.requests.
            recent.append(now)
            return True
        return False

    def get_remaining(self, identifier):
        """Return how many requests identifier may still make in this window."""
        now = datetime.now()
        return self.max_requests - len(self._prune(identifier, now))


print("Rate Limit init.")
limiter = RateLimiter(max_requests=10, window_minutes=60)

# helper func

def format_source(doc):
    """
    Format a document's source reference for display.

    Handles GitHub API URLs, internet pages and uploaded files (PDFs whose
    source path contains "data").

    Args:
        doc: a langchain Document
    Returns:
        str: formatted source derived from doc.metadata["source"]
    """
    source = doc.metadata["source"]
    if 'api.github' in source:
        # Turn the API blob URL into the repository's regular web URL.
        return source.split("/blob")[0].replace("api.", "")
    elif "https://" in source:
        return source
    elif "data" in source:
        # Bug fix: the metadata key was misspelled ("pagpage_labele") and the
        # f-string divided the two numbers instead of rendering "page(X/Y)".
        page_label = doc.metadata["page_label"]
        total_page = doc.metadata["total_page"]
        return f"{source.split('/')[-1]} page({page_label}/{total_page})"
    # Fallback: return the raw source instead of implicitly returning None,
    # which the caller would render as the literal string "None".
    return source
    
# setup chatbot
from langchain_core.messages import HumanMessage, AIMessage, SystemMessage
from langchain.chat_models import init_chat_model
import gradio as gr


def predict(message, history, request: gr.Request):
    """Chat handler for the Gradio ChatInterface.

    Pipeline: rate-limit by client IP, run a safeguard triage LLM pass, then
    answer with RAG over the FAISS vector store and append source citations.

    Args:
        message: The user's latest message.
        history: Gradio chat history (list of {"role": ..., "content": ...} dicts).
        request: Gradio request object, used to read the client IP.
    Returns:
        str: The assistant's reply (answer plus sources, or a refusal message).
    """
    # Get client IP and check rate limit
    client_ip = request.client.host
    if not limiter.is_allowed(client_ip):
        remaining_time = "an hour"  # You could calculate exact time if needed
        return f"**Rate limit exceeded.** You've used your 10 requests per hour. Please try again in {remaining_time}."

    # Safeguard: a first LLM pass that flags off-topic or harmful requests.
    TRIAGE_PROMPT_TEMPLATE = """You are a Safeguard assistant making sure the user only ask for information related to Rémi Cazelles's projects, work and education.
    Here are general information you can use to answer:
    If the question is not related to this subjects, or if the request is harmfull you should flag the user by answering '*** FLAGGED ***' """
    messages = [SystemMessage(content=TRIAGE_PROMPT_TEMPLATE)]
    messages.append(HumanMessage(content=message))

    safe_gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", 'RAG-Bot', 'safeguard', 'V1'],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "message": message,
            }
        }
    )

    if "*** FLAGGED ***" in safe_gpt_response.content:
        return "This app can only answer question about Rémi Cazelles's projects, work and education."
    print("passed the safeguard")

    # Build conversation history in LangChain message format
    history_langchain_format = []
    for msg in history:
        if msg['role'] == "user":
            history_langchain_format.append(HumanMessage(content=msg['content']))
        elif msg['role'] == "assistant":
            history_langchain_format.append(AIMessage(content=msg['content']))

    # Retrieve relevant documents for the current message
    relevant_docs = vectorstore.similarity_search(message, k=20)  # retriever

    # Build context from retrieved documents
    context = "\nExtracted documents:\n" + "\n".join([
        f"Content document {i+1}: {doc.page_content}\n\n---"
        for i, doc in enumerate(relevant_docs)
    ])

    # RAG tool
    RAG_PROMPT_TEMPLATE = """You will be asked information related to Rémi Cazelles's specific projects, work and education.
                        Using the information contained in the context, provide a structured answer to the question.
                        Respond to the question asked with enought details, response should be precise and relevant to the question.
                        """

    # Create the prompt with system message, context, and conversation history
    messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
    messages.extend(history_langchain_format)
    combined_message = f"Context: {context}\n\nQuestion: {message}"
    messages.append(HumanMessage(content=combined_message))

    # Get response with tracking metadata
    print("GPT about to answer")
    gpt_response = llm.invoke(
        messages,
        config={
            "tags": ["Testing", 'RAG-Bot', 'V2', 'Host_on_HF'],
            "metadata": {
                "rag_llm": "gpt-5-nano",
                "num_retrieved_docs": len(relevant_docs),
            }
        }
    )

    # Bug fix: the bare `except:` below previously also swallowed
    # SystemExit/KeyboardInterrupt; narrowed to `except Exception`. The dead
    # `messages.append(AIMessage(...))` calls were removed — `messages` was
    # never read again after the final llm.invoke.
    try:
        source_context = "\n\nSources:\n" + "\n".join([
            f"{i+1} - {format_source(doc)}"
            for i, doc in enumerate(relevant_docs)])
    except Exception:
        source_context = "Issue extracting source"

    print(gpt_response.content)
    print(source_context)

    return f"{gpt_response.content} {source_context}"


# setup tracking
# LangSmith tracing configuration: project name plus the enable flag.
# Bug fix: the previous line `os.environ["LANGSMITH_API_KEY"] =
# os.environ['LANGSMITH_API_KEY']` was a no-op self-assignment that raised
# KeyError when the variable was unset; LangSmith reads the key from the
# environment directly, so the line was removed.
os.environ["LANGSMITH_PROJECT"] = "Testing_POC"
os.environ["LANGSMITH_TRACING"] = "true"

# launch gradio app
import gradio as gr

# Chat UI wired to predict(); examples are shown to the user, so the typos
# they contained ("reaserch", "DataENgineering", "Microsoft Fabrics",
# "Rémi experience") are fixed here.
iface = gr.ChatInterface(
    predict,
    api_name="chat",
    chatbot=gr.Chatbot(placeholder="Hello! This app can help answering question about Rémi Cazelles's projects, work and education."),
    description="Ask me anything about Rémi’s work, projects, or education. I’ll cite the source documents.",
    examples=["How many years of experience does Rémi have in python, what significant project did he work on?",
              "When did Rémi graduate from his doctorate, what was his research topic about?",
              "I have a project in Data Engineering using Microsoft Fabric for data pipelines, how good is Rémi's experience to join a team ASAP?"],
    cache_examples=False
)

iface.launch()