The AI should start with a message providing context and a guardrail; also, the formatting of the reference output is ugly.

#8
by RCaz - opened
Files changed (2) hide show
  1. agent/test.ipynb +58 -16
  2. app.py +7 -5
agent/test.ipynb CHANGED
@@ -57,7 +57,7 @@
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": 36,
61
  "id": "fbbf5838",
62
  "metadata": {},
63
  "outputs": [],
@@ -135,7 +135,7 @@
135
  },
136
  {
137
  "cell_type": "code",
138
- "execution_count": 39,
139
  "id": "aba59d80",
140
  "metadata": {},
141
  "outputs": [
@@ -143,7 +143,7 @@
143
  "name": "stderr",
144
  "output_type": "stream",
145
  "text": [
146
- "/var/folders/dv/gzhyqctn53s9bh23g7tbvl940000gn/T/ipykernel_29697/3187483442.py:4: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
147
  " embedding_model = HuggingFaceEmbeddings(\n"
148
  ]
149
  }
@@ -165,19 +165,7 @@
165
  "execution_count": null,
166
  "id": "da07d2c2",
167
  "metadata": {},
168
- "outputs": [
169
- {
170
- "ename": "AttributeError",
171
- "evalue": "'builtin_function_or_method' object has no attribute 'date'",
172
- "output_type": "error",
173
- "traceback": [
174
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
175
- "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
176
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[44]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mdatetime\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnow\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdate\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_community\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mvectorstores\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FAISS\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_community\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mvectorstores\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DistanceStrategy\n",
177
- "\u001b[31mAttributeError\u001b[39m: 'builtin_function_or_method' object has no attribute 'date'"
178
- ]
179
- }
180
- ],
181
  "source": [
182
  "from langchain_community.vectorstores import FAISS\n",
183
  "from langchain_community.vectorstores.utils import DistanceStrategy\n",
@@ -221,6 +209,60 @@
221
  "# Download files from Azure\n",
222
  "load_from_azure(\"blobcontaineravatarbot\")"
223
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  }
225
  ],
226
  "metadata": {
 
57
  },
58
  {
59
  "cell_type": "code",
60
+ "execution_count": 2,
61
  "id": "fbbf5838",
62
  "metadata": {},
63
  "outputs": [],
 
135
  },
136
  {
137
  "cell_type": "code",
138
+ "execution_count": 3,
139
  "id": "aba59d80",
140
  "metadata": {},
141
  "outputs": [
 
143
  "name": "stderr",
144
  "output_type": "stream",
145
  "text": [
146
+ "/var/folders/dv/gzhyqctn53s9bh23g7tbvl940000gn/T/ipykernel_5272/3187483442.py:4: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
147
  " embedding_model = HuggingFaceEmbeddings(\n"
148
  ]
149
  }
 
165
  "execution_count": null,
166
  "id": "da07d2c2",
167
  "metadata": {},
168
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
169
  "source": [
170
  "from langchain_community.vectorstores import FAISS\n",
171
  "from langchain_community.vectorstores.utils import DistanceStrategy\n",
 
209
  "# Download files from Azure\n",
210
  "load_from_azure(\"blobcontaineravatarbot\")"
211
  ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 11,
216
+ "id": "32d45df8",
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "from langchain_community.vectorstores import FAISS\n",
221
+ "from langchain_community.embeddings import HuggingFaceEmbeddings # deprecated\n",
222
+ "# from langchain_huggingface import HuggingFaceEmbeddings\n",
223
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
224
+ "from transformers import AutoTokenizer\n",
225
+ "\n",
226
+ "\n",
227
+ "\n",
228
+ "tokenizer_name = \"intfloat/e5-base-v2\"\n",
229
+ "embedding_model = HuggingFaceEmbeddings(\n",
230
+ " model_name=tokenizer_name,\n",
231
+ " # multi_process=True,\n",
232
+ " model_kwargs={\"device\": \"mps\"}, # use cuda for faster embeddings on nvidia GPUs\n",
233
+ " encode_kwargs={\"normalize_embeddings\": True}, # Set `True` for cosine similarity\n",
234
+ ")\n",
235
+ "\n",
236
+ "vs = FAISS.load_local(\"../data/FAISS/512-intfloat-e5-base-v2-2026-01-16\",embedding_model,allow_dangerous_deserialization=True)"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "id": "4d166ca0",
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "[(Document(id='5269b130-6f0c-4887-8214-25493e7345f1', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 4, 'page_label': '5', 'start_index': 1723}, page_content='. \\nJe remercie sincèrement toutes les personnes qui m’ont permis de mener à bien ce travail : \\nThomas Cacciaguerra, Jullien Drone, Annie Finiels, Philippe Gonzalez , Mourad Guermache , \\nGéraldine Layrac et Peralta Pradial de l’Institut Charles Gehrardt de Montpellier, Ovidiu Ersen et \\nSimona Moldovan de l’Institut de Physique et Chimie des Matériaux de Strasbourg, Jian Liu et \\nMarkus Antonietti de l’Institut Max Planck des Colloides et Interfaces de Potsdam. \\nJe tiens à remer cier chaleureusement Pierre Agulhon, Charlie Basset, Siham Behar, Mélanie \\nBordeaux, Arnaud Chaix, Eddy Dib, Isabelle Girard, Marie-Noëlle Labour, Antoine Lacarrière, \\nAlexander Sachse, Bilel Said, Thibault Terencio, Christophe Trouillefou, Rémi Veneziano, Julian'),\n",
249
+ " 0.7311910248264899),\n",
250
+ " (Document(id='87524f98-b762-4e56-a2fb-3c1319cc983c', metadata={'producer': 'Aspose.Pdf for .NET 8.8.0', 'creator': 'Aspose Ltd.', 'creationdate': '2014-04-28T09:28:27-04:00', 'moddate': '2014-04-28T09:28:27-04:00', 'spdf': '1112', 'source': '../data/research_paper/liu2014.pdf', 'total_pages': 16, 'page': 15, 'page_label': '16', 'start_index': -1}, page_content='.; Blank, D. H.; ten Elshof, J. E., Small 2011 , 7, 2709-2713. \\n(42) Liu, Y.; Wang, H.; Wang, Y .; Xu, H.; Li, M.; Shen, H., Chemical Communications 2011 , 47 , 3790-3792. \\n(43) Cazelles, R.; Drone, J.; Fajula, F.; Ersen, O .; Moldovan, S.; Galarneau, A., New J. Chem. 2013 , 37 , 3721-3730. \\n(44) Duan, Z.; Sun, R., Chem. Geol. 2003 , 193 , 257-271. \\n(45) Yadav, R. K.; Baeg, J.-O.; Oh, G. H.; Park, N .-J.; Kong, K.-J.; Kim, J.; Hwang, D. W.; Biswas, S. K., J. Am. Chem. Soc. \\n2012 , 134 , 11455–11461. \\n(46) Zhou, Z.; Hartmann, M., Chem. Soc. Rev. 2013 , 42 , 3894-3912. \\n \\n \\n \\n \\n \\nPage 15 of 15 Physical Chemistry Chemical Physics\\nPhysical Chemistry Chemical Physics Accepted Manuscript\\nPublished on 28 April 2014. Downloaded by University of Waterloo on 10/06/2014 15:40:59. \\nView Article Online\\nDOI: 10.1039/C4CP01348D'),\n",
251
+ " 0.707401510139031),\n",
252
+ " (Document(id='d4b16de4-ae6b-43b8-ae1e-c8366d418f95', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content=\"Délivré par L’ÉCOLE NATIONALE SUPÉRIEURE DE CHIMIE DE \\nMONTPELLIER \\n \\n \\nPréparée au sein de l’école doctorale Sciences Chimiques \\nEt de l’unité de recherche UMR 5253 \\n \\nSpécialité : Biochimie et Chimie des matériaux \\n \\n \\n \\n \\n \\nPrésentée par Rémi CAZELLES \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nSoutenue le 13 décembre 2013 devant le jury composé de \\n \\n \\n-Mr Eric MARCEAU, Maître de conférences, UMR 7197 \\nUniversité Pierre et Marie Curie, Paris VI \\nRapporteur \\n-Mme Isabelle CHEVALOT, Professeur, UMR 7274 \\nInstitut National Polytechnique de Loraine, Nancy \\nRapporteur \\n-Mr Joël CHOPINEAU, Professeur, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nExaminateur \\n-Mr Alain WALCARIUS, DR1, UMR 7564 \\nLaboratoire de Chimie Physique et Microbiologie \\npour l’Environnement, Nancy \\nExaminateur \\n-Mr Benjamin ERABLE, CR2, UMR 5503 \\nLaboratoire de Génie Chimique de Toulouse \\nExaminateur \\n-Mme Anne GALARNEAU, DR2, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nDirecteur de thèse \\n \\n \\n[Tapez une citation prise dans le document \\nou la synthèse d'un passage intéressant. Vous \\npouvez placer la zone de texte n'importe où \\ndans le document. Utilisez l'onglet Outils de \\nzone de texte pour modifier la mise en forme \\nde la zone de texte de la citation.] \\nBioconversion du CO2 en méthanol par \\nun système polyenzymatique encapsulé \\ndans des nanocapsules poreuses de silice\"),\n",
253
+ " 0.7028941154250334),\n",
254
+ " (Document(id='6a57d507-fa1d-45da-8140-a0c24a95035f', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 2, 'page_label': '3', 'start_index': 0}, page_content=\"Délivré par L’ÉCOLE NATIONALE SUPÉRIEURE DE CHIMIE DE \\nMONTPELLIER \\n \\n \\nPréparée au sein de l’école doctorale Sciences Chimiques \\nEt de l’unité de recherche UMR 5253 \\n \\nSpécialité : Biochimie et Chimie des matériaux \\n \\n \\n \\n \\n \\nPrésentée par Rémi CAZELLES \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nSoutenue le 13 décembre 2013 devant le jury composé de \\n \\n \\n-Mr Eric MARCEAU, Maître de conférences, UMR 7197 \\nUniversité Pierre et Marie Curie, Paris VI \\nRapporteur \\n-Mme Isabelle CHEVALOT, Professeur, UMR 7274 \\nInstitut National Polytechnique de Loraine, Nancy \\nRapporteur \\n-Mr Joël CHOPINEAU, Professeur, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nExaminateur \\n-Mr Alain WALCARIUS, DR1, UMR 7564 \\nLaboratoire de Chimie Physique et Microbiologie \\npour l’Environnement, Nancy \\nExaminateur \\n-Mr Benjamin ERABLE, CR2, UMR 5503 \\nLaboratoire de Génie Chimique de Toulouse \\nExaminateur \\n-Mme Anne GALARNEAU, DR2, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nDirecteur de thèse \\n \\n \\n \\nBioconversion du CO2 en méthanol par \\nun système polyenzymatique encapsulé \\ndans des nanocapsules poreuses de silice \\n[Tapez une citation prise dans le document \\nou la synthèse d'un passage intéressant. Vous \\npouvez placer la zone de texte n'importe où \\ndans le document. Utilisez l'onglet Outils de \\nzone de texte pour modifier la mise en forme \\nde la zone de texte de la citation.]\"),\n",
255
+ " 0.6995823846564415)]"
256
+ ]
257
+ },
258
+ "execution_count": 12,
259
+ "metadata": {},
260
+ "output_type": "execute_result"
261
+ }
262
+ ],
263
+ "source": [
264
+ "docs = vs._similarity_search_with_relevance_scores(\"remi cazelles research work\")"
265
+ ]
266
  }
267
  ],
268
  "metadata": {
app.py CHANGED
@@ -138,16 +138,17 @@ def predict(message, history, request: gr.Request):
138
 
139
 
140
  # RAG tool
141
- RAG_PROMPT_TEMPLATE="""Using the information contained in the context,
 
142
  give a comprehensive answer to the question.
143
- Respond only to the question asked, response should be concise and relevant to the question.
144
- Provide the context source url and context date of the source document when relevant.
145
- If the answer cannot be deduced from the context, do not give an answer.
146
  """
147
 
148
 
149
  # Create the prompt with system message, context, and conversation history
150
  messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
 
151
  messages.extend(history_langchain_format)
152
  combined_message = f"Context: {context}\n\nQuestion: {message}"
153
  messages.append(HumanMessage(content=combined_message))
@@ -165,8 +166,9 @@ def predict(message, history, request: gr.Request):
165
  }
166
  )
167
 
 
168
  source_context = "\nSources:\n" + "\n".join([
169
- f"{doc.metadata["source"].split("/")[-1]} ({doc.metadata.get('date')})\n---"
170
  for i, doc in enumerate(relevant_docs)])
171
 
172
  print(gpt_response.content )
 
138
 
139
 
140
  # RAG tool
141
+ RAG_PROMPT_TEMPLATE="""You will be asked information about Rémi Cazelles's projects, work and education.
142
+ Using the information contained in the context,
143
  give a comprehensive answer to the question.
144
+ Respond to the question asked with enough detail; the response should be precise and relevant to the question.
145
+ If the answer cannot be deduced from the context, simply say you can't find the information.
 
146
  """
147
 
148
 
149
  # Create the prompt with system message, context, and conversation history
150
  messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
151
+ messages.append(AIMessage(content="This bot allows you to find information related to Rémi Cazelles's projects, work and education"))
152
  messages.extend(history_langchain_format)
153
  combined_message = f"Context: {context}\n\nQuestion: {message}"
154
  messages.append(HumanMessage(content=combined_message))
 
166
  }
167
  )
168
 
169
+
170
  source_context = "\nSources:\n" + "\n".join([
171
+ f"{i+1} : {doc.metadata["source"].split("/")[-1]} (page {doc.metadata['page_label']}/{doc.metadata['total_pages']})\n---"
172
  for i, doc in enumerate(relevant_docs)])
173
 
174
  print(gpt_response.content )