The AI should start with a message providing context and a guardrail; also, the formatting of the reference output is ugly.

#8
by RCaz - opened
Files changed (2) hide show
  1. agent/test.ipynb +58 -16
  2. app.py +7 -5
agent/test.ipynb CHANGED
@@ -57,7 +57,7 @@
57
  },
58
  {
59
  "cell_type": "code",
60
- "execution_count": 36,
61
  "id": "fbbf5838",
62
  "metadata": {},
63
  "outputs": [],
@@ -135,7 +135,7 @@
135
  },
136
  {
137
  "cell_type": "code",
138
- "execution_count": 39,
139
  "id": "aba59d80",
140
  "metadata": {},
141
  "outputs": [
@@ -143,7 +143,7 @@
143
  "name": "stderr",
144
  "output_type": "stream",
145
  "text": [
146
- "/var/folders/dv/gzhyqctn53s9bh23g7tbvl940000gn/T/ipykernel_29697/3187483442.py:4: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
147
  " embedding_model = HuggingFaceEmbeddings(\n"
148
  ]
149
  }
@@ -165,19 +165,7 @@
165
  "execution_count": null,
166
  "id": "da07d2c2",
167
  "metadata": {},
168
- "outputs": [
169
- {
170
- "ename": "AttributeError",
171
- "evalue": "'builtin_function_or_method' object has no attribute 'date'",
172
- "output_type": "error",
173
- "traceback": [
174
- "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
175
- "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
176
- "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[44]\u001b[39m\u001b[32m, line 1\u001b[39m\n\u001b[32m----> \u001b[39m\u001b[32m1\u001b[39m \u001b[43mdatetime\u001b[49m\u001b[43m.\u001b[49m\u001b[43mnow\u001b[49m\u001b[43m.\u001b[49m\u001b[43mdate\u001b[49m\n\u001b[32m 2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_community\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mvectorstores\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m FAISS\n\u001b[32m 3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mlangchain_community\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mvectorstores\u001b[39;00m\u001b[34;01m.\u001b[39;00m\u001b[34;01mutils\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m DistanceStrategy\n",
177
- "\u001b[31mAttributeError\u001b[39m: 'builtin_function_or_method' object has no attribute 'date'"
178
- ]
179
- }
180
- ],
181
  "source": [
182
  "from langchain_community.vectorstores import FAISS\n",
183
  "from langchain_community.vectorstores.utils import DistanceStrategy\n",
@@ -221,6 +209,60 @@
221
  "# Download files from Azure\n",
222
  "load_from_azure(\"blobcontaineravatarbot\")"
223
  ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  }
225
  ],
226
  "metadata": {
 
57
  },
58
  {
59
  "cell_type": "code",
60
+ "execution_count": 2,
61
  "id": "fbbf5838",
62
  "metadata": {},
63
  "outputs": [],
 
135
  },
136
  {
137
  "cell_type": "code",
138
+ "execution_count": 3,
139
  "id": "aba59d80",
140
  "metadata": {},
141
  "outputs": [
 
143
  "name": "stderr",
144
  "output_type": "stream",
145
  "text": [
146
+ "/var/folders/dv/gzhyqctn53s9bh23g7tbvl940000gn/T/ipykernel_5272/3187483442.py:4: LangChainDeprecationWarning: The class `HuggingFaceEmbeddings` was deprecated in LangChain 0.2.2 and will be removed in 1.0. An updated version of the class exists in the `langchain-huggingface package and should be used instead. To use it run `pip install -U `langchain-huggingface` and import as `from `langchain_huggingface import HuggingFaceEmbeddings``.\n",
147
  " embedding_model = HuggingFaceEmbeddings(\n"
148
  ]
149
  }
 
165
  "execution_count": null,
166
  "id": "da07d2c2",
167
  "metadata": {},
168
+ "outputs": [],
 
 
 
 
 
 
 
 
 
 
 
 
169
  "source": [
170
  "from langchain_community.vectorstores import FAISS\n",
171
  "from langchain_community.vectorstores.utils import DistanceStrategy\n",
 
209
  "# Download files from Azure\n",
210
  "load_from_azure(\"blobcontaineravatarbot\")"
211
  ]
212
+ },
213
+ {
214
+ "cell_type": "code",
215
+ "execution_count": 11,
216
+ "id": "32d45df8",
217
+ "metadata": {},
218
+ "outputs": [],
219
+ "source": [
220
+ "from langchain_community.vectorstores import FAISS\n",
221
+ "from langchain_community.embeddings import HuggingFaceEmbeddings # deprecated\n",
222
+ "# from langchain_huggingface import HuggingFaceEmbeddings\n",
223
+ "from langchain_text_splitters import RecursiveCharacterTextSplitter\n",
224
+ "from transformers import AutoTokenizer\n",
225
+ "\n",
226
+ "\n",
227
+ "\n",
228
+ "tokenizer_name = \"intfloat/e5-base-v2\"\n",
229
+ "embedding_model = HuggingFaceEmbeddings(\n",
230
+ " model_name=tokenizer_name,\n",
231
+ " # multi_process=True,\n",
232
+ " model_kwargs={\"device\": \"mps\"}, # use cuda for faster embeddings on nvidia GPUs\n",
233
+ " encode_kwargs={\"normalize_embeddings\": True}, # Set `True` for cosine similarity\n",
234
+ ")\n",
235
+ "\n",
236
+ "vs = FAISS.load_local(\"../data/FAISS/512-intfloat-e5-base-v2-2026-01-16\",embedding_model,allow_dangerous_deserialization=True)"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": null,
242
+ "id": "4d166ca0",
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "[(Document(id='5269b130-6f0c-4887-8214-25493e7345f1', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 4, 'page_label': '5', 'start_index': 1723}, page_content='. \\nJe remercie sincèrement toutes les personnes qui m’ont permis de mener à bien ce travail : \\nThomas Cacciaguerra, Jullien Drone, Annie Finiels, Philippe Gonzalez , Mourad Guermache , \\nGéraldine Layrac et Peralta Pradial de l’Institut Charles Gehrardt de Montpellier, Ovidiu Ersen et \\nSimona Moldovan de l’Institut de Physique et Chimie des Matériaux de Strasbourg, Jian Liu et \\nMarkus Antonietti de l’Institut Max Planck des Colloides et Interfaces de Potsdam. \\nJe tiens à remer cier chaleureusement Pierre Agulhon, Charlie Basset, Siham Behar, Mélanie \\nBordeaux, Arnaud Chaix, Eddy Dib, Isabelle Girard, Marie-Noëlle Labour, Antoine Lacarrière, \\nAlexander Sachse, Bilel Said, Thibault Terencio, Christophe Trouillefou, Rémi Veneziano, Julian'),\n",
249
+ " 0.7311910248264899),\n",
250
+ " (Document(id='87524f98-b762-4e56-a2fb-3c1319cc983c', metadata={'producer': 'Aspose.Pdf for .NET 8.8.0', 'creator': 'Aspose Ltd.', 'creationdate': '2014-04-28T09:28:27-04:00', 'moddate': '2014-04-28T09:28:27-04:00', 'spdf': '1112', 'source': '../data/research_paper/liu2014.pdf', 'total_pages': 16, 'page': 15, 'page_label': '16', 'start_index': -1}, page_content='.; Blank, D. H.; ten Elshof, J. E., Small 2011 , 7, 2709-2713. \\n(42) Liu, Y.; Wang, H.; Wang, Y .; Xu, H.; Li, M.; Shen, H., Chemical Communications 2011 , 47 , 3790-3792. \\n(43) Cazelles, R.; Drone, J.; Fajula, F.; Ersen, O .; Moldovan, S.; Galarneau, A., New J. Chem. 2013 , 37 , 3721-3730. \\n(44) Duan, Z.; Sun, R., Chem. Geol. 2003 , 193 , 257-271. \\n(45) Yadav, R. K.; Baeg, J.-O.; Oh, G. H.; Park, N .-J.; Kong, K.-J.; Kim, J.; Hwang, D. W.; Biswas, S. K., J. Am. Chem. Soc. \\n2012 , 134 , 11455–11461. \\n(46) Zhou, Z.; Hartmann, M., Chem. Soc. Rev. 2013 , 42 , 3894-3912. \\n \\n \\n \\n \\n \\nPage 15 of 15 Physical Chemistry Chemical Physics\\nPhysical Chemistry Chemical Physics Accepted Manuscript\\nPublished on 28 April 2014. Downloaded by University of Waterloo on 10/06/2014 15:40:59. \\nView Article Online\\nDOI: 10.1039/C4CP01348D'),\n",
251
+ " 0.707401510139031),\n",
252
+ " (Document(id='d4b16de4-ae6b-43b8-ae1e-c8366d418f95', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 0, 'page_label': '1', 'start_index': 0}, page_content=\"Délivré par L’ÉCOLE NATIONALE SUPÉRIEURE DE CHIMIE DE \\nMONTPELLIER \\n \\n \\nPréparée au sein de l’école doctorale Sciences Chimiques \\nEt de l’unité de recherche UMR 5253 \\n \\nSpécialité : Biochimie et Chimie des matériaux \\n \\n \\n \\n \\n \\nPrésentée par Rémi CAZELLES \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nSoutenue le 13 décembre 2013 devant le jury composé de \\n \\n \\n-Mr Eric MARCEAU, Maître de conférences, UMR 7197 \\nUniversité Pierre et Marie Curie, Paris VI \\nRapporteur \\n-Mme Isabelle CHEVALOT, Professeur, UMR 7274 \\nInstitut National Polytechnique de Loraine, Nancy \\nRapporteur \\n-Mr Joël CHOPINEAU, Professeur, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nExaminateur \\n-Mr Alain WALCARIUS, DR1, UMR 7564 \\nLaboratoire de Chimie Physique et Microbiologie \\npour l’Environnement, Nancy \\nExaminateur \\n-Mr Benjamin ERABLE, CR2, UMR 5503 \\nLaboratoire de Génie Chimique de Toulouse \\nExaminateur \\n-Mme Anne GALARNEAU, DR2, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nDirecteur de thèse \\n \\n \\n[Tapez une citation prise dans le document \\nou la synthèse d'un passage intéressant. Vous \\npouvez placer la zone de texte n'importe où \\ndans le document. Utilisez l'onglet Outils de \\nzone de texte pour modifier la mise en forme \\nde la zone de texte de la citation.] \\nBioconversion du CO2 en méthanol par \\nun système polyenzymatique encapsulé \\ndans des nanocapsules poreuses de silice\"),\n",
253
+ " 0.7028941154250334),\n",
254
+ " (Document(id='6a57d507-fa1d-45da-8140-a0c24a95035f', metadata={'producer': 'Microsoft® Office Word 2007', 'creator': 'Microsoft® Office Word 2007', 'creationdate': '2014-01-14T13:26:10+01:00', 'author': 'rcazelles', 'moddate': '2014-01-14T13:26:10+01:00', 'source': '../data/research_paper/ENSCM_2013_CAZELLES.pdf', 'total_pages': 196, 'page': 2, 'page_label': '3', 'start_index': 0}, page_content=\"Délivré par L’ÉCOLE NATIONALE SUPÉRIEURE DE CHIMIE DE \\nMONTPELLIER \\n \\n \\nPréparée au sein de l’école doctorale Sciences Chimiques \\nEt de l’unité de recherche UMR 5253 \\n \\nSpécialité : Biochimie et Chimie des matériaux \\n \\n \\n \\n \\n \\nPrésentée par Rémi CAZELLES \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\n \\nSoutenue le 13 décembre 2013 devant le jury composé de \\n \\n \\n-Mr Eric MARCEAU, Maître de conférences, UMR 7197 \\nUniversité Pierre et Marie Curie, Paris VI \\nRapporteur \\n-Mme Isabelle CHEVALOT, Professeur, UMR 7274 \\nInstitut National Polytechnique de Loraine, Nancy \\nRapporteur \\n-Mr Joël CHOPINEAU, Professeur, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nExaminateur \\n-Mr Alain WALCARIUS, DR1, UMR 7564 \\nLaboratoire de Chimie Physique et Microbiologie \\npour l’Environnement, Nancy \\nExaminateur \\n-Mr Benjamin ERABLE, CR2, UMR 5503 \\nLaboratoire de Génie Chimique de Toulouse \\nExaminateur \\n-Mme Anne GALARNEAU, DR2, UMR 5253 \\nInstitut Charles Gerhardt de Montpellier \\nDirecteur de thèse \\n \\n \\n \\nBioconversion du CO2 en méthanol par \\nun système polyenzymatique encapsulé \\ndans des nanocapsules poreuses de silice \\n[Tapez une citation prise dans le document \\nou la synthèse d'un passage intéressant. Vous \\npouvez placer la zone de texte n'importe où \\ndans le document. Utilisez l'onglet Outils de \\nzone de texte pour modifier la mise en forme \\nde la zone de texte de la citation.]\"),\n",
255
+ " 0.6995823846564415)]"
256
+ ]
257
+ },
258
+ "execution_count": 12,
259
+ "metadata": {},
260
+ "output_type": "execute_result"
261
+ }
262
+ ],
263
+ "source": [
264
+ "docs = vs._similarity_search_with_relevance_scores(\"remi cazelles research work\")"
265
+ ]
266
  }
267
  ],
268
  "metadata": {
app.py CHANGED
@@ -138,16 +138,17 @@ def predict(message, history, request: gr.Request):
138
 
139
 
140
  # RAG tool
141
- RAG_PROMPT_TEMPLATE="""Using the information contained in the context,
 
142
  give a comprehensive answer to the question.
143
- Respond only to the question asked, response should be concise and relevant to the question.
144
- Provide the context source url and context date of the source document when relevant.
145
- If the answer cannot be deduced from the context, do not give an answer.
146
  """
147
 
148
 
149
  # Create the prompt with system message, context, and conversation history
150
  messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
 
151
  messages.extend(history_langchain_format)
152
  combined_message = f"Context: {context}\n\nQuestion: {message}"
153
  messages.append(HumanMessage(content=combined_message))
@@ -165,8 +166,9 @@ def predict(message, history, request: gr.Request):
165
  }
166
  )
167
 
 
168
  source_context = "\nSources:\n" + "\n".join([
169
- f"{doc.metadata["source"].split("/")[-1]} ({doc.metadata.get('date')})\n---"
170
  for i, doc in enumerate(relevant_docs)])
171
 
172
  print(gpt_response.content )
 
138
 
139
 
140
  # RAG tool
141
+ RAG_PROMPT_TEMPLATE="""You will be asked information about Rémi Cazelles's projects, work and education.
142
+ Using the information contained in the context,
143
  give a comprehensive answer to the question.
144
+ Respond to the question asked with enough detail; the response should be precise and relevant to the question.
145
+ If the answer cannot be deduced from the context, simply say you can't find the information.
 
146
  """
147
 
148
 
149
  # Create the prompt with system message, context, and conversation history
150
  messages = [SystemMessage(content=RAG_PROMPT_TEMPLATE)]
151
+ messages.append(AIMessage(content="This bot allows you to find information related to Rémi Cazelles's projects, work and education"))
152
  messages.extend(history_langchain_format)
153
  combined_message = f"Context: {context}\n\nQuestion: {message}"
154
  messages.append(HumanMessage(content=combined_message))
 
166
  }
167
  )
168
 
169
+
170
  source_context = "\nSources:\n" + "\n".join([
171
+ f"{i+1} : {doc.metadata["source"].split("/")[-1]} (page {doc.metadata['page_label']}/{doc.metadata['total_pages']})\n---"
172
  for i, doc in enumerate(relevant_docs)])
173
 
174
  print(gpt_response.content )