Commit 346dab1
Parent(s): fd1c27c

Correctness LLM evaluation
Files changed:
- __pycache__/config.cpython-310.pyc +0 -0
- config.py +1 -0
- evaluations/answer_relevance.py +0 -0
- evaluations/correctness.py +60 -0
- evaluations/groundedness.py +0 -0
- evaluations/retreival_relevance +0 -0
- graph_agentC.py +29 -2
- pdf_processing.py +33 -8
- requirements.txt +1 -1

__pycache__/config.cpython-310.pyc
CHANGED
Binary files a/__pycache__/config.cpython-310.pyc and b/__pycache__/config.cpython-310.pyc differ

config.py
CHANGED
@@ -33,6 +33,7 @@ client = Client(
     api_url=langsmith_endpoint,
     api_key=langsmith_api_key,
 )
+dataset_id = os.getenv("DATASET_UUID")
 
 # Initialize Neo4j connection
 neo4j_uri = os.getenv("NEO4J_URI")
evaluations/answer_relevance.py
ADDED
File without changes

evaluations/correctness.py
ADDED
@@ -0,0 +1,60 @@
+from typing_extensions import Annotated, TypedDict
+from langchain.chat_models import ChatOpenAI
+from config import client, llm
+from config import dataset_id
+
+# Define the output schema for grading
+class CorrectnessGrade(TypedDict):
+    explanation: Annotated[str, ..., "Expliquez votre raisonnement pour la note."]
+    score: Annotated[float, ..., "Un score de 0 à 10 basé sur la justesse de la réponse"]
+
+# Grading prompt with expert evaluation criteria
+correctness_instructions = """Vous êtes un expert en intelligence artificielle chargé d'évaluer la pertinence d'une réponse générée par un système RAG.
+
+**Instructions** :
+- Vous recevrez une QUESTION, une RÉPONSE DE RÉFÉRENCE (vérité terrain) et une RÉPONSE GÉNÉRÉE.
+- Évaluez uniquement la précision factuelle de la réponse générée par rapport à la référence.
+- Une réponse peut contenir plus d'informations que la référence, tant qu'elles sont exactes.
+- Si la réponse générée contient des erreurs ou des informations contradictoires, pénalisez-la.
+
+**Notation (score sur 10)** :
+- **0 (Très insuffisant)** : Réponse hors sujet ou contenant des erreurs majeures.
+- **2.5 (Insuffisant)** : Réponse partiellement correcte mais incomplète ou floue.
+- **5 (Correct)** : Réponse pertinente mais manquant de clarté ou de détails.
+- **7.5 (Bon)** : Réponse pertinente, claire et complète avec une légère amélioration possible.
+- **10 (Très bon)** : Réponse exacte, détaillée et bien structurée.
+
+**Expliquez votre notation étape par étape.**"""
+
+def get_reference_answer(question: str) -> str:
+    """Retrieve the reference answer from a LangSmith dataset."""
+
+    # Fetch the examples stored in the dataset
+    dataset = client.list_examples(dataset_id)
+
+    # Look for an exact match on the question
+    for example in dataset:
+        if example.inputs.get("question") == question:
+            return example.outputs.get("answer")  # Return the reference answer
+
+    return None  # No match found
+
+def correctness(inputs: dict, outputs: dict) -> float:
+    """Evaluates the correctness of a RAG-generated answer and returns a score (0-10)."""
+    question = inputs["question"]
+    reference_answer = get_reference_answer(question)
+
+    if not reference_answer:
+        raise ValueError(f"No reference answer found for question: {question}")
+
+    answers = f"""\nQUESTION: {question}
+RÉPONSE DE RÉFÉRENCE: {reference_answer}
+RÉPONSE GÉNÉRÉE: {outputs['answer']}"""
+
+    # Invoke the LLM for grading
+    grade = llm.invoke([
+        {"role": "system", "content": correctness_instructions},
+        {"role": "user", "content": answers}
+    ])
+
+    return grade["score"]
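
Two caveats worth flagging on this evaluator: LangSmith's `Client.list_examples` takes `dataset_id` as a keyword argument (`client.list_examples(dataset_id=dataset_id)`), and a plain chat model's `invoke` returns a message object rather than a dict, so the final `grade["score"]` subscript only works once the model is bound to the `CorrectnessGrade` schema. A minimal sketch of that binding, assuming `llm` is a LangChain chat model with structured-output support:

# Sketch: bind the grading schema so the model returns a CorrectnessGrade-shaped dict.
grader = llm.with_structured_output(CorrectnessGrade)

grade = grader.invoke([
    {"role": "system", "content": correctness_instructions},
    {"role": "user", "content": answers},
])
score = grade["score"]              # float on the 0-10 scale
explanation = grade["explanation"]  # the model's step-by-step reasoning
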
evaluations/groundedness.py
ADDED
File without changes

evaluations/retreival_relevance
ADDED
File without changes

graph_agentC.py
CHANGED
@@ -6,6 +6,7 @@ from neo4j_utils import unified_search
 from typing import TypedDict, Sequence, List, Dict, Optional, Annotated
 from config import llm
 import re
+from evaluations.correctness import correctness
 
 class GraphState(TypedDict):
     messages: Annotated[Sequence[BaseMessage], add_messages]
@@ -13,6 +14,8 @@ class GraphState(TypedDict):
     relevant_docs: List[Dict[str, Optional[Dict[str, float]]]]  # Hybrid search results
     neo4j_results: list  # Neo4j search results
     response: str
+    score: Optional[float]
+    evaluation_explanation: Optional[str]
     k: int
     alpha: float
     similarity_threshold: float
@@ -79,6 +82,22 @@ def generate_response(state: GraphState) -> dict:
 
     return {"response": response_cleaned}
 
+
+def evaluate_response(state: GraphState) -> dict:
+    """Evaluate the generated answer against the ground truth (LangSmith)."""
+
+    inputs = {"question": state["query"]}
+    outputs = {"answer": state["response"]}
+
+    try:
+        score = correctness(inputs, outputs)
+        explanation = f"La réponse a obtenu un score de {score}/10. Voici l'explication de l'évaluation..."
+    except Exception as e:
+        score = 0
+        explanation = f"Erreur lors de l'évaluation : {str(e)}"
+
+    return {"score": score, "evaluation_explanation": explanation}
+
 
 
 def post_process_response(state: GraphState) -> dict:
@@ -88,13 +107,20 @@ def post_process_response(state: GraphState) -> dict:
     # Check whether the response is relevant
     if not response or response.lower() in ["je ne sais pas", "i don't know"]:
         response = "Désolé, je n'ai pas trouvé d'informations pertinentes pour votre question."
-
+
+    evaluation = evaluate_response(state)
+    return {
+        "response": response,
+        "score": evaluation["score"],  # Add the score
+        "evaluation_explanation": evaluation["evaluation_explanation"]  # Explanation
+    }
 
 
 # Build the graph
 graph_builder = StateGraph(GraphState)
 
 # Add the nodes
+graph_builder.add_node("evaluate", evaluate_response)
 graph_builder.add_node("retrieve", retrieve_unified)
 graph_builder.add_node("generate", generate_response)
 graph_builder.add_node("post_process", post_process_response)
@@ -102,7 +128,8 @@ graph_builder.add_node("post_process", post_process_response)
 # Define the transitions
 graph_builder.set_entry_point("retrieve")
 graph_builder.add_edge("retrieve", "generate")
-graph_builder.add_edge("generate", "post_process")
+graph_builder.add_edge("generate", "evaluate")
+graph_builder.add_edge("evaluate", "post_process")
 graph_builder.add_edge("post_process", END)
 
 # Compile the graph
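
One wiring note: `evaluate_response` now runs as its own node on the `generate` → `evaluate` → `post_process` path, yet `post_process_response` also calls it directly, so each question is graded twice (two extra LLM round-trips per query). A minimal sketch of `post_process_response` reusing the grade the `evaluate` node already wrote into the state, offered as an alternative rather than the committed behavior:

def post_process_response(state: GraphState) -> dict:
    """Clean the final answer and pass through the grade stored by the evaluate node."""
    response = state["response"]
    if not response or response.lower() in ["je ne sais pas", "i don't know"]:
        response = "Désolé, je n'ai pas trouvé d'informations pertinentes pour votre question."

    return {
        "response": response,
        "score": state.get("score"),  # written by the evaluate node
        "evaluation_explanation": state.get("evaluation_explanation"),
    }
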
pdf_processing.py
CHANGED
@@ -1,8 +1,9 @@
-import
-import re
-import os
+from transformers import AutoTokenizer
 from langchain.text_splitter import CharacterTextSplitter
 import pdfplumber
+from config import *
+import re
+import os
 
 def get_existing_pdf(filename="La Confession muette.pdf"):
     """Retrieve the PDF file if it exists."""
@@ -20,12 +21,36 @@ def load_and_preprocess_pdf(pdf_path):
     text = re.sub(r'\*+ebook converter demo watermarks\*+', '', text, flags=re.IGNORECASE)
     return text
 
+def token_length(text):
+    """Compute the length in tokens using the SentenceTransformer tokenizer."""
+    return len(model.tokenize(text))
+
 def split_text(text):
-    """Split text into chunks."""
+    """Split text into token-based chunks."""
     text_splitter = CharacterTextSplitter(
         separator="\n",
-        chunk_size=2500,
-        chunk_overlap=200,
-        length_function=len
+        chunk_size=1024,  # Chunk size in tokens
+        chunk_overlap=200,  # Overlap in tokens
+        length_function=token_length  # Measured in tokens
     )
-    return text_splitter.split_text(text)
+    return text_splitter.split_text(text)
+
+
+
+
+
+
+
+
+
+#def split_text(text):
+#    """Split text into chunks."""
+#    text_splitter = CharacterTextSplitter(
+#        separator="\n",
+#        chunk_size=2500,
+#        chunk_overlap=200,
+#        length_function=len
+#    )
+#    return text_splitter.split_text(text)
+
+
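
`token_length` depends on a `model` object pulled in by `from config import *`, presumably the SentenceTransformer embedding model. Two cautions, noted as observations: `AutoTokenizer` is imported but never used, and in recent sentence-transformers versions `model.tokenize(...)` returns a dict of tensors rather than a token list, in which case `len(...)` would not count tokens. A minimal sketch of the same helper built directly on the unused `AutoTokenizer` import (the checkpoint name is an assumption for illustration):

from transformers import AutoTokenizer

# Hypothetical checkpoint; substitute the embedding model configured in config.py.
tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-MiniLM-L6-v2")

def token_length(text):
    """Count tokens the way the embedding model would, excluding special tokens."""
    return len(tokenizer.encode(text, add_special_tokens=False))
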
requirements.txt
CHANGED
@@ -19,7 +19,7 @@ PyPDF2>=3.0.0
 pdfplumber>=0.9.0
 
 langchain>=0.0.200
-langsmith>=0.
+langsmith>=0.2.4
 langgraph>=0.2.20,<0.3
 
 python-dotenv>=1.0.0