Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -330,6 +330,106 @@ async def search_tavily(
|
|
| 330 |
"query": query
|
| 331 |
}
|
| 332 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 333 |
|
| 334 |
|
| 335 |
|
|
@@ -399,10 +499,51 @@ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as retrieve_tab:
|
|
| 399 |
api_name="_retrieve"
|
| 400 |
)
|
| 401 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 402 |
# Create the interface with separate tabs
|
| 403 |
demo = gr.TabbedInterface(
|
| 404 |
-
[arxiv_tab, tavily_tab, list_retrievers_tab, retrieve_tab],
|
| 405 |
-
["ArXiv", "Tavily", "List Retrievers", "Retrieve"]
|
| 406 |
)
|
| 407 |
|
| 408 |
demo.launch(mcp_server=True)
|
|
|
|
| 330 |
"query": query
|
| 331 |
}
|
| 332 |
|
| 333 |
+
##### EVALS #####
|
| 334 |
+
async def evaluate_answer_relevancy(
    query: str,
    response: str,
) -> float:
    """Evaluate how relevant the answer is to the query using AnswerRelevancyEvaluator.

    Args:
        query: Original user query (required)
        response: Generated response to evaluate (required)

    Returns:
        float: Relevancy score between 0 and 1 (higher is better).
            0.0 is also returned when the evaluation fails or yields no score.
    """
    try:
        # Imported inside the try so an import failure is reported through
        # the same logged 0.0 fallback as any other evaluation error.
        from llama_index.core.evaluation import AnswerRelevancyEvaluator

        # NOTE(review): `llm` is not defined in this block; presumably a
        # module-level model configured elsewhere in app.py -- confirm.
        evaluator = AnswerRelevancyEvaluator(llm=llm)

        # Await the async entry point. The synchronous `evaluate()` wrapper
        # drives its own event loop, which blocks this coroutine's loop (or is
        # rejected as nested async) when called from inside `async def`.
        eval_result = await evaluator.aevaluate(query=query, response=response)

        score = eval_result.score
        if score is None:
            # No usable score came back; report it explicitly instead of
            # relying on float(None) raising into the generic handler.
            print("Error in relevancy evaluation: evaluator returned no score")
            return 0.0

        return float(score)

    except Exception as e:
        # Best-effort tool: log the failure and fall back to the minimum score.
        print(f"Error in relevancy evaluation: {e}")
        return 0.0
|
| 363 |
+
|
| 364 |
+
async def evaluate_context_relevancy(
    context: str,
    query: str,
    response: str
) -> float:
    """Evaluate how relevant the provided context is to the query using ContextRelevancyEvaluator.

    Args:
        context: Contextual information / knowledge base (required)
        query: Original user query (required)
        response: Generated response (required by this tool's interface and
            forwarded to the evaluator)

    Returns:
        float: Relevance score between 0 and 1 (higher is better).
            0.0 is also returned when the evaluation fails or yields no score.
    """
    try:
        # Imported inside the try so an import failure is reported through
        # the same logged 0.0 fallback as any other evaluation error.
        from llama_index.core.evaluation import ContextRelevancyEvaluator

        # NOTE(review): `llm` is not defined in this block; presumably a
        # module-level model configured elsewhere in app.py -- confirm.
        evaluator = ContextRelevancyEvaluator(llm=llm)

        # Await the async entry point. The synchronous `evaluate()` wrapper
        # drives its own event loop, which blocks this coroutine's loop (or is
        # rejected as nested async) when called from inside `async def`.
        # NOTE(review): per the LlamaIndex docs this evaluator scores the
        # contexts against the query; `response` appears to be ignored by it.
        # Confirm against the installed version.
        eval_result = await evaluator.aevaluate(
            query=query,
            response=response,
            contexts=[context],
        )

        score = eval_result.score
        if score is None:
            # No usable score came back; report it explicitly instead of
            # relying on float(None) raising into the generic handler.
            print("Error during context relevancy evaluation: evaluator returned no score")
            return 0.0

        return float(score)

    except Exception as e:
        # Best-effort tool: log the failure and fall back to the minimum score.
        print(f"Error during context relevancy evaluation: {e}")
        return 0.0
|
| 397 |
+
|
| 398 |
+
async def evaluate_faithfulness(
    query: str,
    response: str,
    context: str
) -> float:
    """Evaluate how faithful (factually consistent) the response is to the provided context.

    Args:
        query: Original user query (required)
        response: Generated response to evaluate (required)
        context: Source context/knowledge base used for the response (required)

    Returns:
        float: Faithfulness score between 0 and 1 (higher is better).
            0.0 is also returned when the evaluation fails or yields no score.
    """
    try:
        # Imported inside the try so an import failure is reported through
        # the same logged 0.0 fallback as any other evaluation error.
        from llama_index.core.evaluation import FaithfulnessEvaluator

        # NOTE(review): `llm` is not defined in this block; presumably a
        # module-level model configured elsewhere in app.py -- confirm.
        evaluator = FaithfulnessEvaluator(llm=llm)

        # Await the async entry point. The synchronous `evaluate()` wrapper
        # drives its own event loop, which blocks this coroutine's loop (or is
        # rejected as nested async) when called from inside `async def`.
        eval_result = await evaluator.aevaluate(
            query=query,
            response=response,
            contexts=[context],
        )

        score = eval_result.score
        if score is None:
            # No usable score came back; report it explicitly instead of
            # relying on float(None) raising into the generic handler.
            print("Error in faithfulness evaluation: evaluator returned no score")
            return 0.0

        return float(score)

    except Exception as e:
        # Best-effort tool: log the failure and fall back to the minimum score.
        print(f"Error in faithfulness evaluation: {e}")
        return 0.0
|
| 433 |
|
| 434 |
|
| 435 |
|
|
|
|
| 499 |
api_name="_retrieve"
|
| 500 |
)
|
| 501 |
|
| 502 |
+
# Tab: score a generated answer against the query it is meant to address.
with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as asw_relevance_tab:
    relevancy_interface = gr.Interface(
        fn=evaluate_answer_relevancy,
        # Input order mirrors the handler signature: (query, response).
        inputs=[
            gr.Textbox(
                label="Original Query",
                placeholder="E.g.: How does photosynthesis work?",
            ),
            gr.Textbox(
                label="Answer to Evaluate",
                placeholder="Paste the generated answer here",
                lines=5,
            ),
        ],
        outputs=gr.Number(label="Relevancy Score (0-1)", precision=3),
        title="Relevancy Evaluator (Query-Answer)",
        description="Evaluates how relevant an answer is to the original query (1 = perfectly relevant).",
        api_name="_evaluate_relevancy",
    )
|
| 514 |
+
|
| 515 |
+
# Tab: context relevancy evaluation form wired to evaluate_context_relevancy.
with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as ctx_relevance_tab:
    context_relevancy_interface = gr.Interface(
        fn=evaluate_context_relevancy,
        # Input order mirrors the handler signature: (context, query, response).
        inputs=[
            gr.Textbox(
                label="Context",
                placeholder="Relevant text / knowledge base",
                lines=3,
            ),
            gr.Textbox(
                label="Original Query",
                placeholder="What question is being answered?",
            ),
            gr.Textbox(
                label="Generated Answer",
                placeholder="The answer to evaluate",
                lines=5,
            ),
        ],
        outputs=gr.Number(label="Relevancy Score (0-1)", precision=3),
        title="Relevancy Evaluator (Context-Query-Answer)",
        description="Evaluates how relevant the answer is considering both the query and the reference context.",
        api_name="_evaluate_context_relevancy",
    )
|
| 528 |
+
|
| 529 |
+
# Tab: faithfulness evaluation form wired to evaluate_faithfulness.
with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as faithfulness_tab:
    faithfulness_interface = gr.Interface(
        fn=evaluate_faithfulness,
        # Input order mirrors the handler signature: (query, response, context).
        inputs=[
            gr.Textbox(
                label="Original Query",
                placeholder="E.g.: What are the causes of climate change?",
            ),
            gr.Textbox(
                label="Answer to Evaluate",
                placeholder="Paste the generated answer here",
                lines=5,
            ),
            gr.Textbox(
                label="Context",
                placeholder="Reference text / knowledge base",
                lines=3,
            ),
        ],
        outputs=gr.Number(label="Faithfulness Score (0-1)", precision=3),
        title="Faithfulness Evaluator",
        description="Evaluates how faithful/factually consistent the answer is with respect to the provided context (1 = perfectly faithful).",
        api_name="_evaluate_faithfulness",
    )
|
| 542 |
+
|
| 543 |
# Assemble every tool tab into one tabbed app. The two lists are paired by
# position, so each tab object must line up with its display name.
demo = gr.TabbedInterface(
    interface_list=[
        arxiv_tab,
        tavily_tab,
        list_retrievers_tab,
        retrieve_tab,
        asw_relevance_tab,
        ctx_relevance_tab,
        faithfulness_tab,
    ],
    tab_names=[
        "ArXiv",
        "Tavily",
        "List Retrievers",
        "Retrieve",
        "Answer Relevance",
        "Context Relevance",
        "Faithfulness",
    ],
)

# Start the app with the MCP server option enabled.
demo.launch(mcp_server=True)
|