geronimo-pericoli committed
Commit 982f629 · verified · 1 Parent(s): 39cb644

Update app.py

Files changed (1):
  1. app.py +143 -2
app.py CHANGED
@@ -330,6 +330,106 @@ async def search_tavily(
        "query": query
    }

+##### EVALS #####
+async def evaluate_answer_relevancy(
+    query: str,
+    response: str,
+) -> float:
+    """Evaluate how relevant the answer is to the query using AnswerRelevancyEvaluator.
+
+    Args:
+        query: Original user query (required)
+        response: Generated response to evaluate (required)
+
+    Returns:
+        float: Relevancy score between 0 and 1 (higher is better)
+    """
+    try:
+        from llama_index.core.evaluation import AnswerRelevancyEvaluator
+
+        # Initialize the evaluator
+        evaluator = AnswerRelevancyEvaluator(llm=llm)
+
+        # Perform the evaluation
+        eval_result = evaluator.evaluate(query=query, response=response)
+
+        # Return the score as a float
+        return float(eval_result.score)
+
+    except Exception as e:
+        # In case of error, return 0.0 (minimum score) and log the error
+        print(f"Error in relevancy evaluation: {str(e)}")
+        return 0.0
+
+async def evaluate_context_relevancy(
+    context: str,
+    query: str,
+    response: str
+) -> float:
+    """Evaluates the relevance of the response considering both the query and the context.
+
+    Args:
+        context: Contextual information / knowledge base (required)
+        query: Original user query (required)
+        response: Generated response to evaluate (required)
+
+    Returns:
+        float: Relevance score between 0 and 1 (higher is better)
+    """
+    try:
+        from llama_index.core.evaluation import ContextRelevancyEvaluator
+
+        # Initialize the relevancy evaluator with context
+        evaluator = ContextRelevancyEvaluator(llm=llm)
+
+        # Perform the evaluation (adapted to handle context)
+        eval_result = evaluator.evaluate(
+            query=query,
+            response=response,
+            contexts=[context]
+        )
+
+        return float(eval_result.score)
+
+    except Exception as e:
+        print(f"Error during context relevancy evaluation: {str(e)}")
+        return 0.0
+
+async def evaluate_faithfulness(
+    query: str,
+    response: str,
+    context: str
+) -> float:
+    """Evaluate how faithful (factually consistent) the response is to the provided context.
+
+    Args:
+        query: Original user query (required)
+        response: Generated response to evaluate (required)
+        context: Source context/knowledge base used for the response (required)
+
+    Returns:
+        float: Faithfulness score between 0 and 1 (higher is better)
+    """
+    try:
+        from llama_index.core.evaluation import FaithfulnessEvaluator
+
+        # Initialize evaluator
+        evaluator = FaithfulnessEvaluator(llm=llm)
+
+        # Perform evaluation
+        eval_result = evaluator.evaluate(
+            query=query,
+            response=response,
+            contexts=[context]
+        )
+
+        # Return score as float
+        return float(eval_result.score)
+
+    except Exception as e:
+        # On error, return 0.0 (minimum score) and log the error
+        print(f"Error in faithfulness evaluation: {str(e)}")
+        return 0.0

 
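A minimal usage sketch for the new evaluators, assuming llm is the LlamaIndex LLM already configured elsewhere in app.py (the _demo wrapper and the sample strings are illustrative, not part of the commit):

import asyncio

async def _demo():
    # Each evaluator returns a float in [0, 1]; 0.0 also doubles as the error value.
    score = await evaluate_answer_relevancy(
        query="How does photosynthesis work?",
        response="Photosynthesis converts light, water and CO2 into glucose and oxygen.",
    )
    print(f"Answer relevancy: {score:.3f}")

asyncio.run(_demo())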
@@ -399,10 +499,51 @@ with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as retrieve_tab:
        api_name="_retrieve"
    )

+with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as asw_relevance_tab:
+    relevancy_interface = gr.Interface(
+        fn=evaluate_answer_relevancy,
+        inputs=[
+            gr.Textbox(label="Original Query", placeholder="E.g.: How does photosynthesis work?"),
+            gr.Textbox(label="Answer to Evaluate", placeholder="Paste the generated answer here", lines=5),
+        ],
+        outputs=gr.Number(label="Relevancy Score (0-1)", precision=3),
+        title="Relevancy Evaluator (Query-Answer)",
+        description="Evaluates how relevant an answer is to the original query (1 = perfectly relevant).",
+        api_name="_evaluate_relevancy"
+    )
+
+with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as ctx_relevance_tab:
+    context_relevancy_interface = gr.Interface(
+        fn=evaluate_context_relevancy,
+        inputs=[
+            gr.Textbox(label="Context", placeholder="Relevant text / knowledge base", lines=3),
+            gr.Textbox(label="Original Query", placeholder="What question is being answered?"),
+            gr.Textbox(label="Generated Answer", placeholder="The answer to evaluate", lines=5),
+        ],
+        outputs=gr.Number(label="Relevancy Score (0-1)", precision=3),
+        title="Relevancy Evaluator (Context-Query-Answer)",
+        description="Evaluates how relevant the answer is considering both the query and the reference context.",
+        api_name="_evaluate_context_relevancy"
+    )
+
+with gr.Blocks(title="MCP Tools", theme=gr.themes.Base()) as faithfulness_tab:
+    faithfulness_interface = gr.Interface(
+        fn=evaluate_faithfulness,
+        inputs=[
+            gr.Textbox(label="Original Query", placeholder="E.g.: What are the causes of climate change?"),
+            gr.Textbox(label="Answer to Evaluate", placeholder="Paste the generated answer here", lines=5),
+            gr.Textbox(label="Context", placeholder="Reference text / knowledge base", lines=3),
+        ],
+        outputs=gr.Number(label="Faithfulness Score (0-1)", precision=3),
+        title="Faithfulness Evaluator",
+        description="Evaluates how faithful/factually consistent the answer is with respect to the provided context (1 = perfectly faithful).",
+        api_name="_evaluate_faithfulness"
+    )
+
# Create the interface with separate tabs
demo = gr.TabbedInterface(
-    [arxiv_tab, tavily_tab, list_retrievers_tab, retrieve_tab],
-    ["ArXiv", "Tavily", "List Retrievers", "Retrieve"]
+    [arxiv_tab, tavily_tab, list_retrievers_tab, retrieve_tab, asw_relevance_tab, ctx_relevance_tab, faithfulness_tab],
+    ["ArXiv", "Tavily", "List Retrievers", "Retrieve", "Answer Relevance", "Context Relevance", "Faithfulness"]
)

demo.launch(mcp_server=True)
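Because each interface declares an api_name and the app launches with mcp_server=True, the three new evaluators become reachable both as MCP tools and through the regular Gradio API. A minimal client sketch, assuming the Space is running locally (the URL is a placeholder and the sample strings are illustrative):

from gradio_client import Client

client = Client("http://localhost:7860")  # placeholder URL for the running Space
score = client.predict(
    "What are the causes of climate change?",  # Original Query
    "Rising greenhouse gas concentrations trap heat in the atmosphere.",  # Answer to Evaluate
    "IPCC reports attribute recent warming to human-emitted greenhouse gases.",  # Context
    api_name="/_evaluate_faithfulness",
)
print(score)  # float in [0, 1]; 0.0 also signals an evaluation error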