Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -361,14 +361,8 @@ import mauve
|
|
| 361 |
from sacrebleu import corpus_bleu
|
| 362 |
from rouge_score import rouge_scorer
|
| 363 |
from bert_score import score
|
| 364 |
-
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline
|
| 365 |
-
import
|
| 366 |
-
from nltk.util import ngrams
|
| 367 |
-
from nltk.tokenize import word_tokenize
|
| 368 |
-
from nltk.translate.meteor_score import meteor_score
|
| 369 |
-
from nltk.translate.chrf_score import sentence_chrf
|
| 370 |
-
from textstat import flesch_reading_ease, flesch_kincaid_grade
|
| 371 |
-
from sklearn.metrics.pairwise import cosine_similarity
|
| 372 |
from mauve import compute_mauve
|
| 373 |
import os
|
| 374 |
import gradio as gr
|
|
@@ -406,11 +400,9 @@ class RAGEvaluator:
|
|
| 406 |
def __init__(self):
|
| 407 |
self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
|
| 408 |
self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
|
| 409 |
-
#
|
| 410 |
-
|
| 411 |
-
|
| 412 |
-
nltk.download('omw-1.4', quiet=True)
|
| 413 |
-
|
| 414 |
def load_gpt2_model(self):
|
| 415 |
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
| 416 |
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
|
@@ -449,8 +441,17 @@ class RAGEvaluator:
|
|
| 449 |
return ppl.item()
|
| 450 |
|
| 451 |
def evaluate_diversity(self, texts):
|
| 452 |
-
|
| 453 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
diversity_score = len(unique_bigrams) / len(all_tokens) if all_tokens else 0
|
| 455 |
return diversity_score
|
| 456 |
|
|
@@ -460,19 +461,79 @@ class RAGEvaluator:
|
|
| 460 |
return bias_score
|
| 461 |
|
| 462 |
def evaluate_meteor(self, candidates, references):
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
def evaluate_chrf(self, candidates, references):
|
| 470 |
-
|
| 471 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 472 |
|
| 473 |
def evaluate_readability(self, text):
|
| 474 |
-
|
| 475 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 476 |
return flesch_ease, flesch_grade
|
| 477 |
|
| 478 |
def evaluate_mauve(self, reference_texts, generated_texts):
|
|
|
|
| 361 |
from sacrebleu import corpus_bleu
|
| 362 |
from rouge_score import rouge_scorer
|
| 363 |
from bert_score import score
|
| 364 |
+
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline, AutoTokenizer
|
| 365 |
+
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 366 |
from mauve import compute_mauve
|
| 367 |
import os
|
| 368 |
import gradio as gr
|
|
|
|
| 400 |
def __init__(self):
    """Set up the evaluator's models and tokenizers.

    NOTE(review): all three loads download weights from the Hugging Face
    hub on first run — construction is slow and requires network access.
    """
    # GPT-2 model + tokenizer pair; presumably used for perplexity scoring
    # (the consuming method is outside this view) — TODO confirm.
    self.gpt2_model, self.gpt2_tokenizer = self.load_gpt2_model()
    # Zero-shot classifier used for bias evaluation.
    self.bias_pipeline = pipeline("zero-shot-classification", model="Hate-speech-CNERG/dehatebert-mono-english")
    # Initialize tokenizer for text processing
    # (shared word-piece tokenizer for diversity/METEOR approximations).
    self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
|
|
|
|
|
|
| 406 |
def load_gpt2_model(self):
|
| 407 |
model = GPT2LMHeadModel.from_pretrained('gpt2')
|
| 408 |
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
|
|
|
|
| 441 |
return ppl.item()
|
| 442 |
|
| 443 |
def evaluate_diversity(self, texts):
    """Token-bigram diversity: distinct bigrams over total token count.

    Tokenization uses the instance's Hugging Face tokenizer (no NLTK).
    Returns 0 for an empty corpus.
    """
    # Flatten every text into a single token stream.
    token_stream = [tok for text in texts for tok in self.tokenizer.tokenize(text)]
    if not token_stream:
        return 0
    # zip against the shifted stream yields every adjacent pair once.
    distinct_bigrams = set(zip(token_stream, token_stream[1:]))
    return len(distinct_bigrams) / len(token_stream)
|
| 457 |
|
|
|
|
| 461 |
return bias_score
|
| 462 |
|
| 463 |
def evaluate_meteor(self, candidates, references):
    """Approximate METEOR without NLTK.

    A unigram-overlap F-measure with METEOR's default alpha = 0.9
    (recall weighted 9:1 over precision). This is a simplified stand-in;
    consider an external API for the full alignment-based metric.
    Returns the mean score over all (reference, candidate) pairs, or 0
    when there are none.
    """
    pair_scores = []
    for reference, candidate in zip(references, candidates):
        ref_toks = self.tokenizer.tokenize(reference)
        cand_toks = self.tokenizer.tokenize(candidate)

        # Unigram overlap; precision/recall use full token counts
        # (duplicates included) as denominators.
        shared = set(ref_toks).intersection(cand_toks)
        prec = len(shared) / len(cand_toks) if cand_toks else 0
        rec = len(shared) / len(ref_toks) if ref_toks else 0

        # METEOR Fmean = P*R / (alpha*P + (1-alpha)*R) with alpha = 0.9.
        score = 0 if prec + rec == 0 else (10 * prec * rec) / (9 * prec + rec)
        pair_scores.append(score)

    return sum(pair_scores) / len(pair_scores) if pair_scores else 0
|
| 485 |
|
| 486 |
def evaluate_chrf(self, candidates, references):
    """Approximate chrF as an F1 over distinct character 6-grams.

    Simplified: real chrF averages orders 1-6; this uses order 6 only.
    Texts shorter than 6 characters contribute a score of 0. Returns the
    mean over all pairs, or 0 when there are none.
    """
    def char_ngrams(text, n=6):
        # Distinct length-n substrings (empty set when len(text) < n).
        return {text[i:i + n] for i in range(len(text) - n + 1)}

    pair_scores = []
    for reference, candidate in zip(references, candidates):
        ref_grams = char_ngrams(reference)
        cand_grams = char_ngrams(candidate)

        shared = ref_grams & cand_grams
        prec = len(shared) / len(cand_grams) if cand_grams else 0
        rec = len(shared) / len(ref_grams) if ref_grams else 0

        # Plain harmonic mean of precision and recall.
        score = 0 if prec + rec == 0 else 2 * prec * rec / (prec + rec)
        pair_scores.append(score)

    return sum(pair_scores) / len(pair_scores) if pair_scores else 0
|
| 516 |
|
| 517 |
def evaluate_readability(self, text):
    """Compute approximate Flesch Reading Ease and Flesch-Kincaid Grade.

    Pure-stdlib replacement for ``textstat``. The previous version fed
    average word length *in characters* (~4-5) into coefficients that the
    Flesch formulas define for *syllables per word* (~1.3-1.7), which
    pushed Reading Ease far below its conventional 0-100 scale and
    inflated the grade level. Syllables are now estimated by counting
    vowel groups per word with a silent-final-'e' heuristic.

    Args:
        text: The text to score.

    Returns:
        Tuple ``(flesch_ease, flesch_grade)`` of floats. For empty text
        this degenerates to ``(206.835, -15.59)`` (all rates are 0).
    """
    def estimate_syllables(word):
        # One syllable per maximal run of vowels; 'y' counts as a vowel.
        count = len(re.findall(r"[aeiouy]+", word))
        # Silent final 'e' ("like", "move") usually adds no syllable,
        # but keep it for '-le'/'-ee' endings ("table", "free").
        if word.endswith("e") and not word.endswith(("le", "ee")) and count > 1:
            count -= 1
        return max(count, 1)

    words = re.findall(r"\b\w+\b", text.lower())
    sentences = [s for s in re.split(r"[.!?]+", text) if s.strip()]

    num_words = len(words)
    num_sentences = len(sentences)

    syllables_per_word = (
        sum(estimate_syllables(w) for w in words) / num_words if num_words else 0
    )
    words_per_sentence = num_words / num_sentences if num_sentences else 0

    # Flesch Reading Ease: higher = easier, conventionally 0-100.
    flesch_ease = 206.835 - (1.015 * words_per_sentence) - (84.6 * syllables_per_word)
    # Flesch-Kincaid Grade Level: approximate US school grade.
    flesch_grade = (0.39 * words_per_sentence) + (11.8 * syllables_per_word) - 15.59
    return flesch_ease, flesch_grade
|
| 538 |
|
| 539 |
def evaluate_mauve(self, reference_texts, generated_texts):
|