Update abc
abc CHANGED

@@ -29,4 +29,98 @@ print("Cosine Similarity Matrix (rows: retrieved, columns: relevant):\n")
 for i, retrieved in enumerate(retrieved_chunks):
     for j, relevant in enumerate(relevant_chunks):
         score = cosine_sim_matrix[i][j].item()
-        print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Relevant : \"{relevant}\"\n Score : {score:.4f}\n")
+        print(f"Similarity between:\n Retrieved: \"{retrieved}\"\n Relevant : \"{relevant}\"\n Score : {score:.4f}\n")
+# -----------------------------------------
+
+
+import numpy as np
+from nltk.translate.bleu_score import sentence_bleu
+from rouge_score import rouge_scorer
+from sentence_transformers import SentenceTransformer
+from sklearn.metrics.pairwise import cosine_similarity
+from transformers import GPT2Tokenizer, GPT2LMHeadModel
+import torch
+
+# client = genai.Client(api_key=os.getenv("GEMINI_API_KEY"))
+
+# Load models
+embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
+tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
+perplexity_model = GPT2LMHeadModel.from_pretrained("gpt2")
+perplexity_model.eval()
+
+# Evaluation Metrics
+def bleu_rouge_score(reference, generated):
+    bleu = sentence_bleu([reference.split()], generated.split())
+    rouge = rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True)
+    rougeL = rouge.score(reference, generated)['rougeL'].fmeasure
+    return {'bleu': bleu, 'rougeL': rougeL}
+
+def cosine_sim(reference, generated):
+    emb_ref = embedding_model.encode([reference])[0]
+    emb_gen = embedding_model.encode([generated])[0]
+    sim = cosine_similarity([emb_ref], [emb_gen])[0][0]
+    return sim
+
+def perplexity_score(text):
+    inputs = tokenizer(text, return_tensors="pt")
+    with torch.no_grad():
+        outputs = perplexity_model(**inputs, labels=inputs["input_ids"])
+    loss = outputs.loss
+    return torch.exp(loss).item()
+
+def precision_at_k(retrieved, relevant, k):
+    top_k = retrieved[:k]
+    correct = sum(1 for item in top_k if item in relevant)
+    return correct / k
+
+def recall_at_k(retrieved, relevant, k):
+    correct = sum(1 for item in retrieved[:k] if item in relevant)
+    return correct / len(relevant)
+
+def ndcg_at_k(retrieved, relevant, k):
+    def dcg(items):
+        return sum([1 / np.log2(i + 2) if items[i] in relevant else 0 for i in range(len(items))])
+    ideal = dcg(relevant[:k])
+    actual = dcg(retrieved[:k])
+    return actual / ideal if ideal != 0 else 0
+
+def hit_at_k(retrieved, relevant, k):
+    top_k = retrieved[:k]
+    return int(any(item in relevant for item in top_k))
+
+# Main Evaluation
+def full_evaluation(reference, generated, retrieved, relevant_chunks):
+    return {
+        **bleu_rouge_score(reference, generated),
+        "cosine_similarity": cosine_sim(reference, generated),
+        "perplexity": perplexity_score(generated),
+        "precision@5": precision_at_k(retrieved, relevant_chunks, 5),
+        "recall@5": recall_at_k(retrieved, relevant_chunks, 5),
+        "ndcg@5": ndcg_at_k(retrieved, relevant_chunks, 5),
+        "hit@5": hit_at_k(retrieved, relevant_chunks, 5)
+    }
+
+# Sample Run
+if __name__ == "__main__":
+    reference_answer = "The Eiffel Tower is located in Paris."
+    generated_response = "Eiffel Tower stands in Paris."
+
+    retrieved_chunks = [
+        "The Eiffel Tower is a landmark in Paris.",
+        "Paris is the capital of France.",
+        "The Louvre is also in Paris.",
+        "Eiffel Tower was built in 1889.",
+        "It is a famous tourist spot."
+    ]
+
+    relevant_chunks = [
+        "The Eiffel Tower is a landmark in Paris.",
+        "Eiffel Tower was built in 1889."
+    ]
+
+    scores = full_evaluation(reference_answer, generated_response, retrieved_chunks, relevant_chunks)
+
+    for metric, score in scores.items():
+        print(f"{metric}: {score:.4f}" if isinstance(score, float) else f"{metric}: {score}")
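
A quick sanity check on the retrieval metrics for the sample data in this commit (the BLEU, ROUGE-L, cosine-similarity and perplexity values depend on the downloaded models, so they are not worked out here). This is a hand-derived sketch, assuming the retrieved_chunks and relevant_chunks lists exactly as above:

# Two of the five retrieved chunks appear in relevant_chunks, at zero-based ranks 0 and 3.
import numpy as np

precision_at_5 = 2 / 5                           # 0.4
recall_at_5 = 2 / 2                              # 1.0 (both relevant chunks were retrieved)
hit_at_5 = 1                                     # at least one relevant chunk in the top 5

# NDCG@5 with hits at positions 0 and 3 of the retrieved list:
dcg = 1 / np.log2(0 + 2) + 1 / np.log2(3 + 2)    # ~1.431
ideal = 1 / np.log2(0 + 2) + 1 / np.log2(1 + 2)  # ~1.631 (ideal ordering puts both relevant chunks first)
ndcg_at_5 = dcg / ideal                          # ~0.877

Note that with sentences this short, NLTK's sentence_bleu reports a score at or near zero and warns about missing higher-order n-gram overlap; passing a smoothing_function (e.g. SmoothingFunction().method1 from nltk.translate.bleu_score) would be a reasonable follow-up change.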