jayyd committed on
Commit cbe2e7b · verified · 1 Parent(s): bfc9d54

Update utils/evaluation.py

Files changed (1)
  1. utils/evaluation.py +105 -0
utils/evaluation.py CHANGED
@@ -0,0 +1,105 @@
+ import time
+ from typing import Dict, Any, List, Optional
+ from sklearn.metrics.pairwise import cosine_similarity
+ from sentence_transformers import SentenceTransformer
+
+ def calculate_confidence(answer: str) -> float:
+     """
+     Calculate a heuristic confidence score for a generated answer.
+     """
+     confidence = 1.0
+     answer_lower = answer.lower()
+
+     # Reduce confidence for uncertain language; markers are lowercase so
+     # they can match the lowercased answer ("I think" would never match otherwise)
+     uncertainty_markers = [
+         "might", "may", "could", "possibly", "perhaps",
+         "i think", "probably", "likely", "seems", "appears"
+     ]
+     for marker in uncertainty_markers:
+         if marker in answer_lower:
+             confidence *= 0.9
+
+     # Reduce confidence for very short or very long answers
+     words = answer.split()
+     if len(words) < 5:
+         confidence *= 0.8
+     elif len(words) > 100:
+         confidence *= 0.9
+
+     # Reduce confidence if the answer contains no financial terms
+     financial_terms = [
+         "revenue", "profit", "loss", "income", "expense",
+         "asset", "liability", "equity", "cash", "stock",
+         "share", "dividend", "market", "financial", "fiscal"
+     ]
+     if not any(term in answer_lower for term in financial_terms):
+         confidence *= 0.7
+
+     return max(0.1, min(confidence, 1.0))
+
+ def calculate_semantic_similarity(text1: str, text2: str) -> float:
+     """
+     Calculate semantic similarity between two texts using sentence embeddings.
+     Note: the model is re-loaded on every call; cache it at module level
+     if this is called repeatedly.
+     """
+     model = SentenceTransformer('all-MiniLM-L6-v2')
+     embeddings = model.encode([text1, text2])
+     similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
+     return float(similarity)
+
+ def evaluate_response(query: str, answer: str, chunks: Optional[List[str]] = None) -> Dict[str, Any]:
+     """
+     Evaluate the quality of the generated response.
+     """
+     confidence = calculate_confidence(answer)
+
+     metrics = {
+         "confidence": confidence,
+         "answer_length": len(answer.split()),
+         "query_length": len(query.split()),
+     }
+
+     if chunks:
+         metrics["num_chunks"] = len(chunks)
+
+         # Split the query into terms, excluding common stop words
+         query_terms = [term.lower() for term in query.split()
+                        if term.lower() not in {'what', 'was', 'is', 'are', 'in', 'the', 'a', 'an', 'and', 'or'}]
+
+         # Score each chunk by the fraction of query terms it contains
+         chunk_scores = []
+         for chunk in chunks:
+             chunk_lower = chunk.lower()
+             matches = sum(1 for term in query_terms if term in chunk_lower)
+             chunk_scores.append(matches / len(query_terms) if query_terms else 0)
+
+         # Average the per-chunk scores
+         metrics["chunk_relevance"] = sum(chunk_scores) / len(chunks)
+     else:
+         metrics["chunk_relevance"] = 0.0
+
+     return metrics
+
+ def evaluate_models(questions: List[str], answers: List[str], rag_fn, ft_fn) -> List[Dict]:
+     """
+     Evaluate and compare RAG and fine-tuned models, timing each call.
+     """
+     results = []
+     for q, a in zip(questions, answers):
+         start = time.time()
+         rag_answer = rag_fn(q)
+         rag_time = time.time() - start
+
+         start = time.time()
+         ft_answer = ft_fn(q)
+         ft_time = time.time() - start
+
+         results.append({
+             "question": q,
+             "ground_truth": a,
+             "rag_answer": rag_answer,
+             "rag_time": round(rag_time, 2),
+             "ft_answer": ft_answer,
+             "ft_time": round(ft_time, 2)
+         })
+     return results
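A minimal usage sketch of the new module, not part of the commit: `rag_fn` and `ft_fn` below are hypothetical stand-ins for the project's actual RAG and fine-tuned inference callables, which this file only receives as arguments.

from utils.evaluation import evaluate_response, evaluate_models

# Hypothetical stand-ins for the project's real inference functions;
# evaluate_models only assumes each takes a question and returns a string.
def rag_fn(question: str) -> str:
    return "Revenue grew 12% year over year, driven by higher product sales."

def ft_fn(question: str) -> str:
    return "The company's revenue increased by roughly 12% in fiscal 2023."

question = "What was the revenue growth in fiscal 2023?"

# Per-response metrics, including chunk relevance for retrieved context
metrics = evaluate_response(
    query=question,
    answer=rag_fn(question),
    chunks=["Total revenue for fiscal 2023 grew 12% to $4.1 billion."],
)
print(metrics)

# Side-by-side comparison with per-model latency
results = evaluate_models(
    questions=[question],
    answers=["Revenue grew 12% in fiscal 2023."],
    rag_fn=rag_fn,
    ft_fn=ft_fn,
)
print(results)

Since calculate_confidence and chunk_relevance are substring heuristics, the scores are best read as relative comparisons between the two systems rather than absolute quality measures.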