import time
from typing import Dict, Any, List, Optional
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer

def calculate_confidence(answer: str) -> float:
    """
    Calculate a heuristic confidence score for a generated answer.

    The score starts at 1.0 and is reduced for hedged wording, answers that
    are very short or very long, and answers with no financial vocabulary.
    """
    confidence = 1.0
    answer_lower = answer.lower()
    # Reduce confidence for uncertain language (markers are lowercase so the
    # substring check against the lowercased answer actually matches).
    uncertainty_markers = [
        "might", "may", "could", "possibly", "perhaps",
        "i think", "probably", "likely", "seems", "appears"
    ]
    for marker in uncertainty_markers:
        if marker in answer_lower:
            confidence *= 0.9
    # Reduce confidence for very short or very long answers
    words = answer.split()
    if len(words) < 5:
        confidence *= 0.8
    elif len(words) > 100:
        confidence *= 0.9
    # Reduce confidence if the answer contains no financial terms
    financial_terms = [
        "revenue", "profit", "loss", "income", "expense",
        "asset", "liability", "equity", "cash", "stock",
        "share", "dividend", "market", "financial", "fiscal"
    ]
    if not any(term in answer_lower for term in financial_terms):
        confidence *= 0.7
    # Clamp the final score to the [0.1, 1.0] range
    return max(0.1, min(confidence, 1.0))
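
# Worked example of the heuristic above (illustrative, not from the original
# code): a normal-length answer containing "probably" and "seems" but no
# financial vocabulary scores 1.0 * 0.9 * 0.9 * 0.7 = 0.567, which lies inside
# the clamped [0.1, 1.0] range.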

def calculate_semantic_similarity(text1: str, text2: str) -> float:
    """
    Calculate semantic similarity between two texts using sentence embeddings.
    """
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model.encode([text1, text2])
    similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    return float(similarity)
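
# Optional helper (not part of the original file): calculate_semantic_similarity
# above constructs a new SentenceTransformer on every call, which reloads the
# model each time. A minimal caching sketch, assuming the same
# 'all-MiniLM-L6-v2' checkpoint is always wanted, keeps one shared instance;
# calculate_semantic_similarity could then call get_embedding_model() instead
# of constructing a new model.
_embedding_model: Optional[SentenceTransformer] = None

def get_embedding_model() -> SentenceTransformer:
    # Load the model on first use and reuse the cached instance afterwards.
    global _embedding_model
    if _embedding_model is None:
        _embedding_model = SentenceTransformer('all-MiniLM-L6-v2')
    return _embedding_model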

def evaluate_response(query: str, answer: str, chunks: Optional[List[str]] = None) -> Dict[str, Any]:
    """
    Evaluate the quality of the generated response.
    """
    confidence = calculate_confidence(answer)
    metrics = {
        "confidence": confidence,
        "answer_length": len(answer.split()),
        "query_length": len(query.split()),
    }
    if chunks:
        metrics["num_chunks"] = len(chunks)
        # Calculate chunk relevance score
        if len(chunks) > 0:
            # Split query into terms, excluding common words
            query_terms = [term.lower() for term in query.split()
                           if term.lower() not in {'what', 'was', 'is', 'are', 'in', 'the', 'a', 'an', 'and', 'or'}]
            # Calculate relevance for each chunk
            chunk_scores = []
            for chunk in chunks:
                chunk_lower = chunk.lower()
                matches = sum(1 for term in query_terms if term in chunk_lower)
                chunk_scores.append(matches / len(query_terms) if query_terms else 0)
            # Take average of chunk scores
            metrics["chunk_relevance"] = sum(chunk_scores) / len(chunks)
        else:
            metrics["chunk_relevance"] = 0.0
    return metrics
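
# Illustrative chunk-relevance arithmetic (values made up): for the query
# "What was total revenue in 2023" the scored terms are
# ["total", "revenue", "2023"], so a chunk mentioning "revenue" and "2023"
# but not "total" receives a chunk score of 2 / 3, roughly 0.67.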

def evaluate_models(questions: List[str], answers: List[str], rag_fn, ft_fn) -> List[Dict]:
    """
    Evaluate and compare answers from a RAG pipeline and a fine-tuned model,
    recording each system's answer and response time per question.
    """
    results = []
    for q, a in zip(questions, answers):
        # Time the RAG pipeline on this question
        start = time.time()
        rag_answer = rag_fn(q)
        rag_time = time.time() - start
        # Time the fine-tuned model on the same question
        start = time.time()
        ft_answer = ft_fn(q)
        ft_time = time.time() - start
        results.append({
            "question": q,
            "ground_truth": a,
            "rag_answer": rag_answer,
            "rag_time": round(rag_time, 2),
            "ft_answer": ft_answer,
            "ft_time": round(ft_time, 2)
        })
    return results
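
if __name__ == "__main__":
    # Minimal smoke test of the evaluation helpers. The stub callables below are
    # illustrative placeholders for a real RAG pipeline and a fine-tuned model;
    # they are not part of the original module.
    def _stub_rag_fn(question: str) -> str:
        return "Revenue was probably around $4.1 billion in fiscal 2023."

    def _stub_ft_fn(question: str) -> str:
        return "Total revenue was $4.1 billion in fiscal 2023."

    demo_questions = ["What was the total revenue in fiscal 2023?"]
    demo_answers = ["$4.1 billion"]

    for row in evaluate_models(demo_questions, demo_answers, _stub_rag_fn, _stub_ft_fn):
        print(row)
        print("RAG metrics:", evaluate_response(row["question"], row["rag_answer"]))
        print("FT metrics:", evaluate_response(row["question"], row["ft_answer"]))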