"""
Module for evaluation and annotation of bot responses
"""

import json
import datetime
from typing import List, Dict, Any, Tuple, Optional
import io
import logging

from huggingface_hub import HfApi

from config.settings import (
    DATASET_ID,
    HF_TOKEN,
    DATASET_CHAT_HISTORY_PATH,
    DATASET_ANNOTATIONS_PATH
)

logger = logging.getLogger(__name__)

class ChatEvaluator:
    def __init__(self, hf_token: Optional[str] = None, dataset_id: Optional[str] = None):
        """
        Initialize chat evaluator with lazy loading
        
        Args:
            hf_token: Hugging Face token
            dataset_id: Dataset ID on Hugging Face
        """
        self.hf_token = hf_token or HF_TOKEN
        self.dataset_id = dataset_id or DATASET_ID
        self.api = HfApi(token=self.hf_token)
        
        # Using paths from settings
        self.chat_history_path = DATASET_CHAT_HISTORY_PATH
        self.annotations_path = DATASET_ANNOTATIONS_PATH
        
        # Cache for chat histories and QA pairs
        self._chat_histories = None
        self._qa_pairs = None
        self._annotations = None
        
        # Ensure directories exist in dataset
        try:
            self._ensure_dataset_structure()
        except Exception as e:
            logger.error(f"Failed to ensure dataset structure: {e}")

    def _ensure_dataset_structure(self):
        """Ensure required directories exist in dataset"""
        try:
            files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
            
            # list_repo_files returns file paths only, so a directory "exists"
            # when at least one file lives under its prefix
            def dir_has_files(path: str) -> bool:
                return any(f.startswith(f"{path}/") for f in files)
            
            # Check and create chat history directory
            if not dir_has_files(self.chat_history_path):
                self.api.upload_file(
                    path_or_fileobj=io.BytesIO(b""),
                    path_in_repo=f"{self.chat_history_path}/.gitkeep",
                    repo_id=self.dataset_id,
                    repo_type="dataset"
                )
            
            # Check and create annotations directory
            if not dir_has_files(self.annotations_path):
                self.api.upload_file(
                    path_or_fileobj=io.BytesIO(b""),
                    path_in_repo=f"{self.annotations_path}/.gitkeep",
                    repo_id=self.dataset_id,
                    repo_type="dataset"
                )
        except Exception as e:
            logger.error(f"Error ensuring dataset structure: {e}")
            raise

    def reset_cache(self):
        """
        Reset the cache to force reload of data
        """
        self._chat_histories = None
        self._qa_pairs = None
        self._annotations = None
        logger.info("Chat evaluator cache has been reset")

    def get_chat_history(self, force_reload=False) -> List[Dict[str, Any]]:
        """
        Get all chat histories from the dataset
        
        Args:
            force_reload: If True, ignore cache and reload from dataset
        """
        # Return cached data if available and not forcing reload
        if self._chat_histories is not None and not force_reload:
            logger.debug("Returning cached chat histories")
            return self._chat_histories
        
        try:
            # List all files in the dataset repository
            files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
            
            # Keep only JSON files under the chat history directory
            chat_path = f"{self.chat_history_path}/"
            chat_files = [f for f in files if f.startswith(chat_path) and f.endswith('.json')]
            logger.debug(f"Found {len(chat_files)} chat files")

            histories = []
            for file in chat_files:
                try:
                    # Download and parse each chat file
                    content = self.api.hf_hub_download(
                        repo_id=self.dataset_id,
                        filename=file,
                        repo_type="dataset"
                    )
                    with open(content, 'r', encoding='utf-8') as f:
                        chat_data = json.load(f)
                        if isinstance(chat_data, dict) and 'history' in chat_data:
                            histories.append(chat_data)
                        else:
                            logger.warning(f"Invalid chat history format in {file}")
                except Exception as e:
                    logger.error(f"Error processing chat file {file}: {e}")
                    continue

            # Cache the results
            self._chat_histories = histories
            return histories

        except Exception as e:
            logger.error(f"Failed to get chat histories: {e}")
            return []
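    
    # Expected shape of each chat history file, inferred from the parsing
    # logic above (extra fields are ignored):
    # {
    #   "conversation_id": "...",
    #   "timestamp": "...",
    #   "history": [
    #     {"role": "user", "content": "...", "timestamp": "..."},
    #     {"role": "assistant", "content": "...", "timestamp": "..."}
    #   ]
    # }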

    def extract_qa_pairs(self, histories: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Extract question-answer pairs from chat histories
        """
        qa_pairs = []
        
        for history in histories:
            messages = history.get('history', [])
            current_question = None
            
            for msg in messages:
                if msg.get('role') == 'user':
                    current_question = msg.get('content')
                elif msg.get('role') == 'assistant' and current_question:
                    qa_pairs.append({
                        'conversation_id': history.get('conversation_id'),
                        'question': current_question,
                        'answer': msg.get('content'),
                        'timestamp': history.get('timestamp')
                    })
                    current_question = None

        logger.debug(f"Extracted {len(qa_pairs)} QA pairs")
        return qa_pairs

    def get_qa_pairs_for_evaluation(self, limit: int = 50, force_reload=False) -> List[Dict[str, Any]]:
        """
        Extract question-answer pairs for evaluation
        
        Args:
            limit: Maximum number of pairs to return
            force_reload: If True, force reload from dataset
            
        Returns:
            List of QA pairs with metadata
        """
        # Return cached data if available and not forcing reload
        if self._qa_pairs is not None and not force_reload:
            logger.debug("Returning cached QA pairs")
            return self._qa_pairs[:limit]  # Respect the limit parameter
        
        chat_data = self.get_chat_history(force_reload=force_reload)
        qa_pairs = []
        
        logger.debug(f"Processing {len(chat_data)} chat histories")
        
        for chat in chat_data:
            conversation_id = chat.get("conversation_id", "unknown")
            timestamp = chat.get("timestamp", "")
            messages = chat.get("history", [])
            
            # Find user-assistant pairs in messages
            for i in range(len(messages) - 1):
                if (messages[i].get("role") == "user" and 
                    messages[i+1].get("role") == "assistant"):
                    question = messages[i].get("content", "").strip()
                    answer = messages[i+1].get("content", "").strip()
                    
                    # Only include non-empty pairs
                    if question and answer:
                        qa_pairs.append({
                            "conversation_id": conversation_id,
                            "timestamp": timestamp,
                            "question": question,
                            "original_answer": answer,
                            "question_timestamp": messages[i].get("timestamp", ""),
                            "answer_timestamp": messages[i+1].get("timestamp", "")
                        })
        
        # Cache the results
        self._qa_pairs = qa_pairs
        
        logger.debug(f"Extracted {len(qa_pairs)} QA pairs")
        # Return up to the limit
        return qa_pairs[:limit]
    
    def get_evaluation_status(self, force_reload=False) -> Dict[str, int]:
        """
        Get status of evaluated QA pairs
        
        Args:
            force_reload: If True, force reload from dataset
            
        Returns:
            Dictionary with counts of evaluated and unevaluated QA pairs
        """
        all_pairs = self.get_qa_pairs_for_evaluation(limit=1000, force_reload=force_reload)  # Get a large sample
        evaluated_pairs = self.get_annotations(force_reload=force_reload)
        
        # Count distinct evaluated conversation IDs
        evaluated_ids = {item.get("conversation_id") for item in evaluated_pairs}
        
        return {
            "total_qa_pairs": len(all_pairs),
            "evaluated_pairs": len(evaluated_pairs),
            # Clamped at zero: a conversation can hold several annotated pairs
            "unevaluated_pairs": max(0, len(all_pairs) - len(evaluated_pairs)),
            "evaluated_conversations": len(evaluated_ids)
        }
    
    def save_annotation(self, 
                       conversation_id: str,
                       question: str,
                       original_answer: str,
                       improved_answer: str,
                       ratings: Dict[str, int],
                       notes: str = "") -> Tuple[bool, str]:
        """
        Save evaluation annotation
        """
        try:
            # Create annotation object
            annotation = {
                "conversation_id": conversation_id,
                "timestamp": datetime.datetime.now().isoformat(),
                "question": question,
                "original_answer": original_answer,
                "improved_answer": improved_answer,
                "ratings": ratings,
                "notes": notes
            }
            
            # Create filename with conversation_id
            filename = f"{self.annotations_path}/annotation_{conversation_id}.json"
            
            # Convert to JSON bytes
            json_content = json.dumps(annotation, ensure_ascii=False, indent=2).encode('utf-8')
            
            # Upload to dataset using bytes buffer
            self.api.upload_file(
                path_or_fileobj=io.BytesIO(json_content),
                path_in_repo=filename,
                repo_id=self.dataset_id,
                repo_type="dataset"
            )
            
            # Reset annotations cache
            self._annotations = None
            
            return True, "Annotation saved successfully"
            
        except Exception as e:
            logger.error(f"Error saving annotation: {e}")
            return False, f"Failed to save annotation: {str(e)}"
    
    def get_annotations(self, force_reload=False) -> List[Dict[str, Any]]:
        """
        Get all saved annotations from dataset
        
        Args:
            force_reload: If True, force reload from dataset
        """
        # Return cached data if available and not forcing reload
        if self._annotations is not None and not force_reload:
            logger.debug("Returning cached annotations")
            return self._annotations
        
        try:
            annotations = []
            files = self.api.list_repo_files(self.dataset_id, repo_type="dataset")
            
            for file in files:
                if file.startswith(f"{self.annotations_path}/annotation_") and file.endswith(".json"):
                    try:
                        # Download and parse annotation file
                        content = self.api.hf_hub_download(
                            repo_id=self.dataset_id,
                            filename=file,
                            repo_type="dataset"
                        )
                        with open(content, 'r', encoding='utf-8') as f:
                            annotation = json.load(f)
                            annotations.append(annotation)
                    except Exception as e:
                        logger.error(f"Error loading annotation {file}: {e}")
            
            # Sort by timestamp (newest first)
            annotations.sort(key=lambda x: x.get("timestamp", ""), reverse=True)
            
            # Cache the results
            self._annotations = annotations
            
            return annotations
            
        except Exception as e:
            logger.error(f"Error getting annotations: {e}")
            return []
    
    def get_annotation(self, conversation_id: str) -> Optional[Dict[str, Any]]:
        """
        Get specific annotation by conversation ID
        """
        try:
            # First check if annotations are loaded
            if self._annotations is not None:
                for annotation in self._annotations:
                    if annotation.get("conversation_id") == conversation_id:
                        return annotation
            
            # If not found in cache, try direct file access
            filename = f"{self.annotations_path}/annotation_{conversation_id}.json"
            try:
                content = self.api.hf_hub_download(
                    repo_id=self.dataset_id,
                    filename=filename,
                    repo_type="dataset"
                )
                
                with open(content, 'r', encoding='utf-8') as f:
                    return json.load(f)
            except Exception as e:
                # A download failure here usually just means no annotation
                # exists yet for this conversation
                logger.warning(f"No annotation loaded for {conversation_id}: {e}")
                return None
                
        except Exception as e:
            logger.error(f"Error getting annotation: {e}")
            return None
    
    def export_training_data(self, output_file: str, min_rating: int = 4) -> Tuple[bool, str]:
        """
        Export high-quality annotated data for fine-tuning
        
        Args:
            output_file: Path to output file
            min_rating: Minimum average rating to include in training data
            
        Returns:
            (success, message)
        """
        annotations = self.get_annotations()
        
        if not annotations:
            return False, "No annotations available for export"
        
        try:
            # Filter annotations by quality
            high_quality_examples = []
            
            for annotation in annotations:
                ratings = annotation.get("ratings", {})
                
                # Calculate average rating
                if ratings:
                    avg_rating = sum(ratings.values()) / len(ratings)
                    
                    # Include only high-quality examples
                    if avg_rating >= min_rating:
                        high_quality_examples.append({
                            "messages": [
                                {"role": "user", "content": annotation.get("question", "")},
                                {"role": "assistant", "content": annotation.get("improved_answer", "")}
                            ]
                        })
            
            if not high_quality_examples:
                return False, f"No examples meet the minimum quality threshold of {min_rating}"
            
            # Save to JSONL format
            with open(output_file, "w", encoding="utf-8") as f:
                for example in high_quality_examples:
                    f.write(json.dumps(example, ensure_ascii=False) + "\n")
            
            return True, f"Successfully exported {len(high_quality_examples)} high-quality examples for training"
        except Exception as e:
            return False, f"Error exporting training data: {str(e)}"
    
    def generate_evaluation_report(self) -> Dict[str, Any]:
        """
        Generate evaluation summary report
        
        Returns:
            Dictionary with evaluation metrics
        """
        annotations = self.get_annotations()
        
        if not annotations:
            return {
                "total_evaluations": 0,
                "message": "No evaluations available"
            }
        
        # Initialize metrics
        criteria = set()
        for annotation in annotations:
            criteria.update(annotation.get("ratings", {}).keys())
        
        metrics = {
            "total_evaluations": len(annotations),
            "criteria_averages": {},
            "overall_average": 0,
            "improvement_rate": 0  # Percentage of answers that were improved
        }
        
        # Calculate averages for each criterion
        for criterion in criteria:
            values = [a.get("ratings", {}).get(criterion, 0) for a in annotations if criterion in a.get("ratings", {})]
            if values:
                metrics["criteria_averages"][criterion] = sum(values) / len(values)
        
        # Calculate overall average
        all_ratings = []
        for annotation in annotations:
            all_ratings.extend(annotation.get("ratings", {}).values())
        
        if all_ratings:
            metrics["overall_average"] = sum(all_ratings) / len(all_ratings)
        
        # Calculate improvement rate
        improved_count = sum(1 for a in annotations if a.get("original_answer") != a.get("improved_answer"))
        metrics["improvement_rate"] = (improved_count / len(annotations)) * 100
        
        return metrics
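

# --- Usage sketch (illustrative only) ---
# A minimal end-to-end walkthrough, assuming config.settings provides a valid
# HF_TOKEN and DATASET_ID and the dataset already holds chat history JSON
# files; the rating criterion names below are hypothetical.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)
    
    evaluator = ChatEvaluator()
    
    # How many QA pairs are awaiting evaluation?
    print(evaluator.get_evaluation_status(force_reload=True))
    
    # Annotate the first extracted pair with example ratings
    pairs = evaluator.get_qa_pairs_for_evaluation(limit=5)
    if pairs:
        pair = pairs[0]
        ok, msg = evaluator.save_annotation(
            conversation_id=pair["conversation_id"],
            question=pair["question"],
            original_answer=pair["original_answer"],
            improved_answer=pair["original_answer"],  # left unchanged here
            ratings={"accuracy": 5, "clarity": 4},    # hypothetical criteria
            notes="Acceptable as-is"
        )
        print(msg)
    
    # Export examples with an average rating of at least 4 as JSONL
    ok, msg = evaluator.export_training_data("training_data.jsonl", min_rating=4)
    print(msg)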