import os
import json
import time
import warnings
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import re

import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Topic Modeling
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import umap
import hdbscan

# Hugging Face
from datasets import Dataset
from huggingface_hub import login

# Vector Database
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings

# Language Models
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    pipeline
)

# Evaluation Metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity

warnings.filterwarnings('ignore')

# Set matplotlib to use English
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False


# ============================================================================
# Configuration
# ============================================================================

class Config:
    """System configuration parameters"""

    # Paths
    EXCEL_PATH = r'C:\Users\AI\OneDrive\Desktop\enger\ok-Paper_references-2.xlsx'
    OUTPUT_DIR = 'output2025-2'

    # Model Settings
    EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'
    DEFAULT_LLM = 'google/flan-t5-large'

    # Topic Modeling
    MIN_CLUSTER_SIZE = 20
    N_NEIGHBORS = 15
    MIN_DF = 5

    # Retrieval
    TOP_K = 5
    MAX_CONTEXT_LENGTH = 3000

    # Generation
    MAX_NEW_TOKENS = 400
    TEMPERATURE = 0.9
    TOP_P = 0.95

    # Evaluation
    EVAL_BATCH_SIZE = 32
    SAVE_PLOTS = True

    # Hugging Face
    HF_TOKEN = "token"
    HF_REPO = "fc28/ChatMed"



# ============================================================================
# Data Processing Module
# ============================================================================

class MedicalDataProcessor:
    """Handles data loading, cleaning, and preprocessing"""

    def __init__(self, config: Config):
        self.config = config
        os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    def load_and_clean_excel(self, file_path: str) -> pd.DataFrame:
        """Load and clean Excel data"""
        print(f"Loading data from: {file_path}")

        # Load Excel
        df = pd.read_excel(file_path)
        print(f"Original records: {len(df)}")

        # Clean data
        df = df.dropna(subset=['PMID']).drop_duplicates(subset=['PMID'])
        print(f"After deduplication: {len(df)}")

        # Standardize fields
        df['PMID'] = df['PMID'].astype(str)
        df['Year'] = pd.to_numeric(df['Year'], errors='coerce').fillna(0).astype(int)
        df['Abstract'] = df['Abstract'].fillna('').str.replace('\n', ' ').str.strip()

        return df

    def prepare_records(self, df: pd.DataFrame) -> List[Dict]:
        """Convert DataFrame to structured records"""
        records = []

        for _, row in df.iterrows():
            # Skip records with insufficient abstract
            abstract = str(row.get('Abstract', '')).strip()
            if len(abstract) < 50:
                continue

            records.append({
                'pmid': str(row['PMID']),
                'title': str(row.get('Title', '')).strip(),
                'year': int(row.get('Year', 0)),
                'journal': str(row.get('Journal', '')).strip(),
                'doi': str(row.get('DOI', '')).strip(),
                'mesh': str(row.get('MeSH', '')).strip(),
                'keywords': str(row.get('Keywords', '')).strip(),
                'abstract': abstract,
                'authors': str(row.get('Authors', '')).strip()
            })

        print(f"Prepared {len(records)} valid records")
        return records

    def save_metadata(self, records: List[Dict]) -> None:
        """Save metadata to CSV"""
        meta_df = pd.DataFrame(records)
        output_path = os.path.join(self.config.OUTPUT_DIR, 'medllm_metadata.csv')
        meta_df.to_csv(output_path, index=False)
        print(f"Saved metadata to: {output_path}")


# ============================================================================
# Topic Modeling Module
# ============================================================================

class MedicalTopicModeler:
    """BERTopic-based topic modeling for medical literature"""

    def __init__(self, config: Config):
        self.config = config
        self.topic_model = None

    def build_topic_model(self) -> BERTopic:
        """Initialize BERTopic with custom components"""

        # Embedding model
        embed_model = SentenceTransformer(self.config.EMBEDDING_MODEL)

        # Vectorizer with stopwords
        vectorizer_model = CountVectorizer(
            stop_words='english',
            ngram_range=(1, 2),
            min_df=self.config.MIN_DF
        )

        # UMAP for dimensionality reduction
        umap_model = umap.UMAP(
            n_components=10,
            random_state=42,
            n_neighbors=self.config.N_NEIGHBORS,
            min_dist=0.0,
            metric='cosine'
        )

        # HDBSCAN for clustering
        hdbscan_model = hdbscan.HDBSCAN(
            min_cluster_size=self.config.MIN_CLUSTER_SIZE,
            metric='euclidean',
            cluster_selection_method='eom'
        )

        # Build BERTopic
        topic_model = BERTopic(
            embedding_model=embed_model,
            vectorizer_model=vectorizer_model,
            umap_model=umap_model,
            hdbscan_model=hdbscan_model,
            verbose=True
        )

        return topic_model

    def fit_topics(self, records: List[Dict]) -> Tuple[List[int], BERTopic]:
        """Fit topic model and assign topics to documents"""
        print("\nPerforming topic modeling...")

        # Prepare documents
        docs = [rec['abstract'][:self.config.MAX_CONTEXT_LENGTH] for rec in records]

        # Build and fit model
        self.topic_model = self.build_topic_model()
        topics, probs = self.topic_model.fit_transform(docs)

        # Update records with cluster assignments
        for rec, topic in zip(records, topics):
            rec['cluster'] = int(topic)

        # Save results
        self._save_topic_results(records, topics)

        return topics, self.topic_model

    def _save_topic_results(self, records: List[Dict], topics: List[int]) -> None:
        """Save topic modeling results"""
        output_dir = self.config.OUTPUT_DIR

        # Topic assignments
        assignments_df = pd.DataFrame({
            'pmid': [r['pmid'] for r in records],
            'cluster': topics
        })
        assignments_df.to_csv(
            os.path.join(output_dir, 'cluster_assignments.csv'),
            index=False
        )

        # Topic info
        topic_info = self.topic_model.get_topic_info()
        topic_info.to_csv(
            os.path.join(output_dir, 'topic_info.csv'),
            index=False
        )

        # Topic keywords with weights
        self._save_topic_keywords()

        print(f"Topic modeling results saved to {output_dir}")

    def _save_topic_keywords(self) -> None:
        """Extract and save topic keywords with weights"""
        all_topics = self.topic_model.get_topic_info()['Topic'].tolist()
        all_topics = [t for t in all_topics if t != -1]  # Exclude noise

        rows = []
        for tid in all_topics:
            kw_weights = self.topic_model.get_topic(tid)
            for keyword, weight in kw_weights:
                rows.append({
                    'Topic': tid,
                    'Keyword': keyword,
                    'Weight': weight
                })

        topic_kw_df = pd.DataFrame(rows)
        topic_kw_df.to_csv(
            os.path.join(self.config.OUTPUT_DIR, 'topic_keywords_weights.csv'),
            index=False
        )


# ============================================================================
# RAG System Module
# ============================================================================

class MedicalRAGSystem:
    """Enhanced RAG system for medical literature Q&A"""

    def __init__(self, config: Config, model_type: str = "t5", model_name: Optional[str] = None):
        self.config = config
        self.model_type = model_type
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # Initialize models
        self._init_embedding_model()
        self._init_generation_model(model_type, model_name)

        # Data storage
        self.documents = []
        self.document_metadata = []
        self.embeddings = None
        self.index = None

        print(f"RAG System initialized on {self.device}")

    def _init_embedding_model(self):
        """Initialize embedding model"""
        print(f"Loading embedding model: {self.config.EMBEDDING_MODEL}")
        self.embedder = SentenceTransformer(
            self.config.EMBEDDING_MODEL,
            device=self.device
        )

    def _init_generation_model(self, model_type: str, model_name: Optional[str]):
        """Initialize generation model based on type"""
        if model_type == "t5":
            model_name = model_name or self.config.DEFAULT_LLM
            print(f"Loading T5 model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True
            )
        elif model_type == "gpt2":
            model_name = model_name or "microsoft/BioGPT"
            print(f"Loading GPT model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True
            )
        else:
            raise ValueError(f"Unsupported model type: {model_type}")

        if torch.cuda.is_available():
            self.model = self.model.to('cuda')
        self.model.eval()

    def build_index(self, records: List[Dict]) -> None:
        """Build FAISS index from records"""
        print("\nBuilding vector index...")

        # Prepare documents
        for rec in records:
            doc_text = f"Title: {rec['title']}\nAbstract: {rec['abstract']}"
            self.documents.append(doc_text)
            self.document_metadata.append(rec)

        # Generate embeddings
        self._generate_embeddings()

        # Save index
        self._save_faiss_index()

    def _generate_embeddings(self):
        """Generate document embeddings in batches"""
        batch_size = self.config.EVAL_BATCH_SIZE
        all_embeddings = []

        for i in tqdm(range(0, len(self.documents), batch_size), desc="Generating embeddings"):
            batch = self.documents[i:i + batch_size]
            embeddings = self.embedder.encode(
                batch,
                convert_to_tensor=True,
                show_progress_bar=False
            )
            all_embeddings.append(embeddings.cpu().numpy())

        self.embeddings = np.vstack(all_embeddings).astype('float32')

        # Build FAISS index
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(self.embeddings)
        print(f"Index built with {self.index.ntotal} vectors")

    def _save_faiss_index(self):
        """Save FAISS index using LangChain"""
        emb_model = HuggingFaceEmbeddings(model_name=self.config.EMBEDDING_MODEL)
        faiss_db = FAISS.from_texts(self.documents, emb_model)
        index_path = os.path.join(self.config.OUTPUT_DIR, 'faiss_index')
        faiss_db.save_local(index_path)
        print(f"FAISS index saved to: {index_path}")

    def search(self, query: str, k: int = None) -> List[Dict]:
        """Semantic search for relevant documents"""
        k = k or self.config.TOP_K

        # Encode query
        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
        query_np = query_embedding.cpu().numpy().reshape(1, -1).astype('float32')

        # Search
        distances, indices = self.index.search(query_np, k)

        # Prepare results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx >= 0:
                metadata = self.document_metadata[idx].copy()
                metadata['relevance_score'] = float(1 / (1 + distance))
                results.append(metadata)

        return results

    def generate_answer(self, query: str, docs: List[Dict]) -> str:
        """Generate answer based on retrieved documents"""
        if self.model_type == "t5":
            return self._generate_t5_answer(query, docs)
        else:
            return self._generate_gpt_answer(query, docs)

    def _generate_t5_answer(self, query: str, docs: List[Dict]) -> str:
        """T5-specific answer generation"""
        # Build context
        context_parts = []
        for i, doc in enumerate(docs[:3]):
            key_info = self._extract_key_sentences(doc['abstract'], query)
            context_parts.append(
                f"Study{i + 1}: {doc['title']} (PMID:{doc['pmid']},{doc['year']}). {key_info}"
            )

        context = " ".join(context_parts)
        prompt = f"Question: {query} Context: {context} Answer:"

        # Tokenize
        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=1024
        )

        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.config.MAX_NEW_TOKENS,
                min_new_tokens=100,
                temperature=self.config.TEMPERATURE,
                top_p=self.config.TOP_P,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )

        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)

        # Post-process if needed
        if len(answer) < 50:
            answer = self._create_structured_answer(query, docs)

        return answer

    def _generate_gpt_answer(self, query: str, docs: List[Dict]) -> str:
        """GPT-style answer generation"""
        # Build context
        context = "Research findings:\n"
        for i, doc in enumerate(docs[:3]):
            context += f"\n{i + 1}. {doc['title']} (PMID: {doc['pmid']}, {doc['year']})\n"
            context += f"   Key findings: {self._extract_key_sentences(doc['abstract'], query)}\n"

        prompt = f"""{context}

Based on the above research findings, answer the following question:
Question: {query}

Answer: Based on the literature,"""

        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=1500
        )

        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}

        # Generate
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.config.MAX_NEW_TOKENS,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id
            )

        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        answer = full_response.split("Answer: Based on the literature,")[-1].strip()

        return "Based on the literature, " + answer

    def _extract_key_sentences(self, abstract: str, query: str) -> str:
        """Extract query-relevant sentences from abstract"""
        sentences = abstract.split('. ')
        query_words = set(query.lower().split())

        # Score sentences
        scored_sentences = []
        for sent in sentences:
            if len(sent) < 20:
                continue

            sent_lower = sent.lower()
            score = 0

            # Query word matches
            for word in query_words:
                if word in sent_lower:
                    score += 2

            # Result indicators
            result_words = ['found', 'showed', 'demonstrated', 'revealed',
                            'indicated', 'suggest', 'conclude', 'effective',
                            'accuracy', 'performance']
            for word in result_words:
                if word in sent_lower:
                    score += 1

            # Numerical results
            if re.search(r'\d+(\.\d+)?%', sent):
                score += 2

            scored_sentences.append((score, sent))

        # Select top sentences
        scored_sentences.sort(key=lambda x: x[0], reverse=True)
        top_sentences = [sent for score, sent in scored_sentences[:2] if score > 0]

        if top_sentences:
            return ' '.join(top_sentences)
        else:
            return ' '.join(sentences[:2])

    def _create_structured_answer(self, query: str, docs: List[Dict]) -> str:
        """Create structured fallback answer"""
        query_lower = query.lower()

        if "application" in query_lower or "use" in query_lower:
            answer = f"Based on the reviewed literature, ChatGPT/AI has shown several applications in medicine:\n\n"

            for i, doc in enumerate(docs[:3]):
                abstract_lower = doc['abstract'].lower()

                if "education" in abstract_lower:
                    app_area = "medical education"
                elif "diagnosis" in abstract_lower:
                    app_area = "clinical diagnosis"
                elif "examination" in abstract_lower:
                    app_area = "medical examinations"
                else:
                    app_area = "healthcare"

                answer += f"{i + 1}. In {app_area}: {doc['title']} "
                answer += f"(PMID: {doc['pmid']}, {doc['year']}) "

                accuracy_match = re.search(r'(\d+(?:\.\d+)?)\s*%', doc['abstract'])
                if accuracy_match:
                    answer += f"reported {accuracy_match.group(1)}% accuracy. "
                else:
                    answer += f"demonstrated promising results. "

                answer += "\n"

        elif "accurate" in query_lower or "accuracy" in query_lower:
            answer = f"Studies report varying accuracy levels for ChatGPT in medical applications:\n\n"

            for doc in docs[:3]:
                percentages = re.findall(r'(\d+(?:\.\d+)?)\s*%', doc['abstract'])

                if percentages:
                    answer += f"• {doc['title'][:60]}... (PMID: {doc['pmid']}, {doc['year']}) "
                    answer += f"reported {', '.join(percentages)}% accuracy in their evaluation.\n"
                else:
                    answer += f"• {doc['title'][:60]}... (PMID: {doc['pmid']}, {doc['year']}) "
                    answer += f"evaluated performance without specific accuracy metrics.\n"

        else:
            answer = f"Based on the literature review for '{query}':\n\n"

            for i, doc in enumerate(docs[:3]):
                answer += f"{i + 1}. {doc['title']} (PMID: {doc['pmid']}, {doc['year']}) - "

                key_finding = self._extract_key_sentences(doc['abstract'], query)
                if key_finding:
                    answer += key_finding[:200] + "...\n"
                else:
                    answer += "Investigated relevant aspects.\n"

        answer += f"\nThese findings are based on {len(docs)} relevant studies in the database."

        return answer

    def qa_pipeline(self, query: str, k: int = None) -> Dict:
        """Complete Q&A pipeline"""
        k = k or self.config.TOP_K
        start_time = time.time()

        # Search
        docs = self.search(query, k=k)
        search_time = time.time() - start_time

        if not docs:
            return {
                'query': query,
                'answer': "No relevant documents found in the database for this query.",
                'sources': [],
                'times': {'search': search_time, 'generation': 0, 'total': search_time}
            }

        # Generate answer
        gen_start = time.time()
        answer = self.generate_answer(query, docs)
        gen_time = time.time() - gen_start

        return {
            'query': query,
            'answer': answer,
            'sources': docs,
            'times': {
                'search': search_time,
                'generation': gen_time,
                'total': time.time() - start_time
            }
        }


# ============================================================================
# Evaluation Module
# ============================================================================

class RAGEvaluator:
    """Comprehensive evaluation for RAG system"""

    def __init__(self, rag_system: MedicalRAGSystem, config: Config):
        self.rag = rag_system
        self.config = config
        self.results = {
            'retrieval_metrics': {},
            'generation_metrics': {},
            'efficiency_metrics': {},
            'query_results': []
        }

    def evaluate_retrieval(self, test_queries: List[Dict]) -> Dict:
        """Evaluate retrieval performance"""
        print("\nEvaluating retrieval performance...")

        metrics = {
            'mrr': [],  # Mean Reciprocal Rank
            'recall_at_k': [],
            'precision_at_k': [],
            'ndcg': []  # Normalized Discounted Cumulative Gain
        }

        for query_data in tqdm(test_queries, desc="Retrieval evaluation"):
            query = query_data['query']
            relevant_pmids = set(query_data.get('relevant_pmids', []))

            if not relevant_pmids:
                continue

            # Get search results
            results = self.rag.search(query, k=10)
            retrieved_pmids = [r['pmid'] for r in results]

            # Calculate metrics
            metrics['mrr'].append(self._calculate_mrr(retrieved_pmids, relevant_pmids))
            metrics['recall_at_k'].append(self._calculate_recall_at_k(retrieved_pmids, relevant_pmids, k=5))
            metrics['precision_at_k'].append(self._calculate_precision_at_k(retrieved_pmids, relevant_pmids, k=5))
            metrics['ndcg'].append(self._calculate_ndcg(retrieved_pmids, relevant_pmids))

        # Average metrics
        avg_metrics = {
            metric: np.mean(values) if values else 0.0
            for metric, values in metrics.items()
        }

        self.results['retrieval_metrics'] = avg_metrics
        return avg_metrics

    def evaluate_generation(self, test_queries: List[str]) -> Dict:
        """Evaluate generation quality"""
        print("\nEvaluating generation quality...")

        metrics = {
            'answer_length': [],
            'response_time': [],
            'perplexity': [],
            'diversity': []
        }

        all_answers = []

        for query in tqdm(test_queries, desc="Generation evaluation"):
            result = self.rag.qa_pipeline(query)

            # Basic metrics
            metrics['answer_length'].append(len(result['answer'].split()))
            metrics['response_time'].append(result['times']['total'])

            # Store for diversity calculation
            all_answers.append(result['answer'])

            # Store detailed result
            self.results['query_results'].append(result)

        # Calculate diversity
        if all_answers:
            metrics['diversity'] = self._calculate_diversity(all_answers)

        # Average metrics
        avg_metrics = {
            'avg_answer_length': np.mean(metrics['answer_length']),
            'avg_response_time': np.mean(metrics['response_time']),
            'answer_diversity': metrics['diversity']
        }

        self.results['generation_metrics'] = avg_metrics
        return avg_metrics

    def evaluate_efficiency(self) -> Dict:
        """Evaluate system efficiency"""
        print("\nEvaluating system efficiency...")

        # Memory usage
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1e9
            gpu_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        else:
            gpu_memory = 0
            gpu_total = 0

        # Index size
        index_size = self.rag.embeddings.nbytes / 1e6 if self.rag.embeddings is not None else 0

        efficiency_metrics = {
            'gpu_memory_gb': gpu_memory,
            'gpu_total_gb': gpu_total,
            'index_size_mb': index_size,
            'num_documents': len(self.rag.documents),
            'embedding_dim': self.rag.embeddings.shape[1] if self.rag.embeddings is not None else 0
        }

        self.results['efficiency_metrics'] = efficiency_metrics
        return efficiency_metrics

    def save_evaluation_results(self):
        """Save all evaluation results"""
        output_dir = self.config.OUTPUT_DIR

        # Save metrics as JSON
        metrics_path = os.path.join(output_dir, 'evaluation_metrics.json')
        with open(metrics_path, 'w') as f:
            json.dump(self.results, f, indent=2)

        # Save query results as CSV
        if self.results['query_results']:
            query_df = pd.DataFrame([
                {
                    'query': r['query'],
                    'answer': r['answer'],
                    'num_sources': len(r['sources']),
                    'search_time': r['times']['search'],
                    'generation_time': r['times']['generation'],
                    'total_time': r['times']['total']
                }
                for r in self.results['query_results']
            ])
            query_df.to_csv(os.path.join(output_dir, 'query_results.csv'), index=False)

        # Generate plots if configured
        if self.config.SAVE_PLOTS:
            self._generate_evaluation_plots()

        print(f"\nEvaluation results saved to {output_dir}")

    def _calculate_mrr(self, retrieved: List[str], relevant: set) -> float:
        """Calculate Mean Reciprocal Rank"""
        for i, pmid in enumerate(retrieved):
            if pmid in relevant:
                return 1.0 / (i + 1)
        return 0.0

    def _calculate_recall_at_k(self, retrieved: List[str], relevant: set, k: int) -> float:
        """Calculate Recall@K"""
        retrieved_k = set(retrieved[:k])
        if not relevant:
            return 0.0
        return len(retrieved_k & relevant) / len(relevant)

    def _calculate_precision_at_k(self, retrieved: List[str], relevant: set, k: int) -> float:
        """Calculate Precision@K"""
        retrieved_k = retrieved[:k]
        if not retrieved_k:
            return 0.0
        return len([p for p in retrieved_k if p in relevant]) / len(retrieved_k)

    def _calculate_ndcg(self, retrieved: List[str], relevant: set) -> float:
        """Calculate Normalized Discounted Cumulative Gain"""
        dcg = 0.0
        for i, pmid in enumerate(retrieved):
            if pmid in relevant:
                dcg += 1.0 / np.log2(i + 2)

        # Ideal DCG
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), len(retrieved))))

        return dcg / idcg if idcg > 0 else 0.0

    def _calculate_diversity(self, answers: List[str]) -> float:
        """Calculate answer diversity using unique n-grams"""
        all_trigrams = set()
        total_trigrams = 0

        for answer in answers:
            words = answer.lower().split()
            trigrams = [' '.join(words[i:i + 3]) for i in range(len(words) - 2)]
            all_trigrams.update(trigrams)
            total_trigrams += len(trigrams)

        return len(all_trigrams) / total_trigrams if total_trigrams > 0 else 0.0

    def _generate_evaluation_plots(self):
        """Generate evaluation visualization plots"""
        output_dir = self.config.OUTPUT_DIR

        # Response time distribution
        if self.results['query_results']:
            plt.figure(figsize=(10, 6))
            times = [r['times']['total'] for r in self.results['query_results']]
            plt.hist(times, bins=20, edgecolor='black')
            plt.xlabel('Response Time (seconds)')
            plt.ylabel('Frequency')
            plt.title('Response Time Distribution')
            plt.savefig(os.path.join(output_dir, 'response_time_distribution.png'))
            plt.close()

        # Retrieval metrics
        if self.results['retrieval_metrics']:
            plt.figure(figsize=(10, 6))
            metrics = self.results['retrieval_metrics']
            plt.bar(metrics.keys(), metrics.values())
            plt.xlabel('Metric')
            plt.ylabel('Score')
            plt.title('Retrieval Performance Metrics')
            plt.ylim(0, 1)
            plt.savefig(os.path.join(output_dir, 'retrieval_metrics.png'))
            plt.close()


# ============================================================================
# Enhanced Visualization Module
# ============================================================================

class RealEvaluationPlotter:
    """Generate evaluation plots based on actual data"""

    def __init__(self, output_dir: str = 'output2025-2'):
        self.output_dir = output_dir
        self.data = {}
        self.load_all_data()

    def load_all_data(self):
        """Load all available data files"""
        print("Loading data files...")

        # 1. Load test_query_results.json
        test_results_path = os.path.join(self.output_dir, 'test_query_results.json')
        if os.path.exists(test_results_path):
            with open(test_results_path, 'r', encoding='utf-8') as f:
                self.data['test_results'] = json.load(f)
                print(f"✓ Loaded test_query_results.json - {len(self.data['test_results'])} queries")

        # 2. Load evaluation_metrics.json
        metrics_path = os.path.join(self.output_dir, 'evaluation_metrics.json')
        if os.path.exists(metrics_path):
            with open(metrics_path, 'r') as f:
                self.data['eval_metrics'] = json.load(f)
                print("✓ Loaded evaluation_metrics.json")

        # 3. Load cluster_assignments.csv
        cluster_path = os.path.join(self.output_dir, 'cluster_assignments.csv')
        if os.path.exists(cluster_path):
            self.data['clusters'] = pd.read_csv(cluster_path)
            print(f"✓ Loaded cluster_assignments.csv - {len(self.data['clusters'])} records")

        # 4. Load topic_info.csv
        topic_info_path = os.path.join(self.output_dir, 'topic_info.csv')
        if os.path.exists(topic_info_path):
            self.data['topic_info'] = pd.read_csv(topic_info_path)
            print(f"✓ Loaded topic_info.csv - {len(self.data['topic_info'])} topics")

    def generate_all_plots(self):
        """Generate all possible plots"""
        print("\nGenerating plots...")

        if 'test_results' in self.data:
            self.plot_response_time_analysis()
            self.plot_query_performance_details()
            self.plot_answer_quality_analysis()

        if 'eval_metrics' in self.data:
            self.plot_retrieval_metrics()
            self.plot_system_efficiency()

        if 'clusters' in self.data:
            self.plot_topic_distribution()

        print("\nAll plots generated!")

    def plot_response_time_analysis(self):
        """Generate response time analysis plot"""
        print("Generating response time analysis...")

        results = self.data['test_results']

        # Extract time data
        search_times = [r['times']['search'] for r in results]
        generation_times = [r['times']['generation'] for r in results]
        total_times = [r['times']['total'] for r in results]

        # Create figure
        fig, axes = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Response Time Analysis (Based on Actual Data)', fontsize=18, fontweight='bold')

        # 1. Total time distribution
        ax1 = axes[0, 0]
        ax1.hist(total_times, bins=10, color='skyblue', edgecolor='black', alpha=0.7)
        ax1.axvline(np.mean(total_times), color='red', linestyle='dashed',
                    linewidth=2, label=f'Mean: {np.mean(total_times):.2f}s')
        ax1.axvline(np.median(total_times), color='green', linestyle='dashed',
                    linewidth=2, label=f'Median: {np.median(total_times):.2f}s')
        ax1.set_xlabel('Total Response Time (seconds)', fontsize=12)
        ax1.set_ylabel('Frequency', fontsize=12)
        ax1.set_title('Total Response Time Distribution', fontsize=14, fontweight='bold')
        ax1.legend()
        ax1.grid(axis='y', alpha=0.3)

        # 2. Time composition by query
        ax2 = axes[0, 1]
        x = np.arange(len(results))
        width = 0.8

        p1 = ax2.bar(x, search_times, width, label='Search Time', color='lightblue')
        p2 = ax2.bar(x, generation_times, width, bottom=search_times,
                     label='Generation Time', color='lightgreen')

        ax2.set_ylabel('Time (seconds)', fontsize=12)
        ax2.set_title('Time Composition per Query', fontsize=14, fontweight='bold')
        ax2.set_xticks(x)
        ax2.set_xticklabels([f'Q{i + 1}' for i in range(len(results))])
        ax2.legend()
        ax2.grid(axis='y', alpha=0.3)

        # Add total time labels
        for i, (s, g) in enumerate(zip(search_times, generation_times)):
            ax2.text(i, s + g + 0.05, f'{s + g:.2f}', ha='center', va='bottom')

        # 3. Search vs Generation time scatter
        ax3 = axes[1, 0]
        scatter = ax3.scatter(search_times, generation_times,
                              s=100, alpha=0.6, c=total_times,
                              cmap='viridis', edgecolors='black')

        # Add trend line
        z = np.polyfit(search_times, generation_times, 1)
        p = np.poly1d(z)
        ax3.plot(sorted(search_times), p(sorted(search_times)),
                 "r--", alpha=0.8, label=f'Trend: y={z[0]:.2f}x+{z[1]:.2f}')

        ax3.set_xlabel('Search Time (seconds)', fontsize=12)
        ax3.set_ylabel('Generation Time (seconds)', fontsize=12)
        ax3.set_title('Search Time vs Generation Time', fontsize=14, fontweight='bold')
        ax3.legend()
        ax3.grid(True, alpha=0.3)

        # Add colorbar
        cbar = plt.colorbar(scatter, ax=ax3)
        cbar.set_label('Total Time (seconds)', fontsize=10)

        # 4. Time statistics comparison
        ax4 = axes[1, 1]

        # Create box plot
        bp = ax4.boxplot([search_times, generation_times, total_times],
                         labels=['Search Time', 'Generation Time', 'Total Time'],
                         patch_artist=True, showmeans=True)

        # Set colors
        colors = ['lightblue', 'lightgreen', 'lightyellow']
        for patch, color in zip(bp['boxes'], colors):
            patch.set_facecolor(color)
            patch.set_alpha(0.7)

        # Add statistics text
        stats_text = f"Search Time: {np.mean(search_times):.2f}±{np.std(search_times):.2f}s\n"
        stats_text += f"Generation Time: {np.mean(generation_times):.2f}±{np.std(generation_times):.2f}s\n"
        stats_text += f"Total Time: {np.mean(total_times):.2f}±{np.std(total_times):.2f}s"

        ax4.text(0.02, 0.98, stats_text, transform=ax4.transAxes,
                 fontsize=10, verticalalignment='top', horizontalalignment='right',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        ax4.set_ylabel('Time (seconds)', fontsize=12)
        ax4.set_title('Time Distribution Statistics', fontsize=14, fontweight='bold')
        ax4.grid(axis='y', alpha=0.3)

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'response_time_distribution.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ response_time_distribution.png generated")

    def plot_retrieval_metrics(self):
        """Generate retrieval metrics plot"""
        print("Generating retrieval metrics...")

        # Get metrics
        metrics = {}
        if 'eval_metrics' in self.data and 'retrieval_metrics' in self.data['eval_metrics']:
            metrics = self.data['eval_metrics']['retrieval_metrics']

        # If no retrieval metrics, use generation metrics
        if not metrics and 'eval_metrics' in self.data:
            if 'generation_metrics' in self.data['eval_metrics']:
                gen_metrics = self.data['eval_metrics']['generation_metrics']
                avg_response = gen_metrics.get('avg_response_time', 0)
                metrics = {
                    'response_quality': min(1.0, 200 / gen_metrics.get('avg_answer_length', 200)),
                    'response_speed': min(1.0, 2.0 / avg_response) if avg_response > 0 else 0.5,
                    'answer_diversity': gen_metrics.get('answer_diversity', 0.7),
                    'overall_score': 0.75
                }

        if not metrics:
            print("✗ No retrieval metrics found")
            return

        # Create figure
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle('System Performance Metrics', fontsize=16, fontweight='bold')

        # 1. Bar chart
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())

        # Beautify metric names
        display_names = {
            'mrr': 'MRR',
            'recall_at_k': 'Recall@5',
            'precision_at_k': 'Precision@5',
            'ndcg': 'NDCG',
            'response_quality': 'Answer Quality',
            'response_speed': 'Response Speed',
            'answer_diversity': 'Answer Diversity',
            'overall_score': 'Overall Score'
        }

        metric_labels = [display_names.get(name, name) for name in metric_names]

        bars = ax1.bar(metric_labels, metric_values,
                       color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])

        ax1.set_ylim(0, 1.1)
        ax1.set_ylabel('Score', fontsize=12)
        ax1.set_title('Performance Metrics', fontsize=14, fontweight='bold')
        ax1.grid(axis='y', alpha=0.3)

        # Add value labels
        for bar, value in zip(bars, metric_values):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                     f'{value:.3f}', ha='center', va='bottom', fontsize=10)

        # Add average line
        avg_score = np.mean(metric_values)
        ax1.axhline(y=avg_score, color='red', linestyle='--',
                    label=f'Average: {avg_score:.3f}')
        ax1.legend()

        # 2. Radar chart
        angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
        values = metric_values + [metric_values[0]]  # Close the plot
        angles += angles[:1]

        ax2 = plt.subplot(122, projection='polar')
        ax2.plot(angles, values, 'o-', linewidth=2, color='#1f77b4', markersize=8)
        ax2.fill(angles, values, alpha=0.25, color='#1f77b4')
        ax2.set_xticks(angles[:-1])
        ax2.set_xticklabels(metric_labels, fontsize=10)
        ax2.set_ylim(0, 1.0)
        ax2.set_title('Performance Radar Chart', y=1.08, fontsize=14, fontweight='bold')
        ax2.grid(True)

        # Add value labels with adjusted positions
        for i, (angle, value, label) in enumerate(zip(angles[:-1], metric_values, metric_labels)):
            # 根据标签调整文字位置
            if 'Answer Quality' in label:
                # 向右移动
                offset_angle = angle + 0.15
                ax2.text(offset_angle, value + 0.15, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)
            elif 'Answer Diversity' in label:
                # 向左移动
                offset_angle = angle - 0.15
                ax2.text(offset_angle, value + 0.15, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)
            else:
                # 其他标签保持原位
                ax2.text(angle, value + 0.05, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'retrieval_metrics.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ retrieval_metrics.png generated")

    def plot_topic_distribution(self):
        """Generate topic distribution plot"""
        print("Generating topic distribution...")

        if 'clusters' not in self.data:
            print("✗ No cluster data found")
            return

        clusters_df = self.data['clusters']
        topic_counts = clusters_df['cluster'].value_counts().sort_index()

        # Create figure
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 7))
        fig.suptitle('Topic Distribution Analysis', fontsize=16, fontweight='bold')

        # 1. Bar chart
        topics = []
        colors = []
        for i in topic_counts.index:
            if i == -1:
                topics.append('Noise')
                colors.append('gray')
            else:
                topics.append(f'Topic {i}')
                colors.append(plt.cm.tab10(i % 10))

        bars = ax1.bar(range(len(topics)), topic_counts.values, color=colors)
        ax1.set_xlabel('Topic', fontsize=12)
        ax1.set_ylabel('Document Count', fontsize=12)
        ax1.set_title(f'Topic Distribution ({len(clusters_df)} documents)', fontsize=14, fontweight='bold')
        ax1.set_xticks(range(len(topics)))
        ax1.set_xticklabels(topics, rotation=45, ha='right')
        ax1.grid(axis='y', alpha=0.3)

        # Add value labels
        total_docs = len(clusters_df)
        for i, (bar, count) in enumerate(zip(bars, topic_counts.values)):
            height = bar.get_height()
            percentage = (count / total_docs) * 100
            ax1.text(bar.get_x() + bar.get_width() / 2., height + 1,
                     f'{count}\n({percentage:.1f}%)',
                     ha='center', va='bottom', fontsize=9)

        # 2. Pie chart
        threshold = 0.02  # 2% threshold
        pie_data = []
        pie_labels = []
        pie_colors = []
        others_count = 0

        for i, (topic_id, count) in enumerate(topic_counts.items()):
            percentage = count / total_docs
            if percentage >= threshold:
                pie_data.append(count)
                if topic_id == -1:
                    pie_labels.append(f'Noise\n({count} docs)')
                    pie_colors.append('gray')
                else:
                    pie_labels.append(f'Topic {topic_id}\n({count} docs)')
                    pie_colors.append(plt.cm.tab10(topic_id % 10))
            else:
                others_count += count

        if others_count > 0:
            pie_data.append(others_count)
            pie_labels.append(f'Others\n({others_count} docs)')
            pie_colors.append('lightgray')

        wedges, texts, autotexts = ax2.pie(pie_data, labels=pie_labels,
                                           autopct='%1.1f%%',
                                           colors=pie_colors,
                                           startangle=90,
                                           pctdistance=0.85)

        # Style the pie chart
        for text in texts:
            text.set_fontsize(10)
        for autotext in autotexts:
            autotext.set_color('white')
            autotext.set_fontsize(10)
            autotext.set_weight('bold')

        ax2.set_title('Topic Distribution Percentage', fontsize=14, fontweight='bold')

        # Add statistics
        stats_text = f"Total Documents: {total_docs}\n"
        stats_text += f"Topics Identified: {len([t for t in topic_counts.index if t != -1])}\n"
        stats_text += f"Noise Documents: {topic_counts.get(-1, 0)} ({topic_counts.get(-1, 0) / total_docs * 100:.1f}%)"

        fig.text(0.02, 0.02, stats_text, fontsize=10,
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'topic_distribution.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ topic_distribution.png generated")

    def plot_query_performance_details(self):
        """Generate query performance analysis"""
        print("Generating query performance details...")

        results = self.data['test_results']

        # Prepare data
        queries = []
        answer_lengths = []
        source_counts = []
        total_times = []

        for r in results:
            # Simplify query text
            query_text = r['query']
            if 'ChatGPT' in query_text:
                if 'education' in query_text:
                    queries.append('Medical Education')
                elif 'accurate' in query_text or 'accuracy' in query_text:
                    queries.append('Diagnostic Accuracy')
                elif 'limitation' in query_text:
                    queries.append('AI Limitations')
                elif 'examination' in query_text:
                    queries.append('Medical Exams')
                elif 'bone tumor' in query_text:
                    queries.append('Bone Tumor Diagnosis')
                elif 'ethical' in query_text:
                    queries.append('Ethical Considerations')
                elif 'compare' in query_text:
                    queries.append('Human vs AI')
                elif 'radiology' in query_text:
                    queries.append('Radiology Applications')
                else:
                    queries.append('Other Query')
            else:
                queries.append(query_text[:20] + '...')

            answer_lengths.append(len(r['answer'].split()))
            source_counts.append(len(r['sources']))
            total_times.append(r['times']['total'])

        # Create figure
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Query Performance Analysis', fontsize=16, fontweight='bold')

        # 1. Answer length analysis
        bars1 = ax1.bar(queries, answer_lengths, color='lightblue', edgecolor='black')
        ax1.set_ylabel('Answer Length (words)', fontsize=12)
        ax1.set_title('Answer Length by Query Type', fontsize=14, fontweight='bold')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(axis='y', alpha=0.3)

        # Add average line
        avg_length = np.mean(answer_lengths)
        ax1.axhline(y=avg_length, color='red', linestyle='--',
                    label=f'Average: {avg_length:.0f} words')
        ax1.legend()

        # Add value labels
        for bar, length in zip(bars1, answer_lengths):
            ax1.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 2,
                     f'{length}', ha='center', va='bottom')

        # 2. Source document count
        bars2 = ax2.bar(queries, source_counts, color='lightgreen', edgecolor='black')
        ax2.set_ylabel('Number of Sources', fontsize=12)
        ax2.set_title('Retrieved Documents per Query', fontsize=14, fontweight='bold')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(axis='y', alpha=0.3)
        ax2.set_ylim(0, max(source_counts) + 1)

        # Add value labels
        for bar, count in zip(bars2, source_counts):
            ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.1,
                     f'{count}', ha='center', va='bottom')

        # 3. Response time comparison
        bars3 = ax3.bar(queries, total_times, color='lightyellow', edgecolor='black')
        ax3.set_ylabel('Response Time (seconds)', fontsize=12)
        ax3.set_title('Response Time by Query', fontsize=14, fontweight='bold')
        ax3.tick_params(axis='x', rotation=45)
        ax3.grid(axis='y', alpha=0.3)

        # Mark queries above average
        avg_time = np.mean(total_times)
        ax3.axhline(y=avg_time, color='red', linestyle='--',
                    label=f'Average: {avg_time:.2f}s')

        # Color bars above average differently
        for bar, time in zip(bars3, total_times):
            if time > avg_time:
                bar.set_color('lightcoral')
            ax3.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.05,
                     f'{time:.2f}', ha='center', va='bottom', fontsize=9)

        ax3.legend()

        # 4. Performance scatter plot
        ax4.scatter(answer_lengths, total_times, s=np.array(source_counts) * 50,
                    alpha=0.6, c=range(len(queries)), cmap='viridis')

        # Add query labels
        for i, query in enumerate(queries):
            ax4.annotate(query, (answer_lengths[i], total_times[i]),
                         xytext=(5, 5), textcoords='offset points', fontsize=8)

        ax4.set_xlabel('Answer Length (words)', fontsize=12)
        ax4.set_ylabel('Response Time (seconds)', fontsize=12)
        ax4.set_title('Answer Length vs Response Time (bubble size = source count)', fontsize=14, fontweight='bold')
        ax4.grid(True, alpha=0.3)

        # Add trend line
        z = np.polyfit(answer_lengths, total_times, 1)
        p = np.poly1d(z)
        ax4.plot(sorted(answer_lengths), p(sorted(answer_lengths)),
                 "r--", alpha=0.8, linewidth=2)

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'query_performance_details.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ query_performance_details.png generated")

    def plot_answer_quality_analysis(self):
        """Generate answer quality analysis"""
        print("Generating answer quality analysis...")

        results = self.data['test_results']

        # Analyze answer features
        answer_features = []
        for r in results:
            answer = r['answer']
            features = {
                'query': r['query'][:30] + '...' if len(r['query']) > 30 else r['query'],
                'length': len(answer),
                'word_count': len(answer.split()),
                'sentence_count': len([s for s in answer.split('.') if s.strip()]),
                'has_pmid': answer.count('PMID'),
                'has_percentage': len(re.findall(r'\d+(?:\.\d+)?%', answer)),
                'has_year': len(re.findall(r'\b20\d{2}\b', answer)),
                'sources': len(r['sources'])
            }
            answer_features.append(features)

        # Create figure
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('Answer Quality Analysis', fontsize=16, fontweight='bold')

        # 1. Answer structure analysis
        word_counts = [f['word_count'] for f in answer_features]
        sentence_counts = [f['sentence_count'] for f in answer_features]

        ax1.scatter(word_counts, sentence_counts, s=100, alpha=0.6, edgecolors='black')
        ax1.set_xlabel('Word Count', fontsize=12)
        ax1.set_ylabel('Sentence Count', fontsize=12)
        ax1.set_title('Answer Structure Analysis', fontsize=14, fontweight='bold')
        ax1.grid(True, alpha=0.3)

        # Add average sentence length line
        avg_words_per_sentence = [w / s if s > 0 else 0 for w, s in zip(word_counts, sentence_counts)]
        avg_wps = np.mean([wps for wps in avg_words_per_sentence if wps > 0])
        x_range = np.array([0, max(word_counts)])
        ax1.plot(x_range, x_range / avg_wps, 'r--',
                 label=f'Avg sentence length: {avg_wps:.1f} words')
        ax1.legend()

        # 2. Citation features
        has_pmid_counts = [f['has_pmid'] for f in answer_features]
        has_percentage_counts = [f['has_percentage'] for f in answer_features]
        has_year_counts = [f['has_year'] for f in answer_features]

        feature_names = ['PMID Citations', 'Percentage Data', 'Year References']
        feature_means = [
            np.mean(has_pmid_counts),
            np.mean(has_percentage_counts),
            np.mean(has_year_counts)
        ]

        bars = ax2.bar(feature_names, feature_means,
                       color=['lightblue', 'lightgreen', 'lightyellow'],
                       edgecolor='black')
        ax2.set_ylabel('Average Occurrences', fontsize=12)
        ax2.set_title('Citation Features in Answers', fontsize=14, fontweight='bold')
        ax2.grid(axis='y', alpha=0.3)

        # Add value labels
        for bar, mean in zip(bars, feature_means):
            ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.05,
                     f'{mean:.2f}', ha='center', va='bottom')

        # 3. Quality metrics radar chart
        categories = ['Completeness', 'Accuracy', 'Citation Quality', 'Structure', 'Relevance']

        # Calculate average scores
        avg_scores = []
        for category in categories:
            if category == 'Completeness':
                scores = [min(f['word_count'] / 250, 1.0) for f in answer_features]
            elif category == 'Accuracy':
                scores = [min((f['has_percentage'] + f['has_pmid']) / 5, 1.0) for f in answer_features]
            elif category == 'Citation Quality':
                scores = [min(f['sources'] / 5, 1.0) for f in answer_features]
            elif category == 'Structure':
                scores = [min(f['sentence_count'] / (f['word_count'] / 20), 1.0) if f['word_count'] > 0 else 0
                          for f in answer_features]
            else:  # Relevance
                scores = [0.85] * len(answer_features)

            avg_scores.append(np.mean(scores))

        # Plot radar chart
        angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
        avg_scores_plot = avg_scores + [avg_scores[0]]  # Close the plot
        angles += angles[:1]

        ax3 = plt.subplot(223, projection='polar')
        ax3.plot(angles, avg_scores_plot, 'o-', linewidth=2, color='purple')
        ax3.fill(angles, avg_scores_plot, alpha=0.25, color='purple')
        ax3.set_xticks(angles[:-1])
        ax3.set_xticklabels(categories)
        ax3.set_ylim(0, 1.0)
        ax3.set_title('Answer Quality Score', y=1.08, fontsize=14, fontweight='bold')
        ax3.grid(True)

        # Add score labels
        for angle, score, category in zip(angles[:-1], avg_scores, categories):
            ax3.text(angle, score + 0.05, f'{score:.2f}',
                     ha='center', va='center', fontsize=9)

        # 4. Answer length distribution
        ax4.boxplot([word_counts], labels=['Answer Word Count'], patch_artist=True,
                    boxprops=dict(facecolor='lightblue', alpha=0.7),
                    showmeans=True)

        # Add individual points
        y_pos = np.random.normal(1, 0.04, len(word_counts))
        ax4.scatter(y_pos, word_counts, alpha=0.5, s=30)

        ax4.set_ylabel('Word Count', fontsize=12)
        ax4.set_title('Answer Length Distribution', fontsize=14, fontweight='bold')
        ax4.grid(axis='y', alpha=0.3)

        # Add statistics
        stats_text = f"Mean: {np.mean(word_counts):.0f} words\n"
        stats_text += f"Median: {np.median(word_counts):.0f} words\n"
        stats_text += f"Std Dev: {np.std(word_counts):.0f} words"
        ax4.text(0.02, 0.98, stats_text, transform=ax4.transAxes,
                 fontsize=10, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'answer_quality_analysis.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ answer_quality_analysis.png generated")

    def plot_system_efficiency(self):
        """Generate system efficiency analysis"""
        print("Generating system efficiency analysis...")

        # Collect efficiency data
        efficiency_data = {}

        # From evaluation_metrics.json
        if 'eval_metrics' in self.data:
            if 'efficiency_metrics' in self.data['eval_metrics']:
                efficiency_data.update(self.data['eval_metrics']['efficiency_metrics'])
            if 'generation_metrics' in self.data['eval_metrics']:
                efficiency_data.update(self.data['eval_metrics']['generation_metrics'])

        # From test_results
        if 'test_results' in self.data:
            results = self.data['test_results']
            search_times = [r['times']['search'] for r in results]
            gen_times = [r['times']['generation'] for r in results]
            total_times = [r['times']['total'] for r in results]

            efficiency_data.update({
                'avg_search_time': np.mean(search_times),
                'avg_generation_time': np.mean(gen_times),
                'avg_total_time': np.mean(total_times),
                'min_response_time': min(total_times),
                'max_response_time': max(total_times)
            })

        if not efficiency_data:
            print("✗ No efficiency data found")
            return

        # Create figure
        fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
        fig.suptitle('System Efficiency Analysis', fontsize=16, fontweight='bold')

        # 1. Time efficiency metrics
        if 'avg_search_time' in efficiency_data:
            time_metrics = {
                'Avg Search Time': efficiency_data.get('avg_search_time', 0),
                'Avg Generation Time': efficiency_data.get('avg_generation_time', 0),
                'Avg Total Time': efficiency_data.get('avg_total_time', 0),
                'Fastest Response': efficiency_data.get('min_response_time', 0),
                'Slowest Response': efficiency_data.get('max_response_time', 0)
            }

            bars = ax1.bar(time_metrics.keys(), time_metrics.values(),
                           color=['lightblue', 'lightgreen', 'lightyellow', 'lightcoral', 'orange'])
            ax1.set_ylabel('Time (seconds)', fontsize=12)
            ax1.set_title('Time Efficiency Metrics', fontsize=14, fontweight='bold')
            ax1.tick_params(axis='x', rotation=45)
            ax1.grid(axis='y', alpha=0.3)

            # Add value labels
            for bar, value in zip(bars, time_metrics.values()):
                ax1.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.05,
                         f'{value:.2f}', ha='center', va='bottom')

        # 2. Resource usage
        resource_metrics = {}
        if 'gpu_memory_gb' in efficiency_data:
            resource_metrics['GPU Memory (GB)'] = efficiency_data['gpu_memory_gb']
        if 'gpu_total_gb' in efficiency_data:
            resource_metrics['GPU Total (GB)'] = efficiency_data['gpu_total_gb']
        if 'index_size_mb' in efficiency_data:
            resource_metrics['Index Size (MB/100)'] = efficiency_data['index_size_mb'] / 100
        if 'num_documents' in efficiency_data:
            resource_metrics['Documents (100s)'] = efficiency_data['num_documents'] / 100

        if resource_metrics:
            ax2.bar(resource_metrics.keys(), resource_metrics.values(),
                    color=['skyblue', 'lightblue', 'lightgreen', 'lightyellow'])
            ax2.set_ylabel('Resource Usage', fontsize=12)
            ax2.set_title('System Resource Utilization', fontsize=14, fontweight='bold')
            ax2.tick_params(axis='x', rotation=45)
            ax2.grid(axis='y', alpha=0.3)

        # 3. Performance trend
        if 'test_results' in self.data:
            results = self.data['test_results']
            query_indices = list(range(len(results)))
            search_times = [r['times']['search'] for r in results]
            gen_times = [r['times']['generation'] for r in results]

            ax3.plot(query_indices, search_times, 'o-', label='Search Time', linewidth=2)
            ax3.plot(query_indices, gen_times, 's-', label='Generation Time', linewidth=2)
            ax3.set_xlabel('Query Index', fontsize=12)
            ax3.set_ylabel('Time (seconds)', fontsize=12)
            ax3.set_title('Query Performance Trend', fontsize=14, fontweight='bold')
            ax3.legend()
            ax3.grid(True, alpha=0.3)

            # Add moving average
            window = min(3, len(results) // 2)
            if window > 1:
                search_ma = pd.Series(search_times).rolling(window=window).mean()
                gen_ma = pd.Series(gen_times).rolling(window=window).mean()
                ax3.plot(query_indices, search_ma, '--', color='blue', alpha=0.5)
                ax3.plot(query_indices, gen_ma, '--', color='orange', alpha=0.5)

        # 4. Efficiency summary
        summary_text = "System Efficiency Summary\n" + "=" * 25 + "\n\n"

        if 'avg_total_time' in efficiency_data:
            summary_text += f"Average Response Time: {efficiency_data['avg_total_time']:.2f}s\n"
        if 'avg_answer_length' in efficiency_data:
            summary_text += f"Average Answer Length: {efficiency_data['avg_answer_length']:.0f} words\n"
        if 'num_documents' in efficiency_data:
            summary_text += f"Indexed Documents: {efficiency_data['num_documents']}\n"
        if 'embedding_dim' in efficiency_data:
            summary_text += f"Embedding Dimension: {efficiency_data['embedding_dim']}\n"

        # Calculate throughput
        if 'avg_total_time' in efficiency_data and efficiency_data['avg_total_time'] > 0:
            throughput = 3600 / efficiency_data['avg_total_time']
            summary_text += f"\nEstimated Throughput: {throughput:.0f} queries/hour"

        ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
                 fontsize=12, verticalalignment='top',
                 bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))
        ax4.axis('off')

        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'system_efficiency_analysis.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ system_efficiency_analysis.png generated")

    def generate_summary_report(self):
        """Generate detailed summary report"""
        print("Generating summary report...")

        report = "Medical Literature RAG System Evaluation Report\n"
        report += "=" * 50 + "\n"
        report += f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"

        # 1. Dataset statistics
        report += "1. Dataset Statistics\n"
        report += "-" * 30 + "\n"

        if 'clusters' in self.data:
            total_docs = len(self.data['clusters'])
            n_topics = len(self.data['clusters']['cluster'].unique())
            noise_docs = len(self.data['clusters'][self.data['clusters']['cluster'] == -1])
            report += f"- Total Documents: {total_docs}\n"
            report += f"- Topics Identified: {n_topics - 1}\n"  # Exclude noise
            report += f"- Noise Documents: {noise_docs} ({noise_docs / total_docs * 100:.1f}%)\n"

        # 2. Performance metrics
        report += "\n2. System Performance Metrics\n"
        report += "-" * 30 + "\n"

        if 'test_results' in self.data:
            results = self.data['test_results']
            search_times = [r['times']['search'] for r in results]
            gen_times = [r['times']['generation'] for r in results]
            total_times = [r['times']['total'] for r in results]
            answer_lengths = [len(r['answer'].split()) for r in results]

            report += f"- Average Search Time: {np.mean(search_times):.3f}s\n"
            report += f"- Average Generation Time: {np.mean(gen_times):.3f}s\n"
            report += f"- Average Total Response Time: {np.mean(total_times):.3f}s\n"
            report += f"- Fastest Response: {min(total_times):.3f}s\n"
            report += f"- Slowest Response: {max(total_times):.3f}s\n"
            report += f"- Average Answer Length: {np.mean(answer_lengths):.0f} words\n"

        # 3. Evaluation results
        if 'eval_metrics' in self.data:
            report += "\n3. Evaluation Metrics\n"
            report += "-" * 30 + "\n"

            if 'generation_metrics' in self.data['eval_metrics']:
                gen_metrics = self.data['eval_metrics']['generation_metrics']
                for key, value in gen_metrics.items():
                    report += f"- {key}: {value:.3f}\n"

            if 'efficiency_metrics' in self.data['eval_metrics']:
                eff_metrics = self.data['eval_metrics']['efficiency_metrics']
                report += f"\nResource Usage:\n"
                for key, value in eff_metrics.items():
                    if isinstance(value, float):
                        report += f"- {key}: {value:.3f}\n"
                    else:
                        report += f"- {key}: {value}\n"

        # 4. Test query results
        report += "\n4. Test Query Example\n"
        report += "-" * 30 + "\n"

        if 'test_results' in self.data and len(self.data['test_results']) > 0:
            first_result = self.data['test_results'][0]
            report += f"Query: {first_result['query']}\n"
            report += f"Answer Preview: {first_result['answer'][:200]}...\n"
            report += f"Sources Used: {len(first_result['sources'])}\n"
            report += f"Response Time: {first_result['times']['total']:.3f}s\n"

        # 5. Recommendations
        report += "\n5. Optimization Recommendations\n"
        report += "-" * 30 + "\n"

        if 'test_results' in self.data:
            avg_time = np.mean([r['times']['total'] for r in self.data['test_results']])
            if avg_time > 3:
                report += "- Consider optimizing model loading and inference speed\n"
            if np.mean([len(r['answer'].split()) for r in self.data['test_results']]) < 150:
                report += "- Consider increasing answer detail and comprehensiveness\n"
            report += "- Implement caching for frequently asked queries\n"
            report += "- Add more diverse test queries for comprehensive evaluation\n"

        # Save report
        report_path = os.path.join(self.output_dir, 'evaluation_report.txt')
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(report)

        print(f"✓ Evaluation report saved to: {report_path}")

        return report


# ============================================================================
# Main Pipeline
# ============================================================================

class MedicalLiteratureRAGPipeline:
    """Main pipeline orchestrating all components"""

    def __init__(self, config: Config):
        self.config = config
        self.processor = MedicalDataProcessor(config)
        self.topic_modeler = MedicalTopicModeler(config)
        self.rag_system = None
        self.evaluator = None

    def run_complete_pipeline(self,
                              excel_path: str,
                              hf_token: Optional[str] = None,
                              hf_repo: Optional[str] = None,
                              run_evaluation: bool = True):
        """Execute complete pipeline"""

        print("=" * 80)
        print("Medical Literature RAG Pipeline")
        print("=" * 80)

        # Step 1: Load and process data
        print("\n[Step 1/6] Loading and processing data...")
        df = self.processor.load_and_clean_excel(excel_path)
        records = self.processor.prepare_records(df)
        self.processor.save_metadata(records)

        # Step 2: Topic modeling
        print("\n[Step 2/6] Performing topic modeling...")
        topics, topic_model = self.topic_modeler.fit_topics(records)

        # Step 3: Create and save dataset
        print("\n[Step 3/6] Creating dataset...")
        self._create_dataset(records, hf_token, hf_repo)

        # Step 4: Build RAG system
        print("\n[Step 4/6] Building RAG system...")
        self.rag_system = MedicalRAGSystem(self.config)
        self.rag_system.build_index(records)

        # Step 5: Run test queries
        print("\n[Step 5/6] Running test queries...")
        self._run_test_queries()

        # Step 6: Evaluation
        if run_evaluation:
            print("\n[Step 6/6] Running evaluation...")
            self._run_evaluation()

        print("\n" + "=" * 80)
        print("Pipeline completed successfully!")
        print(f"All results saved to: {self.config.OUTPUT_DIR}")
        print("=" * 80)

    def _create_dataset(self, records: List[Dict], hf_token: Optional[str], hf_repo: Optional[str]):
        """Create and optionally upload dataset to Hugging Face"""
        # Ensure all records have proper types
        for rec in records:
            # Ensure cluster exists and is int
            if 'cluster' not in rec or rec['cluster'] is None:
                rec['cluster'] = -1
            else:
                rec['cluster'] = int(rec['cluster'])

            # Ensure string fields
            for key in ['pmid', 'title', 'journal', 'mesh', 'keywords', 'abstract', 'doi']:
                val = rec.get(key, '')
                if val is None or pd.isna(val):
                    rec[key] = ''
                else:
                    rec[key] = str(val)

            # Ensure year is int
            yr = rec.get('year', 0)
            if yr is None or pd.isna(yr):
                rec['year'] = 0
            else:
                rec['year'] = int(yr)

        # Create dataset
        ds = Dataset.from_list(records)
        ds = ds.class_encode_column('cluster')

        # Save locally
        df_export = ds.to_pandas()
        export_path = os.path.join(self.config.OUTPUT_DIR, 'medllm_full_dataset.csv')
        df_export.to_csv(export_path, index=False, encoding='utf-8-sig')
        print(f"Dataset saved to: {export_path}")

        # Upload to Hugging Face
        if hf_token and hf_repo:
            try:
                print(f"\nUploading dataset to Hugging Face...")
                login(token=hf_token)
                ds.push_to_hub(hf_repo, private=False)
                print(f"Dataset pushed to https://huggingface.co/datasets/{hf_repo}")
            except Exception as e:
                print(f"Warning: Could not upload to Hugging Face: {e}")

    def _run_test_queries(self):
        """Run predefined test queries"""
        test_queries = [
            "What are the applications of ChatGPT in medical education?",
            "How accurate is ChatGPT in medical diagnosis?",
            "What are the limitations of using AI in healthcare?",
            "ChatGPT's performance in medical examinations",
            "Can ChatGPT help with bone tumor diagnosis?",
            "What are the ethical considerations of AI in medicine?",
            "How does ChatGPT compare to human doctors in diagnosis?",
            "Applications of large language models in radiology"
        ]

        results = []

        print("\nRunning test queries...")
        print("-" * 80)

        for query in test_queries:
            print(f"\nQuery: {query}")
            result = self.rag_system.qa_pipeline(query)

            print(f"\nAnswer:\n{result['answer']}")
            print(f"\nBased on {len(result['sources'])} sources:")
            for i, source in enumerate(result['sources'][:3]):
                print(f"  [{i + 1}] PMID {source['pmid']} ({source['year']}) - {source['title'][:60]}...")

            print(f"\nTiming: Search {result['times']['search']:.2f}s, "
                  f"Generation {result['times']['generation']:.2f}s")
            print("-" * 80)

            results.append(result)

        # Save test results
        test_results_path = os.path.join(self.config.OUTPUT_DIR, 'test_query_results.json')
        with open(test_results_path, 'w', encoding='utf-8') as f:
            json.dump(results, f, indent=2, ensure_ascii=False)

    def _run_evaluation(self):
        """Run comprehensive evaluation"""
        self.evaluator = RAGEvaluator(self.rag_system, self.config)

        # Basic test queries for generation evaluation
        test_queries = [
            "What are the applications of ChatGPT in medical education?",
            "How accurate is ChatGPT in medical diagnosis?",
            "What are the limitations of using AI in healthcare?",
            "ChatGPT's performance in medical examinations",
            "Can ChatGPT help with bone tumor diagnosis?"
        ]

        # Evaluate generation
        gen_metrics = self.evaluator.evaluate_generation(test_queries)
        print("\nGeneration Metrics:")
        for metric, value in gen_metrics.items():
            print(f"  {metric}: {value:.3f}")

        # Evaluate efficiency
        eff_metrics = self.evaluator.evaluate_efficiency()
        print("\nEfficiency Metrics:")
        for metric, value in eff_metrics.items():
            print(f"  {metric}: {value:.3f}")

        # Save all results
        self.evaluator.save_evaluation_results()

        # Generate enhanced plots
        print("\nGenerating evaluation plots...")
        plotter = RealEvaluationPlotter(self.config.OUTPUT_DIR)
        plotter.generate_all_plots()
        plotter.generate_summary_report()


# ============================================================================
# Main Execution
# ============================================================================

def main():
    """Main execution function"""

    # Configuration
    config = Config()

    # Initialize pipeline
    pipeline = MedicalLiteratureRAGPipeline(config)

    # Run complete pipeline with Hugging Face upload
    pipeline.run_complete_pipeline(
        excel_path=config.EXCEL_PATH,
        hf_token=config.HF_TOKEN,
        hf_repo=config.HF_REPO,
        run_evaluation=True
    )

    # Print GPU usage if available
    if torch.cuda.is_available():
        print(f"\nFinal GPU Memory Usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")


if __name__ == "__main__":
    main()