# ChatCM-RAG / codes.py
# NOTE: the original upload carried Hugging Face page residue here
# ("fc28's picture / Upload codes.py / 90cfa35 verified"), which is not
# valid Python; it is kept only as this comment so the module can be imported.
import os
import json
import time
import warnings
from datetime import datetime
from typing import List, Dict, Optional, Tuple
import re
import numpy as np
import pandas as pd
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
# Topic Modeling
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from sklearn.feature_extraction.text import CountVectorizer
import umap
import hdbscan
# Hugging Face
from datasets import Dataset
from huggingface_hub import login
# Vector Database
import faiss
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
# Language Models
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
AutoModelForSeq2SeqLM,
pipeline
)
# Evaluation Metrics
from sklearn.metrics import precision_recall_fscore_support, accuracy_score
from sklearn.metrics.pairwise import cosine_similarity
# Silence all library warnings globally (note: this also hides genuinely
# useful ones — acceptable for a demo script, not for production).
warnings.filterwarnings('ignore')
# Set matplotlib to a font with full Latin coverage and render a proper
# minus sign instead of a Unicode one (avoids glyph warnings in saved plots).
plt.rcParams['font.family'] = 'DejaVu Sans'
plt.rcParams['axes.unicode_minus'] = False
# ============================================================================
# Configuration
# ============================================================================
class Config:
    """System configuration parameters.

    All values are plain class attributes used as read-only constants by the
    rest of the pipeline (data processing, topic modeling, RAG, evaluation).
    """
    # --- Paths ---
    EXCEL_PATH = r'C:\Users\AI\OneDrive\Desktop\enger\ok-Paper_references-2.xlsx'  # input spreadsheet (machine-specific Windows path)
    OUTPUT_DIR = 'output2025-2'  # every artifact (CSVs, plots, FAISS index) is written here
    # --- Model Settings ---
    EMBEDDING_MODEL = 'sentence-transformers/all-mpnet-base-v2'  # encoder for documents AND queries
    DEFAULT_LLM = 'google/flan-t5-large'  # default generator when model_type == "t5"
    # --- Topic Modeling (BERTopic / UMAP / HDBSCAN) ---
    MIN_CLUSTER_SIZE = 20  # HDBSCAN min_cluster_size
    N_NEIGHBORS = 15  # UMAP n_neighbors
    MIN_DF = 5  # CountVectorizer minimum document frequency
    # --- Retrieval ---
    TOP_K = 5  # default number of documents returned by search()
    MAX_CONTEXT_LENGTH = 3000  # max abstract characters fed to topic modeling
    # --- Generation ---
    MAX_NEW_TOKENS = 400
    TEMPERATURE = 0.9
    TOP_P = 0.95
    # --- Evaluation ---
    EVAL_BATCH_SIZE = 32  # batch size for embedding generation
    SAVE_PLOTS = True
    # --- Hugging Face ---
    HF_TOKEN = "token"  # placeholder — never commit a real token; prefer an environment variable
    HF_REPO = "fc28/ChatMed"
# ============================================================================
# Data Processing Module
# ============================================================================
class MedicalDataProcessor:
    """Loads the literature spreadsheet, cleans it, and emits record dicts."""

    def __init__(self, config: Config):
        self.config = config
        # Make sure the artifact directory exists before anything writes to it.
        os.makedirs(config.OUTPUT_DIR, exist_ok=True)

    def load_and_clean_excel(self, file_path: str) -> pd.DataFrame:
        """Read the Excel file and return a deduplicated, normalized frame."""
        print(f"Loading data from: {file_path}")
        frame = pd.read_excel(file_path)
        print(f"Original records: {len(frame)}")
        # Drop rows missing a PMID, then keep exactly one row per PMID.
        frame = frame.dropna(subset=['PMID']).drop_duplicates(subset=['PMID'])
        print(f"After deduplication: {len(frame)}")
        # Normalize column types/contents.
        frame['PMID'] = frame['PMID'].astype(str)
        frame['Year'] = pd.to_numeric(frame['Year'], errors='coerce').fillna(0).astype(int)
        frame['Abstract'] = frame['Abstract'].fillna('').str.replace('\n', ' ').str.strip()
        return frame

    def prepare_records(self, df: pd.DataFrame) -> List[Dict]:
        """Convert the frame into structured dicts, skipping thin abstracts."""
        def clean(value) -> str:
            # Uniform "stringify and trim" used for every optional column.
            return str(value).strip()

        records = []
        for _, row in df.iterrows():
            abstract = clean(row.get('Abstract', ''))
            if len(abstract) < 50:
                continue  # abstract too short to be useful for retrieval
            records.append({
                'pmid': str(row['PMID']),
                'title': clean(row.get('Title', '')),
                'year': int(row.get('Year', 0)),
                'journal': clean(row.get('Journal', '')),
                'doi': clean(row.get('DOI', '')),
                'mesh': clean(row.get('MeSH', '')),
                'keywords': clean(row.get('Keywords', '')),
                'abstract': abstract,
                'authors': clean(row.get('Authors', '')),
            })
        print(f"Prepared {len(records)} valid records")
        return records

    def save_metadata(self, records: List[Dict]) -> None:
        """Persist the record dicts as a CSV in the output directory."""
        output_path = os.path.join(self.config.OUTPUT_DIR, 'medllm_metadata.csv')
        pd.DataFrame(records).to_csv(output_path, index=False)
        print(f"Saved metadata to: {output_path}")
# ============================================================================
# Topic Modeling Module
# ============================================================================
class MedicalTopicModeler:
    """BERTopic-based topic modeling for medical literature."""

    def __init__(self, config: Config):
        self.config = config
        self.topic_model = None  # populated by fit_topics()

    def build_topic_model(self) -> BERTopic:
        """Assemble a BERTopic pipeline from the configured components."""
        cfg = self.config
        return BERTopic(
            embedding_model=SentenceTransformer(cfg.EMBEDDING_MODEL),
            vectorizer_model=CountVectorizer(
                stop_words='english',
                ngram_range=(1, 2),
                min_df=cfg.MIN_DF,
            ),
            umap_model=umap.UMAP(
                n_components=10,
                random_state=42,  # fixed seed for reproducible projections
                n_neighbors=cfg.N_NEIGHBORS,
                min_dist=0.0,
                metric='cosine',
            ),
            hdbscan_model=hdbscan.HDBSCAN(
                min_cluster_size=cfg.MIN_CLUSTER_SIZE,
                metric='euclidean',
                cluster_selection_method='eom',
            ),
            verbose=True,
        )

    def fit_topics(self, records: List[Dict]) -> Tuple[List[int], BERTopic]:
        """Fit the model and write each document's cluster id onto its record."""
        print("\nPerforming topic modeling...")
        limit = self.config.MAX_CONTEXT_LENGTH
        docs = [record['abstract'][:limit] for record in records]
        self.topic_model = self.build_topic_model()
        topics, _probs = self.topic_model.fit_transform(docs)
        for record, assigned in zip(records, topics):
            record['cluster'] = int(assigned)
        self._save_topic_results(records, topics)
        return topics, self.topic_model

    def _save_topic_results(self, records: List[Dict], topics: List[int]) -> None:
        """Persist per-document assignments and per-topic summaries as CSV."""
        out = self.config.OUTPUT_DIR
        pd.DataFrame({
            'pmid': [record['pmid'] for record in records],
            'cluster': topics,
        }).to_csv(os.path.join(out, 'cluster_assignments.csv'), index=False)
        self.topic_model.get_topic_info().to_csv(
            os.path.join(out, 'topic_info.csv'),
            index=False,
        )
        self._save_topic_keywords()
        print(f"Topic modeling results saved to {out}")

    def _save_topic_keywords(self) -> None:
        """Write every topic's (keyword, weight) pairs to a single CSV."""
        topic_ids = [
            tid for tid in self.topic_model.get_topic_info()['Topic'].tolist()
            if tid != -1  # -1 is the noise bucket
        ]
        rows = [
            {'Topic': tid, 'Keyword': word, 'Weight': score}
            for tid in topic_ids
            for word, score in self.topic_model.get_topic(tid)
        ]
        pd.DataFrame(rows).to_csv(
            os.path.join(self.config.OUTPUT_DIR, 'topic_keywords_weights.csv'),
            index=False,
        )
# ============================================================================
# RAG System Module
# ============================================================================
class MedicalRAGSystem:
    """Enhanced RAG system for medical literature Q&A.

    Combines a SentenceTransformer embedder + FAISS ``IndexFlatL2`` for
    retrieval with a transformers generation model ("t5" -> seq2seq,
    "gpt2" -> causal LM). Typical use: construct, call ``build_index()``
    once, then ``qa_pipeline()`` per question.
    """
    def __init__(self, config: Config, model_type: str = "t5", model_name: Optional[str] = None):
        """Set up the embedder, the generator, and empty document stores.

        Args:
            config: global Config carrying model names and generation settings.
            model_type: "t5" (seq2seq) or "gpt2" (causal); anything else raises.
            model_name: optional HF model id overriding the per-type default.
        """
        self.config = config
        self.model_type = model_type
        # Prefer GPU when available; both models are moved there below.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        # Initialize models
        self._init_embedding_model()
        self._init_generation_model(model_type, model_name)
        # Data storage — the three retrieval structures are index-aligned:
        # documents[i] <-> document_metadata[i] <-> row i of embeddings/index.
        self.documents = []
        self.document_metadata = []
        self.embeddings = None  # float32 ndarray once build_index() has run
        self.index = None       # faiss.IndexFlatL2 once build_index() has run
        print(f"RAG System initialized on {self.device}")

    def _init_embedding_model(self) -> None:
        """Load the sentence embedder used for both documents and queries."""
        print(f"Loading embedding model: {self.config.EMBEDDING_MODEL}")
        self.embedder = SentenceTransformer(
            self.config.EMBEDDING_MODEL,
            device=self.device
        )

    def _init_generation_model(self, model_type: str, model_name: Optional[str]) -> None:
        """Load tokenizer + generation model for the requested type.

        Raises:
            ValueError: if ``model_type`` is neither "t5" nor "gpt2".
        """
        if model_type == "t5":
            model_name = model_name or self.config.DEFAULT_LLM
            print(f"Loading T5 model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            # fp16 halves GPU memory; fall back to fp32 on CPU.
            self.model = AutoModelForSeq2SeqLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True
            )
        elif model_type == "gpt2":
            model_name = model_name or "microsoft/BioGPT"
            print(f"Loading GPT model: {model_name}")
            self.tokenizer = AutoTokenizer.from_pretrained(model_name)
            # Causal LMs often ship without a pad token; reuse EOS for padding.
            self.tokenizer.pad_token = self.tokenizer.eos_token
            self.model = AutoModelForCausalLM.from_pretrained(
                model_name,
                torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
                low_cpu_mem_usage=True
            )
        else:
            raise ValueError(f"Unsupported model type: {model_type}")
        if torch.cuda.is_available():
            self.model = self.model.to('cuda')
        self.model.eval()  # inference only; disables dropout etc.

    def build_index(self, records: List[Dict]) -> None:
        """Build the FAISS index from record dicts (as produced by the processor)."""
        print("\nBuilding vector index...")
        # Prepare documents: title + abstract is the text that gets embedded.
        for rec in records:
            doc_text = f"Title: {rec['title']}\nAbstract: {rec['abstract']}"
            self.documents.append(doc_text)
            self.document_metadata.append(rec)
        # Generate embeddings
        self._generate_embeddings()
        # Save index
        self._save_faiss_index()

    def _generate_embeddings(self) -> None:
        """Embed all documents in batches and build the in-memory FAISS index."""
        batch_size = self.config.EVAL_BATCH_SIZE
        all_embeddings = []
        for i in tqdm(range(0, len(self.documents), batch_size), desc="Generating embeddings"):
            batch = self.documents[i:i + batch_size]
            embeddings = self.embedder.encode(
                batch,
                convert_to_tensor=True,
                show_progress_bar=False  # tqdm already tracks the outer loop
            )
            all_embeddings.append(embeddings.cpu().numpy())
        # FAISS requires contiguous float32.
        self.embeddings = np.vstack(all_embeddings).astype('float32')
        # Build FAISS index (exact L2 search, no compression).
        dim = self.embeddings.shape[1]
        self.index = faiss.IndexFlatL2(dim)
        self.index.add(self.embeddings)
        print(f"Index built with {self.index.ntotal} vectors")

    def _save_faiss_index(self) -> None:
        """Persist a LangChain FAISS store to disk.

        NOTE(review): this re-embeds every document with a fresh
        HuggingFaceEmbeddings instance instead of reusing self.embeddings —
        double work; confirm the on-disk copy is actually consumed elsewhere.
        """
        emb_model = HuggingFaceEmbeddings(model_name=self.config.EMBEDDING_MODEL)
        faiss_db = FAISS.from_texts(self.documents, emb_model)
        index_path = os.path.join(self.config.OUTPUT_DIR, 'faiss_index')
        faiss_db.save_local(index_path)
        print(f"FAISS index saved to: {index_path}")

    def search(self, query: str, k: Optional[int] = None) -> List[Dict]:
        """Semantic search for relevant documents.

        Returns up to k metadata dicts, each augmented with a
        'relevance_score' in (0, 1] derived from the L2 distance.
        """
        k = k or self.config.TOP_K
        # Encode query
        query_embedding = self.embedder.encode(query, convert_to_tensor=True)
        query_np = query_embedding.cpu().numpy().reshape(1, -1).astype('float32')
        # Search
        distances, indices = self.index.search(query_np, k)
        # Prepare results
        results = []
        for idx, distance in zip(indices[0], distances[0]):
            if idx >= 0:  # FAISS pads with -1 when fewer than k vectors exist
                metadata = self.document_metadata[idx].copy()
                # Monotone map: distance 0 -> 1.0; larger distance -> smaller score.
                metadata['relevance_score'] = float(1 / (1 + distance))
                results.append(metadata)
        return results

    def generate_answer(self, query: str, docs: List[Dict]) -> str:
        """Dispatch to the model-type-specific answer generator."""
        if self.model_type == "t5":
            return self._generate_t5_answer(query, docs)
        else:
            return self._generate_gpt_answer(query, docs)

    def _generate_t5_answer(self, query: str, docs: List[Dict]) -> str:
        """T5-specific answer generation from the top retrieved documents."""
        # Build context from at most the top 3 documents.
        context_parts = []
        for i, doc in enumerate(docs[:3]):
            key_info = self._extract_key_sentences(doc['abstract'], query)
            context_parts.append(
                f"Study{i + 1}: {doc['title']} (PMID:{doc['pmid']},{doc['year']}). {key_info}"
            )
        context = " ".join(context_parts)
        prompt = f"Question: {query} Context: {context} Answer:"
        # Tokenize (truncated so prompt + context fits the encoder window)
        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=1024
        )
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        # Generate
        # NOTE(review): temperature/top_p are passed together with num_beams=4
        # but without do_sample=True — in beam search those sampling knobs have
        # no effect; confirm whether sampling was intended here.
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.config.MAX_NEW_TOKENS,
                min_new_tokens=100,
                temperature=self.config.TEMPERATURE,
                top_p=self.config.TOP_P,
                num_beams=4,
                early_stopping=True,
                no_repeat_ngram_size=3
            )
        answer = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Post-process if needed: very short output is treated as a failed
        # generation and replaced by a templated fallback answer.
        if len(answer) < 50:
            answer = self._create_structured_answer(query, docs)
        return answer

    def _generate_gpt_answer(self, query: str, docs: List[Dict]) -> str:
        """Causal-LM ("gpt2" type) answer generation."""
        # Build context from at most the top 3 documents.
        context = "Research findings:\n"
        for i, doc in enumerate(docs[:3]):
            context += f"\n{i + 1}. {doc['title']} (PMID: {doc['pmid']}, {doc['year']})\n"
            context += f" Key findings: {self._extract_key_sentences(doc['abstract'], query)}\n"
        prompt = f"""{context}
Based on the above research findings, answer the following question:
Question: {query}
Answer: Based on the literature,"""
        inputs = self.tokenizer(
            prompt,
            return_tensors='pt',
            truncation=True,
            max_length=1500
        )
        if torch.cuda.is_available():
            inputs = {k: v.to('cuda') for k, v in inputs.items()}
        # Generate (sampling enabled here, unlike the T5 path)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=self.config.MAX_NEW_TOKENS,
                temperature=0.8,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.pad_token_id
            )
        full_response = self.tokenizer.decode(outputs[0], skip_special_tokens=True)
        # Causal LMs echo the prompt; keep only the text after the answer stub.
        answer = full_response.split("Answer: Based on the literature,")[-1].strip()
        return "Based on the literature, " + answer

    def _extract_key_sentences(self, abstract: str, query: str) -> str:
        """Pick the 2 abstract sentences most relevant to the query.

        Scoring: +2 per query word present, +1 per result-indicator word,
        +2 if the sentence contains a percentage. Falls back to the first
        two sentences when nothing scores above 0.
        """
        sentences = abstract.split('. ')
        query_words = set(query.lower().split())
        # Score sentences
        scored_sentences = []
        for sent in sentences:
            if len(sent) < 20:
                continue  # skip fragments
            sent_lower = sent.lower()
            score = 0
            # Query word matches
            for word in query_words:
                if word in sent_lower:
                    score += 2
            # Result indicators
            result_words = ['found', 'showed', 'demonstrated', 'revealed',
                            'indicated', 'suggest', 'conclude', 'effective',
                            'accuracy', 'performance']
            for word in result_words:
                if word in sent_lower:
                    score += 1
            # Numerical results (e.g. "87.5%")
            if re.search(r'\d+(\.\d+)?%', sent):
                score += 2
            scored_sentences.append((score, sent))
        # Select top sentences
        scored_sentences.sort(key=lambda x: x[0], reverse=True)
        top_sentences = [sent for score, sent in scored_sentences[:2] if score > 0]
        if top_sentences:
            return ' '.join(top_sentences)
        else:
            return ' '.join(sentences[:2])

    def _create_structured_answer(self, query: str, docs: List[Dict]) -> str:
        """Create a templated fallback answer when generation comes back too short.

        Three templates keyed on words in the query: application-style,
        accuracy-style, and a generic literature summary.
        """
        query_lower = query.lower()
        if "application" in query_lower or "use" in query_lower:
            answer = f"Based on the reviewed literature, ChatGPT/AI has shown several applications in medicine:\n\n"
            for i, doc in enumerate(docs[:3]):
                # Coarse keyword sniffing of the abstract to label the area.
                abstract_lower = doc['abstract'].lower()
                if "education" in abstract_lower:
                    app_area = "medical education"
                elif "diagnosis" in abstract_lower:
                    app_area = "clinical diagnosis"
                elif "examination" in abstract_lower:
                    app_area = "medical examinations"
                else:
                    app_area = "healthcare"
                answer += f"{i + 1}. In {app_area}: {doc['title']} "
                answer += f"(PMID: {doc['pmid']}, {doc['year']}) "
                accuracy_match = re.search(r'(\d+(?:\.\d+)?)\s*%', doc['abstract'])
                if accuracy_match:
                    answer += f"reported {accuracy_match.group(1)}% accuracy. "
                else:
                    answer += f"demonstrated promising results. "
                answer += "\n"
        elif "accurate" in query_lower or "accuracy" in query_lower:
            answer = f"Studies report varying accuracy levels for ChatGPT in medical applications:\n\n"
            for doc in docs[:3]:
                percentages = re.findall(r'(\d+(?:\.\d+)?)\s*%', doc['abstract'])
                if percentages:
                    answer += f"• {doc['title'][:60]}... (PMID: {doc['pmid']}, {doc['year']}) "
                    answer += f"reported {', '.join(percentages)}% accuracy in their evaluation.\n"
                else:
                    answer += f"• {doc['title'][:60]}... (PMID: {doc['pmid']}, {doc['year']}) "
                    answer += f"evaluated performance without specific accuracy metrics.\n"
        else:
            answer = f"Based on the literature review for '{query}':\n\n"
            for i, doc in enumerate(docs[:3]):
                answer += f"{i + 1}. {doc['title']} (PMID: {doc['pmid']}, {doc['year']}) - "
                key_finding = self._extract_key_sentences(doc['abstract'], query)
                if key_finding:
                    answer += key_finding[:200] + "...\n"
                else:
                    answer += "Investigated relevant aspects.\n"
            answer += f"\nThese findings are based on {len(docs)} relevant studies in the database."
        return answer

    def qa_pipeline(self, query: str, k: Optional[int] = None) -> Dict:
        """Complete Q&A pipeline: retrieve, then generate.

        Returns a dict with keys 'query', 'answer', 'sources', and 'times'
        (search / generation / total wall-clock seconds).
        """
        k = k or self.config.TOP_K
        start_time = time.time()
        # Search
        docs = self.search(query, k=k)
        search_time = time.time() - start_time
        if not docs:
            return {
                'query': query,
                'answer': "No relevant documents found in the database for this query.",
                'sources': [],
                'times': {'search': search_time, 'generation': 0, 'total': search_time}
            }
        # Generate answer
        gen_start = time.time()
        answer = self.generate_answer(query, docs)
        gen_time = time.time() - gen_start
        return {
            'query': query,
            'answer': answer,
            'sources': docs,
            'times': {
                'search': search_time,
                'generation': gen_time,
                'total': time.time() - start_time
            }
        }
# ============================================================================
# Evaluation Module
# ============================================================================
class RAGEvaluator:
    """Comprehensive evaluation for the RAG system.

    Collects retrieval quality (MRR, Recall@5, Precision@5, NDCG),
    generation statistics (length, latency, trigram diversity) and
    resource usage, and persists everything under ``config.OUTPUT_DIR``.
    """
    def __init__(self, rag_system: MedicalRAGSystem, config: Config):
        self.rag = rag_system
        self.config = config
        # Filled in by the evaluate_* methods; serialized by save_evaluation_results().
        self.results = {
            'retrieval_metrics': {},
            'generation_metrics': {},
            'efficiency_metrics': {},
            'query_results': []
        }

    def evaluate_retrieval(self, test_queries: List[Dict]) -> Dict:
        """Evaluate retrieval performance.

        Each entry of ``test_queries`` is a dict with a 'query' string and an
        optional 'relevant_pmids' list; entries without ground truth are skipped.
        Returns the averaged metrics (0.0 when no query had ground truth).
        """
        print("\nEvaluating retrieval performance...")
        metrics = {
            'mrr': [],  # Mean Reciprocal Rank
            'recall_at_k': [],
            'precision_at_k': [],
            'ndcg': []  # Normalized Discounted Cumulative Gain
        }
        for query_data in tqdm(test_queries, desc="Retrieval evaluation"):
            query = query_data['query']
            relevant_pmids = set(query_data.get('relevant_pmids', []))
            if not relevant_pmids:
                continue  # metric undefined without ground truth
            # Get search results
            results = self.rag.search(query, k=10)
            retrieved_pmids = [r['pmid'] for r in results]
            # Calculate metrics
            metrics['mrr'].append(self._calculate_mrr(retrieved_pmids, relevant_pmids))
            metrics['recall_at_k'].append(self._calculate_recall_at_k(retrieved_pmids, relevant_pmids, k=5))
            metrics['precision_at_k'].append(self._calculate_precision_at_k(retrieved_pmids, relevant_pmids, k=5))
            metrics['ndcg'].append(self._calculate_ndcg(retrieved_pmids, relevant_pmids))
        # Average metrics (plain floats so json.dump always works)
        avg_metrics = {
            metric: float(np.mean(values)) if values else 0.0
            for metric, values in metrics.items()
        }
        self.results['retrieval_metrics'] = avg_metrics
        return avg_metrics

    def evaluate_generation(self, test_queries: List[str]) -> Dict:
        """Evaluate generation quality over a list of query strings.

        BUGFIX: the original initialized 'diversity' as a list and only
        replaced it with a float when at least one answer existed — with an
        empty query list it leaked a list into the averages, and np.mean([])
        produced NaN with a RuntimeWarning. All averages now default to 0.0.
        (The unused 'perplexity' accumulator was also removed.)
        """
        print("\nEvaluating generation quality...")
        answer_lengths = []
        response_times = []
        all_answers = []
        for query in tqdm(test_queries, desc="Generation evaluation"):
            result = self.rag.qa_pipeline(query)
            # Basic metrics
            answer_lengths.append(len(result['answer'].split()))
            response_times.append(result['times']['total'])
            # Store for diversity calculation
            all_answers.append(result['answer'])
            # Store detailed result
            self.results['query_results'].append(result)
        # Calculate diversity (0.0 when there were no answers at all)
        diversity = self._calculate_diversity(all_answers) if all_answers else 0.0
        # Average metrics
        avg_metrics = {
            'avg_answer_length': float(np.mean(answer_lengths)) if answer_lengths else 0.0,
            'avg_response_time': float(np.mean(response_times)) if response_times else 0.0,
            'answer_diversity': diversity
        }
        self.results['generation_metrics'] = avg_metrics
        return avg_metrics

    def evaluate_efficiency(self) -> Dict:
        """Report GPU memory, index size and corpus statistics."""
        print("\nEvaluating system efficiency...")
        # Memory usage (zeros when running on CPU)
        if torch.cuda.is_available():
            gpu_memory = torch.cuda.memory_allocated() / 1e9
            gpu_total = torch.cuda.get_device_properties(0).total_memory / 1e9
        else:
            gpu_memory = 0
            gpu_total = 0
        # Index size (embeddings may not be built yet)
        index_size = self.rag.embeddings.nbytes / 1e6 if self.rag.embeddings is not None else 0
        efficiency_metrics = {
            'gpu_memory_gb': gpu_memory,
            'gpu_total_gb': gpu_total,
            'index_size_mb': index_size,
            'num_documents': len(self.rag.documents),
            'embedding_dim': self.rag.embeddings.shape[1] if self.rag.embeddings is not None else 0
        }
        self.results['efficiency_metrics'] = efficiency_metrics
        return efficiency_metrics

    def save_evaluation_results(self):
        """Persist metrics (JSON), per-query results (CSV) and optional plots."""
        output_dir = self.config.OUTPUT_DIR
        # Save metrics as JSON
        metrics_path = os.path.join(output_dir, 'evaluation_metrics.json')
        with open(metrics_path, 'w') as f:
            json.dump(self.results, f, indent=2)
        # Save query results as CSV
        if self.results['query_results']:
            query_df = pd.DataFrame([
                {
                    'query': r['query'],
                    'answer': r['answer'],
                    'num_sources': len(r['sources']),
                    'search_time': r['times']['search'],
                    'generation_time': r['times']['generation'],
                    'total_time': r['times']['total']
                }
                for r in self.results['query_results']
            ])
            query_df.to_csv(os.path.join(output_dir, 'query_results.csv'), index=False)
        # Generate plots if configured
        if self.config.SAVE_PLOTS:
            self._generate_evaluation_plots()
        print(f"\nEvaluation results saved to {output_dir}")

    def _calculate_mrr(self, retrieved: List[str], relevant: set) -> float:
        """Reciprocal rank of the first relevant hit; 0.0 when none retrieved."""
        for i, pmid in enumerate(retrieved):
            if pmid in relevant:
                return 1.0 / (i + 1)
        return 0.0

    def _calculate_recall_at_k(self, retrieved: List[str], relevant: set, k: int) -> float:
        """Fraction of relevant items found in the top-k results."""
        retrieved_k = set(retrieved[:k])
        if not relevant:
            return 0.0
        return len(retrieved_k & relevant) / len(relevant)

    def _calculate_precision_at_k(self, retrieved: List[str], relevant: set, k: int) -> float:
        """Fraction of the top-k results that are relevant."""
        retrieved_k = retrieved[:k]
        if not retrieved_k:
            return 0.0
        return len([p for p in retrieved_k if p in relevant]) / len(retrieved_k)

    def _calculate_ndcg(self, retrieved: List[str], relevant: set) -> float:
        """Binary-relevance NDCG over the retrieved ranking."""
        dcg = 0.0
        for i, pmid in enumerate(retrieved):
            if pmid in relevant:
                dcg += 1.0 / np.log2(i + 2)
        # Ideal DCG: all relevant items (up to list length) ranked first.
        idcg = sum(1.0 / np.log2(i + 2) for i in range(min(len(relevant), len(retrieved))))
        return dcg / idcg if idcg > 0 else 0.0

    def _calculate_diversity(self, answers: List[str]) -> float:
        """Ratio of unique word trigrams to total trigrams across all answers."""
        all_trigrams = set()
        total_trigrams = 0
        for answer in answers:
            words = answer.lower().split()
            trigrams = [' '.join(words[i:i + 3]) for i in range(len(words) - 2)]
            all_trigrams.update(trigrams)
            total_trigrams += len(trigrams)
        return len(all_trigrams) / total_trigrams if total_trigrams > 0 else 0.0

    def _generate_evaluation_plots(self):
        """Save response-time histogram and retrieval-metric bar chart as PNGs."""
        output_dir = self.config.OUTPUT_DIR
        # Response time distribution
        if self.results['query_results']:
            plt.figure(figsize=(10, 6))
            times = [r['times']['total'] for r in self.results['query_results']]
            plt.hist(times, bins=20, edgecolor='black')
            plt.xlabel('Response Time (seconds)')
            plt.ylabel('Frequency')
            plt.title('Response Time Distribution')
            plt.savefig(os.path.join(output_dir, 'response_time_distribution.png'))
            plt.close()
        # Retrieval metrics
        if self.results['retrieval_metrics']:
            plt.figure(figsize=(10, 6))
            metrics = self.results['retrieval_metrics']
            plt.bar(metrics.keys(), metrics.values())
            plt.xlabel('Metric')
            plt.ylabel('Score')
            plt.title('Retrieval Performance Metrics')
            plt.ylim(0, 1)
            plt.savefig(os.path.join(output_dir, 'retrieval_metrics.png'))
            plt.close()
# ============================================================================
# Enhanced Visualization Module
# ============================================================================
class RealEvaluationPlotter:
"""Generate evaluation plots based on actual data"""
    def __init__(self, output_dir: str = 'output2025-2'):
        # Directory written by the earlier pipeline stages (JSON/CSV artifacts).
        self.output_dir = output_dir
        # Keyed by 'test_results' / 'eval_metrics' / 'clusters' / 'topic_info';
        # only files that actually exist on disk get loaded.
        self.data = {}
        self.load_all_data()
def load_all_data(self):
"""Load all available data files"""
print("Loading data files...")
# 1. Load test_query_results.json
test_results_path = os.path.join(self.output_dir, 'test_query_results.json')
if os.path.exists(test_results_path):
with open(test_results_path, 'r', encoding='utf-8') as f:
self.data['test_results'] = json.load(f)
print(f"✓ Loaded test_query_results.json - {len(self.data['test_results'])} queries")
# 2. Load evaluation_metrics.json
metrics_path = os.path.join(self.output_dir, 'evaluation_metrics.json')
if os.path.exists(metrics_path):
with open(metrics_path, 'r') as f:
self.data['eval_metrics'] = json.load(f)
print("✓ Loaded evaluation_metrics.json")
# 3. Load cluster_assignments.csv
cluster_path = os.path.join(self.output_dir, 'cluster_assignments.csv')
if os.path.exists(cluster_path):
self.data['clusters'] = pd.read_csv(cluster_path)
print(f"✓ Loaded cluster_assignments.csv - {len(self.data['clusters'])} records")
# 4. Load topic_info.csv
topic_info_path = os.path.join(self.output_dir, 'topic_info.csv')
if os.path.exists(topic_info_path):
self.data['topic_info'] = pd.read_csv(topic_info_path)
print(f"✓ Loaded topic_info.csv - {len(self.data['topic_info'])} topics")
def generate_all_plots(self):
"""Generate all possible plots"""
print("\nGenerating plots...")
if 'test_results' in self.data:
self.plot_response_time_analysis()
self.plot_query_performance_details()
self.plot_answer_quality_analysis()
if 'eval_metrics' in self.data:
self.plot_retrieval_metrics()
self.plot_system_efficiency()
if 'clusters' in self.data:
self.plot_topic_distribution()
print("\nAll plots generated!")
def plot_response_time_analysis(self):
"""Generate response time analysis plot"""
print("Generating response time analysis...")
results = self.data['test_results']
# Extract time data
search_times = [r['times']['search'] for r in results]
generation_times = [r['times']['generation'] for r in results]
total_times = [r['times']['total'] for r in results]
# Create figure
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Response Time Analysis (Based on Actual Data)', fontsize=18, fontweight='bold')
# 1. Total time distribution
ax1 = axes[0, 0]
ax1.hist(total_times, bins=10, color='skyblue', edgecolor='black', alpha=0.7)
ax1.axvline(np.mean(total_times), color='red', linestyle='dashed',
linewidth=2, label=f'Mean: {np.mean(total_times):.2f}s')
ax1.axvline(np.median(total_times), color='green', linestyle='dashed',
linewidth=2, label=f'Median: {np.median(total_times):.2f}s')
ax1.set_xlabel('Total Response Time (seconds)', fontsize=12)
ax1.set_ylabel('Frequency', fontsize=12)
ax1.set_title('Total Response Time Distribution', fontsize=14, fontweight='bold')
ax1.legend()
ax1.grid(axis='y', alpha=0.3)
# 2. Time composition by query
ax2 = axes[0, 1]
x = np.arange(len(results))
width = 0.8
p1 = ax2.bar(x, search_times, width, label='Search Time', color='lightblue')
p2 = ax2.bar(x, generation_times, width, bottom=search_times,
label='Generation Time', color='lightgreen')
ax2.set_ylabel('Time (seconds)', fontsize=12)
ax2.set_title('Time Composition per Query', fontsize=14, fontweight='bold')
ax2.set_xticks(x)
ax2.set_xticklabels([f'Q{i + 1}' for i in range(len(results))])
ax2.legend()
ax2.grid(axis='y', alpha=0.3)
# Add total time labels
for i, (s, g) in enumerate(zip(search_times, generation_times)):
ax2.text(i, s + g + 0.05, f'{s + g:.2f}', ha='center', va='bottom')
# 3. Search vs Generation time scatter
ax3 = axes[1, 0]
scatter = ax3.scatter(search_times, generation_times,
s=100, alpha=0.6, c=total_times,
cmap='viridis', edgecolors='black')
# Add trend line
z = np.polyfit(search_times, generation_times, 1)
p = np.poly1d(z)
ax3.plot(sorted(search_times), p(sorted(search_times)),
"r--", alpha=0.8, label=f'Trend: y={z[0]:.2f}x+{z[1]:.2f}')
ax3.set_xlabel('Search Time (seconds)', fontsize=12)
ax3.set_ylabel('Generation Time (seconds)', fontsize=12)
ax3.set_title('Search Time vs Generation Time', fontsize=14, fontweight='bold')
ax3.legend()
ax3.grid(True, alpha=0.3)
# Add colorbar
cbar = plt.colorbar(scatter, ax=ax3)
cbar.set_label('Total Time (seconds)', fontsize=10)
# 4. Time statistics comparison
ax4 = axes[1, 1]
# Create box plot
bp = ax4.boxplot([search_times, generation_times, total_times],
labels=['Search Time', 'Generation Time', 'Total Time'],
patch_artist=True, showmeans=True)
# Set colors
colors = ['lightblue', 'lightgreen', 'lightyellow']
for patch, color in zip(bp['boxes'], colors):
patch.set_facecolor(color)
patch.set_alpha(0.7)
# Add statistics text
stats_text = f"Search Time: {np.mean(search_times):.2f}±{np.std(search_times):.2f}s\n"
stats_text += f"Generation Time: {np.mean(generation_times):.2f}±{np.std(generation_times):.2f}s\n"
stats_text += f"Total Time: {np.mean(total_times):.2f}±{np.std(total_times):.2f}s"
ax4.text(0.02, 0.98, stats_text, transform=ax4.transAxes,
fontsize=10, verticalalignment='top', horizontalalignment='right',
bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
ax4.set_ylabel('Time (seconds)', fontsize=12)
ax4.set_title('Time Distribution Statistics', fontsize=14, fontweight='bold')
ax4.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(self.output_dir, 'response_time_distribution.png'),
dpi=300, bbox_inches='tight')
plt.close()
print("✓ response_time_distribution.png generated")
    def plot_retrieval_metrics(self):
        """Generate the system-performance figure (bar chart + radar chart).

        Prefers real retrieval metrics from evaluation_metrics.json; when
        absent, derives proxy scores from the generation metrics instead.
        """
        print("Generating retrieval metrics...")
        # Get metrics
        metrics = {}
        if 'eval_metrics' in self.data and 'retrieval_metrics' in self.data['eval_metrics']:
            metrics = self.data['eval_metrics']['retrieval_metrics']
        # If no retrieval metrics, fall back to proxies built from generation metrics
        if not metrics and 'eval_metrics' in self.data:
            if 'generation_metrics' in self.data['eval_metrics']:
                gen_metrics = self.data['eval_metrics']['generation_metrics']
                avg_response = gen_metrics.get('avg_response_time', 0)
                # Heuristic proxies normalized into [0, 1]; overall_score is a fixed placeholder.
                metrics = {
                    'response_quality': min(1.0, 200 / gen_metrics.get('avg_answer_length', 200)),
                    'response_speed': min(1.0, 2.0 / avg_response) if avg_response > 0 else 0.5,
                    'answer_diversity': gen_metrics.get('answer_diversity', 0.7),
                    'overall_score': 0.75
                }
        if not metrics:
            print("✗ No retrieval metrics found")
            return
        # Create figure
        fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
        fig.suptitle('System Performance Metrics', fontsize=16, fontweight='bold')
        # 1. Bar chart
        metric_names = list(metrics.keys())
        metric_values = list(metrics.values())
        # Beautify metric names for display
        display_names = {
            'mrr': 'MRR',
            'recall_at_k': 'Recall@5',
            'precision_at_k': 'Precision@5',
            'ndcg': 'NDCG',
            'response_quality': 'Answer Quality',
            'response_speed': 'Response Speed',
            'answer_diversity': 'Answer Diversity',
            'overall_score': 'Overall Score'
        }
        metric_labels = [display_names.get(name, name) for name in metric_names]
        # NOTE(review): only 4 colors are listed — confirm behavior if more
        # than 4 metrics are ever present.
        bars = ax1.bar(metric_labels, metric_values,
                       color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
        ax1.set_ylim(0, 1.1)
        ax1.set_ylabel('Score', fontsize=12)
        ax1.set_title('Performance Metrics', fontsize=14, fontweight='bold')
        ax1.grid(axis='y', alpha=0.3)
        # Add value labels above each bar
        for bar, value in zip(bars, metric_values):
            height = bar.get_height()
            ax1.text(bar.get_x() + bar.get_width() / 2., height + 0.01,
                     f'{value:.3f}', ha='center', va='bottom', fontsize=10)
        # Add average line
        avg_score = np.mean(metric_values)
        ax1.axhline(y=avg_score, color='red', linestyle='--',
                    label=f'Average: {avg_score:.3f}')
        ax1.legend()
        # 2. Radar chart — closes the polygon by repeating the first point.
        angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
        values = metric_values + [metric_values[0]]  # Close the plot
        angles += angles[:1]
        # Replace the second cartesian axes with a polar one at the same slot
        # (the original ax2 created by plt.subplots is superseded here).
        ax2 = plt.subplot(122, projection='polar')
        ax2.plot(angles, values, 'o-', linewidth=2, color='#1f77b4', markersize=8)
        ax2.fill(angles, values, alpha=0.25, color='#1f77b4')
        ax2.set_xticks(angles[:-1])
        ax2.set_xticklabels(metric_labels, fontsize=10)
        ax2.set_ylim(0, 1.0)
        ax2.set_title('Performance Radar Chart', y=1.08, fontsize=14, fontweight='bold')
        ax2.grid(True)
        # Add value labels with adjusted positions
        for i, (angle, value, label) in enumerate(zip(angles[:-1], metric_values, metric_labels)):
            # Adjust the text position depending on the label
            if 'Answer Quality' in label:
                # shift right
                offset_angle = angle + 0.15
                ax2.text(offset_angle, value + 0.15, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)
            elif 'Answer Diversity' in label:
                # shift left
                offset_angle = angle - 0.15
                ax2.text(offset_angle, value + 0.15, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)
            else:
                # keep other labels in place
                ax2.text(angle, value + 0.05, f'{value:.2f}',
                         ha='center', va='center', fontsize=9)
        plt.tight_layout()
        plt.savefig(os.path.join(self.output_dir, 'retrieval_metrics.png'),
                    dpi=300, bbox_inches='tight')
        plt.close()
        print("✓ retrieval_metrics.png generated")
def plot_topic_distribution(self):
    """Generate topic distribution plot.

    Renders a bar chart of document counts per topic next to a pie chart of
    topic shares (topics under 2% are lumped into "Others") and saves the
    result to ``topic_distribution.png`` in ``self.output_dir``.
    """
    print("Generating topic distribution...")
    if 'clusters' not in self.data:
        print("✗ No cluster data found")
        return
    cluster_df = self.data['clusters']
    counts = cluster_df['cluster'].value_counts().sort_index()
    total_docs = len(cluster_df)

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(16, 7))
    fig.suptitle('Topic Distribution Analysis', fontsize=16, fontweight='bold')

    # --- Bar chart: one bar per topic, gray for the HDBSCAN noise bucket ---
    labels, bar_colors = [], []
    for topic_id in counts.index:
        if topic_id == -1:
            labels.append('Noise')
            bar_colors.append('gray')
        else:
            labels.append(f'Topic {topic_id}')
            bar_colors.append(plt.cm.tab10(topic_id % 10))
    bar_artists = bar_ax.bar(range(len(labels)), counts.values, color=bar_colors)
    bar_ax.set_xlabel('Topic', fontsize=12)
    bar_ax.set_ylabel('Document Count', fontsize=12)
    bar_ax.set_title(f'Topic Distribution ({len(cluster_df)} documents)', fontsize=14, fontweight='bold')
    bar_ax.set_xticks(range(len(labels)))
    bar_ax.set_xticklabels(labels, rotation=45, ha='right')
    bar_ax.grid(axis='y', alpha=0.3)
    # Annotate each bar with its absolute count and corpus share
    for rect, n_docs in zip(bar_artists, counts.values):
        share = (n_docs / total_docs) * 100
        bar_ax.text(rect.get_x() + rect.get_width() / 2., rect.get_height() + 1,
                    f'{n_docs}\n({share:.1f}%)',
                    ha='center', va='bottom', fontsize=9)

    # --- Pie chart: topics below the 2% threshold are merged into "Others" ---
    threshold = 0.02  # 2% threshold
    wedge_sizes, wedge_labels, wedge_colors = [], [], []
    others_total = 0
    for topic_id, n_docs in counts.items():
        if n_docs / total_docs >= threshold:
            wedge_sizes.append(n_docs)
            if topic_id == -1:
                wedge_labels.append(f'Noise\n({n_docs} docs)')
                wedge_colors.append('gray')
            else:
                wedge_labels.append(f'Topic {topic_id}\n({n_docs} docs)')
                wedge_colors.append(plt.cm.tab10(topic_id % 10))
        else:
            others_total += n_docs
    if others_total > 0:
        wedge_sizes.append(others_total)
        wedge_labels.append(f'Others\n({others_total} docs)')
        wedge_colors.append('lightgray')
    wedges, texts, autotexts = pie_ax.pie(wedge_sizes, labels=wedge_labels,
                                          autopct='%1.1f%%',
                                          colors=wedge_colors,
                                          startangle=90,
                                          pctdistance=0.85)
    # Style the pie chart labels
    for label_text in texts:
        label_text.set_fontsize(10)
    for pct_text in autotexts:
        pct_text.set_color('white')
        pct_text.set_fontsize(10)
        pct_text.set_weight('bold')
    pie_ax.set_title('Topic Distribution Percentage', fontsize=14, fontweight='bold')

    # Corpus-level statistics box in the bottom-left corner of the figure
    noise_count = counts.get(-1, 0)
    stats_text = f"Total Documents: {total_docs}\n"
    stats_text += f"Topics Identified: {len([t for t in counts.index if t != -1])}\n"
    stats_text += f"Noise Documents: {noise_count} ({noise_count / total_docs * 100:.1f}%)"
    fig.text(0.02, 0.02, stats_text, fontsize=10,
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    plt.tight_layout()
    plt.savefig(os.path.join(self.output_dir, 'topic_distribution.png'),
                dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ topic_distribution.png generated")
def plot_query_performance_details(self):
    """Generate query performance analysis.

    Builds a 2x2 figure from ``self.data['test_results']``: answer length
    per query, number of retrieved sources per query, response time per
    query, and an answer-length-vs-time scatter with a linear trend line.
    Saves the figure to ``query_performance_details.png``.
    """
    print("Generating query performance details...")
    results = self.data['test_results']

    def simplify_query(query_text):
        """Map a known query phrasing to a short chart label.

        Bug fix: the previous implementation only checked the topic keywords
        when the query contained the literal substring 'ChatGPT', so the
        AI/LLM-phrased test queries ("limitations of using AI in healthcare",
        "ethical considerations of AI in medicine", "large language models in
        radiology") fell through to a raw truncated label. Keywords are now
        matched unconditionally, in the same priority order as before.
        """
        keyword_to_label = [
            ('education', 'Medical Education'),
            ('accurate', 'Diagnostic Accuracy'),
            ('accuracy', 'Diagnostic Accuracy'),
            ('limitation', 'AI Limitations'),
            ('examination', 'Medical Exams'),
            ('bone tumor', 'Bone Tumor Diagnosis'),
            ('ethical', 'Ethical Considerations'),
            ('compare', 'Human vs AI'),
            ('radiology', 'Radiology Applications'),
        ]
        for keyword, label in keyword_to_label:
            if keyword in query_text:
                return label
        if 'ChatGPT' in query_text:
            # ChatGPT query that matched no known topic keyword
            return 'Other Query'
        return query_text[:20] + '...'

    # Per-query series used by all four panels
    queries = [simplify_query(r['query']) for r in results]
    answer_lengths = [len(r['answer'].split()) for r in results]
    source_counts = [len(r['sources']) for r in results]
    total_times = [r['times']['total'] for r in results]

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Query Performance Analysis', fontsize=16, fontweight='bold')

    # 1. Answer length analysis
    bars1 = ax1.bar(queries, answer_lengths, color='lightblue', edgecolor='black')
    ax1.set_ylabel('Answer Length (words)', fontsize=12)
    ax1.set_title('Answer Length by Query Type', fontsize=14, fontweight='bold')
    ax1.tick_params(axis='x', rotation=45)
    ax1.grid(axis='y', alpha=0.3)
    avg_length = np.mean(answer_lengths)
    ax1.axhline(y=avg_length, color='red', linestyle='--',
                label=f'Average: {avg_length:.0f} words')
    ax1.legend()
    for bar, length in zip(bars1, answer_lengths):
        ax1.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 2,
                 f'{length}', ha='center', va='bottom')

    # 2. Source document count
    bars2 = ax2.bar(queries, source_counts, color='lightgreen', edgecolor='black')
    ax2.set_ylabel('Number of Sources', fontsize=12)
    ax2.set_title('Retrieved Documents per Query', fontsize=14, fontweight='bold')
    ax2.tick_params(axis='x', rotation=45)
    ax2.grid(axis='y', alpha=0.3)
    ax2.set_ylim(0, max(source_counts) + 1)
    for bar, count in zip(bars2, source_counts):
        ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.1,
                 f'{count}', ha='center', va='bottom')

    # 3. Response time comparison; slower-than-average bars are recolored
    bars3 = ax3.bar(queries, total_times, color='lightyellow', edgecolor='black')
    ax3.set_ylabel('Response Time (seconds)', fontsize=12)
    ax3.set_title('Response Time by Query', fontsize=14, fontweight='bold')
    ax3.tick_params(axis='x', rotation=45)
    ax3.grid(axis='y', alpha=0.3)
    avg_time = np.mean(total_times)
    ax3.axhline(y=avg_time, color='red', linestyle='--',
                label=f'Average: {avg_time:.2f}s')
    for bar, elapsed in zip(bars3, total_times):  # 'elapsed' avoids shadowing builtin 'time'
        if elapsed > avg_time:
            bar.set_color('lightcoral')
        ax3.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.05,
                 f'{elapsed:.2f}', ha='center', va='bottom', fontsize=9)
    ax3.legend()

    # 4. Performance scatter plot (bubble size encodes source count)
    ax4.scatter(answer_lengths, total_times, s=np.array(source_counts) * 50,
                alpha=0.6, c=range(len(queries)), cmap='viridis')
    for i, query in enumerate(queries):
        ax4.annotate(query, (answer_lengths[i], total_times[i]),
                     xytext=(5, 5), textcoords='offset points', fontsize=8)
    ax4.set_xlabel('Answer Length (words)', fontsize=12)
    ax4.set_ylabel('Response Time (seconds)', fontsize=12)
    ax4.set_title('Answer Length vs Response Time (bubble size = source count)', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3)
    # Linear trend line; a degree-1 fit needs at least two points
    if len(answer_lengths) >= 2:
        z = np.polyfit(answer_lengths, total_times, 1)
        p = np.poly1d(z)
        ax4.plot(sorted(answer_lengths), p(sorted(answer_lengths)),
                 "r--", alpha=0.8, linewidth=2)

    plt.tight_layout()
    plt.savefig(os.path.join(self.output_dir, 'query_performance_details.png'),
                dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ query_performance_details.png generated")
def plot_answer_quality_analysis(self):
    """Generate answer quality analysis.

    Builds a 2x2 figure from ``self.data['test_results']``:
      1. answer structure (word count vs sentence count),
      2. average occurrences of citation-like features per answer,
      3. a polar chart of five heuristic quality scores,
      4. the distribution of answer word counts.

    Saves the figure to ``answer_quality_analysis.png`` in ``self.output_dir``.
    """
    print("Generating answer quality analysis...")
    results = self.data['test_results']
    # Analyze answer features: extract per-answer surface statistics used
    # by all four panels below.
    answer_features = []
    for r in results:
        answer = r['answer']
        features = {
            # Truncate long queries so they remain usable as labels.
            'query': r['query'][:30] + '...' if len(r['query']) > 30 else r['query'],
            'length': len(answer),  # character count
            'word_count': len(answer.split()),
            # Naive sentence split on '.'; abbreviations inflate this count.
            'sentence_count': len([s for s in answer.split('.') if s.strip()]),
            # Number of 'PMID' mentions -- proxy for explicit citations.
            'has_pmid': answer.count('PMID'),
            # Count of percentage figures like '12%' or '3.5%'.
            'has_percentage': len(re.findall(r'\d+(?:\.\d+)?%', answer)),
            # Count of four-digit years in 2000-2099.
            'has_year': len(re.findall(r'\b20\d{2}\b', answer)),
            'sources': len(r['sources'])
        }
        answer_features.append(features)
    # Create figure
    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('Answer Quality Analysis', fontsize=16, fontweight='bold')
    # 1. Answer structure analysis
    word_counts = [f['word_count'] for f in answer_features]
    sentence_counts = [f['sentence_count'] for f in answer_features]
    ax1.scatter(word_counts, sentence_counts, s=100, alpha=0.6, edgecolors='black')
    ax1.set_xlabel('Word Count', fontsize=12)
    ax1.set_ylabel('Sentence Count', fontsize=12)
    ax1.set_title('Answer Structure Analysis', fontsize=14, fontweight='bold')
    ax1.grid(True, alpha=0.3)
    # Add average sentence length line: y = x / avg_wps marks where answers
    # with the mean words-per-sentence would fall.
    avg_words_per_sentence = [w / s if s > 0 else 0 for w, s in zip(word_counts, sentence_counts)]
    avg_wps = np.mean([wps for wps in avg_words_per_sentence if wps > 0])
    x_range = np.array([0, max(word_counts)])
    ax1.plot(x_range, x_range / avg_wps, 'r--',
             label=f'Avg sentence length: {avg_wps:.1f} words')
    ax1.legend()
    # 2. Citation features
    has_pmid_counts = [f['has_pmid'] for f in answer_features]
    has_percentage_counts = [f['has_percentage'] for f in answer_features]
    has_year_counts = [f['has_year'] for f in answer_features]
    feature_names = ['PMID Citations', 'Percentage Data', 'Year References']
    feature_means = [
        np.mean(has_pmid_counts),
        np.mean(has_percentage_counts),
        np.mean(has_year_counts)
    ]
    bars = ax2.bar(feature_names, feature_means,
                   color=['lightblue', 'lightgreen', 'lightyellow'],
                   edgecolor='black')
    ax2.set_ylabel('Average Occurrences', fontsize=12)
    ax2.set_title('Citation Features in Answers', fontsize=14, fontweight='bold')
    ax2.grid(axis='y', alpha=0.3)
    # Add value labels
    for bar, mean in zip(bars, feature_means):
        ax2.text(bar.get_x() + bar.get_width() / 2., bar.get_height() + 0.05,
                 f'{mean:.2f}', ha='center', va='bottom')
    # 3. Quality metrics radar chart -- every score is a heuristic capped at 1.0.
    categories = ['Completeness', 'Accuracy', 'Citation Quality', 'Structure', 'Relevance']
    # Calculate average scores
    avg_scores = []
    for category in categories:
        if category == 'Completeness':
            # 250+ words counts as fully complete.
            scores = [min(f['word_count'] / 250, 1.0) for f in answer_features]
        elif category == 'Accuracy':
            # Proxy: density of concrete figures (percentages + PMIDs); 5+ = 1.0.
            scores = [min((f['has_percentage'] + f['has_pmid']) / 5, 1.0) for f in answer_features]
        elif category == 'Citation Quality':
            # 5+ retrieved sources earns a full score.
            scores = [min(f['sources'] / 5, 1.0) for f in answer_features]
        elif category == 'Structure':
            # Rewards answers whose sentences average <= ~20 words.
            scores = [min(f['sentence_count'] / (f['word_count'] / 20), 1.0) if f['word_count'] > 0 else 0
                      for f in answer_features]
        else:  # Relevance
            # NOTE(review): relevance is a hard-coded constant, not measured.
            scores = [0.85] * len(answer_features)
        avg_scores.append(np.mean(scores))
    # Plot radar chart
    angles = np.linspace(0, 2 * np.pi, len(categories), endpoint=False).tolist()
    avg_scores_plot = avg_scores + [avg_scores[0]]  # Close the plot
    angles += angles[:1]
    # NOTE(review): this creates a polar axes at grid position 223, on top of
    # the cartesian ax3 returned by plt.subplots above -- relies on pyplot
    # state; confirm behavior is stable across matplotlib versions.
    ax3 = plt.subplot(223, projection='polar')
    ax3.plot(angles, avg_scores_plot, 'o-', linewidth=2, color='purple')
    ax3.fill(angles, avg_scores_plot, alpha=0.25, color='purple')
    ax3.set_xticks(angles[:-1])
    ax3.set_xticklabels(categories)
    ax3.set_ylim(0, 1.0)
    ax3.set_title('Answer Quality Score', y=1.08, fontsize=14, fontweight='bold')
    ax3.grid(True)
    # Add score labels
    for angle, score, category in zip(angles[:-1], avg_scores, categories):
        ax3.text(angle, score + 0.05, f'{score:.2f}',
                 ha='center', va='center', fontsize=9)
    # 4. Answer length distribution
    ax4.boxplot([word_counts], labels=['Answer Word Count'], patch_artist=True,
                boxprops=dict(facecolor='lightblue', alpha=0.7),
                showmeans=True)
    # Add individual points with horizontal jitter (unseeded randomness, so
    # point placement differs between runs).
    y_pos = np.random.normal(1, 0.04, len(word_counts))
    ax4.scatter(y_pos, word_counts, alpha=0.5, s=30)
    ax4.set_ylabel('Word Count', fontsize=12)
    ax4.set_title('Answer Length Distribution', fontsize=14, fontweight='bold')
    ax4.grid(axis='y', alpha=0.3)
    # Add statistics
    stats_text = f"Mean: {np.mean(word_counts):.0f} words\n"
    stats_text += f"Median: {np.median(word_counts):.0f} words\n"
    stats_text += f"Std Dev: {np.std(word_counts):.0f} words"
    ax4.text(0.02, 0.98, stats_text, transform=ax4.transAxes,
             fontsize=10, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))
    plt.tight_layout()
    plt.savefig(os.path.join(self.output_dir, 'answer_quality_analysis.png'),
                dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ answer_quality_analysis.png generated")
def plot_system_efficiency(self):
    """Generate system efficiency analysis.

    Pools timing/resource numbers from the saved evaluation metrics and the
    recorded test-query runs into one flat dict, then renders a 2x2 figure
    (time metrics, resource usage, per-query trend, text summary) and saves
    it to ``system_efficiency_analysis.png``.
    """
    print("Generating system efficiency analysis...")

    # Merge efficiency numbers from both available sources into `stats`.
    stats = {}
    if 'eval_metrics' in self.data:
        metrics = self.data['eval_metrics']
        if 'efficiency_metrics' in metrics:
            stats.update(metrics['efficiency_metrics'])
        if 'generation_metrics' in metrics:
            stats.update(metrics['generation_metrics'])
    if 'test_results' in self.data:
        runs = self.data['test_results']
        search_times = [r['times']['search'] for r in runs]
        gen_times = [r['times']['generation'] for r in runs]
        total_times = [r['times']['total'] for r in runs]
        stats.update({
            'avg_search_time': np.mean(search_times),
            'avg_generation_time': np.mean(gen_times),
            'avg_total_time': np.mean(total_times),
            'min_response_time': min(total_times),
            'max_response_time': max(total_times)
        })
    if not stats:
        print("✗ No efficiency data found")
        return

    fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
    fig.suptitle('System Efficiency Analysis', fontsize=16, fontweight='bold')

    # 1. Time efficiency metrics
    if 'avg_search_time' in stats:
        time_metrics = {
            'Avg Search Time': stats.get('avg_search_time', 0),
            'Avg Generation Time': stats.get('avg_generation_time', 0),
            'Avg Total Time': stats.get('avg_total_time', 0),
            'Fastest Response': stats.get('min_response_time', 0),
            'Slowest Response': stats.get('max_response_time', 0)
        }
        time_bars = ax1.bar(time_metrics.keys(), time_metrics.values(),
                            color=['lightblue', 'lightgreen', 'lightyellow', 'lightcoral', 'orange'])
        ax1.set_ylabel('Time (seconds)', fontsize=12)
        ax1.set_title('Time Efficiency Metrics', fontsize=14, fontweight='bold')
        ax1.tick_params(axis='x', rotation=45)
        ax1.grid(axis='y', alpha=0.3)
        for rect, seconds in zip(time_bars, time_metrics.values()):
            ax1.text(rect.get_x() + rect.get_width() / 2., rect.get_height() + 0.05,
                     f'{seconds:.2f}', ha='center', va='bottom')

    # 2. Resource usage (scaled so all bars share one axis)
    resources = {}
    if 'gpu_memory_gb' in stats:
        resources['GPU Memory (GB)'] = stats['gpu_memory_gb']
    if 'gpu_total_gb' in stats:
        resources['GPU Total (GB)'] = stats['gpu_total_gb']
    if 'index_size_mb' in stats:
        resources['Index Size (MB/100)'] = stats['index_size_mb'] / 100
    if 'num_documents' in stats:
        resources['Documents (100s)'] = stats['num_documents'] / 100
    if resources:
        ax2.bar(resources.keys(), resources.values(),
                color=['skyblue', 'lightblue', 'lightgreen', 'lightyellow'])
        ax2.set_ylabel('Resource Usage', fontsize=12)
        ax2.set_title('System Resource Utilization', fontsize=14, fontweight='bold')
        ax2.tick_params(axis='x', rotation=45)
        ax2.grid(axis='y', alpha=0.3)

    # 3. Performance trend across the test queries
    if 'test_results' in self.data:
        runs = self.data['test_results']
        x = list(range(len(runs)))
        search_times = [r['times']['search'] for r in runs]
        gen_times = [r['times']['generation'] for r in runs]
        ax3.plot(x, search_times, 'o-', label='Search Time', linewidth=2)
        ax3.plot(x, gen_times, 's-', label='Generation Time', linewidth=2)
        ax3.set_xlabel('Query Index', fontsize=12)
        ax3.set_ylabel('Time (seconds)', fontsize=12)
        ax3.set_title('Query Performance Trend', fontsize=14, fontweight='bold')
        ax3.legend()
        ax3.grid(True, alpha=0.3)
        # Overlay a short moving average when there are enough points
        window = min(3, len(runs) // 2)
        if window > 1:
            ax3.plot(x, pd.Series(search_times).rolling(window=window).mean(),
                     '--', color='blue', alpha=0.5)
            ax3.plot(x, pd.Series(gen_times).rolling(window=window).mean(),
                     '--', color='orange', alpha=0.5)

    # 4. Efficiency summary rendered as a text panel
    summary_text = "System Efficiency Summary\n" + "=" * 25 + "\n\n"
    if 'avg_total_time' in stats:
        summary_text += f"Average Response Time: {stats['avg_total_time']:.2f}s\n"
    if 'avg_answer_length' in stats:
        summary_text += f"Average Answer Length: {stats['avg_answer_length']:.0f} words\n"
    if 'num_documents' in stats:
        summary_text += f"Indexed Documents: {stats['num_documents']}\n"
    if 'embedding_dim' in stats:
        summary_text += f"Embedding Dimension: {stats['embedding_dim']}\n"
    # Queries-per-hour estimate from the average response time
    if 'avg_total_time' in stats and stats['avg_total_time'] > 0:
        throughput = 3600 / stats['avg_total_time']
        summary_text += f"\nEstimated Throughput: {throughput:.0f} queries/hour"
    ax4.text(0.1, 0.9, summary_text, transform=ax4.transAxes,
             fontsize=12, verticalalignment='top',
             bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.3))
    ax4.axis('off')

    plt.tight_layout()
    plt.savefig(os.path.join(self.output_dir, 'system_efficiency_analysis.png'),
                dpi=300, bbox_inches='tight')
    plt.close()
    print("✓ system_efficiency_analysis.png generated")
def generate_summary_report(self):
    """Generate a detailed plain-text evaluation summary.

    Aggregates dataset statistics, timing metrics, stored evaluation
    metrics, one example query result, and simple optimization
    recommendations from ``self.data``, writes the text to
    ``evaluation_report.txt`` in ``self.output_dir``, and returns it.

    Returns:
        str: the full report text.
    """
    print("Generating summary report...")
    report = "Medical Literature RAG System Evaluation Report\n"
    report += "=" * 50 + "\n"
    report += f"Generated: {pd.Timestamp.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n"
    # 1. Dataset statistics
    report += "1. Dataset Statistics\n"
    report += "-" * 30 + "\n"
    if 'clusters' in self.data:
        clusters_df = self.data['clusters']
        total_docs = len(clusters_df)
        # Count only real topics: cluster id -1 marks noise documents.
        # (The old code reported len(unique) - 1, which undercounted by one
        # whenever no noise cluster existed; this also matches how
        # plot_topic_distribution counts topics.)
        n_topics = len([c for c in clusters_df['cluster'].unique() if c != -1])
        noise_docs = len(clusters_df[clusters_df['cluster'] == -1])
        report += f"- Total Documents: {total_docs}\n"
        report += f"- Topics Identified: {n_topics}\n"
        if total_docs > 0:  # guard division on an empty cluster table
            report += f"- Noise Documents: {noise_docs} ({noise_docs / total_docs * 100:.1f}%)\n"
    # 2. Performance metrics
    report += "\n2. System Performance Metrics\n"
    report += "-" * 30 + "\n"
    if 'test_results' in self.data and self.data['test_results']:
        results = self.data['test_results']
        search_times = [r['times']['search'] for r in results]
        gen_times = [r['times']['generation'] for r in results]
        total_times = [r['times']['total'] for r in results]
        answer_lengths = [len(r['answer'].split()) for r in results]
        report += f"- Average Search Time: {np.mean(search_times):.3f}s\n"
        report += f"- Average Generation Time: {np.mean(gen_times):.3f}s\n"
        report += f"- Average Total Response Time: {np.mean(total_times):.3f}s\n"
        report += f"- Fastest Response: {min(total_times):.3f}s\n"
        report += f"- Slowest Response: {max(total_times):.3f}s\n"
        report += f"- Average Answer Length: {np.mean(answer_lengths):.0f} words\n"
    # 3. Evaluation results
    if 'eval_metrics' in self.data:
        report += "\n3. Evaluation Metrics\n"
        report += "-" * 30 + "\n"
        if 'generation_metrics' in self.data['eval_metrics']:
            gen_metrics = self.data['eval_metrics']['generation_metrics']
            for key, value in gen_metrics.items():
                report += f"- {key}: {value:.3f}\n"
        if 'efficiency_metrics' in self.data['eval_metrics']:
            eff_metrics = self.data['eval_metrics']['efficiency_metrics']
            report += f"\nResource Usage:\n"
            for key, value in eff_metrics.items():
                # Some efficiency values are counts/strings, not floats.
                if isinstance(value, float):
                    report += f"- {key}: {value:.3f}\n"
                else:
                    report += f"- {key}: {value}\n"
    # 4. Test query results
    report += "\n4. Test Query Example\n"
    report += "-" * 30 + "\n"
    if 'test_results' in self.data and len(self.data['test_results']) > 0:
        first_result = self.data['test_results'][0]
        report += f"Query: {first_result['query']}\n"
        report += f"Answer Preview: {first_result['answer'][:200]}...\n"
        report += f"Sources Used: {len(first_result['sources'])}\n"
        report += f"Response Time: {first_result['times']['total']:.3f}s\n"
    # 5. Recommendations
    report += "\n5. Optimization Recommendations\n"
    report += "-" * 30 + "\n"
    if 'test_results' in self.data and self.data['test_results']:
        avg_time = np.mean([r['times']['total'] for r in self.data['test_results']])
        if avg_time > 3:
            report += "- Consider optimizing model loading and inference speed\n"
        if np.mean([len(r['answer'].split()) for r in self.data['test_results']]) < 150:
            report += "- Consider increasing answer detail and comprehensiveness\n"
    report += "- Implement caching for frequently asked queries\n"
    report += "- Add more diverse test queries for comprehensive evaluation\n"
    # Save report
    report_path = os.path.join(self.output_dir, 'evaluation_report.txt')
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(report)
    print(f"✓ Evaluation report saved to: {report_path}")
    return report
# ============================================================================
# Main Pipeline
# ============================================================================
class MedicalLiteratureRAGPipeline:
    """Main pipeline orchestrating all components.

    Wires together data loading, topic modeling, dataset export, RAG index
    construction, demo queries, and evaluation into one entry point.
    """

    def __init__(self, config: Config):
        # The RAG system and evaluator are created lazily: they need data
        # produced by earlier pipeline stages.
        self.config = config
        self.processor = MedicalDataProcessor(config)
        self.topic_modeler = MedicalTopicModeler(config)
        self.rag_system = None
        self.evaluator = None

    def run_complete_pipeline(self,
                              excel_path: str,
                              hf_token: Optional[str] = None,
                              hf_repo: Optional[str] = None,
                              run_evaluation: bool = True):
        """Execute the six pipeline stages end to end.

        Args:
            excel_path: path to the source Excel workbook.
            hf_token: optional Hugging Face token for the dataset upload.
            hf_repo: optional Hub repo id for the dataset upload.
            run_evaluation: skip the final evaluation stage when False.
        """
        banner = "=" * 80
        print(banner)
        print("Medical Literature RAG Pipeline")
        print(banner)

        # Step 1: load, clean, and persist the literature records
        print("\n[Step 1/6] Loading and processing data...")
        df = self.processor.load_and_clean_excel(excel_path)
        records = self.processor.prepare_records(df)
        self.processor.save_metadata(records)

        # Step 2: cluster records into topics
        print("\n[Step 2/6] Performing topic modeling...")
        topics, topic_model = self.topic_modeler.fit_topics(records)

        # Step 3: export the dataset (and optionally push to the Hub)
        print("\n[Step 3/6] Creating dataset...")
        self._create_dataset(records, hf_token, hf_repo)

        # Step 4: build the retrieval index
        print("\n[Step 4/6] Building RAG system...")
        self.rag_system = MedicalRAGSystem(self.config)
        self.rag_system.build_index(records)

        # Step 5: exercise the system with the demo queries
        print("\n[Step 5/6] Running test queries...")
        self._run_test_queries()

        # Step 6: optional evaluation pass
        if run_evaluation:
            print("\n[Step 6/6] Running evaluation...")
            self._run_evaluation()

        print("\n" + banner)
        print("Pipeline completed successfully!")
        print(f"All results saved to: {self.config.OUTPUT_DIR}")
        print(banner)

    def _create_dataset(self, records: List[Dict], hf_token: Optional[str], hf_repo: Optional[str]):
        """Normalize record field types, export a CSV, and optionally upload."""
        # Coerce every record into dataset-friendly scalar types.
        for record in records:
            # Cluster id must be a plain int; missing/None becomes -1 (noise).
            cluster_val = record.get('cluster')
            record['cluster'] = -1 if cluster_val is None else int(cluster_val)
            # Text columns: None/NaN becomes '', everything else is stringified.
            for field in ('pmid', 'title', 'journal', 'mesh', 'keywords', 'abstract', 'doi'):
                value = record.get(field, '')
                record[field] = '' if value is None or pd.isna(value) else str(value)
            # Publication year: missing/NaN becomes 0, otherwise int.
            year_val = record.get('year', 0)
            record['year'] = 0 if year_val is None or pd.isna(year_val) else int(year_val)

        dataset = Dataset.from_list(records)
        dataset = dataset.class_encode_column('cluster')

        # Local CSV export (utf-8-sig so spreadsheet apps decode it correctly)
        export_path = os.path.join(self.config.OUTPUT_DIR, 'medllm_full_dataset.csv')
        dataset.to_pandas().to_csv(export_path, index=False, encoding='utf-8-sig')
        print(f"Dataset saved to: {export_path}")

        # Optional Hugging Face upload; failures are reported but non-fatal.
        if hf_token and hf_repo:
            try:
                print(f"\nUploading dataset to Hugging Face...")
                login(token=hf_token)
                dataset.push_to_hub(hf_repo, private=False)
                print(f"Dataset pushed to https://huggingface.co/datasets/{hf_repo}")
            except Exception as e:
                print(f"Warning: Could not upload to Hugging Face: {e}")

    def _run_test_queries(self):
        """Run the predefined demo queries and persist the results as JSON."""
        test_queries = [
            "What are the applications of ChatGPT in medical education?",
            "How accurate is ChatGPT in medical diagnosis?",
            "What are the limitations of using AI in healthcare?",
            "ChatGPT's performance in medical examinations",
            "Can ChatGPT help with bone tumor diagnosis?",
            "What are the ethical considerations of AI in medicine?",
            "How does ChatGPT compare to human doctors in diagnosis?",
            "Applications of large language models in radiology"
        ]
        separator = "-" * 80
        collected = []
        print("\nRunning test queries...")
        print(separator)
        for query in test_queries:
            print(f"\nQuery: {query}")
            result = self.rag_system.qa_pipeline(query)
            print(f"\nAnswer:\n{result['answer']}")
            print(f"\nBased on {len(result['sources'])} sources:")
            # Show the top three supporting documents
            for rank, source in enumerate(result['sources'][:3], start=1):
                print(f" [{rank}] PMID {source['pmid']} ({source['year']}) - {source['title'][:60]}...")
            print(f"\nTiming: Search {result['times']['search']:.2f}s, "
                  f"Generation {result['times']['generation']:.2f}s")
            print(separator)
            collected.append(result)

        # Persist the raw results for the evaluation/plotting stage
        output_path = os.path.join(self.config.OUTPUT_DIR, 'test_query_results.json')
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(collected, f, indent=2, ensure_ascii=False)

    def _run_evaluation(self):
        """Evaluate generation quality and efficiency, then render the plots."""
        self.evaluator = RAGEvaluator(self.rag_system, self.config)
        # A smaller query set suffices for the generation-quality metrics.
        eval_queries = [
            "What are the applications of ChatGPT in medical education?",
            "How accurate is ChatGPT in medical diagnosis?",
            "What are the limitations of using AI in healthcare?",
            "ChatGPT's performance in medical examinations",
            "Can ChatGPT help with bone tumor diagnosis?"
        ]
        gen_metrics = self.evaluator.evaluate_generation(eval_queries)
        print("\nGeneration Metrics:")
        for name, score in gen_metrics.items():
            print(f" {name}: {score:.3f}")

        eff_metrics = self.evaluator.evaluate_efficiency()
        print("\nEfficiency Metrics:")
        for name, score in eff_metrics.items():
            print(f" {name}: {score:.3f}")

        self.evaluator.save_evaluation_results()

        # Render the full evaluation plot suite plus the text report
        print("\nGenerating evaluation plots...")
        plotter = RealEvaluationPlotter(self.config.OUTPUT_DIR)
        plotter.generate_all_plots()
        plotter.generate_summary_report()
# ============================================================================
# Main Execution
# ============================================================================
def main():
    """Entry point: build the pipeline from the default Config and run it."""
    cfg = Config()
    rag_pipeline = MedicalLiteratureRAGPipeline(cfg)
    # Execute every stage, including the Hugging Face upload and the
    # final evaluation pass.
    rag_pipeline.run_complete_pipeline(
        excel_path=cfg.EXCEL_PATH,
        hf_token=cfg.HF_TOKEN,
        hf_repo=cfg.HF_REPO,
        run_evaluation=True
    )
    # Report GPU memory consumption when CUDA is present.
    if torch.cuda.is_available():
        print(f"\nFinal GPU Memory Usage: {torch.cuda.memory_allocated() / 1e9:.2f} GB")
# Script entry point: run the complete pipeline when executed directly.
if __name__ == "__main__":
    main()