Spaces:

softblackhole
/

rag-evaluation-system

Sleeping

soft.engineer

add setting tab

58259d1 4 months ago

38.7 kB

	import gradio as gr
	import os
	import logging
	from dotenv import load_dotenv
	import tempfile
	import pandas as pd
	from typing import List, Dict, Any, Tuple
	import shutil
	import json
	from core.ingest import DocumentChunker, HierarchyManager
	from core.index import VectorStore
	from core.retrieval import RAGManager
	from core.eval import RAGEvaluator
	from core.utils import generate_id
	import os as _os
	_OPENAI_ON = False
	try:
	from openai import OpenAI as _OpenAI
	_OPENAI_ON = True if _os.getenv("OPENAI_API_KEY") else False
	except Exception:
	_OPENAI_ON = False

	def _auto_detect_query_filters(query: str) -> Dict[str, Any]:
	"""Infer hierarchy filters (level1/2/3, doc_type) from a query.
	Prefers OpenAI; falls back to heuristic scan of hierarchy keywords.
	"""
	# Try OpenAI if available
	if os.getenv("OPENAI_API_KEY") and _OpenAI is not None:
	try:
	logger.debug("Calling OpenAI for query filter detection.")
	client = _OpenAI()
	prompt = (
	"Given a user query, infer optional filters for a hierarchical RAG system."
	" Return JSON with any of: level1, level2, level3, doc_type."
	" Use concise strings; omit fields you cannot infer. Query: " + query
	)
	resp = client.chat.completions.create(
	model=_os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
	messages=[{"role": "user", "content": prompt}],
	temperature=0.0,
	)
	import json as _json
	content = resp.choices[0].message.content
	data = _json.loads(content)
	logger.debug(f"OpenAI filters inferred: {data}")
	if isinstance(data, dict) and any(data.get(k) for k in ("level1","level2","level3","doc_type")):
	return data
	except Exception:
	logger.exception("OpenAI query filter detection failed; will try heuristic.")

	# Heuristic fallback
	try:
	hm = HierarchyManager()
	ql = query.lower()
	best = {"level1": None, "level2": None, "level3": None, "doc_type": None}
	best_score = -1
	for hname, hdef in hm.hierarchies.items():
	for l1 in hdef['levels']['level1']['values']:
	score = 0
	if l1.lower() in ql:
	score += 2
	for l2 in hdef['levels']['level2']['values'].get(l1, []):
	if l2.lower() in ql:
	score += 2
	for l3 in hdef['levels']['level3']['values'].get(l2, []):
	if l3.lower() in ql:
	score += 1
	if score > best_score:
	best_score = score
	best.update({"level1": l1})
	for dt in ["Policy","Manual","FAQ","Report","Note","Guideline"]:
	if dt.lower() in ql:
	best["doc_type"] = dt
	break
	logger.debug(f"Heuristic filters inferred: {best if best_score>0 or best.get('doc_type') else {}}")
	return best if best_score > 0 or best.get("doc_type") else {}
	except Exception:
	logger.debug("Heuristic detection failed; returning no filters.")
	return {}

	# Load environment variables from .env if present, then configure logging
	load_dotenv()
	logging.basicConfig(level=os.getenv("LOG_LEVEL", "INFO"))
	logger = logging.getLogger("rag_app")
	if os.getenv("OPENAI_API_KEY"):
	logger.info("OpenAI API key detected. OpenAI-powered auto-detection is ENABLED.")
	if os.getenv("OPENAI_MODEL"):
	logger.info(f"OpenAI model: {os.getenv('OPENAI_MODEL')}")
	else:
	logger.info("OpenAI API key not set. Falling back to heuristic auto-detection.")

	# Global variables
	rag_manager = None
	evaluator = None
	current_collection = "documents"
	persist_directory = None

	def initialize_system():
	"""Initialize the RAG system"""
	global rag_manager, evaluator, persist_directory

	# Try /data/chroma first (for HF Spaces), fallback to ./chroma_data
	persist_dir = "/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data"

	# Create directory with proper permissions, and check if we can write to it
	try:
	os.makedirs(persist_dir, exist_ok=True, mode=0o755)
	# Test write permissions
	test_file = os.path.join(persist_dir, ".test_write")
	try:
	with open(test_file, 'w') as f:
	f.write("test")
	os.remove(test_file)
	except (PermissionError, OSError):
	# If can't write to /data/chroma, use ./chroma_data
	persist_dir = "./chroma_data"
	os.makedirs(persist_dir, exist_ok=True, mode=0o755)
	except (PermissionError, OSError) as e:
	# If even ./chroma_data fails, try current directory
	persist_dir = "./chroma_data"
	os.makedirs(persist_dir, exist_ok=True, mode=0o755)

	persist_directory = persist_dir

	rag_manager = RAGManager(persist_directory=persist_dir)
	evaluator = RAGEvaluator(rag_manager)

	return f"System initialized successfully! Using persist directory: {persist_dir}"

	def reset_index() -> str:
	"""Clear Chroma persistence and reinitialize the vector store."""
	global rag_manager, evaluator, persist_directory
	try:
	dir_path = persist_directory or ("/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data")
	if os.path.exists(dir_path):
	shutil.rmtree(dir_path, ignore_errors=True)
	os.makedirs(dir_path, exist_ok=True, mode=0o755)
	persist_directory = dir_path
	rag_manager = RAGManager(persist_directory=dir_path)
	evaluator = RAGEvaluator(rag_manager)
	return f"Index reset complete. Using fresh directory: {dir_path}"
	except Exception as ex:
	return f"Failed to reset index: {ex}"

	def upload_documents(files: List[str], hierarchy: str, doc_type: str, language: str, progress: Any = None) -> Tuple[str, List[Dict[str, Any]], Dict[str, Any], List[Dict[str, Any]]]:
	"""Upload and process documents.
	Returns: (status_text, per_file_summaries, collection_stats)
	per_file_summaries: [{filename, chunks, language, doc_type, hierarchy}]
	"""
	global rag_manager, persist_directory

	if not files:
	return "No files provided!"

	# Ensure system is initialized
	if not rag_manager:
	initialize_system()

	chunker = DocumentChunker()
	all_chunks = []
	per_file_summaries: List[Dict[str, Any]] = []
	chunk_rows: List[Dict[str, Any]] = []

	processed_count = 0
	errors: List[str] = []
	total = len(files)

	for idx, file_path in enumerate(files, start=1):
	if progress:
	try:
	progress(idx, total=total, desc=f"Processing {idx}/{total}: {os.path.basename(file_path)}")
	except Exception:
	pass
	try:
	chunks = chunker.chunk_document(file_path, hierarchy, doc_type, language)
	if chunks:
	# Aggregate per-file metadata from chunks (majority vote)
	from collections import Counter
	langs = Counter([c.metadata.get('lang') for c in chunks if c.metadata.get('lang')])
	docts = Counter([c.metadata.get('doc_type') for c in chunks if c.metadata.get('doc_type')])
	# prefer explicit 'hierarchy' if present, else most common level1's hierarchy name is unknown
	hier_names = Counter([c.metadata.get('hierarchy') for c in chunks if c.metadata.get('hierarchy')])
	per_file_summaries.append({
	'Filename': os.path.basename(file_path),
	'Chunks': len(chunks),
	'Language': (langs.most_common(1)[0][0] if langs else None),
	'Doc Type': (docts.most_common(1)[0][0] if docts else None),
	'Hierarchy': (hier_names.most_common(1)[0][0] if hier_names else None)
	})
	# Prepare per-chunk preview rows
	for c in chunks:
	md = c.metadata or {}
	chunk_rows.append({
	'Filename': os.path.basename(file_path),
	'Level1': md.get('level1'),
	'Level2': md.get('level2'),
	'Level3': md.get('level3'),
	'Doc Type': md.get('doc_type'),
	'Language': md.get('lang'),
	'Preview': (c.content[:160] + '...') if c.content else ''
	})
	# Log indexing summary per file (without AI percentage)
	try:
	logger.info("Indexed %s: chunks=%d, lang=%s, doc_type=%s, hierarchy=%s",
	os.path.basename(file_path), len(chunks),
	(langs.most_common(1)[0][0] if langs else None),
	(docts.most_common(1)[0][0] if docts else None),
	(hier_names.most_common(1)[0][0] if hier_names else None))
	except Exception:
	pass
	all_chunks.extend(chunks)
	processed_count += 1
	else:
	errors.append(f"Warning: {os.path.basename(file_path)} produced no chunks")
	except Exception as e:
	error_msg = f"{os.path.basename(file_path)}: {str(e)}"
	errors.append(error_msg)
	# Continue processing other files instead of stopping

	# Index if any chunk present
	vector_store = rag_manager.vector_store
	stats: Dict[str, Any] = {"document_count": 0, "collection_name": current_collection}
	if all_chunks:
	vector_store.add_documents(current_collection, all_chunks)
	stats = vector_store.get_collection_stats(current_collection)

	# Build result message
	status_lines = [
	f"Processed {processed_count}/{total} files",
	f"Indexed chunks: {len(all_chunks)}"
	]
	if errors:
	status_lines.append("\nErrors/Warnings:\n" + "\n".join(f"- {e}" for e in errors))

	if not all_chunks:
	return "\n".join(status_lines), per_file_summaries, stats, chunk_rows

	return "\n".join(status_lines), per_file_summaries, stats, chunk_rows

	def build_rag_index(files: List[Any], hierarchy: str, doc_type: str, language: str) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
	"""Build RAG index from uploaded files"""
	if not files:
	return "No files provided!", None

	# Gradio file objects already contain the full path in .name property
	# No need to prepend /tmp/ - just use the path directly
	file_paths = []
	for file in files:
	# Get the file path - Gradio provides it as .name or as a string
	if isinstance(file, str):
	file_path = file
	elif hasattr(file, 'name') and file.name:
	# file.name already contains the full path (e.g., /tmp/gradio/.../filename.txt)
	file_path = file.name
	else:
	# Fallback for edge cases
	return f"Error: Unable to get file path from uploaded file", None

	# Normalize the path to handle any double slashes
	file_path = os.path.normpath(file_path)

	# Ensure the file exists
	if not os.path.exists(file_path):
	return f"Error: File not found at {file_path}", None

	file_paths.append(file_path)

	# Normalize "Auto" to None for auto-detection downstream
	norm_hierarchy = None if not hierarchy or str(hierarchy).lower() == 'auto' else hierarchy
	norm_doc_type = None if not doc_type or str(doc_type).lower() == 'auto' else doc_type
	norm_language = None if not language or str(language).lower() == 'auto' else language

	# Process documents (progress provided by gradio)
	status_text, file_summaries, stats, chunk_rows = upload_documents(file_paths, norm_hierarchy, norm_doc_type, norm_language, progress=None)

	# Build per-file dataframe (no totals row)
	per_file_df = pd.DataFrame(file_summaries) if file_summaries else pd.DataFrame(columns=['Filename','Chunks','Language','Doc Type','Hierarchy'])

	chunks_df = pd.DataFrame(chunk_rows) if chunk_rows else pd.DataFrame(columns=['Filename','Level1','Level2','Level3','Doc Type','Language','Preview'])
	return status_text, per_file_df, chunks_df

	def search_documents(query: str, k: int, level1: str, level2: str, level3: str, doc_type: str) -> Tuple[str, str, str, pd.DataFrame]:
	"""Search documents using both RAG pipelines"""
	if not rag_manager:
	return "System not initialized!", "", "", None

	# Convert empty strings to None
	level1 = level1 if level1 else None
	level2 = level2 if level2 else None
	level3 = level3 if level3 else None
	doc_type = doc_type if doc_type else None

	# Auto-detect filters from query if none provided
	if not any([level1, level2, level3, doc_type]):
	inferred = _auto_detect_query_filters(query)
	level1 = level1 or inferred.get('level1')
	level2 = level2 or inferred.get('level2')
	level3 = level3 or inferred.get('level3')
	doc_type = doc_type or inferred.get('doc_type')

	base_result, hier_result = rag_manager.compare_retrieval(query, k, level1, level2, level3, doc_type)

	# Prepare results for display
	results_data = []
	for i, source in enumerate(base_result.sources):
	results_data.append({
	'Pipeline': 'Base-RAG',
	'Rank': i + 1,
	'Content': source['content'][:100] + '...',
	'Domain': source['metadata'].get('level1', 'N/A'),
	'Section': source['metadata'].get('level2', 'N/A'),
	'Topic': source['metadata'].get('level3', 'N/A'),
	'Score': f"{source['score']:.3f}"
	})

	for i, source in enumerate(hier_result.sources):
	results_data.append({
	'Pipeline': 'Hier-RAG',
	'Rank': i + 1,
	'Content': source['content'][:100] + '...',
	'Domain': source['metadata'].get('level1', 'N/A'),
	'Section': source['metadata'].get('level2', 'N/A'),
	'Topic': source['metadata'].get('level3', 'N/A'),
	'Score': f"{source['score']:.3f}"
	})

	results_df = pd.DataFrame(results_data)

	comparison_text = f"""
	## Retrieval Comparison

	### Base-RAG
	- Latency: {base_result.latency:.3f}s
	- Retrieved: {len(base_result.sources)} documents

	### Hier-RAG
	- Latency: {hier_result.latency:.3f}s
	- Retrieved: {len(hier_result.sources)} documents
	- Filters: level1={level1 or 'None'}, level2={level2 or 'None'}, level3={level3 or 'None'}, doc_type={doc_type or 'None'}
	"""

	return base_result.content, hier_result.content, comparison_text, results_df

	def search_documents_auto(query: str) -> Tuple[str, str, str, pd.DataFrame]:
	"""Search with only a query; k and filters auto-handled.
	- k defaults to DEFAULT_SEARCH_K env or 5
	- filters inferred from query when possible
	"""
	default_k = int(os.getenv("DEFAULT_SEARCH_K", 5))
	return search_documents(query, default_k, None, None, None, None)

	def search_documents_unified(query: str, manual: bool, k: int = None,
	level1: str = None, level2: str = None,
	level3: str = None, doc_type: str = None) -> Tuple[str, str, str, pd.DataFrame]:
	"""Single entry search. If manual is True, use provided controls; else auto."""
	if manual:
	# normalize empty/"Auto" strings to None (auto-detect)
	def _norm(v):
	return None if (v is None or (isinstance(v, str) and v.strip().lower() == 'auto') or v == '') else v
	level1 = _norm(level1)
	level2 = _norm(level2)
	level3 = _norm(level3)
	doc_type = _norm(doc_type)
	k = k or int(os.getenv("DEFAULT_SEARCH_K", 5))
	return search_documents(query, k, level1, level2, level3, doc_type)
	return search_documents_auto(query)

	def _toggle_manual_controls(manual: bool):
	"""Update manual controls interactive state, accordion, and default values."""
	auto_update = gr.update(value="Auto", interactive=manual)
	return (
	gr.update(open=manual),
	gr.update(interactive=manual), # k slider
	auto_update, # level1
	auto_update, # level2
	auto_update, # level3
	auto_update, # doc_type
	)

	def _llm_answer(user_message: str, contexts: List[Dict[str, Any]]) -> str:
	"""Generate a natural, human-like answer grounded in contexts.
	Uses OpenAI if configured; otherwise produces a conversational fallback.
	"""
	# Build a compact context string
	ctx_blocks = []
	for i, c in enumerate(contexts, 1):
	src = c.get('metadata', {}).get('source_name', 'unknown')
	snippet = (c.get('content', '') or '')[:400]
	ctx_blocks.append(f"[{i}] ({src}) {snippet}")
	ctx_text = "\n\n".join(ctx_blocks)

	# Prefer OpenAI
	if os.getenv("OPENAI_API_KEY") and '_OpenAI' in globals() and _OpenAI is not None:
	try:
	client = _OpenAI()
	system_prompt = (
	"You are a helpful, professional assistant. Answer in a warm, natural, and concise tone. "
	"ALWAYS ground the answer ONLY in the provided contexts. If information is missing, say so. "
	"Style: Start with a clear 1-2 sentence answer. Then, if helpful, add 2-5 short bullet points with key facts. "
	"Avoid hedging, avoid citations inline, avoid repeating the question."
	)
	content = (
	f"User question:\n{user_message}\n\n"
	f"Contexts (each begins with [n]):\n{ctx_text}\n\n"
	"Write a natural, human-like response as specified."
	)
	resp = client.chat.completions.create(
	model=os.getenv("OPENAI_MODEL", "gpt-4o-mini"),
	messages=[
	{"role": "system", "content": system_prompt},
	{"role": "user", "content": content},
	],
	temperature=0.2,
	)
	return resp.choices[0].message.content.strip()
	except Exception:
	pass

	# Fallback heuristic: conversational synthesis from top snippets
	if contexts:
	snippets = []
	for src in contexts[:3]:
	txt = (src.get('content') or '').strip()
	if txt:
	snippets.append(txt)
	joined = " ".join(snippets)
	# Naive sentence split
	sentences = [s.strip() for s in joined.replace('\n', ' ').split('.') if s.strip()]
	bullets = sentences[:4]
	lead = "Here’s what the documents say:"
	if bullets:
	bullets_text = "\n".join([f"- {b}." for b in bullets])
	return f"{lead}\n\n{bullets_text}"
	return "I don't have enough information to answer that yet."


	def chat_with_rag(message: str, history: List[Dict[str, str]], pipeline: str, k: int) -> Tuple[str, List[Dict[str, str]]]:
	"""Chat interface with RAG: choose one pipeline (base_rag or hier_rag),
	retrieve, then generate an LLM answer grounded in retrieved contexts and
	show sources used.
	"""
	if not rag_manager:
	return "System not initialized! Please build the RAG index first.", history

	# Retrieve with chosen pipeline
	filters_note = ""
	if pipeline == "hier_rag":
	inferred = _auto_detect_query_filters(message)
	h_l1 = inferred.get('level1')
	h_l2 = inferred.get('level2')
	h_l3 = inferred.get('level3')
	h_dt = inferred.get('doc_type')
	result = rag_manager.hier_rag.retrieve(message, k, h_l1, h_l2, h_l3, h_dt)
	# Show filters only when detected
	found = []
	if h_l1:
	found.append(f"level1={h_l1}")
	if h_l2:
	found.append(f"level2={h_l2}")
	if h_l3:
	found.append(f"level3={h_l3}")
	if h_dt:
	found.append(f"doc_type={h_dt}")
	if found:
	filters_note = " (filters: " + ", ".join(found) + ")"
	else:
	result = rag_manager.base_rag.retrieve(message, k)

	# Generate grounded answer
	answer = _llm_answer(message, result.sources)

	# Sources list
	src_lines = []
	for s in result.sources[:k]:
	src_name = s.get('metadata', {}).get('source_name', 'unknown')
	src_lines.append(f"• {src_name}")

	response = (
	f"{answer}\n\n"
	f"─────────────────────────────────────\n"
	f"📎 Sources ({'Hier-RAG' if pipeline=='hier_rag' else 'Base-RAG'}{filters_note}):\n" + ("\n".join(src_lines) or "(none)")
	)

	history.append({"role": "user", "content": message})
	history.append({"role": "assistant", "content": response})
	return "", history

	def run_evaluation(queries_json: str, output_filename: str) -> Tuple[str, pd.DataFrame, pd.DataFrame]:
	"""Run quantitative evaluation"""
	if not evaluator:
	return "System not initialized!", None, None

	try:
	queries = json.loads(queries_json)
	except json.JSONDecodeError as e:
	return f"Invalid JSON format: {str(e)}", None, None

	df, results = evaluator.batch_evaluate(queries, output_filename)

	# Generate summary statistics
	summary = df.groupby(['pipeline', 'k']).agg({
	'hit_at_k': 'mean',
	'mrr': 'mean',
	'semantic_similarity': 'mean',
	'latency': 'mean',
	'retrieved_count': 'mean'
	}).reset_index()

	# Create comparison plot data
	plot_data = pd.DataFrame({
	'k': summary[summary['pipeline'] == 'base_rag']['k'],
	'base_rag_hit@k': summary[summary['pipeline'] == 'base_rag']['hit_at_k'],
	'hier_rag_hit@k': summary[summary['pipeline'] == 'hier_rag']['hit_at_k'],
	'base_rag_mrr': summary[summary['pipeline'] == 'base_rag']['mrr'],
	'hier_rag_mrr': summary[summary['pipeline'] == 'hier_rag']['mrr']
	})

	return f"Evaluation completed! Processed {len(queries)} queries. Results saved to {output_filename}", df, plot_data

	# Diagnostics: simple OpenAI connectivity test
	## (removed) test_openai_connectivity helper

	# Initialize system
	initialize_system()

	# Create Gradio interface
	# Minimal CSS to keep layout stable when vertical scrollbar appears and improve mobile spacing
	APP_CSS = """
	html, body { scrollbar-gutter: stable both-edges; }
	body { overflow-y: scroll; }
	* { box-sizing: border-box; }
	@media (max-width: 768px) {
	.gradio-container { padding-left: 8px; padding-right: 8px; }
	}
	"""

	with gr.Blocks(title="RAG Evaluation System", css=APP_CSS) as demo:
	gr.Markdown("# RAG Evaluation System: Hierarchical vs Standard RAG")

	with gr.Tab("Upload Documents"):
	gr.Markdown("## Upload and Process Documents")
	with gr.Row():
	with gr.Column():
	file_upload = gr.File(
	label="Upload PDF/TXT Files",
	file_count="multiple",
	file_types=[".pdf", ".txt"]
	)
	hierarchy_dropdown = gr.Dropdown(
	choices=["Auto", "hospital", "bank", "fluid_simulation"],
	label="Select Hierarchy",
	value="Auto"
	)
	doc_type_dropdown = gr.Dropdown(
	choices=["Auto", "Policy", "Manual", "FAQ", "Report", "Note", "Guideline"],
	label="Document Type",
	value="Auto"
	)
	language_dropdown = gr.Dropdown(
	choices=["Auto", "en", "ja"],
	label="Language",
	value="Auto"
	)
	build_btn = gr.Button("Build RAG Index")

	with gr.Column():
	build_output = gr.Textbox(label="Build Status", lines=4)
	stats_table = gr.DataFrame(label="File Summary")
	chunks_table = gr.DataFrame(label="Indexed Chunks (preview)")
	reset_btn = gr.Button("Reset Index (Clear chroma_data)", variant="secondary")

	with gr.Tab("Search"):
	gr.Markdown("## Compare Retrieval Pipelines")
	with gr.Row():
	with gr.Column():
	manual_toggle = gr.Checkbox(label="Manual controls", value=False)
	search_query = gr.Textbox(
	label="Search Query",
	placeholder="Enter your query...",
	lines=2
	)
	# Single Search button
	search_btn = gr.Button("Search", variant="primary")
	with gr.Accordion("Manual (optional)", open=False) as manual_section:
	k_slider = gr.Slider(minimum=1, maximum=20, value=int(os.getenv("DEFAULT_SEARCH_K", 5)), step=1, label="Number of results (k)", interactive=False)
	with gr.Row():
	level1_filter = gr.Dropdown(label="Domain (Level1)", choices=["Auto"], value="Auto", allow_custom_value=True, interactive=False)
	level2_filter = gr.Dropdown(label="Section (Level2)", choices=["Auto"], value="Auto", allow_custom_value=True, interactive=False)
	with gr.Row():
	level3_filter = gr.Dropdown(label="Topic (Level3)", choices=["Auto"], value="Auto", allow_custom_value=True, interactive=False)
	doc_type_filter = gr.Dropdown(label="Document Type", choices=["Auto", "Policy", "Manual", "FAQ", "Report", "Note", "Guideline"], value="Auto", allow_custom_value=True, interactive=False)

	with gr.Column():
	base_results = gr.Textbox(
	label="Base-RAG Results",
	lines=8,
	max_lines=12
	)
	hier_results = gr.Textbox(
	label="Hier-RAG Results",
	lines=8,
	max_lines=12
	)

	comparison_text = gr.Markdown()
	results_table = gr.DataFrame(label="Detailed Results")

	with gr.Tab("Chat"):
	gr.Markdown("## Chat with RAG System")
	with gr.Row():
	with gr.Column(scale=1):
	pipeline_radio = gr.Radio(
	choices=["base_rag", "hier_rag"],
	label="RAG Pipeline",
	value="hier_rag"
	)
	chat_k_slider = gr.Slider(
	minimum=1, maximum=10, value=3, step=1,
	label="Number of results"
	)

	with gr.Column(scale=2):
	chatbot = gr.Chatbot(label="RAG Chat", type="messages")
	chat_input = gr.Textbox(
	label="Message",
	placeholder="Ask a question...",
	lines=2
	)
	chat_btn = gr.Button("Send")

	with gr.Tab("Evaluation"):
	gr.Markdown("## Quantitative Evaluation")
	with gr.Row():
	with gr.Column():
	eval_queries = gr.Textbox(
	label="Evaluation Queries (JSON)",
	lines=10,
	placeholder='''[
	{
	"query": "question 1",
	"ground_truth": ["expected answer 1", "expected answer 2"],
	"k_values": [1, 3, 5],
	"level1": "Clinical",
	"level2": "Emergency",
	"level3": "Triage",
	"doc_type": "Report"
	}
	]''',
	value='''[
	{
	"query": "What are the emergency procedures?",
	"ground_truth": ["Emergency protocols for triage", "Patient assessment guidelines"],
	"k_values": [1, 3, 5]
	}
	]'''
	)
	eval_output_name = gr.Textbox(
	label="Output Filename",
	value="evaluation_results.csv"
	)
	eval_btn = gr.Button("Run Evaluation", variant="primary")
	# Diagnostics removed

	with gr.Column():
	eval_output = gr.Textbox(label="Evaluation Status", lines=3)
	eval_results_table = gr.DataFrame(label="Evaluation Results")
	eval_plot = gr.LinePlot(
	label="Performance Comparison",
	x="k",
	y=["base_rag_hit@k", "hier_rag_hit@k"],
	title="Hit@k Comparison",
	width=600,
	height=400
	)
	with gr.Tab("Settings"):
	gr.Markdown("## Settings")
	gr.Markdown("Configure embedding models and system preferences.")

	with gr.Accordion("Embedding Configuration", open=True):
	gr.Markdown("Select the embedding provider and model. Switching providers requires re-indexing your documents.")

	with gr.Row():
	with gr.Column():
	emb_provider = gr.Radio(
	choices=["SentenceTransformers", "OpenAI"],
	value="SentenceTransformers",
	label="Embeddings Provider",
	info="Choose between local SentenceTransformers models or OpenAI embeddings (requires API key)"
	)

	with gr.Row():
	apply_embed_btn = gr.Button("Apply Embedding Settings", variant="primary")

	with gr.Row():
	with gr.Column():
	st_model_in = gr.Textbox(
	label="SentenceTransformers Model",
	value=os.getenv("ST_EMBED_MODEL", "all-MiniLM-L6-v2"),
	interactive=False,
	info="Local embedding model (384 dimensions)"
	)
	with gr.Column():
	oai_model_in = gr.Textbox(
	label="OpenAI Embedding Model",
	value=os.getenv("OPENAI_EMBED_MODEL", "text-embedding-3-small"),
	interactive=False,
	info="OpenAI embedding model (1536 dimensions for small, 3072 for large)"
	)

	embed_status = gr.Textbox(
	label="Status",
	lines=3,
	interactive=False,
	placeholder="Embedding configuration status will appear here..."
	)

	# Define handler before wiring it
	def _apply_embeddings(provider, st_model, oai_model):
	try:
	use_oai = (provider == "OpenAI")
	rag_manager.vector_store.configure_embeddings(use_oai, openai_model=oai_model, st_model_name=st_model)
	status_msg = f"✅ Embeddings successfully configured!\n\n"
	status_msg += f"Provider: {provider}\n"
	if use_oai:
	status_msg += f"Model: {oai_model} (OpenAI)\n"
	status_msg += f"Dimensions: {3072 if 'large' in oai_model.lower() else 1536}\n"
	else:
	status_msg += f"Model: {st_model} (SentenceTransformers)\n"
	status_msg += f"Dimensions: ~384\n"
	status_msg += f"\n⚠️ Note: If switching providers, reset and rebuild your index in the Upload tab."
	return status_msg
	except Exception as ex:
	return f"❌ Failed to set embeddings: {ex}\n\nPlease check your configuration and try again."

	apply_embed_btn.click(
	fn=_apply_embeddings,
	inputs=[emb_provider, st_model_in, oai_model_in],
	outputs=embed_status
	)

	# Event handlers
	build_btn.click(
	fn=build_rag_index,
	inputs=[file_upload, hierarchy_dropdown, doc_type_dropdown, language_dropdown],
	outputs=[build_output, stats_table, chunks_table],
	api_name="build_rag"
	)

	reset_btn.click(
	fn=reset_index,
	inputs=None,
	outputs=build_output
	)

	# One button: uses manual if checked; otherwise auto
	search_btn.click(
	fn=search_documents_unified,
	inputs=[search_query, manual_toggle, k_slider, level1_filter, level2_filter, level3_filter, doc_type_filter],
	outputs=[base_results, hier_results, comparison_text, results_table],
	api_name="search"
	)

	# Toggle manual controls interactive and accordion state
	manual_toggle.change(
	fn=_toggle_manual_controls,
	inputs=[manual_toggle],
	outputs=[manual_section, k_slider, level1_filter, level2_filter, level3_filter, doc_type_filter]
	)

	chat_btn.click(
	fn=chat_with_rag,
	inputs=[chat_input, chatbot, pipeline_radio, chat_k_slider],
	outputs=[chat_input, chatbot],
	api_name="chat"
	).then(
	lambda: None,
	None,
	chat_input,
	queue=False
	)

	eval_btn.click(
	fn=run_evaluation,
	inputs=[eval_queries, eval_output_name],
	outputs=[eval_output, eval_results_table, eval_plot],
	api_name="evaluate"
	)

	# Diagnostics trigger removed

	# MCP Server Implementation
	import asyncio
	import sys
	from typing import Any, List, Optional
	try:
	from mcp.server import Server
	from mcp.server.models import InitializationOptions
	from mcp.types import Tool, TextContent
	MCP_AVAILABLE = True
	except ImportError:
	MCP_AVAILABLE = False
	# Fallback for when MCP is not installed
	Server = None
	Tool = None
	TextContent = None

	class RAGMCPServer:
	"""MCP server for RAG system"""

	def __init__(self):
	persist_dir = "/data/chroma" if os.path.exists("/data/chroma") else "./chroma_data"
	self.rag_manager = RAGManager(persist_directory=persist_dir)
	self.evaluator = RAGEvaluator(self.rag_manager)

	async def list_tools(self) -> List[Tool]:
	"""List available MCP tools"""
	return [
	Tool(
	name="search_documents",
	description="Search documents using RAG system (Base-RAG or Hier-RAG)",
	inputSchema={
	"type": "object",
	"properties": {
	"query": {"type": "string", "description": "Search query"},
	"k": {"type": "integer", "description": "Number of results", "default": 5},
	"pipeline": {"type": "string", "enum": ["base_rag", "hier_rag"], "default": "base_rag"},
	"level1": {"type": "string", "description": "Level1 filter (domain)"},
	"level2": {"type": "string", "description": "Level2 filter (section)"},
	"level3": {"type": "string", "description": "Level3 filter (topic)"},
	"doc_type": {"type": "string", "description": "Document type filter"}
	},
	"required": ["query"]
	}
	),
	Tool(
	name="evaluate_retrieval",
	description="Evaluate RAG performance with batch queries",
	inputSchema={
	"type": "object",
	"properties": {
	"queries": {
	"type": "array",
	"description": "List of query objects with query, ground_truth, k_values, and optional filters",
	"items": {"type": "object"}
	},
	"output_file": {"type": "string", "description": "Output filename for results"}
	},
	"required": ["queries"]
	}
	)
	]

	async def call_tool(self, name: str, arguments: dict) -> List[TextContent]:
	"""Call an MCP tool by name"""
	if name == "search_documents":
	query = arguments.get("query")
	k = arguments.get("k", 5)
	pipeline = arguments.get("pipeline", "base_rag")
	level1 = arguments.get("level1")
	level2 = arguments.get("level2")
	level3 = arguments.get("level3")
	doc_type = arguments.get("doc_type")

	if pipeline == "base_rag":
	result = self.rag_manager.base_rag.retrieve(query, k)
	else:
	result = self.rag_manager.hier_rag.retrieve(query, k, level1, level2, level3, doc_type)

	response = {
	"content": result.content,
	"sources": [
	{
	"content": source['content'][:200],
	"metadata": source['metadata'],
	"score": source['score']
	} for source in result.sources
	],
	"latency": result.latency,
	"strategy": pipeline
	}

	return [TextContent(type="text", text=json.dumps(response, indent=2))]

	elif name == "evaluate_retrieval":
	queries = arguments.get("queries", [])
	output_file = arguments.get("output_file")

	df, results = self.evaluator.batch_evaluate(queries, output_file)

	summary = df.groupby('pipeline').agg({
	'hit_at_k': 'mean',
	'mrr': 'mean',
	'semantic_similarity': 'mean',
	'latency': 'mean'
	}).reset_index()

	response = {
	"summary": summary.to_dict('records'),
	"total_queries": len(queries),
	"output_file": output_file
	}

	return [TextContent(type="text", text=json.dumps(response, indent=2))]

	else:
	raise ValueError(f"Unknown tool: {name}")

	# Export for Gradio Client
	if __name__ == "__main__":
	# If run as CLI, prefer plain Gradio serving. Spaces will import demo directly.
	# Respect common hosting env vars.
	host = os.getenv("HOST", "0.0.0.0")
	port = int(os.getenv("PORT", os.getenv("GRADIO_SERVER_PORT", 7860)))
	# Avoid SSR and API schema on Spaces to prevent response length errors
	demo.launch(server_name=host, server_port=port, share=False, ssl_verify=False, ssr_mode=False)