Spaces:

seanpedrickcase
/

document_redaction

Running

document_redaction / tools /summaries.py

Sync: Changed to pi agent to improve interrupt function. Changes to open text redaction - earlier checks for Comprehend connection, now a markdown preview shows first redaction results

5a1ee57 about 11 hours ago

raw

history blame contribute delete

100 kB

	import os
	import re
	import time
	from concurrent.futures import ThreadPoolExecutor, as_completed
	from typing import List, Optional, Tuple

	import boto3
	import gradio as gr
	import markdown
	import pandas as pd
	import spaces
	from gradio import Progress as progress
	from tqdm import tqdm

	from tools.config import (
	AWS_ACCESS_KEY,
	AWS_LLM_PII_OPTION,
	AWS_REGION,
	AWS_SECRET_KEY,
	CLOUD_LLM_PII_MODEL_CHOICE,
	CLOUD_SUMMARISATION_MODEL_CHOICE,
	DEFAULT_INFERENCE_SERVER_PII_MODEL,
	INFERENCE_SERVER_API_URL,
	INFERENCE_SERVER_PII_OPTION,
	LLM_CONTEXT_LENGTH,
	LLM_MAX_NEW_TOKENS,
	LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE,
	LOCAL_TRANSFORMERS_LLM_PII_OPTION,
	MAX_SPACES_GPU_RUN_TIME,
	OUTPUT_FOLDER,
	PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS,
	REASONING_SUFFIX,
	RUN_AWS_FUNCTIONS,
	SUMMARY_PAGE_GROUP_MAX_WORKERS,
	model_name_map,
	)
	from tools.file_conversion import is_pdf, word_level_ocr_df_to_line_level_ocr_df
	from tools.helper_functions import (
	clean_column_name,
	create_batch_file_path_details,
	get_file_name_no_ext,
	)
	from tools.llm_funcs import (
	calculate_tokens_from_metadata,
	construct_azure_client,
	construct_gemini_generative_model,
	load_model,
	process_requests,
	)

	max_tokens = LLM_MAX_NEW_TOKENS
	reasoning_suffix = REASONING_SUFFIX
	max_text_length = 500

	###
	# System prompt
	###

	generic_system_prompt = """You are a researcher analysing a document. Use British English spelling and grammar."""

	system_prompt = """You are a researcher analysing a document. Use British English spelling and grammar."""

	markdown_additional_prompt = """ You will be given a request for a markdown table. You must respond with ONLY the markdown table. Do not include any introduction, explanation, or concluding text."""

	###
	# SUMMARISE TOPICS PROMPT
	###

	summary_assistant_prefill = ""

	summarise_topic_descriptions_system_prompt = system_prompt

	summarise_topic_descriptions_prompt = """Your task is to make a consolidated summary of the text below. {summary_format}
	Return only the summary and no other text. Do not mention specific response numbers in the summary.{additional_summary_instructions}

	Text to summarise:
	{summaries}

	Summary:"""

	concise_summary_format_prompt = "Return a concise summary that summarises only the most important themes from the original text"

	detailed_summary_format_prompt = (
	"Return a summary that includes as much detail as possible from the original text"
	)

	###
	# OVERALL SUMMARY PROMPTS
	###

	summarise_everything_system_prompt = system_prompt

	summarise_everything_prompt = """Below is a table that gives an overview of the main issues related to a document.
	Your task is to summarise the text in the table below. {summary_format}. Return only the summary and no other text. Use headers and paragraphs to structure the summary where appropriate. Format the output for Excel display using: bold text for main headings, • bullet points for sub-items, and line breaks between sections. Avoid markdown symbols like # or ##. {additional_summary_instructions}

	Table to summarise:
	{topic_summary_table}

	Summary:"""


	def _summarisation_upload_to_paths(file_upload):
	"""Normalise Gradio file input to a list of file paths (str, list, or dict with 'name')."""
	if not file_upload:
	return []
	paths = []
	if isinstance(file_upload, str):
	paths.append(file_upload)
	elif isinstance(file_upload, list):
	for item in file_upload:
	if isinstance(item, str):
	paths.append(item)
	elif isinstance(item, dict):
	paths.append(item.get("name") or item.get("path") or "")
	elif hasattr(item, "name"):
	paths.append(item.name)
	elif hasattr(item, "path"):
	paths.append(item.path)
	elif isinstance(file_upload, dict):
	paths.append(file_upload.get("name") or file_upload.get("path") or "")
	elif hasattr(file_upload, "name"):
	paths.append(file_upload.name)
	elif hasattr(file_upload, "path"):
	paths.append(file_upload.path)
	return [p for p in paths if p and str(p).strip()]


	def _upload_contains_pdf(file_upload):
	"""Return True if the summarisation upload contains any PDF file."""
	paths = _summarisation_upload_to_paths(file_upload)
	return any(is_pdf(p) for p in paths)


	###
	# Document Summarisation Functions
	###
	def get_model_choice_from_inference_method(inference_method: str) -> str:
	"""
	Get the default model choice for a given inference method (for summarisation).
	Uses the default values defined in config.py (CLOUD_SUMMARISATION_MODEL_CHOICE for cloud).

	Args:
	inference_method: One of "aws-bedrock", "local", "inference-server"

	Returns:
	str: The model choice string to use
	"""
	# Map inference method to model choice using defaults from config.py
	if inference_method == "aws-bedrock":
	return CLOUD_SUMMARISATION_MODEL_CHOICE
	elif inference_method == "local":
	return LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE
	elif inference_method == "inference-server":
	return DEFAULT_INFERENCE_SERVER_PII_MODEL
	else:
	raise ValueError(
	f"Unknown inference method: {inference_method}. "
	f"Expected one of: 'aws-bedrock', 'local', 'inference-server'"
	)


	def get_model_source_from_model_choice(model_choice: str) -> str:
	"""
	Determine model source from model_choice by comparing to defaults from config.py.
	Does not check model_name_map - uses the defined defaults.

	Args:
	model_choice: The model choice string

	Returns:
	str: The model source ("AWS", "Local", or "inference-server")
	"""
	# Compare model_choice to the default config values to determine source
	if model_choice == LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE:
	return "Local"
	elif model_choice == DEFAULT_INFERENCE_SERVER_PII_MODEL:
	return "inference-server"
	elif (
	model_choice == CLOUD_LLM_PII_MODEL_CHOICE
	or model_choice == CLOUD_SUMMARISATION_MODEL_CHOICE
	):
	return "AWS"
	else:
	# If it doesn't match any default, infer from common patterns
	# AWS Bedrock models typically have "amazon." or "anthropic." prefix
	if model_choice.startswith("amazon.") or model_choice.startswith("anthropic."):
	return "AWS"
	# Inference server models are often custom names
	# Default to AWS for backward compatibility, but could be inference-server
	# Since we're using defaults, assume AWS if it's not clearly local
	return "AWS"


	def load_csv_files_to_dataframe(file_input):
	"""
	Load CSV files from Gradio file input and combine them into a single DataFrame.
	Similar to how duplicate pages function handles file input.

	Args:
	file_input: Gradio file input (can be a single file, list of files, or file objects)

	Returns:
	pd.DataFrame: Combined DataFrame with columns page, line, and text
	"""
	if not file_input:
	return pd.DataFrame(columns=["page", "line", "text"])

	# Handle different input types (similar to run_tabular_duplicate_detection)
	file_paths = []
	if isinstance(file_input, str):
	file_paths.append(file_input)
	elif isinstance(file_input, list):
	for f_item in file_input:
	if isinstance(f_item, str):
	file_paths.append(f_item)
	elif hasattr(f_item, "name"):
	file_paths.append(f_item.name)
	elif hasattr(file_input, "name"):
	file_paths.append(file_input.name)

	# Load and combine all CSV files
	all_dfs = []
	for file_path in file_paths:
	try:
	df = pd.read_csv(file_path)
	# Convert word-level OCR to line-level if user uploaded word-level file
	if "ocr_results_with_words" in os.path.basename(file_path) and (
	"word_text" in df.columns and "text" not in df.columns
	):
	df = word_level_ocr_df_to_line_level_ocr_df(df)
	# Ensure required columns exist
	if "page" in df.columns and "line" in df.columns and "text" in df.columns:
	all_dfs.append(df[["page", "line", "text"]])
	else:
	print(
	f"Warning: {file_path} does not have required columns (page, line, text)"
	)
	except Exception as e:
	print(f"Error loading {file_path}: {e}")

	if not all_dfs:
	return pd.DataFrame(columns=["page", "line", "text"])

	# Combine all DataFrames
	combined_df = pd.concat(all_dfs, ignore_index=True)
	return combined_df


	# Wrapper function to convert inference method to model choice
	@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
	def summarise_document_wrapper(
	all_page_line_level_ocr_results_df,
	output_folder,
	summarisation_inference_method,
	summarisation_api_key,
	summarisation_temperature,
	file_name,
	summarisation_context,
	summarisation_aws_access_key,
	summarisation_aws_secret_key,
	summarisation_hf_api_key,
	summarisation_azure_endpoint,
	summarisation_format,
	summarisation_additional_instructions,
	summarisation_max_pages_per_group,
	in_summarisation_ocr_files=None,
	):
	"""
	Wrapper to select the correct model and format for document summarization, and optionally
	load input OCR CSV files if they are provided.

	Args:
	all_page_line_level_ocr_results_df (pd.DataFrame): Pre-loaded DataFrame containing the line-level OCR results.
	output_folder (str): Path to folder where outputs should be saved.
	summarisation_inference_method (str): String specifying which inference/LLM method to use ('aws-bedrock', etc).
	summarisation_api_key (str): API key for the selected inference method, if required.
	summarisation_temperature (float): The temperature parameter for the model (controls randomness).
	file_name (str): Name to use as a base for output files.
	summarisation_context (str): Additional context string to include in the summarization.
	summarisation_aws_access_key (str): AWS access key if using AWS inference.
	summarisation_aws_secret_key (str): AWS secret key if using AWS inference.
	summarisation_hf_api_key (str): HuggingFace API key if required.
	summarisation_azure_endpoint (str): Endpoint string if using Azure inference.
	summarisation_format (str): Format for the summary output (e.g., "bullets", "structured").
	summarisation_additional_instructions (str): Extra instructions to pass to the summarization LLM.
	summarisation_max_pages_per_group (int): Maximum number of pages to group per LLM summarization pass.
	in_summarisation_ocr_files (str \| list \| object, optional): One or more file paths or file-like objects to OCR results in CSV format.

	Returns:
	Output of the downstream summarisation process (see next code section for details).
	"""
	"""Wrapper to convert inference method selection to model choice and load CSV files."""
	# Map inference method option to inference method string
	inference_method_map = {
	AWS_LLM_PII_OPTION: "aws-bedrock",
	LOCAL_TRANSFORMERS_LLM_PII_OPTION: "local",
	INFERENCE_SERVER_PII_OPTION: "inference-server",
	}

	inference_method = inference_method_map.get(
	summarisation_inference_method, "aws-bedrock"
	)

	# Use config default for region
	summarisation_aws_region = AWS_REGION
	summarisation_api_url = INFERENCE_SERVER_API_URL

	# Get model choice from inference method
	model_choice = get_model_choice_from_inference_method(inference_method)

	# Load CSV files if provided, otherwise use the dataframe
	if in_summarisation_ocr_files:
	ocr_df = load_csv_files_to_dataframe(in_summarisation_ocr_files)
	else:
	ocr_df = all_page_line_level_ocr_results_df

	# If file_name is None or empty, derive it from in_summarisation_ocr_files
	if not file_name or file_name.strip() == "":
	if in_summarisation_ocr_files:
	# Extract file path from in_summarisation_ocr_files (similar to load_csv_files_to_dataframe)
	file_paths = []
	if isinstance(in_summarisation_ocr_files, str):
	file_paths.append(in_summarisation_ocr_files)
	elif isinstance(in_summarisation_ocr_files, list):
	for f_item in in_summarisation_ocr_files:
	if isinstance(f_item, str):
	file_paths.append(f_item)
	elif hasattr(f_item, "name"):
	file_paths.append(f_item.name)
	elif hasattr(in_summarisation_ocr_files, "name"):
	file_paths.append(in_summarisation_ocr_files.name)

	# Get the first file path and extract filename prefix
	if file_paths:
	first_file_path = file_paths[0]
	# Get basename without extension
	basename = os.path.basename(first_file_path)
	filename_without_ext, _ = os.path.splitext(basename)
	# Take first 20 characters, removing any invalid filename characters
	filename_prefix = filename_without_ext[:20]
	# Remove any invalid characters for filenames
	invalid_chars = '<>:"/\\\|?*'
	for char in invalid_chars:
	filename_prefix = filename_prefix.replace(char, "_")
	file_name = filename_prefix if filename_prefix else "document"
	else:
	file_name = "document"
	else:
	file_name = "document"

	# Call the actual summarise_document function (timed for usage logs)
	start_time = time.perf_counter()
	(
	output_files,
	status_message,
	llm_model_name,
	llm_total_input_tokens,
	llm_total_output_tokens,
	summary_display_text,
	) = summarise_document(
	ocr_df,
	output_folder,
	model_choice,
	summarisation_api_key,
	summarisation_temperature,
	file_name,
	summarisation_context,
	summarisation_aws_access_key,
	summarisation_aws_secret_key,
	summarisation_aws_region,
	summarisation_hf_api_key,
	summarisation_azure_endpoint,
	summarisation_api_url,
	summarisation_format,
	summarisation_additional_instructions,
	max_pages_per_group=summarisation_max_pages_per_group,
	)
	elapsed_seconds = round(time.perf_counter() - start_time, 1)

	return (
	output_files,
	status_message,
	llm_model_name,
	llm_total_input_tokens,
	llm_total_output_tokens,
	summary_display_text,
	elapsed_seconds,
	)


	def group_pages_by_context_length(
	all_page_line_level_ocr_results_df: pd.DataFrame,
	context_length: int = LLM_CONTEXT_LENGTH,
	tokenizer=None,
	model_source: str = "Local",
	max_pages_per_group: int = 30,
	) -> List[Tuple[List[int], str]]:
	"""
	Group pages into chunks that fit within the LLM context length.
	Splits pages into roughly equal-sized groups (e.g. 56 pages with room for 50
	per context -> two groups of 28, not 50 and 6). Each page is prefixed with
	'=== Page x ==='.

	Args:
	all_page_line_level_ocr_results_df: DataFrame with columns 'page', 'line', 'text'
	context_length: Maximum context length in tokens
	tokenizer: Tokenizer for accurate token counting
	model_source: Source of the model for token counting

	Returns:
	List of tuples: (list of page numbers, formatted text for that group)
	"""
	if (
	all_page_line_level_ocr_results_df is None
	or all_page_line_level_ocr_results_df.empty
	):
	return []

	# Group by page and concatenate text
	page_texts = {}
	for _, row in all_page_line_level_ocr_results_df.iterrows():
	page = int(row["page"])
	text = str(row.get("text", ""))
	if page not in page_texts:
	page_texts[page] = []
	page_texts[page].append(text)

	# Format each page with header and get token count per page
	page_list = [] # (page_num, formatted_page, page_tokens)
	for page_num in sorted(page_texts.keys()):
	page_text = " ".join(page_texts[page_num])
	formatted_page = f"=== Page {page_num} ===\n{page_text}"
	page_tokens = count_tokens_in_text(formatted_page, tokenizer, model_source)
	page_list.append((page_num, formatted_page, page_tokens))

	# Reserve some tokens for the prompt template
	reserved_tokens = 500
	available_tokens = context_length - reserved_tokens

	if not page_list:
	return []

	# Sanitise max_pages_per_group
	try:
	max_pages_per_group_int = int(max_pages_per_group)
	except Exception:
	max_pages_per_group_int = 30
	if max_pages_per_group_int < 1:
	max_pages_per_group_int = 1

	# Step 1: Greedy pass to determine minimum number of groups by tokens
	k_token = 0
	cur_tokens = 0
	for _, _, pt in page_list:
	if cur_tokens + pt > available_tokens and cur_tokens > 0:
	k_token += 1
	cur_tokens = 0
	cur_tokens += pt
	k_token += 1 # last group
	n = len(page_list)

	# Also enforce a maximum pages-per-group cap
	k_pages = (n + max_pages_per_group_int - 1) // max_pages_per_group_int

	# Final number of groups must satisfy both token limit and max-pages limit
	k = max(k_token, k_pages)

	# Step 2: Target pages per group for roughly equal split (e.g. 56 pages, 2 groups -> 28, 28)
	q, r = n // k, n % k
	target_per_group = [q + 1] * r + [q] * (k - r)

	# Step 3: Assign pages to groups with target sizes, respecting token limit
	groups = []
	page_idx = 0
	for group_idx in range(k):
	target = min(target_per_group[group_idx], max_pages_per_group_int)
	current_group_pages = []
	current_group_text = ""
	current_tokens = 0
	while page_idx < n and len(current_group_pages) < target:
	page_num, formatted_page, page_tokens = page_list[page_idx]
	if current_tokens + page_tokens > available_tokens and current_group_pages:
	break # full by token limit; start next group
	current_group_pages.append(page_num)
	if current_group_text:
	current_group_text += "\n\n" + formatted_page
	else:
	current_group_text = formatted_page
	current_tokens += page_tokens
	page_idx += 1
	if current_group_pages:
	groups.append((current_group_pages, current_group_text))

	# Any remaining pages (e.g. group hit token limit before target) go into final group(s)
	while page_idx < n:
	current_group_pages = []
	current_group_text = ""
	current_tokens = 0
	while page_idx < n and len(current_group_pages) < max_pages_per_group_int:
	page_num, formatted_page, page_tokens = page_list[page_idx]
	if current_tokens + page_tokens > available_tokens and current_group_pages:
	break
	# If even a single page exceeds limit, add it anyway to avoid infinite loop
	current_group_pages.append(page_num)
	if current_group_text:
	current_group_text += "\n\n" + formatted_page
	else:
	current_group_text = formatted_page
	current_tokens += page_tokens
	page_idx += 1
	if current_group_pages:
	groups.append((current_group_pages, current_group_text))

	return groups


	def summarise_text_chunk(
	text_chunk: str,
	model_choice: str,
	in_api_key: str,
	temperature: float,
	context_textbox: str = "",
	aws_access_key_textbox: str = "",
	aws_secret_key_textbox: str = "",
	aws_region_textbox: str = "",
	model_name_map: dict = None,
	hf_api_key_textbox: str = "",
	azure_endpoint_textbox: str = "",
	api_url: str = None,
	reasoning_suffix: str = "",
	local_model=None,
	tokenizer=None,
	assistant_model=None,
	summarise_format_radio: str = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
	additional_summary_instructions: str = "",
	) -> Tuple[str, str, dict]:
	"""
	Summarise a single text chunk using the summarise_output_topics_query function.

	Returns:
	Tuple of (summary_text, full_prompt, metadata)
	"""
	from tools.config import (
	model_name_map as default_model_name_map,
	)

	# Note: load_model is already imported at the top of the file

	if model_name_map is None:
	model_name_map = default_model_name_map

	if additional_summary_instructions:
	additional_summary_instructions = (
	"Important additional instructions to follow closely: "
	+ additional_summary_instructions
	)

	formatted_summary_prompt = [
	summarise_topic_descriptions_prompt.format(
	summaries=text_chunk,
	summary_format=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)
	]

	# Format system prompt
	formatted_system_prompt = summarise_topic_descriptions_system_prompt.format(
	column_name="document text",
	consultation_context=context_textbox if context_textbox else "",
	)

	# Determine model source from model_choice using defaults from config.py
	# Does not check model_name_map - uses the defined defaults
	model_source = get_model_source_from_model_choice(model_choice)

	# Setup model based on model source
	# Load model and tokenizer together to ensure they're from the same source
	# This prevents mismatches that could occur if they're loaded separately
	# Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
	if (model_source == "Local") & (local_model is None or tokenizer is None):
	progress(0.1, f"Using model: {LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE}")
	# Use load_model() to ensure both are loaded atomically
	# This is safer than calling get_pii_model() and get_pii_tokenizer() separately
	loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
	if local_model is None:
	local_model = loaded_model
	if tokenizer is None:
	tokenizer = loaded_tokenizer
	if assistant_model is None:
	assistant_model = loaded_assistant_model

	# Setup bedrock for AWS models
	# Use the same approach as file_redaction.py (lines 939-969) for consistency
	bedrock_runtime = None
	if model_source == "AWS":
	# Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
	region = aws_region_textbox if aws_region_textbox else AWS_REGION

	if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
	print("Connecting to Bedrock via existing SSO connection")
	bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
	elif aws_access_key_textbox and aws_secret_key_textbox:
	print(
	"Connecting to Bedrock using AWS access key and secret keys from user input."
	)
	bedrock_runtime = boto3.client(
	"bedrock-runtime",
	aws_access_key_id=aws_access_key_textbox,
	aws_secret_access_key=aws_secret_key_textbox,
	region_name=region,
	)
	elif RUN_AWS_FUNCTIONS:
	print("Connecting to Bedrock via existing SSO connection")
	bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
	elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
	print("Getting Bedrock credentials from environment variables")
	bedrock_runtime = boto3.client(
	"bedrock-runtime",
	aws_access_key_id=AWS_ACCESS_KEY,
	aws_secret_access_key=AWS_SECRET_KEY,
	region_name=region,
	)
	else:
	bedrock_runtime = None
	out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
	print(out_message)
	raise Exception(out_message)

	# Note: Gemini and Azure/OpenAI clients are handled within summarise_output_topics_query
	# via the process_requests function, so we don't need to set them up here
	# Similar to how llm_entity_detection.py handles them (lines 554-584)

	# Apply reasoning suffix if needed
	if reasoning_suffix:
	is_gpt_oss_model = (
	"gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower()
	)
	if is_gpt_oss_model or ("Local" in model_source and reasoning_suffix):
	formatted_system_prompt = formatted_system_prompt + "\n" + reasoning_suffix

	# Call the summarisation function
	try:
	response, conversation_history, metadata, response_text = (
	summarise_output_topics_query(
	model_choice,
	in_api_key,
	temperature,
	formatted_summary_prompt,
	formatted_system_prompt,
	model_source,
	bedrock_runtime,
	local_model if local_model else [],
	tokenizer if tokenizer else [],
	assistant_model if assistant_model else [],
	azure_endpoint_textbox,
	api_url,
	)
	)

	full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt[0]
	return response_text, full_prompt, metadata
	except Exception as e:
	print(f"Error summarising text chunk: {e}")
	full_prompt = formatted_system_prompt + "\n" + formatted_summary_prompt[0]
	return "", full_prompt, {}


	def recursively_summarise(
	summaries: List[str],
	model_choice: str,
	in_api_key: str,
	temperature: float,
	context_length: int = LLM_CONTEXT_LENGTH,
	tokenizer=None,
	model_source: str = "Local",
	token_accumulator=None,
	**kwargs,
	) -> List[str]:
	"""
	Recursively summarise summaries until they fit within context length.

	Args:
	token_accumulator: Optional list to accumulate [input_tokens, output_tokens] from metadata
	"""
	# Check total length
	combined_summaries = "\n\n".join(summaries)
	total_tokens = count_tokens_in_text(combined_summaries, tokenizer, model_source)

	# Reserve tokens for prompt
	reserved_tokens = 500
	available_tokens = context_length - reserved_tokens

	if total_tokens <= available_tokens:
	return summaries

	# Need to summarise further - group summaries into chunks
	groups = []
	current_group = []
	current_tokens = 0

	for summary in summaries:
	summary_tokens = count_tokens_in_text(summary, tokenizer, model_source)
	if current_tokens + summary_tokens > available_tokens and current_group:
	groups.append("\n\n".join(current_group))
	current_group = [summary]
	current_tokens = summary_tokens
	else:
	current_group.append(summary)
	current_tokens += summary_tokens

	if current_group:
	groups.append("\n\n".join(current_group))

	# Summarise each group
	new_summaries = []
	for group_text in groups:
	summary_text, _, metadata = summarise_text_chunk(
	group_text,
	model_choice,
	in_api_key,
	temperature,
	tokenizer=tokenizer,
	model_source=model_source,
	**kwargs,
	)
	if summary_text:
	new_summaries.append(summary_text)
	# Accumulate tokens if accumulator provided
	if token_accumulator is not None and metadata:
	# Convert metadata to string if it's a list
	metadata_string = (
	str(metadata) if not isinstance(metadata, str) else metadata
	)
	input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
	metadata_string, model_choice, model_name_map
	)
	token_accumulator[0] += input_tokens
	token_accumulator[1] += output_tokens

	# Recursively call if still too long
	if len(new_summaries) > 1:
	return recursively_summarise(
	new_summaries,
	model_choice,
	in_api_key,
	temperature,
	context_length,
	tokenizer,
	model_source,
	token_accumulator=token_accumulator,
	**kwargs,
	)

	return new_summaries


	def summarise_document(
	all_page_line_level_ocr_results_df: pd.DataFrame,
	output_folder: str,
	model_choice: str,
	in_api_key: str,
	temperature: float,
	file_name: str = "document",
	context_textbox: str = "",
	aws_access_key_textbox: str = "",
	aws_secret_key_textbox: str = "",
	aws_region_textbox: str = "",
	hf_api_key_textbox: str = "",
	azure_endpoint_textbox: str = "",
	api_url: str = None,
	summarise_format_radio: str = "Return a summary up to two paragraphs long that includes as much detail as possible from the original text",
	additional_summary_instructions: str = "",
	max_pages_per_group: int = 30,
	summary_page_group_max_workers: Optional[int] = None,
	progress=gr.Progress(track_tqdm=True),
	) -> Tuple[List[str], str]:
	"""
	Main function to summarise a document from OCR results.

	Args:
	all_page_line_level_ocr_results_df (pd.DataFrame): DataFrame containing line-level OCR results.
	output_folder (str): The folder where outputs will be saved.
	model_choice (str): The model to use for summarization.
	in_api_key (str): API key for the selected model/inference method.
	temperature (float): LLM temperature hyperparameter.
	file_name (str, optional): Name to use for the output files. Default is "document".
	context_textbox (str, optional): Extra context for summarization. Default is "".
	aws_access_key_textbox (str, optional): AWS access key, if using AWS. Default is "".
	aws_secret_key_textbox (str, optional): AWS secret key, if using AWS. Default is "".
	aws_region_textbox (str, optional): AWS region string. Default is "".
	hf_api_key_textbox (str, optional): HuggingFace API key, if used. Default is "".
	azure_endpoint_textbox (str, optional): Azure endpoint, if used. Default is "".
	api_url (str, optional): API URL. Default is None.
	summarise_format_radio (str, optional): Summary output format instructions. Default is detailed summary.
	additional_summary_instructions (str, optional): Extra instructions for the summarization. Default is "".
	max_pages_per_group (int, optional): Maximum number of pages to group per LLM pass. Default is 30.
	progress (gr.Progress, optional): Gradio progress tracker. Default is Gradio Progress with tqdm.

	Returns:
	Tuple of (output_file_paths, status_message)
	"""
	import os
	from datetime import datetime

	from tools.llm_funcs import load_model

	output_files = []
	all_prompts = []
	all_responses = []
	all_token_counts = (
	[]
	) # Store (input_tokens, output_tokens) for each prompt/response
	page_group_page_ranges = (
	[]
	) # Store (min_page, max_page) for each saved prompt/response
	page_group_summaries = []

	# Initialize token tracking variables
	llm_total_input_tokens = 0
	llm_total_output_tokens = 0
	llm_model_name = ""

	try:
	# Determine model source from model_choice using defaults from config.py
	# Does not check model_name_map - uses the defined defaults
	model_source = get_model_source_from_model_choice(model_choice)

	local_model = None
	tokenizer = None
	assistant_model = None

	# Setup model based on model source - check for Local models
	# Load model and tokenizer together to ensure they're from the same source
	# This prevents mismatches that could occur if they're loaded separately
	# Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
	if model_source == "Local":
	if local_model is None or tokenizer is None:
	progress(0.05, "Loading local model...")
	# Use load_model() to ensure both are loaded atomically
	# This is safer than calling get_pii_model() and get_pii_tokenizer() separately
	loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
	if local_model is None:
	local_model = loaded_model
	if tokenizer is None:
	tokenizer = loaded_tokenizer
	if assistant_model is None:
	assistant_model = loaded_assistant_model

	# Step 1: Group pages by context length
	progress(0.1, "Grouping pages by context length...")
	page_groups = group_pages_by_context_length(
	all_page_line_level_ocr_results_df,
	LLM_CONTEXT_LENGTH,
	tokenizer,
	model_source,
	max_pages_per_group=max_pages_per_group,
	)

	if not page_groups:
	return [], "No OCR results found. Please run text extraction first."

	# Step 2: Summarise each page group (optionally in parallel)
	_summary_page_group_max_workers = (
	summary_page_group_max_workers
	if summary_page_group_max_workers is not None
	else SUMMARY_PAGE_GROUP_MAX_WORKERS
	)
	use_parallel_page_groups = (
	_summary_page_group_max_workers > 1 and len(page_groups) > 1
	)
	progress(0.2, f"Summarising {len(page_groups)} page groups...")

	def _summarise_one_group(args):
	i, page_nums, group_text = args
	summary_text, full_prompt, metadata = summarise_text_chunk(
	group_text,
	model_choice,
	in_api_key,
	temperature,
	context_textbox=context_textbox,
	aws_access_key_textbox=aws_access_key_textbox,
	aws_secret_key_textbox=aws_secret_key_textbox,
	aws_region_textbox=aws_region_textbox,
	hf_api_key_textbox=hf_api_key_textbox,
	azure_endpoint_textbox=azure_endpoint_textbox,
	api_url=api_url,
	local_model=local_model,
	tokenizer=tokenizer,
	assistant_model=assistant_model,
	summarise_format_radio=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)
	return (i, page_nums, summary_text, full_prompt, metadata)

	if use_parallel_page_groups:
	max_workers = min(_summary_page_group_max_workers, len(page_groups))
	tasks = [
	(i, page_nums, group_text)
	for i, (page_nums, group_text) in enumerate(page_groups)
	]
	results_by_index = [None] * len(page_groups)
	pbar = tqdm(
	total=len(page_groups),
	unit="groups",
	desc="Summarising page groups",
	)
	with ThreadPoolExecutor(max_workers=max_workers) as executor:
	futures = {
	executor.submit(_summarise_one_group, t): t[0] for t in tasks
	}
	completed = 0
	for future in as_completed(futures):
	i, page_nums, summary_text, full_prompt, metadata = future.result()
	results_by_index[i] = (
	page_nums,
	summary_text,
	full_prompt,
	metadata,
	)
	completed += 1
	pbar.update(1)
	progress(
	0.2 + (completed / len(page_groups)) * 0.5,
	f"Summarising page group {completed}/{len(page_groups)} (pages {min(page_nums)}-{max(page_nums)})...",
	)
	pbar.close()
	# Build lists in page-group order
	for i in range(len(page_groups)):
	if results_by_index[i] is None:
	continue
	page_nums, summary_text, full_prompt, metadata = results_by_index[i]
	if summary_text:
	try:
	min_page = int(min(page_nums)) if page_nums else 0
	max_page = int(max(page_nums)) if page_nums else 0
	except Exception:
	min_page, max_page = 0, 0
	page_group_page_ranges.append((min_page, max_page))
	page_group_summaries.append(summary_text)
	all_prompts.append(full_prompt)
	all_responses.append(summary_text)
	input_tokens, output_tokens = 0, 0
	if metadata:
	metadata_string = (
	str(metadata) if not isinstance(metadata, str) else metadata
	)
	input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
	metadata_string, model_choice, model_name_map
	)
	llm_total_input_tokens += input_tokens
	llm_total_output_tokens += output_tokens
	if not llm_model_name and model_choice:
	llm_model_name = model_choice
	all_token_counts.append((input_tokens, output_tokens))
	else:
	seq_pbar = tqdm(
	page_groups,
	unit="groups",
	desc="Summarising page groups",
	)
	for i, (page_nums, group_text) in enumerate(seq_pbar):
	progress(
	0.2 + (i / len(page_groups)) * 0.5,
	f"Summarising page group {i+1}/{len(page_groups)} (pages {min(page_nums)}-{max(page_nums)})...",
	)
	summary_text, full_prompt, metadata = summarise_text_chunk(
	group_text,
	model_choice,
	in_api_key,
	temperature,
	context_textbox=context_textbox,
	aws_access_key_textbox=aws_access_key_textbox,
	aws_secret_key_textbox=aws_secret_key_textbox,
	aws_region_textbox=aws_region_textbox,
	hf_api_key_textbox=hf_api_key_textbox,
	azure_endpoint_textbox=azure_endpoint_textbox,
	api_url=api_url,
	local_model=local_model,
	tokenizer=tokenizer,
	assistant_model=assistant_model,
	summarise_format_radio=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)
	if summary_text:
	try:
	min_page = int(min(page_nums)) if page_nums else 0
	max_page = int(max(page_nums)) if page_nums else 0
	except Exception:
	min_page, max_page = 0, 0
	page_group_page_ranges.append((min_page, max_page))
	page_group_summaries.append(summary_text)
	all_prompts.append(full_prompt)
	all_responses.append(summary_text)
	input_tokens, output_tokens = 0, 0
	if metadata:
	metadata_string = (
	str(metadata) if not isinstance(metadata, str) else metadata
	)
	input_tokens, output_tokens, _ = calculate_tokens_from_metadata(
	metadata_string, model_choice, model_name_map
	)
	llm_total_input_tokens += input_tokens
	llm_total_output_tokens += output_tokens
	if not llm_model_name and model_choice:
	llm_model_name = model_choice
	all_token_counts.append((input_tokens, output_tokens))
	seq_pbar.close()

	# Step 3: Recursively summarise if needed
	progress(0.7, "Checking if recursive summarisation is needed...")
	# Create token accumulator for recursive summarization
	recursive_token_accumulator = [0, 0] # [input_tokens, output_tokens]
	final_summaries = recursively_summarise(
	page_group_summaries,
	model_choice,
	in_api_key,
	temperature,
	context_length=LLM_CONTEXT_LENGTH,
	tokenizer=tokenizer,
	model_source=model_source,
	token_accumulator=recursive_token_accumulator,
	context_textbox=context_textbox,
	aws_access_key_textbox=aws_access_key_textbox,
	aws_secret_key_textbox=aws_secret_key_textbox,
	aws_region_textbox=aws_region_textbox,
	hf_api_key_textbox=hf_api_key_textbox,
	azure_endpoint_textbox=azure_endpoint_textbox,
	api_url=api_url,
	local_model=local_model,
	assistant_model=assistant_model,
	summarise_format_radio=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)

	# Add tokens from recursive summarization
	llm_total_input_tokens += recursive_token_accumulator[0]
	llm_total_output_tokens += recursive_token_accumulator[1]

	# Step 4: Create overall summary
	progress(0.85, "Creating overall summary...")
	# Create a topic summary DataFrame for overall_summary: three columns only
	summary_numbers = list(range(1, len(final_summaries) + 1))
	if len(final_summaries) == len(page_groups):
	page_ranges = [f"Pages {min(pg[0])}-{max(pg[0])}" for pg in page_groups]
	else:
	# Recursion combined some summaries - use "All" or full range
	if len(final_summaries) == 1 and page_groups:
	all_pages = [p for pg in page_groups for p in pg[0]]
	page_ranges = [f"Pages {min(all_pages)}-{max(all_pages)}"]
	else:
	page_ranges = ["All"] * len(final_summaries)
	topic_summary_df = pd.DataFrame(
	{
	"Summary number": summary_numbers,
	"Page range": page_ranges,
	"Summary": final_summaries,
	}
	)

	# Call overall_summary
	(
	output_files,
	html_output_table,
	overall_summarised_outputs_df,
	out_metadata_str,
	overall_input_tokens,
	overall_output_tokens,
	number_of_calls_num,
	time_taken,
	out_message,
	overall_logged_content,
	overall_prompt,
	overall_response,
	) = overall_summary(
	topic_summary_df=topic_summary_df,
	model_choice=model_choice,
	in_api_key=in_api_key,
	temperature=temperature,
	reference_data_file_name=file_name,
	output_folder=output_folder,
	context_textbox=context_textbox,
	aws_access_key_textbox=aws_access_key_textbox,
	aws_secret_key_textbox=aws_secret_key_textbox,
	aws_region_textbox=aws_region_textbox,
	hf_api_key_textbox=hf_api_key_textbox,
	azure_endpoint_textbox=azure_endpoint_textbox,
	api_url=api_url,
	local_model=local_model,
	tokenizer=tokenizer,
	assistant_model=assistant_model,
	summarise_format_radio=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	progress=progress,
	)

	llm_total_input_tokens += overall_input_tokens
	llm_total_output_tokens += overall_output_tokens

	# Extract summary texts from the DataFrame
	if (
	overall_summarised_outputs_df is not None
	and not overall_summarised_outputs_df.empty
	):
	if "Summary" in overall_summarised_outputs_df.columns:
	overall_summary_texts = overall_summarised_outputs_df[
	"Summary"
	].tolist()
	else:
	# Fallback: get from first column if "Summary" column doesn't exist
	overall_summary_texts = overall_summarised_outputs_df.iloc[
	:, 0
	].tolist()
	else:
	overall_summary_texts = []

	# Step 5: Save outputs
	progress(0.95, "Saving output files...")
	timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
	file_name_clean = get_file_name_no_ext(file_name) if file_name else "document"
	# Ensure file_name_clean is not empty
	if not file_name_clean or file_name_clean.strip() == "":
	file_name_clean = "document"

	summaries_folder = os.path.join(output_folder, "summaries")
	os.makedirs(summaries_folder, exist_ok=True)

	# Save prompts and responses as .txt files for page group summaries
	for i, (prompt, response) in enumerate(zip(all_prompts, all_responses)):
	# Page range for this prompt/response pair
	min_page, max_page = (
	page_group_page_ranges[i] if i < len(page_group_page_ranges) else (0, 0)
	)
	page_range_slug = f"pages_{min_page}_{max_page}"
	txt_file_path = os.path.join(
	summaries_folder,
	f"{file_name_clean}_{page_range_slug}_prompt_response_{timestamp}.txt",
	)
	# Get token counts for this prompt/response pair
	input_tokens, output_tokens = (
	all_token_counts[i] if i < len(all_token_counts) else (0, 0)
	)

	with open(txt_file_path, "w", encoding="utf-8") as f:
	f.write("=" * 80 + "\n")
	f.write("TOKEN INFORMATION\n")
	f.write("=" * 80 + "\n")
	f.write(f"Page Range: {min_page}-{max_page}\n")
	f.write(f"Input Tokens: {input_tokens}\n")
	f.write(f"Output Tokens: {output_tokens}\n")
	f.write(f"Maximum Context Length: {LLM_CONTEXT_LENGTH}\n")
	f.write(f"Model: {model_choice}\n")
	f.write(f"Temperature: {temperature}\n")
	f.write("=" * 80 + "\n\n")
	f.write("=" * 80 + "\n")
	f.write("PROMPT\n")
	f.write("=" * 80 + "\n")
	f.write(prompt)
	f.write("\n\n" + "=" * 80 + "\n")
	f.write("RESPONSE\n")
	f.write("=" * 80 + "\n")
	f.write(response)
	output_files.append(txt_file_path)

	# Save overall summary prompt/response

	# Fallback: If we don't have prompt/response from logged_content, use summary texts
	# This should rarely happen, but provides a safety net
	if not overall_prompt and overall_summary_texts:
	# Construct a basic prompt representation (this is a fallback, not ideal)
	overall_prompt = (
	f"Overall summary request for document: {file_name_clean}\n"
	)
	overall_prompt += f"Input: {len(final_summaries)} summary group(s) to combine into overall summary\n"
	overall_prompt += f"Summary format: {summarise_format_radio}\n"
	if additional_summary_instructions:
	overall_prompt += (
	f"Additional instructions: {additional_summary_instructions}\n"
	)

	# If we still don't have a response, use summary texts
	if not overall_response and overall_summary_texts:
	overall_response = (
	"\n\n".join(overall_summary_texts)
	if isinstance(overall_summary_texts, list)
	else str(overall_summary_texts)
	)

	# Save overall summary .txt file if we have response content (always create if we have summary texts)
	if overall_response or overall_summary_texts:
	txt_file_path = os.path.join(
	summaries_folder,
	f"{file_name_clean}_overall_summary_prompt_response_{timestamp}.txt",
	)
	with open(txt_file_path, "w", encoding="utf-8") as f:
	f.write("=" * 80 + "\n")
	f.write("TOKEN INFORMATION\n")
	f.write("=" * 80 + "\n")
	f.write(f"Input Tokens: {overall_input_tokens}\n")
	f.write(f"Output Tokens: {overall_output_tokens}\n")
	f.write(f"Maximum Context Length: {LLM_CONTEXT_LENGTH}\n")
	f.write(f"Model: {model_choice}\n")
	f.write(f"Temperature: {temperature}\n")
	f.write("=" * 80 + "\n\n")
	f.write("=" * 80 + "\n")
	f.write("PROMPT\n")
	f.write("=" * 80 + "\n")
	f.write(overall_prompt)
	f.write("\n\n" + "=" * 80 + "\n")
	f.write("RESPONSE\n")
	f.write("=" * 80 + "\n")
	f.write(overall_response)
	output_files.append(txt_file_path)

	# Save summaries as CSV
	summary_data = {"Type": [], "Page_Range": [], "Summary": []}

	# Add page group summaries
	for i, (page_nums, summary) in enumerate(
	zip([pg[0] for pg in page_groups], page_group_summaries)
	):
	summary_data["Type"].append("Page Group Summary")
	summary_data["Page_Range"].append(f"{min(page_nums)}-{max(page_nums)}")
	summary_data["Summary"].append(summary)

	# Add final summaries if different from page group summaries
	if final_summaries != page_group_summaries:
	for i, summary in enumerate(final_summaries):
	summary_data["Type"].append("Final Summary")
	summary_data["Page_Range"].append(f"Group {i+1}")
	summary_data["Summary"].append(summary)

	# Add overall summary - ensure overall_summary_texts is a list of strings
	if overall_summary_texts:
	# Handle case where overall_summary_texts might be a single string
	if isinstance(overall_summary_texts, str):
	overall_summary_texts = [overall_summary_texts]
	# Ensure each item is a string, not being iterated character by character
	for summary in overall_summary_texts:
	if isinstance(summary, str):
	summary_data["Type"].append("Overall Summary")
	summary_data["Page_Range"].append("All")
	summary_data["Summary"].append(summary)
	elif hasattr(summary, "__iter__") and not isinstance(summary, str):
	# If it's iterable but not a string, convert to string
	summary_str = str(summary)
	summary_data["Type"].append("Overall Summary")
	summary_data["Page_Range"].append("All")
	summary_data["Summary"].append(summary_str)

	summary_df = pd.DataFrame(summary_data)
	csv_file_path = os.path.join(
	summaries_folder, f"{file_name_clean}_summaries_{timestamp}.csv"
	)
	summary_df.to_csv(csv_file_path, index=False, encoding="utf-8-sig")
	output_files.append(csv_file_path)

	progress(1.0, "Summarisation complete!")
	status_message = (
	f"Summarisation complete! Generated {len(output_files)} output files."
	)

	# Prepare summary text for display (combine all overall summary texts)
	summary_display_text = ""
	if overall_summary_texts:
	if isinstance(overall_summary_texts, list):
	summary_display_text = "\n\n".join(overall_summary_texts)
	else:
	summary_display_text = str(overall_summary_texts)

	return (
	output_files,
	status_message,
	llm_model_name,
	llm_total_input_tokens,
	llm_total_output_tokens,
	summary_display_text,
	)

	except Exception as e:
	error_message = f"Error during summarisation: {str(e)}"
	print(error_message)
	import traceback

	traceback.print_exc()
	return (
	output_files,
	error_message,
	llm_model_name,
	llm_total_input_tokens,
	llm_total_output_tokens,
	"", # Empty summary display text on error
	)


	def join_unique_summaries(x):
	unique_summaries = []
	seen = set()

	for s in x:
	if pd.isna(s):
	continue

	# 1. Normalize whitespace and split lines
	s_str = str(s).strip()
	lines = s_str.split("\n")

	for line in lines:
	# 2. Aggressive Cleaning
	# Remove "Rows X to Y:" prefix
	line = re.sub(
	r"^Rows\s+\d+\s+to\s+\d+:\s*", "", line, flags=re.IGNORECASE
	).strip()

	# Remove generic "Prefix:" if it exists (e.g., "Summary: ...")
	if ": " in line:
	parts = line.split(": ", 1)
	if len(parts[0]) < 50 and " " not in parts[0]:
	line = parts[1].strip()

	# 3. Handle Invisible Characters (Crucial)
	# Replace non-breaking spaces (\xa0) and multiple spaces with a single standard space
	normalized_line = re.sub(r"\s+", " ", line).strip()

	# 4. Check against Seen
	if normalized_line and normalized_line not in seen:
	unique_summaries.append(normalized_line)
	seen.add(normalized_line)

	return "\n".join(unique_summaries)


	def sample_reference_table_summaries(
	reference_df: pd.DataFrame,
	random_seed: int,
	no_of_sampled_summaries: int = 100,
	sample_reference_table_checkbox: bool = False,
	):
	"""
	Sample x number of summaries from which to produce summaries, so that the input token length is not too long.
	"""

	if sample_reference_table_checkbox:

	all_summaries = pd.DataFrame(
	columns=[
	"General topic",
	"Subtopic",
	"Sentiment",
	"Group",
	"Response References",
	"Summary",
	]
	)

	if "Group" not in reference_df.columns:
	reference_df["Group"] = "All"

	reference_df_grouped = reference_df.groupby(
	["General topic", "Subtopic", "Sentiment", "Group"]
	)

	if "Revised summary" in reference_df.columns:
	out_message = "Summary has already been created for this file"
	print(out_message)
	raise Exception(out_message)

	for group_keys, reference_df_group in reference_df_grouped:
	if len(reference_df_group["General topic"]) > 1:

	filtered_reference_df = reference_df_group.reset_index()

	filtered_reference_df_unique = filtered_reference_df.drop_duplicates(
	[
	"General topic",
	"Subtopic",
	"Sentiment",
	"Group",
	"Start row of group",
	]
	)

	# Sample n of the unique topic summaries PER GROUP. To limit the length of the text going into the summarisation tool
	# This ensures each group gets up to no_of_sampled_summaries summaries, not the total across all groups
	number_of_summaries_to_sample = min(
	no_of_sampled_summaries, len(filtered_reference_df_unique)
	)
	print(
	f"Sampling {number_of_summaries_to_sample} summaries from group {group_keys}, from dataframe filtered_reference_df_unique.head(5):\n{filtered_reference_df_unique.head(5)}"
	)
	filtered_reference_df_unique_sampled = (
	filtered_reference_df_unique.sample(
	number_of_summaries_to_sample, random_state=random_seed
	)
	)

	all_summaries = pd.concat(
	[all_summaries, filtered_reference_df_unique_sampled]
	)

	print("all_summaries.tail(5):\n", all_summaries.tail(5))

	# If no responses/topics qualify, just go ahead with the original reference dataframe
	if all_summaries.empty:
	sampled_reference_table_df = reference_df
	# Filter by sentiment only (Response References is a string in original df, not a count)
	sampled_reference_table_df = sampled_reference_table_df.loc[
	sampled_reference_table_df["Sentiment"] != "Not Mentioned"
	]
	else:
	# Deduplicate summaries within each group before joining to prevent repeated summaries

	sampled_reference_table_df = (
	all_summaries.groupby(
	["General topic", "Subtopic", "Sentiment", "Group"]
	)
	.agg(
	{
	"Response References": "size", # Count the number of references
	"Summary": join_unique_summaries, # Join unique summaries only
	}
	)
	.reset_index()
	)
	# Filter by sentiment and count (Response References is now a numeric count after aggregation)
	sampled_reference_table_df = sampled_reference_table_df.loc[
	(sampled_reference_table_df["Sentiment"] != "Not Mentioned")
	& (sampled_reference_table_df["Response References"] > 1)
	]
	else:
	sampled_reference_table_df = reference_df

	summarised_references_markdown = sampled_reference_table_df.to_markdown(index=False)

	return sampled_reference_table_df, summarised_references_markdown


	def count_tokens_in_text(text: str, tokenizer=None, model_source: str = "Local") -> int:
	"""
	Count the number of tokens in the given text.

	Args:
	text (str): The text to count tokens for
	tokenizer (object, optional): Tokenizer object for local models. Defaults to None.
	model_source (str): Source of the model to determine tokenization method. Defaults to "Local".

	Returns:
	int: Number of tokens in the text
	"""
	if not text:
	return 0

	try:
	if model_source == "Local" and tokenizer and len(tokenizer) > 0:
	# Use local tokenizer if available
	tokens = tokenizer[0].encode(text, add_special_tokens=False)
	return len(tokens)
	else:
	# Fallback: rough estimation using word count (approximately 1.3 tokens per word)
	word_count = len(text.split())
	return int(word_count * 1.3)
	except Exception as e:
	print(f"Error counting tokens: {e}. Using word count estimation.")
	# Fallback: rough estimation using word count
	word_count = len(text.split())
	return int(word_count * 1.3)


	def clean_markdown_table_whitespace(markdown_text: str) -> str:
	if not markdown_text:
	return markdown_text

	lines = markdown_text.splitlines()
	cleaned_lines = []

	for line in lines:
	# 1. Clean all types of whitespace (including non-breaking spaces \u00A0)
	# This turns every cell into a single-spaced string
	cells = [re.sub(r"[\s\u00A0]+", " ", cell.strip()) for cell in line.split("\|")]

	# 2. Check if the row is effectively empty (only pipes or whitespace)
	# We join the content; if nothing is left, it's a "ghost" row.
	if not "".join(cells).strip():
	continue

	# 3. Handle the separator row specifically (e.g., \|:---\|---:\|)
	# We reset these to a small fixed width so they don't stretch the table.
	if re.match(r"^[\|\s\-:]+$", line):
	new_separator = []
	for cell in cells:
	if not cell: # Outer pipes
	new_separator.append("")
	elif ":" in cell: # Alignment markers
	left = ":" if cell.startswith(":") else "-"
	right = ":" if cell.endswith(":") else "-"
	new_separator.append(f"{left}---{right}")
	else:
	new_separator.append("---")
	cleaned_lines.append("\|".join(new_separator))
	continue

	# 4. Standard data row: Rejoin with single padding
	# We filter out empty outer parts caused by leading/trailing pipes
	formatted_row = (
	"\| "
	+ " \| ".join(
	c for c in cells if c or cells.index(c) not in [0, len(cells) - 1]
	)
	+ " \|"
	)

	# Simple fallback if the logic above is too aggressive for your specific table style:
	# formatted_row = "\|".join(f" {c} " if c else "" for c in cells)

	cleaned_lines.append(formatted_row)

	return "\n".join(cleaned_lines)


	def summarise_output_topics_query(
	model_choice: str,
	in_api_key: str,
	temperature: float,
	formatted_summary_prompt: str,
	summarise_topic_descriptions_system_prompt: str,
	model_source: str,
	bedrock_runtime: boto3.Session.client,
	local_model=list(),
	tokenizer=list(),
	assistant_model=list(),
	azure_endpoint: str = "",
	api_url: str = None,
	):
	"""
	Query an LLM to generate a summary of topics based on the provided prompts.

	Args:
	model_choice (str): The name/type of model to use for generation
	in_api_key (str): API key for accessing the model service
	temperature (float): Temperature parameter for controlling randomness in generation
	formatted_summary_prompt (str): The formatted prompt containing topics to summarise
	summarise_topic_descriptions_system_prompt (str): System prompt providing context and instructions
	model_source (str): Source of the model (e.g. "AWS", "Gemini", "Local")
	bedrock_runtime (boto3.Session.client): AWS Bedrock runtime client for AWS models
	local_model (object, optional): Local model object if using local inference. Defaults to empty list.
	tokenizer (object, optional): Tokenizer object if using local inference. Defaults to empty list.
	Returns:
	tuple: Contains:
	- response_text (str): The generated summary text
	- conversation_history (list): History of the conversation with the model
	- whole_conversation_metadata (list): Metadata about the conversation
	"""
	conversation_history = list()
	whole_conversation_metadata = list()
	client = list()
	client_config = {}

	# Combine system prompt and user prompt for token counting
	full_input_text = (
	summarise_topic_descriptions_system_prompt + "\n" + formatted_summary_prompt[0]
	if isinstance(formatted_summary_prompt, list)
	else summarise_topic_descriptions_system_prompt
	+ "\n"
	+ formatted_summary_prompt
	)

	# Count tokens in the input text
	input_token_count = count_tokens_in_text(full_input_text, tokenizer, model_source)

	# Check if input exceeds context length
	if input_token_count > LLM_CONTEXT_LENGTH:
	error_message = f"Input text exceeds LLM context length. Input tokens: {input_token_count}, Max context length: {LLM_CONTEXT_LENGTH}. Please reduce the input text size."
	print(error_message)
	raise ValueError(error_message)

	print(f"Input token count: {input_token_count} (Max: {LLM_CONTEXT_LENGTH})")

	# Prepare Gemini models before query
	if "Gemini" in model_source:
	# print("Using Gemini model:", model_choice)
	client, config = construct_gemini_generative_model(
	in_api_key=in_api_key,
	temperature=temperature,
	model_choice=model_choice,
	system_prompt=system_prompt,
	max_tokens=max_tokens,
	)
	elif "Azure/OpenAI" in model_source:
	client, config = construct_azure_client(
	in_api_key=os.environ.get("AZURE_INFERENCE_CREDENTIAL", ""),
	endpoint=azure_endpoint,
	)
	elif "Local" in model_source:
	pass
	# print("Using local model: ", model_choice)
	elif "AWS" in model_source:
	pass
	# print("Using AWS Bedrock model:", model_choice)

	whole_conversation = [summarise_topic_descriptions_system_prompt]

	# Process requests to large language model
	(
	responses,
	conversation_history,
	whole_conversation,
	whole_conversation_metadata,
	response_text,
	) = process_requests(
	formatted_summary_prompt,
	summarise_topic_descriptions_system_prompt,
	conversation_history,
	whole_conversation,
	whole_conversation_metadata,
	client,
	client_config,
	model_choice,
	temperature,
	bedrock_runtime=bedrock_runtime,
	model_source=model_source,
	local_model=local_model,
	tokenizer=tokenizer,
	assistant_model=assistant_model,
	assistant_prefill=summary_assistant_prefill,
	api_url=api_url,
	)

	summarised_output = re.sub(
	r"\n{2,}", "\n", response_text
	) # Replace multiple line breaks with a single line break
	summarised_output = re.sub(
	r"^\n{1,}", "", summarised_output
	) # Remove one or more line breaks at the start
	summarised_output = re.sub(
	r"\n", "<br>", summarised_output
	) # Replace \n with more html friendly <br> tags
	summarised_output = summarised_output.strip()

	print("Finished summary query")

	# Ensure the system prompt is included in the conversation history
	try:
	if isinstance(conversation_history, list):
	has_system_prompt = False

	if conversation_history:
	first_entry = conversation_history[0]
	if isinstance(first_entry, dict):
	role_is_system = first_entry.get("role") == "system"
	parts = first_entry.get("parts")
	content_matches = (
	parts == summarise_topic_descriptions_system_prompt
	or (
	isinstance(parts, list)
	and summarise_topic_descriptions_system_prompt in parts
	)
	)
	has_system_prompt = role_is_system and content_matches
	elif isinstance(first_entry, str):
	has_system_prompt = (
	first_entry.strip().lower().startswith("system:")
	)

	if not has_system_prompt:
	conversation_history.insert(
	0,
	{
	"role": "system",
	"parts": [summarise_topic_descriptions_system_prompt],
	},
	)
	except Exception as _e:
	# Non-fatal: if anything goes wrong, return the original conversation history
	pass

	return (
	summarised_output,
	conversation_history,
	whole_conversation_metadata,
	response_text,
	)


	def process_debug_output_iteration(
	output_debug_files: str,
	summaries_folder: str,
	batch_file_path_details: str,
	model_choice_clean_short: str,
	final_system_prompt: str,
	summarised_output: str,
	conversation_history: list,
	metadata: list,
	log_output_files: list,
	task_type: str,
	) -> tuple[str, str, str, str]:
	"""
	Writes debug files for summary generation if output_debug_files is "True",
	and returns the content of the prompt, summary, conversation, and metadata for the current iteration.

	Args:
	output_debug_files (str): Flag to indicate if debug files should be written.
	summaries_folder (str): The folder where output files are saved.
	batch_file_path_details (str): Details for the batch file path.
	model_choice_clean_short (str): Shortened cleaned model choice.
	final_system_prompt (str): The system prompt content.
	summarised_output (str): The summarised output content.
	conversation_history (list): The full conversation history.
	metadata (list): The metadata for the conversation.
	log_output_files (list): A list to append paths of written log files. This list is modified in-place.
	task_type (str): The type of task being performed.
	Returns:
	tuple[str, str, str, str]: A tuple containing the content of the prompt,
	summarised output, conversation history (as string),
	and metadata (as string) for the current iteration.
	"""
	current_prompt_content = final_system_prompt
	current_summary_content = summarised_output

	if isinstance(conversation_history, list):

	# Handle both list of strings and list of dicts
	if conversation_history and isinstance(conversation_history[0], dict):
	# Convert list of dicts to list of strings
	conversation_strings = list()
	for entry in conversation_history:
	if "role" in entry and "parts" in entry:
	role = entry["role"].capitalize()
	message = (
	" ".join(entry["parts"])
	if isinstance(entry["parts"], list)
	else str(entry["parts"])
	)
	conversation_strings.append(f"{role}: {message}")
	else:
	# Fallback for unexpected dict format
	conversation_strings.append(str(entry))
	current_conversation_content = "\n".join(conversation_strings)
	else:
	# Handle list of strings
	current_conversation_content = "\n".join(conversation_history)
	else:
	current_conversation_content = str(conversation_history)
	current_metadata_content = str(metadata)
	current_task_type = task_type

	if output_debug_files == "True":
	try:
	formatted_prompt_output_path = (
	summaries_folder
	+ batch_file_path_details
	+ "_full_prompt_"
	+ model_choice_clean_short
	+ "_"
	+ current_task_type
	+ ".txt"
	)
	final_table_output_path = (
	summaries_folder
	+ batch_file_path_details
	+ "_full_response_"
	+ model_choice_clean_short
	+ "_"
	+ current_task_type
	+ ".txt"
	)
	whole_conversation_path = (
	summaries_folder
	+ batch_file_path_details
	+ "_full_conversation_"
	+ model_choice_clean_short
	+ "_"
	+ current_task_type
	+ ".txt"
	)
	whole_conversation_path_meta = (
	summaries_folder
	+ batch_file_path_details
	+ "_metadata_"
	+ model_choice_clean_short
	+ "_"
	+ current_task_type
	+ ".txt"
	)

	with open(
	formatted_prompt_output_path,
	"w",
	encoding="utf-8-sig",
	errors="replace",
	) as f:
	f.write(current_prompt_content)
	with open(
	final_table_output_path, "w", encoding="utf-8-sig", errors="replace"
	) as f:
	f.write(current_summary_content)
	with open(
	whole_conversation_path, "w", encoding="utf-8-sig", errors="replace"
	) as f:
	f.write(current_conversation_content)
	with open(
	whole_conversation_path_meta,
	"w",
	encoding="utf-8-sig",
	errors="replace",
	) as f:
	f.write(current_metadata_content)

	log_output_files.append(formatted_prompt_output_path)
	log_output_files.append(final_table_output_path)
	log_output_files.append(whole_conversation_path)
	log_output_files.append(whole_conversation_path_meta)
	except Exception as e:
	print(f"Error in writing debug files for summary: {e}")

	# Return the content of the objects for the current iteration.
	# The caller can then append these to separate lists if accumulation is desired.
	return (
	current_prompt_content,
	current_summary_content,
	current_conversation_content,
	current_metadata_content,
	)


	def convert_markdown_headers_to_excel_format(text: str) -> str:
	"""
	Convert markdown headers to Excel-friendly format that preserves hierarchy.

	Converts:
	- # Header (H1) -> === HEADER === (most prominent)
	- ## Header (H2) -> --- Header --- (medium)
	- ### Header (H3) -> ── Header ── (less prominent)
	- #### Header (H4) -> • Header (with bullet)
	- ##### Header (H5) -> • Header (indented)
	- ###### Header (H6) -> • Header (more indented)

	Args:
	text (str): Text containing markdown headers

	Returns:
	str: Text with markdown headers converted to Excel-friendly format
	"""
	if not text:
	return text

	lines = text.split("\n")
	converted_lines = []

	for line in lines:
	# Match markdown headers (# through ######)
	header_match = re.match(r"^(#{1,6})\s+(.+)$", line)
	if header_match:
	header_level = len(header_match.group(1)) # Number of # characters
	header_text = header_match.group(2).strip()

	if header_level == 1:
	# H1: Most prominent - uppercase with double equals
	converted_line = f"=== {header_text.upper()} ==="
	elif header_level == 2:
	# H2: Medium prominence - title case with dashes
	converted_line = f"--- {header_text.title()} ---"
	elif header_level == 3:
	# H3: Less prominent - title case with single dashes
	converted_line = f"── {header_text.title()} ──"
	elif header_level == 4:
	# H4: Bullet with no indentation
	converted_line = f"• {header_text}"
	elif header_level == 5:
	# H5: Bullet with indentation
	converted_line = f" • {header_text}"
	else: # header_level == 6
	# H6: Bullet with more indentation
	converted_line = f" • {header_text}"

	converted_lines.append(converted_line)
	else:
	converted_lines.append(line)

	return "\n".join(converted_lines)


	@spaces.GPU(duration=MAX_SPACES_GPU_RUN_TIME)
	def overall_summary(
	topic_summary_df: pd.DataFrame,
	model_choice: str,
	in_api_key: str,
	temperature: float,
	reference_data_file_name: str,
	output_folder: str = OUTPUT_FOLDER,
	context_textbox: str = "",
	aws_access_key_textbox: str = "",
	aws_secret_key_textbox: str = "",
	aws_region_textbox: str = "",
	model_name_map: dict = model_name_map,
	hf_api_key_textbox: str = "",
	azure_endpoint_textbox: str = "",
	existing_logged_content: list = list(),
	api_url: str = None,
	output_debug_files: str = "False",
	log_output_files: list = list(),
	reasoning_suffix: str = reasoning_suffix,
	local_model: object = None,
	tokenizer: object = None,
	assistant_model: object = None,
	summarise_everything_prompt: str = summarise_everything_prompt,
	summarise_everything_system_prompt: str = summarise_everything_system_prompt,
	summarise_format_radio: str = detailed_summary_format_prompt,
	additional_summary_instructions: str = "",
	do_summaries: str = "Yes",
	progress=gr.Progress(track_tqdm=True),
	) -> Tuple[
	List[str],
	List[str],
	int,
	str,
	List[str],
	List[str],
	int,
	int,
	int,
	float,
	List[dict],
	]:
	"""
	Create an overall summary of all responses based on a topic summary table.

	Args:
	topic_summary_df (pd.DataFrame): DataFrame with columns "Summary number", "Page range", "Summary"
	model_choice (str): Name of the LLM model to use
	in_api_key (str): API key for model access
	temperature (float): Temperature parameter for model generation
	reference_data_file_name (str): Name of reference data file
	output_folder (str, optional): Folder to save outputs. Defaults to OUTPUT_FOLDER.
	context_textbox (str, optional): Additional context. Defaults to empty string.
	aws_access_key_textbox (str, optional): AWS access key. Defaults to empty string.
	aws_secret_key_textbox (str, optional): AWS secret key. Defaults to empty string.
	aws_region_textbox (str, optional): AWS region. Defaults to empty string.
	model_name_map (dict, optional): Mapping of model names. Defaults to model_name_map.
	hf_api_key_textbox (str, optional): Hugging Face API key. Defaults to empty string.
	existing_logged_content (list, optional): List of existing logged content. Defaults to empty list.
	output_debug_files (str, optional): Flag to indicate if debug files should be written. Defaults to "False".
	log_output_files (list, optional): List of existing logged content. Defaults to empty list.
	api_url (str, optional): API URL for inference-server models. Defaults to None.
	reasoning_suffix (str, optional): Suffix for reasoning. Defaults to reasoning_suffix.
	local_model (object, optional): Local model object. Defaults to None.
	tokenizer (object, optional): Tokenizer object. Defaults to None.
	assistant_model (object, optional): Assistant model object. Defaults to None.
	summarise_everything_prompt (str, optional): Prompt for overall summary
	summarise_everything_system_prompt (str, optional): System prompt for overall summary
	summarise_format_radio (str, optional): Summary format radio. Defaults to summarise_format_radio.
	additional_summary_instructions (str, optional): Additional summary instructions. Defaults to additional_summary_instructions.
	do_summaries (str, optional): Whether to generate summaries. Defaults to "Yes".
	progress (gr.Progress, optional): Progress tracker. Defaults to gr.Progress(track_tqdm=True).

	Returns:
	Tuple containing:
	List[str]: Output files
	List[str]: Text summarised outputs
	int: Latest summary completed
	str: Output metadata
	List[str]: Summarised outputs
	List[str]: Summarised outputs for DataFrame
	int: Number of input tokens
	int: Number of output tokens
	int: Number of API calls
	float: Time taken
	List[dict]: List of logged content
	"""

	out_metadata = list()
	latest_summary_completed = 0
	output_files = list()
	txt_summarised_outputs = list()
	summarised_outputs = list()
	summarised_outputs_for_df = list()
	input_tokens_num = 0
	output_tokens_num = 0
	number_of_calls_num = 0
	time_taken = 0
	out_message = list()
	all_logged_content = list()
	all_prompts_content = list()
	all_summaries_content = list()
	all_metadata_content = list()
	all_groups_content = list()
	all_batches_content = list()
	all_model_choice_content = list()
	all_validated_content = list()
	task_type = "Overall summary"
	all_task_type_content = list()
	log_output_files = list()
	all_logged_content = list()
	all_file_names_content = list()
	tic = time.perf_counter()

	summaries_folder = os.path.join(output_folder, "summaries")
	os.makedirs(summaries_folder, exist_ok=True)

	# Expect three columns: Summary number, Page range, Summary
	required_cols = ["Summary number", "Page range", "Summary"]
	if not all(c in topic_summary_df.columns for c in required_cols):
	raise ValueError(
	"topic_summary_df must have columns: Summary number, Page range, Summary"
	)
	topic_summary_df = topic_summary_df[required_cols].copy()
	topic_summary_df = topic_summary_df.sort_values(by="Summary number", ascending=True)

	# Single "group" containing the whole table (no grouping by Group column)
	unique_groups = ["All"]

	len(unique_groups)

	if context_textbox and "The context of this analysis is" not in context_textbox:
	context_textbox = "The context of this analysis is '" + context_textbox + "'."

	# if length_groups > 1:
	# comprehensive_summary_format_prompt = (
	# comprehensive_summary_format_prompt_by_group
	# )
	# else:
	# comprehensive_summary_format_prompt = comprehensive_summary_format_prompt

	batch_file_path_details = create_batch_file_path_details(reference_data_file_name)
	# Use model_choice directly as short_name, or try to get from model_name_map if available
	if model_name_map and model_choice in model_name_map:
	model_choice_clean = model_name_map[model_choice]["short_name"]
	else:
	# Use model_choice directly if not in model_name_map
	model_choice_clean = model_choice
	model_choice_clean_short = clean_column_name(
	model_choice_clean, max_length=20, front_characters=False
	)

	tic = time.perf_counter()

	# Determine model source from model_choice using defaults from config.py
	# Does not check model_name_map - uses the defined defaults
	model_source = get_model_source_from_model_choice(model_choice)

	# Load model and tokenizer together to ensure they're from the same source
	# This prevents mismatches that could occur if they're loaded separately
	# Similar to llm_funcs.py pattern (lines 830-839) and llm_entity_detection.py (lines 519-533)
	if (model_source == "Local") & (local_model is None or tokenizer is None):
	progress(0.1, f"Using model: {LOCAL_TRANSFORMERS_LLM_PII_MODEL_CHOICE}")
	# Use load_model() to ensure both are loaded atomically
	# This is safer than calling get_pii_model() and get_pii_tokenizer() separately
	loaded_model, loaded_tokenizer, loaded_assistant_model = load_model()
	if local_model is None:
	local_model = loaded_model
	if tokenizer is None:
	tokenizer = loaded_tokenizer
	if assistant_model is None:
	assistant_model = loaded_assistant_model

	summary_loop = tqdm(
	unique_groups, desc="Creating overall summary for groups", unit="groups"
	)

	if do_summaries == "Yes":
	# Determine model source from model_choice using defaults from config.py
	# Does not check model_name_map - uses the defined defaults
	model_source = get_model_source_from_model_choice(model_choice)

	# Setup bedrock for AWS models only
	# Use the same approach as file_redaction.py (lines 939-969) for consistency
	bedrock_runtime = None
	if model_source == "AWS":
	# Use aws_region_textbox if provided, otherwise fall back to AWS_REGION from config
	region = aws_region_textbox if aws_region_textbox else AWS_REGION

	if RUN_AWS_FUNCTIONS and PRIORITISE_SSO_OVER_AWS_ENV_ACCESS_KEYS:
	print("Connecting to Bedrock via existing SSO connection")
	bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
	elif aws_access_key_textbox and aws_secret_key_textbox:
	print(
	"Connecting to Bedrock using AWS access key and secret keys from user input."
	)
	bedrock_runtime = boto3.client(
	"bedrock-runtime",
	aws_access_key_id=aws_access_key_textbox,
	aws_secret_access_key=aws_secret_key_textbox,
	region_name=region,
	)
	elif RUN_AWS_FUNCTIONS:
	print("Connecting to Bedrock via existing SSO connection")
	bedrock_runtime = boto3.client("bedrock-runtime", region_name=region)
	elif AWS_ACCESS_KEY and AWS_SECRET_KEY:
	print("Getting Bedrock credentials from environment variables")
	bedrock_runtime = boto3.client(
	"bedrock-runtime",
	aws_access_key_id=AWS_ACCESS_KEY,
	aws_secret_access_key=AWS_SECRET_KEY,
	region_name=region,
	)
	else:
	bedrock_runtime = None
	out_message = "Cannot connect to AWS Bedrock service. Please provide access keys under LLM settings, or choose another model type."
	print(out_message)
	raise Exception(out_message)

	for summary_group in summary_loop:

	print("Creating overall summary for group:", summary_group)

	# Use the full table (three columns: Summary number, Page range, Summary)
	group_df = topic_summary_df.copy()

	# Prepare the system prompt first (needed for token counting)
	formatted_summarise_everything_system_prompt = (
	summarise_everything_system_prompt.format(
	consultation_context=context_textbox
	)
	)

	# Apply reasoning suffix for GPT-OSS models (Local, inference-server, or AWS)
	is_gpt_oss_model = (
	"gpt-oss" in model_choice.lower() or "gpt_oss" in model_choice.lower()
	)

	if is_gpt_oss_model:
	# Use default reasoning suffix if not set
	effective_reasoning_suffix = (
	reasoning_suffix if reasoning_suffix else "Reasoning: low"
	)
	if effective_reasoning_suffix:
	formatted_summarise_everything_system_prompt = (
	formatted_summarise_everything_system_prompt
	+ "\n"
	+ effective_reasoning_suffix
	)
	elif "Local" in model_source and reasoning_suffix:
	# For other local models, use reasoning_suffix if provided
	formatted_summarise_everything_system_prompt = (
	formatted_summarise_everything_system_prompt
	+ "\n"
	+ reasoning_suffix
	)

	if additional_summary_instructions:
	additional_summary_instructions = (
	"Important additional instructions to follow closely: "
	+ additional_summary_instructions
	)

	# Create a test prompt with empty table to get base token count
	test_summary_text = ""
	test_formatted_summary_prompt = [
	summarise_everything_prompt.format(
	topic_summary_table=test_summary_text,
	summary_format=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)
	]

	# Calculate base token count (system prompt + prompt template without table)
	full_test_text = (
	formatted_summarise_everything_system_prompt
	+ "\n"
	+ test_formatted_summary_prompt[0]
	)
	base_token_count = count_tokens_in_text(
	full_test_text, tokenizer, model_source
	)

	# Calculate available tokens for the summary table
	available_tokens = LLM_CONTEXT_LENGTH - base_token_count

	# Ensure markdown table rows don't get visually "split" by newlines inside cells.
	# Markdown tables don't reliably support multiline cells, so we replace internal
	# newlines with a single-line representation before calling `to_markdown()`.
	def _escape_markdown_table_cell(value):
	if not isinstance(value, str):
	return value
	s = value.replace("\r\n", "\n").replace("\r", "\n")
	# Keep content in a single cell/row in markdown output
	s = s.replace("\n", "\\n")
	# Avoid breaking markdown table syntax
	s = s.replace("\|", "\\\|")
	return s

	if "Summary" in group_df.columns:
	group_df["Summary"] = group_df["Summary"].apply(
	_escape_markdown_table_cell
	)

	# Truncate DataFrame rows if needed to fit within context limit
	if len(group_df) > 0:
	# Start with all rows and check if they fit
	current_summary_text = group_df.to_markdown(index=False)
	current_summary_text = clean_markdown_table_whitespace(
	current_summary_text
	)
	current_token_count = count_tokens_in_text(
	current_summary_text, tokenizer, model_source
	)

	# If the full table exceeds available tokens, truncate rows
	if current_token_count > available_tokens:
	print(
	f"Warning: Summary table for group '{summary_group}' exceeds context limit. "
	f"Truncating rows. Table tokens: {current_token_count}, Available: {available_tokens}"
	)

	# Binary search approach: find the maximum number of rows that fit
	# Start with all rows and reduce until we fit
	num_rows = len(group_df)
	min_rows = 0
	max_rows = num_rows
	best_df = group_df.iloc[:0] # Empty DataFrame as fallback

	# Try to find the maximum number of rows that fit
	while min_rows < max_rows:
	mid_rows = (min_rows + max_rows + 1) // 2
	test_df = group_df.iloc[:mid_rows]
	test_summary = test_df.to_markdown(index=False)
	test_summary = clean_markdown_table_whitespace(test_summary)
	test_token_count = count_tokens_in_text(
	test_summary, tokenizer, model_source
	)

	if test_token_count <= available_tokens:
	best_df = test_df
	min_rows = mid_rows
	else:
	max_rows = mid_rows - 1

	# Use the best fitting DataFrame
	group_df = best_df
	print(
	f"Truncated to {len(group_df)} rows (from {num_rows} original rows) "
	f"to fit within context limit."
	)

	# Create summary_text from (possibly truncated) DataFrame
	summary_text = group_df.to_markdown(index=False)
	# Clean extraneous whitespace from markdown table cells
	summary_text = clean_markdown_table_whitespace(summary_text)

	formatted_summary_prompt = [
	summarise_everything_prompt.format(
	topic_summary_table=summary_text,
	summary_format=summarise_format_radio,
	additional_summary_instructions=additional_summary_instructions,
	)
	]

	combined_prompt = (
	formatted_summarise_everything_system_prompt
	+ "\n"
	+ formatted_summary_prompt[0]
	)

	try:
	response, conversation_history, metadata, response_text = (
	summarise_output_topics_query(
	model_choice,
	in_api_key,
	temperature,
	formatted_summary_prompt,
	formatted_summarise_everything_system_prompt,
	model_source,
	bedrock_runtime,
	local_model,
	tokenizer=tokenizer,
	assistant_model=assistant_model,
	azure_endpoint=azure_endpoint_textbox,
	api_url=api_url,
	)
	)
	summarised_output_for_df = response_text
	summarised_output = response
	except Exception as e:
	print(
	"Cannot create overall summary for group:",
	summary_group,
	"due to:",
	e,
	)
	summarised_output = ""
	summarised_output_for_df = ""

	# Remove multiple consecutive line breaks (2 or more) and replace with single line break
	if summarised_output_for_df:
	summarised_output_for_df = re.sub(
	r"\n{2,}", "\n", summarised_output_for_df
	)
	# Convert markdown headers to Excel-friendly format
	summarised_output_for_df = convert_markdown_headers_to_excel_format(
	summarised_output_for_df
	)
	if summarised_output:
	summarised_output = re.sub(r"\n{2,}", "\n", summarised_output)

	summarised_outputs_for_df.append(summarised_output_for_df)
	summarised_outputs.append(summarised_output)
	txt_summarised_outputs.append(
	f"""Group name: {summary_group}\n""" + summarised_output
	)

	out_metadata.extend(metadata)
	out_metadata_str = ". ".join(out_metadata)

	full_prompt = (
	formatted_summarise_everything_system_prompt
	+ "\n"
	+ formatted_summary_prompt[0]
	)

	(
	current_prompt_content_logged,
	current_summary_content_logged,
	current_conversation_content_logged,
	current_metadata_content_logged,
	) = process_debug_output_iteration(
	output_debug_files,
	summaries_folder,
	batch_file_path_details,
	model_choice_clean_short,
	full_prompt,
	summarised_output,
	conversation_history,
	metadata,
	log_output_files,
	task_type=task_type,
	)

	all_prompts_content.append(current_prompt_content_logged)
	all_summaries_content.append(current_summary_content_logged)
	# all_conversation_content.append(current_conversation_content_logged)
	all_metadata_content.append(current_metadata_content_logged)
	all_groups_content.append(summary_group)
	all_batches_content.append("1")
	all_model_choice_content.append(model_choice_clean_short)
	all_validated_content.append("No")
	all_task_type_content.append(task_type)
	all_file_names_content.append(reference_data_file_name)
	latest_summary_completed += 1
	clean_column_name(summary_group)

	# Write overall outputs to csv
	overall_summary_output_csv_path = (
	output_folder
	+ "summaries/"
	+ batch_file_path_details
	+ "_overall_summary_"
	+ model_choice_clean_short
	+ ".csv"
	)
	summarised_outputs_df = pd.DataFrame(
	data={"Group": unique_groups, "Summary": summarised_outputs_for_df}
	)
	if output_debug_files == "True":
	summarised_outputs_df.drop(["1", "2", "3"], axis=1, errors="ignore").to_csv(
	overall_summary_output_csv_path, index=None, encoding="utf-8-sig"
	)
	output_files.append(overall_summary_output_csv_path)

	summarised_outputs_df_for_display = pd.DataFrame(
	data={"Group": unique_groups, "Summary": summarised_outputs}
	)
	summarised_outputs_df_for_display["Summary"] = (
	summarised_outputs_df_for_display["Summary"]
	.apply(lambda x: markdown.markdown(x) if isinstance(x, str) else x)
	.str.replace(r"\n", "<br>", regex=False)
	.str.replace(r"(<br>\s*){2,}", "<br>", regex=True)
	)
	html_output_table = summarised_outputs_df_for_display.to_html(
	index=False, escape=False
	)

	output_files = list(set(output_files))

	input_tokens_num, output_tokens_num, number_of_calls_num = (
	calculate_tokens_from_metadata(
	out_metadata_str, model_choice, model_name_map
	)
	)

	# Check if beyond max time allowed for processing and break if necessary
	toc = time.perf_counter()
	time_taken = toc - tic

	out_message = "\n".join(out_message)
	out_message = (
	out_message
	+ " "
	+ f"Overall summary finished processing. Total time: {time_taken:.2f}s"
	)
	print(out_message)

	# Combine the logged content into a list of dictionaries
	all_logged_content = [
	{
	"prompt": prompt,
	"response": summary,
	"metadata": metadata,
	"batch": batch,
	"model_choice": model_choice,
	"validated": validated,
	"group": group,
	"task_type": task_type,
	"file_name": file_name,
	}
	for prompt, summary, metadata, batch, model_choice, validated, group, task_type, file_name in zip(
	all_prompts_content,
	all_summaries_content,
	all_metadata_content,
	all_batches_content,
	all_model_choice_content,
	all_validated_content,
	all_groups_content,
	all_task_type_content,
	all_file_names_content,
	)
	]

	if isinstance(existing_logged_content, pd.DataFrame):
	existing_logged_content = existing_logged_content.to_dict(orient="records")

	out_logged_content = existing_logged_content + all_logged_content

	return (
	output_files,
	html_output_table,
	summarised_outputs_df,
	out_metadata_str,
	input_tokens_num,
	output_tokens_num,
	number_of_calls_num,
	time_taken,
	out_message,
	out_logged_content,
	combined_prompt,
	response_text,
	)