| # -*- coding: utf-8 -*- | |
| # This is for automatic glossary generation only, unrelated to the more thorough glossary generation you get from clicking the "Extract Glossary" button | |
| import os | |
| import re | |
| import sys | |
| import threading | |
| import tempfile | |
| import queue | |
| import time | |
| import json | |
| from bs4 import BeautifulSoup | |
| import PatternManager as PM | |
| import duplicate_detection_config as ddc | |
| from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed | |
| # Default unified auto-glossary prompt (used when AUTO_GLOSSARY_PROMPT is unset/empty). | |
| # NOTE: This matches the GUI's default_unified_prompt in GlossaryManager_GUI.py. | |
| DEFAULT_AUTO_GLOSARY_PROMPT3 = """You are a novel glossary extraction assistant. | |
| You must strictly return ONLY CSV format with 3-5 columns in this exact order: type,raw_name,translated_name,gender,description. | |
| For character entries, determine gender from context, leave empty if context is insufficient. | |
| For non-character entries, leave gender empty. | |
| The description column is optional and can contain brief context (role, location, significance). | |
| Critical Requirement: The translated name and description column must be in {language}. | |
| For example: | |
| character,디히리트 아데,Dihirit Ade,female,The enigmatic guild leader of the Shadow Lotus who operates from the concealed backrooms of the capital, manipulating city politics through commerce and wielding dual daggers with lethal precision | |
| character,김상휴,Kim Sang-hyu,male,A master swordsman from the Northern Sect known for his icy demeanor and unparalleled skill with the Frost Blade technique which he uses to defend the border fortress | |
| CRITICAL EXTRACTION RULES: | |
| - Extract All Character names, Terms, Location names, Ability/Skill names, Item names, Organization names, and Titles/Ranks. | |
| - Do NOT extract sentences, dialogue, actions, questions, or statements as glossary entries | |
| - REJECT entries that contain verbs or end with punctuation (?, !, .) | |
| - REJECT entries starting with: "Me", "How", "What", "Why", "I", "He", "She", "They", "That's", "So", "Therefore", "Still", "But", "Protagonist". (The description column is excluded from this restriction) | |
| - Do NOT output any entries that are rejected by the above rules; skip them entirely | |
| - If unsure whether something is a proper noun/name, skip it | |
| - The description column must contain detailed context/explanation | |
| - Create at least one glossary entry for EVERY context marker window (lines ending with "=== CONTEXT N END ==="); treat each marker boundary as a required extraction point. | |
| - You must create {marker} glossary entries (one or more per window; do not invent placeholders). | |
| - You must include absolutely all characters found in the provided text in your glossary generation. Do not skip any character.""" | |
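| # Illustrative substitution (assumption: the caller fills these placeholders with str.format-style | |
| # replacement before the prompt is sent), e.g. with hypothetical values: | |
| #   DEFAULT_AUTO_GLOSARY_PROMPT3.format(language="English", marker=12) | |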
| # Class-level shared lock for API submission timing | |
| _api_submission_lock = threading.Lock() | |
| _last_api_submission_time = 0 | |
| _results_lock = threading.Lock() | |
| _file_write_lock = threading.Lock() | |
| _stop_requested = False | |
| # Register watchdog cleanup once per process (best-effort) | |
| _watchdog_atexit_registered = False | |
| BOOK_TITLE_RAW = None | |
| BOOK_TITLE_TRANSLATED = None | |
| BOOK_TITLE_VALUE = None  # Legacy field; kept for backward compatibility but unused by the current flow. | |
| def _extract_title_from_metadata(meta): | |
| """Best-effort lookup of a book title inside metadata structures.""" | |
| if not isinstance(meta, dict): | |
| return None | |
| title_keys = [ | |
| "title", | |
| "book_title", | |
| "bookTitle", | |
| "title_translated", | |
| "translated_title", | |
| "title_en", | |
| ] | |
| for key in title_keys: | |
| val = meta.get(key) | |
| if val: | |
| return str(val).strip() | |
| for nested_key in ("metadata", "opf", "info", "data"): | |
| nested = meta.get(nested_key) | |
| if isinstance(nested, dict): | |
| nested_title = _extract_title_from_metadata(nested) | |
| if nested_title: | |
| return nested_title | |
| return None | |
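| # Illustrative lookups (hypothetical metadata dicts), showing the recursive fallback: | |
| #   _extract_title_from_metadata({"opf": {"title": "나의 소설"}}) -> "나의 소설" | |
| #   _extract_title_from_metadata({"author": "..."}) -> None | |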
| def _extract_raw_title_from_epub(epub_path): | |
| """Extract the raw untranslated title from the input EPUB content.opf.""" | |
| if not epub_path or not os.path.exists(epub_path): | |
| return None | |
| print(f"[Metadata] Checking input EPUB for raw title: {epub_path}") | |
| # Try manual parsing first (more robust) | |
| try: | |
| import zipfile | |
| with zipfile.ZipFile(epub_path, 'r') as zf: | |
| # Find opf | |
| opf_name = next((n for n in zf.namelist() if n.lower().endswith('.opf')), None) | |
| if opf_name: | |
| content = zf.read(opf_name).decode('utf-8', errors='ignore') | |
| # Use BS4 with xml parser | |
| try: | |
| soup = BeautifulSoup(content, 'xml') | |
| except Exception: | |
| soup = BeautifulSoup(content, 'html.parser') | |
| # Try dc:title | |
| title_tag = soup.find('dc:title') | |
| if not title_tag: | |
| # Fallback to any title tag | |
| title_tag = soup.find('title') | |
| if title_tag: | |
| val = title_tag.get_text(strip=True) | |
| if val: | |
| return val | |
| except Exception as e: | |
| print(f"[Warning] Manual EPUB title extraction failed: {e}") | |
| # Fallback: ebooklib | |
| try: | |
| from ebooklib import epub | |
| book = epub.read_epub(epub_path) | |
| titles = book.get_metadata("DC", "title") | |
| if titles: | |
| val = titles[0][0] | |
| if val: | |
| return str(val).strip() | |
| except Exception as e: | |
| print(f"[Warning] Could not read EPUB metadata via ebooklib: {e}") | |
| return None | |
| def _extract_translated_title_from_metadata(output_dir): | |
| """Extract translated title from metadata.json in output directory.""" | |
| base_dir = os.path.abspath(output_dir or ".") | |
| epub_path = os.getenv("EPUB_PATH", "") | |
| epub_base = os.path.splitext(os.path.basename(epub_path or ""))[0] if epub_path else None | |
| candidates = [] | |
| # Only check output directory logic for translated title | |
| if epub_base: | |
| candidates.append(os.path.join(base_dir, epub_base, "metadata.json")) | |
| # Also check direct output dir | |
| candidates.append(os.path.join(base_dir, "metadata.json")) | |
| for meta_path in candidates: | |
| # print(f"[Metadata] Checking for translated book title at: {meta_path}") | |
| if os.path.exists(meta_path): | |
| try: | |
| with open(meta_path, "r", encoding="utf-8") as f: | |
| meta = json.load(f) | |
| meta_title = _extract_title_from_metadata(meta) | |
| if meta_title: | |
| return meta_title.strip() | |
| except Exception as e: | |
| print(f"[Warning] Could not read metadata.json for book title: {e}") | |
| return None | |
| def _derive_book_title(output_dir): | |
| """Legacy wrapper - logic moved to save_glossary main flow.""" | |
| return None | |
| def _ensure_book_title_csv_lines(csv_lines): | |
| """ | |
| Ensure the CSV (header + rows) contains a leading book title entry when enabled. | |
| Uses distinct raw and translated titles. | |
| """ | |
| if not csv_lines: | |
| return csv_lines | |
| include = os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "1").lower() not in ("0", "false", "no") | |
| raw_title = BOOK_TITLE_RAW | |
| trans_title = BOOK_TITLE_TRANSLATED | |
| # Prefer a distinct raw/translated pair. If only one title is available, fall back to it for | |
| # both columns rather than inventing a translation; if neither is available, skip the book | |
| # entry entirely (handled below, after the include check). | |
| if not include: | |
| return csv_lines | |
| if not raw_title and not trans_title: | |
| return csv_lines | |
| # Normalize for dedup check | |
| norm_raw = raw_title.lower() if raw_title else "" | |
| norm_trans = trans_title.lower() if trans_title else "" | |
| # Skip if already present | |
| header = csv_lines[0] | |
| for line in csv_lines[1:]: | |
| parts = [p.strip() for p in line.split(",")] | |
| if len(parts) >= 3: | |
| # Check if this line is already the book title | |
| p_raw = parts[1].lower() | |
| p_trans = parts[2].lower() | |
| # Match if we find our raw title or our translated title in the respective columns | |
| if (raw_title and p_raw == norm_raw) or (trans_title and p_trans == norm_trans): | |
| return csv_lines | |
| fields = [f.strip() for f in header.split(",")] | |
| row = [] | |
| for field in fields: | |
| key = field.lower() | |
| if key == "type": | |
| row.append("book") | |
| elif key == "raw_name": | |
| row.append(raw_title if raw_title else (trans_title if trans_title else "")) | |
| elif key == "translated_name": | |
| row.append(trans_title if trans_title else (raw_title if raw_title else "")) | |
| else: | |
| row.append("") | |
| book_line = ",".join(row) | |
| return [header, book_line] + csv_lines[1:] | |
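| # Illustrative behavior (hypothetical titles and rows), assuming BOOK_TITLE_RAW = "전생자", | |
| # BOOK_TITLE_TRANSLATED = "The Reincarnator", and GLOSSARY_INCLUDE_BOOK_TITLE enabled: | |
| #   ["type,raw_name,translated_name", "character,김상휴,Kim Sang-hyu"] | |
| #   -> ["type,raw_name,translated_name", "book,전생자,The Reincarnator", "character,김상휴,Kim Sang-hyu"] | |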
| def _csv_sort_key(line: str): | |
| """Sort book first, then characters, then others by raw name.""" | |
| try: | |
| parts = line.split(",") | |
| entry_type = parts[0].strip().lower() | |
| name = parts[1].lower() if len(parts) > 1 else line.lower() | |
| except Exception: | |
| entry_type = "" | |
| name = line.lower() | |
| order = {"book": -1, "character": 0, "term": 1} | |
| return (order.get(entry_type, 2), name) | |
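| # Illustrative sort keys (hypothetical rows): | |
| #   _csv_sort_key("book,전생자,The Reincarnator") -> (-1, "전생자") | |
| #   _csv_sort_key("character,김상휴,Kim Sang-hyu") -> (0, "김상휴") | |
| #   _csv_sort_key("term,마나,Mana") -> (1, "마나") | |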
| # Timing variables | |
| _extraction_time = 0 | |
| _api_time = 0 | |
| _freq_check_time = 0 | |
| _dedup_time = 0 | |
| _io_time = 0 | |
| def _get_stop_file_path(): | |
| """Return the stop-flag file path (shared across processes).""" | |
| return os.environ.get("GLOSSARY_STOP_FILE") or os.path.join(tempfile.gettempdir(), "glossarion_glossary.stop") | |
| def _get_glossary_status_file_path() -> str: | |
| """File path for cross-process status about chunk submission/completion. | |
| This lets the parent process decide whether it's safe to "wait for chunks" even when | |
| WAIT_FOR_CHUNKS is disabled. | |
| """ | |
| try: | |
| explicit = os.environ.get("GLOSSARY_STATUS_FILE") | |
| if explicit: | |
| return explicit | |
| except Exception: | |
| pass | |
| # Default: colocate next to the stop file so both processes can find it deterministically. | |
| try: | |
| stop_fp = _get_stop_file_path() | |
| if stop_fp: | |
| return f"{stop_fp}.status.json" | |
| except Exception: | |
| pass | |
| return os.path.join(tempfile.gettempdir(), "glossarion_glossary.status.json") | |
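| # With neither GLOSSARY_STATUS_FILE nor GLOSSARY_STOP_FILE set, this resolves to the default | |
| # stop-file path plus a ".status.json" suffix, e.g. <tempdir>/glossarion_glossary.stop.status.json. | |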
| def _write_glossary_status(payload: dict) -> None: | |
| """Best-effort atomic write of glossary chunk status.""" | |
| try: | |
| fp = _get_glossary_status_file_path() | |
| os.makedirs(os.path.dirname(fp) or ".", exist_ok=True) | |
| tmp = f"{fp}.tmp" | |
| with open(tmp, "w", encoding="utf-8") as f: | |
| json.dump(payload, f, ensure_ascii=False, indent=2) | |
| os.replace(tmp, fp) | |
| except Exception: | |
| # Status is best-effort only. | |
| pass | |
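| # Illustrative call (the payload keys here are hypothetical — the real schema is whatever the | |
| # callers pass in): | |
| #   _write_glossary_status({"submitted": 4, "completed": 2, "done": False}) | |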
| def _clear_api_watchdog_state(*, remove_watchdog_file: bool = True) -> None: | |
| """Best-effort reset of unified_api_client watchdog state. | |
| GlossaryManager often runs in a separate process; if it exits mid-stream or is force-stopped, | |
| its watchdog JSON file can keep the GUI progress bar "busy" until manually cleared. | |
| """ | |
| # Reset in-memory counters | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, '_api_watchdog_reset'): | |
| unified_api_client._api_watchdog_reset() | |
| except Exception: | |
| pass | |
| # Remove the per-process watchdog file (if enabled) | |
| if remove_watchdog_file: | |
| try: | |
| wd_dir = os.environ.get("GLOSSARION_WATCHDOG_DIR") | |
| if wd_dir and os.path.isdir(wd_dir): | |
| fp = os.path.join(wd_dir, f"api_watchdog_{os.getpid()}.json") | |
| tmp = f"{fp}.tmp" | |
| try: | |
| if os.path.exists(tmp): | |
| os.remove(tmp) | |
| except Exception: | |
| pass | |
| try: | |
| if os.path.exists(fp): | |
| os.remove(fp) | |
| except Exception: | |
| pass | |
| except Exception: | |
| pass | |
| def set_stop_flag(value: bool): | |
| """Set the module-level stop flag and propagate to shared channels.""" | |
| global _stop_requested | |
| _stop_requested = bool(value) | |
| # Mirror to environment for other components | |
| os.environ["TRANSLATION_CANCELLED"] = "1" if value else "0" | |
| # If we're stopping, clear watchdog immediately so the GUI bar doesn't stick. | |
| # (If graceful-stop semantics are needed, the caller should avoid setting stop until ready.) | |
| if value: | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| # Touch/remove stop file for cross-process signalling | |
| stop_path = _get_stop_file_path() | |
| try: | |
| if value: | |
| with open(stop_path, "w", encoding="utf-8") as f: | |
| f.write("stop") | |
| else: | |
| if os.path.exists(stop_path): | |
| os.remove(stop_path) | |
| except Exception: | |
| pass | |
| # Notify unified_api_client if present | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, "UnifiedClient"): | |
| unified_api_client.UnifiedClient._global_cancelled = bool(value) | |
| if hasattr(unified_api_client, "global_stop_flag"): | |
| unified_api_client.global_stop_flag = bool(value) | |
| except Exception: | |
| pass | |
| # Function to check if stop is requested (can be overridden) | |
| def is_stop_requested(): | |
| """Check if stop has been requested from any source. | |
| NOTE: TRANSLATION_CANCELLED is set on BOTH graceful and immediate stop. | |
| During graceful stop we must let in-flight API calls finish, so we only | |
| treat it as a stop signal when GRACEFUL_STOP is not active. When | |
| graceful stop IS active, the orchestrator in TransateKRtoEN handles the | |
| decision of whether to wait or cancel. | |
| """ | |
| if _stop_requested: | |
| return True | |
| # Environment toggle (set by GUI stop button) | |
| # Only treat as immediate stop when GRACEFUL_STOP is not active | |
| if os.environ.get("TRANSLATION_CANCELLED") == "1": | |
| if os.environ.get("GRACEFUL_STOP") != "1": | |
| return True | |
| # File-based stop flag for cross-process cancellation | |
| try: | |
| stop_path = _get_stop_file_path() | |
| if stop_path and os.path.exists(stop_path): | |
| return True | |
| except Exception: | |
| pass | |
| # Unified API client global cancellation | |
| try: | |
| import unified_api_client | |
| if getattr(unified_api_client, "global_stop_flag", False): | |
| return True | |
| if hasattr(unified_api_client, "UnifiedClient") and getattr(unified_api_client.UnifiedClient, "_global_cancelled", False): | |
| return True | |
| except Exception: | |
| pass | |
| return False | |
| def set_output_redirect(log_callback=None): | |
| """Redirect print statements to a callback function for GUI integration""" | |
| if log_callback: | |
| import threading | |
| class CallbackWriter: | |
| def __init__(self, callback): | |
| self.callback = callback | |
| self.main_thread = threading.main_thread() | |
| def write(self, text): | |
| if text.strip(): | |
| # The callback (append_log) is already thread-safe - it handles QTimer internally | |
| # So we can call it directly from any thread | |
| self.callback(text.strip()) | |
| def flush(self): | |
| pass | |
| sys.stdout = CallbackWriter(log_callback) | |
| def is_traditional_translation_api(model: str) -> bool: | |
| """Check if the model is a traditional translation API""" | |
| return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/') | |
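| # e.g. is_traditional_translation_api("deepl") -> True (likewise any "deepl/" or | |
| # "google-translate/" prefixed model), while is_traditional_translation_api("gemini-2.0-flash") -> False. | |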
| def _model_uses_own_auth(model: str) -> bool: | |
| """Check if the model uses its own authentication (no API key needed). | |
| authgpt/ uses OAuth tokens, vertex/ uses Google service account credentials.""" | |
| if not model: | |
| return False | |
| m = model.lower() | |
| return m.startswith('authgpt/') or m.startswith('vertex/') | |
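| # e.g. any "authgpt/..." or "vertex/..." model returns True; a plain "gpt-4o" returns False. | |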
| def _ensure_multi_key_config_loaded(): | |
| """Best-effort load of multi-key config when running in subprocesses. | |
| In subprocesses, in-memory key lists are not inherited. If multi-key mode is | |
| enabled via env but no keys are present, load them from config.json and | |
| initialize UnifiedClient's in-memory pool. | |
| """ | |
| try: | |
| if os.getenv('USE_MULTI_API_KEYS', '0') != '1': | |
| return | |
| except Exception: | |
| return | |
| # If keys are already present in env or in-memory, nothing to do. | |
| try: | |
| mk_env = os.getenv('MULTI_API_KEYS', '') | |
| if mk_env and str(mk_env).strip() not in ('', '[]', 'null', 'None'): | |
| return | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client as _uac | |
| with _uac.UnifiedClient._in_memory_multi_keys_lock: | |
| if _uac.UnifiedClient._in_memory_multi_keys: | |
| return | |
| except Exception: | |
| pass | |
| # Try to load from config.json in common locations. | |
| cfg_paths = [] | |
| try: | |
| cfg_env = os.getenv('CONFIG_FILE') | |
| if cfg_env: | |
| cfg_paths.append(cfg_env) | |
| except Exception: | |
| pass | |
| try: | |
| cfg_paths.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")) | |
| except Exception: | |
| pass | |
| try: | |
| cfg_paths.append(os.path.join(os.getcwd(), "config.json")) | |
| except Exception: | |
| pass | |
| # Deduplicate while preserving order | |
| seen = set() | |
| candidates = [] | |
| for p in cfg_paths: | |
| if not p: | |
| continue | |
| p_norm = os.path.abspath(p) | |
| if p_norm in seen: | |
| continue | |
| seen.add(p_norm) | |
| candidates.append(p_norm) | |
| cfg = None | |
| cfg_path = None | |
| for p in candidates: | |
| if os.path.exists(p): | |
| try: | |
| with open(p, 'r', encoding='utf-8') as f: | |
| cfg = json.load(f) | |
| cfg_path = p | |
| break | |
| except Exception: | |
| continue | |
| if not isinstance(cfg, dict): | |
| return | |
| keys = cfg.get('multi_api_keys') or [] | |
| if not keys: | |
| return | |
| force_rotation = bool(cfg.get('force_key_rotation', True)) | |
| rotation_frequency = int(cfg.get('rotation_frequency', 1)) | |
| try: | |
| os.environ.setdefault('FORCE_KEY_ROTATION', '1' if force_rotation else '0') | |
| os.environ.setdefault('ROTATION_FREQUENCY', str(rotation_frequency)) | |
| os.environ.setdefault('USE_MULTI_KEYS', '1') # backward-compat | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client as _uac | |
| _uac.UnifiedClient.set_in_memory_multi_keys( | |
| keys, | |
| force_rotation=force_rotation, | |
| rotation_frequency=rotation_frequency, | |
| ) | |
| if cfg_path: | |
| print(f"[DEBUG] Loaded multi-key config from {os.path.basename(cfg_path)} ({len(keys)} keys)") | |
| else: | |
| print(f"[DEBUG] Loaded multi-key config ({len(keys)} keys)") | |
| except Exception as e: | |
| print(f"[DEBUG] Failed to initialize multi-key config from file: {e}") | |
| def send_with_interrupt(*args, **kwargs): | |
| """Lazy wrapper to avoid circular import""" | |
| from TransateKRtoEN import send_with_interrupt as _send_with_interrupt | |
| return _send_with_interrupt(*args, **kwargs) | |
| def _atomic_write_file(filepath, content, encoding='utf-8'): | |
| """Atomically write to a file to prevent corruption from concurrent writes""" | |
| # Create temp file in same directory to ensure same filesystem | |
| dir_path = os.path.dirname(filepath) | |
| with _file_write_lock: | |
| try: | |
| # Write to temporary file first | |
| with tempfile.NamedTemporaryFile(mode='w', encoding=encoding, | |
| dir=dir_path, delete=False) as tmp_file: | |
| tmp_file.write(content) | |
| tmp_path = tmp_file.name | |
| # Atomic replace (same filesystem); os.replace overwrites an existing target | |
| # on both Windows and POSIX, unlike os.rename on Windows | |
| os.replace(tmp_path, filepath) | |
| return True | |
| except Exception as e: | |
| print(f"⚠️ Atomic write failed: {e}") | |
| # Cleanup temp file if it exists | |
| if 'tmp_path' in locals() and os.path.exists(tmp_path): | |
| try: | |
| os.remove(tmp_path) | |
| except: | |
| pass | |
| # Fallback to direct write with lock | |
| try: | |
| with open(filepath, 'w', encoding=encoding) as f: | |
| f.write(content) | |
| return True | |
| except Exception as e2: | |
| print(f"⚠️ Fallback write also failed: {e2}") | |
| return False | |
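| # Typical usage (as done when saving the final glossary below): | |
| #   _atomic_write_file(os.path.join(output_dir, "glossary.csv"), csv_content) | |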
| def save_glossary(output_dir, chapters, instructions, language="korean", log_callback=None): | |
| """Targeted glossary generator with true CSV format output and parallel processing""" | |
| # If the user stops translation while glossary runs in a subprocess, we must ensure the | |
| # per-process watchdog file doesn't stick around and keep the GUI progress bar "busy". | |
| # We only clear on stop (not on normal completion). | |
| global _watchdog_atexit_registered | |
| if not _watchdog_atexit_registered: | |
| try: | |
| import atexit | |
| def _cleanup_watchdog_on_exit(): | |
| try: | |
| if is_stop_requested(): | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| except Exception: | |
| pass | |
| atexit.register(_cleanup_watchdog_on_exit) | |
| _watchdog_atexit_registered = True | |
| except Exception: | |
| pass | |
| # Note: Don't redirect stdout here if log_callback is provided by subprocess worker | |
| # The worker already captures stdout and sends to queue | |
| # Only redirect if we're NOT in a subprocess (i.e., log_callback is a real GUI callback) | |
| import sys | |
| in_subprocess = hasattr(sys.stdout, 'queue') # Worker's LogCapture has a queue attribute | |
| if log_callback and not in_subprocess: | |
| set_output_redirect(log_callback) | |
| # Clear any stale stop flags before starting a new glossary run | |
| try: | |
| set_stop_flag(False) | |
| except Exception: | |
| try: | |
| os.environ["TRANSLATION_CANCELLED"] = "0" | |
| except Exception: | |
| pass | |
| try: | |
| stop_path = _get_stop_file_path() | |
| if stop_path and os.path.exists(stop_path): | |
| os.remove(stop_path) | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, "UnifiedClient"): | |
| unified_api_client.UnifiedClient._global_cancelled = False | |
| if hasattr(unified_api_client, "global_stop_flag"): | |
| unified_api_client.global_stop_flag = False | |
| except Exception: | |
| pass | |
| print("📱 Targeted Glossary Generator v6.0 (CSV Format + Parallel)") | |
| # CRITICAL: Reload ALL glossary settings from environment variables at the START | |
| # This ensures child processes spawned by ProcessPoolExecutor get the latest values | |
| # Force fresh read of all environment variables (they were set by save_config) | |
| print("🔄 Reloading glossary settings from environment variables...") | |
| # Honor output directory override (same behavior as translation pipeline) | |
| try: | |
| override_dir = os.getenv("OUTPUT_DIRECTORY") | |
| if override_dir: | |
| override_dir = os.path.abspath(override_dir) | |
| leaf = os.path.basename(os.path.abspath(output_dir)) or "output" | |
| # Always place under the override root (handles different drives safely) | |
| output_dir = os.path.join(override_dir, leaf) | |
| except Exception as e: | |
| print(f"⚠️ OUTPUT_DIRECTORY override failed: {e}") | |
| print(f"📁 Glossary output directory: {os.path.abspath(output_dir)}") | |
| # Check stop flag at start | |
| # Ensure output directory exists | |
| try: | |
| os.makedirs(output_dir, exist_ok=True) | |
| except Exception as _e: | |
| print(f"⚠️ Could not ensure output directory exists: {output_dir} ({_e})") | |
| if is_stop_requested(): | |
| print("📁 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # CLEAR incremental history UNCONDITIONALLY at the start of any run | |
| # This prevents stale chunks from polluting the aggregation, regardless of whether chunking is used | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| if os.path.exists(incremental_dir): | |
| print(f"📑 Cleaning incremental glossary folder: {incremental_dir}") | |
| try: | |
| import shutil | |
| # Safely clear the entire incremental folder | |
| for filename in os.listdir(incremental_dir): | |
| file_path = os.path.join(incremental_dir, filename) | |
| try: | |
| if os.path.isfile(file_path) or os.path.islink(file_path): | |
| os.unlink(file_path) | |
| elif os.path.isdir(file_path): | |
| shutil.rmtree(file_path) | |
| except Exception as e: | |
| print(f"⚠️ Failed to delete {file_path}: {e}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to clear incremental history: {e}") | |
| # Ensure directory exists for potential use | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| # Check if glossary already exists; if so, we'll MERGE it later (do not return early) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| existing_glossary_content = None | |
| if os.path.exists(glossary_path): | |
| print(f"📁 Existing glossary detected (will merge): {glossary_path}") | |
| try: | |
| with open(glossary_path, 'r', encoding='utf-8') as f: | |
| existing_glossary_content = f.read() | |
| except Exception as e: | |
| print(f"⚠️ Could not read existing glossary: {e}") | |
| print("📁 Extracting names and terms with configurable options") | |
| global BOOK_TITLE_RAW, BOOK_TITLE_TRANSLATED | |
| # 1. Get raw title from input EPUB (input path) | |
| epub_path = os.getenv("EPUB_PATH", "") | |
| BOOK_TITLE_RAW = _extract_raw_title_from_epub(epub_path) | |
| # 2. Get translated title from output metadata (output path) | |
| BOOK_TITLE_TRANSLATED = _extract_translated_title_from_metadata(output_dir) | |
| # Debug info | |
| if BOOK_TITLE_RAW: | |
| print(f"📚 Raw book title: {BOOK_TITLE_RAW}") | |
| if BOOK_TITLE_TRANSLATED: | |
| print(f"📚 Translated book title: {BOOK_TITLE_TRANSLATED}") | |
| # Check stop flag before processing | |
| if is_stop_requested(): | |
| print("📁 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # Check if automatic glossary generation is enabled | |
| enable_auto_glossary = os.getenv("ENABLE_AUTO_GLOSSARY", "1") == "1" | |
| # Check for manual glossary first (CSV only) | |
| manual_glossary_path = os.getenv("MANUAL_GLOSSARY") | |
| existing_glossary = None | |
| if manual_glossary_path and os.path.exists(manual_glossary_path): | |
| print(f"📁 Manual glossary detected: {os.path.basename(manual_glossary_path)}") | |
| try: | |
| with open(manual_glossary_path, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| # Treat as CSV text and stage it for merge; also copy to output for visibility | |
| target_path = os.path.join(output_dir, "glossary.csv") | |
| with open(target_path, 'w', encoding='utf-8') as f: | |
| f.write(content) | |
| print(f"📁 ✅ Manual CSV glossary copied to: {target_path}") | |
| existing_glossary = content | |
| # Skip automatic generation when manual glossary is loaded | |
| if not enable_auto_glossary: | |
| print(f"ℹ️ Automatic glossary generation disabled, using manual glossary only") | |
| return {} | |
| else: | |
| print(f"ℹ️ Skipping automatic glossary generation (manual glossary already loaded)") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ Could not copy manual glossary: {e}") | |
| print(f"📁 Proceeding with automatic generation...") | |
| # Check if auto-glossary is disabled without a manual glossary | |
| if not enable_auto_glossary: | |
| print(f"ℹ️ Automatic glossary generation is disabled and no manual glossary provided") | |
| return {} | |
| # Check for existing glossary from manual extraction | |
| # Avoid double-nesting when output_dir already ends with "Glossary" | |
| if os.path.basename(os.path.abspath(output_dir)).lower() == "glossary": | |
| glossary_folder_path = output_dir | |
| else: | |
| glossary_folder_path = os.path.join(output_dir, "Glossary") | |
| # existing_glossary may already be set by MANUAL_GLOSSARY above | |
| if os.path.exists(glossary_folder_path): | |
| for file in os.listdir(glossary_folder_path): | |
| if file.endswith("_glossary.json"): | |
| existing_path = os.path.join(glossary_folder_path, file) | |
| try: | |
| with open(existing_path, 'r', encoding='utf-8') as f: | |
| existing_content = f.read() | |
| existing_glossary = existing_content | |
| print(f"📁 Found existing glossary from manual extraction: {file}") | |
| break | |
| except Exception as e: | |
| print(f"⚠️ Could not load existing glossary: {e}") | |
| # Get configuration from environment variables (FRESH READ) | |
| min_frequency = int(os.getenv("GLOSSARY_MIN_FREQUENCY", "2")) | |
| max_names = int(os.getenv("GLOSSARY_MAX_NAMES", "50")) | |
| max_titles = int(os.getenv("GLOSSARY_MAX_TITLES", "30")) | |
| # Batch sizing: | |
| # - GUI uses BATCH_SIZE for concurrency/batching. | |
| # - Keep GLOSSARY_BATCH_SIZE for backward compatibility, but default to GUI's value. | |
| batch_size = int(os.getenv("GLOSSARY_BATCH_SIZE", os.getenv("BATCH_SIZE", "50"))) | |
| strip_honorifics = os.getenv("GLOSSARY_STRIP_HONORIFICS", "1") == "1" | |
| fuzzy_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "0")) | |
| # DEBUG: Show what we're reading from environment | |
| max_sentences_env = os.getenv("GLOSSARY_MAX_SENTENCES", "200") | |
| print(f"🔍 [DEBUG] Reading GLOSSARY_MAX_SENTENCES from environment: '{max_sentences_env}'") | |
| max_sentences = int(max_sentences_env) | |
| print(f"🔍 [DEBUG] Converted to integer: {max_sentences}") | |
| include_all_characters_env = os.getenv("GLOSSARY_INCLUDE_ALL_CHARACTERS", "0") | |
| include_all_characters = include_all_characters_env == "1" | |
| include_gender_context_flag = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| print(f"📑 DEBUG: Include all characters (dynamic limit expansion) = '{include_all_characters_env}'") | |
| print(f"📑 Settings: Min frequency: {min_frequency}, Max names: {max_names}, Max titles: {max_titles}") | |
| print(f"📑 Strip honorifics: {'✅ Yes' if strip_honorifics else '❌ No'}") | |
| print(f"📑 Fuzzy matching threshold: {fuzzy_threshold}") | |
| print(f"📑 Max sentences for filtering: {max_sentences}") | |
| # Get custom prompt from environment | |
| custom_prompt = os.getenv("AUTO_GLOSSARY_PROMPT", "").strip() | |
| # Initialize to the default unified prompt when unset/empty. | |
| # Pattern-based extraction remains disabled elsewhere. | |
| if not custom_prompt: | |
| custom_prompt = DEFAULT_AUTO_GLOSARY_PROMPT3.strip() | |
| os.environ["AUTO_GLOSSARY_PROMPT"] = custom_prompt | |
| print("📑 AUTO_GLOSSARY_PROMPT not set - initialized to default unified prompt") | |
| def clean_html(html_text): | |
| """Remove HTML tags to get clean text""" | |
| soup = BeautifulSoup(html_text, 'html.parser') | |
| return soup.get_text() | |
| # Check stop before processing chapters | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # Get chapter split threshold, toggle, and filter mode | |
| chapter_split_threshold = int(os.getenv("GLOSSARY_CHAPTER_SPLIT_THRESHOLD", "100000")) | |
| chapter_split_enabled = os.getenv("GLOSSARY_ENABLE_CHAPTER_SPLIT", "1") == "1" | |
| filter_mode = os.getenv("GLOSSARY_FILTER_MODE", "all") # all, only_with_honorifics, only_without_honorifics | |
| # Check if parallel extraction is enabled for automatic glossary | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| batch_translation = os.getenv("BATCH_TRANSLATION", "0") == "1" | |
| # Prefer GUI's batch size; fall back to glossary batch size if needed. | |
| api_batch_size = int(os.getenv("BATCH_SIZE", os.getenv("GLOSSARY_BATCH_SIZE", "5"))) | |
| batching_mode = os.getenv("BATCHING_MODE", "direct") | |
| batch_group_size = int(os.getenv("BATCH_GROUP_SIZE", "3")) | |
| # Backward compatibility | |
| if os.getenv("CONSERVATIVE_BATCHING", "0") == "1": | |
| batching_mode = "conservative" | |
| # Log the settings | |
| print(f"📑 Filter mode: {filter_mode}") | |
| if extraction_workers > 1: | |
| print(f"📑 Parallel extraction enabled: {extraction_workers} workers") | |
| if batch_translation: | |
| print(f"📑 Batch API calls enabled: {api_batch_size} chunks per batch") | |
| print(f"📑 Batching mode: {batching_mode}") | |
| if batching_mode == "conservative": | |
| print(f"📑 Conservative group size: {batch_group_size}") | |
| all_text = ' '.join(clean_html(chapter["body"]) for chapter in chapters) | |
| print(f"📑 Processing {len(all_text):,} characters of text") | |
| # Apply smart filtering FIRST to check actual size needed | |
| use_smart_filter = os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1" | |
| effective_text_size = len(all_text) | |
| filtered_text_cache = None | |
| if use_smart_filter and custom_prompt: # Only apply for AI extraction | |
| print(f"📁 Smart filtering enabled - checking effective text size after filtering...") | |
| # Perform filtering ONCE and reuse for chunking | |
| filtered_sample, _ = _filter_text_for_glossary(all_text, min_frequency, max_sentences) | |
| filtered_text_cache = filtered_sample | |
| effective_text_size = len(filtered_sample) | |
| # Calculate token count using tiktoken | |
| try: | |
| import tiktoken | |
| enc = tiktoken.get_encoding("cl100k_base") | |
| token_count = len(enc.encode(filtered_sample)) | |
| print(f"📁 Text reduction: {len(all_text):,} → {effective_text_size:,} chars ({100*(1-effective_text_size/len(all_text)):.1f}% reduction) | {token_count:,} tokens") | |
| except: | |
| print(f"📁 Text reduction: {len(all_text):,} → {effective_text_size:,} chars ({100*(1-effective_text_size/len(all_text)):.1f}% reduction)") | |
| # Safety check: Calculate actual token count for chunking decision | |
| estimated_tokens = None | |
| try: | |
| import tiktoken | |
| enc = tiktoken.get_encoding("cl100k_base") | |
| estimated_tokens = len(enc.encode(filtered_text_cache if filtered_text_cache else all_text)) | |
| except: | |
| # Fallback estimate: 1 token ≈ 3-4 characters for Asian languages | |
| estimated_tokens = effective_text_size // 3 | |
| # Get output token limit (glossary-specific with fallback to global) | |
| max_output_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "65536"))) | |
| # Use compression factor to determine safe input limit (from CJK→English compression ratio) | |
| # Use glossary-specific compression factor with fallback to global | |
| compression_factor = float(os.getenv("GLOSSARY_COMPRESSION_FACTOR", os.getenv("COMPRESSION_FACTOR", "1.0"))) | |
| # Safe input limit is max_output divided by compression factor | |
| # (e.g., if compression is 0.7, output will be 70% of input, so we can use 1/0.7 = 1.43x for safety) | |
| safe_input_limit = int(max_output_tokens / max(compression_factor, 0.1)) if compression_factor > 0 else int(max_output_tokens * 0.8) | |
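| # Worked example: with max_output_tokens=65536 and compression_factor=0.7, | |
| # safe_input_limit = int(65536 / 0.7) = 93622 input tokens. | |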
| if estimated_tokens > safe_input_limit: | |
| # Only show detailed token logs if using token-based chunking (threshold == 0) | |
| if chapter_split_threshold == 0: | |
| print(f"⚠️ Text too large for single API call!") | |
| print(f" Estimated tokens: {estimated_tokens:,}") | |
| print(f" Safe input limit: {safe_input_limit:,} (based on {compression_factor:.2f}x compression factor and {max_output_tokens:,} max output tokens)") | |
| print(f" Will use ChapterSplitter for token-based chunking...") | |
| else: | |
| # Character-based threshold already set, just use it silently | |
| pass | |
| # Check if we need to split into chunks based on EFFECTIVE size after filtering | |
| needs_chunking = chapter_split_enabled and ( | |
| (chapter_split_threshold == 0 and estimated_tokens > safe_input_limit) or | |
| (chapter_split_threshold > 0 and effective_text_size > chapter_split_threshold) | |
| ) | |
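| # e.g. with chapter_split_threshold=100000 chars and effective_text_size=250000, needs_chunking | |
| # is True; with chapter_split_threshold=0 the decision is based purely on estimated_tokens | |
| # versus safe_input_limit. | |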
| if not chapter_split_enabled: | |
| print("📑 Chapter splitting disabled (GLOSSARY_ENABLE_CHAPTER_SPLIT=0) - processing without pre-splitting") | |
| if needs_chunking: | |
| # Prepare chunk processing | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| agg_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| # CLEAR incremental history if it exists to ensure 'all' file only contains current run data | |
| # This prevents it from growing indefinitely across multiple runs | |
| if os.path.exists(incremental_dir): | |
| try: | |
| import shutil | |
| # Safely clear the entire incremental folder | |
| for filename in os.listdir(incremental_dir): | |
| file_path = os.path.join(incremental_dir, filename) | |
| try: | |
| if os.path.isfile(file_path) or os.path.islink(file_path): | |
| os.unlink(file_path) | |
| elif os.path.isdir(file_path): | |
| shutil.rmtree(file_path) | |
| except Exception as e: | |
| print(f"⚠️ Failed to delete {file_path}: {e}") | |
| print(f"📑 Cleared incremental glossary folder: {incremental_dir}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to clear incremental history: {e}") | |
| # Ensure directory exists (if it was fully removed or didn't exist) | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| if chapter_split_threshold == 0: | |
| # Use ChapterSplitter for token-based intelligent chunking | |
| print(f"📑 Text exceeds safe token limit, using ChapterSplitter for token-based chunking...") | |
| from chapter_splitter import ChapterSplitter | |
| # Get the model name for the tokenizer | |
| model = os.getenv("MODEL", "gemini-2.0-flash") | |
| splitter = ChapterSplitter(model_name=model, target_tokens=safe_input_limit) | |
| # Get the text to split (filtered or raw) | |
| text_to_split = filtered_text_cache if (use_smart_filter and custom_prompt and filtered_text_cache) else all_text | |
| # Use ChapterSplitter to intelligently split based on tokens | |
| split_results = splitter.split_chapter(text_to_split, max_tokens=safe_input_limit) | |
| chunks_to_process = [(i, chunk) for i, (chunk, _, _) in enumerate(split_results, 1)] | |
| print(f"📑 ChapterSplitter created {len(chunks_to_process)} token-balanced chunks") | |
| all_glossary_entries = [] | |
| else: | |
| # Use character-based splitting with fixed threshold | |
| print(f"📑 Effective text exceeds {chapter_split_threshold:,} chars, will process in chunks...") | |
| # If using smart filter, we need to split the FILTERED text, not raw text | |
| if use_smart_filter and custom_prompt: | |
| # Split the filtered text into chunks (reuse cached filtered text) | |
| filtered_text = filtered_text_cache if filtered_text_cache is not None else _filter_text_for_glossary(all_text, min_frequency, max_sentences)[0] | |
| chunks_to_process = [] | |
| # Split filtered text into chunks of appropriate size | |
| chunk_size = chapter_split_threshold | |
| for i in range(0, len(filtered_text), chunk_size): | |
| chunk_text = filtered_text[i:i + chunk_size] | |
| chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) | |
| print(f"📑 Split filtered text into {len(chunks_to_process)} chunks") | |
| all_glossary_entries = [] | |
| else: | |
| # Original logic for unfiltered text | |
| all_glossary_entries = [] | |
| chunk_size = 0 | |
| chunk_chapters = [] | |
| chunks_to_process = [] | |
| for idx, chapter in enumerate(chapters): | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped by user") | |
| return all_glossary_entries | |
| chapter_text = clean_html(chapter["body"]) | |
| chunk_size += len(chapter_text) | |
| chunk_chapters.append(chapter) | |
| # Process chunk when it reaches threshold or last chapter | |
| if chunk_size >= chapter_split_threshold or idx == len(chapters) - 1: | |
| chunk_text = ' '.join(clean_html(ch["body"]) for ch in chunk_chapters) | |
| chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) | |
| # Reset for next chunk | |
| chunk_size = 0 | |
| chunk_chapters = [] | |
| print(f"📑 Split into {len(chunks_to_process)} chunks for processing") | |
| # Batch toggle decides concurrency: ON => parallel API calls; OFF => strict sequential | |
| if batch_translation and custom_prompt and len(chunks_to_process) > 1: | |
| print(f"📑 Processing chunks in batch mode with {api_batch_size} chunks per batch...") | |
| # Set fast mode for batch processing | |
| os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "1" | |
| # Use batch API calls for AI extraction | |
| all_csv_lines = _process_chunks_batch_api( | |
| chunks_to_process, custom_prompt, language, | |
| min_frequency, max_names, max_titles, | |
| output_dir, strip_honorifics, fuzzy_threshold, | |
| filter_mode, api_batch_size, extraction_workers, max_sentences | |
| ) | |
| # Reset validation mode | |
| os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "0" | |
| print(f"📑 All chunks completed. Aggregated raw lines: {len(all_csv_lines)}") | |
| # Process all collected entries at once (even if empty) | |
| # Add header so downstream steps can work uniformly | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| if include_description: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name,gender") | |
| else: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name") | |
| # Merge with any on-disk glossary first (to avoid overwriting user edits) | |
| on_disk_path = os.path.join(output_dir, "glossary.csv") | |
| if os.path.exists(on_disk_path): | |
| try: | |
| with open(on_disk_path, 'r', encoding='utf-8') as f: | |
| on_disk_content = f.read() | |
| all_csv_lines = _merge_csv_entries(all_csv_lines, on_disk_content, strip_honorifics, language) | |
| print("📑 Merged with existing on-disk glossary") | |
| except Exception as e: | |
| print(f"⚠️ Failed to merge with existing on-disk glossary: {e}") | |
| # Apply filter mode if needed | |
| if filter_mode == "only_with_honorifics": | |
| filtered = [all_csv_lines[0]] # Keep header | |
| for line in all_csv_lines[1:]: | |
| parts = line.split(',', 2) | |
| if len(parts) >= 3 and parts[0] == "character": | |
| filtered.append(line) | |
| all_csv_lines = filtered | |
| print(f"📑 Filter applied: {len(all_csv_lines)-1} character entries with honorifics kept") | |
| # Ensure book title header is present before dedup/sort when requested | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| all_csv_lines = _ensure_book_title_csv_lines(all_csv_lines) | |
| # Apply fuzzy deduplication (deferred until after all chunks) | |
| try: | |
| print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| all_csv_lines = _deduplicate_glossary_with_fuzzy(all_csv_lines, fuzzy_threshold) | |
| except Exception as e: | |
| print(f"⚠️ Deduplication error: {e} — continuing without dedup") | |
| # Sort by type and name | |
| print(f"📑 Sorting glossary by type and name...") | |
| header = all_csv_lines[0] | |
| entries = all_csv_lines[1:] | |
| if entries: | |
| entries.sort(key=_csv_sort_key) | |
| all_csv_lines = [header] + entries | |
| # Save | |
| # Check format preference | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| # Convert to token-efficient format | |
| all_csv_lines = _convert_to_token_efficient_format(all_csv_lines) | |
| # Final sanitize to prevent stray headers | |
| all_csv_lines = _sanitize_final_glossary_lines(all_csv_lines, use_legacy_format) | |
| # If user requested stop, avoid writing new glossary to disk | |
| if is_stop_requested(): | |
| print("🛑 Stop requested — skipping final glossary write (batch mode)") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # If user stopped and we have no entries, keep existing file to avoid wiping it | |
| if is_stop_requested() and len(all_csv_lines) <= 1: | |
| print("🛑 Stop requested with no new entries — preserving existing glossary.csv") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # Save | |
| csv_content = '\n'.join(all_csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| # Verify file exists; fallback direct write if needed | |
| if not os.path.exists(glossary_path): | |
| try: | |
| with open(glossary_path, 'w', encoding='utf-8') as f: | |
| f.write(csv_content) | |
| print("📑 Fallback write succeeded for glossary.csv") | |
| except Exception as e: | |
| print(f"❌ Failed to write glossary.csv: {e}") | |
| print(f"\n📑 ✅ GLOSSARY SAVED!") | |
| print(f"📑 ✅ AI GLOSSARY SAVED!") | |
| c_count, t_count, total = _count_glossary_entries(all_csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| return _parse_csv_to_dict(csv_content) | |
| else: | |
| # Strict sequential processing (one API call at a time) | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") | |
| _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| # Tell the extractor each chunk is already filtered to avoid re-running smart filter per chunk | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" | |
| try: | |
| for pos, (chunk_idx, chunk_text) in enumerate(chunks_to_process, start=1): | |
| if is_stop_requested(): | |
| break | |
| print(f"📑 Processing chunk {chunk_idx}/{len(chunks_to_process)} ({len(chunk_text):,} chars)...") | |
| if custom_prompt: | |
| chunk_glossary = _extract_with_custom_prompt( | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, # Don't pass existing glossary to chunks | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback, | |
| chunk_pos=pos, | |
| total_chunks=len(chunks_to_process), | |
| ) | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping chunk glossary extraction (pattern fallback disabled)") | |
| chunk_glossary = {} | |
| # Normalize to CSV lines and aggregate | |
| chunk_lines = [] | |
| if isinstance(chunk_glossary, list): | |
| for line in chunk_glossary: | |
| if line and not line.startswith('type,'): | |
| all_glossary_entries.append(line) | |
| chunk_lines.append(line) | |
| else: | |
| for raw_name, translated_name in chunk_glossary.items(): | |
| entry_type = "character" if _has_honorific(raw_name) else "term" | |
| line = f"{entry_type},{raw_name},{translated_name}" | |
| all_glossary_entries.append(line) | |
| chunk_lines.append(line) | |
| # Incremental update (per chunk file inside incremental_glossary folder) | |
| try: | |
| _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode) | |
| print(f"📑 Incremental write: chunk {chunk_idx} (+{len(chunk_lines)} entries)") | |
| except Exception as e2: | |
| print(f"⚠️ Incremental write failed for chunk {chunk_idx}: {e2}") | |
| finally: | |
| if _prev_defer is None: | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| if _prev_filtered is None: | |
| os.environ.pop("_CHUNK_ALREADY_FILTERED", None) | |
| else: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered | |
| if _prev_force_disable is None: | |
| os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) | |
| else: | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable | |
| # Build CSV from aggregated entries | |
| print(f"📑 DEBUG: all_glossary_entries count before merge: {len(all_glossary_entries)}") | |
| # START WITH INCREMENTAL GLOSSARY AS BASE IF IT EXISTS AND IS LARGER | |
| # This ensures that if memory was lost (e.g. during a long sequential run), we rely on the disk backup | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| incremental_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| base_entries = list(all_glossary_entries) | |
| using_incremental_as_base = False | |
| if os.path.exists(incremental_path): | |
| try: | |
| with open(incremental_path, 'r', encoding='utf-8') as f: | |
| inc_content = f.read() | |
| # Simple parse to count lines/entries | |
| inc_lines = [line for line in inc_content.split('\n') if line.strip() and not line.startswith('type,')] | |
| print(f"📑 Found incremental glossary: {len(inc_lines)} entries (Memory: {len(all_glossary_entries)} entries)") | |
| if len(inc_lines) > len(all_glossary_entries): | |
| print("📑 🔄 Incremental glossary is larger than memory - using it as primary source") | |
| # The header is added during csv_lines construction below, so simply replace | |
| # base_entries with the incremental entries here. | |
| base_entries = inc_lines | |
| using_incremental_as_base = True | |
| except Exception as e: | |
| print(f"⚠️ Failed to check incremental glossary: {e}") | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| if include_description: | |
| csv_lines = ["type,raw_name,translated_name,gender,description"] + base_entries | |
| elif include_gender_context: | |
| csv_lines = ["type,raw_name,translated_name,gender"] + base_entries | |
| else: | |
| csv_lines = ["type,raw_name,translated_name"] + base_entries | |
| # If we used incremental as base, we must merge MEMORY into it (to capture the last chunk if it wasn't in incremental yet) | |
| if using_incremental_as_base and all_glossary_entries: | |
| print("📑 Merging memory entries into incremental base...") | |
| # Create a mini-CSV for memory entries | |
| mem_csv = ["type,raw_name,translated_name"] + all_glossary_entries | |
| csv_lines = _merge_csv_entries(csv_lines, '\n'.join(mem_csv), strip_honorifics, language) | |
| # Merge with any provided existing glossary AND on-disk glossary to avoid overwriting | |
| on_disk_path = os.path.join(output_dir, "glossary.csv") | |
| merge_sources = [] | |
| if existing_glossary: | |
| merge_sources.append(existing_glossary) | |
| # We already handled incremental above as the base, so we don't add it to merge_sources here | |
| if os.path.exists(on_disk_path): | |
| try: | |
| with open(on_disk_path, 'r', encoding='utf-8') as f: | |
| merge_sources.append(f.read()) | |
| print("📑 Found existing on-disk glossary to merge") | |
| except Exception as e: | |
| print(f"⚠️ Failed to read on-disk glossary for merging: {e}") | |
| # Also merge the main on-disk glossary if it was present at start | |
| if existing_glossary_content: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary_content, strip_honorifics, language) | |
| for src in merge_sources: | |
| before_merge_count = len(csv_lines) | |
| csv_lines = _merge_csv_entries(csv_lines, src, strip_honorifics, language) | |
| print(f"📑 DEBUG: Merged source. Count: {before_merge_count} -> {len(csv_lines)}") | |
| # Apply filter mode to final results | |
| csv_lines = _filter_csv_by_mode(csv_lines, filter_mode) | |
| # Ensure book title entry before dedup/sort when requested | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| csv_lines = _ensure_book_title_csv_lines(csv_lines) | |
| # Apply fuzzy deduplication (deferred until after all chunks) | |
| print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| original_count = len(csv_lines) - 1 | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| deduped_count = len(csv_lines) - 1 | |
| if original_count > deduped_count: | |
| print(f"📑 Removed {original_count - deduped_count} duplicate entries") | |
| # Sort by type and name | |
| print(f"📑 Sorting glossary by type and name...") | |
| header = csv_lines[0] | |
| entries = csv_lines[1:] | |
| entries.sort(key=_csv_sort_key) | |
| csv_lines = [header] + entries | |
| # Token-efficient format if enabled | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| csv_lines = _convert_to_token_efficient_format(csv_lines) | |
| # Final sanitize to prevent stray headers and section titles at end | |
| csv_lines = _sanitize_final_glossary_lines(csv_lines, use_legacy_format) | |
| # If user requested stop, avoid overwriting files; preserve existing when possible | |
| if is_stop_requested(): | |
| if len(csv_lines) <= 1 and os.path.exists(on_disk_path): | |
| print("🛑 Stop requested with no new entries — preserving existing glossary.csv") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| print("🛑 Stop requested — skipping final glossary write (chunked mode)") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # Copy glossary extension file if configured | |
| add_additional_glossary = os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1' | |
| additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '') | |
| if add_additional_glossary and additional_glossary_path and os.path.exists(additional_glossary_path): | |
| print(f"📜 Processing glossary extension: {os.path.basename(additional_glossary_path)}") | |
| try: | |
| import shutil | |
| file_ext = os.path.splitext(additional_glossary_path)[1].lower() | |
| # Target path in output directory | |
| target_path = os.path.join(output_dir, "glossary_extension.csv") | |
| if file_ext == '.csv': | |
| # Copy CSV directly | |
| shutil.copy2(additional_glossary_path, target_path) | |
| print(f"📜 Copied glossary extension to {os.path.basename(target_path)}") | |
| elif file_ext in ['.txt', '.json', '.pdf']: | |
| # Convert non-CSV formats to CSV | |
| converted_lines = [] | |
| if file_ext == '.txt': | |
| with open(additional_glossary_path, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| # Try to parse as CSV-like format | |
| for line in content.strip().split('\n'): | |
| if line.strip(): | |
| converted_lines.append(line.strip()) | |
| elif file_ext == '.json': | |
| import json | |
| with open(additional_glossary_path, 'r', encoding='utf-8') as f: | |
| data = json.load(f) | |
| # Add CSV header | |
| converted_lines.append("type,raw_name,translated_name") | |
| # Convert JSON to CSV format | |
| if isinstance(data, dict): | |
| for key, value in data.items(): | |
| if isinstance(value, dict): | |
| raw = value.get('raw', key) | |
| translated = value.get('translated', value.get('translation', key)) | |
| entry_type = value.get('type', 'term') | |
| converted_lines.append(f"{entry_type},{raw},{translated}") | |
| else: | |
| converted_lines.append(f"term,{key},{value}") | |
| elif isinstance(data, list): | |
| for entry in data: | |
| if isinstance(entry, dict): | |
| entry_type = entry.get('type', 'term') | |
| raw = entry.get('raw_name', entry.get('raw', '')) | |
| translated = entry.get('translated_name', entry.get('translated', '')) | |
| if raw and translated: | |
| converted_lines.append(f"{entry_type},{raw},{translated}") | |
| elif file_ext == '.pdf': | |
| # Try to extract text from PDF and save as CSV | |
| try: | |
| import PyPDF2 | |
| with open(additional_glossary_path, 'rb') as f: | |
| pdf_reader = PyPDF2.PdfReader(f) | |
| pdf_text = [] | |
| for page in pdf_reader.pages: | |
| pdf_text.append(page.extract_text()) | |
| text_content = '\n'.join(pdf_text) | |
| # Try to parse as CSV | |
| for line in text_content.strip().split('\n'): | |
| if line.strip(): | |
| converted_lines.append(line.strip()) | |
| except ImportError: | |
| print("⚠️ PyPDF2 not available, cannot read PDF. Install with: pip install PyPDF2") | |
| except Exception as pdf_error: | |
| print(f"⚠️ Could not read PDF: {pdf_error}") | |
| # Write converted content to CSV | |
| if converted_lines: | |
| with open(target_path, 'w', encoding='utf-8') as f: | |
| f.write('\n'.join(converted_lines)) | |
| print(f"📜 Converted and saved glossary extension to {os.path.basename(target_path)}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to copy glossary extension: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| try: | |
| # Save | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| # Verify file exists; fallback direct write if needed | |
| if not os.path.exists(glossary_path): | |
| try: | |
| with open(glossary_path, 'w', encoding='utf-8') as f: | |
| f.write(csv_content) | |
| print("📑 Fallback write succeeded for glossary.csv") | |
| except Exception as e: | |
| print(f"❌ Failed to write glossary.csv: {e}") | |
| finally: | |
| print(f"\n📑 ✅ CHUNKED GLOSSARY SAVED!") | |
| print(f"📑 ✅ AI GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| c_count, t_count, total = _count_glossary_entries(csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| return _parse_csv_to_dict(csv_content) | |
| # Original single-text processing | |
| if custom_prompt: | |
| # Pass cached filtered text if available to avoid re-filtering | |
| text_to_process = filtered_text_cache if filtered_text_cache is not None else all_text | |
| already_filtered = filtered_text_cache is not None | |
| # Set environment flag to indicate text is already filtered | |
| if already_filtered: | |
| os.environ["_TEXT_ALREADY_FILTERED"] = "1" | |
| try: | |
| return _extract_with_custom_prompt(custom_prompt, text_to_process, language, | |
| min_frequency, max_names, max_titles, | |
| existing_glossary, output_dir, | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback) | |
| finally: | |
| if already_filtered: | |
| os.environ.pop("_TEXT_ALREADY_FILTERED", None) | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping automatic glossary generation (pattern fallback disabled)") | |
| return {} | |
| total_time = time.time() - total_start_time | |
| print(f"\n📑 ========== GLOSSARY GENERATION COMPLETE ==========") | |
| print(f"📑 Total time: {total_time:.1f}s") | |
| print(f"📑 Performance breakdown:") | |
| print(f"📑 - Extraction: {0:.1f}s") | |
| print(f"📑 - API calls: {0:.1f}s") | |
| print(f"📑 - Frequency checking: {0:.1f}s") | |
| print(f"📑 - Deduplication: {0:.1f}s") | |
| print(f"📑 - File I/O: {0:.1f}s") | |
| print(f"📑 ================================================") | |
| return result | |
| def _convert_to_token_efficient_format(csv_lines): | |
| """Convert CSV lines to token-efficient format with sections and asterisks""" | |
| if len(csv_lines) <= 1: | |
| return csv_lines | |
| header = csv_lines[0] | |
| entries = csv_lines[1:] | |
| # Group by type (only from valid CSV lines) | |
| import re as _re | |
| import csv as _csv | |
| grouped = {} | |
| for line in entries: | |
| if not line.strip(): | |
| continue | |
| # Only accept proper CSV rows: at least 3 fields and a sane type token | |
| parts_full = [p.strip() for p in line.split(',')] | |
| if len(parts_full) < 3: | |
| continue | |
| entry_type = parts_full[0].lower() | |
| if not _re.match(r'^[a-z_]+$', entry_type): | |
| continue | |
| if entry_type not in grouped: | |
| grouped[entry_type] = [] | |
| grouped[entry_type].append(line) | |
| # Rebuild with token-efficient format | |
| result = [] | |
| # Extract column headers from CSV to show in dynamic header | |
| columns = ['translated_name', 'raw_name'] | |
| # Check for gender and description columns | |
| try: | |
| header_parts = [p.strip() for p in next(_csv.reader([header]))] if header else [] | |
| except Exception: | |
| header_parts = [p.strip() for p in header.split(',')] if header else [] | |
| if 'gender' in header_parts: | |
| columns.append('gender') | |
| if 'description' in header_parts: | |
| columns.append('description') | |
| # Add any other custom fields (exclude type, raw_name, translated_name, gender, description) | |
| standard_cols = {'type', 'raw_name', 'translated_name', 'gender', 'description'} | |
| for col in header_parts: | |
| if col.lower() not in standard_cols and col: | |
| columns.append(col) | |
| result.append(f"Glossary Columns: {', '.join(columns)}\n") | |
| # Process in order: character first, then term, then others | |
| type_order = ['book', 'character', 'term'] + [t for t in grouped.keys() if t not in ['book', 'character', 'term']] | |
| # Precompute column indices for richer rendering | |
| lower_header = [h.lower() for h in header_parts] | |
| def _idx(name): | |
| return lower_header.index(name) if name in lower_header else -1 | |
| type_idx = _idx('type') | |
| raw_idx = _idx('raw_name') | |
| trans_idx = _idx('translated_name') | |
| gender_idx = _idx('gender') | |
| desc_idx = _idx('description') | |
| for entry_type in type_order: | |
| if entry_type not in grouped: | |
| continue | |
| entries = grouped[entry_type] | |
| # Add section header | |
| section_name = entry_type.upper() + 'S' if not entry_type.upper().endswith('S') else entry_type.upper() | |
| result.append(f"=== {section_name} ===") | |
| # Add entries in new format | |
| for line in entries: | |
| try: | |
| parts = next(_csv.reader([line])) | |
| except Exception: | |
| parts = [p.strip() for p in line.split(',')] | |
| if header_parts and len(parts) < len(header_parts): | |
| parts += [''] * (len(header_parts) - len(parts)) | |
| elif header_parts and len(parts) > len(header_parts): | |
| # If unquoted commas split the description, merge overflow into the description column | |
| if desc_idx != -1 and desc_idx < len(header_parts): | |
| parts = parts[:desc_idx] + [",".join(parts[desc_idx:])] | |
| else: | |
| parts = parts[:len(header_parts)] | |
| # Extract core fields using header positions when available | |
| entry_type_val = (parts[type_idx] if type_idx != -1 and len(parts) > type_idx else entry_type).lower() | |
| raw_name = parts[raw_idx] if raw_idx != -1 and len(parts) > raw_idx else (parts[1] if len(parts) > 1 else '') | |
| translated_name = parts[trans_idx] if trans_idx != -1 and len(parts) > trans_idx else (parts[2] if len(parts) > 2 else '') | |
| if not raw_name or not translated_name: | |
| continue | |
| entry_line = f"* {translated_name} ({raw_name})" | |
| # Gender support (any type that supplies it) | |
| if gender_idx != -1 and len(parts) > gender_idx: | |
| gender_val = parts[gender_idx].strip() | |
| if gender_val and gender_val != 'Unknown': | |
| entry_line += f" [{gender_val}]" | |
| # Description + extra fields | |
| desc_val = parts[desc_idx].strip() if desc_idx != -1 and len(parts) > desc_idx else '' | |
| # Fallback: if no description column exists in header but there are trailing columns, | |
| # join everything after the last known core column as description. | |
| if desc_idx == -1: | |
| core_max = max(idx for idx in [type_idx, raw_idx, trans_idx, gender_idx] if idx != -1) if any(idx != -1 for idx in [type_idx, raw_idx, trans_idx, gender_idx]) else 2 | |
| if len(parts) > core_max + 1: | |
| desc_tail = ",".join(parts[core_max + 1:]).strip() | |
| if desc_tail and not desc_val: | |
| desc_val = desc_tail | |
| extra_segments = [] | |
| for idx, col in enumerate(header_parts): | |
| col_lower = col.lower() | |
| if col_lower in ['type', 'raw_name', 'translated_name', 'gender', 'description']: | |
| continue | |
| if idx < len(parts): | |
| val = parts[idx].strip() | |
| if val: | |
| extra_segments.append(f"{col}: {val}") | |
| base_desc = desc_val | |
| if not base_desc and extra_segments: | |
| base_desc = extra_segments[0] | |
| extra_segments = extra_segments[1:] | |
| if base_desc: | |
| entry_line += f": {base_desc}" | |
| for seg in extra_segments: | |
| entry_line += f" | {seg}" | |
| result.append(entry_line) | |
| result.append("") # Blank line between sections | |
| return result | |
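| # Illustrative sketch (defined for documentation only, never called): roughly what | |
| # _convert_to_token_efficient_format produces for a couple of CSV rows. The sample | |
| # entries below are hypothetical. | |
| def _example_token_efficient_output(): | |
|     sample_csv = [ | |
|         "type,raw_name,translated_name,gender,description", | |
|         "character,김상혁,Kim Sang-hyuk,male,Swordsman of the Northern Sect", | |
|         "term,빙검술,Frost Blade Technique,,Signature sword art", | |
|     ] | |
|     for line in _convert_to_token_efficient_format(sample_csv): | |
|         # e.g. "* Kim Sang-hyuk (김상혁) [male]: Swordsman of the Northern Sect" | |
|         print(line) | |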
| def _count_glossary_entries(lines, use_legacy_format=False): | |
| """Return (char_count, term_count, total_count) for either format.""" | |
| if not lines: | |
| return 0, 0, 0 | |
| if use_legacy_format: | |
| data = lines[1:] if lines and lines[0].lower().startswith('type,raw_name') else lines | |
| char_count = sum(1 for ln in data if ln.startswith('character,')) | |
| term_count = sum(1 for ln in data if ln.startswith('term,')) | |
| total = sum(1 for ln in data if ln and ',' in ln) | |
| return char_count, term_count, total | |
| # token-efficient | |
| current = None | |
| char_count = term_count = total = 0 | |
| for ln in lines: | |
| s = ln.strip() | |
| if s.startswith('=== ') and 'CHARACTER' in s.upper(): | |
| current = 'character' | |
| continue | |
| if s.startswith('=== ') and 'TERM' in s.upper(): | |
| current = 'term' | |
| continue | |
| if s.startswith('* '): | |
| total += 1 | |
| if current == 'character': | |
| char_count += 1 | |
| elif current == 'term': | |
| term_count += 1 | |
| return char_count, term_count, total | |
| def _sanitize_final_glossary_lines(lines, use_legacy_format=False): | |
| """Remove stray CSV headers and normalize header placement before saving. | |
| - In legacy CSV mode, ensure exactly one header at the very top. | |
| - In token-efficient mode, remove any CSV header lines entirely. | |
| """ | |
| header_norm = "type,raw_name,translated_name" | |
| if not lines: | |
| return lines | |
| if use_legacy_format: | |
| sanitized = [] | |
| for ln in lines: | |
| txt = ln.strip() | |
| # drop every CSV header here; a single canonical header is inserted at the top below | |
| if txt.lower().startswith("type,raw_name"): | |
| continue | |
| sanitized.append(ln) | |
| # ensure exactly one header at the very top | |
| sanitized.insert(0, header_norm) | |
| return sanitized | |
| else: | |
| # remove any CSV header lines anywhere and duplicate top headers/sections | |
| cleaned = [] | |
| glossary_header_seen = False | |
| for i, ln in enumerate(lines): | |
| txt = ln.strip() | |
| low = txt.lower() | |
| # Drop CSV headers | |
| if low.startswith("type,raw_name"): | |
| continue | |
| # Keep only the first main glossary header | |
| if low.startswith("glossary:"): | |
| if glossary_header_seen: | |
| continue | |
| glossary_header_seen = True | |
| cleaned.append(ln) | |
| continue | |
| # Remove bogus section like '=== GLOSSARY: ... ===' | |
| if low.startswith("=== glossary:"): | |
| continue | |
| cleaned.append(ln) | |
| return cleaned | |
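| # Small sketch (never called): how legacy-mode sanitization handles stray CSV | |
| # headers. The input rows are hypothetical. | |
| def _example_sanitize_legacy_headers(): | |
|     messy = [ | |
|         "character,홍길동,Hong Gil-dong", | |
|         "type,raw_name,translated_name", | |
|         "term,마나,Mana", | |
|         "type,raw_name,translated_name", | |
|     ] | |
|     cleaned = _sanitize_final_glossary_lines(messy, use_legacy_format=True) | |
|     # Expected (per the docstring): one canonical header at the very top, strays dropped. | |
|     print("\n".join(cleaned)) | |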
| def _process_chunks_batch_api(chunks_to_process, custom_prompt, language, | |
| min_frequency, max_names, max_titles, | |
| output_dir, strip_honorifics, fuzzy_threshold, | |
| filter_mode, api_batch_size, extraction_workers, max_sentences=200): | |
| """Process chunks using batch API calls for AI extraction with thread delay. | |
| IMPORTANT: when a stop is requested, we must stop *submitting* new API work immediately. | |
| Any already in-flight requests may finish (graceful stop) or be aborted by unified_api_client | |
| cancellation (immediate stop). | |
| """ | |
| print(f"📑 Using batch API mode with {api_batch_size} chunks per batch") | |
| # Graceful stop semantics: | |
| # - If GRACEFUL_STOP=1 and WAIT_FOR_CHUNKS=1: stop submitting *new* work, but do NOT cancel in-flight. | |
| # - If WAIT_FOR_CHUNKS=0: we will only "wait for in-flight" if ALL chunks were already submitted. | |
| # If any chunk is still pending/not-submitted when stop is raised, escalate to full-stop. | |
| graceful_stop = (os.getenv('GRACEFUL_STOP') == '1') | |
| wait_for_chunks = (os.getenv('WAIT_FOR_CHUNKS') == '1') | |
| # Ensure we defer saving and heavy merging when processing chunks | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| # Get thread submission delay | |
| thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) | |
| if thread_delay > 0: | |
| print(f"📑 Thread submission delay: {thread_delay}s between parallel calls") | |
| # CHANGE: Collect raw CSV lines instead of dictionary | |
| all_csv_lines = [] # Collect all entries as CSV lines | |
| total_chunks = len(chunks_to_process) | |
| completed_chunks = 0 | |
| # Ensure per-chunk smart filtering is disabled globally during batch processing | |
| _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") | |
| _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" | |
| # Concurrency: follow GUI batch size (BATCH_SIZE). | |
| # NOTE: EXTRACTION_WORKERS is used for *chapter extraction*/CPU work; it should not cap API concurrency. | |
| # If you want to throttle API concurrency, use BATCH_SIZE (and/or SEND_INTERVAL_SECONDS). | |
| try: | |
| api_batch_size = int(api_batch_size) | |
| except Exception: | |
| api_batch_size = 1 | |
| api_batch_size = max(1, api_batch_size) | |
| max_workers = min(api_batch_size, len(chunks_to_process)) | |
| max_workers = max(1, max_workers) | |
| # Useful debug when users think batching isn't applying | |
| try: | |
| send_interval = os.getenv("SEND_INTERVAL_SECONDS", "") | |
| thread_delay_env = os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "") | |
| print(f"📑 DEBUG: BATCH_SIZE={api_batch_size}, EXTRACTION_WORKERS={extraction_workers}, SEND_INTERVAL_SECONDS={send_interval}, THREAD_SUBMISSION_DELAY_SECONDS={thread_delay_env}") | |
| except Exception: | |
| pass | |
| print(f"📑 Processing {len(chunks_to_process)} chunks with up to {max_workers} concurrent API calls...") | |
| # Submit incrementally so Stop can prevent queued work from ever starting. | |
| from concurrent.futures import wait, FIRST_COMPLETED | |
| pending = list(chunks_to_process) | |
| next_pos = 1 | |
| # Track work in three stages: | |
| # - executor_submitted: submitted to our ThreadPoolExecutor (NOT what the user means by "sent") | |
| # - sent_chunks: requests that actually transitioned to in-flight (i.e., after api stagger/delay) | |
| # - completed_chunks_local: futures that completed (success or failure) | |
| executor_submitted = 0 | |
| completed_chunks_local = 0 | |
| sent_chunks = set() # set[int] of chunk_pos that have actually been sent (in-flight) | |
| def _status_snapshot(*, in_flight_count: int) -> dict: | |
| total = int(total_chunks or 0) | |
| pend = int(len(pending)) | |
| # "all_sent" means every chunk call has actually begun sending (post-delay) at least once. | |
| all_sent = (total > 0 and len(sent_chunks) >= total) | |
| # Keep legacy fields for compatibility/debugging, but note "submitted" here is executor-submitted. | |
| all_submitted = (executor_submitted >= total and pend == 0) | |
| return { | |
| "pid": os.getpid(), | |
| "ts": time.time(), | |
| "total_chunks": total, | |
| "executor_submitted": int(executor_submitted), | |
| "submitted_chunks": int(executor_submitted), | |
| "sent_chunks": int(len(sent_chunks)), | |
| "all_sent": bool(all_sent), | |
| "completed_chunks": int(completed_chunks_local), | |
| "in_flight": int(in_flight_count), | |
| "pending": pend, | |
| "all_submitted": bool(all_submitted), | |
| "graceful_stop": bool(graceful_stop), | |
| "wait_for_chunks": bool(wait_for_chunks), | |
| "stop_requested": bool(is_stop_requested()), | |
| } | |
| # Monitor watchdog entries to detect when requests actually transition to "in_flight" (sent). | |
| # This matches the user's definition of "submitted" (after API delay/stagger). | |
| _sent_monitor_stop = threading.Event() | |
| def _sent_monitor(): | |
| try: | |
| import unified_api_client as _uac | |
| except Exception: | |
| return | |
| # Regex for the context we set in _extract_with_custom_prompt: "auto glossary (i/N)" | |
| rx = re.compile(r"auto\s+glossary\s*\(\s*(\d+)\s*/\s*(\d+)\s*\)", re.IGNORECASE) | |
| while not _sent_monitor_stop.is_set(): | |
| try: | |
| st = _uac.get_api_watchdog_state() if hasattr(_uac, 'get_api_watchdog_state') else {} | |
| entries = st.get('in_flight_entries', []) if isinstance(st, dict) else [] | |
| if not isinstance(entries, list): | |
| entries = [] | |
| for e in entries: | |
| if not isinstance(e, dict): | |
| continue | |
| if e.get('status') != 'in_flight': | |
| continue | |
| ctx = e.get('context') or e.get('label') or '' | |
| m = rx.search(str(ctx)) | |
| if not m: | |
| continue | |
| pos = int(m.group(1)) | |
| tot = int(m.group(2)) | |
| if tot == int(total_chunks or 0) and 1 <= pos <= tot: | |
| if pos not in sent_chunks: | |
| sent_chunks.add(pos) | |
| # Update status file periodically | |
| _write_glossary_status(_status_snapshot(in_flight_count=int(st.get('in_flight', 0) or 0) if isinstance(st, dict) else 0)) | |
| except Exception: | |
| pass | |
| time.sleep(0.1) | |
| try: | |
| t_mon = threading.Thread(target=_sent_monitor, name="GlossarySentMonitor", daemon=True) | |
| t_mon.start() | |
| except Exception: | |
| t_mon = None | |
| # Initialize status file early | |
| _write_glossary_status(_status_snapshot(in_flight_count=0)) | |
| def _submit_one(executor, pos, chunk_idx, chunk_text, *, last_submission_time: float): | |
| if is_stop_requested(): | |
| return None | |
| # Apply thread submission delay | |
| if thread_delay > 0 and last_submission_time > 0: | |
| time_since_last = time.time() - last_submission_time | |
| if time_since_last < thread_delay: | |
| sleep_time = thread_delay - time_since_last | |
| print(f"🧵 Thread delay: {sleep_time:.1f}s for chunk {chunk_idx}") | |
| time.sleep(sleep_time) | |
| fut = executor.submit( | |
| _extract_with_custom_prompt, | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, strip_honorifics, | |
| fuzzy_threshold, filter_mode, max_sentences, | |
| log_callback=None, | |
| chunk_pos=pos, | |
| total_chunks=total_chunks, | |
| ) | |
| return fut | |
| with ThreadPoolExecutor(max_workers=max_workers) as executor: | |
| futures = {} # future -> chunk_idx | |
| last_submission_time = 0.0 | |
| # Prime the worker pool | |
| while pending and len(futures) < max_workers and not is_stop_requested(): | |
| chunk_idx, chunk_text = pending.pop(0) | |
| fut = _submit_one(executor, next_pos, chunk_idx, chunk_text, last_submission_time=last_submission_time) | |
| if fut is False or fut is None: | |
| break | |
| futures[fut] = chunk_idx | |
| executor_submitted += 1 | |
| next_pos += 1 | |
| last_submission_time = time.time() | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| escalated_full_stop = False | |
| def _escalate_to_full_stop(reason: str) -> None: | |
| nonlocal escalated_full_stop | |
| if escalated_full_stop: | |
| return | |
| escalated_full_stop = True | |
| try: | |
| print(f"🛑 Escalating to FULL STOP (glossary batch): {reason}") | |
| except Exception: | |
| pass | |
| # Disable graceful semantics locally so unified_api_client cancels quickly. | |
| try: | |
| os.environ['GRACEFUL_STOP'] = '0' | |
| os.environ['WAIT_FOR_CHUNKS'] = '0' | |
| except Exception: | |
| pass | |
| # Force unified_api_client cancellation if available. | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, 'set_stop_flag'): | |
| unified_api_client.set_stop_flag(True) | |
| if hasattr(unified_api_client, 'global_stop_flag'): | |
| unified_api_client.global_stop_flag = True | |
| if hasattr(unified_api_client, 'UnifiedClient'): | |
| unified_api_client.UnifiedClient._global_cancelled = True | |
| except Exception: | |
| pass | |
| while futures: | |
| # On stop: | |
| # - If not graceful: immediate stop (cancel queued work). | |
| # - If graceful + WAIT_FOR_CHUNKS=1: stop submitting new but keep waiting for in-flight. | |
| # - If graceful + WAIT_FOR_CHUNKS=0: ONLY keep waiting if all chunks were already submitted; | |
| # otherwise escalate to full stop. | |
| if is_stop_requested(): | |
| # IMPORTANT: "all sent" means every chunk call has transitioned to in-flight (post delay/stagger). | |
| all_sent_now = (int(total_chunks or 0) > 0 and len(sent_chunks) >= int(total_chunks or 0)) | |
| if graceful_stop and (not wait_for_chunks) and (not all_sent_now): | |
| _escalate_to_full_stop("stop requested before all chunks were sent to API") | |
| if (not graceful_stop) or escalated_full_stop: | |
| try: | |
| for fut in list(futures.keys()): | |
| fut.cancel() | |
| except Exception: | |
| pass | |
| # Do not keep waiting if we're full-stopping. | |
| break | |
| # Graceful stop: keep waiting only if WAIT_FOR_CHUNKS=1 OR all chunks already sent. | |
| if graceful_stop and (wait_for_chunks or all_sent_now): | |
| # no-op: just continue waiting for done futures | |
| pass | |
| else: | |
| # Graceful stop without waiting semantics -> treat as immediate stop. | |
| try: | |
| for fut in list(futures.keys()): | |
| fut.cancel() | |
| except Exception: | |
| pass | |
| break | |
| done, _ = wait(futures.keys(), return_when=FIRST_COMPLETED) | |
| for fut in done: | |
| chunk_idx = futures.pop(fut, None) | |
| if chunk_idx is None: | |
| continue | |
| # Collect result (even if stop was requested; it may have completed before cancellation) | |
| try: | |
| chunk_glossary = fut.result() | |
| print(f"📑 DEBUG: Chunk {chunk_idx} returned type={type(chunk_glossary)}, len={len(chunk_glossary)}") | |
| # Normalize to CSV lines (without header) | |
| chunk_lines = [] | |
| if isinstance(chunk_glossary, dict): | |
| for raw_name, translated_name in chunk_glossary.items(): | |
| entry_type = "character" if _has_honorific(raw_name) else "term" | |
| chunk_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif isinstance(chunk_glossary, list): | |
| for line in chunk_glossary: | |
| if line and not line.startswith('type,'): | |
| chunk_lines.append(line) | |
| # Aggregate for end-of-run | |
| all_csv_lines.extend(chunk_lines) | |
| # Incremental writes (best-effort) | |
| try: | |
| _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode) | |
| print(f"📑 Incremental write: chunk {chunk_idx} (+{len(chunk_lines)} entries)") | |
| except Exception as e2: | |
| print(f"⚠️ Incremental write failed: {e2}") | |
| completed_chunks += 1 | |
| completed_chunks_local += 1 | |
| progress_percent = (completed_chunks / total_chunks) * 100 if total_chunks else 100 | |
| print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") | |
| print(f"📑 Chunk {chunk_idx} completed and aggregated") | |
| except Exception as e: | |
| print(f"⚠️ API call for chunk {chunk_idx} failed: {e}") | |
| completed_chunks += 1 | |
| completed_chunks_local += 1  # keep the status snapshot's completed count in sync on failures | |
| progress_percent = (completed_chunks / total_chunks) * 100 if total_chunks else 100 | |
| print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") | |
| # Submit next work only if not stopping | |
| while pending and len(futures) < max_workers and not is_stop_requested(): | |
| next_chunk_idx, next_chunk_text = pending.pop(0) | |
| fut2 = _submit_one(executor, next_pos, next_chunk_idx, next_chunk_text, last_submission_time=last_submission_time) | |
| if fut2 is False or fut2 is None: | |
| pending.clear() | |
| break | |
| futures[fut2] = next_chunk_idx | |
| executor_submitted += 1 | |
| next_pos += 1 | |
| last_submission_time = time.time() | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| # Update status after processing completions | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| # CHANGE: Return CSV lines instead of dictionary | |
| # Stop sent-monitor thread | |
| try: | |
| _sent_monitor_stop.set() | |
| except Exception: | |
| pass | |
| # Restore per-chunk filter disabling envs | |
| if _prev_filtered is None: | |
| os.environ.pop("_CHUNK_ALREADY_FILTERED", None) | |
| else: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered | |
| if _prev_force_disable is None: | |
| os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) | |
| else: | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable | |
| # Restore previous defer setting | |
| if _prev_defer is None: | |
| # Default back to not deferring if it wasn't set | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| # If we are exiting due to a stop request, clear watchdog state/file so GUI doesn't stay "busy". | |
| if is_stop_requested(): | |
| try: | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| except Exception: | |
| pass | |
| return all_csv_lines | |
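| # Minimal sketch (illustration only, never called by the pipeline): the same | |
| # prime-then-refill submission pattern used above, reduced to its core so the | |
| # stop handling is easier to follow. The job payloads and _work below are hypothetical. | |
| def _example_bounded_submission(jobs, max_workers=2): | |
|     from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED | |
|     def _work(payload): | |
|         return payload  # stand-in for the real API call | |
|     results = [] | |
|     pending = list(jobs) | |
|     with ThreadPoolExecutor(max_workers=max_workers) as executor: | |
|         futures = set() | |
|         # Prime the pool up to max_workers; the rest stays unsubmitted so a stop | |
|         # request can prevent queued work from ever starting. | |
|         while pending and len(futures) < max_workers and not is_stop_requested(): | |
|             futures.add(executor.submit(_work, pending.pop(0))) | |
|         while futures: | |
|             done, futures = wait(futures, return_when=FIRST_COMPLETED) | |
|             for fut in done: | |
|                 results.append(fut.result()) | |
|             # Refill freed slots after each completion, again honoring the stop flag. | |
|             while pending and len(futures) < max_workers and not is_stop_requested(): | |
|                 futures.add(executor.submit(_work, pending.pop(0))) | |
|     return results | |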
| def _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode): | |
| """Incrementally update glossary output. | |
| Creates per-chunk CSV snapshots in an "incremental_glossary" subfolder: | |
| glossary.incremental1.csv, glossary.incremental2.csv, ... | |
| Also maintains a combined aggregator file (glossary.incremental.all.csv) | |
| that save_glossary() can use as a crash-safe backup. | |
| """ | |
| if not chunk_lines: | |
| return | |
| # Respect stop flag to avoid writing partial files after cancellation | |
| if is_stop_requested(): | |
| return | |
| # Incremental output directory | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| # Per-chunk snapshot path (no merging, just this chunk) | |
| chunk_filename = f"glossary.incremental{chunk_idx}.csv" | |
| chunk_path = os.path.join(incremental_dir, chunk_filename) | |
| # Combined aggregator path (append-only) and visible glossary path (merged) | |
| agg_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| vis_path = os.path.join(output_dir, "glossary.csv") | |
| # Ensure main output dir exists | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Compose CSV lines for this chunk | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| header = "type,raw_name,translated_name" | |
| if include_description: | |
| header += ",gender,description" | |
| elif include_gender_context: | |
| header += ",gender" | |
| new_csv_lines = [header] + chunk_lines | |
| # Save per-chunk snapshot (no merging) | |
| _atomic_write_file(chunk_path, "\n".join(new_csv_lines)) | |
| # Append to aggregator (raw append, no merging/deduping to preserve full history) | |
| # Use lock to prevent concurrent appends - use proper file locking/flushing | |
| with _file_write_lock: | |
| try: | |
| # Force close/reopen to ensure flush | |
| # Read first to check header | |
| file_exists = os.path.exists(agg_path) | |
| with open(agg_path, 'a', encoding='utf-8') as f: | |
| # If new file, write header | |
| if not file_exists: | |
| f.write(header + "\n") | |
| # Append chunks | |
| if chunk_lines: | |
| content_to_write = "\n".join(chunk_lines) + "\n" | |
| f.write(content_to_write) | |
| # Force flush to disk | |
| f.flush() | |
| os.fsync(f.fileno()) | |
| except Exception as e: | |
| print(f"⚠️ Failed to append to incremental aggregator: {e}") | |
| # Update visible glossary.csv (merged and deduped) | |
| # DISABLED: Per user request, we only do this at the very end to save performance | |
| # The incremental_glossary folder maintains the safety backup | |
| # existing_csv = None | |
| # if os.path.exists(agg_path): | |
| # try: | |
| # with open(agg_path, 'r', encoding='utf-8') as f: | |
| # existing_csv = f.read() | |
| # except Exception as e: | |
| # print(f"⚠️ Incremental: cannot read aggregator: {e}") | |
| # Merge (exact merge, no fuzzy to keep this fast) | |
| # Note: _merge_csv_entries handles deduplication | |
| # We pass empty string as 'new' content because existing_csv already contains everything (from append above) | |
| # Actually, _merge_csv_entries merges two CSV strings. existing_csv is the full raw history. | |
| # If we pass it as 'base', it will clean it up. | |
| # merged_csv_lines = _merge_csv_entries([], existing_csv or "", strip_honorifics, language) | |
| # Optional filter mode | |
| # merged_csv_lines = _filter_csv_by_mode(merged_csv_lines, filter_mode) | |
| # Convert to token-efficient format for visible glossary.csv | |
| # token_lines = _convert_to_token_efficient_format(merged_csv_lines) | |
| # token_lines = _sanitize_final_glossary_lines(token_lines, use_legacy_format=False) | |
| # _atomic_write_file(vis_path, "\n".join(token_lines)) | |
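| # Illustrative sketch (never invoked): the on-disk layout the incremental writer | |
| # produces. The output directory and entry are hypothetical. | |
| def _example_incremental_layout(): | |
|     out_dir = tempfile.mkdtemp(prefix="glossary_demo_") | |
|     _incremental_update_glossary( | |
|         out_dir, chunk_idx=1, | |
|         chunk_lines=["character,홍길동,Hong Gil-dong"], | |
|         strip_honorifics=False, language="korean", filter_mode="all", | |
|     ) | |
|     # Expected files under <out_dir>/incremental_glossary/: | |
|     #   glossary.incremental1.csv      (per-chunk snapshot) | |
|     #   glossary.incremental.all.csv   (append-only history) | |
|     print(os.listdir(os.path.join(out_dir, "incremental_glossary"))) | |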
| def _process_single_chunk(chunk_idx, chunk_text, custom_prompt, language, | |
| min_frequency, max_names, max_titles, batch_size, | |
| output_dir, strip_honorifics, fuzzy_threshold, filter_mode, | |
| already_filtered=False, max_sentences=200): | |
| """Process a single chunk - wrapper for parallel execution""" | |
| print(f"📑 Worker processing chunk {chunk_idx} ({len(chunk_text):,} chars)...") | |
| if custom_prompt: | |
| # Pass flag to indicate if text is already filtered | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" if already_filtered else "0" | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| try: | |
| result = _extract_with_custom_prompt( | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback=None | |
| ) | |
| finally: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "0" # Reset | |
| if _prev_defer is None: | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| return result | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping chunk glossary extraction (pattern fallback disabled)") | |
| return {} | |
| def _apply_final_filter(entries, filter_mode): | |
| """Apply final filtering based on mode to ensure only requested types are included""" | |
| if filter_mode == "only_with_honorifics": | |
| # Filter to keep only entries that look like they have honorifics | |
| filtered = {} | |
| for key, value in entries.items(): | |
| # Check if the key contains known honorific patterns | |
| if _has_honorific(key): | |
| filtered[key] = value | |
| print(f"📑 Final filter: Kept {len(filtered)} entries with honorifics (from {len(entries)} total)") | |
| return filtered | |
| elif filter_mode == "only_without_honorifics": | |
| # Filter to keep only entries without honorifics | |
| filtered = {} | |
| for key, value in entries.items(): | |
| if not _has_honorific(key): | |
| filtered[key] = value | |
| print(f"📑 Final filter: Kept {len(filtered)} entries without honorifics (from {len(entries)} total)") | |
| return filtered | |
| else: | |
| return entries | |
| def _looks_like_name(text): | |
| """Check if text looks like a character name""" | |
| if not text: | |
| return False | |
| # Check for various name patterns | |
| # Korean names (2-4 hangul characters) | |
| if all(0xAC00 <= ord(char) <= 0xD7AF for char in text) and 2 <= len(text) <= 4: | |
| return True | |
| # Japanese names (mix of kanji/kana, 2-6 chars) | |
| has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in text) | |
| has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in text) | |
| if (has_kanji or has_kana) and 2 <= len(text) <= 6: | |
| return True | |
| # Chinese names (EXPANDED: 2-6 Chinese characters for cultivation novels) | |
| if all(0x4E00 <= ord(char) <= 0x9FFF for char in text) and 2 <= len(text) <= 6: | |
| # 1. Check if it matches specific Chinese name patterns (Courtesy Name, Generation Name) | |
| if hasattr(PM, 'CHINESE_NAME_PATTERNS'): | |
| # Courtesy names (e.g. "Lu Bozi") | |
| if 'courtesy_names' in PM.CHINESE_NAME_PATTERNS: | |
| for pattern in PM.CHINESE_NAME_PATTERNS['courtesy_names']: | |
| if re.match(pattern, text): | |
| return True | |
| # Generation names (middle character matches generation list) | |
| if len(text) == 3 and 'generation_names' in PM.CHINESE_NAME_PATTERNS: | |
| if text[1] in PM.CHINESE_NAME_PATTERNS['generation_names']: | |
| return True | |
| # Title prefixes (e.g. "Old Li", "Little Wang") | |
| if 'title_prefixes' in PM.CHINESE_NAME_PATTERNS: | |
| if text[0] in PM.CHINESE_NAME_PATTERNS['title_prefixes']: | |
| return True | |
| # 2. Check if it starts with a known surname (1 or 2 chars) | |
| if len(text) >= 2: | |
| # Check single-char surname | |
| if text[0] in PM.CHINESE_SINGLE_SURNAMES: | |
| return True | |
| # Check two-char compound surname | |
| if len(text) >= 3 and text[:2] in PM.CHINESE_COMPOUND_SURNAMES: | |
| return True | |
| # 3. Even without surname match, if it's 2-6 chars it could be a valid term | |
| return True | |
| # English names (starts with capital, mostly letters) | |
| if text[0].isupper() and sum(1 for c in text if c.isalpha()) >= len(text) * 0.8: | |
| return True | |
| return False | |
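| # Tiny illustration (never called): the Unicode-range heuristics above in practice. | |
| # The sample strings are hypothetical. | |
| def _example_looks_like_name(): | |
|     for candidate in ["김철수", "山田太郎", "Li Wei", "the", "12345"]: | |
|         print(candidate, "->", _looks_like_name(candidate)) | |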
| def _has_honorific(term): | |
| """Check if a term contains an honorific using PatternManager's comprehensive list""" | |
| if not term: | |
| return False | |
| term_lower = term.lower() | |
| # Check all language honorifics from PatternManager | |
| for language, honorifics_list in PM.CJK_HONORIFICS.items(): | |
| for honorific in honorifics_list: | |
| # For romanized/English honorifics with spaces or dashes | |
| if honorific.startswith(' ') or honorific.startswith('-'): | |
| if term_lower.endswith(honorific.lower()): | |
| return True | |
| # For CJK honorifics (no separator) | |
| else: | |
| if honorific in term: | |
| return True | |
| return False | |
| def _strip_all_honorifics(term, language='korean'): | |
| """Strip all honorifics from a term using PatternManager's lists""" | |
| if not term: | |
| return term | |
| result = term | |
| # Get honorifics for the specific language and English romanizations | |
| honorifics_to_strip = [] | |
| if language in PM.CJK_HONORIFICS: | |
| honorifics_to_strip.extend(PM.CJK_HONORIFICS[language]) | |
| honorifics_to_strip.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| # Sort by length (longest first) to avoid partial matches | |
| honorifics_to_strip.sort(key=len, reverse=True) | |
| # Strip honorifics | |
| for honorific in honorifics_to_strip: | |
| if honorific.startswith(' ') or honorific.startswith('-'): | |
| # For romanized honorifics with separators | |
| if result.lower().endswith(honorific.lower()): | |
| result = result[:-len(honorific)] | |
| else: | |
| # For CJK honorifics (no separator) | |
| if result.endswith(honorific): | |
| result = result[:-len(honorific)] | |
| return result.strip() | |
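| # Quick illustration (never called): stripping depends entirely on PatternManager's | |
| # CJK_HONORIFICS lists, so actual output varies with that data. Names are hypothetical. | |
| def _example_strip_honorifics(): | |
|     for term in ["김철수님", "Tanaka-san"]: | |
|         print(term, "->", _strip_all_honorifics(term, language="korean")) | |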
| def _convert_to_csv_format(data): | |
| """Convert various glossary formats to CSV string format with enforced 3 columns""" | |
| csv_lines = ["type,raw_name,translated_name"] | |
| if isinstance(data, str): | |
| # Already CSV string | |
| if data.strip().startswith('type,raw_name'): | |
| return data | |
| # Try to parse as JSON | |
| try: | |
| data = json.loads(data) | |
| except: | |
| return data | |
| if isinstance(data, list): | |
| for item in data: | |
| if isinstance(item, dict): | |
| if 'type' in item and 'raw_name' in item: | |
| # Already in correct format | |
| line = f"{item['type']},{item['raw_name']},{item.get('translated_name', item['raw_name'])}" | |
| csv_lines.append(line) | |
| else: | |
| # Old format - default to 'term' type | |
| entry_type = 'term' | |
| raw_name = item.get('original_name', '') | |
| translated_name = item.get('name', raw_name) | |
| if raw_name and translated_name: | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif isinstance(data, dict): | |
| if 'entries' in data: | |
| # Has metadata wrapper, extract entries | |
| for original, translated in data['entries'].items(): | |
| csv_lines.append(f"term,{original},{translated}") | |
| else: | |
| # Plain dictionary - default to 'term' type | |
| for original, translated in data.items(): | |
| csv_lines.append(f"term,{original},{translated}") | |
| return '\n'.join(csv_lines) | |
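| # Illustrative sketch (never called): how legacy dict/list shapes are normalized | |
| # into the 3-column CSV string. Sample data is hypothetical. | |
| def _example_convert_to_csv_format(): | |
|     legacy_dict = {"홍길동": "Hong Gil-dong", "마나": "Mana"} | |
|     legacy_list = [{"original_name": "철수", "name": "Cheol-su"}] | |
|     print(_convert_to_csv_format(legacy_dict))  # one "term" row per key/value pair | |
|     print(_convert_to_csv_format(legacy_list))  # "term,철수,Cheol-su" | |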
| def _parse_csv_to_dict(csv_content): | |
| """Parse CSV content to dictionary for backward compatibility""" | |
| result = {} | |
| lines = csv_content.strip().split('\n') | |
| for line in lines[1:]: # Skip header | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 3: | |
| result[parts[1]] = parts[2] # raw_name -> translated_name | |
| return result | |
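| # Tiny illustration (never called): the backward-compatibility mapping the rest of | |
| # the pipeline expects, raw_name keys to translated_name values. | |
| def _example_parse_csv_to_dict(): | |
|     csv_content = "type,raw_name,translated_name\ncharacter,홍길동,Hong Gil-dong" | |
|     print(_parse_csv_to_dict(csv_content))  # {'홍길동': 'Hong Gil-dong'} | |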
| def _fuzzy_match(term1, term2, threshold=0.90): | |
| """Check if two terms match using fuzzy matching""" | |
| ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio() | |
| return ratio >= threshold | |
| def _fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len): | |
| """Use rapidfuzz library for MUCH faster fuzzy matching""" | |
| from rapidfuzz import fuzz | |
| print(f"📑 Using RapidFuzz (C++ speed)...") | |
| start_time = time.time() | |
| matches_count = 0 | |
| threshold_percent = threshold * 100 # rapidfuzz uses 0-100 scale | |
| # Can use smaller step because rapidfuzz is so fast | |
| step = 1 # Check every position - rapidfuzz can handle it | |
| # Process text | |
| for i in range(0, len(text_lower) - term_len + 1, step): | |
| # Check stop flag every 10000 positions | |
| if i > 0 and i % 10000 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 RapidFuzz stopped at position {i}") | |
| return matches_count | |
| window = text_lower[i:i + term_len] | |
| # rapidfuzz is fast enough we can check every position | |
| if fuzz.ratio(term_lower, window) >= threshold_percent: | |
| matches_count += 1 | |
| elapsed = time.time() - start_time | |
| print(f"📑 RapidFuzz found {matches_count} matches in {elapsed:.2f}s") | |
| return matches_count | |
| def _batch_compute_frequencies(terms, all_text, fuzzy_threshold=0.90, min_frequency=2): | |
| """Compute frequencies for all terms at once - MUCH faster than individual checking""" | |
| print(f"📑 Computing frequencies for {len(terms)} terms in batch mode...") | |
| start_time = time.time() | |
| # Result dictionary | |
| term_frequencies = {} | |
| # First pass: exact matching (very fast) | |
| print(f"📑 Phase 1: Exact matching...") | |
| text_lower = all_text.lower() | |
| for term in terms: | |
| if is_stop_requested(): | |
| return term_frequencies | |
| term_lower = term.lower() | |
| count = text_lower.count(term_lower) | |
| term_frequencies[term] = count | |
| exact_time = time.time() - start_time | |
| high_freq_terms = sum(1 for count in term_frequencies.values() if count >= min_frequency) | |
| print(f"📑 Exact matching complete: {high_freq_terms}/{len(terms)} terms meet threshold ({exact_time:.1f}s)") | |
| # If fuzzy matching is disabled, we're done | |
| if fuzzy_threshold >= 1.0: | |
| return term_frequencies | |
| # Second pass: fuzzy matching ONLY for low-frequency terms | |
| low_freq_terms = [term for term, count in term_frequencies.items() if count < min_frequency] | |
| if low_freq_terms: | |
| print(f"📑 Phase 2: Fuzzy matching for {len(low_freq_terms)} low-frequency terms...") | |
| # Try to use RapidFuzz batch processing | |
| try: | |
| from rapidfuzz import process, fuzz | |
| # For very large texts, sample it for fuzzy matching | |
| if len(text_lower) > 500000: | |
| print(f"📑 Text too large ({len(text_lower):,} chars), sampling for fuzzy matching...") | |
| # Sample every Nth character to reduce size | |
| sample_rate = max(1, len(text_lower) // 100000) | |
| sampled_text = text_lower[::sample_rate] | |
| else: | |
| sampled_text = text_lower | |
| # Create chunks of text for fuzzy matching | |
| chunk_size = 1000 # Process text in chunks | |
| text_chunks = [sampled_text[i:i+chunk_size] for i in range(0, len(sampled_text), chunk_size//2)] # Overlapping chunks | |
| print(f"📑 Processing {len(text_chunks)} text chunks...") | |
| threshold_percent = fuzzy_threshold * 100 | |
| # Process in batches to avoid memory issues | |
| batch_size = 100 # Process 100 terms at a time | |
| for batch_start in range(0, len(low_freq_terms), batch_size): | |
| if is_stop_requested(): | |
| break | |
| batch_end = min(batch_start + batch_size, len(low_freq_terms)) | |
| batch_terms = low_freq_terms[batch_start:batch_end] | |
| for term in batch_terms: | |
| if is_stop_requested(): | |
| break | |
| # Quick fuzzy search in chunks | |
| fuzzy_count = 0 | |
| for chunk in text_chunks[:50]: # Limit to first 50 chunks for speed | |
| if fuzz.partial_ratio(term.lower(), chunk) >= threshold_percent: | |
| fuzzy_count += 1 | |
| if fuzzy_count > 0: | |
| # Scale up based on sampling | |
| if len(text_lower) > 500000: | |
| fuzzy_count *= (len(text_lower) // len(sampled_text)) | |
| term_frequencies[term] += fuzzy_count | |
| if (batch_end % 500 == 0) or (batch_end == len(low_freq_terms)): | |
| elapsed = time.time() - start_time | |
| print(f"📑 Processed {batch_end}/{len(low_freq_terms)} terms ({elapsed:.1f}s)") | |
| except ImportError: | |
| print("📑 RapidFuzz not available, skipping fuzzy matching") | |
| total_time = time.time() - start_time | |
| final_high_freq = sum(1 for count in term_frequencies.values() if count >= min_frequency) | |
| print(f"📑 Batch frequency computation complete: {final_high_freq}/{len(terms)} terms accepted ({total_time:.1f}s)") | |
| return term_frequencies | |
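| # Small illustration (never called): the two-phase frequency pass. Phase 1 counts | |
| # exact case-insensitive hits; fuzzy matching only runs for terms still under | |
| # min_frequency (and is skipped entirely when fuzzy_threshold >= 1.0). | |
| # The sample text and terms are hypothetical. | |
| def _example_batch_frequencies(): | |
|     text = "Kim Sang-hyuk drew his blade. Kim Sang-hyuk struck twice." | |
|     freqs = _batch_compute_frequencies(["Kim Sang-hyuk", "Frost Blade"], text, | |
|                                        fuzzy_threshold=1.0, min_frequency=2) | |
|     print(freqs)  # {'Kim Sang-hyuk': 2, 'Frost Blade': 0} | |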
| def _find_fuzzy_matches(term, text, threshold=0.90): | |
| """Find fuzzy matches of a term in text using efficient method with parallel processing""" | |
| start_time = time.time() | |
| term_lower = term.lower() | |
| text_lower = text.lower() | |
| term_len = len(term) | |
| # Only log for debugging if explicitly enabled | |
| debug_search = os.getenv("GLOSSARY_DEBUG_SEARCH", "0") == "1" | |
| if debug_search and len(text) > 100000: | |
| print(f"📑 Searching for '{term}' in {len(text):,} chars (threshold: {threshold})") | |
| # Strategy 1: Use exact matching first for efficiency | |
| exact_start = time.time() | |
| matches_count = text_lower.count(term_lower) | |
| exact_time = time.time() - exact_start | |
| if matches_count > 0: | |
| if debug_search and len(text) > 100000: | |
| print(f"📑 Found {matches_count} exact matches in {exact_time:.3f}s") | |
| return matches_count | |
| # Strategy 2: Try rapidfuzz if available (much faster) | |
| if matches_count == 0 and threshold < 1.0: | |
| try: | |
| from rapidfuzz import fuzz | |
| return _fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len) | |
| except ImportError: | |
| pass # Fall back to parallel/sequential | |
| # Strategy 3: Fall back to parallel/sequential if rapidfuzz not available | |
| # Check if parallel processing is enabled | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts | |
| return _parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers) | |
| else: | |
| return _sequential_fuzzy_search(term_lower, text_lower, threshold, term_len) | |
| def _parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, num_workers): | |
| """Parallel fuzzy search using ThreadPoolExecutor""" | |
| print(f"📑 Starting parallel fuzzy search with {num_workers} workers...") | |
| text_len = len(text_lower) | |
| matches_count = 0 | |
| # Split text into overlapping chunks for parallel processing | |
| chunk_size = max(text_len // num_workers, term_len * 100) | |
| chunks = [] | |
| for i in range(0, text_len, chunk_size): | |
| # Add overlap to avoid missing matches at boundaries | |
| end = min(i + chunk_size + term_len - 1, text_len) | |
| chunks.append((i, text_lower[i:end])) | |
| print(f"📑 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each") | |
| # Process chunks in parallel | |
| with ThreadPoolExecutor(max_workers=num_workers) as executor: | |
| futures = [] | |
| for chunk_idx, (start_pos, chunk_text) in enumerate(chunks): | |
| if is_stop_requested(): | |
| return matches_count | |
| future = executor.submit( | |
| _fuzzy_search_chunk, | |
| term_lower, chunk_text, threshold, term_len, chunk_idx, len(chunks) | |
| ) | |
| futures.append(future) | |
| # Collect results | |
| for future in as_completed(futures): | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return matches_count | |
| try: | |
| chunk_matches = future.result() | |
| matches_count += chunk_matches | |
| except Exception as e: | |
| print(f"📑 ⚠️ Chunk processing error: {e}") | |
| print(f"📑 Parallel fuzzy search found {matches_count} matches") | |
| return matches_count | |
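| # Minimal sketch (never called): why the chunks above overlap by term_len - 1 | |
| # characters: a candidate window straddling a boundary is still fully contained | |
| # in one of the neighbouring chunks. The values below are hypothetical. | |
| def _example_overlapping_chunks(text="abcdefghij", chunk_size=4, term_len=3): | |
|     chunks = [] | |
|     for i in range(0, len(text), chunk_size): | |
|         end = min(i + chunk_size + term_len - 1, len(text)) | |
|         chunks.append((i, text[i:end])) | |
|     print(chunks)  # [(0, 'abcdef'), (4, 'efghij'), (8, 'ij')] | |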
| def _fuzzy_search_chunk(term_lower, chunk_text, threshold, term_len, chunk_idx, total_chunks): | |
| """Process a single chunk for fuzzy matches""" | |
| chunk_matches = 0 | |
| # Use a more efficient step size - no need to check every position | |
| step = max(1, term_len // 3) # Check every third of term length | |
| for i in range(0, len(chunk_text) - term_len + 1, step): | |
| # Check stop flag periodically | |
| if i > 0 and i % 1000 == 0: | |
| if is_stop_requested(): | |
| return chunk_matches | |
| window = chunk_text[i:i + term_len] | |
| # Use SequenceMatcher for fuzzy matching | |
| if SequenceMatcher(None, term_lower, window).ratio() >= threshold: | |
| chunk_matches += 1 | |
| # Log progress for this chunk | |
| if total_chunks > 1: | |
| print(f"📑 Chunk {chunk_idx + 1}/{total_chunks} completed: {chunk_matches} matches") | |
| return chunk_matches | |
| def _sequential_fuzzy_search(term_lower, text_lower, threshold, term_len): | |
| """Sequential fuzzy search (fallback for small texts or single worker)""" | |
| print(f"📑 Starting sequential fuzzy search...") | |
| fuzzy_start = time.time() | |
| matches_count = 0 | |
| # More efficient step size | |
| step = max(1, term_len // 3) | |
| total_windows = (len(text_lower) - term_len + 1) // step | |
| print(f"📑 Checking ~{total_windows:,} windows with step size {step}") | |
| windows_checked = 0 | |
| for i in range(0, len(text_lower) - term_len + 1, step): | |
| # Check stop flag frequently | |
| if i > 0 and i % (step * 100) == 0: | |
| if is_stop_requested(): | |
| return matches_count | |
| # Progress log for very long operations | |
| if windows_checked % 1000 == 0 and windows_checked > 0: | |
| elapsed = time.time() - fuzzy_start | |
| rate = windows_checked / elapsed if elapsed > 0 else 0 | |
| eta = (total_windows - windows_checked) / rate if rate > 0 else 0 | |
| print(f"📑 Progress: {windows_checked}/{total_windows} windows, {rate:.0f} w/s, ETA: {eta:.1f}s") | |
| window = text_lower[i:i + term_len] | |
| if SequenceMatcher(None, term_lower, window).ratio() >= threshold: | |
| matches_count += 1 | |
| windows_checked += 1 | |
| fuzzy_time = time.time() - fuzzy_start | |
| print(f"📑 Sequential fuzzy search completed in {fuzzy_time:.2f}s, found {matches_count} matches") | |
| return matches_count | |
| def _strip_honorific(term, language_hint='unknown'): | |
| """Strip honorific from a term if present""" | |
| if not term: | |
| return term | |
| # Get honorifics for the detected language | |
| honorifics_to_check = [] | |
| if language_hint in PM.CJK_HONORIFICS: | |
| honorifics_to_check.extend(PM.CJK_HONORIFICS[language_hint]) | |
| honorifics_to_check.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| # Check and remove honorifics | |
| for honorific in honorifics_to_check: | |
| if honorific.startswith('-') or honorific.startswith(' '): | |
| # English-style suffix | |
| if term.endswith(honorific): | |
| return term[:-len(honorific)].strip() | |
| else: | |
| # CJK-style suffix (no separator) | |
| if term.endswith(honorific): | |
| return term[:-len(honorific)] | |
| return term | |
| def _filter_text_for_glossary(text, min_frequency=2, max_sentences=None): | |
| """Filter text to extract only meaningful content for glossary extraction | |
| Args: | |
| text: Input text to filter | |
| min_frequency: Minimum frequency threshold for terms | |
| max_sentences: Maximum number of sentences to return (reads from env if None) | |
| """ | |
| import re | |
| from collections import Counter | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import time | |
| filter_start_time = time.time() | |
| print(f"📑 Starting smart text filtering...") | |
| print(f"📑 Input text size: {len(text):,} characters") | |
| # Dynamic character coverage flag (must be defined before any early checks) | |
| include_all_characters_env = os.getenv("GLOSSARY_INCLUDE_ALL_CHARACTERS", "0") | |
| include_all_characters = include_all_characters_env == "1" | |
| force_skip_smart_selection = False | |
| honorific_first_indices = {} | |
| # Clean HTML if present | |
| print(f"📑 Step 1/7: Cleaning HTML tags...") | |
| from bs4 import BeautifulSoup | |
| soup = BeautifulSoup(text, 'html.parser') | |
| clean_text = soup.get_text() | |
| print(f"📑 Clean text size: {len(clean_text):,} characters") | |
| # Detect primary language for better filtering | |
| print(f"📑 Step 2/7: Detecting primary language...") | |
| def detect_primary_language(text_sample): | |
| sample = text_sample[:1000] | |
| korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) | |
| japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) | |
| chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) | |
| # Check gender pronouns as secondary indicator if character counts are ambiguous | |
| if korean_chars == 0 and japanese_kana == 0 and chinese_chars > 0: | |
| # Distinguish Chinese vs Kanji-heavy Japanese using pronouns | |
| if hasattr(PM, 'GENDER_PRONOUNS'): | |
| # Check Chinese pronouns | |
| chinese_pronouns = PM.GENDER_PRONOUNS.get('chinese', {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get('chinese', {}).get('female', []) | |
| for p in chinese_pronouns: | |
| if p in sample: | |
| return 'chinese' | |
| # Check Japanese pronouns | |
| japanese_pronouns = PM.GENDER_PRONOUNS.get('japanese', {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get('japanese', {}).get('female', []) | |
| for p in japanese_pronouns: | |
| if p in sample: | |
| return 'japanese' | |
| if korean_chars > 50: | |
| return 'korean' | |
| elif japanese_kana > 20: | |
| return 'japanese' | |
| elif chinese_chars > 50 and japanese_kana < 10: | |
| return 'chinese' | |
| else: | |
| return 'english' | |
| primary_lang = detect_primary_language(clean_text) | |
| print(f"📑 Detected primary language: {primary_lang}") | |
| # Safety guard: ensure flag exists even if subprocess reload missed earlier assignment | |
| try: | |
| include_gender_context_flag | |
| except NameError: | |
| include_gender_context_flag = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| # Gender pronouns for optional gender-context filtering in early captures | |
| gender_pronouns = [] | |
| if include_gender_context_flag and hasattr(PM, "GENDER_PRONOUNS"): | |
| lang_key = "english" | |
| if primary_lang == "korean": | |
| lang_key = "korean" | |
| elif primary_lang == "chinese": | |
| lang_key = "chinese" | |
| elif primary_lang == "japanese": | |
| lang_key = "japanese" | |
| gp = PM.GENDER_PRONOUNS.get(lang_key, {}) | |
| gender_pronouns = gp.get("male", []) + gp.get("female", []) | |
| # Split into sentences for better context | |
| print(f"📁 Step 3/7: Splitting text into sentences...") | |
| # Use language-specific sentence splitting for better accuracy | |
| if primary_lang == 'chinese': | |
| # Split on major punctuation, but keep 、 and , within sentences | |
| # This preserves more context for Chinese cultivation/wuxia terms | |
| sentences = re.split(r'[。!?;:]+', clean_text) | |
| else: | |
| sentences = re.split(r'[.!?。!?]+', clean_text) | |
| print(f"📁 Found {len(sentences):,} sentences") | |
| # Extract potential terms (words/phrases that appear multiple times) | |
| print(f"📑 Step 4/7: Setting up extraction patterns and exclusion rules...") | |
| word_freq = Counter() | |
| # Pattern for detecting potential names/terms based on capitalization or special characters | |
| # Korean names: 2-4 hangul characters WITHOUT honorifics | |
| korean_pattern = r'[가-힣]{2,4}' | |
| # Japanese names: kanji/hiragana/katakana combinations | |
| japanese_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,6}' | |
| # Chinese names: EXPANDED to 2-8 characters for cultivation/wuxia novels | |
| # This captures longer compound names, titles, and cultivation terms | |
| chinese_pattern = r'[\u4e00-\u9fff]{2,8}' | |
| # English proper nouns: Capitalized words | |
| english_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b' | |
| # Combine patterns | |
| combined_pattern = f'({korean_pattern}|{japanese_pattern}|{chinese_pattern}|{english_pattern})' | |
| print(f"📑 Using combined regex pattern for {primary_lang} text") | |
| # Get honorifics and title patterns for the detected language | |
| honorifics_to_exclude = set() | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| honorifics_to_exclude.update(PM.CJK_HONORIFICS[primary_lang]) | |
| # Also add English romanizations | |
| honorifics_to_exclude.update(PM.CJK_HONORIFICS.get('english', [])) | |
| # Compile title patterns for the language | |
| title_patterns = [] | |
| if primary_lang in PM.TITLE_PATTERNS: | |
| for pattern in PM.TITLE_PATTERNS[primary_lang]: | |
| title_patterns.append(re.compile(pattern)) | |
| # Function to check if a term should be excluded | |
| def should_exclude_term(term): | |
| term_lower = term.lower() | |
| # Check if it's a common word | |
| if term in PM.COMMON_WORDS or term_lower in PM.COMMON_WORDS: | |
| return True | |
| # Check if it contains honorifics | |
| for honorific in honorifics_to_exclude: | |
| if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): | |
| return True | |
| # Check if it matches title patterns | |
| for pattern in title_patterns: | |
| if pattern.search(term): | |
| return True | |
| # Check if it's a number (including Chinese numbers) | |
| if term in PM.CHINESE_NUMS: | |
| return True | |
| # Check if it's just digits | |
| if term.isdigit(): | |
| return True | |
| # For Chinese text, INCLUDE domain-specific terms (don't exclude them) | |
| if primary_lang == 'chinese' and len(term) >= 2: | |
| # Check if it's a cultivation term - these should NOT be excluded | |
| for category in PM.CHINESE_CULTIVATION_TERMS.values(): | |
| if term in category: | |
| return False # Keep cultivation terms! | |
| # Check if it's a wuxia term - these should NOT be excluded | |
| for category in PM.CHINESE_WUXIA_TERMS.values(): | |
| if term in category: | |
| return False # Keep wuxia terms! | |
| # Check relationship terms (important character relationships) | |
| for category in PM.CHINESE_RELATIONSHIP_TERMS.values(): | |
| if term in category: | |
| return False # Keep relationship terms! | |
| # Check mythological terms (creatures, artifacts, legendary beings) | |
| for category in PM.CHINESE_MYTHOLOGICAL_TERMS.values(): | |
| if term in category: | |
| return False # Keep mythological terms! | |
| # Check elemental/natural force terms | |
| for category in PM.CHINESE_ELEMENTAL_TERMS.values(): | |
| if term in category: | |
| return False # Keep elemental terms! | |
| # Check physique/spiritual root terms | |
| for category in PM.CHINESE_PHYSIQUE_TERMS.values(): | |
| if term in category: | |
| return False # Keep physique terms! | |
| # Check treasure grades | |
| for category in PM.CHINESE_TREASURE_GRADES.values(): | |
| if term in category: | |
| return False # Keep treasure grade terms! | |
| # Check power system terms (levels, stars, etc.) | |
| for category in PM.CHINESE_POWER_SYSTEMS.values(): | |
| if term in category: | |
| return False # Keep power system terms! | |
| # Check location types | |
| for category in PM.CHINESE_LOCATION_TYPES.values(): | |
| if term in category: | |
| return False # Keep location terms! | |
| # Check battle terms | |
| for category in PM.CHINESE_BATTLE_TERMS.values(): | |
| if term in category: | |
| return False # Keep battle terms! | |
| # Check novel terms (common raw Chinese terms) | |
| if hasattr(PM, 'CHINESE_NOVEL_TERMS'): | |
| for category in PM.CHINESE_NOVEL_TERMS.values(): | |
| if term in category: | |
| return False | |
| return False | |
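| # Quick sanity sketch (illustrative only; real outcomes depend on the PatternManager data sets): | |
| #   should_exclude_term('123')      -> True  (pure digits are always excluded) | |
| #   should_exclude_term('김상현님')  -> True  (contains the Korean honorific '님', if present in CJK_HONORIFICS) | |
| #   should_exclude_term('灵气')      -> False (kept, e.g. if listed under CHINESE_CULTIVATION_TERMS) | |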
| # Extract potential terms from each sentence | |
| print(f"📑 Step 5/7: Extracting and filtering terms from sentences...") | |
| # Check if we should use parallel processing | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| # Auto-detect optimal workers if not set | |
| if extraction_workers == 1 and len(sentences) > 1000: | |
| # Use more cores for better parallelization | |
| cpu_count = os.cpu_count() or 4 | |
| extraction_workers = min(cpu_count, 12) # Use up to 12 cores | |
| print(f"📑 Auto-detected {cpu_count} CPU cores, using {extraction_workers} workers") | |
| use_parallel = extraction_workers > 1 and len(sentences) > 100 | |
| if use_parallel: | |
| print(f"📑 Using parallel processing with {extraction_workers} workers") | |
| print(f"📑 Estimated speedup: {extraction_workers}x faster") | |
| important_sentences = [] | |
| seen_contexts = set() | |
| processed_count = 0 | |
| total_sentences = len(sentences) | |
| last_progress_time = time.time() | |
| # Prepare gender context check | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| gender_nuance_enabled = include_gender_context and os.getenv("GLOSSARY_ENABLE_GENDER_NUANCE", "1") == "1" | |
| gender_pronouns = [] | |
| if gender_nuance_enabled and hasattr(PM, 'GENDER_PRONOUNS'): | |
| # Get pronouns for the detected language | |
| lang_key = 'english' | |
| if primary_lang == 'korean': lang_key = 'korean' | |
| elif primary_lang == 'chinese': lang_key = 'chinese' | |
| elif primary_lang == 'japanese': lang_key = 'japanese' | |
| gender_pronouns.extend(PM.GENDER_PRONOUNS.get(lang_key, {}).get('male', [])) | |
| gender_pronouns.extend(PM.GENDER_PRONOUNS.get(lang_key, {}).get('female', [])) | |
| if gender_pronouns: | |
| print(f"📑 Gender context enabled: scanning for pronouns in {lang_key}") | |
| def process_sentence_batch(batch_sentences, batch_idx): | |
| """Process a batch of sentences""" | |
| local_word_freq = Counter() | |
| local_important = [] | |
| local_seen = set() | |
| for sentence in batch_sentences: | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Check for gender pronouns if enabled - include sentence if pronoun found | |
| has_pronoun = False | |
| if gender_nuance_enabled and gender_pronouns: | |
| for pronoun in gender_pronouns: | |
| if pronoun in sentence: | |
| has_pronoun = True | |
| break | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| valid_term_found = False | |
| if matches: | |
| # Filter out excluded terms | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| local_word_freq[match] += 1 | |
| valid_term_found = True | |
| # Keep sentence if it has valid terms OR contains a gender pronoun (for context) | |
| # When gender nuance is enabled, pronoun-bearing sentences are kept even if they add no new terms: | |
| # we cannot yet tell which character a pronoun refers to, so the sentence is retained as context | |
| # that lets the LLM infer gender later. | |
| if valid_term_found or (gender_nuance_enabled and has_pronoun): | |
| sentence_key = sentence[:50] # Use prefix as key to avoid duplicates | |
| if sentence_key not in local_seen: | |
| local_important.append(sentence) | |
| local_seen.add(sentence_key) | |
| return local_word_freq, local_important, local_seen, batch_idx | |
| if use_parallel: | |
| # Force SMALL batches for real parallelization | |
| # We want MANY small batches, not few large ones! | |
| # Calculate based on total sentences | |
| total_sentences = len(sentences) | |
| # CRITICAL: Batch size must balance two factors: | |
| # 1. Small batches = more parallelism but higher overhead | |
| # 2. Large batches = less overhead but limits parallelism | |
| # | |
| # For Windows ProcessPoolExecutor, overhead is HIGH, so we prefer LARGE batches | |
| # Target: Each worker should get 3-10 batches (not 100+ tiny batches) | |
| # Calculate batch size based on workers to minimize overhead | |
| target_batches_per_worker = 5 # Sweet spot: enough work distribution, minimal overhead | |
| ideal_batch_size = max(500, total_sentences // (extraction_workers * target_batches_per_worker)) | |
| # Apply sensible limits | |
| if total_sentences < 1000: | |
| optimal_batch_size = 100 # Small dataset: normal batching | |
| elif total_sentences < 10000: | |
| optimal_batch_size = min(500, ideal_batch_size) | |
| elif total_sentences < 50000: | |
| optimal_batch_size = min(2000, ideal_batch_size) | |
| elif total_sentences < 200000: | |
| optimal_batch_size = min(5000, ideal_batch_size) | |
| else: | |
| # For 754K sentences with 12 workers: | |
| # target_batches = 12 * 5 = 60 batches | |
| # batch_size = 754K / 60 = ~12,500 sentences/batch | |
| # This is MUCH better than 1887 batches of 400! | |
| optimal_batch_size = min(20000, ideal_batch_size) | |
| # Ensure we have enough batches for all workers | |
| min_batches = extraction_workers * 3 # At least 3 batches per worker | |
| max_batch_size = max(50, total_sentences // min_batches) | |
| optimal_batch_size = min(optimal_batch_size, max_batch_size) | |
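| # Worked example with assumed numbers: 100,000 sentences and 12 workers gives | |
| #   ideal_batch_size = max(500, 100000 // (12 * 5)) = 1666 | |
| #   optimal_batch_size = min(5000, 1666) = 1666   (100K falls in the 50K-200K tier) | |
| #   max_batch_size = max(50, 100000 // 36) = 2777 -> final size stays 1666 (~60 batches) | |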
| print(f"📑 Total sentences: {total_sentences:,}") | |
| print(f"📑 Target batch size: {optimal_batch_size} sentences") | |
| # Calculate expected number of batches | |
| expected_batches = (total_sentences + optimal_batch_size - 1) // optimal_batch_size | |
| print(f"📑 Expected batches: {expected_batches} (for {extraction_workers} workers)") | |
| print(f"📑 Batches per worker: ~{expected_batches // extraction_workers} batches") | |
| batches = [sentences[i:i + optimal_batch_size] for i in range(0, len(sentences), optimal_batch_size)] | |
| print(f"📑 Processing {len(batches)} batches of ~{optimal_batch_size} sentences each") | |
| print(f"📑 Expected speedup: {min(extraction_workers, len(batches))}x (using {extraction_workers} workers)") | |
| # Decide between ThreadPoolExecutor and ProcessPoolExecutor | |
| import multiprocessing | |
| in_subprocess = multiprocessing.current_process().name != 'MainProcess' | |
| # Use ProcessPoolExecutor for better parallelism on larger datasets | |
| # On Windows, we CAN use ProcessPoolExecutor in subprocess with spawn context | |
| use_process_pool = len(sentences) > 5000 # decided purely by dataset size; subprocess/daemon status is handled below | |
| if use_process_pool: | |
| # Check if we're in a daemonic process (can't spawn children) | |
| is_daemon = multiprocessing.current_process().daemon if hasattr(multiprocessing.current_process(), 'daemon') else False | |
| if in_subprocess and is_daemon: | |
| # Daemonic processes can't spawn children - fall back to ThreadPoolExecutor | |
| print(f"⚠️ Running in daemonic subprocess - cannot use ProcessPoolExecutor") | |
| print(f"📁 Falling back to ThreadPoolExecutor (limited parallelism due to GIL)") | |
| use_process_pool = False | |
| executor_class = ThreadPoolExecutor | |
| executor_kwargs = {'max_workers': extraction_workers} | |
| use_mp_pool = False | |
| else: | |
| # We can use ProcessPoolExecutor | |
| if in_subprocess: | |
| print(f"📁 Using ProcessPoolExecutor in non-daemonic subprocess") | |
| print(f"📁 This enables TRUE parallelism even from within a subprocess!") | |
| else: | |
| print(f"📁 Using ProcessPoolExecutor for maximum performance (true parallelism)") | |
| mp_context = multiprocessing.get_context('spawn') | |
| executor_class = mp_context.Pool | |
| # Capture CURRENT environment variable values from parent process | |
| current_env_vars = { | |
| 'GLOSSARY_MAX_SENTENCES': os.getenv('GLOSSARY_MAX_SENTENCES', '200'), | |
| 'GLOSSARY_MIN_FREQUENCY': os.getenv('GLOSSARY_MIN_FREQUENCY', '2'), | |
| 'GLOSSARY_MAX_NAMES': os.getenv('GLOSSARY_MAX_NAMES', '50'), | |
| 'GLOSSARY_MAX_TITLES': os.getenv('GLOSSARY_MAX_TITLES', '30'), | |
| 'GLOSSARY_BATCH_SIZE': os.getenv('GLOSSARY_BATCH_SIZE', '50'), | |
| 'GLOSSARY_STRIP_HONORIFICS': os.getenv('GLOSSARY_STRIP_HONORIFICS', '1'), | |
| 'GLOSSARY_FUZZY_THRESHOLD': os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.90'), | |
| } | |
| print(f"📁 Passing env vars to child processes: GLOSSARY_MAX_SENTENCES={current_env_vars['GLOSSARY_MAX_SENTENCES']}") | |
| # For multiprocessing.Pool, we use different kwargs | |
| # Use module-level init function (can't use local function due to pickling) | |
| executor_kwargs = { | |
| 'processes': extraction_workers, | |
| 'initializer': _init_worker_with_env, | |
| 'initargs': (current_env_vars,) | |
| } | |
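| # The initializer is expected to copy these values into each spawned child's environment. | |
| # A minimal sketch of that idea (assumption - the real _init_worker_with_env is defined at module level): | |
| #   def _init_worker_with_env(env_vars): | |
| #       for key, value in env_vars.items(): | |
| #           os.environ[key] = value | |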
| use_mp_pool = True # Flag to use different API | |
| else: | |
| print(f"📁 Using ThreadPoolExecutor for sentence processing (dataset < 5000 sentences)") | |
| executor_class = ThreadPoolExecutor | |
| executor_kwargs = {'max_workers': extraction_workers} | |
| use_mp_pool = False | |
| # Handle multiprocessing.Pool vs concurrent.futures differently | |
| if use_process_pool and use_mp_pool: | |
| # Use multiprocessing.Pool API (map_async) | |
| with executor_class(**executor_kwargs) as pool: | |
| # Prepare data for process pool | |
| exclude_check_data = ( | |
| list(honorifics_to_exclude), | |
| [p.pattern for p in title_patterns], | |
| PM.COMMON_WORDS, | |
| PM.CHINESE_NUMS | |
| ) | |
| # Prepare all arguments | |
| all_args = [(batch, idx, combined_pattern, exclude_check_data) | |
| for idx, batch in enumerate(batches)] | |
| print(f"📁 Submitting {len(all_args)} batches to process pool...") | |
| # Use map_async with chunksize for better distribution | |
| # chunksize=1 means each worker gets one batch at a time | |
| result_async = pool.map_async(_process_sentence_batch_for_extraction, all_args, chunksize=1) | |
| # Poll for completion with progress estimates | |
| completed_batches = 0 | |
| batch_start_time = time.time() | |
| next_report_ts = batch_start_time + 5.0 | |
| print(f"📁 Processing batches with {extraction_workers} parallel workers...") | |
| while not result_async.ready(): | |
| time.sleep(2) # Check every 2 seconds | |
| now = time.time() | |
| elapsed = now - batch_start_time | |
| # Emit logs on a fixed 5s cadence (5, 10, 15...) even if our poll loop wakes late. | |
| while now >= next_report_ts: | |
| elapsed_for_log = int(next_report_ts - batch_start_time) | |
| # Estimate progress based on time and worker count | |
| batches_per_second = extraction_workers / 0.3 # rough heuristic | |
| estimated_completed = min(int(elapsed * batches_per_second), len(all_args)) | |
| estimated_progress = min(95, (estimated_completed / len(all_args)) * 100) | |
| estimated_sentences = min(estimated_completed * optimal_batch_size, total_sentences) | |
| if estimated_progress < 95: | |
| print(f"📁 Processing... ~{estimated_progress:.0f}% estimated (~{estimated_sentences:,} sentences) | {elapsed_for_log}s elapsed") | |
| else: | |
| print(f"📁 Processing... finalizing last batches | {elapsed_for_log}s elapsed") | |
| next_report_ts += 5.0 | |
| # Get all results | |
| total_elapsed = time.time() - batch_start_time | |
| print(f"📁 All batches completed in {total_elapsed:.1f}s! Collecting results...") | |
| all_results = result_async.get() | |
| # Process all results | |
| for local_word_freq, local_important, local_seen, batch_idx in all_results: | |
| # Merge results | |
| word_freq.update(local_word_freq) | |
| for sentence in local_important: | |
| sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| processed_count += len(batches[batch_idx]) | |
| completed_batches += 1 | |
| # Show progress | |
| progress_interval = 1 if len(batches) <= 20 else (5 if len(batches) <= 100 else 10) | |
| if completed_batches % progress_interval == 0 or completed_batches == len(batches): | |
| progress = (processed_count / total_sentences) * 100 | |
| elapsed = time.time() - batch_start_time | |
| rate = (processed_count / elapsed) if elapsed > 0 else 0 | |
| print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") | |
| else: | |
| # Use concurrent.futures API (ThreadPoolExecutor or ProcessPoolExecutor) | |
| with executor_class(**executor_kwargs) as executor: | |
| futures = [] | |
| # Prepare data for ProcessPoolExecutor if needed | |
| if use_process_pool: | |
| # Serialize exclusion check data for process pool | |
| exclude_check_data = ( | |
| list(honorifics_to_exclude), | |
| [p.pattern for p in title_patterns], | |
| PM.COMMON_WORDS, | |
| PM.CHINESE_NUMS | |
| ) | |
| for idx, batch in enumerate(batches): | |
| if use_process_pool: | |
| # Use module-level function for ProcessPoolExecutor | |
| future = executor.submit(_process_sentence_batch_for_extraction, | |
| (batch, idx, combined_pattern, exclude_check_data)) | |
| else: | |
| # Use local function for ThreadPoolExecutor | |
| future = executor.submit(process_sentence_batch, batch, idx) | |
| futures.append(future) | |
| # Yield to GUI when submitting futures | |
| if idx % 10 == 0: | |
| time.sleep(0.001) | |
| # Collect results with progress | |
| completed_batches = 0 | |
| batch_start_time = time.time() | |
| for future in as_completed(futures): | |
| # Get result without timeout - as_completed already handles waiting | |
| local_word_freq, local_important, local_seen, batch_idx = future.result() | |
| # Merge results | |
| word_freq.update(local_word_freq) | |
| for sentence in local_important: | |
| sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| processed_count += len(batches[batch_idx]) | |
| completed_batches += 1 | |
| # Show progress more frequently for better user feedback | |
| progress_interval = 1 if len(batches) <= 20 else (5 if len(batches) <= 100 else 10) | |
| if completed_batches % progress_interval == 0 or completed_batches == len(batches): | |
| progress = (processed_count / total_sentences) * 100 | |
| elapsed = time.time() - batch_start_time | |
| rate = (processed_count / elapsed) if elapsed > 0 else 0 | |
| print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") | |
| # Yield to GUI after each batch completes | |
| time.sleep(0.001) | |
| else: | |
| # Sequential processing with progress | |
| for idx, sentence in enumerate(sentences): | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| if matches: | |
| # Filter out excluded terms | |
| filtered_matches = [] | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| word_freq[match] += 1 | |
| filtered_matches.append(match) | |
| # Keep sentences with valid potential terms | |
| if filtered_matches: | |
| sentence_key = ' '.join(sorted(filtered_matches)) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| # Show progress every 1000 sentences or 2 seconds | |
| if idx % 1000 == 0 or (time.time() - last_progress_time > 2): | |
| progress = ((idx + 1) / total_sentences) * 100 | |
| print(f"📑 Processing sentences: {idx + 1:,}/{total_sentences:,} ({progress:.1f}%)") | |
| last_progress_time = time.time() | |
| # Yield to GUI thread every 1000 sentences | |
| time.sleep(0.001) # Tiny sleep to let GUI update | |
| print(f"📑 Found {len(important_sentences):,} sentences with potential glossary terms") | |
| # Step 6/7: Deduplicate and normalize terms | |
| # Skip this heavy deduplication if "Dynamic Limit Expansion" (include_all_characters) is disabled | |
| # When disabled, we only care about exact matches of high-frequency terms, which combined_freq already handles | |
| if not include_all_characters: | |
| print(f"📑 Step 6/7: Skipping advanced term deduplication (Dynamic Limit Expansion disabled)...") | |
| print(f"📑 Using simple normalized frequency counts for {len(word_freq):,} terms") | |
| combined_freq = Counter() | |
| term_count = 0 | |
| # Simple deduplication by normalized form only | |
| for term, count in word_freq.items(): | |
| normalized = term.lower().strip() | |
| if normalized in combined_freq: | |
| if count > combined_freq[normalized]: | |
| del combined_freq[normalized] | |
| combined_freq[term] = count | |
| else: | |
| combined_freq[term] = count | |
| term_count += 1 | |
| if term_count % 5000 == 0: | |
| time.sleep(0.001) | |
| else: | |
| print(f"📑 Step 6/7: Normalizing and deduplicating {len(word_freq):,} unique terms...") | |
| combined_freq = Counter() | |
| term_count = 0 | |
| # Original logic with potential for future advanced features if enabled | |
| for term, count in word_freq.items(): | |
| normalized = term.lower().strip() | |
| if normalized in combined_freq: | |
| if count > combined_freq[normalized]: | |
| del combined_freq[normalized] | |
| combined_freq[term] = count | |
| else: | |
| combined_freq[term] = count | |
| term_count += 1 | |
| if term_count % 1000 == 0: | |
| time.sleep(0.001) | |
| print(f"📑 Deduplicated to {len(combined_freq):,} unique terms") | |
| # Filter to keep only terms that appear at least min_frequency times | |
| frequent_terms = {term: count for term, count in combined_freq.items() if count >= min_frequency} | |
| # Build filtered text focusing on sentences containing frequent terms | |
| print(f"📑 Step 7/7: Building filtered text from relevant sentences...") | |
| # OPTIMIZATION: Skip sentences that already passed filtering in step 5 | |
| # These sentences already contain glossary terms, no need to check again! | |
| # We just need to limit the sample size | |
| filtered_sentences = important_sentences # Already filtered! | |
| print(f"📑 Using {len(filtered_sentences):,} pre-filtered sentences (already contain glossary terms)") | |
| # EARLY DYNAMIC EXPANSION: collect one sentence index per unique honorific-attached name (first appearance), before scoring/nuance | |
| def _sentence_has_gender_pronoun(sent: str) -> bool: | |
| if not include_gender_context_flag or not gender_pronouns: | |
| return True | |
| return any(p in sent for p in gender_pronouns) | |
| if include_all_characters: | |
| honorific_pattern_str = None | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| h_list = PM.CJK_HONORIFICS[primary_lang] + PM.CJK_HONORIFICS.get('english', []) | |
| h_list.sort(key=len, reverse=True) | |
| if h_list: | |
| honorific_pattern_str = '|'.join(map(re.escape, h_list)) | |
| if honorific_pattern_str: | |
| try: | |
| honorifics = PM.CJK_HONORIFICS.get(primary_lang, []) + PM.CJK_HONORIFICS.get('english', []) | |
| honorifics = [h for h in honorifics if h] # drop empties | |
| # Keep only clear suffix/title honorifics; drop verb endings/keigo/politeness particles | |
| if primary_lang == 'korean': | |
| suffix_allow = {'님','씨','군','양','공','옹','낭','랑','생','자','부','모','시','제','족하', | |
| '마마','대감','영감','나리','도령','낭자','아씨','규수','각하','전하','폐하','저하','합하', | |
| '대비','대왕','왕자','공주','도련님','아가씨'} | |
| honorifics = [h for h in honorifics if h in suffix_allow] | |
| elif primary_lang == 'japanese': | |
| suffix_allow = {'さん','ちゃん','君','くん','様','さま','殿','先輩','先生','氏','殿下','閣下','卿'} | |
| honorifics = [h for h in honorifics if h in suffix_allow] | |
| elif primary_lang == 'chinese': | |
| # short person titles only | |
| honorifics = [h for h in honorifics if len(h) <= 3 and h in {'先生','小姐','夫人','公子','姑娘','大人','阁下','将军','公主','少爷','老爷','相公','郎君','少侠','侠士'}] | |
| else: | |
| # romanized suffixes only | |
| honorifics = [h for h in honorifics if h.startswith('-') and len(h) <= 8] | |
| if honorifics: | |
| hon_regex = "|".join(map(re.escape, honorifics)) | |
| cjk_name_pat = r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af·]{2,4}" | |
| latin_name_pat = r"[A-Z][a-z]{1,15}(?:\s+[A-Z][a-z]{1,15}){0,1}" | |
| punct_opt = r"[,、,.:;!?…\)\] \}】』」]?" | |
| combined_pat = re.compile( | |
| rf"(?P<name>{cjk_name_pat}|{latin_name_pat})\s*(?P<hon>{hon_regex}){punct_opt}" | |
| ) | |
| honor_pat = re.compile(hon_regex) | |
| ordered_names = [] | |
| for idx, sent in enumerate(filtered_sentences): | |
| for m in combined_pat.finditer(sent): | |
| name = m.group("name").strip() | |
| if not name or any(ch.isdigit() for ch in name): | |
| continue | |
| # Apply strict filtering to regex matches too | |
| # FILTERING: Skip tokens with common noisy start characters | |
| if any(name.startswith(c) for c in ['[', '(', '{', '<', '-', 'ㄴ', 'ㅇ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅋ', 'ㅎ']): | |
| continue | |
| # FILTERING: Skip tokens that are just common words/particles | |
| if name in PM.COMMON_WORDS: | |
| continue | |
| # FILTERING: Aggressive Korean Verb/Adjective Ending Check | |
| if len(name) > 2 and any(name.endswith(e) for e in ['겠네', '리라', '니까', '는데', '러나', '다가', '면서', '지만', '도록', '으로', '에서', '에게', '한테', '라고', '이란']): | |
| continue | |
| # Skip if name looks like a title term (PatternManager title patterns) | |
| skip_title = False | |
| for pat in PM.TITLE_PATTERNS.get(primary_lang, []): | |
| if re.search(pat, name): | |
| skip_title = True | |
| break | |
| if skip_title: | |
| continue | |
| if name not in honorific_first_indices: | |
| honorific_first_indices[name] = idx | |
| # Append every time to track frequency | |
| ordered_names.append(name) | |
| # Fallback: token immediately before any honorific | |
| # NOTE: Bidirectional check ('after') was removed due to excessive false positives. | |
| # Strict filtering applied to 'before' token to reduce noise. | |
| for m in honor_pat.finditer(sent): | |
| # 1. Check BEFORE the honorific | |
| if primary_lang == 'chinese': | |
| # Chinese logic: Get previous 2-4 characters without relying on space | |
| start_idx = m.start() | |
| # Try taking 2, 3, 4 characters backwards | |
| # Chinese names are typically 2-3 characters (Surname + Given Name) | |
| # We check if they form a valid name | |
| # Prefer the longest run of valid Chinese characters (2-4) found immediately before the honorific. | |
| token = "" | |
| # Scan backwards for valid Chinese chars | |
| current_token = "" | |
| for i in range(1, 5): # Look back up to 4 chars | |
| if start_idx - i < 0: break | |
| char = sent[start_idx - i] | |
| # Check if char is valid Chinese character | |
| if '\u4e00' <= char <= '\u9fff': | |
| current_token = char + current_token | |
| else: | |
| break # Stop at non-Chinese char (punctuation, space, etc) | |
| if len(current_token) >= 2: | |
| token = current_token | |
| elif primary_lang == 'japanese': | |
| # Japanese logic: Get previous 2-6 characters | |
| start_idx = m.start() | |
| # Scan backwards for valid Japanese chars (Kanji, Hiragana, Katakana) | |
| token = "" | |
| current_token = "" | |
| for i in range(1, 7): # Look back up to 6 chars | |
| if start_idx - i < 0: break | |
| char = sent[start_idx - i] | |
| # Check if char is valid Japanese character | |
| # Kanji: 4E00-9FFF, Hiragana: 3040-309F, Katakana: 30A0-30FF | |
| # Also include long vowel mark (ー): 30FC | |
| is_valid_jp = ('\u4e00' <= char <= '\u9fff') or \ | |
| ('\u3040' <= char <= '\u309f') or \ | |
| ('\u30a0' <= char <= '\u30ff') or \ | |
| (char == '\u30fc') | |
| if is_valid_jp: | |
| current_token = char + current_token | |
| else: | |
| break # Stop at non-Japanese char | |
| if len(current_token) >= 2: | |
| token = current_token | |
| else: | |
| # Original logic for space-separated languages (Korean, English) | |
| prefix = sent[:m.start()].strip() | |
| if prefix: | |
| token = prefix.split()[-1] | |
| token = token.strip(".,;:!?\"'()[]{}<>~`@#$%^&*-=_+|\\/") | |
| else: | |
| token = "" | |
| if token: | |
| # Apply all validation logic (common words, fullmatch regex, etc.) | |
| if not any(ch.isdigit() for ch in token): | |
| # FILTERING: Skip tokens with common noisy start characters | |
| if not any(token.startswith(c) for c in ['[', '(', '{', '<', '-', 'ㄴ', 'ㅇ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅋ', 'ㅎ']): | |
| # FILTERING: Skip tokens that look like file extensions or paths | |
| if not ('.' in token or '/' in token or '\\' in token): | |
| # FILTERING: Skip tokens that are just common words/particles | |
| if token not in PM.COMMON_WORDS: | |
| # FILTERING: Aggressive Korean Verb/Adjective Ending Check | |
| if not (len(token) > 2 and any(token.endswith(e) for e in ['겠네', '리라', '니까', '는데', '러나', '다가', '면서', '지만', '도록', '으로', '에서', '에게', '한테', '라고', '이란'])): | |
| # STRICTER ATTACHMENT CHECK FOR KOREAN SUFFIXES | |
| # (For Chinese, we already extracted attached characters, so this check is implicitly passed or N/A) | |
| is_attached = True | |
| if primary_lang != 'chinese': | |
| is_attached = not sent[:m.start()].endswith(' ') | |
| # Valid token structure check | |
| valid_shape = False | |
| # STRICTER: Use regex to ensure the ENTIRE token matches the valid pattern | |
| if re.fullmatch(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af·]{2,4}', token): | |
| valid_shape = True | |
| elif re.fullmatch(r'^[A-Z][a-z]{1,15}(\s+[A-Z][a-z]{1,15})?$', token): | |
| valid_shape = True | |
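| # Illustrative shape checks (comment only): | |
| #   '김상현'       -> valid   (3 CJK chars, within the 2-4 window) | |
| #   'Elder Mo'     -> valid   (two title-case Latin words) | |
| #   '그리고말했다'  -> invalid (6 chars, exceeds the 2-4 CJK window) | |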
| if valid_shape: | |
| # Skip if token looks like a title term | |
| skip_title = False | |
| for pat in PM.TITLE_PATTERNS.get(primary_lang, []): | |
| if re.search(pat, token): | |
| skip_title = True | |
| break | |
| if not skip_title: | |
| if token not in honorific_first_indices: | |
| honorific_first_indices[token] = idx | |
| # Append every time to track frequency | |
| ordered_names.append(token) | |
| # DEDUPLICATE THE REPRESENTATIVE UNIQUE CHARACTERS HERE | |
| if ordered_names: | |
| print(f"📑 Deduplicating {len(ordered_names)} potential character names (honorific-first)...") | |
| try: | |
| import duplicate_detection_config as DDC | |
| # Get configured algorithm and threshold | |
| dd_config = DDC.get_duplicate_detection_config() | |
| algo_desc = dd_config.get('description', 'Unknown') | |
| # Use environment variable directly as fallback | |
| fallback_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| effective_threshold = dd_config.get('threshold', fallback_threshold) | |
| selected_algo = os.getenv('GLOSSARY_DUPLICATE_ALGORITHM', 'auto').upper() | |
| print(f"📑 Duplicate Detection Algorithm: {selected_algo} ({algo_desc})") | |
| print(f"📑 Deduplicating names with threshold: {effective_threshold:.2f}") | |
| # Optimized deduplication using bucketing by first character | |
| # This avoids O(N²) all-to-all comparison while maintaining fuzzy matching quality | |
| deduped_names = [] | |
| kept_indices = {} # Rebuild this map | |
| skipped_dupes = 0 | |
| # Filter by honorific attachment frequency | |
| # Only keep names that appear with an honorific at least N times | |
| # This filters out one-off noise while keeping legitimate names | |
| name_freq_with_honorific = Counter(ordered_names) | |
| # Use configured minimum frequency (GLOSSARY_MIN_FREQUENCY) | |
| # This allows the user to control the strictness via the GUI/Config | |
| min_hon_freq = min_frequency | |
| print(f"📑 Filtering by honorific attachment frequency (min {min_hon_freq} occurrences)...") | |
| # Get unique candidates that meet frequency threshold | |
| # Use seen set to deduplicate ordered_names while preserving order | |
| filtered_unique = [] | |
| seen_candidates = set() | |
| for name in ordered_names: | |
| if name not in seen_candidates and name_freq_with_honorific[name] >= min_hon_freq: | |
| filtered_unique.append(name) | |
| seen_candidates.add(name) | |
| print(f"📑 Reduced candidates from {len(ordered_names)} (total) to {len(filtered_unique)} (unique freq-filtered)") | |
| ordered_names = filtered_unique | |
| # Fast lookup structures | |
| seen_normalized = set() | |
| # Bucket by first character (normalized) to reduce search space | |
| # Key: first_char, Value: list of existing names starting with that char | |
| lookup_buckets = {} | |
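| # Bucket layout sketch (illustrative): after keeping '김상현', '김유진' and 'Aria' (assuming the | |
| # first two are not fuzzy-duplicates of each other), lookup_buckets would look like | |
| #   {'김': ['김상현', '김유진'], 'a': ['Aria']} | |
| # so each new candidate is only fuzzy-compared against names sharing its first character. | |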
| print(f"📑 Processing {len(ordered_names)} names with bucketed optimization...") | |
| for i, name in enumerate(ordered_names): | |
| # Progress logging for large sets | |
| if i > 0 and i % 1000 == 0: | |
| print(f"📑 Dedupe progress: {i}/{len(ordered_names)}...") | |
| norm = name.lower().strip() | |
| if not norm: continue | |
| # 1. Exact normalized check (O(1) - Instant) | |
| if norm in seen_normalized: | |
| skipped_dupes += 1 | |
| continue | |
| # 2. Fuzzy Check (Bucketed) | |
| is_dup = False | |
| first_char = norm[0] | |
| # Only compare against names starting with the same character | |
| # This reduces comparisons by ~20-50x (alphabet size) | |
| candidates = lookup_buckets.get(first_char, []) | |
| # If bucket is massive (>1000), limit to most recent 1000 to prevent slowdown | |
| # (Heuristic: duplicates usually appear near each other or we catch them early) | |
| if len(candidates) > 1000: | |
| search_candidates = candidates[-1000:] | |
| else: | |
| search_candidates = candidates | |
| for existing in search_candidates: | |
| score = DDC.calculate_similarity_with_config(name, existing, dd_config) | |
| if score >= effective_threshold: | |
| is_dup = True | |
| skipped_dupes += 1 | |
| break | |
| if not is_dup: | |
| deduped_names.append(name) | |
| seen_normalized.add(norm) | |
| # Add to bucket | |
| if first_char not in lookup_buckets: | |
| lookup_buckets[first_char] = [] | |
| lookup_buckets[first_char].append(name) | |
| # Keep the original index | |
| if name in honorific_first_indices: | |
| kept_indices[name] = honorific_first_indices[name] | |
| print(f"📑 Advanced deduplication removed {skipped_dupes} duplicate names") | |
| # Update the lists | |
| ordered_names = deduped_names | |
| honorific_first_indices = kept_indices | |
| except ImportError: | |
| print("⚠️ duplicate_detection_config module not found, skipping name deduplication") | |
| except Exception as e: | |
| print(f"⚠️ Name deduplication failed: {e}") | |
| else: | |
| print("📑 Dynamic expansion (honorific-first): no honorifics found in PatternManager for this language") | |
| base_count = len(honorific_first_indices) | |
| if include_gender_context_flag and base_count > 0: | |
| try: | |
| gender_subset = sum( | |
| 1 for idx in honorific_first_indices.values() | |
| if 0 <= idx < len(filtered_sentences) and _sentence_has_gender_pronoun(filtered_sentences[idx]) | |
| ) | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring (gender-context subset: {gender_subset})") | |
| except Exception: | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring") | |
| else: | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring") | |
| # Debug: Write filtered terms to file (User request) | |
| if base_count > 0 and 'ordered_names' in locals(): | |
| try: | |
| # Use output_dir if available, otherwise cwd | |
| debug_base = output_dir if 'output_dir' in locals() else os.getcwd() | |
| debug_dir = os.path.join(debug_base, 'debug') | |
| os.makedirs(debug_dir, exist_ok=True) | |
| debug_file_path = os.path.join(debug_dir, 'honorific_debug.txt') | |
| with open(debug_file_path, 'w', encoding='utf-8') as f: | |
| for name in ordered_names: | |
| f.write(f"{name}\n") | |
| print(f"📑 Wrote {len(ordered_names)} terms to {debug_file_path}") | |
| except Exception as e: | |
| print(f"📑 Failed to write debug file: {e}") | |
| except Exception: | |
| print("📑 Dynamic expansion (honorific-first): error parsing honorific names; continuing without early captures") | |
| else: | |
| print("📑 Dynamic expansion (honorific-first): no honorific pattern available for this language") | |
| # For extremely large datasets, we can optionally do additional filtering | |
| # Skip this reduction when include_all_characters is enabled to avoid losing rare characters | |
| if (not include_all_characters) and len(filtered_sentences) > 10000 and len(frequent_terms) > 1000: | |
| print(f"📑 Large dataset detected - applying frequency-based filtering...") | |
| print(f"📑 Filtering {len(filtered_sentences):,} sentences for top frequent terms...") | |
| # Sort terms by frequency to prioritize high-frequency ones | |
| sorted_terms = sorted(frequent_terms.items(), key=lambda x: x[1], reverse=True) | |
| top_terms = dict(sorted_terms[:1000]) # Focus on top 1000 most frequent terms | |
| print(f"📑 Using top {len(top_terms):,} most frequent terms for final filtering") | |
| # Use parallel processing only if really needed | |
| if use_parallel and len(filtered_sentences) > 5000: | |
| import multiprocessing | |
| in_subprocess = multiprocessing.current_process().name != 'MainProcess' | |
| # Create a simple set of terms for fast lookup (no variations needed) | |
| term_set = set(top_terms.keys()) | |
| print(f"📑 Using parallel filtering with {extraction_workers} workers...") | |
| # Optimize batch size for ProcessPoolExecutor (reduce overhead) | |
| # Use larger batches since this is a simpler operation than term extraction | |
| check_batch_size = max(1000, len(filtered_sentences) // (extraction_workers * 5)) | |
| check_batches = [filtered_sentences[i:i + check_batch_size] | |
| for i in range(0, len(filtered_sentences), check_batch_size)] | |
| print(f"📑 Processing {len(check_batches)} batches of ~{check_batch_size} sentences") | |
| # Use ProcessPoolExecutor for true parallelism (if not already in subprocess) | |
| use_process_pool_filtering = (not in_subprocess and len(check_batches) > 3) | |
| if use_process_pool_filtering: | |
| print(f"📑 Using ProcessPoolExecutor for true parallel filtering") | |
| new_filtered = [] | |
| with ProcessPoolExecutor(max_workers=extraction_workers) as executor: | |
| # Use the module-level function _check_sentence_batch_for_terms | |
| futures = [executor.submit(_check_sentence_batch_for_terms, (batch, term_set)) | |
| for batch in check_batches] | |
| for future in as_completed(futures): | |
| new_filtered.extend(future.result()) | |
| else: | |
| print(f"📑 Using ThreadPoolExecutor for filtering (small dataset or in subprocess)") | |
| # Simple function to check if sentence contains any top term | |
| def check_batch_simple(batch): | |
| result = [] | |
| for sentence in batch: | |
| # Simple substring check - much faster than regex | |
| for term in term_set: | |
| if term in sentence: | |
| result.append(sentence) | |
| break | |
| return result | |
| new_filtered = [] | |
| with ThreadPoolExecutor(max_workers=extraction_workers) as executor: | |
| futures = [executor.submit(check_batch_simple, batch) for batch in check_batches] | |
| for future in as_completed(futures): | |
| new_filtered.extend(future.result()) | |
| filtered_sentences = new_filtered | |
| print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") | |
| else: | |
| # For smaller datasets, simple sequential filtering | |
| print(f"📑 Using sequential filtering...") | |
| new_filtered = [] | |
| for i, sentence in enumerate(filtered_sentences): | |
| for term in top_terms: | |
| if term in sentence: | |
| new_filtered.append(sentence) | |
| break | |
| if i % 1000 == 0: | |
| print(f"📑 Progress: {i:,}/{len(filtered_sentences):,} sentences") | |
| time.sleep(0.001) | |
| filtered_sentences = new_filtered | |
| print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") | |
| print(f"📑 Selected {len(filtered_sentences):,} sentences containing frequent terms") | |
| # Track character-like term count for final summary | |
| character_term_count = 0 | |
| # Limit the number of sentences to reduce token usage | |
| if max_sentences is None: | |
| max_sentences_fallback = os.getenv("GLOSSARY_MAX_SENTENCES", "200") | |
| print(f"🔍 [DEBUG] max_sentences was None, reading from environment: '{max_sentences_fallback}'") | |
| max_sentences = int(max_sentences_fallback) | |
| else: | |
| print(f"🔍 [DEBUG] max_sentences parameter was provided: {max_sentences}") | |
| print(f"🔍 [DEBUG] Final GLOSSARY_MAX_SENTENCES value being used: {max_sentences}") | |
| # Force smart selection path when dynamic expansion is enabled, even if filtered_sentences <= max_sentences | |
| run_smart_selection = (not force_skip_smart_selection) and (include_all_characters or (max_sentences > 0 and len(filtered_sentences) > max_sentences)) | |
| if run_smart_selection and max_sentences > 0: | |
| dynamic_bonus = len(honorific_first_indices) if include_all_characters else 0 | |
| effective_preview = max_sentences + dynamic_bonus | |
| if dynamic_bonus > 0: | |
| print(f"📁 Limiting to {max_sentences} + {dynamic_bonus} (dynamic expansion) = {effective_preview} representative sentences (from {len(filtered_sentences):,})") | |
| else: | |
| print(f"📁 Limiting to {max_sentences} representative sentences (from {len(filtered_sentences):,})") | |
| # SMART SELECTION: Prioritize sentences with unique terms and gender context | |
| # instead of blind slicing. | |
| # 1. Identify which terms appear in which sentences | |
| # We need to re-scan briefly or pass this info along. Re-scanning is safer/easier here. | |
| if gender_nuance_enabled: | |
| print("📑 analyzing sentences for term coverage and gender nuance...") | |
| else: | |
| print("📑 analyzing sentences for term coverage (gender nuance disabled)...") | |
| term_to_sentences = {} # term -> list of (score, sentence_index) | |
| sentence_scores = {} # index -> score | |
| # Pre-compile regexes | |
| honorific_pattern_str = None | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| h_list = PM.CJK_HONORIFICS[primary_lang] + PM.CJK_HONORIFICS.get('english', []) | |
| h_list.sort(key=len, reverse=True) | |
| if h_list: | |
| honorific_pattern_str = '|'.join(map(re.escape, h_list)) | |
| # Get pronouns for scoring | |
| gender_pronouns = [] | |
| if gender_nuance_enabled and hasattr(PM, 'GENDER_PRONOUNS'): | |
| lang_key = 'english' | |
| if primary_lang == 'korean': lang_key = 'korean' | |
| elif primary_lang == 'chinese': lang_key = 'chinese' | |
| elif primary_lang == 'japanese': lang_key = 'japanese' | |
| gender_pronouns = PM.GENDER_PRONOUNS.get(lang_key, {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get(lang_key, {}).get('female', []) | |
| # If gender context is OFF or nuance scoring is disabled, skip expensive scoring and just build simple coverage map | |
| if not gender_nuance_enabled: | |
| print("📑 Gender context or nuance toggle disabled: using simple term coverage (no pronoun weighting).") | |
| for idx, sent in enumerate(filtered_sentences): | |
| sentence_scores[idx] = 1.0 | |
| for term in frequent_terms: | |
| if term in sent: | |
| term_to_sentences.setdefault(term, []).append(idx) | |
| # Parallelize scoring if dataset is large enough and gender context is ON | |
| elif use_parallel and len(filtered_sentences) > 2000: | |
| print(f"📑 Parallelizing sentence scoring with {extraction_workers} workers...") | |
| # Prepare batches | |
| # Aim for ~500 sentences per batch to get updates every ~2-3 seconds (assuming ~150-200 sent/sec) | |
| batch_size = 500 | |
| # However, ensure we don't have too few batches for the workers (utilize parallelism) | |
| if len(filtered_sentences) // batch_size < extraction_workers * 4: | |
| batch_size = max(100, len(filtered_sentences) // (extraction_workers * 4)) | |
| batches = [] | |
| for i in range(0, len(filtered_sentences), batch_size): | |
| end_idx = min(i + batch_size, len(filtered_sentences)) | |
| # Pass (start_index, list_of_sentences) | |
| batches.append((i, filtered_sentences[i:end_idx])) | |
| term_list = list(frequent_terms.keys()) | |
| # Use ProcessPoolExecutor for heavy CPU work | |
| if use_process_pool: | |
| executor_cls = ProcessPoolExecutor | |
| else: | |
| executor_cls = ThreadPoolExecutor | |
| with executor_cls(max_workers=extraction_workers) as executor: | |
| # Submit all batches | |
| futures = [executor.submit( | |
| _score_sentence_batch, | |
| (batch_data, term_list, honorific_pattern_str, gender_pronouns, include_gender_context) | |
| ) for batch_data in batches] | |
| # Collect results with progress logging | |
| completed_batches = 0 | |
| processed_count = 0 | |
| scoring_start_time = time.time() | |
| last_log_time = scoring_start_time | |
| total_batches = len(batches) | |
| total_to_score = len(filtered_sentences) | |
| # Emit wait logs even before the first batch completes | |
| try: | |
| from concurrent.futures import wait as _wait, FIRST_COMPLETED as _FIRST_COMPLETED | |
| except Exception: | |
| _wait = None | |
| _FIRST_COMPLETED = None | |
| pending = set(futures) | |
| while pending: | |
| done = set() | |
| if _wait is not None and _FIRST_COMPLETED is not None: | |
| done, pending = _wait(pending, timeout=5.0, return_when=_FIRST_COMPLETED) | |
| done = set(done or []) | |
| else: | |
| # Fallback: block until first completion (no wait logs) | |
| for future in as_completed(list(pending)): | |
| done.add(future) | |
| pending.discard(future) | |
| break | |
| if not done: | |
| # No batch completed within timeout | |
| elapsed = time.time() - scoring_start_time | |
| print(f"📑 Scoring... {elapsed:.0f}s elapsed") | |
| continue | |
| for future in done: | |
| try: | |
| batch_scores, batch_term_map = future.result() | |
| sentence_scores.update(batch_scores) | |
| # Merge term mappings | |
| for term, indices in batch_term_map.items(): | |
| if term not in term_to_sentences: | |
| term_to_sentences[term] = [] | |
| term_to_sentences[term].extend(indices) | |
| # Update progress stats | |
| completed_batches += 1 | |
| processed_count += len(batch_scores) | |
| current_time = time.time() | |
| elapsed = current_time - scoring_start_time | |
| # Log periodically (every ~5 seconds or if it's the last batch) | |
| if (current_time - last_log_time >= 5.0) or (completed_batches == total_batches): | |
| display_count = min(processed_count, total_to_score) | |
| progress_pct = min(99.9, (display_count / total_to_score) * 100) | |
| rate = display_count / elapsed if elapsed > 0 else 0 | |
| if completed_batches < total_batches: | |
| print(f"📑 Scoring... {display_count:,}/{total_to_score:,} sentences ({progress_pct:.1f}%) | Batch {completed_batches}/{total_batches} | {rate:.0f} sent/sec | {elapsed:.0f}s elapsed") | |
| else: | |
| print(f"📑 Scoring... {total_to_score:,}/{total_to_score:,} sentences (100.0%) | Batch {total_batches}/{total_batches} | {rate:.0f} sent/sec | {elapsed:.0f}s elapsed") | |
| print(f"📑 Scoring... finalizing last batches | {elapsed:.0f}s elapsed") | |
| last_log_time = current_time | |
| except Exception as e: | |
| print(f"⚠️ Scoring batch failed: {e}") | |
| total_elapsed = time.time() - scoring_start_time | |
| print(f"📁 All scoring batches completed in {total_elapsed:.1f}s!") | |
| else: | |
| # Sequential fallback | |
| honorific_pattern = re.compile(honorific_pattern_str) if honorific_pattern_str else None | |
| for idx, sent in enumerate(filtered_sentences): | |
| score = 1.0 | |
| if gender_nuance_enabled and gender_pronouns: | |
| for p in gender_pronouns: | |
| if p in sent: | |
| score += 5.0 | |
| break | |
| if honorific_pattern and honorific_pattern.search(sent): | |
| score += 2.0 | |
| sentence_scores[idx] = score | |
| for term in frequent_terms: | |
| if term in sent: | |
| if term not in term_to_sentences: | |
| term_to_sentences[term] = [] | |
| term_to_sentences[term].append(idx) | |
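| # Example score under this weighting (illustrative): a sentence containing both a gender pronoun | |
| # and an honorific scores 1.0 + 5.0 + 2.0 = 8.0, so it sorts ahead of plain term hits (1.0). | |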
| # 2. Select sentences via Round-Robin to ensure coverage of ALL unique terms | |
| # with PRIORITY for character-like terms (those with honorifics) | |
| selected_indices = set() | |
| # Sort each term's sentences by score descending (higher score first) | |
| for term in term_to_sentences: | |
| term_to_sentences[term].sort(key=lambda idx: sentence_scores[idx], reverse=True) | |
| # If dynamic expansion is on, prefer character terms derived from honorific-attached names | |
| honorific_char_terms = [] | |
| if include_all_characters and honorific_pattern_str: | |
| try: | |
| honor_pat = re.compile(honorific_pattern_str) | |
| char_term_map = {} | |
| name_regex = re.compile(r'([\w\-\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]+)$') | |
| for idx, sent in enumerate(filtered_sentences): | |
| for m in honor_pat.finditer(sent): | |
| prefix = sent[:m.start()].strip() | |
| nm = name_regex.search(prefix) | |
| if nm: | |
| name = nm.group(1) | |
| char_term_map.setdefault(name, []).append(idx) | |
| if char_term_map: | |
| term_to_sentences = {k: sorted(v, key=lambda i: sentence_scores.get(i, 0), reverse=True) | |
| for k, v in char_term_map.items()} | |
| honorific_char_terms = list(term_to_sentences.keys()) | |
| except Exception: | |
| pass | |
| # Split terms into character-like (with honorifics) and others | |
| def _is_character_like(term: str) -> bool: | |
| try: | |
| if _has_honorific(term): | |
| return True | |
| # CJK short names | |
| if primary_lang in ['korean', 'japanese', 'chinese']: | |
| # Count CJK chars | |
| cjk_len = sum(1 for ch in term if 0x4E00 <= ord(ch) <= 0x9FFF or 0x3040 <= ord(ch) <= 0x30FF or 0xAC00 <= ord(ch) <= 0xD7AF) | |
| if 2 <= cjk_len <= 4: | |
| return True | |
| # English-style names: title case with 1-3 words | |
| parts = term.split() | |
| if 1 <= len(parts) <= 3 and all(p[:1].isupper() for p in parts if p): | |
| return True | |
| except Exception: | |
| pass | |
| return False | |
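| # Classification sketch (illustrative; exact results depend on _has_honorific and primary_lang): | |
| #   '김상현'          -> True  (3 CJK chars) | |
| #   'Aria Blackwood'  -> True  (two title-case words) | |
| #   '신성제국기사단'   -> False via the CJK length check (7 chars), unless _has_honorific() matches it | |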
| character_terms = [] | |
| non_character_terms = [] | |
| source_terms = honorific_char_terms if (include_all_characters and honorific_char_terms) else sorted(term_to_sentences.keys()) | |
| for term in source_terms: | |
| if _is_character_like(term): | |
| character_terms.append(term) | |
| else: | |
| non_character_terms.append(term) | |
| character_term_count = len(character_terms) | |
| # If dynamic limit expansion is enabled, prepare to cover every character-like term once | |
| if include_all_characters and character_terms: | |
| # Build characters strictly from honorific-bearing terms first; fallback to detection if none | |
| honorific_chars = [] | |
| if honorific_pattern_str: | |
| try: | |
| honor_pat = re.compile(honorific_pattern_str) | |
| honorific_chars = [t for t in character_terms if honor_pat.search(t)] | |
| except Exception: | |
| honorific_chars = [] | |
| if honorific_chars: | |
| character_terms = honorific_chars | |
| # Rank character terms by frequency so most frequent get picked first when sentences are missing | |
| character_terms = sorted(character_terms, key=lambda t: frequent_terms.get(t, 0), reverse=True) | |
| def round_robin_terms(term_list, selected_indices, target_limit, min_per_term=None): | |
| """Round-robin over provided term list, updating selected_indices in-place.""" | |
| term_iterators = [iter(term_to_sentences[t]) for t in term_list] | |
| # If min_per_term is set, ensure we get at least that many for each term first | |
| if min_per_term: | |
| for term in term_list: | |
| sentences = term_to_sentences[term] | |
| for i in range(min(min_per_term, len(sentences))): | |
| selected_indices.add(sentences[i]) | |
| while len(selected_indices) < target_limit and term_iterators: | |
| active_iterators = [] | |
| for it in term_iterators: | |
| if len(selected_indices) >= target_limit: | |
| break | |
| try: | |
| while True: | |
| idx = next(it) | |
| if idx not in selected_indices: | |
| selected_indices.add(idx) | |
| active_iterators.append(it) | |
| break | |
| except StopIteration: | |
| pass | |
| term_iterators = active_iterators | |
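| # Round-robin sketch (illustrative): with term_to_sentences = {'A': [3, 7], 'B': [7, 9]} and | |
| # target_limit = 3, pass one picks 3 (for 'A') and 7 (for 'B'); pass two exhausts 'A' and picks | |
| # 9 for 'B' (7 is already selected), giving selected_indices = {3, 7, 9}. | |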
| # Base limit from user/config | |
| base_limit = max_sentences | |
| requested_bonus = 0 | |
| # If we collected honorific-first sentences, seed the selection with them | |
| if include_all_characters and honorific_first_indices: | |
| for idx in honorific_first_indices.values(): | |
| if 0 <= idx < len(filtered_sentences): | |
| selected_indices.add(idx) | |
| requested_bonus = len(honorific_first_indices) | |
| # Dynamic expansion should ADD to the base limit, not replace it | |
| honorific_bonus = len(selected_indices) if include_all_characters else 0 | |
| effective_limit = base_limit + honorific_bonus | |
| requested_total = base_limit + requested_bonus | |
| print(f"📁 Requested sentence budget: base {base_limit} + bonus {requested_bonus} = {requested_total}") | |
| # Standard Fixed Limit Logic | |
| # First, prioritize character-like terms (honorific-based) | |
| if character_terms: | |
| round_robin_terms(character_terms, selected_indices, effective_limit) | |
| # Then, if we still have room, cover remaining non-character terms | |
| if len(selected_indices) < effective_limit and non_character_terms: | |
| round_robin_terms(non_character_terms, selected_indices, effective_limit) | |
| # If we still have room (rare), fill with highest scored remaining sentences | |
| target_limit = effective_limit | |
| if target_limit and len(selected_indices) < target_limit: | |
| remaining = sorted( | |
| [i for i in range(len(filtered_sentences)) if i not in selected_indices], | |
| key=lambda i: sentence_scores[i], | |
| reverse=True | |
| ) | |
| selected_indices.update(remaining[:target_limit - len(selected_indices)]) | |
| # Log the actual unique sentence count vs requested (base + bonus) | |
| unique_count = len(selected_indices) | |
| dropped = max(0, requested_total - unique_count) | |
| if include_all_characters: | |
| print(f"📁 Deduped sentence budget: requested {base_limit}+{requested_bonus} -> {unique_count} unique (dropped {dropped})") | |
| else: | |
| print(f"📁 Deduped sentence budget: requested {base_limit} -> {unique_count} unique (dropped {dropped})") | |
| # Sort indices to maintain narrative flow | |
| final_indices = sorted(list(selected_indices)) | |
| filtered_sentences = [filtered_sentences[i] for i in final_indices] | |
| dropped_windows = 0 | |
| dropped_sentence_indices = set() | |
| if include_all_characters: | |
| # Determine base vs bonus allocation before dedup | |
| pre_dedup_sentences = filtered_sentences # already ordered by final_indices | |
| pre_total = len(pre_dedup_sentences) | |
| pre_base = min(base_limit, pre_total) | |
| pre_bonus = max(0, pre_total - pre_base) | |
| base_idx_set = set(final_indices[:pre_base]) | |
| bonus_idx_set = set(final_indices[pre_base:]) | |
| # Map sentences to terms (characters and others) for coverage-aware dedup | |
| sentence_terms = {} | |
| if 'term_to_sentences' in locals(): | |
| for term, idx_list in term_to_sentences.items(): | |
| for idx in idx_list: | |
| if idx in final_indices: | |
| sentence_terms.setdefault(idx, set()).add(term) | |
| character_term_set = set(character_terms) if 'character_terms' in locals() else set() | |
| covered_char_terms = set() | |
| covered_terms_global = set() | |
| # Sentence-level dedup post-selection using duplicate_detection_config + slider threshold | |
| dup_config = ddc.get_duplicate_detection_config() | |
| # Fallback to env slider if save_glossary scope variable isn't in this function | |
| fuzzy_threshold_env = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| dup_threshold = dup_config.get('threshold', fuzzy_threshold_env) | |
| algo_list = dup_config.get('algorithms', []) | |
| algo_mode = os.getenv("GLOSSARY_DUPLICATE_ALGORITHM", "auto") | |
| print(f"📋 Sentence dedup config: mode={algo_mode}, algos={algo_list}, slider={fuzzy_threshold_env:.2f}, threshold_used={dup_threshold:.2f}, available={ddc.get_algorithm_display_info()}") | |
| dedup_seen_exact = set() | |
| kept_sentences = [] | |
| kept_indices = [] | |
| base_kept = bonus_kept = 0 | |
| base_dropped = bonus_dropped = 0 | |
| for idx, sent in zip(final_indices, pre_dedup_sentences): | |
| key = sent.strip() | |
| if not key: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| # Exact duplicate quick check | |
| if key in dedup_seen_exact: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| terms_here = sentence_terms.get(idx, set()) if sentence_terms else set() | |
| # Term-based dedup: drop if this sentence contributes no new terms (all terms already covered) | |
| is_dup = False | |
| if terms_here and terms_here.issubset(covered_terms_global): | |
| is_dup = True | |
| else: | |
| if kept_sentences: | |
| klen = len(key) | |
| min_len = int(klen * 0.7) | |
| max_len = int(klen * 1.3) | |
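| # Cheap prefilter sketch (illustrative): for a 100-char candidate, only kept sentences of | |
| # length 70-130 that share at least 50 distinct characters with it reach the fuzzy scorer; | |
| # everything else is skipped without calling calculate_similarity_with_config. | |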
| for other in kept_sentences: | |
| if not (min_len <= len(other) <= max_len): | |
| continue | |
| if len(set(key) & set(other)) < klen * 0.5: | |
| continue | |
| sim = ddc.calculate_similarity_with_config(key, other, dup_config) | |
| if sim >= dup_threshold: | |
| is_dup = True | |
| break | |
| if is_dup: | |
| # Guard: keep if this sentence is the only coverage for an uncovered character term | |
| keep_for_character = False | |
| if sentence_terms: | |
| for t in sentence_terms.get(idx, set()): | |
| if t in character_term_set and t not in covered_char_terms: | |
| keep_for_character = True | |
| break | |
| if not keep_for_character: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| # Keep | |
| dedup_seen_exact.add(key) | |
| kept_sentences.append(key) | |
| kept_indices.append(idx) | |
| # Mark covered character terms | |
| if sentence_terms: | |
| for t in terms_here: | |
| if t in character_term_set: | |
| covered_char_terms.add(t) | |
| covered_terms_global.add(t) | |
| if idx in base_idx_set: | |
| base_kept += 1 | |
| else: | |
| bonus_kept += 1 | |
| # Rebuild filtered_sentences preserving original ordering | |
| kept_index_set = set(kept_indices) | |
| filtered_sentences = [sent for idx, sent in zip(final_indices, pre_dedup_sentences) if idx in kept_index_set] | |
| dropped_sentence_indices = set(final_indices) - kept_index_set | |
| total_kept = base_kept + bonus_kept | |
| total_dropped = base_dropped + bonus_dropped | |
| dropped_windows = total_dropped | |
| print( | |
| f"📁 Deduped sentence budget: base {pre_base}->{base_kept} (dropped {base_dropped}), " | |
| f"bonus {pre_bonus}->{bonus_kept} (dropped {bonus_dropped}), total {total_kept}" | |
| ) | |
| # Re-log with dedup-applied cap shrink | |
| print( | |
| f"📁 Smart selection complete: Kept {len(filtered_sentences)} sentences covering " | |
| f"{len(term_to_sentences)} unique terms (cap shrink by {total_dropped})" | |
| ) | |
| else: | |
| print(f"📁 Smart selection complete: Kept {len(filtered_sentences)} sentences covering {len(term_to_sentences)} unique terms") | |
| dropped_windows = 0 | |
| elif max_sentences == 0: | |
| print(f"📁 Including ALL {len(filtered_sentences):,} sentences (max_sentences=0)") | |
| # Check if gender context expansion is enabled | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| if include_gender_context: | |
| context_window = int(os.getenv("GLOSSARY_CONTEXT_WINDOW", "2")) | |
| print(f"📑 Gender context enabled: Expanding snippets with {context_window}-sentence windows...") | |
| if 'dropped_windows' in locals() and dropped_windows: | |
| print(f"📑 Context windows skipped due to dedup: {dropped_windows}") | |
| # Split full text into sentences for context extraction | |
| all_sentences_list = re.split(r'[.!?。!?]+', clean_text) | |
| all_sentences_list = [s.strip() for s in all_sentences_list if s.strip()] | |
| # Create index map for fast lookup - OPTIMIZED to O(n) instead of O(n²) | |
| # Build a lookup dict: sentence -> index for fast matching | |
| sentence_to_index = {} | |
| all_sentences_normalized = {s.strip(): idx for idx, s in enumerate(all_sentences_list)} | |
| print(f"📑 Mapping {len(filtered_sentences):,} filtered sentences to context positions...") | |
| kept_windows = 0 | |
| for filtered_sent in filtered_sentences: | |
| filtered_normalized = filtered_sent.strip() | |
| # Try exact match first (fastest) | |
| if filtered_normalized in all_sentences_normalized: | |
| sentence_to_index[filtered_sent] = all_sentences_normalized[filtered_normalized] | |
| else: | |
| # Try substring match (slower fallback) | |
| found = False | |
| for sentence, idx in all_sentences_normalized.items(): | |
| if filtered_normalized in sentence or sentence in filtered_normalized: | |
| sentence_to_index[filtered_sent] = idx | |
| found = True | |
| break | |
| if not found: | |
| # Last resort: try finding in original list | |
| for idx, sentence in enumerate(all_sentences_list): | |
| if filtered_normalized in sentence or sentence in filtered_normalized: | |
| sentence_to_index[filtered_sent] = idx | |
| break | |
| # Build context windows with explicit boundaries to avoid cross-window leakage | |
| context_groups: list[str] = [] | |
| window_seeds: list[int] = [] | |
| included_indices = set() | |
| for filtered_sent in filtered_sentences: | |
| # If we can't locate the sentence in the master list, wrap it individually | |
| if filtered_sent not in sentence_to_index: | |
| # Dedup-dropped sentences were already removed when filtered_sentences was rebuilt above | |
| # (and dropped_sentence_indices holds indices, not sentence strings), so anything reaching | |
| # this point still needs its own single-sentence window. | |
| window_num = len(context_groups) + 1 | |
| context_groups.append( | |
| f"{filtered_sent}\n=== CONTEXT {window_num} END ===" | |
| ) | |
| window_seeds.append(-1) | |
| continue | |
| idx = sentence_to_index[filtered_sent] | |
| # Skip if already included in a previous window | |
| if idx in included_indices: | |
| continue | |
| # No dedup check needed here: sentences dropped during the budget dedup were already | |
| # removed from filtered_sentences, so every remaining seed sentence gets a window. | |
| # Get context window: [idx-context_window ... idx ... idx+context_window] | |
| start_idx = max(0, idx - context_window) | |
| end_idx = min(len(all_sentences_list), idx + context_window + 1) | |
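| # e.g. with context_window=2 and idx=10 this yields sentences 8..12, i.e. up to | |
| # (2 * context_window + 1) sentences centered on the seed sentence. | |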
| # Mark all sentences in this window as included | |
| for i in range(start_idx, end_idx): | |
| included_indices.add(i) | |
| # Extract the window and wrap with start/end markers for splitter safety | |
| window_sentences = all_sentences_list[start_idx:end_idx] | |
| context_group_body = ' '.join(window_sentences) | |
| window_num = len(context_groups) + 1 | |
| context_groups.append( | |
| f"{context_group_body}\n=== CONTEXT {window_num} END ===" | |
| ) | |
| window_seeds.append(idx) | |
| kept_windows += 1 | |
| skipped_windows = len(filtered_sentences) - kept_windows | |
| print(f"📑 Created {len(context_groups):,} context windows (up to {context_window*2+1} sentences each)") | |
| if skipped_windows: | |
| print(f"📑 Context windows removed after dedup: {skipped_windows}") | |
| # Window-level dedup: drop windows whose term set is already covered, while keeping one per character | |
| window_terms = [] | |
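| # Each window's term set comes from its seed sentence only (via window_seeds); the extra | |
| # context sentences pulled in around the seed do not contribute terms to the window. | |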
| if 'sentence_terms' in locals(): | |
| for seed_idx in window_seeds: | |
| if seed_idx == -1: | |
| window_terms.append(set()) | |
| else: | |
| window_terms.append(sentence_terms.get(seed_idx, set())) | |
| else: | |
| window_terms = [set() for _ in window_seeds] | |
| covered_terms_global = set() | |
| covered_char_terms = set() | |
| kept_context_groups = [] | |
| kept_window_seeds = [] | |
| for cg, seed_idx, terms in zip(context_groups, window_seeds, window_terms): | |
| if not terms: | |
| # keep empty-term windows to preserve structure | |
| kept_context_groups.append(cg) | |
| kept_window_seeds.append(seed_idx) | |
| continue | |
| drop = False | |
| # STRICT: one window per character. If any character term here is already covered, drop this window. | |
| # NOTE: the locals() check must sit outside the comprehension; inside it, locals() only sees the | |
| # comprehension's own scope, so the original inline check left char_terms permanently empty. | |
| char_terms = set(t for t in terms if t in character_term_set) if 'character_term_set' in locals() else set() | |
| if char_terms and char_terms & covered_char_terms: | |
| drop = True | |
| elif not char_terms and terms.issubset(covered_terms_global): | |
| drop = True | |
| # If no character terms yet covered, allow first appearance | |
| if drop: | |
| keep_for_char = any((t in character_term_set and t not in covered_char_terms) for t in terms) if 'character_term_set' in locals() else False | |
| if keep_for_char and not (char_terms & covered_char_terms): | |
| drop = False | |
| if drop: | |
| continue | |
| # keep and mark coverage | |
| kept_context_groups.append(cg) | |
| kept_window_seeds.append(seed_idx) | |
| for t in terms: | |
| covered_terms_global.add(t) | |
| if 'character_term_set' in locals() and t in character_term_set: | |
| covered_char_terms.add(t) | |
| dropped_windows_after_terms = len(context_groups) - len(kept_context_groups) | |
| if dropped_windows_after_terms: | |
| print(f"📑 Context windows removed after term-aware dedup: {dropped_windows_after_terms}") | |
| # Compute true total sentences emitted in kept windows | |
| total_window_sentences = 0 | |
| for ctx in kept_context_groups: | |
| # split on end marker to avoid counting it | |
| body = ctx.split('=== CONTEXT ')[0] | |
| # crude split by sentence separators | |
| total_window_sentences += len([s for s in re.split(r'[.!?。!?]+', body) if s.strip()]) | |
| print(f"📑 Final kept windows: {len(kept_context_groups)}, final kept sentences (within windows): {total_window_sentences}") | |
| filtered_text = '\n\n'.join(kept_context_groups) # Separate windows with double newline | |
| print(f"📑 Context-expanded text: {len(filtered_text):,} characters") | |
| else: | |
| # Even without gender context, add footer markers to preserve boundaries for chapter splitting | |
| context_groups = [] | |
| for idx, sent in enumerate(filtered_sentences, 1): | |
| context_groups.append(f"{sent}\n=== CONTEXT {idx} END ===") | |
| filtered_text = '\n\n'.join(context_groups) | |
| # Determine character count for summary (use dynamic-expansion tally when available) | |
| if include_all_characters and honorific_first_indices: | |
| character_term_count = len(honorific_first_indices) | |
| elif 'character_terms' in locals() and character_terms: | |
| character_term_count = len(set(character_terms)) | |
| # Calculate and display filtering statistics | |
| filter_end_time = time.time() | |
| filter_duration = filter_end_time - filter_start_time | |
| original_length = len(clean_text) | |
| filtered_text = _normalize_filtered_text(filtered_text) | |
| filtered_length = len(filtered_text) | |
| size_change_percent = ((original_length - filtered_length) / original_length * 100) if original_length > 0 else 0 | |
| print("📑 Applied post-filter text normalization to remove orphaned quotes and extra blank lines") | |
| print(f"\n📑 === FILTERING COMPLETE ===") | |
| print(f"📑 Duration: {filter_duration:.1f} seconds") | |
| if size_change_percent >= 0: | |
| print(f"📑 Text reduction: {original_length:,} → {filtered_length:,} chars ({size_change_percent:.1f}% reduction)") | |
| else: | |
| print(f"📑 Text expansion: {original_length:,} → {filtered_length:,} chars ({abs(size_change_percent):.1f}% expansion)") | |
| print(f"📑 Terms found: {len(frequent_terms):,} unique terms (min frequency: {min_frequency})") | |
| print(f"📑 Characters found (character-like terms): {character_term_count:,}") | |
| print(f"📑 Final output: {len(filtered_sentences)} sentences, {filtered_length:,} characters") | |
| print(f"📑 Performance: {(original_length / filter_duration / 1000):.1f}K chars/second") | |
| print(f"📑 ========================\n") | |
| return filtered_text, frequent_terms | |
| def _normalize_filtered_text(text: str) -> str: | |
| """Normalize filtered text by collapsing stray blank lines and orphaned quote lines.""" | |
| if not text: | |
| return text | |
| quote_open = {"“", "「", "『", "\""} | |
| quote_close = {"”", "」", "』", "\""} | |
| lines = text.replace("\r\n", "\n").split("\n") | |
| normalized_lines = [] | |
| i = 0 | |
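| # Single pass over lines: a line that is only a closing quote is glued onto the previous kept | |
| # line (after popping trailing blanks); a line that is only an opening quote is glued onto the | |
| # next non-empty line; everything else passes through unchanged. | |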
| while i < len(lines): | |
| line = lines[i] | |
| stripped = line.strip() | |
| if stripped in quote_close: | |
| # Remove trailing blank lines before attaching closing quote | |
| while normalized_lines and not normalized_lines[-1].strip(): | |
| normalized_lines.pop() | |
| if normalized_lines: | |
| normalized_lines[-1] = normalized_lines[-1].rstrip() + stripped | |
| else: | |
| normalized_lines.append(stripped) | |
| elif stripped in quote_open: | |
| j = i + 1 | |
| while j < len(lines) and not lines[j].strip(): | |
| j += 1 | |
| if j < len(lines): | |
| match = re.match(r"^(\s*)(.*)$", lines[j]) | |
| if match: | |
| leading, remainder = match.groups() | |
| lines[j] = f"{leading}{stripped}{remainder}" | |
| else: | |
| lines[j] = f"{stripped}{lines[j]}" | |
| else: | |
| normalized_lines.append(stripped) | |
| else: | |
| normalized_lines.append(line) | |
| i += 1 | |
| normalized_text = "\n".join(normalized_lines) | |
| normalized_text = re.sub(r"\n{3,}", "\n\n", normalized_text) | |
| normalized_text = re.sub(r"\n{2,}([”」』])", r"\n\1", normalized_text) | |
| normalized_text = re.sub(r"([“「『])\n{2,}", r"\1\n", normalized_text) | |
| normalized_text = re.sub(r"\n{2,}", "\n", normalized_text) | |
| return normalized_text | |
| def _extract_with_custom_prompt(custom_prompt, all_text, language, | |
| min_frequency, max_names, max_titles, | |
| existing_glossary, output_dir, | |
| strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all', max_sentences=200, log_callback=None, | |
| chunk_pos=None, total_chunks=None): | |
| """Extract glossary using custom AI prompt with proper filtering""" | |
| # Redirect stdout to GUI log if callback provided (but not in subprocess - worker handles it) | |
| import sys | |
| in_subprocess = hasattr(sys.stdout, 'queue') | |
| if log_callback and not in_subprocess: | |
| set_output_redirect(log_callback) | |
| print("📑 Using custom automatic glossary prompt") | |
| extraction_start = time.time() | |
| # Check stop flag | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary extraction stopped by user") | |
| return {} | |
| # Note: Filter mode can be controlled via the configurable prompt environment variable | |
| # No hardcoded filter instructions are added here | |
| try: | |
| MODEL = os.getenv("MODEL", "gemini-2.0-flash") | |
| API_KEY = (os.getenv("API_KEY") or | |
| os.getenv("OPENAI_API_KEY") or | |
| os.getenv("OPENAI_OR_Gemini_API_KEY") or | |
| os.getenv("GEMINI_API_KEY")) | |
| if is_traditional_translation_api(MODEL): | |
| # Pattern fallback disabled; traditional translation APIs can't run AI extraction. | |
| print("📑 Traditional translation API selected - skipping automatic glossary extraction (pattern fallback disabled)") | |
| return {} | |
| elif not API_KEY and not _model_uses_own_auth(MODEL): | |
| # Pattern fallback disabled; without an API key we can't run AI extraction. | |
| print("📑 No API key found - skipping automatic glossary extraction (pattern fallback disabled)") | |
| return {} | |
| else: | |
| print(f"📑 Using AI-assisted extraction with custom prompt") | |
| # Ensure multi-key config is available in this process if enabled | |
| _ensure_multi_key_config_loaded() | |
| from unified_api_client import UnifiedClient, UnifiedClientError | |
| client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) | |
| # Log glossary anti-duplicate parameters usage | |
| if os.getenv("GLOSSARY_ENABLE_ANTI_DUPLICATE", "0") == "1": | |
| ad_top_p = os.getenv("GLOSSARY_TOP_P", "1.0") | |
| ad_top_k = os.getenv("GLOSSARY_TOP_K", "0") | |
| ad_freq = os.getenv("GLOSSARY_FREQUENCY_PENALTY", "0.0") | |
| ad_pres = os.getenv("GLOSSARY_PRESENCE_PENALTY", "0.0") | |
| ad_rep = os.getenv("GLOSSARY_REPETITION_PENALTY", "1.0") | |
| print(f"🎯 Anti-duplicate enabled for glossary (top_p={ad_top_p}, top_k={ad_top_k}, freq_penalty={ad_freq}, presence_penalty={ad_pres}, repetition_penalty={ad_rep})") | |
| # Progress-bar labeling: when running chunked auto-glossary, give each in-flight call a unique name. | |
| # This drives the GUI watchdog tooltip "Active calls" list. | |
| progress_context = 'glossary' | |
| try: | |
| if chunk_pos is not None and total_chunks is not None: | |
| progress_context = f"auto glossary ({int(chunk_pos)}/{int(total_chunks)})" | |
| except Exception: | |
| progress_context = 'glossary' | |
| client.context = progress_context | |
| if hasattr(client, 'reset_cleanup_state'): | |
| client.reset_cleanup_state() | |
| # Apply thread submission delay using the client's method | |
| thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) | |
| if thread_delay > 0: | |
| client._apply_thread_submission_delay() | |
| # Check if cancelled during delay | |
| if hasattr(client, '_cancelled') and client._cancelled: | |
| print("📑 ❌ Glossary extraction stopped during delay") | |
| return {} | |
| # Check if text is already filtered (from chunking or cache) | |
| already_filtered = (os.getenv("_CHUNK_ALREADY_FILTERED", "0") == "1" or | |
| os.getenv("_TEXT_ALREADY_FILTERED", "0") == "1") | |
| if already_filtered: | |
| # print("📑 Text already filtered, skipping re-filtering") | |
| text_sample = all_text # Use as-is since it's already filtered | |
| detected_terms = {} | |
| else: | |
| # Apply smart filtering to reduce noise and focus on meaningful content | |
| force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER", "0") == "1" | |
| use_smart_filter = (os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1") and not force_disable | |
| if not use_smart_filter: | |
| # Smart filter disabled - send FULL text without any filtering or truncation | |
| print("📁 Smart filtering DISABLED by user - sending full text to API (this will be expensive!)") | |
| text_sample = all_text | |
| detected_terms = {} | |
| else: | |
| # Smart filter enabled - apply intelligent filtering | |
| print("📁 Applying smart text filtering to reduce noise...") | |
| # Use max_sentences parameter (passed from parent, already read from environment) | |
| print(f"🔍 [DEBUG] In _extract_with_custom_prompt: max_sentences={max_sentences}") | |
| text_sample, detected_terms = _filter_text_for_glossary(all_text, min_frequency, max_sentences) | |
| # If there is no content left, skip API call | |
| if not text_sample or not str(text_sample).strip(): | |
| print("📑 No text available after filtering - skipping automatic glossary generation") | |
| return {} | |
| # Replace placeholders in prompt | |
| # Get target language from environment (used in the prompt for translation output) | |
| target_language = os.getenv('GLOSSARY_TARGET_LANGUAGE', 'English') | |
| # Count context marker windows for the {marker} placeholder. Count them in text_sample, the | |
| # (possibly filtered) text actually sent to the API; the unfiltered all_text may not contain | |
| # any '=== CONTEXT N END ===' markers at all. | |
| marker_matches = re.findall(r"===\s*CONTEXT\s+\d+\s+END\s*===", text_sample or "") | |
| marker_count = len(marker_matches) | |
| system_prompt = custom_prompt.replace('{language}', target_language) | |
| system_prompt = system_prompt.replace('{min_frequency}', str(min_frequency)) | |
| system_prompt = system_prompt.replace('{max_names}', str(max_names)) | |
| system_prompt = system_prompt.replace('{max_titles}', str(max_titles)) | |
| system_prompt = system_prompt.replace('{marker}', str(marker_count)) | |
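| # Example: if the filtered text contains 12 '=== CONTEXT N END ===' markers, '{marker}' in | |
| # the prompt is replaced with '12'. | |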
| # Send system prompt and text as separate messages | |
| messages = [ | |
| {"role": "system", "content": system_prompt}, | |
| {"role": "user", "content": f"{text_sample}"} | |
| ] | |
| # Check stop before API call | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary extraction stopped before API call") | |
| return {} | |
| try: | |
| # Use glossary-specific temperature with fallback to global | |
| temperature = float(os.getenv("GLOSSARY_TEMPERATURE", os.getenv("TEMPERATURE", "0.3"))) | |
| # Use glossary-specific max output tokens with fallback to global | |
| max_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "4096"))) | |
| # Use send_with_interrupt for interruptible API call | |
| # Respect RETRY_TIMEOUT toggle - if disabled, use None for infinite timeout | |
| retry_env = os.getenv("RETRY_TIMEOUT") | |
| retry_timeout_enabled = retry_env is None or retry_env.strip().lower() not in ("0", "false", "off", "") | |
| chunk_timeout = None | |
| if retry_timeout_enabled: | |
| env_ct = os.getenv("CHUNK_TIMEOUT", "1800") | |
| try: | |
| ct_val = float(env_ct) | |
| chunk_timeout = None if ct_val <= 0 else ct_val | |
| except Exception: | |
| chunk_timeout = None | |
| print(f"📑 Sending AI extraction request (timeout: {chunk_timeout if chunk_timeout is not None else 'disabled'}s, interruptible)...") | |
| else: | |
| print(f"📑 Sending AI extraction request (timeout: disabled, interruptible)...") | |
| # Before API call | |
| api_start = time.time() | |
| print(f"📑 Preparing API request (text size: {len(text_sample):,} chars)...") | |
| print(f"📑 ⏳ Processing {len(text_sample):,} characters... Please wait, this may take 5-10 minutes") | |
| # Timeout retry logic (matches translation behavior) | |
| try: | |
| max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) | |
| except Exception: | |
| max_timeout_retries = 2 | |
| timeout_retry_count = 0 | |
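| # Retry loop: timeouts, cancellations, and client-init failures are retried up to | |
| # TIMEOUT_RETRY_ATTEMPTS times with a small staggered delay; an explicit user stop | |
| # returns an empty result immediately. | |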
| while True: | |
| try: | |
| response, finish_reason, raw_obj = send_with_interrupt( | |
| messages=messages, | |
| client=client, | |
| temperature=temperature, | |
| max_tokens=max_tokens, | |
| stop_check_fn=is_stop_requested, | |
| chunk_timeout=chunk_timeout, | |
| context=progress_context | |
| ) | |
| break | |
| except UnifiedClientError as e: | |
| error_msg = str(e) | |
| lower_msg = error_msg.lower() | |
| # Only treat an explicit user stop as an interrupt; timeouts/cancellations should retry | |
| user_stopped = ("stopped by user" in lower_msg) or ( | |
| is_stop_requested() and not any(k in lower_msg for k in ("timeout", "timed out", "cancelled")) | |
| ) | |
| if user_stopped: | |
| print(f"📑 ❌ AI extraction interrupted by user") | |
| return {} | |
| # Treat cancelled / client init errors as timeout retries | |
| is_timeout = ("timed out" in lower_msg) or ("timeout" in lower_msg) or ("cancelled" in lower_msg) or ("client not initialized" in lower_msg) | |
| if is_timeout and timeout_retry_count < max_timeout_retries: | |
| timeout_retry_count += 1 | |
| if chunk_timeout: | |
| print(f"⚠️ AI extraction timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| else: | |
| print(f"⚠️ AI extraction timed out, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| # Clear cancellation flags that timeouts may have set | |
| try: | |
| client.reset_cleanup_state() | |
| except Exception: | |
| pass | |
| try: | |
| # Also clear class-level global cancellation for all clients | |
| client.__class__.set_global_cancellation(False) | |
| except Exception: | |
| pass | |
| # Reinitialize client if needed | |
| client_type = getattr(client, 'client_type', 'unknown') | |
| needs_reinit = False | |
| if client_type == 'gemini': | |
| needs_reinit = hasattr(client, 'gemini_client') and client.gemini_client is None | |
| elif client_type == 'openai': | |
| needs_reinit = hasattr(client, 'openai_client') and client.openai_client is None | |
| if needs_reinit: | |
| try: | |
| print(f" 🔄 Reinitializing {client_type} client...") | |
| client._setup_client() | |
| except Exception as reinit_err: | |
| print(f" ⚠️ Failed to reinitialize client: {reinit_err}") | |
| # Stagger retries to avoid simultaneous API calls | |
| try: | |
| import random | |
| base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) | |
| retry_delay = random.uniform(base_delay / 2, base_delay) | |
| print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") | |
| time.sleep(retry_delay) | |
| except Exception: | |
| time.sleep(1.0) | |
| continue | |
| else: | |
| raise | |
| api_time = time.time() - api_start | |
| print(f"📑 API call completed in {api_time:.1f}s") | |
| # Get the actual text from the response | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Before processing response | |
| process_start = time.time() | |
| # print(f"📑 Processing AI response...") | |
| # Process response and build CSV | |
| csv_lines = _process_ai_response(response_text, all_text, min_frequency, | |
| strip_honorifics, fuzzy_threshold, | |
| language, filter_mode) | |
| print(f"📑 AI extracted {len(csv_lines) - 1} valid terms (header excluded)") | |
| process_time = time.time() - process_start | |
| # print(f"📑 Response processing took {process_time:.1f}s") | |
| # If we're running per-chunk, defer all heavy work and saving | |
| if os.getenv("GLOSSARY_DEFER_SAVE", "0") == "1": | |
| return csv_lines | |
| # Check stop before merging | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped before merging") | |
| return {} | |
| # Merge with existing glossary if present | |
| if existing_glossary: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language) | |
| # Always inject the book title BEFORE any deduplication or filtering so it | |
| # survives the first run (previously only happened after a second run/merge) | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| csv_lines = _ensure_book_title_csv_lines(csv_lines) | |
| print("📚 Book title injected before dedup (single-shot glossary path)") | |
| # Fuzzy matching deduplication | |
| skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1" | |
| if not skip_frequency_check: # Only dedupe if we're checking frequencies | |
| # Time the deduplication | |
| dedup_start = time.time() | |
| original_count = len(csv_lines) - 1 # Exclude header | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| dedup_time = time.time() - dedup_start | |
| final_count = len(csv_lines) - 1 # Exclude header | |
| removed_count = original_count - final_count | |
| print(f"📑 Deduplication completed in {dedup_time:.1f}s") | |
| print(f"📑 - Original entries: {original_count}") | |
| print(f"📑 - Duplicates removed: {removed_count}") | |
| print(f"📑 - Final entries: {final_count}") | |
| # Store for summary statistics | |
| _dedup_time = dedup_time | |
| else: | |
| print(f"📑 Skipping deduplication (frequency check disabled)") | |
| # Apply filter mode to final results | |
| csv_lines = _filter_csv_by_mode(csv_lines, filter_mode) | |
| # Check if we should use token-efficient format | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| # Convert to token-efficient format | |
| csv_lines = _convert_to_token_efficient_format(csv_lines) | |
| # Final sanitize to prevent stray headers | |
| csv_lines = _sanitize_final_glossary_lines(csv_lines, use_legacy_format) | |
| # Create final CSV content | |
| csv_content = '\n'.join(csv_lines) | |
| # Save glossary as CSV with proper extension | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| print(f"\n📑 ✅ AI-ASSISTED GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| c_count, t_count, total = _count_glossary_entries(csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| total_time = time.time() - extraction_start | |
| print(f"📑 Total extraction time: {total_time:.1f}s") | |
| return _parse_csv_to_dict(csv_content) | |
| except UnifiedClientError as e: | |
| if "stopped by user" in str(e).lower(): | |
| print(f"📑 ❌ AI extraction interrupted by user") | |
| return {} | |
| else: | |
| print(f"⚠️ AI extraction failed: {e}") | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ AI extraction failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ Custom prompt processing failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| def _filter_csv_by_mode(csv_lines, filter_mode): | |
| """Filter CSV lines based on the filter mode""" | |
| if filter_mode == "all": | |
| return csv_lines | |
| filtered = [csv_lines[0]] # Keep header | |
| for line in csv_lines[1:]: | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| if filter_mode == "only_with_honorifics": | |
| # Only keep character entries with honorifics | |
| if entry_type == "character" and _has_honorific(raw_name): | |
| filtered.append(line) | |
| elif filter_mode == "only_without_honorifics": | |
| # Keep terms and characters without honorifics | |
| if entry_type == "term" or (entry_type == "character" and not _has_honorific(raw_name)): | |
| filtered.append(line) | |
| print(f"📑 Filter '{filter_mode}': {len(filtered)-1} entries kept from {len(csv_lines)-1}") | |
| return filtered | |
| def _process_ai_response(response_text, all_text, min_frequency, | |
| strip_honorifics, fuzzy_threshold, language, filter_mode): | |
| """Process AI response and return CSV lines""" | |
| # Check if gender context and description are enabled (used throughout the function) | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| # option to completely skip frequency validation for speed | |
| skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" | |
| # if skip_all_validation: | |
| # print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") | |
| # Clean response text | |
| response_text = response_text.strip() | |
| # Remove string representation artifacts if they wrap the entire response | |
| if response_text.startswith('("') and response_text.endswith('")'): | |
| response_text = response_text[2:-2] | |
| elif response_text.startswith('"') and response_text.endswith('"'): | |
| response_text = response_text[1:-1] | |
| elif response_text.startswith('(') and response_text.endswith(')'): | |
| response_text = response_text[1:-1] | |
| # Unescape the string | |
| response_text = response_text.replace('\\n', '\n') | |
| response_text = response_text.replace('\\r', '') | |
| response_text = response_text.replace('\\t', '\t') | |
| response_text = response_text.replace('\\"', '"') | |
| response_text = response_text.replace("\\'", "'") | |
| response_text = response_text.replace('\\\\', '\\') | |
| # Clean up markdown code blocks if present | |
| if '```' in response_text: | |
| parts = response_text.split('```') | |
| for part in parts: | |
| if 'csv' in part[:10].lower(): | |
| response_text = part[part.find('\n')+1:] | |
| break | |
| elif part.strip() and ('type,raw_name' in part or 'character,' in part or 'term,' in part): | |
| response_text = part | |
| break | |
| # Normalize line endings | |
| response_text = response_text.replace('\r\n', '\n').replace('\r', '\n') | |
| lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()] | |
| import csv | |
| # --- Dynamic header capture: accept every column the AI returns --- | |
| dynamic_header = None | |
| dynamic_rows = [] | |
| for ln in lines: | |
| low = ln.lower() | |
| if 'type' in low and 'raw_name' in low: | |
| try: | |
| dynamic_header = [c.strip() for c in next(csv.reader([ln])) if c.strip()] | |
| except Exception: | |
| dynamic_header = [c.strip() for c in ln.split(',') if c.strip()] | |
| continue | |
| if dynamic_header: | |
| try: | |
| dynamic_rows.append(next(csv.reader([ln]))) | |
| except Exception: | |
| dynamic_rows.append([c.strip() for c in ln.split(',')]) | |
| if dynamic_header: | |
| required = {h.lower(): i for i, h in enumerate(dynamic_header)} | |
| if all(k in required for k in ('type', 'raw_name', 'translated_name')): | |
| csv_lines = [','.join(dynamic_header)] | |
| for row in dynamic_rows: | |
| if len(row) < len(dynamic_header): | |
| row += [''] * (len(dynamic_header) - len(row)) | |
| elif len(row) > len(dynamic_header): | |
| desc_idx = required.get('description') | |
| if desc_idx is not None and desc_idx < len(dynamic_header): | |
| row = row[:desc_idx] + [','.join(row[desc_idx:])] | |
| else: | |
| row = row[:len(dynamic_header)] | |
| # Clean stop tokens | |
| row = ['' if cell in ("'stop'", "stop") else cell for cell in row] | |
| entry_type = row[required['type']].strip() if len(row) > required['type'] else '' | |
| raw_name = row[required['raw_name']].strip() if len(row) > required['raw_name'] else '' | |
| translated_name = row[required['translated_name']].strip() if len(row) > required['translated_name'] else '' | |
| if not raw_name or not translated_name: | |
| continue | |
| csv_lines.append(','.join(row[:len(dynamic_header)])) | |
| if csv_lines: | |
| print(f"📑 Dynamic header detected from AI: {dynamic_header}") | |
| return csv_lines | |
| csv_lines = [] | |
| header_found = False | |
| # Post-response min_frequency filtering is disabled (accept all AI rows); | |
| # skip_frequency_check forced true to bypass frequency gating. | |
| skip_frequency_check = True | |
| # Add option to completely skip ALL validation for maximum speed | |
| skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" | |
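| # Fast path: when GLOSSARY_SKIP_ALL_VALIDATION=1, every well-formed AI row is accepted as-is | |
| # (only malformed tuple/list/quote-wrapped rows are rejected) and no frequency checks run. | |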
| if skip_all_validation: | |
| # print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") | |
| # Use appropriate header based on gender and description settings | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") | |
| # print("📑 Fast mode: Using 4-column format with gender") | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") | |
| # Process the AI response | |
| for line in lines: | |
| # Skip header lines | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue | |
| # Parse CSV line | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| if include_description and len(parts) >= 5: | |
| # Has all 5 columns (with gender and description) | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender},{description}") | |
| elif include_gender_context and len(parts) >= 4: | |
| # Has all 4 columns (with gender) | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender}") | |
| elif len(parts) >= 3: | |
| # Has at least 3 columns | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| if include_description: | |
| # Add empty gender and description columns when 5 columns expected | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender},{description}") | |
| elif include_gender_context: | |
| # Add empty gender column for 3-column entries when 4 columns expected | |
| gender = parts[3] if len(parts) > 3 else '' | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender}") | |
| else: | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif len(parts) == 2: | |
| # Missing type, default to 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| if include_description: | |
| csv_lines.append(f"term,{raw_name},{translated_name},,") | |
| elif include_gender_context: | |
| csv_lines.append(f"term,{raw_name},{translated_name},") | |
| else: | |
| csv_lines.append(f"term,{raw_name},{translated_name}") | |
| # print(f"📑 Fast mode: Accepted {len(csv_lines) - 1} entries without validation") | |
| return csv_lines | |
| # For "only_with_honorifics" mode, ALWAYS skip frequency check | |
| if filter_mode == "only_with_honorifics": | |
| skip_frequency_check = True | |
| print("📑 Filter mode 'only_with_honorifics': Bypassing frequency checks") | |
| print(f'📑 Processing {len(lines)} lines from AI response...') | |
| # print(f'📑 Text corpus size: {len(all_text):,} chars') | |
| # print(f'📑 Frequency checking: DISABLED (post-response min_frequency bypassed)') | |
| # print(f'📑 Fuzzy threshold: {fuzzy_threshold}') | |
| # Collect all terms first for batch processing | |
| all_terms_to_check = [] | |
| term_info_map = {} # Map term to its full info | |
| if not skip_frequency_check: | |
| # First pass: collect all terms that need frequency checking | |
| for line in lines: | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue # Skip header | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| # Strip orphaned quotes and filter empty columns | |
| parts = [p.strip('"').strip("'").strip() for p in parts] | |
| parts = [p for p in parts if p] # Remove empty strings | |
| if len(parts) >= 3: | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| elif len(parts) == 2: | |
| entry_type = 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| gender = '' | |
| description = '' | |
| else: | |
| continue | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if not raw_name or not translated_name: | |
| continue | |
| if (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"')) or | |
| raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"'))): | |
| continue | |
| if raw_name and translated_name: | |
| # Store for batch processing | |
| original_raw = raw_name | |
| if strip_honorifics: | |
| raw_name = _strip_honorific(raw_name, language) | |
| all_terms_to_check.append(raw_name) | |
| term_info_map[raw_name] = { | |
| 'entry_type': entry_type, | |
| 'original_raw': original_raw, | |
| 'translated_name': translated_name, | |
| 'gender': gender, | |
| 'description': description, | |
| 'line': line | |
| } | |
| # Batch compute all frequencies at once | |
| if all_terms_to_check: | |
| print(f"📑 Computing frequencies for {len(all_terms_to_check)} terms...") | |
| term_frequencies = _batch_compute_frequencies( | |
| all_terms_to_check, all_text, fuzzy_threshold, min_frequency | |
| ) | |
| else: | |
| term_frequencies = {} | |
| # Now process the results using pre-computed frequencies | |
| entries_processed = 0 | |
| entries_accepted = 0 | |
| # Process based on mode | |
| if filter_mode == "only_with_honorifics" or skip_frequency_check: | |
| # For these modes, accept all entries | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") # Header with description | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") # Header with gender | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") # Header | |
| for line in lines: | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue # Skip header | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| # Strip orphaned quotes and filter empty columns | |
| parts = [p.strip('"').strip("'").strip() for p in parts] | |
| parts = [p for p in parts if p] # Remove empty strings | |
| if len(parts) >= 3: | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| elif len(parts) == 2: | |
| entry_type = 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| gender = '' | |
| description = '' | |
| else: | |
| continue | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if not raw_name or not translated_name: | |
| continue | |
| if (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"')) or | |
| raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"'))): | |
| continue | |
| if raw_name and translated_name: | |
| if include_description: | |
| csv_line = f"{entry_type},{raw_name},{translated_name},{gender},{description}" | |
| elif include_gender_context: | |
| csv_line = f"{entry_type},{raw_name},{translated_name},{gender}" | |
| else: | |
| csv_line = f"{entry_type},{raw_name},{translated_name}" | |
| csv_lines.append(csv_line) | |
| entries_accepted += 1 | |
| print(f"📑 Accepted {entries_accepted} entries (frequency check disabled)") | |
| else: | |
| # Use pre-computed frequencies | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") # Header with description | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") # Header with gender | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") # Header | |
| for term, info in term_info_map.items(): | |
| count = term_frequencies.get(term, 0) | |
| # Also check original form if it was stripped | |
| if info['original_raw'] != term: | |
| count += term_frequencies.get(info['original_raw'], 0) | |
| if count >= min_frequency: | |
| if include_description: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']},{info['gender']},{info['description']}" | |
| elif include_gender_context: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']},{info['gender']}" | |
| else: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']}" | |
| csv_lines.append(csv_line) | |
| entries_accepted += 1 | |
| # Log first few examples | |
| if entries_accepted <= 5: | |
| print(f"📑 ✓ Example: {term} -> {info['translated_name']} (freq: {count})") | |
| print(f"📑 Frequency filtering complete: {entries_accepted}/{len(term_info_map)} terms accepted") | |
| # Ensure we have at least the header | |
| if len(csv_lines) == 0: | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") | |
| # Print final summary | |
| print(f"📑 Processing complete: {entries_accepted} terms accepted") | |
| return csv_lines | |
| def _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold): | |
| """Apply advanced fuzzy matching to remove duplicate entries from the glossary with stop flag checks | |
| Uses a 2-pass approach: | |
| Pass 1: Remove entries with similar raw names (existing logic) | |
| Pass 2: Remove entries with identical translated names (new logic) | |
| """ | |
| from difflib import SequenceMatcher | |
| # Try to import advanced libraries | |
| try: | |
| from rapidfuzz import fuzz as rfuzz | |
| use_rapidfuzz = True | |
| except ImportError: | |
| use_rapidfuzz = False | |
| try: | |
| import jellyfish | |
| use_jellyfish = True | |
| except ImportError: | |
| use_jellyfish = False | |
| algo_info = [] | |
| if use_rapidfuzz: | |
| algo_info.append("RapidFuzz") | |
| if use_jellyfish: | |
| algo_info.append("Jaro-Winkler") | |
| if not algo_info: | |
| algo_info.append("difflib") | |
| # Check if translated name deduplication is enabled | |
| # GLOSSARY_DEDUPE_TRANSLATIONS: "1" = enable Pass 2 (remove entries with identical translations) | |
| # : "0" = disable Pass 2 (only remove entries with similar raw names) | |
| dedupe_translations = os.getenv("GLOSSARY_DEDUPE_TRANSLATIONS", "1") == "1" | |
| print(f"📋 Applying 2-pass fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| print(f"📋 Pass 1: Raw name deduplication (fuzzy matching)") | |
| if dedupe_translations: | |
| print(f"📋 Pass 2: Translated name deduplication (exact matching)") | |
| else: | |
| print(f"📋 Pass 2: DISABLED (GLOSSARY_DEDUPE_TRANSLATIONS=0)") | |
| print(f"📋 Using algorithms: {', '.join(algo_info)}") | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Deduplication stopped by user") | |
| return csv_lines | |
| header_line = csv_lines[0] # Keep header | |
| entry_lines = csv_lines[1:] # Data lines | |
| original_count = len(entry_lines) | |
| print(f"📑 Starting deduplication with {original_count} entries...") | |
| # PASS 1: Raw name deduplication (existing fuzzy matching logic) | |
| print(f"📑 🔄 PASS 1: Raw name deduplication...") | |
| pass1_results = _deduplicate_pass1_raw_names( | |
| entry_lines, fuzzy_threshold, use_rapidfuzz, use_jellyfish | |
| ) | |
| pass1_count = len(pass1_results) | |
| pass1_removed = original_count - pass1_count | |
| print(f"📑 ✅ PASS 1 complete: {pass1_removed} duplicates removed ({pass1_count} remaining)") | |
| # PASS 2: Translated name deduplication (if enabled) | |
| if dedupe_translations: | |
| print(f"📑 🔄 PASS 2: Translated name deduplication...") | |
| final_results, replaced_count = _deduplicate_pass2_translated_names(pass1_results) | |
| pass2_removed = pass1_count - len(final_results) | |
| replaced_msg = f" ({replaced_count} replaced with more complete entries)" if replaced_count > 0 else "" | |
| print(f"📑 ✅ PASS 2 complete: {pass2_removed} duplicates removed{replaced_msg} ({len(final_results)} remaining)") | |
| total_removed = pass1_removed + pass2_removed | |
| else: | |
| final_results = pass1_results | |
| total_removed = pass1_removed | |
| print(f"📑 ⏭️ PASS 2 skipped (translation deduplication disabled)") | |
| # Rebuild CSV with header | |
| deduplicated = [header_line] + final_results | |
| print(f"📑 ✅ Total deduplication complete: {total_removed} duplicates removed") | |
| print(f"📑 Final glossary size: {len(final_results)} unique entries") | |
| return deduplicated | |
| def _deduplicate_pass1_raw_names(entry_lines, fuzzy_threshold, use_rapidfuzz, use_jellyfish): | |
| """Pass 1: Remove entries with similar raw names using fuzzy matching""" | |
| from difflib import SequenceMatcher | |
| if use_rapidfuzz: | |
| from rapidfuzz import fuzz as rfuzz | |
| if use_jellyfish: | |
| import jellyfish | |
| deduplicated = [] | |
| seen_entries = {} # raw_name -> (entry_type, translated_name) | |
| seen_names_lower = set() # Quick exact match check | |
| removed_count = 0 | |
| total_entries = len(entry_lines) | |
| for idx, line in enumerate(entry_lines): | |
| # Check stop flag every 100 entries | |
| if idx > 0 and idx % 100 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Pass 1 stopped at entry {idx}/{total_entries}") | |
| break | |
| # Show progress for large glossaries | |
| if total_entries > 500 and idx % 200 == 0: | |
| progress = (idx / total_entries) * 100 | |
| print(f"📑 Pass 1 progress: {progress:.1f}% ({idx}/{total_entries})") | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| raw_name_lower = raw_name.lower() | |
| # Fast exact duplicate check first | |
| if raw_name_lower in seen_names_lower: | |
| removed_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| print(f"📋 Pass 1: Removing exact duplicate: '{raw_name}'") | |
| continue | |
| # For fuzzy matching, only check if threshold is less than 1.0 | |
| is_duplicate = False | |
| if fuzzy_threshold < 1.0: | |
| # Use a more efficient approach: only check similar length strings | |
| name_len = len(raw_name) | |
| min_len = int(name_len * 0.7) | |
| max_len = int(name_len * 1.3) | |
| # Only compare with entries of similar length | |
| candidates = [] | |
| for seen_name, (seen_type, seen_trans) in seen_entries.items(): | |
| if min_len <= len(seen_name) <= max_len: | |
| candidates.append(seen_name) | |
| # Check fuzzy similarity with candidates using multiple algorithms | |
| for seen_name in candidates: | |
| # Quick character overlap check before expensive comparison | |
| char_overlap = len(set(raw_name_lower) & set(seen_name.lower())) | |
| if char_overlap < len(raw_name_lower) * 0.5: | |
| continue # Too different, skip | |
| # Try multiple algorithms and take the best score | |
| scores = [] | |
| if use_rapidfuzz: | |
| # RapidFuzz basic ratio | |
| scores.append(rfuzz.ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| # Token sort (handles word order) | |
| try: | |
| scores.append(rfuzz.token_sort_ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| except Exception: | |
| pass | |
| # Partial ratio (substring) | |
| try: | |
| scores.append(rfuzz.partial_ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| except Exception: | |
| pass | |
| else: | |
| # Fallback to difflib | |
| scores.append(SequenceMatcher(None, raw_name_lower, seen_name.lower()).ratio()) | |
| # Try Jaro-Winkler (better for names) | |
| if use_jellyfish: | |
| try: | |
| jaro = jellyfish.jaro_winkler_similarity(raw_name, seen_name) | |
| scores.append(jaro) | |
| except Exception: | |
| pass | |
| # Take best score | |
| best_similarity = max(scores) if scores else 0.0 | |
| if best_similarity >= fuzzy_threshold: | |
| if removed_count < 10: # Only log first few | |
| print(f"📋 Pass 1: Removing fuzzy duplicate: '{raw_name}' ~= '{seen_name}' (score: {best_similarity:.2%})") | |
| removed_count += 1 | |
| is_duplicate = True | |
| break | |
| if not is_duplicate: | |
| seen_entries[raw_name] = (entry_type, translated_name) | |
| seen_names_lower.add(raw_name_lower) | |
| deduplicated.append(line) | |
| return deduplicated | |
| def _deduplicate_pass2_translated_names(entry_lines): | |
| """Pass 2: Remove entries with identical translated names""" | |
| deduplicated = [] | |
| seen_translations = {} # translated_name.lower() -> (raw_name, line) | |
| removed_count = 0 | |
| replaced_count = 0 | |
| for line in entry_lines: | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| translated_lower = translated_name.lower().strip() | |
| # Skip empty translations | |
| if not translated_lower: | |
| deduplicated.append(line) | |
| continue | |
| # Check if we've seen this translation before | |
| if translated_lower in seen_translations: | |
| existing_raw, existing_line = seen_translations[translated_lower] | |
| # Get the existing translated name from the line | |
| existing_parts = existing_line.split(',') | |
| existing_translated = existing_parts[2] if len(existing_parts) >= 3 else translated_name | |
| # Count fields in both entries (more fields = higher priority) | |
| current_field_count = len([f.strip() for f in parts if f.strip()]) | |
| existing_field_count = len([f.strip() for f in existing_parts if f.strip()]) | |
| # If current entry has more fields, replace the existing one | |
| if current_field_count > existing_field_count: | |
| # Remove existing entry from deduplicated list | |
| deduplicated = [l for l in deduplicated if l != existing_line] | |
| # Replace with current entry | |
| seen_translations[translated_lower] = (raw_name, line) | |
| deduplicated.append(line) | |
| removed_count += 1 | |
| replaced_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| print(f"📋 Pass 2: Replacing '{existing_raw}' -> '{existing_translated}' ({existing_field_count} fields) with '{raw_name}' -> '{translated_name}' ({current_field_count} fields) - more detailed entry") | |
| else: | |
| # Keep existing entry (has same or more fields) | |
| removed_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| extra_info = f" ({current_field_count} vs {existing_field_count} fields)" if current_field_count != existing_field_count else "" | |
| print(f"📋 Pass 2: Removing '{raw_name}' -> '{translated_name}' (duplicate translation of '{existing_raw}' -> '{existing_translated}'){extra_info}") | |
| else: | |
| # New translation, keep it | |
| seen_translations[translated_lower] = (raw_name, line) | |
| deduplicated.append(line) | |
| return deduplicated, replaced_count | |
| def _merge_csv_entries(new_csv_lines, existing_glossary, strip_honorifics, language): | |
| """Merge CSV entries with existing glossary with stop flag checks""" | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Glossary merge stopped by user") | |
| return new_csv_lines | |
| # Parse existing glossary | |
| existing_lines = [] | |
| existing_names = set() | |
| if isinstance(existing_glossary, str): | |
| # Already CSV format | |
| lines = existing_glossary.strip().split('\n') | |
| total_lines = len(lines) | |
| for idx, line in enumerate(lines): | |
| # Check stop flag every 50 lines | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while processing existing glossary at line {idx}/{total_lines}") | |
| return new_csv_lines | |
| if total_lines > 200: | |
| progress = (idx / total_lines) * 100 | |
| print(f"📑 Processing existing glossary: {progress:.1f}%") | |
| if 'type,raw_name' in line.lower(): | |
| continue # Skip header | |
| line_stripped = line.strip() | |
| # Skip token-efficient lines and section/bullet markers | |
| if not line_stripped or line_stripped.startswith('===') or line_stripped.startswith('*') or line_stripped.lower().startswith('glossary:'): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| # Require at least 3 fields (type, raw_name, translated_name) | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0].strip().lower() | |
| # Only accept reasonable type tokens (letters/underscores only); re is already imported at module level | |
| if not re.match(r'^[a-z_]+$', entry_type): | |
| continue | |
| raw_name = parts[1] | |
| if strip_honorifics: | |
| raw_name = _strip_honorific(raw_name, language) | |
| parts[1] = raw_name | |
| if raw_name not in existing_names: | |
| existing_lines.append(','.join(parts)) | |
| existing_names.add(raw_name) | |
| # Check stop flag before processing new names | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped before processing new entries") | |
| return new_csv_lines | |
| # Get new names | |
| new_names = set() | |
| final_lines = [] | |
| for idx, line in enumerate(new_csv_lines): | |
| # Check stop flag every 50 lines | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while processing new entries at line {idx}") | |
| return final_lines if final_lines else new_csv_lines | |
| if 'type,raw_name' in line.lower(): | |
| final_lines.append(line) # Keep header | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 2: | |
| new_names.add(parts[1]) | |
| final_lines.append(line) | |
| # Check stop flag before adding existing entries | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped before combining entries") | |
| return final_lines | |
| # Add non-duplicate existing entries | |
| added_count = 0 | |
| for idx, line in enumerate(existing_lines): | |
| # Check stop flag every 50 additions | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while adding existing entries ({added_count} added)") | |
| return final_lines | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 2 and parts[1] not in new_names: | |
| final_lines.append(line) | |
| added_count += 1 | |
| print(f"📑 Merged {added_count} entries from existing glossary") | |
| return final_lines | |
| def _extract_with_patterns(all_text, language, min_frequency, | |
| max_names, max_titles, batch_size, | |
| existing_glossary, output_dir, | |
| strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'): | |
| """Extract glossary using pattern matching with true CSV format output and stop flag checks""" | |
| print("📑 Using pattern-based extraction") | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print("📑 ❌ Pattern-based extraction stopped by user") | |
| return {} | |
| def is_valid_name(name, language_hint='unknown'): | |
| """Strict validation for proper names only""" | |
| if not name or len(name.strip()) < 1: | |
| return False | |
| name = name.strip() | |
| if name.lower() in PM.COMMON_WORDS or name in PM.COMMON_WORDS: | |
| return False | |
| if language_hint == 'korean': | |
| if not (2 <= len(name) <= 4): | |
| return False | |
| if not all(0xAC00 <= ord(char) <= 0xD7AF for char in name): | |
| return False | |
| if len(set(name)) == 1: | |
| return False | |
| elif language_hint == 'japanese': | |
| if not (2 <= len(name) <= 6): | |
| return False | |
| has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in name) | |
| has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in name) | |
| if not (has_kanji or has_kana): | |
| return False | |
| elif language_hint == 'chinese': | |
| if not (2 <= len(name) <= 4): | |
| return False | |
| if not all(0x4E00 <= ord(char) <= 0x9FFF for char in name): | |
| return False | |
| elif language_hint == 'english': | |
| if not name[0].isupper(): | |
| return False | |
| if sum(1 for c in name if c.isalpha()) < len(name) * 0.8: | |
| return False | |
| if not (2 <= len(name) <= 20): | |
| return False | |
| return True | |
| def detect_language_hint(text_sample): | |
| """Quick language detection for validation purposes""" | |
| sample = text_sample[:1000] | |
| korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) | |
| japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) | |
| chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) | |
| latin_chars = sum(1 for char in sample if ('A' <= char <= 'Z') or ('a' <= char <= 'z')) | |
| if korean_chars > 50: | |
| return 'korean' | |
| elif japanese_kana > 20: | |
| return 'japanese' | |
| elif chinese_chars > 50 and japanese_kana < 10: | |
| return 'chinese' | |
| elif latin_chars > 100: | |
| return 'english' | |
| else: | |
| return 'unknown' | |
| language_hint = detect_language_hint(all_text) | |
| print(f"📑 Detected primary language: {language_hint}") | |
| # Check stop flag after language detection | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped after language detection") | |
| return {} | |
| honorifics_to_use = [] | |
| if language_hint in PM.CJK_HONORIFICS: | |
| honorifics_to_use.extend(PM.CJK_HONORIFICS[language_hint]) | |
| honorifics_to_use.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| print(f"📑 Using {len(honorifics_to_use)} honorifics for {language_hint}") | |
| names_with_honorifics = {} | |
| standalone_names = {} | |
| # Check if parallel processing is enabled | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
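| # The parallel branch below runs only when EXTRACTION_WORKERS > 1 and more than 3 honorifics are in use; | |
| # otherwise honorifics are scanned sequentially. | |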
| # PARALLEL HONORIFIC PROCESSING | |
| if extraction_workers > 1 and len(honorifics_to_use) > 3: | |
| print(f"📑 Scanning for names with honorifics (parallel with {extraction_workers} workers)...") | |
| # Create a wrapper function that can be called in parallel | |
| def process_honorific(args): | |
| """Process a single honorific in a worker thread""" | |
| honorific, idx, total = args | |
| # Check stop flag | |
| if is_stop_requested(): | |
| return None, None | |
| print(f"📑 Worker processing honorific {idx}/{total}: '{honorific}'") | |
| # Local dictionaries for this worker | |
| local_names_with = {} | |
| local_standalone = {} | |
| # Call the extraction method | |
| _extract_names_for_honorific( | |
| honorific, all_text, language_hint, | |
| min_frequency, local_names_with, | |
| local_standalone, is_valid_name, fuzzy_threshold | |
| ) | |
| return local_names_with, local_standalone | |
| # Prepare arguments for parallel processing | |
| honorific_args = [ | |
| (honorific, idx + 1, len(honorifics_to_use)) | |
| for idx, honorific in enumerate(honorifics_to_use) | |
| ] | |
| # Process honorifics in parallel | |
| with ThreadPoolExecutor(max_workers=min(extraction_workers, len(honorifics_to_use))) as executor: | |
| futures = [] | |
| for args in honorific_args: | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return {} | |
| future = executor.submit(process_honorific, args) | |
| futures.append(future) | |
| # Collect results as they complete | |
| completed = 0 | |
| for future in as_completed(futures): | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return {} | |
| try: | |
| result = future.result() | |
| if result and result[0] is not None: | |
| local_names_with, local_standalone = result | |
| # Merge results (thread-safe since we're in main thread) | |
| for name, count in local_names_with.items(): | |
| if name not in names_with_honorifics: | |
| names_with_honorifics[name] = count | |
| else: | |
| names_with_honorifics[name] = max(names_with_honorifics[name], count) | |
| for name, count in local_standalone.items(): | |
| if name not in standalone_names: | |
| standalone_names[name] = count | |
| else: | |
| standalone_names[name] = max(standalone_names[name], count) | |
| completed += 1 | |
| if completed % 5 == 0 or completed == len(honorifics_to_use): | |
| print(f"📑 Honorific processing: {completed}/{len(honorifics_to_use)} completed") | |
| except Exception as e: | |
| print(f"⚠️ Failed to process honorific: {e}") | |
| completed += 1 | |
| print(f"📑 Parallel honorific processing completed: found {len(names_with_honorifics)} names") | |
| else: | |
| # SEQUENTIAL PROCESSING (fallback) | |
| print("📑 Scanning for names with honorifics...") | |
| # Extract names with honorifics | |
| total_honorifics = len(honorifics_to_use) | |
| for idx, honorific in enumerate(honorifics_to_use): | |
| # Check stop flag before each honorific | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Extraction stopped at honorific {idx}/{total_honorifics}") | |
| return {} | |
| print(f"📑 Processing honorific {idx + 1}/{total_honorifics}: '{honorific}'") | |
| _extract_names_for_honorific(honorific, all_text, language_hint, | |
| min_frequency, names_with_honorifics, | |
| standalone_names, is_valid_name, fuzzy_threshold) | |
| # Check stop flag before processing terms | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before processing terms") | |
| return {} | |
| # Apply filter mode | |
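| # filter_mode is one of 'only_with_honorifics', 'only_without_honorifics', or 'all' | |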
| filtered_names = {} | |
| if filter_mode == 'only_with_honorifics': | |
| # Only keep names that have honorifics (no standalone names) | |
| filtered_names = names_with_honorifics.copy() | |
| print(f"📑 Filter: Keeping only names with honorifics ({len(filtered_names)} names)") | |
| elif filter_mode == 'only_without_honorifics': | |
| # Keep standalone names that were NOT found with honorifics | |
| for name, count in standalone_names.items(): | |
| # Check if this name also appears with honorifics | |
| appears_with_honorific = False | |
| for honorific_name in names_with_honorifics.keys(): | |
| if _strip_honorific(honorific_name, language_hint) == name: | |
| appears_with_honorific = True | |
| break | |
| # Only add if it doesn't appear with honorifics | |
| if not appears_with_honorific: | |
| filtered_names[name] = count | |
| print(f"📑 Filter: Keeping only names without honorifics ({len(filtered_names)} names)") | |
| else: # 'all' mode | |
| # Keep all names (both with and without honorifics) | |
| filtered_names = names_with_honorifics.copy() | |
| # Also add standalone names | |
| for name, count in standalone_names.items(): | |
| if name not in filtered_names and not any( | |
| _strip_honorific(n, language_hint) == name for n in filtered_names.keys() | |
| ): | |
| filtered_names[name] = count | |
| print(f"📑 Filter: Keeping all names ({len(filtered_names)} names)") | |
| # Process extracted terms | |
| final_terms = {} | |
| term_count = 0 | |
| total_terms = len(filtered_names) | |
| for term, count in filtered_names.items(): | |
| term_count += 1 | |
| # Check stop flag every 20 terms | |
| if term_count % 20 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Term processing stopped at {term_count}/{total_terms}") | |
| return {} | |
| if strip_honorifics: | |
| clean_term = _strip_honorific(term, language_hint) | |
| if clean_term in final_terms: | |
| final_terms[clean_term] = final_terms[clean_term] + count | |
| else: | |
| final_terms[clean_term] = count | |
| else: | |
| final_terms[term] = count | |
| # Check stop flag before finding titles | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before finding titles") | |
| return {} | |
| # Find titles (but respect filter mode) | |
| print("📑 Scanning for titles...") | |
| found_titles = {} | |
| # Extract titles for all modes EXCEPT "only_with_honorifics" | |
| # (titles are included in "only_without_honorifics" since titles typically don't have honorifics) | |
| if filter_mode != 'only_with_honorifics': | |
| title_patterns_to_use = [] | |
| if language_hint in PM.TITLE_PATTERNS: | |
| title_patterns_to_use.extend(PM.TITLE_PATTERNS[language_hint]) | |
| english_title_patterns = PM.TITLE_PATTERNS.get('english', []) | |
| title_patterns_to_use.extend(english_title_patterns) | |
| total_patterns = len(title_patterns_to_use) | |
| for pattern_idx, pattern in enumerate(title_patterns_to_use): | |
| # Check stop flag before each pattern | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped at pattern {pattern_idx}/{total_patterns}") | |
| return {} | |
| print(f"📑 Processing title pattern {pattern_idx + 1}/{total_patterns}") | |
| # Apply case-insensitive matching only to the English title patterns | |
| flags = re.IGNORECASE if pattern in english_title_patterns else 0 | |
| matches = list(re.finditer(pattern, all_text, flags)) | |
| for match_idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if match_idx > 0 and match_idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped at match {match_idx}") | |
| return {} | |
| title = match.group(0) | |
| # Skip if this title is already in names | |
| if title in filtered_names or title in names_with_honorifics: | |
| continue | |
| count = _find_fuzzy_matches(title, all_text, fuzzy_threshold) | |
| # Check if stopped during fuzzy matching | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped during fuzzy matching") | |
| return {} | |
| if count >= min_frequency: | |
| if re.match(r'[A-Za-z]', title): | |
| title = title.title() | |
| if strip_honorifics: | |
| title = _strip_honorific(title, language_hint) | |
| if title not in found_titles: | |
| found_titles[title] = count | |
| if filter_mode == 'only_without_honorifics': | |
| print(f"📑 Found {len(found_titles)} titles (included in 'without honorifics' mode)") | |
| else: | |
| print(f"📑 Found {len(found_titles)} unique titles") | |
| else: | |
| print(f"📑 Skipping title extraction (filter mode: only_with_honorifics)") | |
| # Check stop flag before sorting and translation | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before sorting terms") | |
| return {} | |
| # Combine and sort | |
| sorted_names = sorted(final_terms.items(), key=lambda x: x[1], reverse=True) | |
| sorted_titles = sorted(found_titles.items(), key=lambda x: x[1], reverse=True) | |
| all_terms = [] | |
| for name, count in sorted_names: | |
| all_terms.append(name) | |
| for title, count in sorted_titles: | |
| all_terms.append(title) | |
| print(f"📑 Total terms to translate: {len(all_terms)}") | |
| # Check stop flag before translation | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before translation") | |
| return {} | |
| # Translate terms | |
| if os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": | |
| print("📑 Translation disabled - keeping original terms") | |
| translations = {term: term for term in all_terms} | |
| else: | |
| print(f"📑 Translating {len(all_terms)} terms...") | |
| translations = _translate_terms_batch(all_terms, language_hint, batch_size, output_dir) | |
| # Check if translation was stopped | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped after translation") | |
| return translations # Return partial results | |
| # Build CSV lines | |
| csv_lines = ["type,raw_name,translated_name"] | |
| for name, _ in sorted_names: | |
| if name in translations: | |
| csv_lines.append(f"character,{name},{translations[name]}") | |
| for title, _ in sorted_titles: | |
| if title in translations: | |
| csv_lines.append(f"term,{title},{translations[title]}") | |
| # Check stop flag before merging | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before merging with existing glossary") | |
| # Still save what we have | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.json") | |
| _atomic_write_file(glossary_path, csv_content) | |
| return _parse_csv_to_dict(csv_content) | |
| # Merge with existing glossary | |
| if existing_glossary: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language_hint) | |
| # Check stop flag before deduplication | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before deduplication") | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.json") | |
| _atomic_write_file(glossary_path, csv_content) | |
| return _parse_csv_to_dict(csv_content) | |
| # Fuzzy matching deduplication | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| # Create CSV content | |
| csv_content = '\n'.join(csv_lines) | |
| # Save glossary as CSV | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| print(f"\n📑 ✅ TARGETED GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| print(f"📑 Total entries: {len(csv_lines) - 1}") # Exclude header | |
| return _parse_csv_to_dict(csv_content) | |
| def _translate_terms_batch(term_list, profile_name, batch_size=50, output_dir=None, log_callback=None): | |
| """Use fully configurable prompts for translation with interrupt support""" | |
| # Redirect stdout to GUI log if callback provided | |
| if log_callback: | |
| set_output_redirect(log_callback) | |
| if not term_list or os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": | |
| print(f"📑 Glossary translation disabled or no terms to translate") | |
| return {term: term for term in term_list} | |
| # Check stop flag | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary translation stopped by user") | |
| return {term: term for term in term_list} | |
| try: | |
| MODEL = os.getenv("MODEL", "gemini-1.5-flash") | |
| API_KEY = (os.getenv("API_KEY") or | |
| os.getenv("OPENAI_API_KEY") or | |
| os.getenv("OPENAI_OR_Gemini_API_KEY") or | |
| os.getenv("GEMINI_API_KEY")) | |
| if is_traditional_translation_api(MODEL): | |
| # Traditional translation APIs are handled elsewhere; keep the original terms | |
| return {term: term for term in term_list} | |
| if not API_KEY and not _model_uses_own_auth(MODEL): | |
| print(f"📑 No API key found, skipping translation") | |
| return {term: term for term in term_list} | |
| print(f"📑 Translating {len(term_list)} {profile_name} terms to English using batch size {batch_size}...") | |
| # Ensure multi-key config is available in this process if enabled | |
| _ensure_multi_key_config_loaded() | |
| from unified_api_client import UnifiedClient, UnifiedClientError | |
| client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) | |
| if hasattr(client, 'reset_cleanup_state'): | |
| client.reset_cleanup_state() | |
| # Get custom translation prompt from environment | |
| translation_prompt_template = os.getenv("GLOSSARY_TRANSLATION_PROMPT", "") | |
| if not translation_prompt_template: | |
| translation_prompt_template = """You are translating {language} character names and important terms to English. | |
| For character names, provide English transliterations or keep as romanized. | |
| Keep honorifics/suffixes only if they are integral to the name. | |
| Respond with the same numbered format. | |
| Terms to translate: | |
| {terms_list} | |
| Provide translations in the same numbered format.""" | |
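| # Placeholders supported in the prompt template: {language}, {terms_list}, {batch_size}; | |
| # they are substituted with str.replace for each batch below. | |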
| all_translations = {} | |
| all_responses = [] # Collect raw responses | |
| # Respect Auto-retry Slow Chunks toggle (RETRY_TIMEOUT env): when off, disable chunk timeouts entirely | |
| retry_env = os.getenv("RETRY_TIMEOUT") | |
| retry_timeout_enabled = retry_env is None or retry_env.strip().lower() not in ("0", "false", "off", "") | |
| if retry_timeout_enabled: | |
| env_ct = os.getenv("CHUNK_TIMEOUT", "1800") | |
| try: | |
| ct_val = float(env_ct) | |
| chunk_timeout = None if ct_val <= 0 else ct_val | |
| except Exception: | |
| chunk_timeout = None | |
| else: | |
| chunk_timeout = None | |
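| # A chunk_timeout of None disables the per-batch timeout; timeout-like errors are still | |
| # retried up to TIMEOUT_RETRY_ATTEMPTS times (see the retry loop below). | |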
| for i in range(0, len(term_list), batch_size): | |
| # Check stop flag before each batch | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Translation stopped at batch {(i // batch_size) + 1}") | |
| # Return partial translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| batch = term_list[i:i + batch_size] | |
| batch_num = (i // batch_size) + 1 | |
| total_batches = (len(term_list) + batch_size - 1) // batch_size | |
| print(f"📑 Processing batch {batch_num}/{total_batches} ({len(batch)} terms)...") | |
| # Format terms list | |
| terms_text = "" | |
| for idx, term in enumerate(batch, 1): | |
| terms_text += f"{idx}. {term}\n" | |
| # Replace placeholders in prompt | |
| prompt = translation_prompt_template.replace('{language}', profile_name) | |
| prompt = prompt.replace('{terms_list}', terms_text.strip()) | |
| prompt = prompt.replace('{batch_size}', str(len(batch))) | |
| messages = [ | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| # Use glossary-specific temperature with fallback to global | |
| temperature = float(os.getenv("GLOSSARY_TEMPERATURE", os.getenv("TEMPERATURE", "0.3"))) | |
| # Use glossary-specific max output tokens with fallback to global | |
| max_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "4096"))) | |
| # Use send_with_interrupt for interruptible API call | |
| print(f"📑 Sending translation request for batch {batch_num} (interruptible)...") | |
| # Timeout retry logic (matches translation behavior) | |
| try: | |
| max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) | |
| except Exception: | |
| max_timeout_retries = 2 | |
| timeout_retry_count = 0 | |
| while True: | |
| try: | |
| response, finish_reason, raw_obj = send_with_interrupt( | |
| messages=messages, | |
| client=client, | |
| temperature=temperature, | |
| max_tokens=max_tokens, | |
| stop_check_fn=is_stop_requested, | |
| chunk_timeout=chunk_timeout | |
| ) | |
| break | |
| except UnifiedClientError as e: | |
| error_msg = str(e) | |
| lower_msg = error_msg.lower() | |
| if "stopped by user" in lower_msg or is_stop_requested(): | |
| raise | |
| is_timeout = ("timed out" in lower_msg) or ("timeout" in lower_msg) or ("cancelled" in lower_msg) or ("client not initialized" in lower_msg) | |
| if is_timeout and timeout_retry_count < max_timeout_retries: | |
| timeout_retry_count += 1 | |
| if chunk_timeout: | |
| print(f"⚠️ Glossary translation batch {batch_num} timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| else: | |
| print(f"⚠️ Glossary translation batch {batch_num} timed out, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| # Reinitialize client if needed | |
| client_type = getattr(client, 'client_type', 'unknown') | |
| needs_reinit = False | |
| if client_type == 'gemini': | |
| needs_reinit = hasattr(client, 'gemini_client') and client.gemini_client is None | |
| elif client_type == 'openai': | |
| needs_reinit = hasattr(client, 'openai_client') and client.openai_client is None | |
| if needs_reinit: | |
| try: | |
| print(f" 🔄 Reinitializing {client_type} client...") | |
| client._setup_client() | |
| except Exception as reinit_err: | |
| print(f" ⚠️ Failed to reinitialize client: {reinit_err}") | |
| # Stagger retries | |
| try: | |
| import random | |
| base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) | |
| retry_delay = random.uniform(base_delay / 2, base_delay) | |
| print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") | |
| time.sleep(retry_delay) | |
| except Exception: | |
| time.sleep(1.0) | |
| continue | |
| else: | |
| raise | |
| # Handle response properly | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Store raw response with batch info | |
| all_responses.append((batch, response_text)) | |
| print(f"📑 Batch {batch_num} completed - response received") | |
| # Small delay between batches to avoid rate limiting (configurable) | |
| if i + batch_size < len(term_list): | |
| # Check stop before sleep | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Translation stopped after batch {batch_num}") | |
| # Fill in missing translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| # Use configurable batch delay between requests (GLOSSARY_BATCH_DELAY, default 0.001s) | |
| batch_delay = float(os.getenv("GLOSSARY_BATCH_DELAY", "0.001")) | |
| if batch_delay > 0: | |
| time.sleep(batch_delay) | |
| except UnifiedClientError as e: | |
| if "stopped by user" in str(e).lower(): | |
| print(f"📑 ❌ Translation interrupted by user at batch {batch_num}") | |
| # Fill in remaining terms with originals | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| else: | |
| print(f"⚠️ Translation failed for batch {batch_num}: {e}") | |
| for term in batch: | |
| all_translations[term] = term | |
| except Exception as e: | |
| print(f"⚠️ Translation failed for batch {batch_num}: {e}") | |
| for term in batch: | |
| all_translations[term] = term | |
| # Parse all responses at the end | |
| print(f"📑 Parsing {len(all_responses)} batch responses...") | |
| for batch, response_text in all_responses: | |
| batch_translations = _parse_translation_response(response_text, batch) | |
| all_translations.update(batch_translations) | |
| # Ensure all terms have translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| translated_count = sum(1 for term, translation in all_translations.items() | |
| if translation != term and translation.strip()) | |
| print(f"📑 Successfully translated {translated_count}/{len(term_list)} terms") | |
| return all_translations | |
| except Exception as e: | |
| print(f"⚠️ Glossary translation failed: {e}") | |
| return {term: term for term in term_list} | |
| def _extract_names_for_honorific(honorific, all_text, language_hint, | |
| min_frequency, names_with_honorifics, | |
| standalone_names, is_valid_name, fuzzy_threshold=0.90): | |
| """Extract names for a specific honorific with fuzzy matching and stop flag checks""" | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction for '{honorific}' stopped by user") | |
| return | |
| if language_hint == 'korean' and not honorific.startswith('-'): | |
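| # Capture a 2-4 syllable Hangul block; the lookahead requires the honorific to follow, | |
| # then whitespace, punctuation, or end of string, without consuming the honorific itself. | |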
| pattern = r'([\uac00-\ud7af]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,.\!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Korean name extraction stopped at {idx}/{total_matches}") | |
| return | |
| # Show progress for large sets | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Korean names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'korean'): | |
| full_form = potential_name + honorific | |
| # Use fuzzy matching for counting with stop check | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| # Check if stopped during fuzzy matching | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| context_patterns = [ | |
| full_form + r'[은는이가]', | |
| full_form + r'[을를]', | |
| full_form + r'[에게한테]', | |
| r'["]' + full_form, | |
| full_form + r'[,]', | |
| ] | |
| context_count = 0 | |
| for ctx_pattern in context_patterns: | |
| context_count += len(re.findall(ctx_pattern, all_text)) | |
| if context_count > 0: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif language_hint == 'japanese' and not honorific.startswith('-'): | |
| pattern = r'([\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,5})(?=' + re.escape(honorific) + r'(?:\s|[、。!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Japanese name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Japanese names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'japanese'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif language_hint == 'chinese' and not honorific.startswith('-'): | |
| pattern = r'([\u4e00-\u9fff]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,。!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Chinese name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Chinese names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'chinese'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif honorific.startswith('-') or honorific.startswith(' '): | |
| is_space_separated = honorific.startswith(' ') | |
| if is_space_separated: | |
| pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'(?=\s|[,.\!?]|$)' | |
| else: | |
| pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'\b' | |
| matches = list(re.finditer(pattern_english, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ English name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing English names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'english'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| def _parse_translation_response(response, original_terms): | |
| """Extract translations from AI response by matching numbered lines to original terms""" | |
| translations = {} | |
| # Handle UnifiedResponse object | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Split into lines | |
| lines = response_text.strip().split('\n') | |
| for line in lines: | |
| line = line.strip() | |
| if not line: | |
| continue | |
| # Match numbered format: "1. Translation" or "1) Translation" etc | |
| number_match = re.match(r'^(\d+)[\.):\-\s]+(.+)', line) | |
| if number_match: | |
| idx = int(number_match.group(1)) - 1 # Convert to 0-based | |
| translation = number_match.group(2).strip() | |
| # Remove trailing explanations in parentheses | |
| translation = re.sub(r'\s*\([^)]+\)\s*$', '', translation) | |
| if 0 <= idx < len(original_terms): | |
| translations[original_terms[idx]] = translation | |
| print(f"📑 Extracted {len(translations)}/{len(original_terms)} translations") | |
| return translations | |
| def _init_worker_with_env(env_vars_dict): | |
| """Initialize worker process with environment variables from parent. | |
| MUST be at module level for pickling by multiprocessing.Pool. | |
| """ | |
| import os | |
| for k, v in env_vars_dict.items(): | |
| os.environ[k] = str(v) | |
| def _check_sentence_batch_for_terms(args): | |
| """Check a batch of sentences for term matches - used by ProcessPoolExecutor""" | |
| batch_sentences, terms = args | |
| filtered = [] | |
| # Use pre-compiled term list for fast checking | |
| for sentence in batch_sentences: | |
| # Quick check using any() - stops at first match | |
| if any(term in sentence for term in terms): | |
| filtered.append(sentence) | |
| return filtered | |
| def _score_sentence_batch(args): | |
| """Worker function to score a batch of sentences - Optimized for speed""" | |
| (start_idx, sentences), term_list, honorific_pattern_str, gender_pronouns, include_gender_context = args | |
| import re | |
| local_scores = {} | |
| local_term_map = {} | |
| # Pre-compile regex if needed | |
| honorific_pattern = re.compile(honorific_pattern_str) if honorific_pattern_str else None | |
| # OPTIMIZATION 1: Segregate terms for hybrid strategy | |
| # - Single-token terms: Use O(1) set intersection (FAST) | |
| # - Multi-token terms: Use iteration (SLOWER, but few terms) | |
| # This preserves quality for terms with spaces while keeping speed for CJK/single names | |
| # Simple tokenizer for classification (matches CJK chars or alphanumeric sequences) | |
| tokenizer_pattern = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]+|[a-zA-Z0-9]+') | |
| single_token_terms = set() | |
| multi_token_terms = [] | |
| for t in term_list: | |
| if len(t) < 2: | |
| continue | |
| # Check if term splits into multiple tokens | |
| tokens = tokenizer_pattern.findall(t) | |
| if len(tokens) > 1: | |
| multi_token_terms.append(t) | |
| else: | |
| single_token_terms.add(t) | |
| # Pre-compile multi-token terms regex if there are any (faster than loop) | |
| multi_term_regex = None | |
| if multi_token_terms: | |
| # Sort by length desc to match longest first | |
| multi_token_terms.sort(key=len, reverse=True) | |
| # Escape terms | |
| pattern = '|'.join(map(re.escape, multi_token_terms)) | |
| try: | |
| multi_term_regex = re.compile(pattern) | |
| except Exception: | |
| # Fallback if pattern is too huge (unlikely for just multi-word subset) | |
| pass | |
| for idx, sentence in enumerate(sentences): | |
| global_idx = start_idx + idx | |
| score = 1.0 | |
| # Gender pronoun check (fast) | |
| if include_gender_context and gender_pronouns: | |
| for p in gender_pronouns: | |
| if p in sentence: | |
| score += 5.0 | |
| break | |
| # Honorific check (fast regex) | |
| if honorific_pattern and honorific_pattern.search(sentence): | |
| score += 2.0 | |
| local_scores[global_idx] = score | |
| # 1. Fast Path: Single-token terms (Set Intersection) | |
| tokens = set(tokenizer_pattern.findall(sentence)) | |
| found_terms = tokens.intersection(single_token_terms) | |
| for term in found_terms: | |
| if term not in local_term_map: | |
| local_term_map[term] = [] | |
| local_term_map[term].append(global_idx) | |
| # 2. Slow Path: Multi-token terms (Regex or Iteration) | |
| # Only needed if we actually have multi-word terms | |
| if multi_token_terms: | |
| if multi_term_regex: | |
| # Fast regex batch match | |
| for match in multi_term_regex.findall(sentence): | |
| if match not in local_term_map: | |
| local_term_map[match] = [] | |
| # Avoid duplicates if regex matches same term multiple times | |
| if global_idx not in local_term_map[match]: | |
| local_term_map[match].append(global_idx) | |
| else: | |
| # Fallback iteration | |
| for term in multi_token_terms: | |
| if term in sentence: | |
| if term not in local_term_map: | |
| local_term_map[term] = [] | |
| local_term_map[term].append(global_idx) | |
| return local_scores, local_term_map | |
| def _process_sentence_batch_for_extraction(args): | |
| """Process sentences to extract terms - used by ProcessPoolExecutor""" | |
| batch_sentences, batch_idx, combined_pattern, exclude_check_data = args | |
| from collections import Counter | |
| import re | |
| local_word_freq = Counter() | |
| local_important = [] | |
| local_seen = set() | |
| # Rebuild the exclusion check function from data | |
| honorifics_to_exclude, title_patterns_str, common_words, chinese_nums = exclude_check_data | |
| title_patterns = [re.compile(p) for p in title_patterns_str] | |
| def should_exclude_term(term): | |
| term_lower = term.lower() | |
| # Check if it's a common word | |
| if term in common_words or term_lower in common_words: | |
| return True | |
| # Check if it contains honorifics | |
| for honorific in honorifics_to_exclude: | |
| if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): | |
| return True | |
| # Check if it matches title patterns | |
| for pattern in title_patterns: | |
| if pattern.search(term): | |
| return True | |
| # Check if it's a number | |
| if term in chinese_nums or term.isdigit(): | |
| return True | |
| return False | |
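| # Sentences shorter than 10 or longer than 500 characters are skipped; a sentence is kept | |
| # only if it contains at least one candidate term that passes should_exclude_term, and | |
| # duplicate term combinations are collapsed via sentence_key. | |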
| for sentence in batch_sentences: | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| if matches: | |
| # Filter out excluded terms | |
| filtered_matches = [] | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| local_word_freq[match] += 1 | |
| filtered_matches.append(match) | |
| # Keep sentences with valid potential terms | |
| if filtered_matches: | |
| sentence_key = ' '.join(sorted(filtered_matches)) | |
| if sentence_key not in local_seen: | |
| local_important.append(sentence) | |
| local_seen.add(sentence_key) | |
| return local_word_freq, local_important, local_seen, batch_idx | |