| # -*- coding: utf-8 -*- | |
| # This is for automatic glossary generation only, unrelated to the more thorough glossary generation you get from clicking the "Extract Glossary" button | |
| import os | |
| import re | |
| import sys | |
| import threading | |
| import tempfile | |
| import queue | |
| import time | |
| import json | |
| from bs4 import BeautifulSoup | |
| import PatternManager as PM | |
| import duplicate_detection_config as ddc | |
| from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed | |
| # Default unified auto-glossary prompt (used when AUTO_GLOSSARY_PROMPT is unset/empty). | |
| # NOTE: This matches the GUI's default_unified_prompt in GlossaryManager_GUI.py. | |
| DEFAULT_AUTO_GLOSARY_PROMPT3 = """You are a novel glossary extraction assistant. | |
| You must strictly return ONLY CSV format with 3-5 columns in this exact order: type,raw_name,translated_name,gender,description. | |
| For character entries, determine gender from context, leave empty if context is insufficient. | |
| For non-character entries, leave gender empty. | |
| The description column is optional and can contain brief context (role, location, significance). | |
| Critical Requirement: The translated name and description column must be in {language}. | |
| For example: | |
| character,디히리트 아데,Dihirit Ade,female,The enigmatic guild leader of the Shadow Lotus who operates from the concealed backrooms of the capital, manipulating city politics through commerce and wielding dual daggers with lethal precision | |
| character,김상휴,Kim Sang-hyu,male,A master swordsman from the Northern Sect known for his icy demeanor and unparalleled skill with the Frost Blade technique which he uses to defend the border fortress | |
| CRITICAL EXTRACTION RULES: | |
| - Extract All Character names, Terms, Location names, Ability/Skill names, Item names, Organization names, and Titles/Ranks. | |
| - Do NOT extract sentences, dialogue, actions, questions, or statements as glossary entries | |
| - REJECT entries that contain verbs or end with punctuation (?, !, .) | |
| - REJECT entries starting with: "Me", "How", "What", "Why", "I", "He", "She", "They", "That's", "So", "Therefore", "Still", "But", "Protagonist". (The description column is excluded from this restriction) | |
| - Do NOT output any entries that are rejected by the above rules; skip them entirely | |
| - If unsure whether something is a proper noun/name, skip it | |
| - The description column must contain detailed context/explanation | |
| - Create at least one glossary entry for EVERY context marker window (lines ending with "=== CONTEXT N END ==="); treat each marker boundary as a required extraction point. | |
| - You must create {marker} glossary entries (one or more per window; do not invent placeholders). | |
| - You must include absolutely all characters found in the provided text in your glossary generation. Do not skip any character.""" | |
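| # Illustrative substitution (assumption: the caller fills these placeholders with str.format-style | |
| # replacement before the prompt is sent), e.g. with hypothetical values: | |
| #   DEFAULT_AUTO_GLOSARY_PROMPT3.format(language="English", marker=12) | |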
| # Class-level shared lock for API submission timing | |
| _api_submission_lock = threading.Lock() | |
| _last_api_submission_time = 0 | |
| _results_lock = threading.Lock() | |
| _file_write_lock = threading.Lock() | |
| _stop_requested = False | |
| # Register watchdog cleanup once per process (best-effort) | |
| _watchdog_atexit_registered = False | |
| BOOK_TITLE_RAW = None | |
| BOOK_TITLE_TRANSLATED = None | |
| BOOK_TITLE_VALUE = None  # Legacy field; kept for backward compatibility but unused by the current flow. | |
| def _extract_title_from_metadata(meta): | |
| """Best-effort lookup of a book title inside metadata structures.""" | |
| if not isinstance(meta, dict): | |
| return None | |
| title_keys = [ | |
| "title", | |
| "book_title", | |
| "bookTitle", | |
| "title_translated", | |
| "translated_title", | |
| "title_en", | |
| ] | |
| for key in title_keys: | |
| val = meta.get(key) | |
| if val: | |
| return str(val).strip() | |
| for nested_key in ("metadata", "opf", "info", "data"): | |
| nested = meta.get(nested_key) | |
| if isinstance(nested, dict): | |
| nested_title = _extract_title_from_metadata(nested) | |
| if nested_title: | |
| return nested_title | |
| return None | |
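| # Illustrative lookups (hypothetical metadata dicts), showing the recursive fallback: | |
| #   _extract_title_from_metadata({"opf": {"title": "나의 소설"}}) -> "나의 소설" | |
| #   _extract_title_from_metadata({"author": "..."}) -> None | |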
| def _extract_raw_title_from_epub(epub_path): | |
| """Extract the raw untranslated title from the input EPUB content.opf.""" | |
| if not epub_path or not os.path.exists(epub_path): | |
| return None | |
| print(f"[Metadata] Checking input EPUB for raw title: {epub_path}") | |
| # Try manual parsing first (more robust) | |
| try: | |
| import zipfile | |
| with zipfile.ZipFile(epub_path, 'r') as zf: | |
| # Find opf | |
| opf_name = next((n for n in zf.namelist() if n.lower().endswith('.opf')), None) | |
| if opf_name: | |
| content = zf.read(opf_name).decode('utf-8', errors='ignore') | |
| # Use BS4 with xml parser | |
| try: | |
| soup = BeautifulSoup(content, 'xml') | |
| except Exception: | |
| soup = BeautifulSoup(content, 'html.parser') | |
| # Try dc:title | |
| title_tag = soup.find('dc:title') | |
| if not title_tag: | |
| # Fallback to any title tag | |
| title_tag = soup.find('title') | |
| if title_tag: | |
| val = title_tag.get_text(strip=True) | |
| if val: | |
| return val | |
| except Exception as e: | |
| print(f"[Warning] Manual EPUB title extraction failed: {e}") | |
| # Fallback: ebooklib | |
| try: | |
| from ebooklib import epub | |
| book = epub.read_epub(epub_path) | |
| titles = book.get_metadata("DC", "title") | |
| if titles: | |
| val = titles[0][0] | |
| if val: | |
| return str(val).strip() | |
| except Exception as e: | |
| print(f"[Warning] Could not read EPUB metadata via ebooklib: {e}") | |
| return None | |
| def _extract_translated_title_from_metadata(output_dir): | |
| """Extract translated title from metadata.json in output directory.""" | |
| base_dir = os.path.abspath(output_dir or ".") | |
| epub_path = os.getenv("EPUB_PATH", "") | |
| epub_base = os.path.splitext(os.path.basename(epub_path or ""))[0] if epub_path else None | |
| candidates = [] | |
| # Only check output directory logic for translated title | |
| if epub_base: | |
| candidates.append(os.path.join(base_dir, epub_base, "metadata.json")) | |
| # Also check direct output dir | |
| candidates.append(os.path.join(base_dir, "metadata.json")) | |
| for meta_path in candidates: | |
| # print(f"[Metadata] Checking for translated book title at: {meta_path}") | |
| if os.path.exists(meta_path): | |
| try: | |
| with open(meta_path, "r", encoding="utf-8") as f: | |
| meta = json.load(f) | |
| meta_title = _extract_title_from_metadata(meta) | |
| if meta_title: | |
| return meta_title.strip() | |
| except Exception as e: | |
| print(f"[Warning] Could not read metadata.json for book title: {e}") | |
| return None | |
| def _derive_book_title(output_dir): | |
| """Legacy wrapper - logic moved to save_glossary main flow.""" | |
| return None | |
| def _ensure_book_title_csv_lines(csv_lines): | |
| """ | |
| Ensure the CSV (header + rows) contains a leading book title entry when enabled. | |
| Uses distinct raw and translated titles. | |
| """ | |
| if not csv_lines: | |
| return csv_lines | |
| include = os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "1").lower() not in ("0", "false", "no") | |
| raw_title = BOOK_TITLE_RAW | |
| trans_title = BOOK_TITLE_TRANSLATED | |
| # Prefer a distinct raw/translated pair. If only one title is available, fall back to it for | |
| # both columns rather than inventing a translation; if neither is available, skip the book | |
| # entry entirely (handled below, after the include check). | |
| if not include: | |
| return csv_lines | |
| if not raw_title and not trans_title: | |
| return csv_lines | |
| # Normalize for dedup check | |
| norm_raw = raw_title.lower() if raw_title else "" | |
| norm_trans = trans_title.lower() if trans_title else "" | |
| # Skip if already present | |
| header = csv_lines[0] | |
| for line in csv_lines[1:]: | |
| parts = [p.strip() for p in line.split(",")] | |
| if len(parts) >= 3: | |
| # Check if this line is already the book title | |
| p_raw = parts[1].lower() | |
| p_trans = parts[2].lower() | |
| # Match if we find our raw title or our translated title in the respective columns | |
| if (raw_title and p_raw == norm_raw) or (trans_title and p_trans == norm_trans): | |
| return csv_lines | |
| fields = [f.strip() for f in header.split(",")] | |
| row = [] | |
| for field in fields: | |
| key = field.lower() | |
| if key == "type": | |
| row.append("book") | |
| elif key == "raw_name": | |
| row.append(raw_title if raw_title else (trans_title if trans_title else "")) | |
| elif key == "translated_name": | |
| row.append(trans_title if trans_title else (raw_title if raw_title else "")) | |
| else: | |
| row.append("") | |
| book_line = ",".join(row) | |
| return [header, book_line] + csv_lines[1:] | |
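| # Illustrative behavior (hypothetical titles and rows), assuming BOOK_TITLE_RAW = "전생자", | |
| # BOOK_TITLE_TRANSLATED = "The Reincarnator", and GLOSSARY_INCLUDE_BOOK_TITLE enabled: | |
| #   ["type,raw_name,translated_name", "character,김상휴,Kim Sang-hyu"] | |
| #   -> ["type,raw_name,translated_name", "book,전생자,The Reincarnator", "character,김상휴,Kim Sang-hyu"] | |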
| def _csv_sort_key(line: str): | |
| """Sort book first, then characters, then others by raw name.""" | |
| try: | |
| parts = line.split(",") | |
| entry_type = parts[0].strip().lower() | |
| name = parts[1].lower() if len(parts) > 1 else line.lower() | |
| except Exception: | |
| entry_type = "" | |
| name = line.lower() | |
| order = {"book": -1, "character": 0, "term": 1} | |
| return (order.get(entry_type, 2), name) | |
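| # Illustrative sort keys (hypothetical rows): | |
| #   _csv_sort_key("book,전생자,The Reincarnator") -> (-1, "전생자") | |
| #   _csv_sort_key("character,김상휴,Kim Sang-hyu") -> (0, "김상휴") | |
| #   _csv_sort_key("term,마나,Mana") -> (1, "마나") | |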
| # Timing variables | |
| _extraction_time = 0 | |
| _api_time = 0 | |
| _freq_check_time = 0 | |
| _dedup_time = 0 | |
| _io_time = 0 | |
| def _get_stop_file_path(): | |
| """Return the stop-flag file path (shared across processes).""" | |
| return os.environ.get("GLOSSARY_STOP_FILE") or os.path.join(tempfile.gettempdir(), "glossarion_glossary.stop") | |
| def _get_glossary_status_file_path() -> str: | |
| """File path for cross-process status about chunk submission/completion. | |
| This lets the parent process decide whether it's safe to "wait for chunks" even when | |
| WAIT_FOR_CHUNKS is disabled. | |
| """ | |
| try: | |
| explicit = os.environ.get("GLOSSARY_STATUS_FILE") | |
| if explicit: | |
| return explicit | |
| except Exception: | |
| pass | |
| # Default: colocate next to the stop file so both processes can find it deterministically. | |
| try: | |
| stop_fp = _get_stop_file_path() | |
| if stop_fp: | |
| return f"{stop_fp}.status.json" | |
| except Exception: | |
| pass | |
| return os.path.join(tempfile.gettempdir(), "glossarion_glossary.status.json") | |
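| # With neither GLOSSARY_STATUS_FILE nor GLOSSARY_STOP_FILE set, this resolves to the default | |
| # stop-file path plus a ".status.json" suffix, e.g. <tempdir>/glossarion_glossary.stop.status.json. | |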
| def _write_glossary_status(payload: dict) -> None: | |
| """Best-effort atomic write of glossary chunk status.""" | |
| try: | |
| fp = _get_glossary_status_file_path() | |
| os.makedirs(os.path.dirname(fp) or ".", exist_ok=True) | |
| tmp = f"{fp}.tmp" | |
| with open(tmp, "w", encoding="utf-8") as f: | |
| json.dump(payload, f, ensure_ascii=False, indent=2) | |
| os.replace(tmp, fp) | |
| except Exception: | |
| # Status is best-effort only. | |
| pass | |
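| # Illustrative call (the payload keys here are hypothetical — the real schema is whatever the | |
| # callers pass in): | |
| #   _write_glossary_status({"submitted": 4, "completed": 2, "done": False}) | |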
| def _clear_api_watchdog_state(*, remove_watchdog_file: bool = True) -> None: | |
| """Best-effort reset of unified_api_client watchdog state. | |
| GlossaryManager often runs in a separate process; if it exits mid-stream or is force-stopped, | |
| its watchdog JSON file can keep the GUI progress bar "busy" until manually cleared. | |
| """ | |
| # Reset in-memory counters | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, '_api_watchdog_reset'): | |
| unified_api_client._api_watchdog_reset() | |
| except Exception: | |
| pass | |
| # Remove the per-process watchdog file (if enabled) | |
| if remove_watchdog_file: | |
| try: | |
| wd_dir = os.environ.get("GLOSSARION_WATCHDOG_DIR") | |
| if wd_dir and os.path.isdir(wd_dir): | |
| fp = os.path.join(wd_dir, f"api_watchdog_{os.getpid()}.json") | |
| tmp = f"{fp}.tmp" | |
| try: | |
| if os.path.exists(tmp): | |
| os.remove(tmp) | |
| except Exception: | |
| pass | |
| try: | |
| if os.path.exists(fp): | |
| os.remove(fp) | |
| except Exception: | |
| pass | |
| except Exception: | |
| pass | |
| def set_stop_flag(value: bool): | |
| """Set the module-level stop flag and propagate to shared channels.""" | |
| global _stop_requested | |
| _stop_requested = bool(value) | |
| # Mirror to environment for other components | |
| os.environ["TRANSLATION_CANCELLED"] = "1" if value else "0" | |
| # If we're stopping, clear watchdog immediately so the GUI bar doesn't stick. | |
| # (If graceful-stop semantics are needed, the caller should avoid setting stop until ready.) | |
| if value: | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| # Touch/remove stop file for cross-process signalling | |
| stop_path = _get_stop_file_path() | |
| try: | |
| if value: | |
| with open(stop_path, "w", encoding="utf-8") as f: | |
| f.write("stop") | |
| else: | |
| if os.path.exists(stop_path): | |
| os.remove(stop_path) | |
| except Exception: | |
| pass | |
| # Notify unified_api_client if present | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, "UnifiedClient"): | |
| unified_api_client.UnifiedClient._global_cancelled = bool(value) | |
| if hasattr(unified_api_client, "global_stop_flag"): | |
| unified_api_client.global_stop_flag = bool(value) | |
| except Exception: | |
| pass | |
| # Function to check if stop is requested (can be overridden) | |
| def is_stop_requested(): | |
| """Check if stop has been requested from any source. | |
| NOTE: TRANSLATION_CANCELLED is set on BOTH graceful and immediate stop. | |
| During graceful stop we must let in-flight API calls finish, so we only | |
| treat it as a stop signal when GRACEFUL_STOP is not active. When | |
| graceful stop IS active, the orchestrator in TransateKRtoEN handles the | |
| decision of whether to wait or cancel. | |
| """ | |
| if _stop_requested: | |
| return True | |
| # Environment toggle (set by GUI stop button) | |
| # Only treat as immediate stop when GRACEFUL_STOP is not active | |
| if os.environ.get("TRANSLATION_CANCELLED") == "1": | |
| if os.environ.get("GRACEFUL_STOP") != "1": | |
| return True | |
| # File-based stop flag for cross-process cancellation | |
| try: | |
| stop_path = _get_stop_file_path() | |
| if stop_path and os.path.exists(stop_path): | |
| return True | |
| except Exception: | |
| pass | |
| # Unified API client global cancellation | |
| try: | |
| import unified_api_client | |
| if getattr(unified_api_client, "global_stop_flag", False): | |
| return True | |
| if hasattr(unified_api_client, "UnifiedClient") and getattr(unified_api_client.UnifiedClient, "_global_cancelled", False): | |
| return True | |
| except Exception: | |
| pass | |
| return False | |
| def set_output_redirect(log_callback=None): | |
| """Redirect print statements to a callback function for GUI integration""" | |
| if log_callback: | |
| import threading | |
| class CallbackWriter: | |
| def __init__(self, callback): | |
| self.callback = callback | |
| self.main_thread = threading.main_thread() | |
| def write(self, text): | |
| if text.strip(): | |
| # The callback (append_log) is already thread-safe - it handles QTimer internally | |
| # So we can call it directly from any thread | |
| self.callback(text.strip()) | |
| def flush(self): | |
| pass | |
| sys.stdout = CallbackWriter(log_callback) | |
| def is_traditional_translation_api(model: str) -> bool: | |
| """Check if the model is a traditional translation API""" | |
| return model in ['deepl', 'google-translate', 'google-translate-free'] or model.startswith('deepl/') or model.startswith('google-translate/') | |
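| # e.g. is_traditional_translation_api("deepl") -> True (likewise any "deepl/" or | |
| # "google-translate/" prefixed model), while is_traditional_translation_api("gemini-2.0-flash") -> False. | |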
| def _model_uses_own_auth(model: str) -> bool: | |
| """Check if the model uses its own authentication (no API key needed). | |
| authgpt/ uses OAuth tokens, vertex/ uses Google service account credentials.""" | |
| if not model: | |
| return False | |
| m = model.lower() | |
| return m.startswith('authgpt/') or m.startswith('vertex/') | |
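| # e.g. any "authgpt/..." or "vertex/..." model returns True; a plain "gpt-4o" returns False. | |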
| def _ensure_multi_key_config_loaded(): | |
| """Best-effort load of multi-key config when running in subprocesses. | |
| In subprocesses, in-memory key lists are not inherited. If multi-key mode is | |
| enabled via env but no keys are present, load them from config.json and | |
| initialize UnifiedClient's in-memory pool. | |
| """ | |
| try: | |
| if os.getenv('USE_MULTI_API_KEYS', '0') != '1': | |
| return | |
| except Exception: | |
| return | |
| # If keys are already present in env or in-memory, nothing to do. | |
| try: | |
| mk_env = os.getenv('MULTI_API_KEYS', '') | |
| if mk_env and str(mk_env).strip() not in ('', '[]', 'null', 'None'): | |
| return | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client as _uac | |
| with _uac.UnifiedClient._in_memory_multi_keys_lock: | |
| if _uac.UnifiedClient._in_memory_multi_keys: | |
| return | |
| except Exception: | |
| pass | |
| # Try to load from config.json in common locations. | |
| cfg_paths = [] | |
| try: | |
| cfg_env = os.getenv('CONFIG_FILE') | |
| if cfg_env: | |
| cfg_paths.append(cfg_env) | |
| except Exception: | |
| pass | |
| try: | |
| cfg_paths.append(os.path.join(os.path.dirname(os.path.abspath(__file__)), "config.json")) | |
| except Exception: | |
| pass | |
| try: | |
| cfg_paths.append(os.path.join(os.getcwd(), "config.json")) | |
| except Exception: | |
| pass | |
| # Deduplicate while preserving order | |
| seen = set() | |
| candidates = [] | |
| for p in cfg_paths: | |
| if not p: | |
| continue | |
| p_norm = os.path.abspath(p) | |
| if p_norm in seen: | |
| continue | |
| seen.add(p_norm) | |
| candidates.append(p_norm) | |
| cfg = None | |
| cfg_path = None | |
| for p in candidates: | |
| if os.path.exists(p): | |
| try: | |
| with open(p, 'r', encoding='utf-8') as f: | |
| cfg = json.load(f) | |
| cfg_path = p | |
| break | |
| except Exception: | |
| continue | |
| if not isinstance(cfg, dict): | |
| return | |
| keys = cfg.get('multi_api_keys') or [] | |
| if not keys: | |
| return | |
| force_rotation = bool(cfg.get('force_key_rotation', True)) | |
| rotation_frequency = int(cfg.get('rotation_frequency', 1)) | |
| try: | |
| os.environ.setdefault('FORCE_KEY_ROTATION', '1' if force_rotation else '0') | |
| os.environ.setdefault('ROTATION_FREQUENCY', str(rotation_frequency)) | |
| os.environ.setdefault('USE_MULTI_KEYS', '1') # backward-compat | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client as _uac | |
| _uac.UnifiedClient.set_in_memory_multi_keys( | |
| keys, | |
| force_rotation=force_rotation, | |
| rotation_frequency=rotation_frequency, | |
| ) | |
| if cfg_path: | |
| print(f"[DEBUG] Loaded multi-key config from {os.path.basename(cfg_path)} ({len(keys)} keys)") | |
| else: | |
| print(f"[DEBUG] Loaded multi-key config ({len(keys)} keys)") | |
| except Exception as e: | |
| print(f"[DEBUG] Failed to initialize multi-key config from file: {e}") | |
| def send_with_interrupt(*args, **kwargs): | |
| """Lazy wrapper to avoid circular import""" | |
| from TransateKRtoEN import send_with_interrupt as _send_with_interrupt | |
| return _send_with_interrupt(*args, **kwargs) | |
| def _atomic_write_file(filepath, content, encoding='utf-8'): | |
| """Atomically write to a file to prevent corruption from concurrent writes""" | |
| # Create temp file in same directory to ensure same filesystem | |
| dir_path = os.path.dirname(filepath) | |
| with _file_write_lock: | |
| try: | |
| # Write to temporary file first | |
| with tempfile.NamedTemporaryFile(mode='w', encoding=encoding, | |
| dir=dir_path, delete=False) as tmp_file: | |
| tmp_file.write(content) | |
| tmp_path = tmp_file.name | |
| # Atomic replace (same filesystem); os.replace overwrites an existing target | |
| # on both Windows and POSIX, unlike os.rename on Windows | |
| os.replace(tmp_path, filepath) | |
| return True | |
| except Exception as e: | |
| print(f"⚠️ Atomic write failed: {e}") | |
| # Cleanup temp file if it exists | |
| if 'tmp_path' in locals() and os.path.exists(tmp_path): | |
| try: | |
| os.remove(tmp_path) | |
| except: | |
| pass | |
| # Fallback to direct write with lock | |
| try: | |
| with open(filepath, 'w', encoding=encoding) as f: | |
| f.write(content) | |
| return True | |
| except Exception as e2: | |
| print(f"⚠️ Fallback write also failed: {e2}") | |
| return False | |
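| # Typical usage (as done when saving the final glossary below): | |
| #   _atomic_write_file(os.path.join(output_dir, "glossary.csv"), csv_content) | |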
| def save_glossary(output_dir, chapters, instructions, language="korean", log_callback=None): | |
| """Targeted glossary generator with true CSV format output and parallel processing""" | |
| # If the user stops translation while glossary runs in a subprocess, we must ensure the | |
| # per-process watchdog file doesn't stick around and keep the GUI progress bar "busy". | |
| # We only clear on stop (not on normal completion). | |
| global _watchdog_atexit_registered | |
| if not _watchdog_atexit_registered: | |
| try: | |
| import atexit | |
| def _cleanup_watchdog_on_exit(): | |
| try: | |
| if is_stop_requested(): | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| except Exception: | |
| pass | |
| atexit.register(_cleanup_watchdog_on_exit) | |
| _watchdog_atexit_registered = True | |
| except Exception: | |
| pass | |
| # Note: Don't redirect stdout here if log_callback is provided by subprocess worker | |
| # The worker already captures stdout and sends to queue | |
| # Only redirect if we're NOT in a subprocess (i.e., log_callback is a real GUI callback) | |
| import sys | |
| in_subprocess = hasattr(sys.stdout, 'queue') # Worker's LogCapture has a queue attribute | |
| if log_callback and not in_subprocess: | |
| set_output_redirect(log_callback) | |
| # Clear any stale stop flags before starting a new glossary run | |
| try: | |
| set_stop_flag(False) | |
| except Exception: | |
| try: | |
| os.environ["TRANSLATION_CANCELLED"] = "0" | |
| except Exception: | |
| pass | |
| try: | |
| stop_path = _get_stop_file_path() | |
| if stop_path and os.path.exists(stop_path): | |
| os.remove(stop_path) | |
| except Exception: | |
| pass | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, "UnifiedClient"): | |
| unified_api_client.UnifiedClient._global_cancelled = False | |
| if hasattr(unified_api_client, "global_stop_flag"): | |
| unified_api_client.global_stop_flag = False | |
| except Exception: | |
| pass | |
| print("📱 Targeted Glossary Generator v6.0 (CSV Format + Parallel)") | |
| # CRITICAL: Reload ALL glossary settings from environment variables at the START | |
| # This ensures child processes spawned by ProcessPoolExecutor get the latest values | |
| # Force fresh read of all environment variables (they were set by save_config) | |
| print("🔄 Reloading glossary settings from environment variables...") | |
| # Honor output directory override (same behavior as translation pipeline) | |
| try: | |
| override_dir = os.getenv("OUTPUT_DIRECTORY") | |
| if override_dir: | |
| override_dir = os.path.abspath(override_dir) | |
| leaf = os.path.basename(os.path.abspath(output_dir)) or "output" | |
| # Always place under the override root (handles different drives safely) | |
| output_dir = os.path.join(override_dir, leaf) | |
| except Exception as e: | |
| print(f"⚠️ OUTPUT_DIRECTORY override failed: {e}") | |
| print(f"📁 Glossary output directory: {os.path.abspath(output_dir)}") | |
| # Check stop flag at start | |
| # Ensure output directory exists | |
| try: | |
| os.makedirs(output_dir, exist_ok=True) | |
| except Exception as _e: | |
| print(f"⚠️ Could not ensure output directory exists: {output_dir} ({_e})") | |
| if is_stop_requested(): | |
| print("📁 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # CLEAR incremental history UNCONDITIONALLY at the start of any run | |
| # This prevents stale chunks from polluting the aggregation, regardless of whether chunking is used | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| if os.path.exists(incremental_dir): | |
| print(f"📑 Cleaning incremental glossary folder: {incremental_dir}") | |
| try: | |
| import shutil | |
| # Safely clear the entire incremental folder | |
| for filename in os.listdir(incremental_dir): | |
| file_path = os.path.join(incremental_dir, filename) | |
| try: | |
| if os.path.isfile(file_path) or os.path.islink(file_path): | |
| os.unlink(file_path) | |
| elif os.path.isdir(file_path): | |
| shutil.rmtree(file_path) | |
| except Exception as e: | |
| print(f"⚠️ Failed to delete {file_path}: {e}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to clear incremental history: {e}") | |
| # Ensure directory exists for potential use | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| # Check if glossary already exists; if so, we'll MERGE it later (do not return early) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| existing_glossary_content = None | |
| if os.path.exists(glossary_path): | |
| print(f"📁 Existing glossary detected (will merge): {glossary_path}") | |
| try: | |
| with open(glossary_path, 'r', encoding='utf-8') as f: | |
| existing_glossary_content = f.read() | |
| except Exception as e: | |
| print(f"⚠️ Could not read existing glossary: {e}") | |
| print("📁 Extracting names and terms with configurable options") | |
| global BOOK_TITLE_RAW, BOOK_TITLE_TRANSLATED | |
| # 1. Get raw title from input EPUB (input path) | |
| epub_path = os.getenv("EPUB_PATH", "") | |
| BOOK_TITLE_RAW = _extract_raw_title_from_epub(epub_path) | |
| # 2. Get translated title from output metadata (output path) | |
| BOOK_TITLE_TRANSLATED = _extract_translated_title_from_metadata(output_dir) | |
| # Debug info | |
| if BOOK_TITLE_RAW: | |
| print(f"📚 Raw book title: {BOOK_TITLE_RAW}") | |
| if BOOK_TITLE_TRANSLATED: | |
| print(f"📚 Translated book title: {BOOK_TITLE_TRANSLATED}") | |
| # Check stop flag before processing | |
| if is_stop_requested(): | |
| print("📁 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # Check if automatic glossary generation is enabled | |
| enable_auto_glossary = os.getenv("ENABLE_AUTO_GLOSSARY", "1") == "1" | |
| # Check for manual glossary first (CSV only) | |
| manual_glossary_path = os.getenv("MANUAL_GLOSSARY") | |
| existing_glossary = None | |
| if manual_glossary_path and os.path.exists(manual_glossary_path): | |
| print(f"📁 Manual glossary detected: {os.path.basename(manual_glossary_path)}") | |
| try: | |
| with open(manual_glossary_path, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| # Treat as CSV text and stage it for merge; also copy to output for visibility | |
| target_path = os.path.join(output_dir, "glossary.csv") | |
| with open(target_path, 'w', encoding='utf-8') as f: | |
| f.write(content) | |
| print(f"📁 ✅ Manual CSV glossary copied to: {target_path}") | |
| existing_glossary = content | |
| # Skip automatic generation when manual glossary is loaded | |
| if not enable_auto_glossary: | |
| print(f"ℹ️ Automatic glossary generation disabled, using manual glossary only") | |
| return {} | |
| else: | |
| print(f"ℹ️ Skipping automatic glossary generation (manual glossary already loaded)") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ Could not copy manual glossary: {e}") | |
| print(f"📁 Proceeding with automatic generation...") | |
| # Check if auto-glossary is disabled without a manual glossary | |
| if not enable_auto_glossary: | |
| print(f"ℹ️ Automatic glossary generation is disabled and no manual glossary provided") | |
| return {} | |
| # Check for existing glossary from manual extraction | |
| # Avoid double-nesting when output_dir already ends with "Glossary" | |
| if os.path.basename(os.path.abspath(output_dir)).lower() == "glossary": | |
| glossary_folder_path = output_dir | |
| else: | |
| glossary_folder_path = os.path.join(output_dir, "Glossary") | |
| # existing_glossary may already be set by MANUAL_GLOSSARY above | |
| if os.path.exists(glossary_folder_path): | |
| for file in os.listdir(glossary_folder_path): | |
| if file.endswith("_glossary.json"): | |
| existing_path = os.path.join(glossary_folder_path, file) | |
| try: | |
| with open(existing_path, 'r', encoding='utf-8') as f: | |
| existing_content = f.read() | |
| existing_glossary = existing_content | |
| print(f"📁 Found existing glossary from manual extraction: {file}") | |
| break | |
| except Exception as e: | |
| print(f"⚠️ Could not load existing glossary: {e}") | |
| # Get configuration from environment variables (FRESH READ) | |
| min_frequency = int(os.getenv("GLOSSARY_MIN_FREQUENCY", "2")) | |
| max_names = int(os.getenv("GLOSSARY_MAX_NAMES", "50")) | |
| max_titles = int(os.getenv("GLOSSARY_MAX_TITLES", "30")) | |
| # Batch sizing: | |
| # - GUI uses BATCH_SIZE for concurrency/batching. | |
| # - Keep GLOSSARY_BATCH_SIZE for backward compatibility, but default to GUI's value. | |
| batch_size = int(os.getenv("GLOSSARY_BATCH_SIZE", os.getenv("BATCH_SIZE", "50"))) | |
| strip_honorifics = os.getenv("GLOSSARY_STRIP_HONORIFICS", "1") == "1" | |
| fuzzy_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| max_text_size = int(os.getenv("GLOSSARY_MAX_TEXT_SIZE", "0")) | |
| # DEBUG: Show what we're reading from environment | |
| max_sentences_env = os.getenv("GLOSSARY_MAX_SENTENCES", "200") | |
| print(f"🔍 [DEBUG] Reading GLOSSARY_MAX_SENTENCES from environment: '{max_sentences_env}'") | |
| max_sentences = int(max_sentences_env) | |
| print(f"🔍 [DEBUG] Converted to integer: {max_sentences}") | |
| include_all_characters_env = os.getenv("GLOSSARY_INCLUDE_ALL_CHARACTERS", "0") | |
| include_all_characters = include_all_characters_env == "1" | |
| include_gender_context_flag = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| print(f"📑 DEBUG: Include all characters (dynamic limit expansion) = '{include_all_characters_env}'") | |
| print(f"📑 Settings: Min frequency: {min_frequency}, Max names: {max_names}, Max titles: {max_titles}") | |
| print(f"📑 Strip honorifics: {'✅ Yes' if strip_honorifics else '❌ No'}") | |
| print(f"📑 Fuzzy matching threshold: {fuzzy_threshold}") | |
| print(f"📑 Max sentences for filtering: {max_sentences}") | |
| # Get custom prompt from environment | |
| custom_prompt = os.getenv("AUTO_GLOSSARY_PROMPT", "").strip() | |
| # Initialize to the default unified prompt when unset/empty. | |
| # Pattern-based extraction remains disabled elsewhere. | |
| if not custom_prompt: | |
| custom_prompt = DEFAULT_AUTO_GLOSARY_PROMPT3.strip() | |
| os.environ["AUTO_GLOSSARY_PROMPT"] = custom_prompt | |
| print("📑 AUTO_GLOSSARY_PROMPT not set - initialized to default unified prompt") | |
| def clean_html(html_text): | |
| """Remove HTML tags to get clean text""" | |
| soup = BeautifulSoup(html_text, 'html.parser') | |
| return soup.get_text() | |
| # Check stop before processing chapters | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped by user") | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| return {} | |
| # Get chapter split threshold, toggle, and filter mode | |
| chapter_split_threshold = int(os.getenv("GLOSSARY_CHAPTER_SPLIT_THRESHOLD", "100000")) | |
| chapter_split_enabled = os.getenv("GLOSSARY_ENABLE_CHAPTER_SPLIT", "1") == "1" | |
| filter_mode = os.getenv("GLOSSARY_FILTER_MODE", "all") # all, only_with_honorifics, only_without_honorifics | |
| # Check if parallel extraction is enabled for automatic glossary | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| batch_translation = os.getenv("BATCH_TRANSLATION", "0") == "1" | |
| # Prefer GUI's batch size; fall back to glossary batch size if needed. | |
| api_batch_size = int(os.getenv("BATCH_SIZE", os.getenv("GLOSSARY_BATCH_SIZE", "5"))) | |
| batching_mode = os.getenv("BATCHING_MODE", "direct") | |
| batch_group_size = int(os.getenv("BATCH_GROUP_SIZE", "3")) | |
| # Backward compatibility | |
| if os.getenv("CONSERVATIVE_BATCHING", "0") == "1": | |
| batching_mode = "conservative" | |
| # Log the settings | |
| print(f"📑 Filter mode: {filter_mode}") | |
| if extraction_workers > 1: | |
| print(f"📑 Parallel extraction enabled: {extraction_workers} workers") | |
| if batch_translation: | |
| print(f"📑 Batch API calls enabled: {api_batch_size} chunks per batch") | |
| print(f"📑 Batching mode: {batching_mode}") | |
| if batching_mode == "conservative": | |
| print(f"📑 Conservative group size: {batch_group_size}") | |
| all_text = ' '.join(clean_html(chapter["body"]) for chapter in chapters) | |
| print(f"📑 Processing {len(all_text):,} characters of text") | |
| # Apply smart filtering FIRST to check actual size needed | |
| use_smart_filter = os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1" | |
| effective_text_size = len(all_text) | |
| filtered_text_cache = None | |
| if use_smart_filter and custom_prompt: # Only apply for AI extraction | |
| print(f"📁 Smart filtering enabled - checking effective text size after filtering...") | |
| # Perform filtering ONCE and reuse for chunking | |
| filtered_sample, _ = _filter_text_for_glossary(all_text, min_frequency, max_sentences) | |
| filtered_text_cache = filtered_sample | |
| effective_text_size = len(filtered_sample) | |
| # Calculate token count using tiktoken | |
| try: | |
| import tiktoken | |
| enc = tiktoken.get_encoding("cl100k_base") | |
| token_count = len(enc.encode(filtered_sample)) | |
| print(f"📁 Text reduction: {len(all_text):,} → {effective_text_size:,} chars ({100*(1-effective_text_size/len(all_text)):.1f}% reduction) | {token_count:,} tokens") | |
| except: | |
| print(f"📁 Text reduction: {len(all_text):,} → {effective_text_size:,} chars ({100*(1-effective_text_size/len(all_text)):.1f}% reduction)") | |
| # Safety check: Calculate actual token count for chunking decision | |
| estimated_tokens = None | |
| try: | |
| import tiktoken | |
| enc = tiktoken.get_encoding("cl100k_base") | |
| estimated_tokens = len(enc.encode(filtered_text_cache if filtered_text_cache else all_text)) | |
| except: | |
| # Fallback estimate: 1 token ≈ 3-4 characters for Asian languages | |
| estimated_tokens = effective_text_size // 3 | |
| # Get output token limit (glossary-specific with fallback to global) | |
| max_output_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "65536"))) | |
| # Use compression factor to determine safe input limit (from CJK→English compression ratio) | |
| # Use glossary-specific compression factor with fallback to global | |
| compression_factor = float(os.getenv("GLOSSARY_COMPRESSION_FACTOR", os.getenv("COMPRESSION_FACTOR", "1.0"))) | |
| # Safe input limit is max_output divided by compression factor | |
| # (e.g., if compression is 0.7, output will be 70% of input, so we can use 1/0.7 = 1.43x for safety) | |
| safe_input_limit = int(max_output_tokens / max(compression_factor, 0.1)) if compression_factor > 0 else int(max_output_tokens * 0.8) | |
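| # Worked example: with max_output_tokens=65536 and compression_factor=0.7, | |
| # safe_input_limit = int(65536 / 0.7) = 93622 input tokens. | |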
| if estimated_tokens > safe_input_limit: | |
| # Only show detailed token logs if using token-based chunking (threshold == 0) | |
| if chapter_split_threshold == 0: | |
| print(f"⚠️ Text too large for single API call!") | |
| print(f" Estimated tokens: {estimated_tokens:,}") | |
| print(f" Safe input limit: {safe_input_limit:,} (based on {compression_factor:.2f}x compression factor and {max_output_tokens:,} max output tokens)") | |
| print(f" Will use ChapterSplitter for token-based chunking...") | |
| else: | |
| # Character-based threshold already set, just use it silently | |
| pass | |
| # Check if we need to split into chunks based on EFFECTIVE size after filtering | |
| needs_chunking = chapter_split_enabled and ( | |
| (chapter_split_threshold == 0 and estimated_tokens > safe_input_limit) or | |
| (chapter_split_threshold > 0 and effective_text_size > chapter_split_threshold) | |
| ) | |
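| # e.g. with chapter_split_threshold=100000 chars and effective_text_size=250000, needs_chunking | |
| # is True; with chapter_split_threshold=0 the decision is based purely on estimated_tokens | |
| # versus safe_input_limit. | |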
| if not chapter_split_enabled: | |
| print("📑 Chapter splitting disabled (GLOSSARY_ENABLE_CHAPTER_SPLIT=0) - processing without pre-splitting") | |
| if needs_chunking: | |
| # Prepare chunk processing | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| agg_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| # CLEAR incremental history if it exists to ensure 'all' file only contains current run data | |
| # This prevents it from growing indefinitely across multiple runs | |
| if os.path.exists(incremental_dir): | |
| try: | |
| import shutil | |
| # Safely clear the entire incremental folder | |
| for filename in os.listdir(incremental_dir): | |
| file_path = os.path.join(incremental_dir, filename) | |
| try: | |
| if os.path.isfile(file_path) or os.path.islink(file_path): | |
| os.unlink(file_path) | |
| elif os.path.isdir(file_path): | |
| shutil.rmtree(file_path) | |
| except Exception as e: | |
| print(f"⚠️ Failed to delete {file_path}: {e}") | |
| print(f"📑 Cleared incremental glossary folder: {incremental_dir}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to clear incremental history: {e}") | |
| # Ensure directory exists (if it was fully removed or didn't exist) | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| if chapter_split_threshold == 0: | |
| # Use ChapterSplitter for token-based intelligent chunking | |
| print(f"📑 Text exceeds safe token limit, using ChapterSplitter for token-based chunking...") | |
| from chapter_splitter import ChapterSplitter | |
| # Get the model name for the tokenizer | |
| model = os.getenv("MODEL", "gemini-2.0-flash") | |
| splitter = ChapterSplitter(model_name=model, target_tokens=safe_input_limit) | |
| # Get the text to split (filtered or raw) | |
| text_to_split = filtered_text_cache if (use_smart_filter and custom_prompt and filtered_text_cache) else all_text | |
| # Use ChapterSplitter to intelligently split based on tokens | |
| split_results = splitter.split_chapter(text_to_split, max_tokens=safe_input_limit) | |
| chunks_to_process = [(i, chunk) for i, (chunk, _, _) in enumerate(split_results, 1)] | |
| print(f"📑 ChapterSplitter created {len(chunks_to_process)} token-balanced chunks") | |
| all_glossary_entries = [] | |
| else: | |
| # Use character-based splitting with fixed threshold | |
| print(f"📑 Effective text exceeds {chapter_split_threshold:,} chars, will process in chunks...") | |
| # If using smart filter, we need to split the FILTERED text, not raw text | |
| if use_smart_filter and custom_prompt: | |
| # Split the filtered text into chunks (reuse cached filtered text) | |
| filtered_text = filtered_text_cache if filtered_text_cache is not None else _filter_text_for_glossary(all_text, min_frequency, max_sentences)[0] | |
| chunks_to_process = [] | |
| # Split filtered text into chunks of appropriate size | |
| chunk_size = chapter_split_threshold | |
| for i in range(0, len(filtered_text), chunk_size): | |
| chunk_text = filtered_text[i:i + chunk_size] | |
| chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) | |
| print(f"📑 Split filtered text into {len(chunks_to_process)} chunks") | |
| all_glossary_entries = [] | |
| else: | |
| # Original logic for unfiltered text | |
| all_glossary_entries = [] | |
| chunk_size = 0 | |
| chunk_chapters = [] | |
| chunks_to_process = [] | |
| for idx, chapter in enumerate(chapters): | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped by user") | |
| return all_glossary_entries | |
| chapter_text = clean_html(chapter["body"]) | |
| chunk_size += len(chapter_text) | |
| chunk_chapters.append(chapter) | |
| # Process chunk when it reaches threshold or last chapter | |
| if chunk_size >= chapter_split_threshold or idx == len(chapters) - 1: | |
| chunk_text = ' '.join(clean_html(ch["body"]) for ch in chunk_chapters) | |
| chunks_to_process.append((len(chunks_to_process) + 1, chunk_text)) | |
| # Reset for next chunk | |
| chunk_size = 0 | |
| chunk_chapters = [] | |
| print(f"📑 Split into {len(chunks_to_process)} chunks for processing") | |
| # Batch toggle decides concurrency: ON => parallel API calls; OFF => strict sequential | |
| if batch_translation and custom_prompt and len(chunks_to_process) > 1: | |
| print(f"📑 Processing chunks in batch mode with {api_batch_size} chunks per batch...") | |
| # Set fast mode for batch processing | |
| os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "1" | |
| # Use batch API calls for AI extraction | |
| all_csv_lines = _process_chunks_batch_api( | |
| chunks_to_process, custom_prompt, language, | |
| min_frequency, max_names, max_titles, | |
| output_dir, strip_honorifics, fuzzy_threshold, | |
| filter_mode, api_batch_size, extraction_workers, max_sentences | |
| ) | |
| # Reset validation mode | |
| os.environ["GLOSSARY_SKIP_ALL_VALIDATION"] = "0" | |
| print(f"📑 All chunks completed. Aggregated raw lines: {len(all_csv_lines)}") | |
| # Process all collected entries at once (even if empty) | |
| # Add header so downstream steps can work uniformly | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| if include_description: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name,gender") | |
| else: | |
| all_csv_lines.insert(0, "type,raw_name,translated_name") | |
| # Merge with any on-disk glossary first (to avoid overwriting user edits) | |
| on_disk_path = os.path.join(output_dir, "glossary.csv") | |
| if os.path.exists(on_disk_path): | |
| try: | |
| with open(on_disk_path, 'r', encoding='utf-8') as f: | |
| on_disk_content = f.read() | |
| all_csv_lines = _merge_csv_entries(all_csv_lines, on_disk_content, strip_honorifics, language) | |
| print("📑 Merged with existing on-disk glossary") | |
| except Exception as e: | |
| print(f"⚠️ Failed to merge with existing on-disk glossary: {e}") | |
| # Apply filter mode if needed | |
| if filter_mode == "only_with_honorifics": | |
| filtered = [all_csv_lines[0]] # Keep header | |
| for line in all_csv_lines[1:]: | |
| parts = line.split(',', 2) | |
| if len(parts) >= 3 and parts[0] == "character": | |
| filtered.append(line) | |
| all_csv_lines = filtered | |
| print(f"📑 Filter applied: {len(all_csv_lines)-1} character entries with honorifics kept") | |
| # Ensure book title header is present before dedup/sort when requested | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| all_csv_lines = _ensure_book_title_csv_lines(all_csv_lines) | |
| # Apply fuzzy deduplication (deferred until after all chunks) | |
| try: | |
| print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| all_csv_lines = _deduplicate_glossary_with_fuzzy(all_csv_lines, fuzzy_threshold) | |
| except Exception as e: | |
| print(f"⚠️ Deduplication error: {e} — continuing without dedup") | |
| # Sort by type and name | |
| print(f"📑 Sorting glossary by type and name...") | |
| header = all_csv_lines[0] | |
| entries = all_csv_lines[1:] | |
| if entries: | |
| entries.sort(key=_csv_sort_key) | |
| all_csv_lines = [header] + entries | |
| # Save | |
| # Check format preference | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| # Convert to token-efficient format | |
| all_csv_lines = _convert_to_token_efficient_format(all_csv_lines) | |
| # Final sanitize to prevent stray headers | |
| all_csv_lines = _sanitize_final_glossary_lines(all_csv_lines, use_legacy_format) | |
| # If user requested stop, avoid writing new glossary to disk | |
| if is_stop_requested(): | |
| print("🛑 Stop requested — skipping final glossary write (batch mode)") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # If user stopped and we have no entries, keep existing file to avoid wiping it | |
| if is_stop_requested() and len(all_csv_lines) <= 1: | |
| print("🛑 Stop requested with no new entries — preserving existing glossary.csv") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # Save | |
| csv_content = '\n'.join(all_csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| # Verify file exists; fallback direct write if needed | |
| if not os.path.exists(glossary_path): | |
| try: | |
| with open(glossary_path, 'w', encoding='utf-8') as f: | |
| f.write(csv_content) | |
| print("📑 Fallback write succeeded for glossary.csv") | |
| except Exception as e: | |
| print(f"❌ Failed to write glossary.csv: {e}") | |
| print(f"\n📑 ✅ GLOSSARY SAVED!") | |
| print(f"📑 ✅ AI GLOSSARY SAVED!") | |
| c_count, t_count, total = _count_glossary_entries(all_csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| return _parse_csv_to_dict(csv_content) | |
| else: | |
| # Strict sequential processing (one API call at a time) | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") | |
| _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| # Tell the extractor each chunk is already filtered to avoid re-running smart filter per chunk | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" | |
| try: | |
| for pos, (chunk_idx, chunk_text) in enumerate(chunks_to_process, start=1): | |
| if is_stop_requested(): | |
| break | |
| print(f"📑 Processing chunk {chunk_idx}/{len(chunks_to_process)} ({len(chunk_text):,} chars)...") | |
| if custom_prompt: | |
| chunk_glossary = _extract_with_custom_prompt( | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, # Don't pass existing glossary to chunks | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback, | |
| chunk_pos=pos, | |
| total_chunks=len(chunks_to_process), | |
| ) | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping chunk glossary extraction (pattern fallback disabled)") | |
| chunk_glossary = {} | |
| # Normalize to CSV lines and aggregate | |
| chunk_lines = [] | |
| if isinstance(chunk_glossary, list): | |
| for line in chunk_glossary: | |
| if line and not line.startswith('type,'): | |
| all_glossary_entries.append(line) | |
| chunk_lines.append(line) | |
| else: | |
| for raw_name, translated_name in chunk_glossary.items(): | |
| entry_type = "character" if _has_honorific(raw_name) else "term" | |
| line = f"{entry_type},{raw_name},{translated_name}" | |
| all_glossary_entries.append(line) | |
| chunk_lines.append(line) | |
| # Incremental update (per chunk file inside incremental_glossary folder) | |
| try: | |
| _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode) | |
| print(f"📑 Incremental write: chunk {chunk_idx} (+{len(chunk_lines)} entries)") | |
| except Exception as e2: | |
| print(f"⚠️ Incremental write failed for chunk {chunk_idx}: {e2}") | |
| finally: | |
| if _prev_defer is None: | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| if _prev_filtered is None: | |
| os.environ.pop("_CHUNK_ALREADY_FILTERED", None) | |
| else: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered | |
| if _prev_force_disable is None: | |
| os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) | |
| else: | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable | |
| # Build CSV from aggregated entries | |
| print(f"📑 DEBUG: all_glossary_entries count before merge: {len(all_glossary_entries)}") | |
| # START WITH INCREMENTAL GLOSSARY AS BASE IF IT EXISTS AND IS LARGER | |
| # This ensures that if memory was lost (e.g. during a long sequential run), we rely on the disk backup | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| incremental_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| base_entries = list(all_glossary_entries) | |
| using_incremental_as_base = False | |
| if os.path.exists(incremental_path): | |
| try: | |
| with open(incremental_path, 'r', encoding='utf-8') as f: | |
| inc_content = f.read() | |
| # Simple parse to count lines/entries | |
| inc_lines = [line for line in inc_content.split('\n') if line.strip() and not line.startswith('type,')] | |
| print(f"📑 Found incremental glossary: {len(inc_lines)} entries (Memory: {len(all_glossary_entries)} entries)") | |
| if len(inc_lines) > len(all_glossary_entries): | |
| print("📑 🔄 Incremental glossary is larger than memory - using it as primary source") | |
| # The header is added during csv_lines construction below, so simply replace | |
| # base_entries with the incremental entries here. | |
| base_entries = inc_lines | |
| using_incremental_as_base = True | |
| except Exception as e: | |
| print(f"⚠️ Failed to check incremental glossary: {e}") | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| if include_description: | |
| csv_lines = ["type,raw_name,translated_name,gender,description"] + base_entries | |
| elif include_gender_context: | |
| csv_lines = ["type,raw_name,translated_name,gender"] + base_entries | |
| else: | |
| csv_lines = ["type,raw_name,translated_name"] + base_entries | |
| # If we used incremental as base, we must merge MEMORY into it (to capture the last chunk if it wasn't in incremental yet) | |
| if using_incremental_as_base and all_glossary_entries: | |
| print("📑 Merging memory entries into incremental base...") | |
| # Create a mini-CSV for memory entries | |
| mem_csv = ["type,raw_name,translated_name"] + all_glossary_entries | |
| csv_lines = _merge_csv_entries(csv_lines, '\n'.join(mem_csv), strip_honorifics, language) | |
| # Merge with any provided existing glossary AND on-disk glossary to avoid overwriting | |
| on_disk_path = os.path.join(output_dir, "glossary.csv") | |
| merge_sources = [] | |
| if existing_glossary: | |
| merge_sources.append(existing_glossary) | |
| # We already handled incremental above as the base, so we don't add it to merge_sources here | |
| if os.path.exists(on_disk_path): | |
| try: | |
| with open(on_disk_path, 'r', encoding='utf-8') as f: | |
| merge_sources.append(f.read()) | |
| print("📑 Found existing on-disk glossary to merge") | |
| except Exception as e: | |
| print(f"⚠️ Failed to read on-disk glossary for merging: {e}") | |
| # Also merge the main on-disk glossary if it was present at start | |
| if existing_glossary_content: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary_content, strip_honorifics, language) | |
| for src in merge_sources: | |
| before_merge_count = len(csv_lines) | |
| csv_lines = _merge_csv_entries(csv_lines, src, strip_honorifics, language) | |
| print(f"📑 DEBUG: Merged source. Count: {before_merge_count} -> {len(csv_lines)}") | |
| # Apply filter mode to final results | |
| csv_lines = _filter_csv_by_mode(csv_lines, filter_mode) | |
| # Ensure book title entry before dedup/sort when requested | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| csv_lines = _ensure_book_title_csv_lines(csv_lines) | |
| # Apply fuzzy deduplication (deferred until after all chunks) | |
| print(f"📑 Applying fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| original_count = len(csv_lines) - 1 | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| deduped_count = len(csv_lines) - 1 | |
| if original_count > deduped_count: | |
| print(f"📑 Removed {original_count - deduped_count} duplicate entries") | |
| # Sort by type and name | |
| print(f"📑 Sorting glossary by type and name...") | |
| header = csv_lines[0] | |
| entries = csv_lines[1:] | |
| entries.sort(key=_csv_sort_key) | |
| csv_lines = [header] + entries | |
| # Token-efficient format if enabled | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| csv_lines = _convert_to_token_efficient_format(csv_lines) | |
| # Final sanitize to prevent stray headers and section titles at end | |
| csv_lines = _sanitize_final_glossary_lines(csv_lines, use_legacy_format) | |
| # If user requested stop, avoid overwriting files; preserve existing when possible | |
| if is_stop_requested(): | |
| if len(csv_lines) <= 1 and os.path.exists(on_disk_path): | |
| print("🛑 Stop requested with no new entries — preserving existing glossary.csv") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| print("🛑 Stop requested — skipping final glossary write (chunked mode)") | |
| return _parse_csv_to_dict(existing_glossary_content) if existing_glossary_content else {} | |
| # Copy glossary extension file if configured | |
| add_additional_glossary = os.getenv('ADD_ADDITIONAL_GLOSSARY', '0') == '1' | |
| additional_glossary_path = os.getenv('ADDITIONAL_GLOSSARY_PATH', '') | |
| if add_additional_glossary and additional_glossary_path and os.path.exists(additional_glossary_path): | |
| print(f"📜 Processing glossary extension: {os.path.basename(additional_glossary_path)}") | |
| try: | |
| import shutil | |
| file_ext = os.path.splitext(additional_glossary_path)[1].lower() | |
| # Target path in output directory | |
| target_path = os.path.join(output_dir, "glossary_extension.csv") | |
| if file_ext == '.csv': | |
| # Copy CSV directly | |
| shutil.copy2(additional_glossary_path, target_path) | |
| print(f"📜 Copied glossary extension to {os.path.basename(target_path)}") | |
| elif file_ext in ['.txt', '.json', '.pdf']: | |
| # Convert non-CSV formats to CSV | |
| converted_lines = [] | |
| if file_ext == '.txt': | |
| with open(additional_glossary_path, 'r', encoding='utf-8') as f: | |
| content = f.read() | |
| # Try to parse as CSV-like format | |
| for line in content.strip().split('\n'): | |
| if line.strip(): | |
| converted_lines.append(line.strip()) | |
| elif file_ext == '.json': | |
| import json | |
| with open(additional_glossary_path, 'r', encoding='utf-8') as f: | |
| data = json.load(f) | |
| # Add CSV header | |
| converted_lines.append("type,raw_name,translated_name") | |
| # Convert JSON to CSV format | |
| if isinstance(data, dict): | |
| for key, value in data.items(): | |
| if isinstance(value, dict): | |
| raw = value.get('raw', key) | |
| translated = value.get('translated', value.get('translation', key)) | |
| entry_type = value.get('type', 'term') | |
| converted_lines.append(f"{entry_type},{raw},{translated}") | |
| else: | |
| converted_lines.append(f"term,{key},{value}") | |
| elif isinstance(data, list): | |
| for entry in data: | |
| if isinstance(entry, dict): | |
| entry_type = entry.get('type', 'term') | |
| raw = entry.get('raw_name', entry.get('raw', '')) | |
| translated = entry.get('translated_name', entry.get('translated', '')) | |
| if raw and translated: | |
| converted_lines.append(f"{entry_type},{raw},{translated}") | |
| elif file_ext == '.pdf': | |
| # Try to extract text from PDF and save as CSV | |
| try: | |
| import PyPDF2 | |
| with open(additional_glossary_path, 'rb') as f: | |
| pdf_reader = PyPDF2.PdfReader(f) | |
| pdf_text = [] | |
| for page in pdf_reader.pages: | |
| pdf_text.append(page.extract_text()) | |
| text_content = '\n'.join(pdf_text) | |
| # Try to parse as CSV | |
| for line in text_content.strip().split('\n'): | |
| if line.strip(): | |
| converted_lines.append(line.strip()) | |
| except ImportError: | |
| print("⚠️ PyPDF2 not available, cannot read PDF. Install with: pip install PyPDF2") | |
| except Exception as pdf_error: | |
| print(f"⚠️ Could not read PDF: {pdf_error}") | |
| # Write converted content to CSV | |
| if converted_lines: | |
| with open(target_path, 'w', encoding='utf-8') as f: | |
| f.write('\n'.join(converted_lines)) | |
| print(f"📜 Converted and saved glossary extension to {os.path.basename(target_path)}") | |
| except Exception as e: | |
| print(f"⚠️ Failed to copy glossary extension: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| try: | |
| # Save | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| # Verify file exists; fallback direct write if needed | |
| if not os.path.exists(glossary_path): | |
| try: | |
| with open(glossary_path, 'w', encoding='utf-8') as f: | |
| f.write(csv_content) | |
| print("📑 Fallback write succeeded for glossary.csv") | |
| except Exception as e: | |
| print(f"❌ Failed to write glossary.csv: {e}") | |
| finally: | |
| print(f"\n📑 ✅ CHUNKED GLOSSARY SAVED!") | |
| print(f"📑 ✅ AI GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| c_count, t_count, total = _count_glossary_entries(csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| return _parse_csv_to_dict(csv_content) | |
| # Original single-text processing | |
| if custom_prompt: | |
| # Pass cached filtered text if available to avoid re-filtering | |
| text_to_process = filtered_text_cache if filtered_text_cache is not None else all_text | |
| already_filtered = filtered_text_cache is not None | |
| # Set environment flag to indicate text is already filtered | |
| if already_filtered: | |
| os.environ["_TEXT_ALREADY_FILTERED"] = "1" | |
| try: | |
| return _extract_with_custom_prompt(custom_prompt, text_to_process, language, | |
| min_frequency, max_names, max_titles, | |
| existing_glossary, output_dir, | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback) | |
| finally: | |
| if already_filtered: | |
| os.environ.pop("_TEXT_ALREADY_FILTERED", None) | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping automatic glossary generation (pattern fallback disabled)") | |
| return {} | |
| total_time = time.time() - total_start_time | |
| print(f"\n📑 ========== GLOSSARY GENERATION COMPLETE ==========") | |
| print(f"📑 Total time: {total_time:.1f}s") | |
| print(f"📑 Performance breakdown:") | |
| print(f"📑 - Extraction: {0:.1f}s") | |
| print(f"📑 - API calls: {0:.1f}s") | |
| print(f"📑 - Frequency checking: {0:.1f}s") | |
| print(f"📑 - Deduplication: {0:.1f}s") | |
| print(f"📑 - File I/O: {0:.1f}s") | |
| print(f"📑 ================================================") | |
| return result | |
| def _convert_to_token_efficient_format(csv_lines): | |
| """Convert CSV lines to token-efficient format with sections and asterisks""" | |
| if len(csv_lines) <= 1: | |
| return csv_lines | |
| header = csv_lines[0] | |
| entries = csv_lines[1:] | |
| # Group by type (only from valid CSV lines) | |
| import re as _re | |
| import csv as _csv | |
| grouped = {} | |
| for line in entries: | |
| if not line.strip(): | |
| continue | |
| # Only accept proper CSV rows: at least 3 fields and a sane type token | |
| parts_full = [p.strip() for p in line.split(',')] | |
| if len(parts_full) < 3: | |
| continue | |
| entry_type = parts_full[0].lower() | |
| if not _re.match(r'^[a-z_]+$', entry_type): | |
| continue | |
| if entry_type not in grouped: | |
| grouped[entry_type] = [] | |
| grouped[entry_type].append(line) | |
| # Rebuild with token-efficient format | |
| result = [] | |
| # Extract column headers from CSV to show in dynamic header | |
| columns = ['translated_name', 'raw_name'] | |
| # Check for gender and description columns | |
| try: | |
| header_parts = [p.strip() for p in next(_csv.reader([header]))] if header else [] | |
| except Exception: | |
| header_parts = [p.strip() for p in header.split(',')] if header else [] | |
| if 'gender' in header_parts: | |
| columns.append('gender') | |
| if 'description' in header_parts: | |
| columns.append('description') | |
| # Add any other custom fields (exclude type, raw_name, translated_name, gender, description) | |
| standard_cols = {'type', 'raw_name', 'translated_name', 'gender', 'description'} | |
| for col in header_parts: | |
| if col.lower() not in standard_cols and col: | |
| columns.append(col) | |
| result.append(f"Glossary Columns: {', '.join(columns)}\n") | |
| # Process in order: character first, then term, then others | |
| type_order = ['book', 'character', 'term'] + [t for t in grouped.keys() if t not in ['book', 'character', 'term']] | |
| # Precompute column indices for richer rendering | |
| lower_header = [h.lower() for h in header_parts] | |
| def _idx(name): | |
| return lower_header.index(name) if name in lower_header else -1 | |
| type_idx = _idx('type') | |
| raw_idx = _idx('raw_name') | |
| trans_idx = _idx('translated_name') | |
| gender_idx = _idx('gender') | |
| desc_idx = _idx('description') | |
| for entry_type in type_order: | |
| if entry_type not in grouped: | |
| continue | |
| entries = grouped[entry_type] | |
| # Add section header | |
| section_name = entry_type.upper() + 'S' if not entry_type.upper().endswith('S') else entry_type.upper() | |
| result.append(f"=== {section_name} ===") | |
| # Add entries in new format | |
| for line in entries: | |
| try: | |
| parts = next(_csv.reader([line])) | |
| except Exception: | |
| parts = [p.strip() for p in line.split(',')] | |
| if header_parts and len(parts) < len(header_parts): | |
| parts += [''] * (len(header_parts) - len(parts)) | |
| elif header_parts and len(parts) > len(header_parts): | |
| # If unquoted commas split the description, merge overflow into the description column | |
| if desc_idx != -1 and desc_idx < len(header_parts): | |
| parts = parts[:desc_idx] + [",".join(parts[desc_idx:])] | |
| else: | |
| parts = parts[:len(header_parts)] | |
| # Extract core fields using header positions when available | |
| entry_type_val = (parts[type_idx] if type_idx != -1 and len(parts) > type_idx else entry_type).lower() | |
| raw_name = parts[raw_idx] if raw_idx != -1 and len(parts) > raw_idx else (parts[1] if len(parts) > 1 else '') | |
| translated_name = parts[trans_idx] if trans_idx != -1 and len(parts) > trans_idx else (parts[2] if len(parts) > 2 else '') | |
| if not raw_name or not translated_name: | |
| continue | |
| entry_line = f"* {translated_name} ({raw_name})" | |
| # Gender support (any type that supplies it) | |
| if gender_idx != -1 and len(parts) > gender_idx: | |
| gender_val = parts[gender_idx].strip() | |
| if gender_val and gender_val != 'Unknown': | |
| entry_line += f" [{gender_val}]" | |
| # Description + extra fields | |
| desc_val = parts[desc_idx].strip() if desc_idx != -1 and len(parts) > desc_idx else '' | |
| # Fallback: if no description column exists in header but there are trailing columns, | |
| # join everything after the last known core column as description. | |
| if desc_idx == -1: | |
| core_max = max(idx for idx in [type_idx, raw_idx, trans_idx, gender_idx] if idx != -1) if any(idx != -1 for idx in [type_idx, raw_idx, trans_idx, gender_idx]) else 2 | |
| if len(parts) > core_max + 1: | |
| desc_tail = ",".join(parts[core_max + 1:]).strip() | |
| if desc_tail and not desc_val: | |
| desc_val = desc_tail | |
| extra_segments = [] | |
| for idx, col in enumerate(header_parts): | |
| col_lower = col.lower() | |
| if col_lower in ['type', 'raw_name', 'translated_name', 'gender', 'description']: | |
| continue | |
| if idx < len(parts): | |
| val = parts[idx].strip() | |
| if val: | |
| extra_segments.append(f"{col}: {val}") | |
| base_desc = desc_val | |
| if not base_desc and extra_segments: | |
| base_desc = extra_segments[0] | |
| extra_segments = extra_segments[1:] | |
| if base_desc: | |
| entry_line += f": {base_desc}" | |
| for seg in extra_segments: | |
| entry_line += f" | {seg}" | |
| result.append(entry_line) | |
| result.append("") # Blank line between sections | |
| return result | |
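| # Illustrative sketch (defined for documentation only, never called): roughly what | |
| # _convert_to_token_efficient_format produces for a couple of CSV rows. The sample | |
| # entries below are hypothetical. | |
| def _example_token_efficient_output(): | |
|     sample_csv = [ | |
|         "type,raw_name,translated_name,gender,description", | |
|         "character,김상혁,Kim Sang-hyuk,male,Swordsman of the Northern Sect", | |
|         "term,빙검술,Frost Blade Technique,,Signature sword art", | |
|     ] | |
|     for line in _convert_to_token_efficient_format(sample_csv): | |
|         # e.g. "* Kim Sang-hyuk (김상혁) [male]: Swordsman of the Northern Sect" | |
|         print(line) | |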
| def _count_glossary_entries(lines, use_legacy_format=False): | |
| """Return (char_count, term_count, total_count) for either format.""" | |
| if not lines: | |
| return 0, 0, 0 | |
| if use_legacy_format: | |
| data = lines[1:] if lines and lines[0].lower().startswith('type,raw_name') else lines | |
| char_count = sum(1 for ln in data if ln.startswith('character,')) | |
| term_count = sum(1 for ln in data if ln.startswith('term,')) | |
| total = sum(1 for ln in data if ln and ',' in ln) | |
| return char_count, term_count, total | |
| # token-efficient | |
| current = None | |
| char_count = term_count = total = 0 | |
| for ln in lines: | |
| s = ln.strip() | |
| if s.startswith('=== ') and 'CHARACTER' in s.upper(): | |
| current = 'character' | |
| continue | |
| if s.startswith('=== ') and 'TERM' in s.upper(): | |
| current = 'term' | |
| continue | |
| if s.startswith('* '): | |
| total += 1 | |
| if current == 'character': | |
| char_count += 1 | |
| elif current == 'term': | |
| term_count += 1 | |
| return char_count, term_count, total | |
| def _sanitize_final_glossary_lines(lines, use_legacy_format=False): | |
| """Remove stray CSV headers and normalize header placement before saving. | |
| - In legacy CSV mode, ensure exactly one header at the very top. | |
| - In token-efficient mode, remove any CSV header lines entirely. | |
| """ | |
| header_norm = "type,raw_name,translated_name" | |
| if not lines: | |
| return lines | |
| if use_legacy_format: | |
| sanitized = [] | |
| for ln in lines: | |
| txt = ln.strip() | |
| # drop every CSV header here; a single canonical header is inserted at the top below | |
| if txt.lower().startswith("type,raw_name"): | |
| continue | |
| sanitized.append(ln) | |
| # ensure exactly one header at the very top | |
| sanitized.insert(0, header_norm) | |
| return sanitized | |
| else: | |
| # remove any CSV header lines anywhere and duplicate top headers/sections | |
| cleaned = [] | |
| glossary_header_seen = False | |
| for i, ln in enumerate(lines): | |
| txt = ln.strip() | |
| low = txt.lower() | |
| # Drop CSV headers | |
| if low.startswith("type,raw_name"): | |
| continue | |
| # Keep only the first main glossary header | |
| if low.startswith("glossary:"): | |
| if glossary_header_seen: | |
| continue | |
| glossary_header_seen = True | |
| cleaned.append(ln) | |
| continue | |
| # Remove bogus section like '=== GLOSSARY: ... ===' | |
| if low.startswith("=== glossary:"): | |
| continue | |
| cleaned.append(ln) | |
| return cleaned | |
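| # Small sketch (never called): how legacy-mode sanitization handles stray CSV | |
| # headers. The input rows are hypothetical. | |
| def _example_sanitize_legacy_headers(): | |
|     messy = [ | |
|         "character,홍길동,Hong Gil-dong", | |
|         "type,raw_name,translated_name", | |
|         "term,마나,Mana", | |
|         "type,raw_name,translated_name", | |
|     ] | |
|     cleaned = _sanitize_final_glossary_lines(messy, use_legacy_format=True) | |
|     # Expected (per the docstring): one canonical header at the very top, strays dropped. | |
|     print("\n".join(cleaned)) | |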
| def _process_chunks_batch_api(chunks_to_process, custom_prompt, language, | |
| min_frequency, max_names, max_titles, | |
| output_dir, strip_honorifics, fuzzy_threshold, | |
| filter_mode, api_batch_size, extraction_workers, max_sentences=200): | |
| """Process chunks using batch API calls for AI extraction with thread delay. | |
| IMPORTANT: when a stop is requested, we must stop *submitting* new API work immediately. | |
| Any already in-flight requests may finish (graceful stop) or be aborted by unified_api_client | |
| cancellation (immediate stop). | |
| """ | |
| print(f"📑 Using batch API mode with {api_batch_size} chunks per batch") | |
| # Graceful stop semantics: | |
| # - If GRACEFUL_STOP=1 and WAIT_FOR_CHUNKS=1: stop submitting *new* work, but do NOT cancel in-flight. | |
| # - If WAIT_FOR_CHUNKS=0: we will only "wait for in-flight" if ALL chunks were already submitted. | |
| # If any chunk is still pending/not-submitted when stop is raised, escalate to full-stop. | |
| graceful_stop = (os.getenv('GRACEFUL_STOP') == '1') | |
| wait_for_chunks = (os.getenv('WAIT_FOR_CHUNKS') == '1') | |
| # Ensure we defer saving and heavy merging when processing chunks | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| # Get thread submission delay | |
| thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) | |
| if thread_delay > 0: | |
| print(f"📑 Thread submission delay: {thread_delay}s between parallel calls") | |
| # CHANGE: Collect raw CSV lines instead of dictionary | |
| all_csv_lines = [] # Collect all entries as CSV lines | |
| total_chunks = len(chunks_to_process) | |
| completed_chunks = 0 | |
| # Ensure per-chunk smart filtering is disabled globally during batch processing | |
| _prev_filtered = os.getenv("_CHUNK_ALREADY_FILTERED") | |
| _prev_force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER") | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = "1" | |
| # Concurrency: follow GUI batch size (BATCH_SIZE). | |
| # NOTE: EXTRACTION_WORKERS is used for *chapter extraction*/CPU work; it should not cap API concurrency. | |
| # If you want to throttle API concurrency, use BATCH_SIZE (and/or SEND_INTERVAL_SECONDS). | |
| try: | |
| api_batch_size = int(api_batch_size) | |
| except Exception: | |
| api_batch_size = 1 | |
| api_batch_size = max(1, api_batch_size) | |
| max_workers = min(api_batch_size, len(chunks_to_process)) | |
| max_workers = max(1, max_workers) | |
| # Useful debug when users think batching isn't applying | |
| try: | |
| send_interval = os.getenv("SEND_INTERVAL_SECONDS", "") | |
| thread_delay_env = os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "") | |
| print(f"📑 DEBUG: BATCH_SIZE={api_batch_size}, EXTRACTION_WORKERS={extraction_workers}, SEND_INTERVAL_SECONDS={send_interval}, THREAD_SUBMISSION_DELAY_SECONDS={thread_delay_env}") | |
| except Exception: | |
| pass | |
| print(f"📑 Processing {len(chunks_to_process)} chunks with up to {max_workers} concurrent API calls...") | |
| # Submit incrementally so Stop can prevent queued work from ever starting. | |
| from concurrent.futures import wait, FIRST_COMPLETED | |
| pending = list(chunks_to_process) | |
| next_pos = 1 | |
| # Track work in three stages: | |
| # - executor_submitted: submitted to our ThreadPoolExecutor (NOT what the user means by "sent") | |
| # - sent_chunks: requests that actually transitioned to in-flight (i.e., after api stagger/delay) | |
| # - completed_chunks_local: futures that completed (success or failure) | |
| executor_submitted = 0 | |
| completed_chunks_local = 0 | |
| sent_chunks = set() # set[int] of chunk_pos that have actually been sent (in-flight) | |
| def _status_snapshot(*, in_flight_count: int) -> dict: | |
| total = int(total_chunks or 0) | |
| pend = int(len(pending)) | |
| # "all_sent" means every chunk call has actually begun sending (post-delay) at least once. | |
| all_sent = (total > 0 and len(sent_chunks) >= total) | |
| # Keep legacy fields for compatibility/debugging, but note "submitted" here is executor-submitted. | |
| all_submitted = (executor_submitted >= total and pend == 0) | |
| return { | |
| "pid": os.getpid(), | |
| "ts": time.time(), | |
| "total_chunks": total, | |
| "executor_submitted": int(executor_submitted), | |
| "submitted_chunks": int(executor_submitted), | |
| "sent_chunks": int(len(sent_chunks)), | |
| "all_sent": bool(all_sent), | |
| "completed_chunks": int(completed_chunks_local), | |
| "in_flight": int(in_flight_count), | |
| "pending": pend, | |
| "all_submitted": bool(all_submitted), | |
| "graceful_stop": bool(graceful_stop), | |
| "wait_for_chunks": bool(wait_for_chunks), | |
| "stop_requested": bool(is_stop_requested()), | |
| } | |
| # Monitor watchdog entries to detect when requests actually transition to "in_flight" (sent). | |
| # This matches the user's definition of "submitted" (after API delay/stagger). | |
| _sent_monitor_stop = threading.Event() | |
| def _sent_monitor(): | |
| try: | |
| import unified_api_client as _uac | |
| except Exception: | |
| return | |
| # Regex for the context we set in _extract_with_custom_prompt: "auto glossary (i/N)" | |
| rx = re.compile(r"auto\s+glossary\s*\(\s*(\d+)\s*/\s*(\d+)\s*\)", re.IGNORECASE) | |
| while not _sent_monitor_stop.is_set(): | |
| try: | |
| st = _uac.get_api_watchdog_state() if hasattr(_uac, 'get_api_watchdog_state') else {} | |
| entries = st.get('in_flight_entries', []) if isinstance(st, dict) else [] | |
| if not isinstance(entries, list): | |
| entries = [] | |
| for e in entries: | |
| if not isinstance(e, dict): | |
| continue | |
| if e.get('status') != 'in_flight': | |
| continue | |
| ctx = e.get('context') or e.get('label') or '' | |
| m = rx.search(str(ctx)) | |
| if not m: | |
| continue | |
| pos = int(m.group(1)) | |
| tot = int(m.group(2)) | |
| if tot == int(total_chunks or 0) and 1 <= pos <= tot: | |
| if pos not in sent_chunks: | |
| sent_chunks.add(pos) | |
| # Update status file periodically | |
| _write_glossary_status(_status_snapshot(in_flight_count=int(st.get('in_flight', 0) or 0) if isinstance(st, dict) else 0)) | |
| except Exception: | |
| pass | |
| time.sleep(0.1) | |
| try: | |
| t_mon = threading.Thread(target=_sent_monitor, name="GlossarySentMonitor", daemon=True) | |
| t_mon.start() | |
| except Exception: | |
| t_mon = None | |
| # Initialize status file early | |
| _write_glossary_status(_status_snapshot(in_flight_count=0)) | |
| def _submit_one(executor, pos, chunk_idx, chunk_text, *, last_submission_time: float): | |
| if is_stop_requested(): | |
| return None | |
| # Apply thread submission delay | |
| if thread_delay > 0 and last_submission_time > 0: | |
| time_since_last = time.time() - last_submission_time | |
| if time_since_last < thread_delay: | |
| sleep_time = thread_delay - time_since_last | |
| print(f"🧵 Thread delay: {sleep_time:.1f}s for chunk {chunk_idx}") | |
| time.sleep(sleep_time) | |
| fut = executor.submit( | |
| _extract_with_custom_prompt, | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, strip_honorifics, | |
| fuzzy_threshold, filter_mode, max_sentences, | |
| log_callback=None, | |
| chunk_pos=pos, | |
| total_chunks=total_chunks, | |
| ) | |
| return fut | |
| with ThreadPoolExecutor(max_workers=max_workers) as executor: | |
| futures = {} # future -> chunk_idx | |
| last_submission_time = 0.0 | |
| # Prime the worker pool | |
| while pending and len(futures) < max_workers and not is_stop_requested(): | |
| chunk_idx, chunk_text = pending.pop(0) | |
| fut = _submit_one(executor, next_pos, chunk_idx, chunk_text, last_submission_time=last_submission_time) | |
| if fut is False or fut is None: | |
| break | |
| futures[fut] = chunk_idx | |
| executor_submitted += 1 | |
| next_pos += 1 | |
| last_submission_time = time.time() | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| escalated_full_stop = False | |
| def _escalate_to_full_stop(reason: str) -> None: | |
| nonlocal escalated_full_stop | |
| if escalated_full_stop: | |
| return | |
| escalated_full_stop = True | |
| try: | |
| print(f"🛑 Escalating to FULL STOP (glossary batch): {reason}") | |
| except Exception: | |
| pass | |
| # Disable graceful semantics locally so unified_api_client cancels quickly. | |
| try: | |
| os.environ['GRACEFUL_STOP'] = '0' | |
| os.environ['WAIT_FOR_CHUNKS'] = '0' | |
| except Exception: | |
| pass | |
| # Force unified_api_client cancellation if available. | |
| try: | |
| import unified_api_client | |
| if hasattr(unified_api_client, 'set_stop_flag'): | |
| unified_api_client.set_stop_flag(True) | |
| if hasattr(unified_api_client, 'global_stop_flag'): | |
| unified_api_client.global_stop_flag = True | |
| if hasattr(unified_api_client, 'UnifiedClient'): | |
| unified_api_client.UnifiedClient._global_cancelled = True | |
| except Exception: | |
| pass | |
| while futures: | |
| # On stop: | |
| # - If not graceful: immediate stop (cancel queued work). | |
| # - If graceful + WAIT_FOR_CHUNKS=1: stop submitting new but keep waiting for in-flight. | |
| # - If graceful + WAIT_FOR_CHUNKS=0: ONLY keep waiting if all chunks were already submitted; | |
| # otherwise escalate to full stop. | |
| if is_stop_requested(): | |
| # IMPORTANT: "all sent" means every chunk call has transitioned to in-flight (post delay/stagger). | |
| all_sent_now = (int(total_chunks or 0) > 0 and len(sent_chunks) >= int(total_chunks or 0)) | |
| if graceful_stop and (not wait_for_chunks) and (not all_sent_now): | |
| _escalate_to_full_stop("stop requested before all chunks were sent to API") | |
| if (not graceful_stop) or escalated_full_stop: | |
| try: | |
| for fut in list(futures.keys()): | |
| fut.cancel() | |
| except Exception: | |
| pass | |
| # Do not keep waiting if we're full-stopping. | |
| break | |
| # Graceful stop: keep waiting only if WAIT_FOR_CHUNKS=1 OR all chunks already sent. | |
| if graceful_stop and (wait_for_chunks or all_sent_now): | |
| # no-op: just continue waiting for done futures | |
| pass | |
| else: | |
| # Graceful stop without waiting semantics -> treat as immediate stop. | |
| try: | |
| for fut in list(futures.keys()): | |
| fut.cancel() | |
| except Exception: | |
| pass | |
| break | |
| done, _ = wait(futures.keys(), return_when=FIRST_COMPLETED) | |
| for fut in done: | |
| chunk_idx = futures.pop(fut, None) | |
| if chunk_idx is None: | |
| continue | |
| # Collect result (even if stop was requested; it may have completed before cancellation) | |
| try: | |
| chunk_glossary = fut.result() | |
| print(f"📑 DEBUG: Chunk {chunk_idx} returned type={type(chunk_glossary)}, len={len(chunk_glossary)}") | |
| # Normalize to CSV lines (without header) | |
| chunk_lines = [] | |
| if isinstance(chunk_glossary, dict): | |
| for raw_name, translated_name in chunk_glossary.items(): | |
| entry_type = "character" if _has_honorific(raw_name) else "term" | |
| chunk_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif isinstance(chunk_glossary, list): | |
| for line in chunk_glossary: | |
| if line and not line.startswith('type,'): | |
| chunk_lines.append(line) | |
| # Aggregate for end-of-run | |
| all_csv_lines.extend(chunk_lines) | |
| # Incremental writes (best-effort) | |
| try: | |
| _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode) | |
| print(f"📑 Incremental write: chunk {chunk_idx} (+{len(chunk_lines)} entries)") | |
| except Exception as e2: | |
| print(f"⚠️ Incremental write failed: {e2}") | |
| completed_chunks += 1 | |
| completed_chunks_local += 1 | |
| progress_percent = (completed_chunks / total_chunks) * 100 if total_chunks else 100 | |
| print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") | |
| print(f"📑 Chunk {chunk_idx} completed and aggregated") | |
| except Exception as e: | |
| print(f"⚠️ API call for chunk {chunk_idx} failed: {e}") | |
| completed_chunks += 1 | |
| completed_chunks_local += 1  # keep the status snapshot's completed count in sync on failures | |
| progress_percent = (completed_chunks / total_chunks) * 100 if total_chunks else 100 | |
| print(f"📑 Progress: {completed_chunks}/{total_chunks} chunks ({progress_percent:.0f}%)") | |
| # Submit next work only if not stopping | |
| while pending and len(futures) < max_workers and not is_stop_requested(): | |
| next_chunk_idx, next_chunk_text = pending.pop(0) | |
| fut2 = _submit_one(executor, next_pos, next_chunk_idx, next_chunk_text, last_submission_time=last_submission_time) | |
| if fut2 is False or fut2 is None: | |
| pending.clear() | |
| break | |
| futures[fut2] = next_chunk_idx | |
| executor_submitted += 1 | |
| next_pos += 1 | |
| last_submission_time = time.time() | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| # Update status after processing completions | |
| _write_glossary_status(_status_snapshot(in_flight_count=len(futures))) | |
| # CHANGE: Return CSV lines instead of dictionary | |
| # Stop sent-monitor thread | |
| try: | |
| _sent_monitor_stop.set() | |
| except Exception: | |
| pass | |
| # Restore per-chunk filter disabling envs | |
| if _prev_filtered is None: | |
| os.environ.pop("_CHUNK_ALREADY_FILTERED", None) | |
| else: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = _prev_filtered | |
| if _prev_force_disable is None: | |
| os.environ.pop("GLOSSARY_FORCE_DISABLE_SMART_FILTER", None) | |
| else: | |
| os.environ["GLOSSARY_FORCE_DISABLE_SMART_FILTER"] = _prev_force_disable | |
| # Restore previous defer setting | |
| if _prev_defer is None: | |
| # Default back to not deferring if it wasn't set | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| # If we are exiting due to a stop request, clear watchdog state/file so GUI doesn't stay "busy". | |
| if is_stop_requested(): | |
| try: | |
| _clear_api_watchdog_state(remove_watchdog_file=True) | |
| except Exception: | |
| pass | |
| return all_csv_lines | |
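| # Minimal sketch (illustration only, never called by the pipeline): the same | |
| # prime-then-refill submission pattern used above, reduced to its core so the | |
| # stop handling is easier to follow. The job payloads and _work below are hypothetical. | |
| def _example_bounded_submission(jobs, max_workers=2): | |
|     from concurrent.futures import ThreadPoolExecutor, wait, FIRST_COMPLETED | |
|     def _work(payload): | |
|         return payload  # stand-in for the real API call | |
|     results = [] | |
|     pending = list(jobs) | |
|     with ThreadPoolExecutor(max_workers=max_workers) as executor: | |
|         futures = set() | |
|         # Prime the pool up to max_workers; the rest stays unsubmitted so a stop | |
|         # request can prevent queued work from ever starting. | |
|         while pending and len(futures) < max_workers and not is_stop_requested(): | |
|             futures.add(executor.submit(_work, pending.pop(0))) | |
|         while futures: | |
|             done, futures = wait(futures, return_when=FIRST_COMPLETED) | |
|             for fut in done: | |
|                 results.append(fut.result()) | |
|             # Refill freed slots after each completion, again honoring the stop flag. | |
|             while pending and len(futures) < max_workers and not is_stop_requested(): | |
|                 futures.add(executor.submit(_work, pending.pop(0))) | |
|     return results | |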
| def _incremental_update_glossary(output_dir, chunk_idx, chunk_lines, strip_honorifics, language, filter_mode): | |
| """Incrementally update glossary output. | |
| Creates per-chunk CSV snapshots in an "incremental_glossary" subfolder: | |
| glossary.incremental1.csv, glossary.incremental2.csv, ... | |
| Also maintains a combined aggregator file (glossary.incremental.all.csv) | |
| that save_glossary() can use as a crash-safe backup. | |
| """ | |
| if not chunk_lines: | |
| return | |
| # Respect stop flag to avoid writing partial files after cancellation | |
| if is_stop_requested(): | |
| return | |
| # Incremental output directory | |
| incremental_dir = os.path.join(output_dir, "incremental_glossary") | |
| os.makedirs(incremental_dir, exist_ok=True) | |
| # Per-chunk snapshot path (no merging, just this chunk) | |
| chunk_filename = f"glossary.incremental{chunk_idx}.csv" | |
| chunk_path = os.path.join(incremental_dir, chunk_filename) | |
| # Combined aggregator path (append-only) and visible glossary path (merged) | |
| agg_path = os.path.join(incremental_dir, "glossary.incremental.all.csv") | |
| vis_path = os.path.join(output_dir, "glossary.csv") | |
| # Ensure main output dir exists | |
| os.makedirs(output_dir, exist_ok=True) | |
| # Compose CSV lines for this chunk | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| header = "type,raw_name,translated_name" | |
| if include_description: | |
| header += ",gender,description" | |
| elif include_gender_context: | |
| header += ",gender" | |
| new_csv_lines = [header] + chunk_lines | |
| # Save per-chunk snapshot (no merging) | |
| _atomic_write_file(chunk_path, "\n".join(new_csv_lines)) | |
| # Append to aggregator (raw append, no merging/deduping to preserve full history) | |
| # Use lock to prevent concurrent appends - use proper file locking/flushing | |
| with _file_write_lock: | |
| try: | |
| # Force close/reopen to ensure flush | |
| # Read first to check header | |
| file_exists = os.path.exists(agg_path) | |
| with open(agg_path, 'a', encoding='utf-8') as f: | |
| # If new file, write header | |
| if not file_exists: | |
| f.write(header + "\n") | |
| # Append chunks | |
| if chunk_lines: | |
| content_to_write = "\n".join(chunk_lines) + "\n" | |
| f.write(content_to_write) | |
| # Force flush to disk | |
| f.flush() | |
| os.fsync(f.fileno()) | |
| except Exception as e: | |
| print(f"⚠️ Failed to append to incremental aggregator: {e}") | |
| # Update visible glossary.csv (merged and deduped) | |
| # DISABLED: Per user request, we only do this at the very end to save performance | |
| # The incremental_glossary folder maintains the safety backup | |
| # existing_csv = None | |
| # if os.path.exists(agg_path): | |
| # try: | |
| # with open(agg_path, 'r', encoding='utf-8') as f: | |
| # existing_csv = f.read() | |
| # except Exception as e: | |
| # print(f"⚠️ Incremental: cannot read aggregator: {e}") | |
| # Merge (exact merge, no fuzzy to keep this fast) | |
| # Note: _merge_csv_entries handles deduplication | |
| # We pass empty string as 'new' content because existing_csv already contains everything (from append above) | |
| # Actually, _merge_csv_entries merges two CSV strings. existing_csv is the full raw history. | |
| # If we pass it as 'base', it will clean it up. | |
| # merged_csv_lines = _merge_csv_entries([], existing_csv or "", strip_honorifics, language) | |
| # Optional filter mode | |
| # merged_csv_lines = _filter_csv_by_mode(merged_csv_lines, filter_mode) | |
| # Convert to token-efficient format for visible glossary.csv | |
| # token_lines = _convert_to_token_efficient_format(merged_csv_lines) | |
| # token_lines = _sanitize_final_glossary_lines(token_lines, use_legacy_format=False) | |
| # _atomic_write_file(vis_path, "\n".join(token_lines)) | |
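| # Illustrative sketch (never invoked): the on-disk layout the incremental writer | |
| # produces. The output directory and entry are hypothetical. | |
| def _example_incremental_layout(): | |
|     out_dir = tempfile.mkdtemp(prefix="glossary_demo_") | |
|     _incremental_update_glossary( | |
|         out_dir, chunk_idx=1, | |
|         chunk_lines=["character,홍길동,Hong Gil-dong"], | |
|         strip_honorifics=False, language="korean", filter_mode="all", | |
|     ) | |
|     # Expected files under <out_dir>/incremental_glossary/: | |
|     #   glossary.incremental1.csv      (per-chunk snapshot) | |
|     #   glossary.incremental.all.csv   (append-only history) | |
|     print(os.listdir(os.path.join(out_dir, "incremental_glossary"))) | |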
| def _process_single_chunk(chunk_idx, chunk_text, custom_prompt, language, | |
| min_frequency, max_names, max_titles, batch_size, | |
| output_dir, strip_honorifics, fuzzy_threshold, filter_mode, | |
| already_filtered=False, max_sentences=200): | |
| """Process a single chunk - wrapper for parallel execution""" | |
| print(f"📑 Worker processing chunk {chunk_idx} ({len(chunk_text):,} chars)...") | |
| if custom_prompt: | |
| # Pass flag to indicate if text is already filtered | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "1" if already_filtered else "0" | |
| _prev_defer = os.getenv("GLOSSARY_DEFER_SAVE") | |
| os.environ["GLOSSARY_DEFER_SAVE"] = "1" | |
| try: | |
| result = _extract_with_custom_prompt( | |
| custom_prompt, chunk_text, language, | |
| min_frequency, max_names, max_titles, | |
| None, output_dir, | |
| strip_honorifics, fuzzy_threshold, filter_mode, max_sentences, log_callback=None | |
| ) | |
| finally: | |
| os.environ["_CHUNK_ALREADY_FILTERED"] = "0" # Reset | |
| if _prev_defer is None: | |
| if "GLOSSARY_DEFER_SAVE" in os.environ: | |
| del os.environ["GLOSSARY_DEFER_SAVE"] | |
| else: | |
| os.environ["GLOSSARY_DEFER_SAVE"] = _prev_defer | |
| return result | |
| else: | |
| # Pattern fallback disabled | |
| print("📑 AUTO_GLOSSARY_PROMPT is empty - skipping chunk glossary extraction (pattern fallback disabled)") | |
| return {} | |
| def _apply_final_filter(entries, filter_mode): | |
| """Apply final filtering based on mode to ensure only requested types are included""" | |
| if filter_mode == "only_with_honorifics": | |
| # Filter to keep only entries that look like they have honorifics | |
| filtered = {} | |
| for key, value in entries.items(): | |
| # Check if the key contains known honorific patterns | |
| if _has_honorific(key): | |
| filtered[key] = value | |
| print(f"📑 Final filter: Kept {len(filtered)} entries with honorifics (from {len(entries)} total)") | |
| return filtered | |
| elif filter_mode == "only_without_honorifics": | |
| # Filter to keep only entries without honorifics | |
| filtered = {} | |
| for key, value in entries.items(): | |
| if not _has_honorific(key): | |
| filtered[key] = value | |
| print(f"📑 Final filter: Kept {len(filtered)} entries without honorifics (from {len(entries)} total)") | |
| return filtered | |
| else: | |
| return entries | |
| def _looks_like_name(text): | |
| """Check if text looks like a character name""" | |
| if not text: | |
| return False | |
| # Check for various name patterns | |
| # Korean names (2-4 hangul characters) | |
| if all(0xAC00 <= ord(char) <= 0xD7AF for char in text) and 2 <= len(text) <= 4: | |
| return True | |
| # Japanese names (mix of kanji/kana, 2-6 chars) | |
| has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in text) | |
| has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in text) | |
| if (has_kanji or has_kana) and 2 <= len(text) <= 6: | |
| return True | |
| # Chinese names (EXPANDED: 2-6 Chinese characters for cultivation novels) | |
| if all(0x4E00 <= ord(char) <= 0x9FFF for char in text) and 2 <= len(text) <= 6: | |
| # 1. Check if it matches specific Chinese name patterns (Courtesy Name, Generation Name) | |
| if hasattr(PM, 'CHINESE_NAME_PATTERNS'): | |
| # Courtesy names (e.g. "Lu Bozi") | |
| if 'courtesy_names' in PM.CHINESE_NAME_PATTERNS: | |
| for pattern in PM.CHINESE_NAME_PATTERNS['courtesy_names']: | |
| if re.match(pattern, text): | |
| return True | |
| # Generation names (middle character matches generation list) | |
| if len(text) == 3 and 'generation_names' in PM.CHINESE_NAME_PATTERNS: | |
| if text[1] in PM.CHINESE_NAME_PATTERNS['generation_names']: | |
| return True | |
| # Title prefixes (e.g. "Old Li", "Little Wang") | |
| if 'title_prefixes' in PM.CHINESE_NAME_PATTERNS: | |
| if text[0] in PM.CHINESE_NAME_PATTERNS['title_prefixes']: | |
| return True | |
| # 2. Check if it starts with a known surname (1 or 2 chars) | |
| if len(text) >= 2: | |
| # Check single-char surname | |
| if text[0] in PM.CHINESE_SINGLE_SURNAMES: | |
| return True | |
| # Check two-char compound surname | |
| if len(text) >= 3 and text[:2] in PM.CHINESE_COMPOUND_SURNAMES: | |
| return True | |
| # 3. Even without surname match, if it's 2-6 chars it could be a valid term | |
| return True | |
| # English names (starts with capital, mostly letters) | |
| if text[0].isupper() and sum(1 for c in text if c.isalpha()) >= len(text) * 0.8: | |
| return True | |
| return False | |
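| # Tiny illustration (never called): the Unicode-range heuristics above in practice. | |
| # The sample strings are hypothetical. | |
| def _example_looks_like_name(): | |
|     for candidate in ["김철수", "山田太郎", "Li Wei", "the", "12345"]: | |
|         print(candidate, "->", _looks_like_name(candidate)) | |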
| def _has_honorific(term): | |
| """Check if a term contains an honorific using PatternManager's comprehensive list""" | |
| if not term: | |
| return False | |
| term_lower = term.lower() | |
| # Check all language honorifics from PatternManager | |
| for language, honorifics_list in PM.CJK_HONORIFICS.items(): | |
| for honorific in honorifics_list: | |
| # For romanized/English honorifics with spaces or dashes | |
| if honorific.startswith(' ') or honorific.startswith('-'): | |
| if term_lower.endswith(honorific.lower()): | |
| return True | |
| # For CJK honorifics (no separator) | |
| else: | |
| if honorific in term: | |
| return True | |
| return False | |
| def _strip_all_honorifics(term, language='korean'): | |
| """Strip all honorifics from a term using PatternManager's lists""" | |
| if not term: | |
| return term | |
| result = term | |
| # Get honorifics for the specific language and English romanizations | |
| honorifics_to_strip = [] | |
| if language in PM.CJK_HONORIFICS: | |
| honorifics_to_strip.extend(PM.CJK_HONORIFICS[language]) | |
| honorifics_to_strip.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| # Sort by length (longest first) to avoid partial matches | |
| honorifics_to_strip.sort(key=len, reverse=True) | |
| # Strip honorifics | |
| for honorific in honorifics_to_strip: | |
| if honorific.startswith(' ') or honorific.startswith('-'): | |
| # For romanized honorifics with separators | |
| if result.lower().endswith(honorific.lower()): | |
| result = result[:-len(honorific)] | |
| else: | |
| # For CJK honorifics (no separator) | |
| if result.endswith(honorific): | |
| result = result[:-len(honorific)] | |
| return result.strip() | |
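| # Quick illustration (never called): stripping depends entirely on PatternManager's | |
| # CJK_HONORIFICS lists, so actual output varies with that data. Names are hypothetical. | |
| def _example_strip_honorifics(): | |
|     for term in ["김철수님", "Tanaka-san"]: | |
|         print(term, "->", _strip_all_honorifics(term, language="korean")) | |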
| def _convert_to_csv_format(data): | |
| """Convert various glossary formats to CSV string format with enforced 3 columns""" | |
| csv_lines = ["type,raw_name,translated_name"] | |
| if isinstance(data, str): | |
| # Already CSV string | |
| if data.strip().startswith('type,raw_name'): | |
| return data | |
| # Try to parse as JSON | |
| try: | |
| data = json.loads(data) | |
| except: | |
| return data | |
| if isinstance(data, list): | |
| for item in data: | |
| if isinstance(item, dict): | |
| if 'type' in item and 'raw_name' in item: | |
| # Already in correct format | |
| line = f"{item['type']},{item['raw_name']},{item.get('translated_name', item['raw_name'])}" | |
| csv_lines.append(line) | |
| else: | |
| # Old format - default to 'term' type | |
| entry_type = 'term' | |
| raw_name = item.get('original_name', '') | |
| translated_name = item.get('name', raw_name) | |
| if raw_name and translated_name: | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif isinstance(data, dict): | |
| if 'entries' in data: | |
| # Has metadata wrapper, extract entries | |
| for original, translated in data['entries'].items(): | |
| csv_lines.append(f"term,{original},{translated}") | |
| else: | |
| # Plain dictionary - default to 'term' type | |
| for original, translated in data.items(): | |
| csv_lines.append(f"term,{original},{translated}") | |
| return '\n'.join(csv_lines) | |
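| # Illustrative sketch (never called): how legacy dict/list shapes are normalized | |
| # into the 3-column CSV string. Sample data is hypothetical. | |
| def _example_convert_to_csv_format(): | |
|     legacy_dict = {"홍길동": "Hong Gil-dong", "마나": "Mana"} | |
|     legacy_list = [{"original_name": "철수", "name": "Cheol-su"}] | |
|     print(_convert_to_csv_format(legacy_dict))  # one "term" row per key/value pair | |
|     print(_convert_to_csv_format(legacy_list))  # "term,철수,Cheol-su" | |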
| def _parse_csv_to_dict(csv_content): | |
| """Parse CSV content to dictionary for backward compatibility""" | |
| result = {} | |
| lines = csv_content.strip().split('\n') | |
| for line in lines[1:]: # Skip header | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 3: | |
| result[parts[1]] = parts[2] # raw_name -> translated_name | |
| return result | |
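| # Tiny illustration (never called): the backward-compatibility mapping the rest of | |
| # the pipeline expects, raw_name keys to translated_name values. | |
| def _example_parse_csv_to_dict(): | |
|     csv_content = "type,raw_name,translated_name\ncharacter,홍길동,Hong Gil-dong" | |
|     print(_parse_csv_to_dict(csv_content))  # {'홍길동': 'Hong Gil-dong'} | |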
| def _fuzzy_match(term1, term2, threshold=0.90): | |
| """Check if two terms match using fuzzy matching""" | |
| ratio = SequenceMatcher(None, term1.lower(), term2.lower()).ratio() | |
| return ratio >= threshold | |
| def _fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len): | |
| """Use rapidfuzz library for MUCH faster fuzzy matching""" | |
| from rapidfuzz import fuzz | |
| print(f"📑 Using RapidFuzz (C++ speed)...") | |
| start_time = time.time() | |
| matches_count = 0 | |
| threshold_percent = threshold * 100 # rapidfuzz uses 0-100 scale | |
| # Can use smaller step because rapidfuzz is so fast | |
| step = 1 # Check every position - rapidfuzz can handle it | |
| # Process text | |
| for i in range(0, len(text_lower) - term_len + 1, step): | |
| # Check stop flag every 10000 positions | |
| if i > 0 and i % 10000 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 RapidFuzz stopped at position {i}") | |
| return matches_count | |
| window = text_lower[i:i + term_len] | |
| # rapidfuzz is fast enough we can check every position | |
| if fuzz.ratio(term_lower, window) >= threshold_percent: | |
| matches_count += 1 | |
| elapsed = time.time() - start_time | |
| print(f"📑 RapidFuzz found {matches_count} matches in {elapsed:.2f}s") | |
| return matches_count | |
| def _batch_compute_frequencies(terms, all_text, fuzzy_threshold=0.90, min_frequency=2): | |
| """Compute frequencies for all terms at once - MUCH faster than individual checking""" | |
| print(f"📑 Computing frequencies for {len(terms)} terms in batch mode...") | |
| start_time = time.time() | |
| # Result dictionary | |
| term_frequencies = {} | |
| # First pass: exact matching (very fast) | |
| print(f"📑 Phase 1: Exact matching...") | |
| text_lower = all_text.lower() | |
| for term in terms: | |
| if is_stop_requested(): | |
| return term_frequencies | |
| term_lower = term.lower() | |
| count = text_lower.count(term_lower) | |
| term_frequencies[term] = count | |
| exact_time = time.time() - start_time | |
| high_freq_terms = sum(1 for count in term_frequencies.values() if count >= min_frequency) | |
| print(f"📑 Exact matching complete: {high_freq_terms}/{len(terms)} terms meet threshold ({exact_time:.1f}s)") | |
| # If fuzzy matching is disabled, we're done | |
| if fuzzy_threshold >= 1.0: | |
| return term_frequencies | |
| # Second pass: fuzzy matching ONLY for low-frequency terms | |
| low_freq_terms = [term for term, count in term_frequencies.items() if count < min_frequency] | |
| if low_freq_terms: | |
| print(f"📑 Phase 2: Fuzzy matching for {len(low_freq_terms)} low-frequency terms...") | |
| # Try to use RapidFuzz batch processing | |
| try: | |
| from rapidfuzz import process, fuzz | |
| # For very large texts, sample it for fuzzy matching | |
| if len(text_lower) > 500000: | |
| print(f"📑 Text too large ({len(text_lower):,} chars), sampling for fuzzy matching...") | |
| # Sample every Nth character to reduce size | |
| sample_rate = max(1, len(text_lower) // 100000) | |
| sampled_text = text_lower[::sample_rate] | |
| else: | |
| sampled_text = text_lower | |
| # Create chunks of text for fuzzy matching | |
| chunk_size = 1000 # Process text in chunks | |
| text_chunks = [sampled_text[i:i+chunk_size] for i in range(0, len(sampled_text), chunk_size//2)] # Overlapping chunks | |
| print(f"📑 Processing {len(text_chunks)} text chunks...") | |
| threshold_percent = fuzzy_threshold * 100 | |
| # Process in batches to avoid memory issues | |
| batch_size = 100 # Process 100 terms at a time | |
| for batch_start in range(0, len(low_freq_terms), batch_size): | |
| if is_stop_requested(): | |
| break | |
| batch_end = min(batch_start + batch_size, len(low_freq_terms)) | |
| batch_terms = low_freq_terms[batch_start:batch_end] | |
| for term in batch_terms: | |
| if is_stop_requested(): | |
| break | |
| # Quick fuzzy search in chunks | |
| fuzzy_count = 0 | |
| for chunk in text_chunks[:50]: # Limit to first 50 chunks for speed | |
| if fuzz.partial_ratio(term.lower(), chunk) >= threshold_percent: | |
| fuzzy_count += 1 | |
| if fuzzy_count > 0: | |
| # Scale up based on sampling | |
| if len(text_lower) > 500000: | |
| fuzzy_count *= (len(text_lower) // len(sampled_text)) | |
| term_frequencies[term] += fuzzy_count | |
| if (batch_end % 500 == 0) or (batch_end == len(low_freq_terms)): | |
| elapsed = time.time() - start_time | |
| print(f"📑 Processed {batch_end}/{len(low_freq_terms)} terms ({elapsed:.1f}s)") | |
| except ImportError: | |
| print("📑 RapidFuzz not available, skipping fuzzy matching") | |
| total_time = time.time() - start_time | |
| final_high_freq = sum(1 for count in term_frequencies.values() if count >= min_frequency) | |
| print(f"📑 Batch frequency computation complete: {final_high_freq}/{len(terms)} terms accepted ({total_time:.1f}s)") | |
| return term_frequencies | |
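| # Small illustration (never called): the two-phase frequency pass. Phase 1 counts | |
| # exact case-insensitive hits; fuzzy matching only runs for terms still under | |
| # min_frequency (and is skipped entirely when fuzzy_threshold >= 1.0). | |
| # The sample text and terms are hypothetical. | |
| def _example_batch_frequencies(): | |
|     text = "Kim Sang-hyuk drew his blade. Kim Sang-hyuk struck twice." | |
|     freqs = _batch_compute_frequencies(["Kim Sang-hyuk", "Frost Blade"], text, | |
|                                        fuzzy_threshold=1.0, min_frequency=2) | |
|     print(freqs)  # {'Kim Sang-hyuk': 2, 'Frost Blade': 0} | |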
| def _find_fuzzy_matches(term, text, threshold=0.90): | |
| """Find fuzzy matches of a term in text using efficient method with parallel processing""" | |
| start_time = time.time() | |
| term_lower = term.lower() | |
| text_lower = text.lower() | |
| term_len = len(term) | |
| # Only log for debugging if explicitly enabled | |
| debug_search = os.getenv("GLOSSARY_DEBUG_SEARCH", "0") == "1" | |
| if debug_search and len(text) > 100000: | |
| print(f"📑 Searching for '{term}' in {len(text):,} chars (threshold: {threshold})") | |
| # Strategy 1: Use exact matching first for efficiency | |
| exact_start = time.time() | |
| matches_count = text_lower.count(term_lower) | |
| exact_time = time.time() - exact_start | |
| if matches_count > 0: | |
| if debug_search and len(text) > 100000: | |
| print(f"📑 Found {matches_count} exact matches in {exact_time:.3f}s") | |
| return matches_count | |
| # Strategy 2: Try rapidfuzz if available (much faster) | |
| if matches_count == 0 and threshold < 1.0: | |
| try: | |
| from rapidfuzz import fuzz | |
| return _fuzzy_match_rapidfuzz(term_lower, text_lower, threshold, term_len) | |
| except ImportError: | |
| pass # Fall back to parallel/sequential | |
| # Strategy 3: Fall back to parallel/sequential if rapidfuzz not available | |
| # Check if parallel processing is enabled | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| if extraction_workers > 1 and len(text) > 50000: # Use parallel for large texts | |
| return _parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, extraction_workers) | |
| else: | |
| return _sequential_fuzzy_search(term_lower, text_lower, threshold, term_len) | |
| def _parallel_fuzzy_search(term_lower, text_lower, threshold, term_len, num_workers): | |
| """Parallel fuzzy search using ThreadPoolExecutor""" | |
| print(f"📑 Starting parallel fuzzy search with {num_workers} workers...") | |
| text_len = len(text_lower) | |
| matches_count = 0 | |
| # Split text into overlapping chunks for parallel processing | |
| chunk_size = max(text_len // num_workers, term_len * 100) | |
| chunks = [] | |
| for i in range(0, text_len, chunk_size): | |
| # Add overlap to avoid missing matches at boundaries | |
| end = min(i + chunk_size + term_len - 1, text_len) | |
| chunks.append((i, text_lower[i:end])) | |
| print(f"📑 Split into {len(chunks)} chunks of ~{chunk_size:,} chars each") | |
| # Process chunks in parallel | |
| with ThreadPoolExecutor(max_workers=num_workers) as executor: | |
| futures = [] | |
| for chunk_idx, (start_pos, chunk_text) in enumerate(chunks): | |
| if is_stop_requested(): | |
| return matches_count | |
| future = executor.submit( | |
| _fuzzy_search_chunk, | |
| term_lower, chunk_text, threshold, term_len, chunk_idx, len(chunks) | |
| ) | |
| futures.append(future) | |
| # Collect results | |
| for future in as_completed(futures): | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return matches_count | |
| try: | |
| chunk_matches = future.result() | |
| matches_count += chunk_matches | |
| except Exception as e: | |
| print(f"📑 ⚠️ Chunk processing error: {e}") | |
| print(f"📑 Parallel fuzzy search found {matches_count} matches") | |
| return matches_count | |
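| # Minimal sketch (never called): why the chunks above overlap by term_len - 1 | |
| # characters: a candidate window straddling a boundary is still fully contained | |
| # in one of the neighbouring chunks. The values below are hypothetical. | |
| def _example_overlapping_chunks(text="abcdefghij", chunk_size=4, term_len=3): | |
|     chunks = [] | |
|     for i in range(0, len(text), chunk_size): | |
|         end = min(i + chunk_size + term_len - 1, len(text)) | |
|         chunks.append((i, text[i:end])) | |
|     print(chunks)  # [(0, 'abcdef'), (4, 'efghij'), (8, 'ij')] | |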
| def _fuzzy_search_chunk(term_lower, chunk_text, threshold, term_len, chunk_idx, total_chunks): | |
| """Process a single chunk for fuzzy matches""" | |
| chunk_matches = 0 | |
| # Use a more efficient step size - no need to check every position | |
| step = max(1, term_len // 3) # Check every third of term length | |
| for i in range(0, len(chunk_text) - term_len + 1, step): | |
| # Check stop flag periodically | |
| if i > 0 and i % 1000 == 0: | |
| if is_stop_requested(): | |
| return chunk_matches | |
| window = chunk_text[i:i + term_len] | |
| # Use SequenceMatcher for fuzzy matching | |
| if SequenceMatcher(None, term_lower, window).ratio() >= threshold: | |
| chunk_matches += 1 | |
| # Log progress for this chunk | |
| if total_chunks > 1: | |
| print(f"📑 Chunk {chunk_idx + 1}/{total_chunks} completed: {chunk_matches} matches") | |
| return chunk_matches | |
| def _sequential_fuzzy_search(term_lower, text_lower, threshold, term_len): | |
| """Sequential fuzzy search (fallback for small texts or single worker)""" | |
| print(f"📑 Starting sequential fuzzy search...") | |
| fuzzy_start = time.time() | |
| matches_count = 0 | |
| # More efficient step size | |
| step = max(1, term_len // 3) | |
| total_windows = (len(text_lower) - term_len + 1) // step | |
| print(f"📑 Checking ~{total_windows:,} windows with step size {step}") | |
| windows_checked = 0 | |
| for i in range(0, len(text_lower) - term_len + 1, step): | |
| # Check stop flag frequently | |
| if i > 0 and i % (step * 100) == 0: | |
| if is_stop_requested(): | |
| return matches_count | |
| # Progress log for very long operations | |
| if windows_checked % 1000 == 0 and windows_checked > 0: | |
| elapsed = time.time() - fuzzy_start | |
| rate = windows_checked / elapsed if elapsed > 0 else 0 | |
| eta = (total_windows - windows_checked) / rate if rate > 0 else 0 | |
| print(f"📑 Progress: {windows_checked}/{total_windows} windows, {rate:.0f} w/s, ETA: {eta:.1f}s") | |
| window = text_lower[i:i + term_len] | |
| if SequenceMatcher(None, term_lower, window).ratio() >= threshold: | |
| matches_count += 1 | |
| windows_checked += 1 | |
| fuzzy_time = time.time() - fuzzy_start | |
| print(f"📑 Sequential fuzzy search completed in {fuzzy_time:.2f}s, found {matches_count} matches") | |
| return matches_count | |
| def _strip_honorific(term, language_hint='unknown'): | |
| """Strip honorific from a term if present""" | |
| if not term: | |
| return term | |
| # Get honorifics for the detected language | |
| honorifics_to_check = [] | |
| if language_hint in PM.CJK_HONORIFICS: | |
| honorifics_to_check.extend(PM.CJK_HONORIFICS[language_hint]) | |
| honorifics_to_check.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| # Check and remove honorifics | |
| for honorific in honorifics_to_check: | |
| if honorific.startswith('-') or honorific.startswith(' '): | |
| # English-style suffix | |
| if term.endswith(honorific): | |
| return term[:-len(honorific)].strip() | |
| else: | |
| # CJK-style suffix (no separator) | |
| if term.endswith(honorific): | |
| return term[:-len(honorific)] | |
| return term | |
| def _filter_text_for_glossary(text, min_frequency=2, max_sentences=None): | |
| """Filter text to extract only meaningful content for glossary extraction | |
| Args: | |
| text: Input text to filter | |
| min_frequency: Minimum frequency threshold for terms | |
| max_sentences: Maximum number of sentences to return (reads from env if None) | |
| """ | |
| import re | |
| from collections import Counter | |
| from concurrent.futures import ThreadPoolExecutor, as_completed | |
| import time | |
| filter_start_time = time.time() | |
| print(f"📑 Starting smart text filtering...") | |
| print(f"📑 Input text size: {len(text):,} characters") | |
| # Dynamic character coverage flag (must be defined before any early checks) | |
| include_all_characters_env = os.getenv("GLOSSARY_INCLUDE_ALL_CHARACTERS", "0") | |
| include_all_characters = include_all_characters_env == "1" | |
| force_skip_smart_selection = False | |
| honorific_first_indices = {} | |
| # Clean HTML if present | |
| print(f"📑 Step 1/7: Cleaning HTML tags...") | |
| from bs4 import BeautifulSoup | |
| soup = BeautifulSoup(text, 'html.parser') | |
| clean_text = soup.get_text() | |
| print(f"📑 Clean text size: {len(clean_text):,} characters") | |
| # Detect primary language for better filtering | |
| print(f"📑 Step 2/7: Detecting primary language...") | |
| def detect_primary_language(text_sample): | |
| sample = text_sample[:1000] | |
| korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) | |
| japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) | |
| chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) | |
| # Check gender pronouns as secondary indicator if character counts are ambiguous | |
| if korean_chars == 0 and japanese_kana == 0 and chinese_chars > 0: | |
| # Distinguish Chinese vs Kanji-heavy Japanese using pronouns | |
| if hasattr(PM, 'GENDER_PRONOUNS'): | |
| # Check Chinese pronouns | |
| chinese_pronouns = PM.GENDER_PRONOUNS.get('chinese', {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get('chinese', {}).get('female', []) | |
| for p in chinese_pronouns: | |
| if p in sample: | |
| return 'chinese' | |
| # Check Japanese pronouns | |
| japanese_pronouns = PM.GENDER_PRONOUNS.get('japanese', {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get('japanese', {}).get('female', []) | |
| for p in japanese_pronouns: | |
| if p in sample: | |
| return 'japanese' | |
| if korean_chars > 50: | |
| return 'korean' | |
| elif japanese_kana > 20: | |
| return 'japanese' | |
| elif chinese_chars > 50 and japanese_kana < 10: | |
| return 'chinese' | |
| else: | |
| return 'english' | |
| primary_lang = detect_primary_language(clean_text) | |
| print(f"📑 Detected primary language: {primary_lang}") | |
| # Safety guard: ensure flag exists even if subprocess reload missed earlier assignment | |
| try: | |
| include_gender_context_flag | |
| except NameError: | |
| include_gender_context_flag = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| # Gender pronouns for optional gender-context filtering in early captures | |
| gender_pronouns = [] | |
| if include_gender_context_flag and hasattr(PM, "GENDER_PRONOUNS"): | |
| lang_key = "english" | |
| if primary_lang == "korean": | |
| lang_key = "korean" | |
| elif primary_lang == "chinese": | |
| lang_key = "chinese" | |
| elif primary_lang == "japanese": | |
| lang_key = "japanese" | |
| gp = PM.GENDER_PRONOUNS.get(lang_key, {}) | |
| gender_pronouns = gp.get("male", []) + gp.get("female", []) | |
| # Split into sentences for better context | |
| print(f"📁 Step 3/7: Splitting text into sentences...") | |
| # Use language-specific sentence splitting for better accuracy | |
| if primary_lang == 'chinese': | |
| # Split on major punctuation, but keep 、 and , within sentences | |
| # This preserves more context for Chinese cultivation/wuxia terms | |
| sentences = re.split(r'[。!?;:]+', clean_text) | |
| else: | |
| sentences = re.split(r'[.!?。!?]+', clean_text) | |
| print(f"📁 Found {len(sentences):,} sentences") | |
| # Extract potential terms (words/phrases that appear multiple times) | |
| print(f"📑 Step 4/7: Setting up extraction patterns and exclusion rules...") | |
| word_freq = Counter() | |
| # Pattern for detecting potential names/terms based on capitalization or special characters | |
| # Korean names: 2-4 hangul characters WITHOUT honorifics | |
| korean_pattern = r'[가-힣]{2,4}' | |
| # Japanese names: kanji/hiragana/katakana combinations | |
| japanese_pattern = r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,6}' | |
| # Chinese names: EXPANDED to 2-8 characters for cultivation/wuxia novels | |
| # This captures longer compound names, titles, and cultivation terms | |
| chinese_pattern = r'[\u4e00-\u9fff]{2,8}' | |
| # English proper nouns: Capitalized words | |
| english_pattern = r'\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b' | |
| # Combine patterns | |
| combined_pattern = f'({korean_pattern}|{japanese_pattern}|{chinese_pattern}|{english_pattern})' | |
| print(f"📑 Using combined regex pattern for {primary_lang} text") | |
| # Get honorifics and title patterns for the detected language | |
| honorifics_to_exclude = set() | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| honorifics_to_exclude.update(PM.CJK_HONORIFICS[primary_lang]) | |
| # Also add English romanizations | |
| honorifics_to_exclude.update(PM.CJK_HONORIFICS.get('english', [])) | |
| # Compile title patterns for the language | |
| title_patterns = [] | |
| if primary_lang in PM.TITLE_PATTERNS: | |
| for pattern in PM.TITLE_PATTERNS[primary_lang]: | |
| title_patterns.append(re.compile(pattern)) | |
| # Function to check if a term should be excluded | |
| def should_exclude_term(term): | |
| term_lower = term.lower() | |
| # Check if it's a common word | |
| if term in PM.COMMON_WORDS or term_lower in PM.COMMON_WORDS: | |
| return True | |
| # Check if it contains honorifics | |
| for honorific in honorifics_to_exclude: | |
| if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): | |
| return True | |
| # Check if it matches title patterns | |
| for pattern in title_patterns: | |
| if pattern.search(term): | |
| return True | |
| # Check if it's a number (including Chinese numbers) | |
| if term in PM.CHINESE_NUMS: | |
| return True | |
| # Check if it's just digits | |
| if term.isdigit(): | |
| return True | |
| # For Chinese text, INCLUDE domain-specific terms (don't exclude them) | |
| if primary_lang == 'chinese' and len(term) >= 2: | |
| # Check if it's a cultivation term - these should NOT be excluded | |
| for category in PM.CHINESE_CULTIVATION_TERMS.values(): | |
| if term in category: | |
| return False # Keep cultivation terms! | |
| # Check if it's a wuxia term - these should NOT be excluded | |
| for category in PM.CHINESE_WUXIA_TERMS.values(): | |
| if term in category: | |
| return False # Keep wuxia terms! | |
| # Check relationship terms (important character relationships) | |
| for category in PM.CHINESE_RELATIONSHIP_TERMS.values(): | |
| if term in category: | |
| return False # Keep relationship terms! | |
| # Check mythological terms (creatures, artifacts, legendary beings) | |
| for category in PM.CHINESE_MYTHOLOGICAL_TERMS.values(): | |
| if term in category: | |
| return False # Keep mythological terms! | |
| # Check elemental/natural force terms | |
| for category in PM.CHINESE_ELEMENTAL_TERMS.values(): | |
| if term in category: | |
| return False # Keep elemental terms! | |
| # Check physique/spiritual root terms | |
| for category in PM.CHINESE_PHYSIQUE_TERMS.values(): | |
| if term in category: | |
| return False # Keep physique terms! | |
| # Check treasure grades | |
| for category in PM.CHINESE_TREASURE_GRADES.values(): | |
| if term in category: | |
| return False # Keep treasure grade terms! | |
| # Check power system terms (levels, stars, etc.) | |
| for category in PM.CHINESE_POWER_SYSTEMS.values(): | |
| if term in category: | |
| return False # Keep power system terms! | |
| # Check location types | |
| for category in PM.CHINESE_LOCATION_TYPES.values(): | |
| if term in category: | |
| return False # Keep location terms! | |
| # Check battle terms | |
| for category in PM.CHINESE_BATTLE_TERMS.values(): | |
| if term in category: | |
| return False # Keep battle terms! | |
| # Check novel terms (common raw Chinese terms) | |
| if hasattr(PM, 'CHINESE_NOVEL_TERMS'): | |
| for category in PM.CHINESE_NOVEL_TERMS.values(): | |
| if term in category: | |
| return False | |
| return False | |
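| # Quick sanity sketch (illustrative only; real outcomes depend on the PatternManager data sets): | |
| #   should_exclude_term('123')      -> True  (pure digits are always excluded) | |
| #   should_exclude_term('김상현님')  -> True  (contains the Korean honorific '님', if present in CJK_HONORIFICS) | |
| #   should_exclude_term('灵气')      -> False (kept, e.g. if listed under CHINESE_CULTIVATION_TERMS) | |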
| # Extract potential terms from each sentence | |
| print(f"📑 Step 5/7: Extracting and filtering terms from sentences...") | |
| # Check if we should use parallel processing | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
| # Auto-detect optimal workers if not set | |
| if extraction_workers == 1 and len(sentences) > 1000: | |
| # Use more cores for better parallelization | |
| cpu_count = os.cpu_count() or 4 | |
| extraction_workers = min(cpu_count, 12) # Use up to 12 cores | |
| print(f"📑 Auto-detected {cpu_count} CPU cores, using {extraction_workers} workers") | |
| use_parallel = extraction_workers > 1 and len(sentences) > 100 | |
| if use_parallel: | |
| print(f"📑 Using parallel processing with {extraction_workers} workers") | |
| print(f"📑 Estimated speedup: {extraction_workers}x faster") | |
| important_sentences = [] | |
| seen_contexts = set() | |
| processed_count = 0 | |
| total_sentences = len(sentences) | |
| last_progress_time = time.time() | |
| # Prepare gender context check | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| gender_nuance_enabled = include_gender_context and os.getenv("GLOSSARY_ENABLE_GENDER_NUANCE", "1") == "1" | |
| gender_pronouns = [] | |
| if gender_nuance_enabled and hasattr(PM, 'GENDER_PRONOUNS'): | |
| # Get pronouns for the detected language | |
| lang_key = 'english' | |
| if primary_lang == 'korean': lang_key = 'korean' | |
| elif primary_lang == 'chinese': lang_key = 'chinese' | |
| elif primary_lang == 'japanese': lang_key = 'japanese' | |
| gender_pronouns.extend(PM.GENDER_PRONOUNS.get(lang_key, {}).get('male', [])) | |
| gender_pronouns.extend(PM.GENDER_PRONOUNS.get(lang_key, {}).get('female', [])) | |
| if gender_pronouns: | |
| print(f"📑 Gender context enabled: scanning for pronouns in {lang_key}") | |
| def process_sentence_batch(batch_sentences, batch_idx): | |
| """Process a batch of sentences""" | |
| local_word_freq = Counter() | |
| local_important = [] | |
| local_seen = set() | |
| for sentence in batch_sentences: | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Check for gender pronouns if enabled - include sentence if pronoun found | |
| has_pronoun = False | |
| if gender_nuance_enabled and gender_pronouns: | |
| for pronoun in gender_pronouns: | |
| if pronoun in sentence: | |
| has_pronoun = True | |
| break | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| valid_term_found = False | |
| if matches: | |
| # Filter out excluded terms | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| local_word_freq[match] += 1 | |
| valid_term_found = True | |
| # Keep sentence if it has valid terms OR contains a gender pronoun (for context) | |
| # When gender nuance is enabled, pronoun-bearing sentences are kept even if they add no new terms: | |
| # we cannot yet tell which character a pronoun refers to, so the sentence is retained as context | |
| # that lets the LLM infer gender later. | |
| if valid_term_found or (gender_nuance_enabled and has_pronoun): | |
| sentence_key = sentence[:50] # Use prefix as key to avoid duplicates | |
| if sentence_key not in local_seen: | |
| local_important.append(sentence) | |
| local_seen.add(sentence_key) | |
| return local_word_freq, local_important, local_seen, batch_idx | |
| if use_parallel: | |
| # Force SMALL batches for real parallelization | |
| # We want MANY small batches, not few large ones! | |
| # Calculate based on total sentences | |
| total_sentences = len(sentences) | |
| # CRITICAL: Batch size must balance two factors: | |
| # 1. Small batches = more parallelism but higher overhead | |
| # 2. Large batches = less overhead but limits parallelism | |
| # | |
| # For Windows ProcessPoolExecutor, overhead is HIGH, so we prefer LARGE batches | |
| # Target: Each worker should get 3-10 batches (not 100+ tiny batches) | |
| # Calculate batch size based on workers to minimize overhead | |
| target_batches_per_worker = 5 # Sweet spot: enough work distribution, minimal overhead | |
| ideal_batch_size = max(500, total_sentences // (extraction_workers * target_batches_per_worker)) | |
| # Apply sensible limits | |
| if total_sentences < 1000: | |
| optimal_batch_size = 100 # Small dataset: normal batching | |
| elif total_sentences < 10000: | |
| optimal_batch_size = min(500, ideal_batch_size) | |
| elif total_sentences < 50000: | |
| optimal_batch_size = min(2000, ideal_batch_size) | |
| elif total_sentences < 200000: | |
| optimal_batch_size = min(5000, ideal_batch_size) | |
| else: | |
| # For 754K sentences with 12 workers: | |
| # target_batches = 12 * 5 = 60 batches | |
| # batch_size = 754K / 60 = ~12,500 sentences/batch | |
| # This is MUCH better than 1887 batches of 400! | |
| optimal_batch_size = min(20000, ideal_batch_size) | |
| # Ensure we have enough batches for all workers | |
| min_batches = extraction_workers * 3 # At least 3 batches per worker | |
| max_batch_size = max(50, total_sentences // min_batches) | |
| optimal_batch_size = min(optimal_batch_size, max_batch_size) | |
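| # Worked example with assumed numbers: 100,000 sentences and 12 workers gives | |
| #   ideal_batch_size = max(500, 100000 // (12 * 5)) = 1666 | |
| #   optimal_batch_size = min(5000, 1666) = 1666   (100K falls in the 50K-200K tier) | |
| #   max_batch_size = max(50, 100000 // 36) = 2777 -> final size stays 1666 (~60 batches) | |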
| print(f"📑 Total sentences: {total_sentences:,}") | |
| print(f"📑 Target batch size: {optimal_batch_size} sentences") | |
| # Calculate expected number of batches | |
| expected_batches = (total_sentences + optimal_batch_size - 1) // optimal_batch_size | |
| print(f"📑 Expected batches: {expected_batches} (for {extraction_workers} workers)") | |
| print(f"📑 Batches per worker: ~{expected_batches // extraction_workers} batches") | |
| batches = [sentences[i:i + optimal_batch_size] for i in range(0, len(sentences), optimal_batch_size)] | |
| print(f"📑 Processing {len(batches)} batches of ~{optimal_batch_size} sentences each") | |
| print(f"📑 Expected speedup: {min(extraction_workers, len(batches))}x (using {extraction_workers} workers)") | |
| # Decide between ThreadPoolExecutor and ProcessPoolExecutor | |
| import multiprocessing | |
| in_subprocess = multiprocessing.current_process().name != 'MainProcess' | |
| # Use ProcessPoolExecutor for better parallelism on larger datasets | |
| # On Windows, we CAN use ProcessPoolExecutor in subprocess with spawn context | |
| use_process_pool = len(sentences) > 5000 # decided purely by dataset size; subprocess/daemon status is handled below | |
| if use_process_pool: | |
| # Check if we're in a daemonic process (can't spawn children) | |
| is_daemon = multiprocessing.current_process().daemon if hasattr(multiprocessing.current_process(), 'daemon') else False | |
| if in_subprocess and is_daemon: | |
| # Daemonic processes can't spawn children - fall back to ThreadPoolExecutor | |
| print(f"⚠️ Running in daemonic subprocess - cannot use ProcessPoolExecutor") | |
| print(f"📁 Falling back to ThreadPoolExecutor (limited parallelism due to GIL)") | |
| use_process_pool = False | |
| executor_class = ThreadPoolExecutor | |
| executor_kwargs = {'max_workers': extraction_workers} | |
| use_mp_pool = False | |
| else: | |
| # We can use ProcessPoolExecutor | |
| if in_subprocess: | |
| print(f"📁 Using ProcessPoolExecutor in non-daemonic subprocess") | |
| print(f"📁 This enables TRUE parallelism even from within a subprocess!") | |
| else: | |
| print(f"📁 Using ProcessPoolExecutor for maximum performance (true parallelism)") | |
| mp_context = multiprocessing.get_context('spawn') | |
| executor_class = mp_context.Pool | |
| # Capture CURRENT environment variable values from parent process | |
| current_env_vars = { | |
| 'GLOSSARY_MAX_SENTENCES': os.getenv('GLOSSARY_MAX_SENTENCES', '200'), | |
| 'GLOSSARY_MIN_FREQUENCY': os.getenv('GLOSSARY_MIN_FREQUENCY', '2'), | |
| 'GLOSSARY_MAX_NAMES': os.getenv('GLOSSARY_MAX_NAMES', '50'), | |
| 'GLOSSARY_MAX_TITLES': os.getenv('GLOSSARY_MAX_TITLES', '30'), | |
| 'GLOSSARY_BATCH_SIZE': os.getenv('GLOSSARY_BATCH_SIZE', '50'), | |
| 'GLOSSARY_STRIP_HONORIFICS': os.getenv('GLOSSARY_STRIP_HONORIFICS', '1'), | |
| 'GLOSSARY_FUZZY_THRESHOLD': os.getenv('GLOSSARY_FUZZY_THRESHOLD', '0.90'), | |
| } | |
| print(f"📁 Passing env vars to child processes: GLOSSARY_MAX_SENTENCES={current_env_vars['GLOSSARY_MAX_SENTENCES']}") | |
| # For multiprocessing.Pool, we use different kwargs | |
| # Use module-level init function (can't use local function due to pickling) | |
| executor_kwargs = { | |
| 'processes': extraction_workers, | |
| 'initializer': _init_worker_with_env, | |
| 'initargs': (current_env_vars,) | |
| } | |
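| # The initializer is expected to copy these values into each spawned child's environment. | |
| # A minimal sketch of that idea (assumption - the real _init_worker_with_env is defined at module level): | |
| #   def _init_worker_with_env(env_vars): | |
| #       for key, value in env_vars.items(): | |
| #           os.environ[key] = value | |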
| use_mp_pool = True # Flag to use different API | |
| else: | |
| print(f"📁 Using ThreadPoolExecutor for sentence processing (dataset < 5000 sentences)") | |
| executor_class = ThreadPoolExecutor | |
| executor_kwargs = {'max_workers': extraction_workers} | |
| use_mp_pool = False | |
| # Handle multiprocessing.Pool vs concurrent.futures differently | |
| if use_process_pool and use_mp_pool: | |
| # Use multiprocessing.Pool API (map_async) | |
| with executor_class(**executor_kwargs) as pool: | |
| # Prepare data for process pool | |
| exclude_check_data = ( | |
| list(honorifics_to_exclude), | |
| [p.pattern for p in title_patterns], | |
| PM.COMMON_WORDS, | |
| PM.CHINESE_NUMS | |
| ) | |
| # Prepare all arguments | |
| all_args = [(batch, idx, combined_pattern, exclude_check_data) | |
| for idx, batch in enumerate(batches)] | |
| print(f"📁 Submitting {len(all_args)} batches to process pool...") | |
| # Use map_async with chunksize for better distribution | |
| # chunksize=1 means each worker gets one batch at a time | |
| result_async = pool.map_async(_process_sentence_batch_for_extraction, all_args, chunksize=1) | |
| # Poll for completion with progress estimates | |
| completed_batches = 0 | |
| batch_start_time = time.time() | |
| next_report_ts = batch_start_time + 5.0 | |
| print(f"📁 Processing batches with {extraction_workers} parallel workers...") | |
| while not result_async.ready(): | |
| time.sleep(2) # Check every 2 seconds | |
| now = time.time() | |
| elapsed = now - batch_start_time | |
| # Emit logs on a fixed 5s cadence (5, 10, 15...) even if our poll loop wakes late. | |
| while now >= next_report_ts: | |
| elapsed_for_log = int(next_report_ts - batch_start_time) | |
| # Estimate progress based on time and worker count | |
| batches_per_second = extraction_workers / 0.3 # rough heuristic | |
| estimated_completed = min(int(elapsed * batches_per_second), len(all_args)) | |
| estimated_progress = min(95, (estimated_completed / len(all_args)) * 100) | |
| estimated_sentences = min(estimated_completed * optimal_batch_size, total_sentences) | |
| if estimated_progress < 95: | |
| print(f"📁 Processing... ~{estimated_progress:.0f}% estimated (~{estimated_sentences:,} sentences) | {elapsed_for_log}s elapsed") | |
| else: | |
| print(f"📁 Processing... finalizing last batches | {elapsed_for_log}s elapsed") | |
| next_report_ts += 5.0 | |
| # Get all results | |
| total_elapsed = time.time() - batch_start_time | |
| print(f"📁 All batches completed in {total_elapsed:.1f}s! Collecting results...") | |
| all_results = result_async.get() | |
| # Process all results | |
| for local_word_freq, local_important, local_seen, batch_idx in all_results: | |
| # Merge results | |
| word_freq.update(local_word_freq) | |
| for sentence in local_important: | |
| sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| processed_count += len(batches[batch_idx]) | |
| completed_batches += 1 | |
| # Show progress | |
| progress_interval = 1 if len(batches) <= 20 else (5 if len(batches) <= 100 else 10) | |
| if completed_batches % progress_interval == 0 or completed_batches == len(batches): | |
| progress = (processed_count / total_sentences) * 100 | |
| elapsed = time.time() - batch_start_time | |
| rate = (processed_count / elapsed) if elapsed > 0 else 0 | |
| print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") | |
| else: | |
| # Use concurrent.futures API (ThreadPoolExecutor or ProcessPoolExecutor) | |
| with executor_class(**executor_kwargs) as executor: | |
| futures = [] | |
| # Prepare data for ProcessPoolExecutor if needed | |
| if use_process_pool: | |
| # Serialize exclusion check data for process pool | |
| exclude_check_data = ( | |
| list(honorifics_to_exclude), | |
| [p.pattern for p in title_patterns], | |
| PM.COMMON_WORDS, | |
| PM.CHINESE_NUMS | |
| ) | |
| for idx, batch in enumerate(batches): | |
| if use_process_pool: | |
| # Use module-level function for ProcessPoolExecutor | |
| future = executor.submit(_process_sentence_batch_for_extraction, | |
| (batch, idx, combined_pattern, exclude_check_data)) | |
| else: | |
| # Use local function for ThreadPoolExecutor | |
| future = executor.submit(process_sentence_batch, batch, idx) | |
| futures.append(future) | |
| # Yield to GUI when submitting futures | |
| if idx % 10 == 0: | |
| time.sleep(0.001) | |
| # Collect results with progress | |
| completed_batches = 0 | |
| batch_start_time = time.time() | |
| for future in as_completed(futures): | |
| # Get result without timeout - as_completed already handles waiting | |
| local_word_freq, local_important, local_seen, batch_idx = future.result() | |
| # Merge results | |
| word_freq.update(local_word_freq) | |
| for sentence in local_important: | |
| sentence_key = ' '.join(sorted(re.findall(combined_pattern, sentence))) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| processed_count += len(batches[batch_idx]) | |
| completed_batches += 1 | |
| # Show progress more frequently for better user feedback | |
| progress_interval = 1 if len(batches) <= 20 else (5 if len(batches) <= 100 else 10) | |
| if completed_batches % progress_interval == 0 or completed_batches == len(batches): | |
| progress = (processed_count / total_sentences) * 100 | |
| elapsed = time.time() - batch_start_time | |
| rate = (processed_count / elapsed) if elapsed > 0 else 0 | |
| print(f"📑 Progress: {processed_count:,}/{total_sentences:,} sentences ({progress:.1f}%) | Batch {completed_batches}/{len(batches)} | {rate:.0f} sent/sec") | |
| # Yield to GUI after each batch completes | |
| time.sleep(0.001) | |
| else: | |
| # Sequential processing with progress | |
| for idx, sentence in enumerate(sentences): | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| if matches: | |
| # Filter out excluded terms | |
| filtered_matches = [] | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| word_freq[match] += 1 | |
| filtered_matches.append(match) | |
| # Keep sentences with valid potential terms | |
| if filtered_matches: | |
| sentence_key = ' '.join(sorted(filtered_matches)) | |
| if sentence_key not in seen_contexts: | |
| important_sentences.append(sentence) | |
| seen_contexts.add(sentence_key) | |
| # Show progress every 1000 sentences or 2 seconds | |
| if idx % 1000 == 0 or (time.time() - last_progress_time > 2): | |
| progress = ((idx + 1) / total_sentences) * 100 | |
| print(f"📑 Processing sentences: {idx + 1:,}/{total_sentences:,} ({progress:.1f}%)") | |
| last_progress_time = time.time() | |
| # Yield to GUI thread every 1000 sentences | |
| time.sleep(0.001) # Tiny sleep to let GUI update | |
| print(f"📑 Found {len(important_sentences):,} sentences with potential glossary terms") | |
| # Step 6/7: Deduplicate and normalize terms | |
| # Skip this heavy deduplication if "Dynamic Limit Expansion" (include_all_characters) is disabled | |
| # When disabled, we only care about exact matches of high-frequency terms, which combined_freq already handles | |
| if not include_all_characters: | |
| print(f"📑 Step 6/7: Skipping advanced term deduplication (Dynamic Limit Expansion disabled)...") | |
| print(f"📑 Using simple normalized frequency counts for {len(word_freq):,} terms") | |
| combined_freq = Counter() | |
| term_count = 0 | |
| # Simple deduplication by normalized form only | |
| for term, count in word_freq.items(): | |
| normalized = term.lower().strip() | |
| if normalized in combined_freq: | |
| if count > combined_freq[normalized]: | |
| del combined_freq[normalized] | |
| combined_freq[term] = count | |
| else: | |
| combined_freq[term] = count | |
| term_count += 1 | |
| if term_count % 5000 == 0: | |
| time.sleep(0.001) | |
| else: | |
| print(f"📑 Step 6/7: Normalizing and deduplicating {len(word_freq):,} unique terms...") | |
| combined_freq = Counter() | |
| term_count = 0 | |
| # Original logic with potential for future advanced features if enabled | |
| for term, count in word_freq.items(): | |
| normalized = term.lower().strip() | |
| if normalized in combined_freq: | |
| if count > combined_freq[normalized]: | |
| del combined_freq[normalized] | |
| combined_freq[term] = count | |
| else: | |
| combined_freq[term] = count | |
| term_count += 1 | |
| if term_count % 1000 == 0: | |
| time.sleep(0.001) | |
| print(f"📑 Deduplicated to {len(combined_freq):,} unique terms") | |
| # Filter to keep only terms that appear at least min_frequency times | |
| frequent_terms = {term: count for term, count in combined_freq.items() if count >= min_frequency} | |
| # Build filtered text focusing on sentences containing frequent terms | |
| print(f"📑 Step 7/7: Building filtered text from relevant sentences...") | |
| # OPTIMIZATION: Skip sentences that already passed filtering in step 5 | |
| # These sentences already contain glossary terms, no need to check again! | |
| # We just need to limit the sample size | |
| filtered_sentences = important_sentences # Already filtered! | |
| print(f"📑 Using {len(filtered_sentences):,} pre-filtered sentences (already contain glossary terms)") | |
| # EARLY DYNAMIC EXPANSION: collect one sentence index per unique honorific-attached name (first appearance), before scoring/nuance | |
| def _sentence_has_gender_pronoun(sent: str) -> bool: | |
| if not include_gender_context_flag or not gender_pronouns: | |
| return True | |
| return any(p in sent for p in gender_pronouns) | |
| if include_all_characters: | |
| honorific_pattern_str = None | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| h_list = PM.CJK_HONORIFICS[primary_lang] + PM.CJK_HONORIFICS.get('english', []) | |
| h_list.sort(key=len, reverse=True) | |
| if h_list: | |
| honorific_pattern_str = '|'.join(map(re.escape, h_list)) | |
| if honorific_pattern_str: | |
| try: | |
| honorifics = PM.CJK_HONORIFICS.get(primary_lang, []) + PM.CJK_HONORIFICS.get('english', []) | |
| honorifics = [h for h in honorifics if h] # drop empties | |
| # Keep only clear suffix/title honorifics; drop verb endings/keigo/politeness particles | |
| if primary_lang == 'korean': | |
| suffix_allow = {'님','씨','군','양','공','옹','낭','랑','생','자','부','모','시','제','족하', | |
| '마마','대감','영감','나리','도령','낭자','아씨','규수','각하','전하','폐하','저하','합하', | |
| '대비','대왕','왕자','공주','도련님','아가씨'} | |
| honorifics = [h for h in honorifics if h in suffix_allow] | |
| elif primary_lang == 'japanese': | |
| suffix_allow = {'さん','ちゃん','君','くん','様','さま','殿','先輩','先生','氏','殿下','閣下','卿'} | |
| honorifics = [h for h in honorifics if h in suffix_allow] | |
| elif primary_lang == 'chinese': | |
| # short person titles only | |
| honorifics = [h for h in honorifics if len(h) <= 3 and h in {'先生','小姐','夫人','公子','姑娘','大人','阁下','将军','公主','少爷','老爷','相公','郎君','少侠','侠士'}] | |
| else: | |
| # romanized suffixes only | |
| honorifics = [h for h in honorifics if h.startswith('-') and len(h) <= 8] | |
| if honorifics: | |
| hon_regex = "|".join(map(re.escape, honorifics)) | |
| cjk_name_pat = r"[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af·]{2,4}" | |
| latin_name_pat = r"[A-Z][a-z]{1,15}(?:\s+[A-Z][a-z]{1,15}){0,1}" | |
| punct_opt = r"[,、,.:;!?…\)\] \}】』」]?" | |
| combined_pat = re.compile( | |
| rf"(?P<name>{cjk_name_pat}|{latin_name_pat})\s*(?P<hon>{hon_regex}){punct_opt}" | |
| ) | |
| honor_pat = re.compile(hon_regex) | |
| ordered_names = [] | |
| for idx, sent in enumerate(filtered_sentences): | |
| for m in combined_pat.finditer(sent): | |
| name = m.group("name").strip() | |
| if not name or any(ch.isdigit() for ch in name): | |
| continue | |
| # Apply strict filtering to regex matches too | |
| # FILTERING: Skip tokens with common noisy start characters | |
| if any(name.startswith(c) for c in ['[', '(', '{', '<', '-', 'ㄴ', 'ㅇ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅋ', 'ㅎ']): | |
| continue | |
| # FILTERING: Skip tokens that are just common words/particles | |
| if name in PM.COMMON_WORDS: | |
| continue | |
| # FILTERING: Aggressive Korean Verb/Adjective Ending Check | |
| if len(name) > 2 and any(name.endswith(e) for e in ['겠네', '리라', '니까', '는데', '러나', '다가', '면서', '지만', '도록', '으로', '에서', '에게', '한테', '라고', '이란']): | |
| continue | |
| # Skip if name looks like a title term (PatternManager title patterns) | |
| skip_title = False | |
| for pat in PM.TITLE_PATTERNS.get(primary_lang, []): | |
| if re.search(pat, name): | |
| skip_title = True | |
| break | |
| if skip_title: | |
| continue | |
| if name not in honorific_first_indices: | |
| honorific_first_indices[name] = idx | |
| # Append every time to track frequency | |
| ordered_names.append(name) | |
| # Fallback: token immediately before any honorific | |
| # NOTE: Bidirectional check ('after') was removed due to excessive false positives. | |
| # Strict filtering applied to 'before' token to reduce noise. | |
| for m in honor_pat.finditer(sent): | |
| # 1. Check BEFORE the honorific | |
| if primary_lang == 'chinese': | |
| # Chinese logic: Get previous 2-4 characters without relying on space | |
| start_idx = m.start() | |
| # Try taking 2, 3, 4 characters backwards | |
| # Chinese names are typically 2-3 characters (Surname + Given Name) | |
| # We check if they form a valid name | |
| # Prefer the longest run of valid Chinese characters (2-4) found immediately before the honorific. | |
| token = "" | |
| # Scan backwards for valid Chinese chars | |
| current_token = "" | |
| for i in range(1, 5): # Look back up to 4 chars | |
| if start_idx - i < 0: break | |
| char = sent[start_idx - i] | |
| # Check if char is valid Chinese character | |
| if '\u4e00' <= char <= '\u9fff': | |
| current_token = char + current_token | |
| else: | |
| break # Stop at non-Chinese char (punctuation, space, etc) | |
| if len(current_token) >= 2: | |
| token = current_token | |
| elif primary_lang == 'japanese': | |
| # Japanese logic: Get previous 2-6 characters | |
| start_idx = m.start() | |
| # Scan backwards for valid Japanese chars (Kanji, Hiragana, Katakana) | |
| token = "" | |
| current_token = "" | |
| for i in range(1, 7): # Look back up to 6 chars | |
| if start_idx - i < 0: break | |
| char = sent[start_idx - i] | |
| # Check if char is valid Japanese character | |
| # Kanji: 4E00-9FFF, Hiragana: 3040-309F, Katakana: 30A0-30FF | |
| # Also include long vowel mark (ー): 30FC | |
| is_valid_jp = ('\u4e00' <= char <= '\u9fff') or \ | |
| ('\u3040' <= char <= '\u309f') or \ | |
| ('\u30a0' <= char <= '\u30ff') or \ | |
| (char == '\u30fc') | |
| if is_valid_jp: | |
| current_token = char + current_token | |
| else: | |
| break # Stop at non-Japanese char | |
| if len(current_token) >= 2: | |
| token = current_token | |
| else: | |
| # Original logic for space-separated languages (Korean, English) | |
| prefix = sent[:m.start()].strip() | |
| if prefix: | |
| token = prefix.split()[-1] | |
| token = token.strip(".,;:!?\"'()[]{}<>~`@#$%^&*-=_+|\\/") | |
| else: | |
| token = "" | |
| if token: | |
| # Apply all validation logic (common words, fullmatch regex, etc.) | |
| if not any(ch.isdigit() for ch in token): | |
| # FILTERING: Skip tokens with common noisy start characters | |
| if not any(token.startswith(c) for c in ['[', '(', '{', '<', '-', 'ㄴ', 'ㅇ', 'ㄹ', 'ㅁ', 'ㅂ', 'ㅋ', 'ㅎ']): | |
| # FILTERING: Skip tokens that look like file extensions or paths | |
| if not ('.' in token or '/' in token or '\\' in token): | |
| # FILTERING: Skip tokens that are just common words/particles | |
| if token not in PM.COMMON_WORDS: | |
| # FILTERING: Aggressive Korean Verb/Adjective Ending Check | |
| if not (len(token) > 2 and any(token.endswith(e) for e in ['겠네', '리라', '니까', '는데', '러나', '다가', '면서', '지만', '도록', '으로', '에서', '에게', '한테', '라고', '이란'])): | |
| # STRICTER ATTACHMENT CHECK FOR KOREAN SUFFIXES | |
| # (For Chinese, we already extracted attached characters, so this check is implicitly passed or N/A) | |
| is_attached = True | |
| if primary_lang != 'chinese': | |
| is_attached = not sent[:m.start()].endswith(' ') | |
| # Valid token structure check | |
| valid_shape = False | |
| # STRICTER: Use regex to ensure the ENTIRE token matches the valid pattern | |
| if re.fullmatch(r'[\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af·]{2,4}', token): | |
| valid_shape = True | |
| elif re.fullmatch(r'^[A-Z][a-z]{1,15}(\s+[A-Z][a-z]{1,15})?$', token): | |
| valid_shape = True | |
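| # Illustrative shape checks (comment only): | |
| #   '김상현'       -> valid   (3 CJK chars, within the 2-4 window) | |
| #   'Elder Mo'     -> valid   (two title-case Latin words) | |
| #   '그리고말했다'  -> invalid (6 chars, exceeds the 2-4 CJK window) | |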
| if valid_shape: | |
| # Skip if token looks like a title term | |
| skip_title = False | |
| for pat in PM.TITLE_PATTERNS.get(primary_lang, []): | |
| if re.search(pat, token): | |
| skip_title = True | |
| break | |
| if not skip_title: | |
| if token not in honorific_first_indices: | |
| honorific_first_indices[token] = idx | |
| # Append every time to track frequency | |
| ordered_names.append(token) | |
| # DEDUPLICATE THE REPRESENTATIVE UNIQUE CHARACTERS HERE | |
| if ordered_names: | |
| print(f"📑 Deduplicating {len(ordered_names)} potential character names (honorific-first)...") | |
| try: | |
| import duplicate_detection_config as DDC | |
| # Get configured algorithm and threshold | |
| dd_config = DDC.get_duplicate_detection_config() | |
| algo_desc = dd_config.get('description', 'Unknown') | |
| # Use environment variable directly as fallback | |
| fallback_threshold = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| effective_threshold = dd_config.get('threshold', fallback_threshold) | |
| selected_algo = os.getenv('GLOSSARY_DUPLICATE_ALGORITHM', 'auto').upper() | |
| print(f"📑 Duplicate Detection Algorithm: {selected_algo} ({algo_desc})") | |
| print(f"📑 Deduplicating names with threshold: {effective_threshold:.2f}") | |
| # Optimized deduplication using bucketing by first character | |
| # This avoids O(N²) all-to-all comparison while maintaining fuzzy matching quality | |
| deduped_names = [] | |
| kept_indices = {} # Rebuild this map | |
| skipped_dupes = 0 | |
| # Filter by honorific attachment frequency | |
| # Only keep names that appear with an honorific at least N times | |
| # This filters out one-off noise while keeping legitimate names | |
| name_freq_with_honorific = Counter(ordered_names) | |
| # Use configured minimum frequency (GLOSSARY_MIN_FREQUENCY) | |
| # This allows the user to control the strictness via the GUI/Config | |
| min_hon_freq = min_frequency | |
| print(f"📑 Filtering by honorific attachment frequency (min {min_hon_freq} occurrences)...") | |
| # Get unique candidates that meet frequency threshold | |
| # Use seen set to deduplicate ordered_names while preserving order | |
| filtered_unique = [] | |
| seen_candidates = set() | |
| for name in ordered_names: | |
| if name not in seen_candidates and name_freq_with_honorific[name] >= min_hon_freq: | |
| filtered_unique.append(name) | |
| seen_candidates.add(name) | |
| print(f"📑 Reduced candidates from {len(ordered_names)} (total) to {len(filtered_unique)} (unique freq-filtered)") | |
| ordered_names = filtered_unique | |
| # Fast lookup structures | |
| seen_normalized = set() | |
| # Bucket by first character (normalized) to reduce search space | |
| # Key: first_char, Value: list of existing names starting with that char | |
| lookup_buckets = {} | |
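| # Bucket layout sketch (illustrative): after keeping '김상현', '김유진' and 'Aria' (assuming the | |
| # first two are not fuzzy-duplicates of each other), lookup_buckets would look like | |
| #   {'김': ['김상현', '김유진'], 'a': ['Aria']} | |
| # so each new candidate is only fuzzy-compared against names sharing its first character. | |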
| print(f"📑 Processing {len(ordered_names)} names with bucketed optimization...") | |
| for i, name in enumerate(ordered_names): | |
| # Progress logging for large sets | |
| if i > 0 and i % 1000 == 0: | |
| print(f"📑 Dedupe progress: {i}/{len(ordered_names)}...") | |
| norm = name.lower().strip() | |
| if not norm: continue | |
| # 1. Exact normalized check (O(1) - Instant) | |
| if norm in seen_normalized: | |
| skipped_dupes += 1 | |
| continue | |
| # 2. Fuzzy Check (Bucketed) | |
| is_dup = False | |
| first_char = norm[0] | |
| # Only compare against names starting with the same character | |
| # This reduces comparisons by ~20-50x (alphabet size) | |
| candidates = lookup_buckets.get(first_char, []) | |
| # If bucket is massive (>1000), limit to most recent 1000 to prevent slowdown | |
| # (Heuristic: duplicates usually appear near each other or we catch them early) | |
| if len(candidates) > 1000: | |
| search_candidates = candidates[-1000:] | |
| else: | |
| search_candidates = candidates | |
| for existing in search_candidates: | |
| score = DDC.calculate_similarity_with_config(name, existing, dd_config) | |
| if score >= effective_threshold: | |
| is_dup = True | |
| skipped_dupes += 1 | |
| break | |
| if not is_dup: | |
| deduped_names.append(name) | |
| seen_normalized.add(norm) | |
| # Add to bucket | |
| if first_char not in lookup_buckets: | |
| lookup_buckets[first_char] = [] | |
| lookup_buckets[first_char].append(name) | |
| # Keep the original index | |
| if name in honorific_first_indices: | |
| kept_indices[name] = honorific_first_indices[name] | |
| print(f"📑 Advanced deduplication removed {skipped_dupes} duplicate names") | |
| # Update the lists | |
| ordered_names = deduped_names | |
| honorific_first_indices = kept_indices | |
| except ImportError: | |
| print("⚠️ duplicate_detection_config module not found, skipping name deduplication") | |
| except Exception as e: | |
| print(f"⚠️ Name deduplication failed: {e}") | |
| else: | |
| print("📑 Dynamic expansion (honorific-first): no honorifics found in PatternManager for this language") | |
| base_count = len(honorific_first_indices) | |
| if include_gender_context_flag and base_count > 0: | |
| try: | |
| gender_subset = sum( | |
| 1 for idx in honorific_first_indices.values() | |
| if 0 <= idx < len(filtered_sentences) and _sentence_has_gender_pronoun(filtered_sentences[idx]) | |
| ) | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring (gender-context subset: {gender_subset})") | |
| except Exception: | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring") | |
| else: | |
| print(f"📑 Dynamic expansion (honorific-first): captured {base_count} unique characters before scoring") | |
| # Debug: Write filtered terms to file (User request) | |
| if base_count > 0 and 'ordered_names' in locals(): | |
| try: | |
| # Use output_dir if available, otherwise cwd | |
| debug_base = output_dir if 'output_dir' in locals() else os.getcwd() | |
| debug_dir = os.path.join(debug_base, 'debug') | |
| os.makedirs(debug_dir, exist_ok=True) | |
| debug_file_path = os.path.join(debug_dir, 'honorific_debug.txt') | |
| with open(debug_file_path, 'w', encoding='utf-8') as f: | |
| for name in ordered_names: | |
| f.write(f"{name}\n") | |
| print(f"📑 Wrote {len(ordered_names)} terms to {debug_file_path}") | |
| except Exception as e: | |
| print(f"📑 Failed to write debug file: {e}") | |
| except Exception: | |
| print("📑 Dynamic expansion (honorific-first): error parsing honorific names; continuing without early captures") | |
| else: | |
| print("📑 Dynamic expansion (honorific-first): no honorific pattern available for this language") | |
| # For extremely large datasets, we can optionally do additional filtering | |
| # Skip this reduction when include_all_characters is enabled to avoid losing rare characters | |
| if (not include_all_characters) and len(filtered_sentences) > 10000 and len(frequent_terms) > 1000: | |
| print(f"📑 Large dataset detected - applying frequency-based filtering...") | |
| print(f"📑 Filtering {len(filtered_sentences):,} sentences for top frequent terms...") | |
| # Sort terms by frequency to prioritize high-frequency ones | |
| sorted_terms = sorted(frequent_terms.items(), key=lambda x: x[1], reverse=True) | |
| top_terms = dict(sorted_terms[:1000]) # Focus on top 1000 most frequent terms | |
| print(f"📑 Using top {len(top_terms):,} most frequent terms for final filtering") | |
| # Use parallel processing only if really needed | |
| if use_parallel and len(filtered_sentences) > 5000: | |
| import multiprocessing | |
| in_subprocess = multiprocessing.current_process().name != 'MainProcess' | |
| # Create a simple set of terms for fast lookup (no variations needed) | |
| term_set = set(top_terms.keys()) | |
| print(f"📑 Using parallel filtering with {extraction_workers} workers...") | |
| # Optimize batch size for ProcessPoolExecutor (reduce overhead) | |
| # Use larger batches since this is a simpler operation than term extraction | |
| check_batch_size = max(1000, len(filtered_sentences) // (extraction_workers * 5)) | |
| check_batches = [filtered_sentences[i:i + check_batch_size] | |
| for i in range(0, len(filtered_sentences), check_batch_size)] | |
| print(f"📑 Processing {len(check_batches)} batches of ~{check_batch_size} sentences") | |
| # Use ProcessPoolExecutor for true parallelism (if not already in subprocess) | |
| use_process_pool_filtering = (not in_subprocess and len(check_batches) > 3) | |
| if use_process_pool_filtering: | |
| print(f"📑 Using ProcessPoolExecutor for true parallel filtering") | |
| new_filtered = [] | |
| with ProcessPoolExecutor(max_workers=extraction_workers) as executor: | |
| # Use the module-level function _check_sentence_batch_for_terms | |
| futures = [executor.submit(_check_sentence_batch_for_terms, (batch, term_set)) | |
| for batch in check_batches] | |
| for future in as_completed(futures): | |
| new_filtered.extend(future.result()) | |
| else: | |
| print(f"📑 Using ThreadPoolExecutor for filtering (small dataset or in subprocess)") | |
| # Simple function to check if sentence contains any top term | |
| def check_batch_simple(batch): | |
| result = [] | |
| for sentence in batch: | |
| # Simple substring check - much faster than regex | |
| for term in term_set: | |
| if term in sentence: | |
| result.append(sentence) | |
| break | |
| return result | |
| new_filtered = [] | |
| with ThreadPoolExecutor(max_workers=extraction_workers) as executor: | |
| futures = [executor.submit(check_batch_simple, batch) for batch in check_batches] | |
| for future in as_completed(futures): | |
| new_filtered.extend(future.result()) | |
| filtered_sentences = new_filtered | |
| print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") | |
| else: | |
| # For smaller datasets, simple sequential filtering | |
| print(f"📑 Using sequential filtering...") | |
| new_filtered = [] | |
| for i, sentence in enumerate(filtered_sentences): | |
| for term in top_terms: | |
| if term in sentence: | |
| new_filtered.append(sentence) | |
| break | |
| if i % 1000 == 0: | |
| print(f"📑 Progress: {i:,}/{len(filtered_sentences):,} sentences") | |
| time.sleep(0.001) | |
| filtered_sentences = new_filtered | |
| print(f"📑 Filtered to {len(filtered_sentences):,} sentences containing top terms") | |
| print(f"📑 Selected {len(filtered_sentences):,} sentences containing frequent terms") | |
| # Track character-like term count for final summary | |
| character_term_count = 0 | |
| # Limit the number of sentences to reduce token usage | |
| if max_sentences is None: | |
| max_sentences_fallback = os.getenv("GLOSSARY_MAX_SENTENCES", "200") | |
| print(f"🔍 [DEBUG] max_sentences was None, reading from environment: '{max_sentences_fallback}'") | |
| max_sentences = int(max_sentences_fallback) | |
| else: | |
| print(f"🔍 [DEBUG] max_sentences parameter was provided: {max_sentences}") | |
| print(f"🔍 [DEBUG] Final GLOSSARY_MAX_SENTENCES value being used: {max_sentences}") | |
| # Force smart selection path when dynamic expansion is enabled, even if filtered_sentences <= max_sentences | |
| run_smart_selection = (not force_skip_smart_selection) and (include_all_characters or (max_sentences > 0 and len(filtered_sentences) > max_sentences)) | |
| if run_smart_selection and max_sentences > 0: | |
| dynamic_bonus = len(honorific_first_indices) if include_all_characters else 0 | |
| effective_preview = max_sentences + dynamic_bonus | |
| if dynamic_bonus > 0: | |
| print(f"📁 Limiting to {max_sentences} + {dynamic_bonus} (dynamic expansion) = {effective_preview} representative sentences (from {len(filtered_sentences):,})") | |
| else: | |
| print(f"📁 Limiting to {max_sentences} representative sentences (from {len(filtered_sentences):,})") | |
| # SMART SELECTION: Prioritize sentences with unique terms and gender context | |
| # instead of blind slicing. | |
| # 1. Identify which terms appear in which sentences | |
| # We need to re-scan briefly or pass this info along. Re-scanning is safer/easier here. | |
| if gender_nuance_enabled: | |
| print("📑 analyzing sentences for term coverage and gender nuance...") | |
| else: | |
| print("📑 analyzing sentences for term coverage (gender nuance disabled)...") | |
| term_to_sentences = {} # term -> list of (score, sentence_index) | |
| sentence_scores = {} # index -> score | |
| # Pre-compile regexes | |
| honorific_pattern_str = None | |
| if primary_lang in PM.CJK_HONORIFICS: | |
| h_list = PM.CJK_HONORIFICS[primary_lang] + PM.CJK_HONORIFICS.get('english', []) | |
| h_list.sort(key=len, reverse=True) | |
| if h_list: | |
| honorific_pattern_str = '|'.join(map(re.escape, h_list)) | |
| # Get pronouns for scoring | |
| gender_pronouns = [] | |
| if gender_nuance_enabled and hasattr(PM, 'GENDER_PRONOUNS'): | |
| lang_key = 'english' | |
| if primary_lang == 'korean': lang_key = 'korean' | |
| elif primary_lang == 'chinese': lang_key = 'chinese' | |
| elif primary_lang == 'japanese': lang_key = 'japanese' | |
| gender_pronouns = PM.GENDER_PRONOUNS.get(lang_key, {}).get('male', []) + \ | |
| PM.GENDER_PRONOUNS.get(lang_key, {}).get('female', []) | |
| # If gender context is OFF or nuance scoring is disabled, skip expensive scoring and just build simple coverage map | |
| if not gender_nuance_enabled: | |
| print("📑 Gender context or nuance toggle disabled: using simple term coverage (no pronoun weighting).") | |
| for idx, sent in enumerate(filtered_sentences): | |
| sentence_scores[idx] = 1.0 | |
| for term in frequent_terms: | |
| if term in sent: | |
| term_to_sentences.setdefault(term, []).append(idx) | |
| # Parallelize scoring if dataset is large enough and gender context is ON | |
| elif use_parallel and len(filtered_sentences) > 2000: | |
| print(f"📑 Parallelizing sentence scoring with {extraction_workers} workers...") | |
| # Prepare batches | |
| # Aim for ~500 sentences per batch to get updates every ~2-3 seconds (assuming ~150-200 sent/sec) | |
| batch_size = 500 | |
| # However, ensure we don't have too few batches for the workers (utilize parallelism) | |
| if len(filtered_sentences) // batch_size < extraction_workers * 4: | |
| batch_size = max(100, len(filtered_sentences) // (extraction_workers * 4)) | |
| batches = [] | |
| for i in range(0, len(filtered_sentences), batch_size): | |
| end_idx = min(i + batch_size, len(filtered_sentences)) | |
| # Pass (start_index, list_of_sentences) | |
| batches.append((i, filtered_sentences[i:end_idx])) | |
| term_list = list(frequent_terms.keys()) | |
| # Use ProcessPoolExecutor for heavy CPU work | |
| if use_process_pool: | |
| executor_cls = ProcessPoolExecutor | |
| else: | |
| executor_cls = ThreadPoolExecutor | |
| with executor_cls(max_workers=extraction_workers) as executor: | |
| # Submit all batches | |
| futures = [executor.submit( | |
| _score_sentence_batch, | |
| (batch_data, term_list, honorific_pattern_str, gender_pronouns, include_gender_context) | |
| ) for batch_data in batches] | |
| # Collect results with progress logging | |
| completed_batches = 0 | |
| processed_count = 0 | |
| scoring_start_time = time.time() | |
| last_log_time = scoring_start_time | |
| total_batches = len(batches) | |
| total_to_score = len(filtered_sentences) | |
| # Emit wait logs even before the first batch completes | |
| try: | |
| from concurrent.futures import wait as _wait, FIRST_COMPLETED as _FIRST_COMPLETED | |
| except Exception: | |
| _wait = None | |
| _FIRST_COMPLETED = None | |
| pending = set(futures) | |
| while pending: | |
| done = set() | |
| if _wait is not None and _FIRST_COMPLETED is not None: | |
| done, pending = _wait(pending, timeout=5.0, return_when=_FIRST_COMPLETED) | |
| done = set(done or []) | |
| else: | |
| # Fallback: block until first completion (no wait logs) | |
| for future in as_completed(list(pending)): | |
| done.add(future) | |
| pending.discard(future) | |
| break | |
| if not done: | |
| # No batch completed within timeout | |
| elapsed = time.time() - scoring_start_time | |
| print(f"📑 Scoring... {elapsed:.0f}s elapsed") | |
| continue | |
| for future in done: | |
| try: | |
| batch_scores, batch_term_map = future.result() | |
| sentence_scores.update(batch_scores) | |
| # Merge term mappings | |
| for term, indices in batch_term_map.items(): | |
| if term not in term_to_sentences: | |
| term_to_sentences[term] = [] | |
| term_to_sentences[term].extend(indices) | |
| # Update progress stats | |
| completed_batches += 1 | |
| processed_count += len(batch_scores) | |
| current_time = time.time() | |
| elapsed = current_time - scoring_start_time | |
| # Log periodically (every ~5 seconds or if it's the last batch) | |
| if (current_time - last_log_time >= 5.0) or (completed_batches == total_batches): | |
| display_count = min(processed_count, total_to_score) | |
| progress_pct = min(99.9, (display_count / total_to_score) * 100) | |
| rate = display_count / elapsed if elapsed > 0 else 0 | |
| if completed_batches < total_batches: | |
| print(f"📑 Scoring... {display_count:,}/{total_to_score:,} sentences ({progress_pct:.1f}%) | Batch {completed_batches}/{total_batches} | {rate:.0f} sent/sec | {elapsed:.0f}s elapsed") | |
| else: | |
| print(f"📑 Scoring... {total_to_score:,}/{total_to_score:,} sentences (100.0%) | Batch {total_batches}/{total_batches} | {rate:.0f} sent/sec | {elapsed:.0f}s elapsed") | |
| print(f"📑 Scoring... finalizing last batches | {elapsed:.0f}s elapsed") | |
| last_log_time = current_time | |
| except Exception as e: | |
| print(f"⚠️ Scoring batch failed: {e}") | |
| total_elapsed = time.time() - scoring_start_time | |
| print(f"📁 All scoring batches completed in {total_elapsed:.1f}s!") | |
| else: | |
| # Sequential fallback | |
| honorific_pattern = re.compile(honorific_pattern_str) if honorific_pattern_str else None | |
| for idx, sent in enumerate(filtered_sentences): | |
| score = 1.0 | |
| if gender_nuance_enabled and gender_pronouns: | |
| for p in gender_pronouns: | |
| if p in sent: | |
| score += 5.0 | |
| break | |
| if honorific_pattern and honorific_pattern.search(sent): | |
| score += 2.0 | |
| sentence_scores[idx] = score | |
| for term in frequent_terms: | |
| if term in sent: | |
| if term not in term_to_sentences: | |
| term_to_sentences[term] = [] | |
| term_to_sentences[term].append(idx) | |
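| # Example score under this weighting (illustrative): a sentence containing both a gender pronoun | |
| # and an honorific scores 1.0 + 5.0 + 2.0 = 8.0, so it sorts ahead of plain term hits (1.0). | |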
| # 2. Select sentences via Round-Robin to ensure coverage of ALL unique terms | |
| # with PRIORITY for character-like terms (those with honorifics) | |
| selected_indices = set() | |
| # Sort each term's sentences by score descending (higher score first) | |
| for term in term_to_sentences: | |
| term_to_sentences[term].sort(key=lambda idx: sentence_scores[idx], reverse=True) | |
| # If dynamic expansion is on, prefer character terms derived from honorific-attached names | |
| honorific_char_terms = [] | |
| if include_all_characters and honorific_pattern_str: | |
| try: | |
| honor_pat = re.compile(honorific_pattern_str) | |
| char_term_map = {} | |
| name_regex = re.compile(r'([\w\-\u4e00-\u9fff\u3040-\u30ff\uac00-\ud7af]+)$') | |
| for idx, sent in enumerate(filtered_sentences): | |
| for m in honor_pat.finditer(sent): | |
| prefix = sent[:m.start()].strip() | |
| nm = name_regex.search(prefix) | |
| if nm: | |
| name = nm.group(1) | |
| char_term_map.setdefault(name, []).append(idx) | |
| if char_term_map: | |
| term_to_sentences = {k: sorted(v, key=lambda i: sentence_scores.get(i, 0), reverse=True) | |
| for k, v in char_term_map.items()} | |
| honorific_char_terms = list(term_to_sentences.keys()) | |
| except Exception: | |
| pass | |
| # Split terms into character-like (with honorifics) and others | |
| def _is_character_like(term: str) -> bool: | |
| try: | |
| if _has_honorific(term): | |
| return True | |
| # CJK short names | |
| if primary_lang in ['korean', 'japanese', 'chinese']: | |
| # Count CJK chars | |
| cjk_len = sum(1 for ch in term if 0x4E00 <= ord(ch) <= 0x9FFF or 0x3040 <= ord(ch) <= 0x30FF or 0xAC00 <= ord(ch) <= 0xD7AF) | |
| if 2 <= cjk_len <= 4: | |
| return True | |
| # English-style names: title case with 1-3 words | |
| parts = term.split() | |
| if 1 <= len(parts) <= 3 and all(p[:1].isupper() for p in parts if p): | |
| return True | |
| except Exception: | |
| pass | |
| return False | |
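| # Classification sketch (illustrative; exact results depend on _has_honorific and primary_lang): | |
| #   '김상현'          -> True  (3 CJK chars) | |
| #   'Aria Blackwood'  -> True  (two title-case words) | |
| #   '신성제국기사단'   -> False via the CJK length check (7 chars), unless _has_honorific() matches it | |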
| character_terms = [] | |
| non_character_terms = [] | |
| source_terms = honorific_char_terms if (include_all_characters and honorific_char_terms) else sorted(term_to_sentences.keys()) | |
| for term in source_terms: | |
| if _is_character_like(term): | |
| character_terms.append(term) | |
| else: | |
| non_character_terms.append(term) | |
| character_term_count = len(character_terms) | |
| # If dynamic limit expansion is enabled, prepare to cover every character-like term once | |
| if include_all_characters and character_terms: | |
| # Build characters strictly from honorific-bearing terms first; fallback to detection if none | |
| honorific_chars = [] | |
| if honorific_pattern_str: | |
| try: | |
| honor_pat = re.compile(honorific_pattern_str) | |
| honorific_chars = [t for t in character_terms if honor_pat.search(t)] | |
| except Exception: | |
| honorific_chars = [] | |
| if honorific_chars: | |
| character_terms = honorific_chars | |
| # Rank character terms by frequency so most frequent get picked first when sentences are missing | |
| character_terms = sorted(character_terms, key=lambda t: frequent_terms.get(t, 0), reverse=True) | |
| def round_robin_terms(term_list, selected_indices, target_limit, min_per_term=None): | |
| """Round-robin over provided term list, updating selected_indices in-place.""" | |
| term_iterators = [iter(term_to_sentences[t]) for t in term_list] | |
| # If min_per_term is set, ensure we get at least that many for each term first | |
| if min_per_term: | |
| for term in term_list: | |
| sentences = term_to_sentences[term] | |
| for i in range(min(min_per_term, len(sentences))): | |
| selected_indices.add(sentences[i]) | |
| while len(selected_indices) < target_limit and term_iterators: | |
| active_iterators = [] | |
| for it in term_iterators: | |
| if len(selected_indices) >= target_limit: | |
| break | |
| try: | |
| while True: | |
| idx = next(it) | |
| if idx not in selected_indices: | |
| selected_indices.add(idx) | |
| active_iterators.append(it) | |
| break | |
| except StopIteration: | |
| pass | |
| term_iterators = active_iterators | |
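| # Round-robin sketch (illustrative): with term_to_sentences = {'A': [3, 7], 'B': [7, 9]} and | |
| # target_limit = 3, pass one picks 3 (for 'A') and 7 (for 'B'); pass two exhausts 'A' and picks | |
| # 9 for 'B' (7 is already selected), giving selected_indices = {3, 7, 9}. | |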
| # Base limit from user/config | |
| base_limit = max_sentences | |
| requested_bonus = 0 | |
| # If we collected honorific-first sentences, seed the selection with them | |
| if include_all_characters and honorific_first_indices: | |
| for idx in honorific_first_indices.values(): | |
| if 0 <= idx < len(filtered_sentences): | |
| selected_indices.add(idx) | |
| requested_bonus = len(honorific_first_indices) | |
| # Dynamic expansion should ADD to the base limit, not replace it | |
| honorific_bonus = len(selected_indices) if include_all_characters else 0 | |
| effective_limit = base_limit + honorific_bonus | |
| requested_total = base_limit + requested_bonus | |
| print(f"📁 Requested sentence budget: base {base_limit} + bonus {requested_bonus} = {requested_total}") | |
| # Standard Fixed Limit Logic | |
| # First, prioritize character-like terms (honorific-based) | |
| if character_terms: | |
| round_robin_terms(character_terms, selected_indices, effective_limit) | |
| # Then, if we still have room, cover remaining non-character terms | |
| if len(selected_indices) < effective_limit and non_character_terms: | |
| round_robin_terms(non_character_terms, selected_indices, effective_limit) | |
| # If we still have room (rare), fill with highest scored remaining sentences | |
| target_limit = effective_limit | |
| if target_limit and len(selected_indices) < target_limit: | |
| remaining = sorted( | |
| [i for i in range(len(filtered_sentences)) if i not in selected_indices], | |
| key=lambda i: sentence_scores[i], | |
| reverse=True | |
| ) | |
| selected_indices.update(remaining[:target_limit - len(selected_indices)]) | |
| # Log the actual unique sentence count vs requested (base + bonus) | |
| unique_count = len(selected_indices) | |
| dropped = max(0, requested_total - unique_count) | |
| if include_all_characters: | |
| print(f"📁 Deduped sentence budget: requested {base_limit}+{requested_bonus} -> {unique_count} unique (dropped {dropped})") | |
| else: | |
| print(f"📁 Deduped sentence budget: requested {base_limit} -> {unique_count} unique (dropped {dropped})") | |
| # Sort indices to maintain narrative flow | |
| final_indices = sorted(list(selected_indices)) | |
| filtered_sentences = [filtered_sentences[i] for i in final_indices] | |
| dropped_windows = 0 | |
| dropped_sentence_indices = set() | |
| if include_all_characters: | |
| # Determine base vs bonus allocation before dedup | |
| pre_dedup_sentences = filtered_sentences # already ordered by final_indices | |
| pre_total = len(pre_dedup_sentences) | |
| pre_base = min(base_limit, pre_total) | |
| pre_bonus = max(0, pre_total - pre_base) | |
| base_idx_set = set(final_indices[:pre_base]) | |
| bonus_idx_set = set(final_indices[pre_base:]) | |
| # Map sentences to terms (characters and others) for coverage-aware dedup | |
| sentence_terms = {} | |
| if 'term_to_sentences' in locals(): | |
| for term, idx_list in term_to_sentences.items(): | |
| for idx in idx_list: | |
| if idx in final_indices: | |
| sentence_terms.setdefault(idx, set()).add(term) | |
| character_term_set = set(character_terms) if 'character_terms' in locals() else set() | |
| covered_char_terms = set() | |
| covered_terms_global = set() | |
| # Sentence-level dedup post-selection using duplicate_detection_config + slider threshold | |
| dup_config = ddc.get_duplicate_detection_config() | |
| # Fallback to env slider if save_glossary scope variable isn't in this function | |
| fuzzy_threshold_env = float(os.getenv("GLOSSARY_FUZZY_THRESHOLD", "0.90")) | |
| dup_threshold = dup_config.get('threshold', fuzzy_threshold_env) | |
| algo_list = dup_config.get('algorithms', []) | |
| algo_mode = os.getenv("GLOSSARY_DUPLICATE_ALGORITHM", "auto") | |
| print(f"📋 Sentence dedup config: mode={algo_mode}, algos={algo_list}, slider={fuzzy_threshold_env:.2f}, threshold_used={dup_threshold:.2f}, available={ddc.get_algorithm_display_info()}") | |
| dedup_seen_exact = set() | |
| kept_sentences = [] | |
| kept_indices = [] | |
| base_kept = bonus_kept = 0 | |
| base_dropped = bonus_dropped = 0 | |
| for idx, sent in zip(final_indices, pre_dedup_sentences): | |
| key = sent.strip() | |
| if not key: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| # Exact duplicate quick check | |
| if key in dedup_seen_exact: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| terms_here = sentence_terms.get(idx, set()) if sentence_terms else set() | |
| # Term-based dedup: drop if this sentence contributes no new terms (all terms already covered) | |
| is_dup = False | |
| if terms_here and terms_here.issubset(covered_terms_global): | |
| is_dup = True | |
| else: | |
| if kept_sentences: | |
| klen = len(key) | |
| min_len = int(klen * 0.7) | |
| max_len = int(klen * 1.3) | |
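| # Cheap prefilter sketch (illustrative): for a 100-char candidate, only kept sentences of | |
| # length 70-130 that share at least 50 distinct characters with it reach the fuzzy scorer; | |
| # everything else is skipped without calling calculate_similarity_with_config. | |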
| for other in kept_sentences: | |
| if not (min_len <= len(other) <= max_len): | |
| continue | |
| if len(set(key) & set(other)) < klen * 0.5: | |
| continue | |
| sim = ddc.calculate_similarity_with_config(key, other, dup_config) | |
| if sim >= dup_threshold: | |
| is_dup = True | |
| break | |
| if is_dup: | |
| # Guard: keep if this sentence is the only coverage for an uncovered character term | |
| keep_for_character = False | |
| if sentence_terms: | |
| for t in sentence_terms.get(idx, set()): | |
| if t in character_term_set and t not in covered_char_terms: | |
| keep_for_character = True | |
| break | |
| if not keep_for_character: | |
| if idx in base_idx_set: | |
| base_dropped += 1 | |
| else: | |
| bonus_dropped += 1 | |
| continue | |
| # Keep | |
| dedup_seen_exact.add(key) | |
| kept_sentences.append(key) | |
| kept_indices.append(idx) | |
| # Mark covered character terms | |
| if sentence_terms: | |
| for t in terms_here: | |
| if t in character_term_set: | |
| covered_char_terms.add(t) | |
| covered_terms_global.add(t) | |
| if idx in base_idx_set: | |
| base_kept += 1 | |
| else: | |
| bonus_kept += 1 | |
| # Rebuild filtered_sentences preserving original ordering | |
| kept_index_set = set(kept_indices) | |
| filtered_sentences = [sent for idx, sent in zip(final_indices, pre_dedup_sentences) if idx in kept_index_set] | |
| dropped_sentence_indices = set(final_indices) - kept_index_set | |
| total_kept = base_kept + bonus_kept | |
| total_dropped = base_dropped + bonus_dropped | |
| dropped_windows = total_dropped | |
| print( | |
| f"📁 Deduped sentence budget: base {pre_base}->{base_kept} (dropped {base_dropped}), " | |
| f"bonus {pre_bonus}->{bonus_kept} (dropped {bonus_dropped}), total {total_kept}" | |
| ) | |
| # Re-log with dedup-applied cap shrink | |
| print( | |
| f"📁 Smart selection complete: Kept {len(filtered_sentences)} sentences covering " | |
| f"{len(term_to_sentences)} unique terms (cap shrink by {total_dropped})" | |
| ) | |
| else: | |
| print(f"📁 Smart selection complete: Kept {len(filtered_sentences)} sentences covering {len(term_to_sentences)} unique terms") | |
| dropped_windows = 0 | |
| elif max_sentences == 0: | |
| print(f"📁 Including ALL {len(filtered_sentences):,} sentences (max_sentences=0)") | |
| # Check if gender context expansion is enabled | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| if include_gender_context: | |
| context_window = int(os.getenv("GLOSSARY_CONTEXT_WINDOW", "2")) | |
| print(f"📑 Gender context enabled: Expanding snippets with {context_window}-sentence windows...") | |
| if 'dropped_windows' in locals() and dropped_windows: | |
| print(f"📑 Context windows skipped due to dedup: {dropped_windows}") | |
| # Split full text into sentences for context extraction | |
| all_sentences_list = re.split(r'[.!?。!?]+', clean_text) | |
| all_sentences_list = [s.strip() for s in all_sentences_list if s.strip()] | |
| # Create index map for fast lookup - OPTIMIZED to O(n) instead of O(n²) | |
| # Build a lookup dict: sentence -> index for fast matching | |
| sentence_to_index = {} | |
| all_sentences_normalized = {s.strip(): idx for idx, s in enumerate(all_sentences_list)} | |
| print(f"📑 Mapping {len(filtered_sentences):,} filtered sentences to context positions...") | |
| kept_windows = 0 | |
| for filtered_sent in filtered_sentences: | |
| filtered_normalized = filtered_sent.strip() | |
| # Try exact match first (fastest) | |
| if filtered_normalized in all_sentences_normalized: | |
| sentence_to_index[filtered_sent] = all_sentences_normalized[filtered_normalized] | |
| else: | |
| # Try substring match (slower fallback) | |
| found = False | |
| for sentence, idx in all_sentences_normalized.items(): | |
| if filtered_normalized in sentence or sentence in filtered_normalized: | |
| sentence_to_index[filtered_sent] = idx | |
| found = True | |
| break | |
| if not found: | |
| # Last resort: try finding in original list | |
| for idx, sentence in enumerate(all_sentences_list): | |
| if filtered_normalized in sentence or sentence in filtered_normalized: | |
| sentence_to_index[filtered_sent] = idx | |
| break | |
| # Build context windows with explicit boundaries to avoid cross-window leakage | |
| context_groups: list[str] = [] | |
| window_seeds: list[int] = [] | |
| included_indices = set() | |
| for filtered_sent in filtered_sentences: | |
| # If we can't locate the sentence in the master list, wrap it individually | |
| if filtered_sent not in sentence_to_index: | |
| # Dedup-dropped sentences were already removed when filtered_sentences was rebuilt above | |
| # (and dropped_sentence_indices holds indices, not sentence strings), so anything reaching | |
| # this point still needs its own single-sentence window. | |
| window_num = len(context_groups) + 1 | |
| context_groups.append( | |
| f"{filtered_sent}\n=== CONTEXT {window_num} END ===" | |
| ) | |
| window_seeds.append(-1) | |
| continue | |
| idx = sentence_to_index[filtered_sent] | |
| # Skip if already included in a previous window | |
| if idx in included_indices: | |
| continue | |
| # No dedup check needed here: sentences dropped during the budget dedup were already | |
| # removed from filtered_sentences, so every remaining seed sentence gets a window. | |
| # Get context window: [idx-context_window ... idx ... idx+context_window] | |
| start_idx = max(0, idx - context_window) | |
| end_idx = min(len(all_sentences_list), idx + context_window + 1) | |
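| # e.g. with context_window=2 and idx=10 this yields sentences 8..12, i.e. up to | |
| # (2 * context_window + 1) sentences centered on the seed sentence. | |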
| # Mark all sentences in this window as included | |
| for i in range(start_idx, end_idx): | |
| included_indices.add(i) | |
| # Extract the window and wrap with start/end markers for splitter safety | |
| window_sentences = all_sentences_list[start_idx:end_idx] | |
| context_group_body = ' '.join(window_sentences) | |
| window_num = len(context_groups) + 1 | |
| context_groups.append( | |
| f"{context_group_body}\n=== CONTEXT {window_num} END ===" | |
| ) | |
| window_seeds.append(idx) | |
| kept_windows += 1 | |
| skipped_windows = len(filtered_sentences) - kept_windows | |
| print(f"📑 Created {len(context_groups):,} context windows (up to {context_window*2+1} sentences each)") | |
| if skipped_windows: | |
| print(f"📑 Context windows removed after dedup: {skipped_windows}") | |
| # Window-level dedup: drop windows whose term set is already covered, while keeping one per character | |
| window_terms = [] | |
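| # Each window's term set comes from its seed sentence only (via window_seeds); the extra | |
| # context sentences pulled in around the seed do not contribute terms to the window. | |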
| if 'sentence_terms' in locals(): | |
| for seed_idx in window_seeds: | |
| if seed_idx == -1: | |
| window_terms.append(set()) | |
| else: | |
| window_terms.append(sentence_terms.get(seed_idx, set())) | |
| else: | |
| window_terms = [set() for _ in window_seeds] | |
| covered_terms_global = set() | |
| covered_char_terms = set() | |
| kept_context_groups = [] | |
| kept_window_seeds = [] | |
| for cg, seed_idx, terms in zip(context_groups, window_seeds, window_terms): | |
| if not terms: | |
| # keep empty-term windows to preserve structure | |
| kept_context_groups.append(cg) | |
| kept_window_seeds.append(seed_idx) | |
| continue | |
| drop = False | |
| # STRICT: one window per character. If any character term here is already covered, drop this window. | |
| # NOTE: the locals() check must sit outside the comprehension; inside it, locals() only sees the | |
| # comprehension's own scope, so the original inline check left char_terms permanently empty. | |
| char_terms = set(t for t in terms if t in character_term_set) if 'character_term_set' in locals() else set() | |
| if char_terms and char_terms & covered_char_terms: | |
| drop = True | |
| elif not char_terms and terms.issubset(covered_terms_global): | |
| drop = True | |
| # If no character terms yet covered, allow first appearance | |
| if drop: | |
| keep_for_char = any((t in character_term_set and t not in covered_char_terms) for t in terms) if 'character_term_set' in locals() else False | |
| if keep_for_char and not (char_terms & covered_char_terms): | |
| drop = False | |
| if drop: | |
| continue | |
| # keep and mark coverage | |
| kept_context_groups.append(cg) | |
| kept_window_seeds.append(seed_idx) | |
| for t in terms: | |
| covered_terms_global.add(t) | |
| if 'character_term_set' in locals() and t in character_term_set: | |
| covered_char_terms.add(t) | |
| dropped_windows_after_terms = len(context_groups) - len(kept_context_groups) | |
| if dropped_windows_after_terms: | |
| print(f"📑 Context windows removed after term-aware dedup: {dropped_windows_after_terms}") | |
| # Compute true total sentences emitted in kept windows | |
| total_window_sentences = 0 | |
| for ctx in kept_context_groups: | |
| # split on end marker to avoid counting it | |
| body = ctx.split('=== CONTEXT ')[0] | |
| # crude split by sentence separators | |
| total_window_sentences += len([s for s in re.split(r'[.!?。!?]+', body) if s.strip()]) | |
| print(f"📑 Final kept windows: {len(kept_context_groups)}, final kept sentences (within windows): {total_window_sentences}") | |
| filtered_text = '\n\n'.join(kept_context_groups) # Separate windows with double newline | |
| print(f"📑 Context-expanded text: {len(filtered_text):,} characters") | |
| else: | |
| # Even without gender context, add footer markers to preserve boundaries for chapter splitting | |
| context_groups = [] | |
| for idx, sent in enumerate(filtered_sentences, 1): | |
| context_groups.append(f"{sent}\n=== CONTEXT {idx} END ===") | |
| filtered_text = '\n\n'.join(context_groups) | |
| # Determine character count for summary (use dynamic-expansion tally when available) | |
| if include_all_characters and honorific_first_indices: | |
| character_term_count = len(honorific_first_indices) | |
| elif 'character_terms' in locals() and character_terms: | |
| character_term_count = len(set(character_terms)) | |
| # Calculate and display filtering statistics | |
| filter_end_time = time.time() | |
| filter_duration = filter_end_time - filter_start_time | |
| original_length = len(clean_text) | |
| filtered_text = _normalize_filtered_text(filtered_text) | |
| filtered_length = len(filtered_text) | |
| size_change_percent = ((original_length - filtered_length) / original_length * 100) if original_length > 0 else 0 | |
| print("📑 Applied post-filter text normalization to remove orphaned quotes and extra blank lines") | |
| print(f"\n📑 === FILTERING COMPLETE ===") | |
| print(f"📑 Duration: {filter_duration:.1f} seconds") | |
| if size_change_percent >= 0: | |
| print(f"📑 Text reduction: {original_length:,} → {filtered_length:,} chars ({size_change_percent:.1f}% reduction)") | |
| else: | |
| print(f"📑 Text expansion: {original_length:,} → {filtered_length:,} chars ({abs(size_change_percent):.1f}% expansion)") | |
| print(f"📑 Terms found: {len(frequent_terms):,} unique terms (min frequency: {min_frequency})") | |
| print(f"📑 Characters found (character-like terms): {character_term_count:,}") | |
| print(f"📑 Final output: {len(filtered_sentences)} sentences, {filtered_length:,} characters") | |
| print(f"📑 Performance: {(original_length / filter_duration / 1000):.1f}K chars/second") | |
| print(f"📑 ========================\n") | |
| return filtered_text, frequent_terms | |
| def _normalize_filtered_text(text: str) -> str: | |
| """Normalize filtered text by collapsing stray blank lines and orphaned quote lines.""" | |
| if not text: | |
| return text | |
| quote_open = {"“", "「", "『", "\""} | |
| quote_close = {"”", "」", "』", "\""} | |
| lines = text.replace("\r\n", "\n").split("\n") | |
| normalized_lines = [] | |
| i = 0 | |
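| # Single pass over lines: a line that is only a closing quote is glued onto the previous kept | |
| # line (after popping trailing blanks); a line that is only an opening quote is glued onto the | |
| # next non-empty line; everything else passes through unchanged. | |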
| while i < len(lines): | |
| line = lines[i] | |
| stripped = line.strip() | |
| if stripped in quote_close: | |
| # Remove trailing blank lines before attaching closing quote | |
| while normalized_lines and not normalized_lines[-1].strip(): | |
| normalized_lines.pop() | |
| if normalized_lines: | |
| normalized_lines[-1] = normalized_lines[-1].rstrip() + stripped | |
| else: | |
| normalized_lines.append(stripped) | |
| elif stripped in quote_open: | |
| j = i + 1 | |
| while j < len(lines) and not lines[j].strip(): | |
| j += 1 | |
| if j < len(lines): | |
| match = re.match(r"^(\s*)(.*)$", lines[j]) | |
| if match: | |
| leading, remainder = match.groups() | |
| lines[j] = f"{leading}{stripped}{remainder}" | |
| else: | |
| lines[j] = f"{stripped}{lines[j]}" | |
| else: | |
| normalized_lines.append(stripped) | |
| else: | |
| normalized_lines.append(line) | |
| i += 1 | |
| normalized_text = "\n".join(normalized_lines) | |
| normalized_text = re.sub(r"\n{3,}", "\n\n", normalized_text) | |
| normalized_text = re.sub(r"\n{2,}([”」』])", r"\n\1", normalized_text) | |
| normalized_text = re.sub(r"([“「『])\n{2,}", r"\1\n", normalized_text) | |
| normalized_text = re.sub(r"\n{2,}", "\n", normalized_text) | |
| return normalized_text | |
| def _extract_with_custom_prompt(custom_prompt, all_text, language, | |
| min_frequency, max_names, max_titles, | |
| existing_glossary, output_dir, | |
| strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all', max_sentences=200, log_callback=None, | |
| chunk_pos=None, total_chunks=None): | |
| """Extract glossary using custom AI prompt with proper filtering""" | |
| # Redirect stdout to GUI log if callback provided (but not in subprocess - worker handles it) | |
| import sys | |
| in_subprocess = hasattr(sys.stdout, 'queue') | |
| if log_callback and not in_subprocess: | |
| set_output_redirect(log_callback) | |
| print("📑 Using custom automatic glossary prompt") | |
| extraction_start = time.time() | |
| # Check stop flag | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary extraction stopped by user") | |
| return {} | |
| # Note: Filter mode can be controlled via the configurable prompt environment variable | |
| # No hardcoded filter instructions are added here | |
| try: | |
| MODEL = os.getenv("MODEL", "gemini-2.0-flash") | |
| API_KEY = (os.getenv("API_KEY") or | |
| os.getenv("OPENAI_API_KEY") or | |
| os.getenv("OPENAI_OR_Gemini_API_KEY") or | |
| os.getenv("GEMINI_API_KEY")) | |
| if is_traditional_translation_api(MODEL): | |
| # Pattern fallback disabled; traditional translation APIs can't run AI extraction. | |
| print("📑 Traditional translation API selected - skipping automatic glossary extraction (pattern fallback disabled)") | |
| return {} | |
| elif not API_KEY and not _model_uses_own_auth(MODEL): | |
| # Pattern fallback disabled; without an API key we can't run AI extraction. | |
| print("📑 No API key found - skipping automatic glossary extraction (pattern fallback disabled)") | |
| return {} | |
| else: | |
| print(f"📑 Using AI-assisted extraction with custom prompt") | |
| # Ensure multi-key config is available in this process if enabled | |
| _ensure_multi_key_config_loaded() | |
| from unified_api_client import UnifiedClient, UnifiedClientError | |
| client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) | |
| # Log glossary anti-duplicate parameters usage | |
| if os.getenv("GLOSSARY_ENABLE_ANTI_DUPLICATE", "0") == "1": | |
| ad_top_p = os.getenv("GLOSSARY_TOP_P", "1.0") | |
| ad_top_k = os.getenv("GLOSSARY_TOP_K", "0") | |
| ad_freq = os.getenv("GLOSSARY_FREQUENCY_PENALTY", "0.0") | |
| ad_pres = os.getenv("GLOSSARY_PRESENCE_PENALTY", "0.0") | |
| ad_rep = os.getenv("GLOSSARY_REPETITION_PENALTY", "1.0") | |
| print(f"🎯 Anti-duplicate enabled for glossary (top_p={ad_top_p}, top_k={ad_top_k}, freq_penalty={ad_freq}, presence_penalty={ad_pres}, repetition_penalty={ad_rep})") | |
| # Progress-bar labeling: when running chunked auto-glossary, give each in-flight call a unique name. | |
| # This drives the GUI watchdog tooltip "Active calls" list. | |
| progress_context = 'glossary' | |
| try: | |
| if chunk_pos is not None and total_chunks is not None: | |
| progress_context = f"auto glossary ({int(chunk_pos)}/{int(total_chunks)})" | |
| except Exception: | |
| progress_context = 'glossary' | |
| client.context = progress_context | |
| if hasattr(client, 'reset_cleanup_state'): | |
| client.reset_cleanup_state() | |
| # Apply thread submission delay using the client's method | |
| thread_delay = float(os.getenv("THREAD_SUBMISSION_DELAY_SECONDS", "0.5")) | |
| if thread_delay > 0: | |
| client._apply_thread_submission_delay() | |
| # Check if cancelled during delay | |
| if hasattr(client, '_cancelled') and client._cancelled: | |
| print("📑 ❌ Glossary extraction stopped during delay") | |
| return {} | |
| # Check if text is already filtered (from chunking or cache) | |
| already_filtered = (os.getenv("_CHUNK_ALREADY_FILTERED", "0") == "1" or | |
| os.getenv("_TEXT_ALREADY_FILTERED", "0") == "1") | |
| if already_filtered: | |
| # print("📑 Text already filtered, skipping re-filtering") | |
| text_sample = all_text # Use as-is since it's already filtered | |
| detected_terms = {} | |
| else: | |
| # Apply smart filtering to reduce noise and focus on meaningful content | |
| force_disable = os.getenv("GLOSSARY_FORCE_DISABLE_SMART_FILTER", "0") == "1" | |
| use_smart_filter = (os.getenv("GLOSSARY_USE_SMART_FILTER", "1") == "1") and not force_disable | |
| if not use_smart_filter: | |
| # Smart filter disabled - send FULL text without any filtering or truncation | |
| print("📁 Smart filtering DISABLED by user - sending full text to API (this will be expensive!)") | |
| text_sample = all_text | |
| detected_terms = {} | |
| else: | |
| # Smart filter enabled - apply intelligent filtering | |
| print("📁 Applying smart text filtering to reduce noise...") | |
| # Use max_sentences parameter (passed from parent, already read from environment) | |
| print(f"🔍 [DEBUG] In _extract_with_custom_prompt: max_sentences={max_sentences}") | |
| text_sample, detected_terms = _filter_text_for_glossary(all_text, min_frequency, max_sentences) | |
| # If there is no content left, skip API call | |
| if not text_sample or not str(text_sample).strip(): | |
| print("📑 No text available after filtering - skipping automatic glossary generation") | |
| return {} | |
| # Replace placeholders in prompt | |
| # Get target language from environment (used in the prompt for translation output) | |
| target_language = os.getenv('GLOSSARY_TARGET_LANGUAGE', 'English') | |
| # Count context marker windows for the {marker} placeholder. Count them in text_sample, the | |
| # (possibly filtered) text actually sent to the API; the unfiltered all_text may not contain | |
| # any '=== CONTEXT N END ===' markers at all. | |
| marker_matches = re.findall(r"===\s*CONTEXT\s+\d+\s+END\s*===", text_sample or "") | |
| marker_count = len(marker_matches) | |
| system_prompt = custom_prompt.replace('{language}', target_language) | |
| system_prompt = system_prompt.replace('{min_frequency}', str(min_frequency)) | |
| system_prompt = system_prompt.replace('{max_names}', str(max_names)) | |
| system_prompt = system_prompt.replace('{max_titles}', str(max_titles)) | |
| system_prompt = system_prompt.replace('{marker}', str(marker_count)) | |
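| # Example: if the filtered text contains 12 '=== CONTEXT N END ===' markers, '{marker}' in | |
| # the prompt is replaced with '12'. | |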
| # Send system prompt and text as separate messages | |
| messages = [ | |
| {"role": "system", "content": system_prompt}, | |
| {"role": "user", "content": f"{text_sample}"} | |
| ] | |
| # Check stop before API call | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary extraction stopped before API call") | |
| return {} | |
| try: | |
| # Use glossary-specific temperature with fallback to global | |
| temperature = float(os.getenv("GLOSSARY_TEMPERATURE", os.getenv("TEMPERATURE", "0.3"))) | |
| # Use glossary-specific max output tokens with fallback to global | |
| max_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "4096"))) | |
| # Use send_with_interrupt for interruptible API call | |
| # Respect RETRY_TIMEOUT toggle - if disabled, use None for infinite timeout | |
| retry_env = os.getenv("RETRY_TIMEOUT") | |
| retry_timeout_enabled = retry_env is None or retry_env.strip().lower() not in ("0", "false", "off", "") | |
| chunk_timeout = None | |
| if retry_timeout_enabled: | |
| env_ct = os.getenv("CHUNK_TIMEOUT", "1800") | |
| try: | |
| ct_val = float(env_ct) | |
| chunk_timeout = None if ct_val <= 0 else ct_val | |
| except Exception: | |
| chunk_timeout = None | |
| print(f"📑 Sending AI extraction request (timeout: {chunk_timeout if chunk_timeout is not None else 'disabled'}s, interruptible)...") | |
| else: | |
| print(f"📑 Sending AI extraction request (timeout: disabled, interruptible)...") | |
| # Before API call | |
| api_start = time.time() | |
| print(f"📑 Preparing API request (text size: {len(text_sample):,} chars)...") | |
| print(f"📑 ⏳ Processing {len(text_sample):,} characters... Please wait, this may take 5-10 minutes") | |
| # Timeout retry logic (matches translation behavior) | |
| try: | |
| max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) | |
| except Exception: | |
| max_timeout_retries = 2 | |
| timeout_retry_count = 0 | |
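| # Retry loop: timeouts, cancellations, and client-init failures are retried up to | |
| # TIMEOUT_RETRY_ATTEMPTS times with a small staggered delay; an explicit user stop | |
| # returns an empty result immediately. | |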
| while True: | |
| try: | |
| response, finish_reason, raw_obj = send_with_interrupt( | |
| messages=messages, | |
| client=client, | |
| temperature=temperature, | |
| max_tokens=max_tokens, | |
| stop_check_fn=is_stop_requested, | |
| chunk_timeout=chunk_timeout, | |
| context=progress_context | |
| ) | |
| break | |
| except UnifiedClientError as e: | |
| error_msg = str(e) | |
| lower_msg = error_msg.lower() | |
| # Only treat an explicit user stop as an interrupt; timeouts/cancellations should retry | |
| user_stopped = ("stopped by user" in lower_msg) or ( | |
| is_stop_requested() and not any(k in lower_msg for k in ("timeout", "timed out", "cancelled")) | |
| ) | |
| if user_stopped: | |
| print(f"📑 ❌ AI extraction interrupted by user") | |
| return {} | |
| # Treat cancelled / client init errors as timeout retries | |
| is_timeout = ("timed out" in lower_msg) or ("timeout" in lower_msg) or ("cancelled" in lower_msg) or ("client not initialized" in lower_msg) | |
| if is_timeout and timeout_retry_count < max_timeout_retries: | |
| timeout_retry_count += 1 | |
| if chunk_timeout: | |
| print(f"⚠️ AI extraction timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| else: | |
| print(f"⚠️ AI extraction timed out, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| # Clear cancellation flags that timeouts may have set | |
| try: | |
| client.reset_cleanup_state() | |
| except Exception: | |
| pass | |
| try: | |
| # Also clear class-level global cancellation for all clients | |
| client.__class__.set_global_cancellation(False) | |
| except Exception: | |
| pass | |
| # Reinitialize client if needed | |
| client_type = getattr(client, 'client_type', 'unknown') | |
| needs_reinit = False | |
| if client_type == 'gemini': | |
| needs_reinit = hasattr(client, 'gemini_client') and client.gemini_client is None | |
| elif client_type == 'openai': | |
| needs_reinit = hasattr(client, 'openai_client') and client.openai_client is None | |
| if needs_reinit: | |
| try: | |
| print(f" 🔄 Reinitializing {client_type} client...") | |
| client._setup_client() | |
| except Exception as reinit_err: | |
| print(f" ⚠️ Failed to reinitialize client: {reinit_err}") | |
| # Stagger retries to avoid simultaneous API calls | |
| try: | |
| import random | |
| base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) | |
| retry_delay = random.uniform(base_delay / 2, base_delay) | |
| print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") | |
| time.sleep(retry_delay) | |
| except Exception: | |
| time.sleep(1.0) | |
| continue | |
| else: | |
| raise | |
| api_time = time.time() - api_start | |
| print(f"📑 API call completed in {api_time:.1f}s") | |
| # Get the actual text from the response | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Before processing response | |
| process_start = time.time() | |
| # print(f"📑 Processing AI response...") | |
| # Process response and build CSV | |
| csv_lines = _process_ai_response(response_text, all_text, min_frequency, | |
| strip_honorifics, fuzzy_threshold, | |
| language, filter_mode) | |
| print(f"📑 AI extracted {len(csv_lines) - 1} valid terms (header excluded)") | |
| process_time = time.time() - process_start | |
| # print(f"📑 Response processing took {process_time:.1f}s") | |
| # If we're running per-chunk, defer all heavy work and saving | |
| if os.getenv("GLOSSARY_DEFER_SAVE", "0") == "1": | |
| return csv_lines | |
| # Check stop before merging | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary generation stopped before merging") | |
| return {} | |
| # Merge with existing glossary if present | |
| if existing_glossary: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language) | |
| # Always inject the book title BEFORE any deduplication or filtering so it | |
| # survives the first run (previously only happened after a second run/merge) | |
| if os.getenv("GLOSSARY_INCLUDE_BOOK_TITLE", "0") == "1": | |
| csv_lines = _ensure_book_title_csv_lines(csv_lines) | |
| print("📚 Book title injected before dedup (single-shot glossary path)") | |
| # Fuzzy matching deduplication | |
| skip_frequency_check = os.getenv("GLOSSARY_SKIP_FREQUENCY_CHECK", "0") == "1" | |
| if not skip_frequency_check: # Only dedupe if we're checking frequencies | |
| # Time the deduplication | |
| dedup_start = time.time() | |
| original_count = len(csv_lines) - 1 # Exclude header | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| dedup_time = time.time() - dedup_start | |
| final_count = len(csv_lines) - 1 # Exclude header | |
| removed_count = original_count - final_count | |
| print(f"📑 Deduplication completed in {dedup_time:.1f}s") | |
| print(f"📑 - Original entries: {original_count}") | |
| print(f"📑 - Duplicates removed: {removed_count}") | |
| print(f"📑 - Final entries: {final_count}") | |
| # Store for summary statistics | |
| _dedup_time = dedup_time | |
| else: | |
| print(f"📑 Skipping deduplication (frequency check disabled)") | |
| # Apply filter mode to final results | |
| csv_lines = _filter_csv_by_mode(csv_lines, filter_mode) | |
| # Check if we should use token-efficient format | |
| use_legacy_format = os.getenv('GLOSSARY_USE_LEGACY_CSV', '0') == '1' | |
| if not use_legacy_format: | |
| # Convert to token-efficient format | |
| csv_lines = _convert_to_token_efficient_format(csv_lines) | |
| # Final sanitize to prevent stray headers | |
| csv_lines = _sanitize_final_glossary_lines(csv_lines, use_legacy_format) | |
| # Create final CSV content | |
| csv_content = '\n'.join(csv_lines) | |
| # Save glossary as CSV with proper extension | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| print(f"\n📑 ✅ AI-ASSISTED GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| c_count, t_count, total = _count_glossary_entries(csv_lines, use_legacy_format) | |
| print(f"📑 Character entries: {c_count}") | |
| # print(f"📑 Term entries: {t_count}") | |
| print(f"📑 Total entries: {total}") | |
| total_time = time.time() - extraction_start | |
| print(f"📑 Total extraction time: {total_time:.1f}s") | |
| return _parse_csv_to_dict(csv_content) | |
| except UnifiedClientError as e: | |
| if "stopped by user" in str(e).lower(): | |
| print(f"📑 ❌ AI extraction interrupted by user") | |
| return {} | |
| else: | |
| print(f"⚠️ AI extraction failed: {e}") | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ AI extraction failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| except Exception as e: | |
| print(f"⚠️ Custom prompt processing failed: {e}") | |
| import traceback | |
| traceback.print_exc() | |
| print("📑 ❌ Glossary generation failed - returning empty glossary") | |
| return {} | |
| def _filter_csv_by_mode(csv_lines, filter_mode): | |
| """Filter CSV lines based on the filter mode""" | |
| if filter_mode == "all": | |
| return csv_lines | |
| filtered = [csv_lines[0]] # Keep header | |
| for line in csv_lines[1:]: | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| if filter_mode == "only_with_honorifics": | |
| # Only keep character entries with honorifics | |
| if entry_type == "character" and _has_honorific(raw_name): | |
| filtered.append(line) | |
| elif filter_mode == "only_without_honorifics": | |
| # Keep terms and characters without honorifics | |
| if entry_type == "term" or (entry_type == "character" and not _has_honorific(raw_name)): | |
| filtered.append(line) | |
| print(f"📑 Filter '{filter_mode}': {len(filtered)-1} entries kept from {len(csv_lines)-1}") | |
| return filtered | |
| def _process_ai_response(response_text, all_text, min_frequency, | |
| strip_honorifics, fuzzy_threshold, language, filter_mode): | |
| """Process AI response and return CSV lines""" | |
| # Check if gender context and description are enabled (used throughout the function) | |
| include_gender_context = os.getenv("GLOSSARY_INCLUDE_GENDER_CONTEXT", "0") == "1" | |
| include_description = os.getenv("GLOSSARY_INCLUDE_DESCRIPTION", "0") == "1" | |
| # option to completely skip frequency validation for speed | |
| skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" | |
| # if skip_all_validation: | |
| # print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") | |
| # Clean response text | |
| response_text = response_text.strip() | |
| # Remove string representation artifacts if they wrap the entire response | |
| if response_text.startswith('("') and response_text.endswith('")'): | |
| response_text = response_text[2:-2] | |
| elif response_text.startswith('"') and response_text.endswith('"'): | |
| response_text = response_text[1:-1] | |
| elif response_text.startswith('(') and response_text.endswith(')'): | |
| response_text = response_text[1:-1] | |
| # Unescape the string | |
| response_text = response_text.replace('\\n', '\n') | |
| response_text = response_text.replace('\\r', '') | |
| response_text = response_text.replace('\\t', '\t') | |
| response_text = response_text.replace('\\"', '"') | |
| response_text = response_text.replace("\\'", "'") | |
| response_text = response_text.replace('\\\\', '\\') | |
| # Clean up markdown code blocks if present | |
| if '```' in response_text: | |
| parts = response_text.split('```') | |
| for part in parts: | |
| if 'csv' in part[:10].lower(): | |
| response_text = part[part.find('\n')+1:] | |
| break | |
| elif part.strip() and ('type,raw_name' in part or 'character,' in part or 'term,' in part): | |
| response_text = part | |
| break | |
| # Normalize line endings | |
| response_text = response_text.replace('\r\n', '\n').replace('\r', '\n') | |
| lines = [line.strip() for line in response_text.strip().split('\n') if line.strip()] | |
| import csv | |
| # --- Dynamic header capture: accept every column the AI returns --- | |
| dynamic_header = None | |
| dynamic_rows = [] | |
| for ln in lines: | |
| low = ln.lower() | |
| if 'type' in low and 'raw_name' in low: | |
| try: | |
| dynamic_header = [c.strip() for c in next(csv.reader([ln])) if c.strip()] | |
| except Exception: | |
| dynamic_header = [c.strip() for c in ln.split(',') if c.strip()] | |
| continue | |
| if dynamic_header: | |
| try: | |
| dynamic_rows.append(next(csv.reader([ln]))) | |
| except Exception: | |
| dynamic_rows.append([c.strip() for c in ln.split(',')]) | |
| if dynamic_header: | |
| required = {h.lower(): i for i, h in enumerate(dynamic_header)} | |
| if all(k in required for k in ('type', 'raw_name', 'translated_name')): | |
| csv_lines = [','.join(dynamic_header)] | |
| for row in dynamic_rows: | |
| if len(row) < len(dynamic_header): | |
| row += [''] * (len(dynamic_header) - len(row)) | |
| elif len(row) > len(dynamic_header): | |
| desc_idx = required.get('description') | |
| if desc_idx is not None and desc_idx < len(dynamic_header): | |
| row = row[:desc_idx] + [','.join(row[desc_idx:])] | |
| else: | |
| row = row[:len(dynamic_header)] | |
| # Clean stop tokens | |
| row = ['' if cell in ("'stop'", "stop") else cell for cell in row] | |
| entry_type = row[required['type']].strip() if len(row) > required['type'] else '' | |
| raw_name = row[required['raw_name']].strip() if len(row) > required['raw_name'] else '' | |
| translated_name = row[required['translated_name']].strip() if len(row) > required['translated_name'] else '' | |
| if not raw_name or not translated_name: | |
| continue | |
| csv_lines.append(','.join(row[:len(dynamic_header)])) | |
| if csv_lines: | |
| print(f"📑 Dynamic header detected from AI: {dynamic_header}") | |
| return csv_lines | |
| csv_lines = [] | |
| header_found = False | |
| # Post-response min_frequency filtering is disabled (accept all AI rows); | |
| # skip_frequency_check forced true to bypass frequency gating. | |
| skip_frequency_check = True | |
| # Add option to completely skip ALL validation for maximum speed | |
| skip_all_validation = os.getenv("GLOSSARY_SKIP_ALL_VALIDATION", "0") == "1" | |
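| # Fast path: when GLOSSARY_SKIP_ALL_VALIDATION=1, every well-formed AI row is accepted as-is | |
| # (only malformed tuple/list/quote-wrapped rows are rejected) and no frequency checks run. | |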
| if skip_all_validation: | |
| # print("📑 ⚡ FAST MODE: Skipping all frequency validation (accepting all AI results)") | |
| # Use appropriate header based on gender and description settings | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") | |
| # print("📑 Fast mode: Using 4-column format with gender") | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") | |
| # Process the AI response | |
| for line in lines: | |
| # Skip header lines | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue | |
| # Parse CSV line | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| if include_description and len(parts) >= 5: | |
| # Has all 5 columns (with gender and description) | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender},{description}") | |
| elif include_gender_context and len(parts) >= 4: | |
| # Has all 4 columns (with gender) | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender}") | |
| elif len(parts) >= 3: | |
| # Has at least 3 columns | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| if include_description: | |
| # Add empty gender and description columns when 5 columns expected | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender},{description}") | |
| elif include_gender_context: | |
| # Add empty gender column for 3-column entries when 4 columns expected | |
| gender = parts[3] if len(parts) > 3 else '' | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name},{gender}") | |
| else: | |
| csv_lines.append(f"{entry_type},{raw_name},{translated_name}") | |
| elif len(parts) == 2: | |
| # Missing type, default to 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if (raw_name and translated_name and | |
| not (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"'))) and | |
| not (raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"')))): | |
| if include_description: | |
| csv_lines.append(f"term,{raw_name},{translated_name},,") | |
| elif include_gender_context: | |
| csv_lines.append(f"term,{raw_name},{translated_name},") | |
| else: | |
| csv_lines.append(f"term,{raw_name},{translated_name}") | |
| # print(f"📑 Fast mode: Accepted {len(csv_lines) - 1} entries without validation") | |
| return csv_lines | |
| # For "only_with_honorifics" mode, ALWAYS skip frequency check | |
| if filter_mode == "only_with_honorifics": | |
| skip_frequency_check = True | |
| print("📑 Filter mode 'only_with_honorifics': Bypassing frequency checks") | |
| print(f'📑 Processing {len(lines)} lines from AI response...') | |
| # print(f'📑 Text corpus size: {len(all_text):,} chars') | |
| # print(f'📑 Frequency checking: DISABLED (post-response min_frequency bypassed)') | |
| # print(f'📑 Fuzzy threshold: {fuzzy_threshold}') | |
| # Collect all terms first for batch processing | |
| all_terms_to_check = [] | |
| term_info_map = {} # Map term to its full info | |
| if not skip_frequency_check: | |
| # First pass: collect all terms that need frequency checking | |
| for line in lines: | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue # Skip header | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| # Strip orphaned quotes and filter empty columns | |
| parts = [p.strip('"').strip("'").strip() for p in parts] | |
| parts = [p for p in parts if p] # Remove empty strings | |
| if len(parts) >= 3: | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| elif len(parts) == 2: | |
| entry_type = 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| gender = '' | |
| description = '' | |
| else: | |
| continue | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if not raw_name or not translated_name: | |
| continue | |
| if (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"')) or | |
| raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"'))): | |
| continue | |
| if raw_name and translated_name: | |
| # Store for batch processing | |
| original_raw = raw_name | |
| if strip_honorifics: | |
| raw_name = _strip_honorific(raw_name, language) | |
| all_terms_to_check.append(raw_name) | |
| term_info_map[raw_name] = { | |
| 'entry_type': entry_type, | |
| 'original_raw': original_raw, | |
| 'translated_name': translated_name, | |
| 'gender': gender, | |
| 'description': description, | |
| 'line': line | |
| } | |
| # Batch compute all frequencies at once | |
| if all_terms_to_check: | |
| print(f"📑 Computing frequencies for {len(all_terms_to_check)} terms...") | |
| term_frequencies = _batch_compute_frequencies( | |
| all_terms_to_check, all_text, fuzzy_threshold, min_frequency | |
| ) | |
| else: | |
| term_frequencies = {} | |
| # Now process the results using pre-computed frequencies | |
| entries_processed = 0 | |
| entries_accepted = 0 | |
| # Process based on mode | |
| if filter_mode == "only_with_honorifics" or skip_frequency_check: | |
| # For these modes, accept all entries | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") # Header with description | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") # Header with gender | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") # Header | |
| for line in lines: | |
| if 'type' in line.lower() and 'raw_name' in line.lower(): | |
| continue # Skip header | |
| parts = [p.strip() for p in line.split(',')] | |
| # Replace invalid 'stop' values with empty string | |
| parts = ['' if p == "'stop'" or p == "stop" else p for p in parts] | |
| # Strip orphaned quotes and filter empty columns | |
| parts = [p.strip('"').strip("'").strip() for p in parts] | |
| parts = [p for p in parts if p] # Remove empty strings | |
| if len(parts) >= 3: | |
| entry_type = parts[0].lower() | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| gender = parts[3] if len(parts) > 3 else '' | |
| description = parts[4] if len(parts) > 4 else '' | |
| elif len(parts) == 2: | |
| entry_type = 'term' | |
| raw_name = parts[0] | |
| translated_name = parts[1] | |
| gender = '' | |
| description = '' | |
| else: | |
| continue | |
| # Validate - reject malformed entries that look like tuples/lists or quoted strings | |
| if not raw_name or not translated_name: | |
| continue | |
| if (raw_name.startswith(('[', '(', "'", '"')) or translated_name.startswith(('[', '(', "'", '"')) or | |
| raw_name.endswith(("'", '"')) or translated_name.endswith(("'", '"'))): | |
| continue | |
| if raw_name and translated_name: | |
| if include_description: | |
| csv_line = f"{entry_type},{raw_name},{translated_name},{gender},{description}" | |
| elif include_gender_context: | |
| csv_line = f"{entry_type},{raw_name},{translated_name},{gender}" | |
| else: | |
| csv_line = f"{entry_type},{raw_name},{translated_name}" | |
| csv_lines.append(csv_line) | |
| entries_accepted += 1 | |
| print(f"📑 Accepted {entries_accepted} entries (frequency check disabled)") | |
| else: | |
| # Use pre-computed frequencies | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") # Header with description | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") # Header with gender | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") # Header | |
| for term, info in term_info_map.items(): | |
| count = term_frequencies.get(term, 0) | |
| # Also check original form if it was stripped | |
| if info['original_raw'] != term: | |
| count += term_frequencies.get(info['original_raw'], 0) | |
| if count >= min_frequency: | |
| if include_description: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']},{info['gender']},{info['description']}" | |
| elif include_gender_context: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']},{info['gender']}" | |
| else: | |
| csv_line = f"{info['entry_type']},{term},{info['translated_name']}" | |
| csv_lines.append(csv_line) | |
| entries_accepted += 1 | |
| # Log first few examples | |
| if entries_accepted <= 5: | |
| print(f"📑 ✓ Example: {term} -> {info['translated_name']} (freq: {count})") | |
| print(f"📑 Frequency filtering complete: {entries_accepted}/{len(term_info_map)} terms accepted") | |
| # Ensure we have at least the header | |
| if len(csv_lines) == 0: | |
| if include_description: | |
| csv_lines.append("type,raw_name,translated_name,gender,description") | |
| elif include_gender_context: | |
| csv_lines.append("type,raw_name,translated_name,gender") | |
| else: | |
| csv_lines.append("type,raw_name,translated_name") | |
| # Print final summary | |
| print(f"📑 Processing complete: {entries_accepted} terms accepted") | |
| return csv_lines | |
| def _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold): | |
| """Apply advanced fuzzy matching to remove duplicate entries from the glossary with stop flag checks | |
| Uses a 2-pass approach: | |
| Pass 1: Remove entries with similar raw names (existing logic) | |
| Pass 2: Remove entries with identical translated names (new logic) | |
| """ | |
| from difflib import SequenceMatcher | |
| # Try to import advanced libraries | |
| try: | |
| from rapidfuzz import fuzz as rfuzz | |
| use_rapidfuzz = True | |
| except ImportError: | |
| use_rapidfuzz = False | |
| try: | |
| import jellyfish | |
| use_jellyfish = True | |
| except ImportError: | |
| use_jellyfish = False | |
| algo_info = [] | |
| if use_rapidfuzz: | |
| algo_info.append("RapidFuzz") | |
| if use_jellyfish: | |
| algo_info.append("Jaro-Winkler") | |
| if not algo_info: | |
| algo_info.append("difflib") | |
| # Check if translated name deduplication is enabled | |
| # GLOSSARY_DEDUPE_TRANSLATIONS: "1" = enable Pass 2 (remove entries with identical translations) | |
| # : "0" = disable Pass 2 (only remove entries with similar raw names) | |
| dedupe_translations = os.getenv("GLOSSARY_DEDUPE_TRANSLATIONS", "1") == "1" | |
| print(f"📋 Applying 2-pass fuzzy deduplication (threshold: {fuzzy_threshold})...") | |
| print(f"📋 Pass 1: Raw name deduplication (fuzzy matching)") | |
| if dedupe_translations: | |
| print(f"📋 Pass 2: Translated name deduplication (exact matching)") | |
| else: | |
| print(f"📋 Pass 2: DISABLED (GLOSSARY_DEDUPE_TRANSLATIONS=0)") | |
| print(f"📋 Using algorithms: {', '.join(algo_info)}") | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Deduplication stopped by user") | |
| return csv_lines | |
| header_line = csv_lines[0] # Keep header | |
| entry_lines = csv_lines[1:] # Data lines | |
| original_count = len(entry_lines) | |
| print(f"📑 Starting deduplication with {original_count} entries...") | |
| # PASS 1: Raw name deduplication (existing fuzzy matching logic) | |
| print(f"📑 🔄 PASS 1: Raw name deduplication...") | |
| pass1_results = _deduplicate_pass1_raw_names( | |
| entry_lines, fuzzy_threshold, use_rapidfuzz, use_jellyfish | |
| ) | |
| pass1_count = len(pass1_results) | |
| pass1_removed = original_count - pass1_count | |
| print(f"📑 ✅ PASS 1 complete: {pass1_removed} duplicates removed ({pass1_count} remaining)") | |
| # PASS 2: Translated name deduplication (if enabled) | |
| if dedupe_translations: | |
| print(f"📑 🔄 PASS 2: Translated name deduplication...") | |
| final_results, replaced_count = _deduplicate_pass2_translated_names(pass1_results) | |
| pass2_removed = pass1_count - len(final_results) | |
| replaced_msg = f" ({replaced_count} replaced with more complete entries)" if replaced_count > 0 else "" | |
| print(f"📑 ✅ PASS 2 complete: {pass2_removed} duplicates removed{replaced_msg} ({len(final_results)} remaining)") | |
| total_removed = pass1_removed + pass2_removed | |
| else: | |
| final_results = pass1_results | |
| total_removed = pass1_removed | |
| print(f"📑 ⏭️ PASS 2 skipped (translation deduplication disabled)") | |
| # Rebuild CSV with header | |
| deduplicated = [header_line] + final_results | |
| print(f"📑 ✅ Total deduplication complete: {total_removed} duplicates removed") | |
| print(f"📑 Final glossary size: {len(final_results)} unique entries") | |
| return deduplicated | |
| def _deduplicate_pass1_raw_names(entry_lines, fuzzy_threshold, use_rapidfuzz, use_jellyfish): | |
| """Pass 1: Remove entries with similar raw names using fuzzy matching""" | |
| from difflib import SequenceMatcher | |
| if use_rapidfuzz: | |
| from rapidfuzz import fuzz as rfuzz | |
| if use_jellyfish: | |
| import jellyfish | |
| deduplicated = [] | |
| seen_entries = {} # raw_name -> (entry_type, translated_name) | |
| seen_names_lower = set() # Quick exact match check | |
| removed_count = 0 | |
| total_entries = len(entry_lines) | |
| for idx, line in enumerate(entry_lines): | |
| # Check stop flag every 100 entries | |
| if idx > 0 and idx % 100 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Pass 1 stopped at entry {idx}/{total_entries}") | |
| break | |
| # Show progress for large glossaries | |
| if total_entries > 500 and idx % 200 == 0: | |
| progress = (idx / total_entries) * 100 | |
| print(f"📑 Pass 1 progress: {progress:.1f}% ({idx}/{total_entries})") | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| raw_name_lower = raw_name.lower() | |
| # Fast exact duplicate check first | |
| if raw_name_lower in seen_names_lower: | |
| removed_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| print(f"📋 Pass 1: Removing exact duplicate: '{raw_name}'") | |
| continue | |
| # For fuzzy matching, only check if threshold is less than 1.0 | |
| is_duplicate = False | |
| if fuzzy_threshold < 1.0: | |
| # Use a more efficient approach: only check similar length strings | |
| name_len = len(raw_name) | |
| min_len = int(name_len * 0.7) | |
| max_len = int(name_len * 1.3) | |
| # Only compare with entries of similar length | |
| candidates = [] | |
| for seen_name, (seen_type, seen_trans) in seen_entries.items(): | |
| if min_len <= len(seen_name) <= max_len: | |
| candidates.append(seen_name) | |
| # Check fuzzy similarity with candidates using multiple algorithms | |
| for seen_name in candidates: | |
| # Quick character overlap check before expensive comparison | |
| char_overlap = len(set(raw_name_lower) & set(seen_name.lower())) | |
| if char_overlap < len(raw_name_lower) * 0.5: | |
| continue # Too different, skip | |
| # Try multiple algorithms and take the best score | |
| scores = [] | |
| if use_rapidfuzz: | |
| # RapidFuzz basic ratio | |
| scores.append(rfuzz.ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| # Token sort (handles word order) | |
| try: | |
| scores.append(rfuzz.token_sort_ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| except Exception: | |
| pass | |
| # Partial ratio (substring) | |
| try: | |
| scores.append(rfuzz.partial_ratio(raw_name_lower, seen_name.lower()) / 100.0) | |
| except Exception: | |
| pass | |
| else: | |
| # Fallback to difflib | |
| scores.append(SequenceMatcher(None, raw_name_lower, seen_name.lower()).ratio()) | |
| # Try Jaro-Winkler (better for names) | |
| if use_jellyfish: | |
| try: | |
| jaro = jellyfish.jaro_winkler_similarity(raw_name, seen_name) | |
| scores.append(jaro) | |
| except Exception: | |
| pass | |
| # Take best score | |
| best_similarity = max(scores) if scores else 0.0 | |
| if best_similarity >= fuzzy_threshold: | |
| if removed_count < 10: # Only log first few | |
| print(f"📋 Pass 1: Removing fuzzy duplicate: '{raw_name}' ~= '{seen_name}' (score: {best_similarity:.2%})") | |
| removed_count += 1 | |
| is_duplicate = True | |
| break | |
| if not is_duplicate: | |
| seen_entries[raw_name] = (entry_type, translated_name) | |
| seen_names_lower.add(raw_name_lower) | |
| deduplicated.append(line) | |
| return deduplicated | |
| def _deduplicate_pass2_translated_names(entry_lines): | |
| """Pass 2: Remove entries with identical translated names""" | |
| deduplicated = [] | |
| seen_translations = {} # translated_name.lower() -> (raw_name, line) | |
| removed_count = 0 | |
| replaced_count = 0 | |
| for line in entry_lines: | |
| if not line.strip(): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0] | |
| raw_name = parts[1] | |
| translated_name = parts[2] | |
| translated_lower = translated_name.lower().strip() | |
| # Skip empty translations | |
| if not translated_lower: | |
| deduplicated.append(line) | |
| continue | |
| # Check if we've seen this translation before | |
| if translated_lower in seen_translations: | |
| existing_raw, existing_line = seen_translations[translated_lower] | |
| # Get the existing translated name from the line | |
| existing_parts = existing_line.split(',') | |
| existing_translated = existing_parts[2] if len(existing_parts) >= 3 else translated_name | |
| # Count fields in both entries (more fields = higher priority) | |
| current_field_count = len([f.strip() for f in parts if f.strip()]) | |
| existing_field_count = len([f.strip() for f in existing_parts if f.strip()]) | |
| # If current entry has more fields, replace the existing one | |
| if current_field_count > existing_field_count: | |
| # Remove existing entry from deduplicated list | |
| deduplicated = [l for l in deduplicated if l != existing_line] | |
| # Replace with current entry | |
| seen_translations[translated_lower] = (raw_name, line) | |
| deduplicated.append(line) | |
| removed_count += 1 | |
| replaced_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| print(f"📋 Pass 2: Replacing '{existing_raw}' -> '{existing_translated}' ({existing_field_count} fields) with '{raw_name}' -> '{translated_name}' ({current_field_count} fields) - more detailed entry") | |
| else: | |
| # Keep existing entry (has same or more fields) | |
| removed_count += 1 | |
| if removed_count <= 10: # Only log first few | |
| extra_info = f" ({current_field_count} vs {existing_field_count} fields)" if current_field_count != existing_field_count else "" | |
| print(f"📋 Pass 2: Removing '{raw_name}' -> '{translated_name}' (duplicate translation of '{existing_raw}' -> '{existing_translated}'){extra_info}") | |
| else: | |
| # New translation, keep it | |
| seen_translations[translated_lower] = (raw_name, line) | |
| deduplicated.append(line) | |
| return deduplicated, replaced_count | |
| def _merge_csv_entries(new_csv_lines, existing_glossary, strip_honorifics, language): | |
| """Merge CSV entries with existing glossary with stop flag checks""" | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Glossary merge stopped by user") | |
| return new_csv_lines | |
| # Parse existing glossary | |
| existing_lines = [] | |
| existing_names = set() | |
| if isinstance(existing_glossary, str): | |
| # Already CSV format | |
| lines = existing_glossary.strip().split('\n') | |
| total_lines = len(lines) | |
| for idx, line in enumerate(lines): | |
| # Check stop flag every 50 lines | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while processing existing glossary at line {idx}/{total_lines}") | |
| return new_csv_lines | |
| if total_lines > 200: | |
| progress = (idx / total_lines) * 100 | |
| print(f"📑 Processing existing glossary: {progress:.1f}%") | |
| if 'type,raw_name' in line.lower(): | |
| continue # Skip header | |
| line_stripped = line.strip() | |
| # Skip token-efficient lines and section/bullet markers | |
| if not line_stripped or line_stripped.startswith('===') or line_stripped.startswith('*') or line_stripped.lower().startswith('glossary:'): | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| # Require at least 3 fields (type, raw_name, translated_name) | |
| if len(parts) < 3: | |
| continue | |
| entry_type = parts[0].strip().lower() | |
| # Only accept reasonable type tokens (letters/underscores only); re is already imported at module level | |
| if not re.match(r'^[a-z_]+$', entry_type): | |
| continue | |
| raw_name = parts[1] | |
| if strip_honorifics: | |
| raw_name = _strip_honorific(raw_name, language) | |
| parts[1] = raw_name | |
| if raw_name not in existing_names: | |
| existing_lines.append(','.join(parts)) | |
| existing_names.add(raw_name) | |
| # Check stop flag before processing new names | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped before processing new entries") | |
| return new_csv_lines | |
| # Get new names | |
| new_names = set() | |
| final_lines = [] | |
| for idx, line in enumerate(new_csv_lines): | |
| # Check stop flag every 50 lines | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while processing new entries at line {idx}") | |
| return final_lines if final_lines else new_csv_lines | |
| if 'type,raw_name' in line.lower(): | |
| final_lines.append(line) # Keep header | |
| continue | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 2: | |
| new_names.add(parts[1]) | |
| final_lines.append(line) | |
| # Check stop flag before adding existing entries | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped before combining entries") | |
| return final_lines | |
| # Add non-duplicate existing entries | |
| added_count = 0 | |
| for idx, line in enumerate(existing_lines): | |
| # Check stop flag every 50 additions | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Merge stopped while adding existing entries ({added_count} added)") | |
| return final_lines | |
| parts = [p.strip() for p in line.split(',')] | |
| if len(parts) >= 2 and parts[1] not in new_names: | |
| final_lines.append(line) | |
| added_count += 1 | |
| print(f"📑 Merged {added_count} entries from existing glossary") | |
| return final_lines | |
| def _extract_with_patterns(all_text, language, min_frequency, | |
| max_names, max_titles, batch_size, | |
| existing_glossary, output_dir, | |
| strip_honorifics=True, fuzzy_threshold=0.90, filter_mode='all'): | |
| """Extract glossary using pattern matching with true CSV format output and stop flag checks""" | |
| print("📑 Using pattern-based extraction") | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print("📑 ❌ Pattern-based extraction stopped by user") | |
| return {} | |
| def is_valid_name(name, language_hint='unknown'): | |
| """Strict validation for proper names only""" | |
| if not name or len(name.strip()) < 1: | |
| return False | |
| name = name.strip() | |
| if name.lower() in PM.COMMON_WORDS or name in PM.COMMON_WORDS: | |
| return False | |
| if language_hint == 'korean': | |
| if not (2 <= len(name) <= 4): | |
| return False | |
| if not all(0xAC00 <= ord(char) <= 0xD7AF for char in name): | |
| return False | |
| if len(set(name)) == 1: | |
| return False | |
| elif language_hint == 'japanese': | |
| if not (2 <= len(name) <= 6): | |
| return False | |
| has_kanji = any(0x4E00 <= ord(char) <= 0x9FFF for char in name) | |
| has_kana = any((0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF) for char in name) | |
| if not (has_kanji or has_kana): | |
| return False | |
| elif language_hint == 'chinese': | |
| if not (2 <= len(name) <= 4): | |
| return False | |
| if not all(0x4E00 <= ord(char) <= 0x9FFF for char in name): | |
| return False | |
| elif language_hint == 'english': | |
| if not name[0].isupper(): | |
| return False | |
| if sum(1 for c in name if c.isalpha()) < len(name) * 0.8: | |
| return False | |
| if not (2 <= len(name) <= 20): | |
| return False | |
| return True | |
| def detect_language_hint(text_sample): | |
| """Quick language detection for validation purposes""" | |
| sample = text_sample[:1000] | |
| korean_chars = sum(1 for char in sample if 0xAC00 <= ord(char) <= 0xD7AF) | |
| japanese_kana = sum(1 for char in sample if (0x3040 <= ord(char) <= 0x309F) or (0x30A0 <= ord(char) <= 0x30FF)) | |
| chinese_chars = sum(1 for char in sample if 0x4E00 <= ord(char) <= 0x9FFF) | |
| latin_chars = sum(1 for char in sample if ('A' <= char <= 'Z') or ('a' <= char <= 'z')) | |
| if korean_chars > 50: | |
| return 'korean' | |
| elif japanese_kana > 20: | |
| return 'japanese' | |
| elif chinese_chars > 50 and japanese_kana < 10: | |
| return 'chinese' | |
| elif latin_chars > 100: | |
| return 'english' | |
| else: | |
| return 'unknown' | |
| language_hint = detect_language_hint(all_text) | |
| print(f"📑 Detected primary language: {language_hint}") | |
| # Check stop flag after language detection | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped after language detection") | |
| return {} | |
| honorifics_to_use = [] | |
| if language_hint in PM.CJK_HONORIFICS: | |
| honorifics_to_use.extend(PM.CJK_HONORIFICS[language_hint]) | |
| honorifics_to_use.extend(PM.CJK_HONORIFICS.get('english', [])) | |
| print(f"📑 Using {len(honorifics_to_use)} honorifics for {language_hint}") | |
| names_with_honorifics = {} | |
| standalone_names = {} | |
| # Check if parallel processing is enabled | |
| extraction_workers = int(os.getenv("EXTRACTION_WORKERS", "1")) | |
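| # The parallel branch below runs only when EXTRACTION_WORKERS > 1 and more than 3 honorifics are in use; | |
| # otherwise honorifics are scanned sequentially. | |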
| # PARALLEL HONORIFIC PROCESSING | |
| if extraction_workers > 1 and len(honorifics_to_use) > 3: | |
| print(f"📑 Scanning for names with honorifics (parallel with {extraction_workers} workers)...") | |
| # Create a wrapper function that can be called in parallel | |
| def process_honorific(args): | |
| """Process a single honorific in a worker thread""" | |
| honorific, idx, total = args | |
| # Check stop flag | |
| if is_stop_requested(): | |
| return None, None | |
| print(f"📑 Worker processing honorific {idx}/{total}: '{honorific}'") | |
| # Local dictionaries for this worker | |
| local_names_with = {} | |
| local_standalone = {} | |
| # Call the extraction method | |
| _extract_names_for_honorific( | |
| honorific, all_text, language_hint, | |
| min_frequency, local_names_with, | |
| local_standalone, is_valid_name, fuzzy_threshold | |
| ) | |
| return local_names_with, local_standalone | |
| # Prepare arguments for parallel processing | |
| honorific_args = [ | |
| (honorific, idx + 1, len(honorifics_to_use)) | |
| for idx, honorific in enumerate(honorifics_to_use) | |
| ] | |
| # Process honorifics in parallel | |
| with ThreadPoolExecutor(max_workers=min(extraction_workers, len(honorifics_to_use))) as executor: | |
| futures = [] | |
| for args in honorific_args: | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return {} | |
| future = executor.submit(process_honorific, args) | |
| futures.append(future) | |
| # Collect results as they complete | |
| completed = 0 | |
| for future in as_completed(futures): | |
| if is_stop_requested(): | |
| executor.shutdown(wait=False) | |
| return {} | |
| try: | |
| result = future.result() | |
| if result and result[0] is not None: | |
| local_names_with, local_standalone = result | |
| # Merge results (thread-safe since we're in main thread) | |
| for name, count in local_names_with.items(): | |
| if name not in names_with_honorifics: | |
| names_with_honorifics[name] = count | |
| else: | |
| names_with_honorifics[name] = max(names_with_honorifics[name], count) | |
| for name, count in local_standalone.items(): | |
| if name not in standalone_names: | |
| standalone_names[name] = count | |
| else: | |
| standalone_names[name] = max(standalone_names[name], count) | |
| completed += 1 | |
| if completed % 5 == 0 or completed == len(honorifics_to_use): | |
| print(f"📑 Honorific processing: {completed}/{len(honorifics_to_use)} completed") | |
| except Exception as e: | |
| print(f"⚠️ Failed to process honorific: {e}") | |
| completed += 1 | |
| print(f"📑 Parallel honorific processing completed: found {len(names_with_honorifics)} names") | |
| else: | |
| # SEQUENTIAL PROCESSING (fallback) | |
| print("📑 Scanning for names with honorifics...") | |
| # Extract names with honorifics | |
| total_honorifics = len(honorifics_to_use) | |
| for idx, honorific in enumerate(honorifics_to_use): | |
| # Check stop flag before each honorific | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Extraction stopped at honorific {idx}/{total_honorifics}") | |
| return {} | |
| print(f"📑 Processing honorific {idx + 1}/{total_honorifics}: '{honorific}'") | |
| _extract_names_for_honorific(honorific, all_text, language_hint, | |
| min_frequency, names_with_honorifics, | |
| standalone_names, is_valid_name, fuzzy_threshold) | |
| # Check stop flag before processing terms | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before processing terms") | |
| return {} | |
| # Apply filter mode | |
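| # filter_mode is one of 'only_with_honorifics', 'only_without_honorifics', or 'all' | |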
| filtered_names = {} | |
| if filter_mode == 'only_with_honorifics': | |
| # Only keep names that have honorifics (no standalone names) | |
| filtered_names = names_with_honorifics.copy() | |
| print(f"📑 Filter: Keeping only names with honorifics ({len(filtered_names)} names)") | |
| elif filter_mode == 'only_without_honorifics': | |
| # Keep standalone names that were NOT found with honorifics | |
| for name, count in standalone_names.items(): | |
| # Check if this name also appears with honorifics | |
| appears_with_honorific = False | |
| for honorific_name in names_with_honorifics.keys(): | |
| if _strip_honorific(honorific_name, language_hint) == name: | |
| appears_with_honorific = True | |
| break | |
| # Only add if it doesn't appear with honorifics | |
| if not appears_with_honorific: | |
| filtered_names[name] = count | |
| print(f"📑 Filter: Keeping only names without honorifics ({len(filtered_names)} names)") | |
| else: # 'all' mode | |
| # Keep all names (both with and without honorifics) | |
| filtered_names = names_with_honorifics.copy() | |
| # Also add standalone names | |
| for name, count in standalone_names.items(): | |
| if name not in filtered_names and not any( | |
| _strip_honorific(n, language_hint) == name for n in filtered_names.keys() | |
| ): | |
| filtered_names[name] = count | |
| print(f"📑 Filter: Keeping all names ({len(filtered_names)} names)") | |
| # Process extracted terms | |
| final_terms = {} | |
| term_count = 0 | |
| total_terms = len(filtered_names) | |
| for term, count in filtered_names.items(): | |
| term_count += 1 | |
| # Check stop flag every 20 terms | |
| if term_count % 20 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Term processing stopped at {term_count}/{total_terms}") | |
| return {} | |
| if strip_honorifics: | |
| clean_term = _strip_honorific(term, language_hint) | |
| if clean_term in final_terms: | |
| final_terms[clean_term] = final_terms[clean_term] + count | |
| else: | |
| final_terms[clean_term] = count | |
| else: | |
| final_terms[term] = count | |
| # Check stop flag before finding titles | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before finding titles") | |
| return {} | |
| # Find titles (but respect filter mode) | |
| print("📑 Scanning for titles...") | |
| found_titles = {} | |
| # Extract titles for all modes EXCEPT "only_with_honorifics" | |
| # (titles are included in "only_without_honorifics" since titles typically don't have honorifics) | |
| if filter_mode != 'only_with_honorifics': | |
| title_patterns_to_use = [] | |
| if language_hint in PM.TITLE_PATTERNS: | |
| title_patterns_to_use.extend(PM.TITLE_PATTERNS[language_hint]) | |
| english_title_patterns = PM.TITLE_PATTERNS.get('english', []) | |
| title_patterns_to_use.extend(english_title_patterns) | |
| total_patterns = len(title_patterns_to_use) | |
| for pattern_idx, pattern in enumerate(title_patterns_to_use): | |
| # Check stop flag before each pattern | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped at pattern {pattern_idx}/{total_patterns}") | |
| return {} | |
| print(f"📑 Processing title pattern {pattern_idx + 1}/{total_patterns}") | |
| # Apply case-insensitive matching only to the English title patterns | |
| flags = re.IGNORECASE if pattern in english_title_patterns else 0 | |
| matches = list(re.finditer(pattern, all_text, flags)) | |
| for match_idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if match_idx > 0 and match_idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped at match {match_idx}") | |
| return {} | |
| title = match.group(0) | |
| # Skip if this title is already in names | |
| if title in filtered_names or title in names_with_honorifics: | |
| continue | |
| count = _find_fuzzy_matches(title, all_text, fuzzy_threshold) | |
| # Check if stopped during fuzzy matching | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Title extraction stopped during fuzzy matching") | |
| return {} | |
| if count >= min_frequency: | |
| if re.match(r'[A-Za-z]', title): | |
| title = title.title() | |
| if strip_honorifics: | |
| title = _strip_honorific(title, language_hint) | |
| if title not in found_titles: | |
| found_titles[title] = count | |
| if filter_mode == 'only_without_honorifics': | |
| print(f"📑 Found {len(found_titles)} titles (included in 'without honorifics' mode)") | |
| else: | |
| print(f"📑 Found {len(found_titles)} unique titles") | |
| else: | |
| print(f"📑 Skipping title extraction (filter mode: only_with_honorifics)") | |
| # Check stop flag before sorting and translation | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before sorting terms") | |
| return {} | |
| # Combine and sort | |
| sorted_names = sorted(final_terms.items(), key=lambda x: x[1], reverse=True) | |
| sorted_titles = sorted(found_titles.items(), key=lambda x: x[1], reverse=True) | |
| all_terms = [] | |
| for name, count in sorted_names: | |
| all_terms.append(name) | |
| for title, count in sorted_titles: | |
| all_terms.append(title) | |
| print(f"📑 Total terms to translate: {len(all_terms)}") | |
| # Check stop flag before translation | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before translation") | |
| return {} | |
| # Translate terms | |
| if os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": | |
| print("📑 Translation disabled - keeping original terms") | |
| translations = {term: term for term in all_terms} | |
| else: | |
| print(f"📑 Translating {len(all_terms)} terms...") | |
| translations = _translate_terms_batch(all_terms, language_hint, batch_size, output_dir) | |
| # Check if translation was stopped | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped after translation") | |
| return translations # Return partial results | |
| # Build CSV lines | |
| csv_lines = ["type,raw_name,translated_name"] | |
| for name, _ in sorted_names: | |
| if name in translations: | |
| csv_lines.append(f"character,{name},{translations[name]}") | |
| for title, _ in sorted_titles: | |
| if title in translations: | |
| csv_lines.append(f"term,{title},{translations[title]}") | |
| # Check stop flag before merging | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before merging with existing glossary") | |
| # Still save what we have | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.json") | |
| _atomic_write_file(glossary_path, csv_content) | |
| return _parse_csv_to_dict(csv_content) | |
| # Merge with existing glossary | |
| if existing_glossary: | |
| csv_lines = _merge_csv_entries(csv_lines, existing_glossary, strip_honorifics, language_hint) | |
| # Check stop flag before deduplication | |
| if is_stop_requested(): | |
| print("📑 ❌ Extraction stopped before deduplication") | |
| csv_content = '\n'.join(csv_lines) | |
| glossary_path = os.path.join(output_dir, "glossary.json") | |
| _atomic_write_file(glossary_path, csv_content) | |
| return _parse_csv_to_dict(csv_content) | |
| # Fuzzy matching deduplication | |
| csv_lines = _deduplicate_glossary_with_fuzzy(csv_lines, fuzzy_threshold) | |
| # Create CSV content | |
| csv_content = '\n'.join(csv_lines) | |
| # Save glossary as CSV | |
| glossary_path = os.path.join(output_dir, "glossary.csv") | |
| _atomic_write_file(glossary_path, csv_content) | |
| print(f"\n📑 ✅ TARGETED GLOSSARY SAVED!") | |
| print(f"📑 File: {glossary_path}") | |
| print(f"📑 Total entries: {len(csv_lines) - 1}") # Exclude header | |
| return _parse_csv_to_dict(csv_content) | |
| def _translate_terms_batch(term_list, profile_name, batch_size=50, output_dir=None, log_callback=None): | |
| """Use fully configurable prompts for translation with interrupt support""" | |
| # Redirect stdout to GUI log if callback provided | |
| if log_callback: | |
| set_output_redirect(log_callback) | |
| if not term_list or os.getenv("DISABLE_GLOSSARY_TRANSLATION", "0") == "1": | |
| print(f"📑 Glossary translation disabled or no terms to translate") | |
| return {term: term for term in term_list} | |
| # Check stop flag | |
| if is_stop_requested(): | |
| print("📑 ❌ Glossary translation stopped by user") | |
| return {term: term for term in term_list} | |
| try: | |
| MODEL = os.getenv("MODEL", "gemini-1.5-flash") | |
| API_KEY = (os.getenv("API_KEY") or | |
| os.getenv("OPENAI_API_KEY") or | |
| os.getenv("OPENAI_OR_Gemini_API_KEY") or | |
| os.getenv("GEMINI_API_KEY")) | |
| if is_traditional_translation_api(MODEL): | |
| # Traditional translation APIs are handled elsewhere; keep the original terms | |
| return {term: term for term in term_list} | |
| if not API_KEY and not _model_uses_own_auth(MODEL): | |
| print(f"📑 No API key found, skipping translation") | |
| return {term: term for term in term_list} | |
| print(f"📑 Translating {len(term_list)} {profile_name} terms to English using batch size {batch_size}...") | |
| # Ensure multi-key config is available in this process if enabled | |
| _ensure_multi_key_config_loaded() | |
| from unified_api_client import UnifiedClient, UnifiedClientError | |
| client = UnifiedClient(model=MODEL, api_key=API_KEY, output_dir=output_dir) | |
| if hasattr(client, 'reset_cleanup_state'): | |
| client.reset_cleanup_state() | |
| # Get custom translation prompt from environment | |
| translation_prompt_template = os.getenv("GLOSSARY_TRANSLATION_PROMPT", "") | |
| if not translation_prompt_template: | |
| translation_prompt_template = """You are translating {language} character names and important terms to English. | |
| For character names, provide English transliterations or keep as romanized. | |
| Keep honorifics/suffixes only if they are integral to the name. | |
| Respond with the same numbered format. | |
| Terms to translate: | |
| {terms_list} | |
| Provide translations in the same numbered format.""" | |
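| # Placeholders supported in the prompt template: {language}, {terms_list}, {batch_size}; | |
| # they are substituted with str.replace for each batch below. | |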
| all_translations = {} | |
| all_responses = [] # Collect raw responses | |
| # Respect Auto-retry Slow Chunks toggle (RETRY_TIMEOUT env): when off, disable chunk timeouts entirely | |
| retry_env = os.getenv("RETRY_TIMEOUT") | |
| retry_timeout_enabled = retry_env is None or retry_env.strip().lower() not in ("0", "false", "off", "") | |
| if retry_timeout_enabled: | |
| env_ct = os.getenv("CHUNK_TIMEOUT", "1800") | |
| try: | |
| ct_val = float(env_ct) | |
| chunk_timeout = None if ct_val <= 0 else ct_val | |
| except Exception: | |
| chunk_timeout = None | |
| else: | |
| chunk_timeout = None | |
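| # A chunk_timeout of None disables the per-batch timeout; timeout-like errors are still | |
| # retried up to TIMEOUT_RETRY_ATTEMPTS times (see the retry loop below). | |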
| for i in range(0, len(term_list), batch_size): | |
| # Check stop flag before each batch | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Translation stopped at batch {(i // batch_size) + 1}") | |
| # Return partial translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| batch = term_list[i:i + batch_size] | |
| batch_num = (i // batch_size) + 1 | |
| total_batches = (len(term_list) + batch_size - 1) // batch_size | |
| print(f"📑 Processing batch {batch_num}/{total_batches} ({len(batch)} terms)...") | |
| # Format terms list | |
| terms_text = "" | |
| for idx, term in enumerate(batch, 1): | |
| terms_text += f"{idx}. {term}\n" | |
| # Replace placeholders in prompt | |
| prompt = translation_prompt_template.replace('{language}', profile_name) | |
| prompt = prompt.replace('{terms_list}', terms_text.strip()) | |
| prompt = prompt.replace('{batch_size}', str(len(batch))) | |
| messages = [ | |
| {"role": "user", "content": prompt} | |
| ] | |
| try: | |
| # Use glossary-specific temperature with fallback to global | |
| temperature = float(os.getenv("GLOSSARY_TEMPERATURE", os.getenv("TEMPERATURE", "0.3"))) | |
| # Use glossary-specific max output tokens with fallback to global | |
| max_tokens = int(os.getenv("GLOSSARY_MAX_OUTPUT_TOKENS", os.getenv("MAX_OUTPUT_TOKENS", "4096"))) | |
| # Use send_with_interrupt for interruptible API call | |
| print(f"📑 Sending translation request for batch {batch_num} (interruptible)...") | |
| # Timeout retry logic (matches translation behavior) | |
| try: | |
| max_timeout_retries = int(os.getenv("TIMEOUT_RETRY_ATTEMPTS", "2")) | |
| except Exception: | |
| max_timeout_retries = 2 | |
| timeout_retry_count = 0 | |
| while True: | |
| try: | |
| response, finish_reason, raw_obj = send_with_interrupt( | |
| messages=messages, | |
| client=client, | |
| temperature=temperature, | |
| max_tokens=max_tokens, | |
| stop_check_fn=is_stop_requested, | |
| chunk_timeout=chunk_timeout | |
| ) | |
| break | |
| except UnifiedClientError as e: | |
| error_msg = str(e) | |
| lower_msg = error_msg.lower() | |
| if "stopped by user" in lower_msg or is_stop_requested(): | |
| raise | |
| is_timeout = ("timed out" in lower_msg) or ("timeout" in lower_msg) or ("cancelled" in lower_msg) or ("client not initialized" in lower_msg) | |
| if is_timeout and timeout_retry_count < max_timeout_retries: | |
| timeout_retry_count += 1 | |
| if chunk_timeout: | |
| print(f"⚠️ Glossary translation batch {batch_num} timed out after {chunk_timeout} seconds, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| else: | |
| print(f"⚠️ Glossary translation batch {batch_num} timed out, retrying ({timeout_retry_count}/{max_timeout_retries})...") | |
| # Reinitialize client if needed | |
| client_type = getattr(client, 'client_type', 'unknown') | |
| needs_reinit = False | |
| if client_type == 'gemini': | |
| needs_reinit = hasattr(client, 'gemini_client') and client.gemini_client is None | |
| elif client_type == 'openai': | |
| needs_reinit = hasattr(client, 'openai_client') and client.openai_client is None | |
| if needs_reinit: | |
| try: | |
| print(f" 🔄 Reinitializing {client_type} client...") | |
| client._setup_client() | |
| except Exception as reinit_err: | |
| print(f" ⚠️ Failed to reinitialize client: {reinit_err}") | |
| # Stagger retries | |
| try: | |
| import random | |
| base_delay = float(os.getenv("SEND_INTERVAL_SECONDS", "2")) | |
| retry_delay = random.uniform(base_delay / 2, base_delay) | |
| print(f" ⏳ Waiting {retry_delay:.1f}s before retry...") | |
| time.sleep(retry_delay) | |
| except Exception: | |
| time.sleep(1.0) | |
| continue | |
| else: | |
| raise | |
| # Handle response properly | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Store raw response with batch info | |
| all_responses.append((batch, response_text)) | |
| print(f"📑 Batch {batch_num} completed - response received") | |
| # Small delay between batches to avoid rate limiting (configurable) | |
| if i + batch_size < len(term_list): | |
| # Check stop before sleep | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Translation stopped after batch {batch_num}") | |
| # Fill in missing translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| # Use configurable batch delay between requests (GLOSSARY_BATCH_DELAY, default 0.001s) | |
| batch_delay = float(os.getenv("GLOSSARY_BATCH_DELAY", "0.001")) | |
| if batch_delay > 0: | |
| time.sleep(batch_delay) | |
| except UnifiedClientError as e: | |
| if "stopped by user" in str(e).lower(): | |
| print(f"📑 ❌ Translation interrupted by user at batch {batch_num}") | |
| # Fill in remaining terms with originals | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| return all_translations | |
| else: | |
| print(f"⚠️ Translation failed for batch {batch_num}: {e}") | |
| for term in batch: | |
| all_translations[term] = term | |
| except Exception as e: | |
| print(f"⚠️ Translation failed for batch {batch_num}: {e}") | |
| for term in batch: | |
| all_translations[term] = term | |
| # Parse all responses at the end | |
| print(f"📑 Parsing {len(all_responses)} batch responses...") | |
| for batch, response_text in all_responses: | |
| batch_translations = _parse_translation_response(response_text, batch) | |
| all_translations.update(batch_translations) | |
| # Ensure all terms have translations | |
| for term in term_list: | |
| if term not in all_translations: | |
| all_translations[term] = term | |
| translated_count = sum(1 for term, translation in all_translations.items() | |
| if translation != term and translation.strip()) | |
| print(f"📑 Successfully translated {translated_count}/{len(term_list)} terms") | |
| return all_translations | |
| except Exception as e: | |
| print(f"⚠️ Glossary translation failed: {e}") | |
| return {term: term for term in term_list} | |
| def _extract_names_for_honorific(honorific, all_text, language_hint, | |
| min_frequency, names_with_honorifics, | |
| standalone_names, is_valid_name, fuzzy_threshold=0.90): | |
| """Extract names for a specific honorific with fuzzy matching and stop flag checks""" | |
| # Check stop flag at start | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction for '{honorific}' stopped by user") | |
| return | |
| if language_hint == 'korean' and not honorific.startswith('-'): | |
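| # Capture a 2-4 syllable Hangul block; the lookahead requires the honorific to follow, | |
| # then whitespace, punctuation, or end of string, without consuming the honorific itself. | |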
| pattern = r'([\uac00-\ud7af]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,.\!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Korean name extraction stopped at {idx}/{total_matches}") | |
| return | |
| # Show progress for large sets | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Korean names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'korean'): | |
| full_form = potential_name + honorific | |
| # Use fuzzy matching for counting with stop check | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| # Check if stopped during fuzzy matching | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| context_patterns = [ | |
| full_form + r'[은는이가]', | |
| full_form + r'[을를]', | |
| full_form + r'[에게한테]', | |
| r'["]' + full_form, | |
| full_form + r'[,]', | |
| ] | |
| context_count = 0 | |
| for ctx_pattern in context_patterns: | |
| context_count += len(re.findall(ctx_pattern, all_text)) | |
| if context_count > 0: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif language_hint == 'japanese' and not honorific.startswith('-'): | |
| pattern = r'([\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff]{2,5})(?=' + re.escape(honorific) + r'(?:\s|[、。!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Japanese name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Japanese names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'japanese'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif language_hint == 'chinese' and not honorific.startswith('-'): | |
| pattern = r'([\u4e00-\u9fff]{2,4})(?=' + re.escape(honorific) + r'(?:\s|[,。!?]|$))' | |
| matches = list(re.finditer(pattern, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Chinese name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing Chinese names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'chinese'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| elif honorific.startswith('-') or honorific.startswith(' '): | |
| is_space_separated = honorific.startswith(' ') | |
| if is_space_separated: | |
| pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'(?=\s|[,.\!?]|$)' | |
| else: | |
| pattern_english = r'\b([A-Z][a-zA-Z]+)' + re.escape(honorific) + r'\b' | |
| matches = list(re.finditer(pattern_english, all_text)) | |
| total_matches = len(matches) | |
| for idx, match in enumerate(matches): | |
| # Check stop flag every 50 matches | |
| if idx > 0 and idx % 50 == 0: | |
| if is_stop_requested(): | |
| print(f"📑 ❌ English name extraction stopped at {idx}/{total_matches}") | |
| return | |
| if total_matches > 500: | |
| progress = (idx / total_matches) * 100 | |
| print(f"📑 Processing English names: {progress:.1f}% ({idx}/{total_matches})") | |
| potential_name = match.group(1) | |
| if is_valid_name(potential_name, 'english'): | |
| full_form = potential_name + honorific | |
| count = _find_fuzzy_matches(full_form, all_text, fuzzy_threshold) | |
| if is_stop_requested(): | |
| print(f"📑 ❌ Name extraction stopped during fuzzy matching") | |
| return | |
| if count >= min_frequency: | |
| names_with_honorifics[full_form] = count | |
| standalone_names[potential_name] = count | |
| def _parse_translation_response(response, original_terms): | |
| """Extract translations from AI response by matching numbered lines to original terms""" | |
| translations = {} | |
| # Handle UnifiedResponse object | |
| if hasattr(response, 'content'): | |
| response_text = response.content | |
| else: | |
| response_text = str(response) | |
| # Split into lines | |
| lines = response_text.strip().split('\n') | |
| for line in lines: | |
| line = line.strip() | |
| if not line: | |
| continue | |
| # Match numbered format: "1. Translation" or "1) Translation" etc | |
| number_match = re.match(r'^(\d+)[\.):\-\s]+(.+)', line) | |
| if number_match: | |
| idx = int(number_match.group(1)) - 1 # Convert to 0-based | |
| translation = number_match.group(2).strip() | |
| # Remove trailing explanations in parentheses | |
| translation = re.sub(r'\s*\([^)]+\)\s*$', '', translation) | |
| if 0 <= idx < len(original_terms): | |
| translations[original_terms[idx]] = translation | |
| print(f"📑 Extracted {len(translations)}/{len(original_terms)} translations") | |
| return translations | |
| def _init_worker_with_env(env_vars_dict): | |
| """Initialize worker process with environment variables from parent. | |
| MUST be at module level for pickling by multiprocessing.Pool. | |
| """ | |
| import os | |
| for k, v in env_vars_dict.items(): | |
| os.environ[k] = str(v) | |
| def _check_sentence_batch_for_terms(args): | |
| """Check a batch of sentences for term matches - used by ProcessPoolExecutor""" | |
| batch_sentences, terms = args | |
| filtered = [] | |
| # Use pre-compiled term list for fast checking | |
| for sentence in batch_sentences: | |
| # Quick check using any() - stops at first match | |
| if any(term in sentence for term in terms): | |
| filtered.append(sentence) | |
| return filtered | |
| def _score_sentence_batch(args): | |
| """Worker function to score a batch of sentences - Optimized for speed""" | |
| (start_idx, sentences), term_list, honorific_pattern_str, gender_pronouns, include_gender_context = args | |
| import re | |
| local_scores = {} | |
| local_term_map = {} | |
| # Pre-compile regex if needed | |
| honorific_pattern = re.compile(honorific_pattern_str) if honorific_pattern_str else None | |
| # OPTIMIZATION 1: Segregate terms for hybrid strategy | |
| # - Single-token terms: Use O(1) set intersection (FAST) | |
| # - Multi-token terms: Use iteration (SLOWER, but few terms) | |
| # This preserves quality for terms with spaces while keeping speed for CJK/single names | |
| # Simple tokenizer for classification (matches CJK chars or alphanumeric sequences) | |
| tokenizer_pattern = re.compile(r'[\u4e00-\u9fff\u3040-\u309f\u30a0-\u30ff\uac00-\ud7af]+|[a-zA-Z0-9]+') | |
| single_token_terms = set() | |
| multi_token_terms = [] | |
| for t in term_list: | |
| if len(t) < 2: | |
| continue | |
| # Check if term splits into multiple tokens | |
| tokens = tokenizer_pattern.findall(t) | |
| if len(tokens) > 1: | |
| multi_token_terms.append(t) | |
| else: | |
| single_token_terms.add(t) | |
| # Pre-compile multi-token terms regex if there are any (faster than loop) | |
| multi_term_regex = None | |
| if multi_token_terms: | |
| # Sort by length desc to match longest first | |
| multi_token_terms.sort(key=len, reverse=True) | |
| # Escape terms | |
| pattern = '|'.join(map(re.escape, multi_token_terms)) | |
| try: | |
| multi_term_regex = re.compile(pattern) | |
| except Exception: | |
| # Fallback if pattern is too huge (unlikely for just multi-word subset) | |
| pass | |
| for idx, sentence in enumerate(sentences): | |
| global_idx = start_idx + idx | |
| score = 1.0 | |
| # Gender pronoun check (fast) | |
| if include_gender_context and gender_pronouns: | |
| for p in gender_pronouns: | |
| if p in sentence: | |
| score += 5.0 | |
| break | |
| # Honorific check (fast regex) | |
| if honorific_pattern and honorific_pattern.search(sentence): | |
| score += 2.0 | |
| local_scores[global_idx] = score | |
| # 1. Fast Path: Single-token terms (Set Intersection) | |
| tokens = set(tokenizer_pattern.findall(sentence)) | |
| found_terms = tokens.intersection(single_token_terms) | |
| for term in found_terms: | |
| if term not in local_term_map: | |
| local_term_map[term] = [] | |
| local_term_map[term].append(global_idx) | |
| # 2. Slow Path: Multi-token terms (Regex or Iteration) | |
| # Only needed if we actually have multi-word terms | |
| if multi_token_terms: | |
| if multi_term_regex: | |
| # Fast regex batch match | |
| for match in multi_term_regex.findall(sentence): | |
| if match not in local_term_map: | |
| local_term_map[match] = [] | |
| # Avoid duplicates if regex matches same term multiple times | |
| if global_idx not in local_term_map[match]: | |
| local_term_map[match].append(global_idx) | |
| else: | |
| # Fallback iteration | |
| for term in multi_token_terms: | |
| if term in sentence: | |
| if term not in local_term_map: | |
| local_term_map[term] = [] | |
| local_term_map[term].append(global_idx) | |
| return local_scores, local_term_map | |
| def _process_sentence_batch_for_extraction(args): | |
| """Process sentences to extract terms - used by ProcessPoolExecutor""" | |
| batch_sentences, batch_idx, combined_pattern, exclude_check_data = args | |
| from collections import Counter | |
| import re | |
| local_word_freq = Counter() | |
| local_important = [] | |
| local_seen = set() | |
| # Rebuild the exclusion check function from data | |
| honorifics_to_exclude, title_patterns_str, common_words, chinese_nums = exclude_check_data | |
| title_patterns = [re.compile(p) for p in title_patterns_str] | |
| def should_exclude_term(term): | |
| term_lower = term.lower() | |
| # Check if it's a common word | |
| if term in common_words or term_lower in common_words: | |
| return True | |
| # Check if it contains honorifics | |
| for honorific in honorifics_to_exclude: | |
| if honorific in term or (honorific.startswith('-') and term.endswith(honorific[1:])): | |
| return True | |
| # Check if it matches title patterns | |
| for pattern in title_patterns: | |
| if pattern.search(term): | |
| return True | |
| # Check if it's a number | |
| if term in chinese_nums or term.isdigit(): | |
| return True | |
| return False | |
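| # Sentences shorter than 10 or longer than 500 characters are skipped; a sentence is kept | |
| # only if it contains at least one candidate term that passes should_exclude_term, and | |
| # duplicate term combinations are collapsed via sentence_key. | |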
| for sentence in batch_sentences: | |
| sentence = sentence.strip() | |
| if len(sentence) < 10 or len(sentence) > 500: | |
| continue | |
| # Find all potential terms in this sentence | |
| matches = re.findall(combined_pattern, sentence) | |
| if matches: | |
| # Filter out excluded terms | |
| filtered_matches = [] | |
| for match in matches: | |
| if not should_exclude_term(match): | |
| local_word_freq[match] += 1 | |
| filtered_matches.append(match) | |
| # Keep sentences with valid potential terms | |
| if filtered_matches: | |
| sentence_key = ' '.join(sorted(filtered_matches)) | |
| if sentence_key not in local_seen: | |
| local_important.append(sentence) | |
| local_seen.add(sentence_key) | |
| return local_word_freq, local_important, local_seen, batch_idx | |