# Chapter_Extractor.py - Module-level chapter extraction functions import os import re import sys import json import threading import time import shutil import hashlib import warnings # Lazy import for PatternManager to speed up ProcessPoolExecutor worker startup on Windows # The heavy TransateKRtoEN import is deferred until actually needed _PatternManager = None _PM = None def _get_pattern_manager(): """Lazy initialization of PatternManager to avoid slow imports in worker processes""" global _PatternManager, _PM if _PatternManager is None: from TransateKRtoEN import PatternManager as PM_Class _PatternManager = PM_Class _PM = PM_Class() return _PM # For backward compatibility - property-like access class _LazyPM: def __getattr__(self, name): return getattr(_get_pattern_manager(), name) PM = _LazyPM() from bs4 import BeautifulSoup try: from bs4 import XMLParsedAsHTMLWarning warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning) except ImportError: pass from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor, as_completed from collections import Counter # Stop request function (can be overridden) def is_stop_requested(): """Check if stop has been requested - default implementation""" return False # Progress bar for terminal output class ProgressBar: """Simple in-place progress bar for terminal output""" _last_line_length = 0 @classmethod def update(cls, current, total, prefix="Progress", bar_length=30): if total == 0: return percent = min(100, int(100 * current / total)) filled = int(bar_length * current / total) bar = 'β' * filled + 'β' * (bar_length - filled) line = f"\r{prefix}: [{bar}] {current}/{total} ({percent}%)" if len(line) < cls._last_line_length: line += ' ' * (cls._last_line_length - len(line)) cls._last_line_length = len(line) print(line, end='', flush=True) @classmethod def finish(cls): print() cls._last_line_length = 0 # Helper for resource filename sanitization def sanitize_resource_filename(filename): """Sanitize resource filenames to be filesystem-safe""" import unicodedata # Normalize unicode - use NFC to preserve Korean/CJK characters # NFKD decomposes Korean Hangul into jamo components, corrupting them filename = unicodedata.normalize('NFC', filename) # Remove or replace problematic characters filename = re.sub(r'[<>:"/\\|?*]', '_', filename) return filename def _get_best_parser(): """Determine the best parser available, preferring lxml for CJK text""" try: import lxml return 'lxml' except ImportError: return 'html.parser' def _sort_by_opf_spine(chapters, opf_path): """Sort chapters according to OPF spine order""" try: import xml.etree.ElementTree as ET # Read OPF file with open(opf_path, 'r', encoding='utf-8') as f: opf_content = f.read() # Parse OPF root = ET.fromstring(opf_content) # Find namespaces ns = {'opf': 'http://www.idpf.org/2007/opf'} if root.tag.startswith('{'): default_ns = root.tag[1:root.tag.index('}')] ns = {'opf': default_ns} # Build manifest map (id -> href) manifest = {} for item in root.findall('.//opf:manifest/opf:item', ns): item_id = item.get('id') href = item.get('href') if item_id and href: manifest[item_id] = href # Get spine order spine_order = [] spine = root.find('.//opf:spine', ns) if spine is not None: for itemref in spine.findall('opf:itemref', ns): idref = itemref.get('idref') if idref and idref in manifest: href = manifest[idref] spine_order.append(href) if not spine_order: print("β οΈ No spine order found in OPF, keeping original order") return chapters # Create a mapping of filenames to spine position spine_map = {} for idx, href in enumerate(spine_order): # Try different matching strategies basename = os.path.basename(href) spine_map[basename] = idx spine_map[href] = idx # Also store without extension for flexible matching name_no_ext = os.path.splitext(basename)[0] spine_map[name_no_ext] = idx print(f"π OPF spine contains {len(spine_order)} items") # Sort chapters based on spine order def get_spine_position(chapter): # Try to match chapter to spine filename = chapter.get('filename', '') basename = chapter.get('original_basename', '') # Try exact filename match if filename in spine_map: return spine_map[filename] # Try basename match if basename in spine_map: return spine_map[basename] # Try basename of filename if filename: fname_base = os.path.basename(filename) if fname_base in spine_map: return spine_map[fname_base] # Try without extension if basename: if basename + '.html' in spine_map: return spine_map[basename + '.html'] if basename + '.xhtml' in spine_map: return spine_map[basename + '.xhtml'] # Fallback to chapter number * 1000 (to sort after spine items) return 1000000 + chapter.get('num', 0) # Sort chapters sorted_chapters = sorted(chapters, key=get_spine_position) # Renumber chapters based on new order for idx, chapter in enumerate(sorted_chapters, 1): chapter['spine_order'] = idx # Optionally update chapter numbers to match spine order # chapter['num'] = idx # Uncomment if you want to renumber # Log reordering info reordered_count = 0 for idx, chapter in enumerate(sorted_chapters): original_idx = chapters.index(chapter) if original_idx != idx: reordered_count += 1 if reordered_count > 0: print(f"π Reordered {reordered_count} chapters to match OPF spine") else: print(f"β Chapter order already matches OPF spine") return sorted_chapters except Exception as e: print(f"β οΈ Could not sort by OPF spine: {e}") import traceback traceback.print_exc() return chapters def protect_angle_brackets_with_korean(text: str) -> str: """Protect CJK text in angle brackets from HTML parsing""" if text is None: return "" import re # Extended pattern to include Korean, Chinese, and Japanese characters cjk_pattern = r'[κ°-ν£γ±-γ γ -γ £δΈ-ιΎΏγ-γγ‘-γΏ]' bracket_pattern = rf'<([^<>]*{cjk_pattern}[^<>]*)>' def replace_brackets(match): content = match.group(1) return f'<{content}>' return re.sub(bracket_pattern, replace_brackets, text) def ensure_all_opf_chapters_extracted(zf, chapters, out): """Ensure ALL chapters from OPF spine are extracted, not just what ChapterExtractor found""" # Parse OPF to get ALL chapters in spine opf_chapters = [] try: # Find content.opf opf_content = None for name in zf.namelist(): if name.endswith('content.opf'): opf_content = zf.read(name) break if not opf_content: return chapters # No OPF, return original import xml.etree.ElementTree as ET root = ET.fromstring(opf_content) # Handle namespaces ns = {'opf': 'http://www.idpf.org/2007/opf'} if root.tag.startswith('{'): default_ns = root.tag[1:root.tag.index('}')] ns = {'opf': default_ns} # Get manifest manifest = {} for item in root.findall('.//opf:manifest/opf:item', ns): item_id = item.get('id') href = item.get('href') media_type = item.get('media-type', '') if item_id and href and ('html' in media_type.lower() or href.endswith(('.html', '.xhtml', '.htm'))): manifest[item_id] = href # Get spine order spine = root.find('.//opf:spine', ns) if spine: for itemref in spine.findall('opf:itemref', ns): idref = itemref.get('idref') if idref and idref in manifest: href = manifest[idref] filename = os.path.basename(href) # Skip nav, toc, cover - BUT only if filename has NO numbers # Files with numbers like 'nav01', 'toc05' are real chapters import re has_numbers = bool(re.search(r'\d', filename)) if not has_numbers and any(skip in filename.lower() for skip in ['nav', 'toc', 'cover']): continue opf_chapters.append(href) print(f"π OPF spine contains {len(opf_chapters)} chapters") # Check which OPF chapters are missing from extraction extracted_files = set() for c in chapters: if 'filename' in c: extracted_files.add(c['filename']) if 'original_basename' in c: extracted_files.add(c['original_basename']) missing_chapters = [] for opf_chapter in opf_chapters: basename = os.path.basename(opf_chapter) if basename not in extracted_files and opf_chapter not in extracted_files: missing_chapters.append(opf_chapter) if missing_chapters: print(f"β οΈ {len(missing_chapters)} chapters in OPF but not extracted!") print(f" Missing: {missing_chapters[:5]}{'...' if len(missing_chapters) > 5 else ''}") # Extract the missing chapters for href in missing_chapters: try: # Read the chapter content content = zf.read(href).decode('utf-8') # Extract chapter number import re basename = os.path.basename(href) matches = re.findall(r'(\d+)', basename) if matches: chapter_num = int(matches[-1]) else: chapter_num = len(chapters) + 1 # Create chapter entry from bs4 import BeautifulSoup parser = 'lxml' if 'lxml' in sys.modules else 'html.parser' soup = BeautifulSoup(content, parser) # Get title title = "Chapter " + str(chapter_num) title_tag = soup.find('title') if title_tag: title = title_tag.get_text().strip() or title else: for tag in ['h1', 'h2', 'h3']: header = soup.find(tag) if header: title = header.get_text().strip() or title break # Save the chapter file output_filename = f"chapter_{chapter_num:04d}_{basename}" output_path = os.path.join(out, output_filename) with open(output_path, 'w', encoding='utf-8') as f: f.write(content) # Add to chapters list new_chapter = { 'num': chapter_num, 'title': title, 'body': content, 'filename': href, 'original_basename': basename, 'file_size': len(content), 'has_images': bool(soup.find_all('img')), 'detection_method': 'opf_recovery', 'content_hash': None # Will be calculated later } chapters.append(new_chapter) print(f" β Recovered chapter {chapter_num}: {basename}") except Exception as e: print(f" β Failed to extract {href}: {e}") # Re-sort chapters by number chapters.sort(key=lambda x: x['num']) print(f"β Total chapters after OPF recovery: {len(chapters)}") except Exception as e: print(f"β οΈ Error checking OPF chapters: {e}") import traceback traceback.print_exc() return chapters def extract_chapters(zf, output_dir, parser=None, progress_callback=None, pattern_manager=None): """Extract chapters and all resources from EPUB using ThreadPoolExecutor Args: zf: ZipFile object of the EPUB output_dir: Output directory for extracted files parser: BeautifulSoup parser to use ('lxml' or 'html.parser') progress_callback: Optional callback for progress updates pattern_manager: Optional PatternManager instance for chapter detection """ import time # Initialize defaults if not provided if parser is None: parser = _get_best_parser() # pattern_manager is no longer used - kept for API compatibility # Check stop at the very beginning if is_stop_requested(): print("β Extraction stopped by user") return [] print("π Starting EPUB extraction with ThreadPoolExecutor...") print(f"π Using parser: {parser} {'(optimized for CJK)' if parser == 'lxml' else '(standard)'}") # Initial progress if progress_callback: progress_callback("Starting EPUB extraction...") # First, extract and save content.opf for reference for name in zf.namelist(): if name.endswith('.opf'): try: opf_content = zf.read(name).decode('utf-8', errors='ignore') opf_output_path = os.path.join(output_dir, 'content.opf') with open(opf_output_path, 'w', encoding='utf-8') as f: f.write(opf_content) print(f"π Saved OPF file: {name} β content.opf") break except Exception as e: print(f"β οΈ Could not save OPF file: {e}") # Get extraction mode from environment extraction_mode = os.getenv("EXTRACTION_MODE", "smart").lower() print(f"β Using {extraction_mode.capitalize()} extraction mode") # Get number of workers from environment or use default max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) print(f"π§ Using {max_workers} workers for parallel processing") extracted_resources = _extract_all_resources(zf, output_dir, progress_callback) # Check stop after resource extraction if is_stop_requested(): print("β Extraction stopped by user") return [] metadata_path = os.path.join(output_dir, 'metadata.json') if os.path.exists(metadata_path): print("π Loading existing metadata...") with open(metadata_path, 'r', encoding='utf-8') as f: metadata = json.load(f) else: print("π Extracting fresh metadata...") metadata = _extract_epub_metadata(zf) print(f"π Extracted metadata: {list(metadata.keys())}") chapters, detected_language = _extract_chapters_universal(zf, extraction_mode, parser, progress_callback, pattern_manager) # Sort chapters according to OPF spine order if available opf_path = os.path.join(output_dir, 'content.opf') if os.path.exists(opf_path) and chapters: print("π Sorting chapters according to OPF spine order...") chapters = _sort_by_opf_spine(chapters, opf_path) print(f"β Chapters sorted according to OPF reading order") # Check stop after chapter extraction if is_stop_requested(): print("β Extraction stopped by user") return [] if not chapters: print("β No chapters could be extracted!") return [] chapters_info_path = os.path.join(output_dir, 'chapters_info.json') chapters_info = [] chapters_info_lock = threading.Lock() def process_chapter(chapter): """Process a single chapter""" # Check stop in worker if is_stop_requested(): return None info = { 'num': chapter['num'], 'title': chapter['title'], 'original_filename': chapter.get('filename', ''), 'has_images': chapter.get('has_images', False), 'image_count': chapter.get('image_count', 0), 'text_length': chapter.get('file_size', len(chapter.get('body', ''))), 'detection_method': chapter.get('detection_method', 'unknown'), 'content_hash': chapter.get('content_hash', '') } if chapter.get('has_images'): try: soup = BeautifulSoup(chapter.get('body', ''), parser) images = soup.find_all('img') info['images'] = [img.get('src', '') for img in images] except: info['images'] = [] return info # Process chapters in parallel print(f"π Processing {len(chapters)} chapters in parallel...") if progress_callback: progress_callback(f"Processing {len(chapters)} chapters...") with ThreadPoolExecutor(max_workers=max_workers) as executor: # Submit all tasks future_to_chapter = { executor.submit(process_chapter, chapter): chapter for chapter in chapters } # Process completed tasks completed = 0 for future in as_completed(future_to_chapter): if is_stop_requested(): print("β Extraction stopped by user") # Cancel remaining futures for f in future_to_chapter: f.cancel() return [] try: result = future.result() if result: with chapters_info_lock: chapters_info.append(result) completed += 1 # Yield to GUI periodically (can be disabled for max speed) if completed % 5 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) # Progress updates if completed % 10 == 0 or completed == len(chapters): if progress_callback: progress_msg = f"Processed {completed}/{len(chapters)} chapters" progress_callback(progress_msg) else: # Show progress bar in terminal ProgressBar.update(completed, len(chapters), prefix="π Processing metadata") except Exception as e: chapter = future_to_chapter[future] print(f" β Error processing chapter {chapter['num']}: {e}") # Finish progress bar if not progress_callback: ProgressBar.finish() # Sort chapters_info by chapter number to maintain order chapters_info.sort(key=lambda x: x['num']) print(f"β Successfully processed {len(chapters_info)} chapters") with open(chapters_info_path, 'w', encoding='utf-8') as f: json.dump(chapters_info, f, ensure_ascii=False, indent=2) print(f"πΎ Saved detailed chapter info to: chapters_info.json") metadata.update({ 'chapter_count': len(chapters), 'detected_language': detected_language, 'extracted_resources': extracted_resources, 'extraction_mode': extraction_mode, 'extraction_summary': { 'total_chapters': len(chapters), 'chapter_range': f"{chapters[0]['num']}-{chapters[-1]['num']}", 'resources_extracted': sum(len(files) for files in extracted_resources.values()) } }) metadata['chapter_titles'] = { str(c['num']): c['title'] for c in chapters } with open(metadata_path, 'w', encoding='utf-8') as f: json.dump(metadata, f, ensure_ascii=False, indent=2) print(f"πΎ Saved comprehensive metadata to: {metadata_path}") _create_extraction_report(output_dir, metadata, chapters, extracted_resources) _log_extraction_summary(chapters, extracted_resources, detected_language) print(f"π VERIFICATION: {extraction_mode.capitalize()} chapter extraction completed successfully") print(f"β‘ Used {max_workers} workers for parallel processing") return chapters def _extract_all_resources(zf, output_dir, progress_callback=None): """Extract all resources with parallel processing""" import time extracted_resources = { 'css': [], 'fonts': [], 'images': [], 'epub_structure': [], 'other': [] } # Check if already extracted extraction_marker = os.path.join(output_dir, '.resources_extracted') if os.path.exists(extraction_marker): print("π¦ Resources already extracted, skipping...") return _count_existing_resources(output_dir, extracted_resources) _cleanup_old_resources(output_dir) # Create directories for resource_type in ['css', 'fonts', 'images']: os.makedirs(os.path.join(output_dir, resource_type), exist_ok=True) # Only print if no callback (avoid duplicates in subprocess) if not progress_callback: print(f"π¦ Extracting resources in parallel...") # Get list of files to process file_list = [f for f in zf.namelist() if not f.endswith('/') and os.path.basename(f)] # Thread-safe lock for extracted_resources resource_lock = threading.Lock() def extract_single_resource(file_path): if is_stop_requested(): return None try: file_data = zf.read(file_path) resource_info = _categorize_resource(file_path, os.path.basename(file_path)) if resource_info: resource_type, target_dir, safe_filename = resource_info target_path = os.path.join(output_dir, target_dir, safe_filename) if target_dir else os.path.join(output_dir, safe_filename) with open(target_path, 'wb') as f: f.write(file_data) # Thread-safe update with resource_lock: extracted_resources[resource_type].append(safe_filename) return (resource_type, safe_filename) except Exception as e: print(f"[WARNING] Failed to extract {file_path}: {e}") return None # Process files in parallel total_resources = len(file_list) extracted_count = 0 # Use same worker count as chapter processing resource_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) with ThreadPoolExecutor(max_workers=resource_workers) as executor: futures = {executor.submit(extract_single_resource, file_path): file_path for file_path in file_list} for future in as_completed(futures): if is_stop_requested(): executor.shutdown(wait=False) break extracted_count += 1 # Progress update every 20 files if extracted_count % 20 == 0: if progress_callback: progress_callback(f"Extracting resources: {extracted_count}/{total_resources}") else: # Print progress bar in terminal ProgressBar.update(extracted_count, total_resources, prefix="π¦ Extracting resources") # Yield to GUI periodically (can be disabled for max speed) if extracted_count % 10 == 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) result = future.result() if result: resource_type, filename = result # Only print for important resources if extracted_count < 10 or resource_type in ['css', 'fonts']: print(f" π Extracted {resource_type}: {filename}") # Show 100% completion if progress_callback: progress_callback(f"Extracting resources: {total_resources}/{total_resources}") else: ProgressBar.update(total_resources, total_resources, prefix="π¦ Extracting resources") ProgressBar.finish() # Mark as complete with open(extraction_marker, 'w') as f: f.write(f"Resources extracted at {time.time()}") _validate_critical_files(output_dir, extracted_resources) return extracted_resources def _extract_chapters_universal(zf, extraction_mode="smart", parser=None, progress_callback=None, pattern_manager=None): """Universal chapter extraction with four modes: smart, comprehensive, full, enhanced All modes now properly merge Section/Chapter pairs Enhanced mode uses html2text for superior text processing Now with parallel processing for improved performance """ # Initialize defaults if not provided if parser is None: parser = _get_best_parser() # pattern_manager is no longer used - kept for API compatibility # Check stop at the beginning if is_stop_requested(): print("β Chapter extraction stopped by user") return [], 'unknown' # Import time for yielding import time # Initialize enhanced extractor if using enhanced mode enhanced_extractor = None enhanced_filtering = extraction_mode # Default fallback preserve_structure = True # Check if user wants to translate special files (info.xhtml, message.xhtml, etc.) # By default, skip them as they're typically metadata/navigation translate_special = os.getenv('TRANSLATE_SPECIAL_FILES', '0') == '1' if translate_special: print("π Special files translation is ENABLED (info.xhtml, message.xhtml, etc.)") else: print("π Special files translation is DISABLED - skipping navigation/metadata files") if extraction_mode == "enhanced": print("π Initializing Enhanced extraction mode with html2text...") # Get enhanced mode configuration from environment enhanced_filtering = os.getenv("ENHANCED_FILTERING", "smart") # Avoid 'full' with html2text to prevent XML declaration artifacts; use 'comprehensive' instead if str(enhanced_filtering).lower() == 'full': enhanced_filtering = 'comprehensive' preserve_structure = os.getenv("ENHANCED_PRESERVE_STRUCTURE", "1") == "1" print(f" β’ Enhanced filtering level: {enhanced_filtering}") print(f" β’ Preserve structure: {preserve_structure}") # Try to initialize enhanced extractor try: # Import our enhanced extractor (assume it's in the same directory or importable) from enhanced_text_extractor import EnhancedTextExtractor enhanced_extractor = EnhancedTextExtractor( filtering_mode=enhanced_filtering, preserve_structure=preserve_structure ) print("β Enhanced text extractor initialized successfully") except ImportError as e: print(f"β Enhanced text extractor module not found: {e}") print(f"β Cannot use enhanced extraction mode. Please install enhanced_text_extractor or select a different extraction mode.") raise e except Exception as e: print(f"β Enhanced extractor initialization failed: {e}") print(f"β Cannot use enhanced extraction mode. Please select a different extraction mode.") raise e chapters = [] sample_texts = [] # First phase: Collect HTML files html_files = [] file_list = zf.namelist() total_files = len(file_list) # Update progress for file collection if progress_callback and total_files > 100: progress_callback(f"Scanning {total_files} files in EPUB...") elif total_files > 100 and not progress_callback: # Print initial message for progress bar (only if no callback) print(f"π Scanning {total_files} files in EPUB...") for idx, name in enumerate(file_list): # Check stop while collecting files if is_stop_requested(): print("β Chapter extraction stopped by user") return [], 'unknown' # Yield to GUI every 50 files (can be disabled for max speed) if idx % 50 == 0 and idx > 0: if os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) # Brief yield to GUI if total_files > 100: if progress_callback: progress_callback(f"Scanning files: {idx}/{total_files}") else: # Print progress bar in terminal ProgressBar.update(idx, total_files, prefix="π Scanning files") if name.lower().endswith(('.xhtml', '.html', '.htm')): basename = os.path.basename(name).lower() # Skip cover files unless special file translation is enabled if basename in ['cover.html', 'cover.xhtml', 'cover.htm']: if not translate_special: print(f"[SKIP] Cover file excluded: {name}") continue else: print(f"[INCLUDE] Cover file included (special files enabled): {name}") # All filtering is now controlled by TRANSLATE_SPECIAL_FILES toggle and extraction mode # No hardcoded special file patterns html_files.append(name) # Print final 100% progress update before finishing if total_files > 100: if progress_callback: progress_callback(f"Scanning files: {total_files}/{total_files}") else: # Show 100% completion ProgressBar.update(total_files, total_files, prefix="π Scanning files") # Finish progress bar if we were using it if total_files > 100 and not progress_callback: ProgressBar.finish() # Update mode description to include enhanced mode mode_description = { "smart": "potential content files", "comprehensive": "HTML files", "full": "ALL HTML/XHTML files (no filtering)", "enhanced": f"files (enhanced with {enhanced_filtering} filtering)" } print(f"π Found {len(html_files)} {mode_description.get(extraction_mode, 'files')} in EPUB") # Sort files to ensure proper order html_files.sort() # Check if merging is disabled via environment variable disable_merging = os.getenv("DISABLE_CHAPTER_MERGING", "0") == "1" processed_files = set() merge_candidates = {} # Store potential merges without reading files yet if disable_merging: print("π Chapter merging is DISABLED - processing all files independently") else: print("π Chapter merging is ENABLED") # Only do merging logic if not disabled file_groups = {} # Group files by their base number to detect Section/Chapter pairs for file_path in html_files: filename = os.path.basename(file_path) # Try different patterns to extract base number base_num = None # Pattern 1: "No00014" from "No00014Section.xhtml" match = re.match(r'(No\d+)', filename) if match: base_num = match.group(1) else: # Pattern 2: "0014" from "0014_section.html" or "0014_chapter.html" match = re.match(r'^(\d+)[_\-]', filename) if match: base_num = match.group(1) else: # Pattern 3: Just numbers at the start match = re.match(r'^(\d+)', filename) if match: base_num = match.group(1) if base_num: if base_num not in file_groups: file_groups[base_num] = [] file_groups[base_num].append(file_path) # Identify merge candidates WITHOUT reading files yet for base_num, group_files in sorted(file_groups.items()): if len(group_files) == 2: # Check if we have a Section/Chapter pair based on filenames only section_file = None chapter_file = None for file_path in group_files: basename = os.path.basename(file_path) # More strict detection - must have 'section' or 'chapter' in the filename if 'section' in basename.lower() and 'chapter' not in basename.lower(): section_file = file_path elif 'chapter' in basename.lower() and 'section' not in basename.lower(): chapter_file = file_path if section_file and chapter_file: # Store as potential merge candidate merge_candidates[chapter_file] = section_file processed_files.add(section_file) print(f"[DEBUG] Potential merge candidate: {base_num}") print(f" Section: {os.path.basename(section_file)}") print(f" Chapter: {os.path.basename(chapter_file)}") # Filter out section files that were marked for merging files_to_process = [] for file_path in html_files: if not disable_merging and file_path in processed_files: print(f"[DEBUG] Skipping section file: {file_path}") continue files_to_process.append(file_path) print(f"π Processing {len(files_to_process)} files after merge analysis") if progress_callback: progress_callback(f"Preparing to process {len(files_to_process)} chapters...") # Initialize collections for aggregating results file_size_groups = {} h1_count = 0 h2_count = 0 skipped_files = [] # Progress tracking total_files = len(files_to_process) # Prepare arguments for parallel processing zip_file_path = zf.filename # Process files in parallel or sequentially based on file count # Only print if no callback (avoid duplicates) if not progress_callback: print(f"π Processing {len(files_to_process)} HTML files...") # Initial progress - no message needed, progress bar will show candidate_chapters = [] # For smart mode chapters_direct = [] # For other modes # Decide whether to use parallel processing use_parallel = len(files_to_process) > 10 if use_parallel: # Get worker count from environment variable max_workers = int(os.getenv("EXTRACTION_WORKERS", "2")) print(f"π¦ Using parallel processing with {max_workers} workers...") if progress_callback: progress_callback(f"Starting {max_workers} extraction workers...") # Use ProcessPoolExecutor for true multi-process parallelism # Now that all functions are at module level and picklable, we can use processes with ProcessPoolExecutor(max_workers=max_workers) as executor: # Submit all files for processing future_to_file = { executor.submit( _process_single_html_file, file_path=file_path, file_index=idx, zip_file_path=zip_file_path, parser=parser, merge_candidates=merge_candidates, disable_merging=disable_merging, enhanced_extractor=enhanced_extractor, extraction_mode=extraction_mode, enhanced_filtering=enhanced_filtering, preserve_structure=preserve_structure, protect_angle_brackets_func=protect_angle_brackets_with_korean, pattern_manager=pattern_manager, files_to_process=files_to_process, is_stop_requested=is_stop_requested ): (file_path, idx) for idx, file_path in enumerate(files_to_process) } # Collect results as they complete with progress tracking processed_count = 0 for future in as_completed(future_to_file): if is_stop_requested(): print("β Chapter processing stopped by user") executor.shutdown(wait=False) return [], 'unknown' try: # Unpack result from _process_single_html_file result = future.result() chapter_info, h1_found, h2_found, file_size, sample_text, skipped_info = result # Update progress processed_count += 1 if processed_count % 5 == 0: if progress_callback: progress_msg = f"Processing chapters: {processed_count}/{total_files} ({processed_count*100//total_files}%)" progress_callback(progress_msg) else: # Print progress bar in terminal ProgressBar.update(processed_count, total_files, prefix="π Processing chapters") # Aggregate header counts if h1_found: h1_count += 1 if h2_found: h2_count += 1 # Collect file size groups and sample texts if chapter_info: effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode if effective_mode == "smart" and file_size > 0: if file_size not in file_size_groups: file_size_groups[file_size] = [] file_path, _ = future_to_file[future] file_size_groups[file_size].append(file_path) # Collect sample texts if sample_text and len(sample_texts) < 5: sample_texts.append(sample_text) # For smart mode when merging is enabled, collect candidates # Otherwise, add directly to chapters if effective_mode == "smart" and not disable_merging: candidate_chapters.append(chapter_info) else: chapters_direct.append(chapter_info) # Collect skipped info if skipped_info: skipped_files.append(skipped_info) except Exception as e: file_path, idx = future_to_file[future] print(f"[ERROR] Process error processing {file_path}: {e}") import traceback traceback.print_exc() # Show 100% completion if progress_callback: progress_callback(f"Processing chapters: {total_files}/{total_files} (100%)") else: ProgressBar.update(total_files, total_files, prefix="π Processing chapters") else: print("π¦ Using sequential processing (small file count)...") # Process files sequentially for small EPUBs for idx, file_path in enumerate(files_to_process): if is_stop_requested(): print("β Chapter processing stopped by user") return [], 'unknown' # Call the module-level function directly result = _process_single_html_file( file_path=file_path, file_index=idx, zip_file_path=zip_file_path, parser=parser, merge_candidates=merge_candidates, disable_merging=disable_merging, enhanced_extractor=enhanced_extractor, extraction_mode=extraction_mode, enhanced_filtering=enhanced_filtering, preserve_structure=preserve_structure, protect_angle_brackets_func=protect_angle_brackets_with_korean, pattern_manager=pattern_manager, files_to_process=files_to_process, is_stop_requested=is_stop_requested ) # Unpack result chapter_info, h1_found, h2_found, file_size, sample_text, skipped_info = result # Update progress if (idx + 1) % 5 == 0: if progress_callback: progress_msg = f"Processing chapters: {idx+1}/{total_files} ({(idx+1)*100//total_files}%)" progress_callback(progress_msg) else: # Print progress bar in terminal ProgressBar.update(idx+1, total_files, prefix="π Processing chapters") # Aggregate header counts if h1_found: h1_count += 1 if h2_found: h2_count += 1 # Collect file size groups and sample texts if chapter_info: effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode if effective_mode == "smart" and file_size > 0: if file_size not in file_size_groups: file_size_groups[file_size] = [] file_size_groups[file_size].append(file_path) # Collect sample texts if sample_text and len(sample_texts) < 5: sample_texts.append(sample_text) # For smart mode when merging is enabled, collect candidates # Otherwise, add directly to chapters if effective_mode == "smart" and not disable_merging: candidate_chapters.append(chapter_info) else: chapters_direct.append(chapter_info) # Collect skipped info if skipped_info: skipped_files.append(skipped_info) # Show 100% completion for sequential mode if progress_callback: progress_callback(f"Processing chapters: {total_files}/{total_files} (100%)") else: ProgressBar.update(total_files, total_files, prefix="π Processing chapters") # Final progress update and cleanup progress bar if not progress_callback: ProgressBar.finish() else: progress_callback(f"Chapter processing complete: {len(candidate_chapters) + len(chapters_direct)} chapters") # Print skip summary if any files were skipped if skipped_files: print(f"\nπ Skipped {len(skipped_files)} files during processing:") empty_count = sum(1 for _, reason, _ in skipped_files if reason == 'empty') if empty_count > 0: print(f" β’ {empty_count} nearly empty files") # Show first 3 examples if debug enabled if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1' and skipped_files: print(" Examples:") for path, reason, size in skipped_files[:3]: print(f" - {os.path.basename(path)} ({size} chars)") # Sort direct chapters by file index to maintain order chapters_direct.sort(key=lambda x: x["file_index"]) # Post-process smart mode candidates (only when merging is enabled) effective_mode = enhanced_filtering if extraction_mode == "enhanced" else extraction_mode if effective_mode == "smart" and candidate_chapters and not disable_merging: # Check stop before post-processing if is_stop_requested(): print("β Chapter post-processing stopped by user") return chapters, 'unknown' print(f"\n[SMART MODE] Processing {len(candidate_chapters)} candidate files...") # Sort candidates by file index to maintain order candidate_chapters.sort(key=lambda x: x["file_index"]) # Debug: Show what files we have section_files = [c for c in candidate_chapters if 'section' in c['original_basename'].lower()] chapter_files = [c for c in candidate_chapters if 'chapter' in c['original_basename'].lower() and 'section' not in c['original_basename'].lower()] other_files = [c for c in candidate_chapters if c not in section_files and c not in chapter_files] print(f" π File breakdown:") print(f" β’ Section files: {len(section_files)}") print(f" β’ Chapter files: {len(chapter_files)}") print(f" β’ Other files: {len(other_files)}") # Original smart mode logic when merging is enabled # First, separate files with detected chapter numbers from those without numbered_chapters = [] unnumbered_chapters = [] for idx, chapter in enumerate(candidate_chapters): # Yield periodically during categorization (can be disabled for max speed) if idx % 10 == 0 and idx > 0 and os.getenv("ENABLE_GUI_YIELD", "1") == "1": time.sleep(0.001) if chapter["num"] is not None: numbered_chapters.append(chapter) else: unnumbered_chapters.append(chapter) print(f" β’ Files with chapter numbers: {len(numbered_chapters)}") print(f" β’ Files without chapter numbers: {len(unnumbered_chapters)}") # Check if we have hash-based filenames (no numbered chapters found) if not numbered_chapters and unnumbered_chapters: print(" β οΈ No chapter numbers found - likely hash-based filenames") print(" β Using file order as chapter sequence") # Sort by file index to maintain order unnumbered_chapters.sort(key=lambda x: x["file_index"]) # Assign sequential numbers for i, chapter in enumerate(unnumbered_chapters, 1): chapter["num"] = i chapter["detection_method"] = f"{extraction_mode}_hash_filename_sequential" if extraction_mode == "enhanced" else "hash_filename_sequential" if not chapter["title"] or chapter["title"] == chapter["original_basename"]: chapter["title"] = f"Chapter {i}" chapters = unnumbered_chapters else: # We have some numbered chapters chapters = numbered_chapters # For unnumbered files, check if they might be duplicates or appendices if unnumbered_chapters: print(f" β Analyzing {len(unnumbered_chapters)} unnumbered files...") # Get the max chapter number max_num = max(c["num"] for c in numbered_chapters) # Check each unnumbered file for chapter in unnumbered_chapters: # Check stop in post-processing loop if is_stop_requested(): print("β Chapter post-processing stopped by user") return chapters, 'unknown' # Check if it's very small (might be a separator or note) if chapter["file_size"] < 200: # Collect for summary instead of printing # Note: _smart_mode_skips defined in outer scope _smart_mode_skips.append(('small', chapter['filename'], chapter['file_size'])) continue # Check if it has similar size to existing chapters (might be duplicate) size = chapter["file_size"] similar_chapters = [c for c in numbered_chapters if abs(c["file_size"] - size) < 50] if similar_chapters: # Might be a duplicate, skip it (collect for summary) _smart_mode_skips.append(('duplicate', chapter['filename'], len(similar_chapters))) continue # Otherwise, add as appendix max_num += 1 chapter["num"] = max_num chapter["detection_method"] = f"{extraction_mode}_appendix_sequential" if extraction_mode == "enhanced" else "appendix_sequential" if not chapter["title"] or chapter["title"] == chapter["original_basename"]: chapter["title"] = f"Appendix {max_num}" chapters.append(chapter) print(f" [ADD] Added as chapter {max_num}: {chapter['filename']}") else: # For other modes or smart mode with merging disabled chapters = chapters_direct # Print smart mode skip summary if any if '_smart_mode_skips' in locals() and _smart_mode_skips: print(f"\nπ Smart mode filtering summary:") small_count = sum(1 for reason, _, _ in _smart_mode_skips if reason == 'small') dup_count = sum(1 for reason, _, _ in _smart_mode_skips if reason == 'duplicate') if small_count > 0: print(f" β’ Skipped {small_count} very small files") if dup_count > 0: print(f" β’ Skipped {dup_count} possible duplicates") # Show examples if debug enabled if os.getenv('DEBUG_SKIP_MESSAGES', '0') == '1': print(" Examples:") for reason, filename, detail in _smart_mode_skips[:3]: if reason == 'small': print(f" - {filename} ({detail} chars)") else: print(f" - {filename} (similar to {detail} chapters)") # Clear the list _smart_mode_skips = [] # Sort chapters by number chapters.sort(key=lambda x: x["num"]) # Ensure chapter numbers are integers # When merging is disabled, all chapters should have integer numbers anyway for chapter in chapters: if isinstance(chapter["num"], float): chapter["num"] = int(chapter["num"]) # Final validation if chapters: print(f"\nβ Final chapter count: {len(chapters)}") print(f" β’ Chapter range: {chapters[0]['num']} - {chapters[-1]['num']}") # Enhanced mode summary if extraction_mode == "enhanced": enhanced_count = sum(1 for c in chapters if c.get('enhanced_extraction', False)) total_chars = sum(len(c.get('body', '')) for c in chapters if c.get('enhanced_extraction', False)) avg_chars = total_chars // enhanced_count if enhanced_count > 0 else 0 print(f" π Enhanced extraction: {enhanced_count}/{len(chapters)} chapters, {total_chars:,} total chars (avg: {avg_chars:,})") # Check for gaps chapter_nums = [c["num"] for c in chapters] expected_nums = list(range(min(chapter_nums), max(chapter_nums) + 1)) missing = set(expected_nums) - set(chapter_nums) if missing: print(f" β οΈ Missing chapter numbers: {sorted(missing)}") # Language detection combined_sample = ' '.join(sample_texts) if effective_mode == "smart" else '' detected_language = _detect_content_language(combined_sample) if combined_sample else 'unknown' if chapters: _print_extraction_summary(chapters, detected_language, extraction_mode, h1_count if effective_mode == "smart" else 0, h2_count if effective_mode == "smart" else 0, file_size_groups if effective_mode == "smart" else {}) return chapters, detected_language def _extract_chapter_info(soup, file_path, content_text, html_content, pattern_manager): """Extract chapter number and title from various sources with parallel pattern matching""" chapter_num = None chapter_title = None detection_method = None # SPECIAL HANDLING: When we have Section/Chapter pairs, differentiate them filename = os.path.basename(file_path) # Handle different naming patterns for Section/Chapter files if ('section' in filename.lower() or '_section' in filename.lower()) and 'chapter' not in filename.lower(): # For Section files, add 0.1 to the base number # Try different patterns match = re.search(r'No(\d+)', filename) if not match: match = re.search(r'^(\d+)[_\-]', filename) if not match: match = re.search(r'^(\d+)', filename) if match: base_num = int(match.group(1)) chapter_num = base_num + 0.1 # Section gets .1 detection_method = "filename_section_special" elif ('chapter' in filename.lower() or '_chapter' in filename.lower()) and 'section' not in filename.lower(): # For Chapter files, use the base number # Try different patterns match = re.search(r'No(\d+)', filename) if not match: match = re.search(r'^(\d+)[_\-]', filename) if not match: match = re.search(r'^(\d+)', filename) if match: chapter_num = int(match.group(1)) detection_method = "filename_chapter_special" # If not handled by special logic, continue with normal extraction if not chapter_num: # Try filename first - use parallel pattern matching for better performance chapter_patterns = [(pattern, flags, method) for pattern, flags, method in PM.CHAPTER_PATTERNS if method.endswith('_number')] if len(chapter_patterns) > 3: # Only parallelize if we have enough patterns # Parallel pattern matching for filename with ThreadPoolExecutor(max_workers=min(4, len(chapter_patterns))) as executor: def try_pattern(pattern_info): pattern, flags, method = pattern_info match = re.search(pattern, file_path, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"filename_{method}" elif method == 'chinese_chapter_cn': from TransateKRtoEN import PatternManager pm = None # No longer needed converted = _convert_chinese_number(num_str, pm) if converted: return converted, f"filename_{method}" except (ValueError, IndexError): pass return None, None # Submit all patterns futures = [executor.submit(try_pattern, pattern_info) for pattern_info in chapter_patterns] # Check results as they complete for future in as_completed(futures): try: num, method = future.result() if num: chapter_num = num detection_method = method # Cancel remaining futures for f in futures: f.cancel() break except Exception: continue else: # Sequential processing for small pattern sets for pattern, flags, method in chapter_patterns: match = re.search(pattern, file_path, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): chapter_num = int(num_str) detection_method = f"filename_{method}" break elif method == 'chinese_chapter_cn': from TransateKRtoEN import PatternManager pm = None # No longer needed converted = _convert_chinese_number(num_str, pm) if converted: chapter_num = converted detection_method = f"filename_{method}" break except (ValueError, IndexError): continue # Try content if not found in filename if not chapter_num: # Check ignore settings for batch translation batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Prepare all text sources to check in parallel text_sources = [] # Add title tag if using titles if use_title_tag and soup.title and soup.title.string: title_text = soup.title.string.strip() text_sources.append(("title", title_text, True)) # True means this can be chapter_title # Add headers if not ignored if not ignore_header_tags: for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: headers = soup.find_all(header_tag) for header in headers[:3]: # Limit to first 3 of each type header_text = header.get_text(strip=True) if header_text: text_sources.append((f"header_{header_tag}", header_text, True)) # Add first paragraphs first_elements = soup.find_all(['p', 'div'])[:5] for elem in first_elements: elem_text = elem.get_text(strip=True) if elem_text: text_sources.append(("content", elem_text, False)) # False means don't use as chapter_title # Process text sources in parallel if we have many if len(text_sources) > 5: with ThreadPoolExecutor(max_workers=min(6, len(text_sources))) as executor: def extract_from_source(source_info): source_type, text, can_be_title = source_info num, method = _extract_from_text(text, source_type, pattern_manager) return num, method, text if (num and can_be_title) else None # Submit all text sources future_to_source = {executor.submit(extract_from_source, source): source for source in text_sources} # Process results as they complete for future in as_completed(future_to_source): try: num, method, title = future.result() if num: chapter_num = num detection_method = method if title and not chapter_title: chapter_title = title # Cancel remaining futures for f in future_to_source: f.cancel() break except Exception: continue else: # Sequential processing for small text sets for source_type, text, can_be_title in text_sources: num, method = _extract_from_text(text, source_type, pattern_manager) if num: chapter_num = num detection_method = method if can_be_title and not chapter_title: chapter_title = text break # Final fallback to filename patterns if not chapter_num: filename_base = os.path.basename(file_path) # Parallel pattern matching for filename extraction if len(PM.FILENAME_EXTRACT_PATTERNS) > 3: with ThreadPoolExecutor(max_workers=min(4, len(PM.FILENAME_EXTRACT_PATTERNS))) as executor: def try_filename_pattern(pattern): match = re.search(pattern, filename_base, re.IGNORECASE) if match: try: return int(match.group(1)) except (ValueError, IndexError): pass return None futures = [executor.submit(try_filename_pattern, pattern) for pattern in PM.FILENAME_EXTRACT_PATTERNS] for future in as_completed(futures): try: num = future.result() if num: chapter_num = num detection_method = "filename_number" for f in futures: f.cancel() break except Exception: continue else: # Sequential for small pattern sets for pattern in PM.FILENAME_EXTRACT_PATTERNS: match = re.search(pattern, filename_base, re.IGNORECASE) if match: chapter_num = int(match.group(1)) detection_method = "filename_number" break # Extract title if not already found (with ignore settings support) if not chapter_title: # Check settings for batch translation batch_translate_active = os.getenv('BATCH_TRANSLATE_HEADERS', '0') == '1' use_title_tag = os.getenv('USE_TITLE', '0') == '1' or not batch_translate_active ignore_header_tags = os.getenv('IGNORE_HEADER', '0') == '1' and batch_translate_active # Try title tag if using titles if use_title_tag and soup.title and soup.title.string: chapter_title = soup.title.string.strip() # Try header tags if not ignored and no title found if not chapter_title and not ignore_header_tags: for header_tag in ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']: header = soup.find(header_tag) if header: chapter_title = header.get_text(strip=True) break # Final fallback if not chapter_title: chapter_title = f"Chapter {chapter_num}" if chapter_num else None chapter_title = re.sub(r'\s+', ' ', chapter_title).strip() if chapter_title else None return chapter_num, chapter_title, detection_method def _extract_from_text(text, source_type, pattern_manager): """Extract chapter number from text using patterns with parallel matching for large pattern sets""" # Get patterns that don't end with '_number' text_patterns = [(pattern, flags, method) for pattern, flags, method in PM.CHAPTER_PATTERNS if not method.endswith('_number')] # Only use parallel processing if we have many patterns if len(text_patterns) > 5: with ThreadPoolExecutor(max_workers=min(4, len(text_patterns))) as executor: def try_text_pattern(pattern_info): pattern, flags, method = pattern_info match = re.search(pattern, text, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"{source_type}_{method}" elif method == 'chinese_chapter_cn': from TransateKRtoEN import PatternManager pm = None # No longer needed converted = _convert_chinese_number(num_str, pm) if converted: return converted, f"{source_type}_{method}" except (ValueError, IndexError): pass return None, None # Submit all patterns futures = [executor.submit(try_text_pattern, pattern_info) for pattern_info in text_patterns] # Check results as they complete for future in as_completed(futures): try: num, method = future.result() if num: # Cancel remaining futures for f in futures: f.cancel() return num, method except Exception: continue else: # Sequential processing for small pattern sets for pattern, flags, method in text_patterns: match = re.search(pattern, text, flags) if match: try: num_str = match.group(1) if num_str.isdigit(): return int(num_str), f"{source_type}_{method}" elif method == 'chinese_chapter_cn': from TransateKRtoEN import PatternManager pm = None # No longer needed converted = _convert_chinese_number(num_str, pm) if converted: return converted, f"{source_type}_{method}" except (ValueError, IndexError): continue return None, None def _convert_chinese_number(cn_num, pattern_manager): """Convert Chinese number to integer""" if cn_num in PM.CHINESE_NUMS: return PM.CHINESE_NUMS[cn_num] if 'ε' in cn_num: parts = cn_num.split('ε') if len(parts) == 2: tens = PM.CHINESE_NUMS.get(parts[0], 1) if parts[0] else 1 ones = PM.CHINESE_NUMS.get(parts[1], 0) if parts[1] else 0 return tens * 10 + ones return None def _detect_content_language( text_sample): """Detect the primary language of content with parallel processing for large texts""" # For very short texts, use sequential processing if len(text_sample) < 1000: scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } for char in text_sample: code = ord(char) if 0xAC00 <= code <= 0xD7AF: scripts['korean'] += 1 elif 0x3040 <= code <= 0x309F: scripts['japanese_hiragana'] += 1 elif 0x30A0 <= code <= 0x30FF: scripts['japanese_katakana'] += 1 elif 0x4E00 <= code <= 0x9FFF: scripts['chinese'] += 1 elif 0x0020 <= code <= 0x007F: scripts['latin'] += 1 else: # For longer texts, use parallel processing # Split text into chunks for parallel processing chunk_size = max(500, len(text_sample) // (os.cpu_count() or 4)) chunks = [text_sample[i:i + chunk_size] for i in range(0, len(text_sample), chunk_size)] # Thread-safe accumulator scripts_lock = threading.Lock() scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } def process_chunk(text_chunk): """Process a chunk of text and return script counts""" local_scripts = { 'korean': 0, 'japanese_hiragana': 0, 'japanese_katakana': 0, 'chinese': 0, 'latin': 0 } for char in text_chunk: code = ord(char) if 0xAC00 <= code <= 0xD7AF: local_scripts['korean'] += 1 elif 0x3040 <= code <= 0x309F: local_scripts['japanese_hiragana'] += 1 elif 0x30A0 <= code <= 0x30FF: local_scripts['japanese_katakana'] += 1 elif 0x4E00 <= code <= 0x9FFF: local_scripts['chinese'] += 1 elif 0x0020 <= code <= 0x007F: local_scripts['latin'] += 1 return local_scripts # Process chunks in parallel with ThreadPoolExecutor(max_workers=min(os.cpu_count() or 4, len(chunks))) as executor: # Submit all chunks futures = [executor.submit(process_chunk, chunk) for chunk in chunks] # Collect results for future in as_completed(futures): try: chunk_scripts = future.result() # Thread-safe accumulation with scripts_lock: for script, count in chunk_scripts.items(): scripts[script] += count except Exception as e: print(f"[WARNING] Error processing chunk in language detection: {e}") # Language determination logic (same as original) total_cjk = scripts['korean'] + scripts['japanese_hiragana'] + scripts['japanese_katakana'] + scripts['chinese'] if scripts['korean'] > total_cjk * 0.3: return 'korean' elif scripts['japanese_hiragana'] + scripts['japanese_katakana'] > total_cjk * 0.2: return 'japanese' elif scripts['chinese'] > total_cjk * 0.3: return 'chinese' elif scripts['latin'] > len(text_sample) * 0.7: return 'english' else: return 'unknown' # Global flag to track if language has been printed _language_printed = False def _print_extraction_summary( chapters, detected_language, extraction_mode, h1_count, h2_count, file_size_groups): """Print extraction summary""" global _language_printed print(f"\nπ Chapter Extraction Summary ({extraction_mode.capitalize()} Mode):") print(f" β’ Total chapters extracted: {len(chapters)}") # Format chapter range handling both int and float first_num = chapters[0]['num'] last_num = chapters[-1]['num'] print(f" β’ Chapter range: {first_num} to {last_num}") # Only print detected language once per session if not _language_printed and detected_language and detected_language != 'unknown': print(f" π Detected language: {detected_language}") _language_printed = True if extraction_mode == "smart": print(f" β’ Primary header type: {'