#!/usr/bin/env python3
"""
Chapter Extraction Worker - Runs chapter extraction in a separate process to prevent GUI freezing
"""
import sys
import os
import io
# Force UTF-8 encoding for stdout/stderr on Windows.
# Each stream is re-wrapped only when it exposes a binary ``buffer``:
# sys.stdout / sys.stderr can be None (e.g. pythonw or windowed frozen builds)
# or be replaced by an object without ``.buffer``, and an unguarded
# ``sys.stdout.buffer`` would then raise AttributeError at import time.
if getattr(sys.stdout, "buffer", None) is not None:
    sys.stdout = io.TextIOWrapper(sys.stdout.buffer, encoding='utf-8', errors='replace')
if getattr(sys.stderr, "buffer", None) is not None:
    sys.stderr = io.TextIOWrapper(sys.stderr.buffer, encoding='utf-8', errors='replace')
import json
import zipfile
import time
import traceback
from pathlib import Path
# Put the directory that contains this file first on sys.path.
# Note: Path(__file__).parent is this file's own directory, not the directory
# above it. Presumably this lets the project modules imported later
# (Chapter_Extractor, shutdown_utils) resolve when run as a script -- confirm.
sys.path.insert(0, str(Path(__file__).parent))
def _resolve_output_dir(output_dir):
    """Return the effective output directory, honoring OUTPUT_DIRECTORY.

    When the OUTPUT_DIRECTORY environment variable is unset or empty,
    ``output_dir`` is returned unchanged. Otherwise:

    * if ``output_dir`` already lives inside the override directory, its
      absolute form is returned;
    * else the result is ``<override>/<leaf of output_dir>`` (leaf defaults
      to ``"output"`` when ``output_dir`` has no final component).
    """
    override_dir = os.getenv("OUTPUT_DIRECTORY")
    if not override_dir:
        return output_dir

    override_dir = os.path.abspath(override_dir)
    abs_output = os.path.abspath(output_dir)

    try:
        common = os.path.commonpath([abs_output, override_dir])
    except ValueError:
        # os.path.commonpath raises ValueError for paths on different drives
        # (Windows). Such a path cannot be inside the override directory, so
        # it must be redirected rather than silently left alone.
        common = None

    # The common ancestor of both paths equals the override directory exactly
    # when output_dir is inside it. normcase makes the comparison
    # case-insensitive on Windows and is a no-op on POSIX.
    if common is not None and os.path.normcase(common) == os.path.normcase(override_dir):
        return abs_output

    leaf = os.path.basename(abs_output) or "output"
    return os.path.join(override_dir, leaf)


def run_chapter_extraction(epub_path, output_dir, extraction_mode="smart", progress_callback=None):
    """
    Run chapter extraction in this worker process.

    Status is reported on stdout, one message per line, using the prefixes
    ``[INFO]``, ``[PROGRESS]``, ``[WARNING]``, ``[ERROR]`` and ``[RESULT]``
    (the ``[RESULT]`` line carries the returned dict serialized as JSON).

    Args:
        epub_path: Path to the EPUB file (opened as a zip archive).
        output_dir: Output directory for extracted content. May be redirected
            by the OUTPUT_DIRECTORY environment variable; it is created if
            missing.
        extraction_mode: Extraction mode; exported to the extractor through
            the EXTRACTION_MODE environment variable.
        progress_callback: Optional callable taking one message argument. It
            is invoked for every progress message in addition to the
            ``[PROGRESS]`` line printed for IPC. Exceptions it raises are
            reported as warnings and do not abort the extraction.

    Returns:
        dict: On success ``{"success": True, "chapters": <count>,
        "metadata": ..., "chapter_info": [...]}``. On any failure
        ``{"success": False, "error": <message>, "traceback": <text>}``.
        Exceptions are never propagated to the caller.
    """
    try:
        # Honor OUTPUT_DIRECTORY override (keep leaf folder). Best effort: a
        # failure here only produces a warning and keeps the given directory.
        try:
            output_dir = _resolve_output_dir(output_dir)
        except Exception as e:
            print(f"[WARNING] OUTPUT_DIRECTORY override failed: {e}", flush=True)

        # Silence bs4's XMLParsedAsHTMLWarning before the extractor runs.
        import warnings
        from bs4 import XMLParsedAsHTMLWarning
        warnings.filterwarnings("ignore", category=XMLParsedAsHTMLWarning)

        # Import here to avoid loading heavy modules until needed
        import Chapter_Extractor

        def worker_progress_callback(message):
            # Special prefix lets the parent process pick out progress lines.
            print(f"[PROGRESS] {message}", flush=True)
            # Also forward to the caller-supplied callback, if any.
            if progress_callback is not None:
                try:
                    progress_callback(message)
                except Exception as cb_err:
                    print(f"[WARNING] progress_callback failed: {cb_err}", flush=True)

        # The extraction mode is passed via the environment.
        os.environ["EXTRACTION_MODE"] = extraction_mode

        print(f"[INFO] Starting extraction of: {epub_path}", flush=True)
        print(f"[INFO] Output directory: {output_dir}", flush=True)
        print(f"[INFO] Extraction mode: {extraction_mode}", flush=True)

        # Create output directory early (after override)
        os.makedirs(output_dir, exist_ok=True)

        with zipfile.ZipFile(epub_path, 'r') as zf:
            # Extract metadata first
            metadata = Chapter_Extractor._extract_epub_metadata(zf)
            print(f"[INFO] Extracted metadata: {list(metadata.keys())}", flush=True)

            # Extract chapters using module-level function
            chapters = Chapter_Extractor.extract_chapters(
                zf, output_dir, progress_callback=worker_progress_callback
            )
            print(f"[INFO] Extracted {len(chapters)} chapters", flush=True)

        # Only log whether an OPF file ended up in the output directory;
        # nothing is reordered here.
        opf_path = os.path.join(output_dir, 'content.opf')
        if os.path.exists(opf_path):
            print(f"[INFO] OPF file available for chapter ordering", flush=True)

        # Save the full chapters (including body content) as one JSON file.
        chapters_full_path = os.path.join(output_dir, "chapters_full.json")
        try:
            with open(chapters_full_path, 'w', encoding='utf-8') as f:
                json.dump(chapters, f, ensure_ascii=False)
            print(f"[INFO] Saved full chapters data to: {chapters_full_path}", flush=True)
        except Exception as e:
            print(f"[WARNING] Could not save full chapters: {e}", flush=True)
            # Fall back to saving individual files, one HTML file per chapter.
            for chapter in chapters:
                try:
                    chapter_file = f"chapter_{chapter['num']:04d}_{chapter.get('filename', 'content').replace('/', '_')}.html"
                    chapter_path = os.path.join(output_dir, chapter_file)
                    with open(chapter_path, 'w', encoding='utf-8') as f:
                        f.write(chapter.get('body', ''))
                    print(f"[INFO] Saved chapter {chapter['num']} to {chapter_file}", flush=True)
                except Exception as ce:
                    print(f"[WARNING] Could not save chapter {chapter.get('num')}: {ce}", flush=True)

        # Summary for IPC: per-chapter info without the body text.
        result = {
            "success": True,
            "chapters": len(chapters),
            "metadata": metadata,
            "chapter_info": [
                {
                    "num": ch.get("num"),
                    "title": ch.get("title"),
                    "has_images": ch.get("has_images", False),
                    "file_size": ch.get("file_size", 0),
                    "content_hash": ch.get("content_hash", ""),
                }
                for ch in chapters
            ],
        }

        # Output result as JSON
        print(f"[RESULT] {json.dumps(result)}", flush=True)
        return result

    except Exception as e:
        # Report the failure through the same [RESULT] channel.
        error_info = {
            "success": False,
            "error": str(e),
            "traceback": traceback.format_exc(),
        }
        print(f"[ERROR] {str(e)}", flush=True)
        print(f"[RESULT] {json.dumps(error_info)}", flush=True)
        return error_info
def main():
    """Main entry point for worker process.

    Command line: ``<epub_path> <output_dir> [extraction_mode]``; the mode
    defaults to ``"smart"``.

    Always terminates through ``sys.exit``: status 1 on a usage error, a
    missing EPUB file, or a failed extraction; status 0 on success.
    """
    # Parse command line arguments
    if len(sys.argv) < 3:
        print("[ERROR] Usage: chapter_extraction_worker.py <epub_path> <output_dir> [extraction_mode]", flush=True)
        sys.exit(1)

    epub_path = sys.argv[1]
    output_dir = sys.argv[2]
    extraction_mode = sys.argv[3] if len(sys.argv) > 3 else "smart"

    # Validate inputs
    if not os.path.exists(epub_path):
        print(f"[ERROR] EPUB file not found: {epub_path}", flush=True)
        sys.exit(1)

    # Honor OUTPUT_DIRECTORY override for CLI entry as well. Best effort:
    # any failure only produces a warning and keeps the given directory.
    try:
        override_dir = os.getenv("OUTPUT_DIRECTORY")
        if override_dir:
            override_dir = os.path.abspath(override_dir)
            abs_output = os.path.abspath(output_dir)
            try:
                common = os.path.commonpath([abs_output, override_dir])
            except ValueError:
                # commonpath raises ValueError for paths on different drives
                # (Windows); such a path is certainly outside the override
                # directory and must be redirected, not left alone.
                common = None
            # normcase: case-insensitive comparison on Windows, no-op on POSIX.
            already_inside = (
                common is not None
                and os.path.normcase(common) == os.path.normcase(override_dir)
            )
            if already_inside:
                output_dir = abs_output
            else:
                leaf = os.path.basename(abs_output) or "output"
                output_dir = os.path.join(override_dir, leaf)
    except Exception as e:
        print(f"[WARNING] OUTPUT_DIRECTORY override failed: {e}", flush=True)

    # Create output directory if needed
    os.makedirs(output_dir, exist_ok=True)

    # Run extraction
    result = run_chapter_extraction(epub_path, output_dir, extraction_mode)

    # Exit with appropriate code
    sys.exit(0 if result.get("success", False) else 1)
if __name__ == "__main__":
    # Project helper that drives the CLI entry point.
    # NOTE(review): its exact shutdown semantics are not visible here -- confirm.
    from shutdown_utils import run_cli_main

    def _main():
        """Enable multiprocessing freeze support, then run main()."""
        # Ensure freeze support for Windows frozen exe. Best effort only:
        # any failure is deliberately ignored so the worker still starts.
        try:
            from multiprocessing import freeze_support
            freeze_support()
        except Exception:
            pass

        # main() finishes via sys.exit(); the return below is only a fallback.
        main()
        return 0

    run_cli_main(_main)
|