"""Convert EPUB and CHM files found under a directory tree to plain-text files."""

import os
import argparse
import logging
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed

from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Loader candidates per lower-cased file extension. Each entry is a list of
# (loader class, constructor kwargs) pairs that are tried in order until one
# of them succeeds.
_LOADERS = {
    ".epub": [(UnstructuredEPubLoader, {"mode": "elements"})],
    ".chm": [(UnstructuredFileLoader, {"mode": "elements"})],
}


def convert_file(file_path, output_dir, output_name=None):
    """Convert a single supported file to a UTF-8 text file.

    The loaders registered for the file's extension (matched
    case-insensitively) are tried in order; the first one that loads the
    file and whose elements are written out without error wins.

    Args:
        file_path: Path of the input file (str or Path).
        output_dir: Directory the text file is written to; created if missing.
        output_name: File name to use inside ``output_dir``. Defaults to
            ``<input stem>.txt``.

    Returns:
        True if the file was converted, False if the extension is not
        supported or every loader failed. Failures are logged, not raised.
    """
    file_path = Path(file_path)
    if output_name is None:
        output_name = f"{file_path.stem}.txt"
    output_path = Path(output_dir) / output_name

    loader_options = _LOADERS.get(file_path.suffix.lower(), [])
    if not loader_options:
        logger.warning("Unsupported file type: %s", file_path)
        return False

    for loader_class, loader_args in loader_options:
        try:
            loader = loader_class(str(file_path), **loader_args)
            elements = loader.load()

            # Make direct calls work even when the caller did not create
            # the output directory beforehand.
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as txt_file:
                for elem in elements:
                    if hasattr(elem, 'page_content'):
                        txt_file.write(elem.page_content + "\n\n")
                    elif isinstance(elem, str):
                        txt_file.write(elem + "\n\n")
                    else:
                        logger.warning("Unexpected element type: %s", type(elem))
        except Exception as e:
            # Deliberately broad: any loader/IO failure means "try the next
            # loader", never "abort the whole batch".
            logger.error(
                "Failed to convert %s with %s: %s",
                file_path, loader_class.__name__, e,
            )
        else:
            logger.info("Converted %s to %s", file_path, output_path)
            return True  # Successfully converted, exit the function

    logger.error("All conversion attempts failed for %s", file_path)
    return False


def _unique_output_name(stem, taken):
    """Return a ``.txt`` name for *stem* that is not in *taken*, and record it.

    Names are compared case-insensitively so the result is also safe on
    case-insensitive filesystems. The first user of a stem keeps
    ``<stem>.txt``; later ones get ``<stem>_2.txt``, ``<stem>_3.txt``, ...
    """
    candidate = f"{stem}.txt"
    counter = 2
    while candidate.lower() in taken:
        candidate = f"{stem}_{counter}.txt"
        counter += 1
    taken.add(candidate.lower())
    return candidate


def convert_directory(input_dir, output_dir):
    """Convert every supported file below ``input_dir`` in parallel.

    Files are discovered recursively; the extension is matched
    case-insensitively against the supported types. All text files are
    written flat into ``output_dir`` (created if missing). Inputs that would
    map to the same output name get a numeric suffix instead of overwriting
    each other.

    Args:
        input_dir: Root directory to search (str or Path).
        output_dir: Directory receiving the ``.txt`` files (str or Path).

    Raises:
        Exception: Whatever escapes a worker process (``convert_file`` itself
            logs and swallows ordinary conversion errors).
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    # Sorted so that collision suffixes are assigned deterministically.
    files_to_convert = sorted(
        path
        for path in input_dir.rglob('*')
        if path.is_file() and path.suffix.lower() in _LOADERS
    )
    if not files_to_convert:
        logger.warning("No supported files found in %s", input_dir)
        return

    # Assign output names up front, in the parent process, so that parallel
    # workers never write to the same file.
    taken_names = set()
    jobs = []
    for file in files_to_convert:
        output_name = _unique_output_name(file.stem, taken_names)
        if output_name != f"{file.stem}.txt":
            logger.warning(
                "Output name for %s is already in use; writing to %s instead",
                file, output_name,
            )
        jobs.append((file, output_name))

    succeeded = 0
    with ProcessPoolExecutor() as executor:
        futures = [
            executor.submit(convert_file, file, output_dir, output_name)
            for file, output_name in jobs
        ]
        for future in as_completed(futures):
            # result() re-raises any exception that occurred during execution.
            if future.result():
                succeeded += 1

    logger.info("Converted %d of %d files", succeeded, len(jobs))


def main():
    """Parse the command-line arguments and run the directory conversion."""
    parser = argparse.ArgumentParser(description="Convert EPUB and CHM files to TXT format.")
    parser.add_argument("input_dir", help="Directory containing the input files")
    parser.add_argument("output_dir", help="Directory to save the converted TXT files")
    args = parser.parse_args()

    logger.info("Starting conversion from %s to %s", args.input_dir, args.output_dir)
    convert_directory(args.input_dir, args.output_dir)
    logger.info("Conversion completed")


if __name__ == "__main__":
    main()