Spaces:
Paused
Paused
| import os | |
| import argparse | |
| import logging | |
| from pathlib import Path | |
| from concurrent.futures import ProcessPoolExecutor, as_completed | |
| from langchain_community.document_loaders import UnstructuredEPubLoader | |
| from langchain_community.document_loaders.unstructured import UnstructuredFileLoader | |
# Configure the root logger once at import time: INFO and above, with each
# record prefixed by its timestamp and level name.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)
def convert_file(file_path, output_dir):
    """Convert one EPUB or CHM file to a UTF-8 text file in ``output_dir``.

    The output is named after the input's stem (``book.epub`` becomes
    ``book.txt``). Each loader registered for the file's suffix is tried in
    order until one of them loads the document. Failures are logged instead
    of raised so that a single bad file does not abort a batch run.

    Args:
        file_path: Path (``str`` or ``Path``) of the file to convert. The
            suffix is matched case-insensitively.
        output_dir: Directory that receives the ``.txt`` file. It is created
            if it does not exist yet.

    Returns:
        None. The outcome is reported through the module logger.
    """
    file_path = Path(file_path)
    output_path = Path(output_dir) / f"{file_path.stem}.txt"

    loaders = {
        ".epub": [(UnstructuredEPubLoader, {"mode": "elements"})],
        ".chm": [(UnstructuredFileLoader, {"mode": "elements"})],
    }
    loader_options = loaders.get(file_path.suffix.lower(), [])
    if not loader_options:
        logger.warning(f"Unsupported file type: {file_path}")
        return

    for loader_class, loader_args in loader_options:
        # Guard only loading and text assembly here, so that a disk error
        # further down is not misreported as a loader failure.
        try:
            elements = loader_class(str(file_path), **loader_args).load()
            chunks = []
            for elem in elements:
                if hasattr(elem, 'page_content'):
                    chunks.append(elem.page_content + "\n\n")
                elif isinstance(elem, str):
                    chunks.append(elem + "\n\n")
                else:
                    logger.warning(f"Unexpected element type: {type(elem)}")
        except Exception as e:
            # Broad on purpose: third-party loaders raise many unrelated
            # exception types, and the next loader should still get a chance.
            logger.error(f"Failed to convert {file_path} with {loader_class.__name__}: {str(e)}")
            continue

        # A write failure has nothing to do with the loader, so report it as
        # such and stop instead of retrying with the next loader.
        try:
            output_path.parent.mkdir(parents=True, exist_ok=True)
            with open(output_path, 'w', encoding='utf-8') as txt_file:
                txt_file.writelines(chunks)
        except (OSError, UnicodeError) as e:
            logger.error(f"Failed to write {output_path}: {e}")
            return

        logger.info(f"Converted {file_path} to {output_path}")
        return  # Successfully converted, exit the function

    logger.error(f"All conversion attempts failed for {file_path}")
def convert_directory(input_dir, output_dir):
    """Convert every EPUB and CHM file under ``input_dir`` in parallel.

    The input tree is searched recursively. Suffixes are matched
    case-insensitively, the same way ``convert_file`` matches them, and only
    regular files are considered. Each file is handed to ``convert_file`` in
    a worker process.

    Args:
        input_dir: Root directory (``str`` or ``Path``) to search.
        output_dir: Directory that receives the ``.txt`` files. It is created,
            including missing parents, if it does not exist yet.

    Returns:
        None.

    Raises:
        Exception: Whatever a worker raised, or a process-pool failure, is
            re-raised by ``future.result()``.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    supported_suffixes = {'.epub', '.chm'}
    # Filter on the lower-cased suffix instead of globbing "*.epub": glob
    # patterns are case-sensitive on POSIX and would miss "BOOK.EPUB".
    # is_file() keeps directories that merely end in ".epub" out of the pool.
    files_to_convert = sorted(
        path
        for path in input_dir.rglob('*')
        if path.suffix.lower() in supported_suffixes and path.is_file()
    )

    if not files_to_convert:
        # Nothing to do, so do not start a process pool at all.
        logger.warning(f"No .epub or .chm files found under {input_dir}")
        return

    # Outputs are named after the input stem only, so two inputs with the
    # same stem (different folders or different extensions) share one target.
    first_by_stem = {}
    for path in files_to_convert:
        first = first_by_stem.setdefault(path.stem, path)
        if first is not path:
            logger.warning(
                f"{path} and {first} both convert to {path.stem}.txt; "
                "one output will overwrite the other"
            )

    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(convert_file, file, output_dir) for file in files_to_convert]
        for future in as_completed(futures):
            future.result()  # This will raise any exceptions that occurred during execution
def main():
    """Command-line entry point: read both directory arguments and run the batch conversion."""
    cli = argparse.ArgumentParser(
        description="Convert EPUB and CHM files to TXT format.",
    )
    # Both arguments are required positionals, registered in call order.
    positionals = (
        ("input_dir", "Directory containing the input files"),
        ("output_dir", "Directory to save the converted TXT files"),
    )
    for arg_name, arg_help in positionals:
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    source, destination = options.input_dir, options.output_dir
    logger.info(f"Starting conversion from {source} to {destination}")
    convert_directory(source, destination)
    logger.info("Conversion completed")
# Run the CLI only when executed as a script, not on import. convert_directory
# uses ProcessPoolExecutor; under the "spawn" start method each worker
# re-imports this module, so an unguarded main() would run again in every worker.
if __name__ == "__main__":
    main()