rag / convert.py
poemsforaphrodite's picture
Upload folder using huggingface_hub
8e0205b verified
import os
import argparse
import logging
from pathlib import Path
from concurrent.futures import ProcessPoolExecutor, as_completed
from langchain_community.document_loaders import UnstructuredEPubLoader
from langchain_community.document_loaders.unstructured import UnstructuredFileLoader
# Configure logging once at import time: INFO level and above, with each
# record rendered as "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
def convert_file(file_path, output_dir):
    """Convert one EPUB or CHM file into a UTF-8 text file.

    The output file is named after the input file's stem with a ``.txt``
    extension and is placed directly inside ``output_dir``. The loader is
    chosen from the (lower-cased) file suffix; unsupported suffixes are
    logged as a warning and skipped.

    Each loaded element is written followed by a blank line. Elements that
    expose ``page_content`` contribute that attribute, plain strings are
    written as-is, and anything else is logged and skipped.

    Failures are logged rather than raised, so the function always returns
    ``None``.

    Args:
        file_path: Path of the file to convert.
        output_dir: Directory that receives the ``.txt`` file.
    """
    source = Path(file_path)
    destination = Path(output_dir) / f"{source.stem}.txt"

    # Candidate (loader class, keyword arguments) pairs per supported suffix.
    candidates_by_suffix = {
        ".epub": [(UnstructuredEPubLoader, {"mode": "elements"})],
        ".chm": [(UnstructuredFileLoader, {"mode": "elements"})],
    }
    candidates = candidates_by_suffix.get(source.suffix.lower(), [])

    if not candidates:
        logger.warning(f"Unsupported file type: {source}")
        return

    for loader_cls, loader_kwargs in candidates:
        try:
            documents = loader_cls(str(source), **loader_kwargs).load()
            with open(destination, 'w', encoding='utf-8') as out:
                for doc in documents:
                    if hasattr(doc, 'page_content'):
                        text = doc.page_content
                    elif isinstance(doc, str):
                        text = doc
                    else:
                        logger.warning(f"Unexpected element type: {type(doc)}")
                        continue
                    out.write(text + "\n\n")
            logger.info(f"Converted {source} to {destination}")
            return  # First loader that succeeds wins.
        except Exception as e:
            logger.error(f"Failed to convert {source} with {loader_cls.__name__}: {str(e)}")

    # Only reached when no candidate loader produced an output file.
    logger.error(f"All conversion attempts failed for {source}")
def convert_directory(input_dir, output_dir):
    """Convert every EPUB and CHM file found under ``input_dir`` to text.

    ``input_dir`` is searched recursively and each matching regular file is
    handed to ``convert_file`` in a worker process. ``output_dir`` is created
    (including missing parents) if it does not already exist.

    Args:
        input_dir: Directory that is searched recursively for input files.
        output_dir: Directory passed to ``convert_file`` as the destination.

    Raises:
        Exception: Anything raised by a worker future is re-raised here when
            its result is collected.
    """
    input_dir = Path(input_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    supported_suffixes = {'.epub', '.chm'}
    # Match on the lower-cased suffix instead of globbing for "*.epub" and
    # "*.chm": glob patterns are case-sensitive on case-sensitive file
    # systems, so files such as "BOOK.EPUB" would otherwise be missed even
    # though convert_file accepts them. Directories whose names merely end in
    # a supported suffix are excluded by the is_file() check.
    files_to_convert = sorted(
        path
        for path in input_dir.rglob('*')
        if path.is_file() and path.suffix.lower() in supported_suffixes
    )

    if not files_to_convert:
        # Nothing to convert, so there is no reason to create a process pool.
        return

    with ProcessPoolExecutor() as executor:
        futures = [executor.submit(convert_file, path, output_dir) for path in files_to_convert]
        for future in as_completed(futures):
            future.result()  # Re-raises any exception from the worker.
def main():
    """Command-line entry point.

    Reads two positional arguments, ``input_dir`` and ``output_dir``, then
    runs ``convert_directory`` on them, logging a message before and after.
    """
    cli = argparse.ArgumentParser(description="Convert EPUB and CHM files to TXT format.")
    positional_arguments = (
        ("input_dir", "Directory containing the input files"),
        ("output_dir", "Directory to save the converted TXT files"),
    )
    for arg_name, arg_help in positional_arguments:
        cli.add_argument(arg_name, help=arg_help)
    options = cli.parse_args()

    source_dir = options.input_dir
    target_dir = options.output_dir
    logger.info(f"Starting conversion from {source_dir} to {target_dir}")
    convert_directory(source_dir, target_dir)
    logger.info("Conversion completed")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()