Spaces:
Running
Running
| # Copyright (C) 2021-2025, Mindee. | |
| # This program is licensed under the Apache License 2.0. | |
| # See LICENSE or go to <https://opensource.org/licenses/Apache-2.0> for full license details. | |
| import argparse | |
| import json | |
| import os | |
| from pathlib import Path | |
| from tqdm import tqdm | |
| from doctr.io import DocumentFile | |
| from doctr.models import detection, ocr_predictor | |
# Lower-case file suffixes (with leading dot) that are loaded as images.
IMAGE_FILE_EXTENSIONS = [".jpeg", ".jpg", ".png", ".tif", ".tiff", ".bmp"]
# Lower-case file suffixes that are loaded through the PDF reader.
OTHER_EXTENSIONS = [".pdf"]


def _process_file(model, file_path: Path, out_format: str) -> None:
    """Run ``model`` on a single file and write the result under ``output/``.

    The file suffix is matched case-insensitively against
    ``IMAGE_FILE_EXTENSIONS`` and ``OTHER_EXTENSIONS``. Files with any other
    suffix are reported on stdout and skipped without invoking the model.

    Args:
        model: callable applied to the loaded document; its result must
            provide ``export()``, ``render()`` and ``export_as_xml()``.
        file_path: path of the image or PDF to analyse.
        out_format: output format, one of ``"txt"``, ``"json"`` or ``"xml"``.

    Raises:
        ValueError: if ``out_format`` is not a supported format.
    """
    if out_format not in ["txt", "json", "xml"]:
        raise ValueError(f"Unsupported output format: {out_format}")

    # Lower-case the suffix so that e.g. "scan.PNG" is handled the same way
    # as "scan.png".
    suffix = file_path.suffix.lower()
    if suffix in IMAGE_FILE_EXTENSIONS:
        doc = DocumentFile.from_images([file_path])
    elif suffix in OTHER_EXTENSIONS:
        doc = DocumentFile.from_pdf(file_path)
    else:
        print(f"Skip unsupported file type: {file_path}")
        # Nothing was loaded, so there is nothing to feed to the model.
        return

    out = model(doc)

    out_dir = Path("output")
    # Make the function usable on its own, even if the caller has not
    # created the output directory beforehand.
    out_dir.mkdir(parents=True, exist_ok=True)

    if out_format == "xml":
        # export_as_xml() yields (bytes, tree) pairs (presumably one per
        # page); each tree is written to its own indexed file.
        for i, (_, xml_tree) in enumerate(out.export_as_xml()):
            xml_path = out_dir / f"{file_path.stem}_{i}.{out_format}"
            xml_tree.write(xml_path, encoding="utf-8", xml_declaration=True)
        return

    if out_format == "json":
        output = json.dumps(out.export(), indent=2)
    else:
        output = out.render()

    # Explicit UTF-8 so recognised non-ASCII text does not depend on the
    # platform's default encoding.
    with open(out_dir / f"{file_path.stem}.{out_format}", "w", encoding="utf-8") as f:
        f.write(output)
def main(args):
    """Build the OCR predictor and process the file or directory in ``args.path``.

    The detection architecture named by ``args.detection`` is looked up in the
    ``detection`` module and instantiated with pretrained weights and the
    requested thresholds, then combined with the ``args.recognition`` model.
    The ``output`` directory is created if missing. When ``args.path`` is a
    directory, every direct entry whose name ends with a supported extension
    (case-insensitive) is processed; otherwise the path itself is processed.

    Args:
        args: parsed command-line namespace providing ``detection``,
            ``bin_thresh``, ``box_thresh``, ``recognition``, ``path`` and
            ``format``.
    """
    build_detector = detection.__dict__[args.detection]
    detection_model = build_detector(
        pretrained=True,
        bin_thresh=args.bin_thresh,
        box_thresh=args.box_thresh,
    )
    predictor = ocr_predictor(detection_model, args.recognition, pretrained=True)

    target = Path(args.path)
    os.makedirs(name="output", exist_ok=True)

    # Single file: hand it over directly.
    if not target.is_dir():
        _process_file(predictor, target, args.format)
        return

    # Directory: keep only entries with a supported suffix.
    supported = tuple(IMAGE_FILE_EXTENSIONS + OTHER_EXTENSIONS)
    candidates = []
    for entry in target.iterdir():
        if str(entry).lower().endswith(supported):
            candidates.append(entry)

    for entry in tqdm(candidates):
        _process_file(predictor, entry, args.format)
def parse_args():
    """Parse the script's command-line arguments.

    Returns:
        argparse.Namespace with ``path``, ``detection``, ``bin_thresh``,
        ``box_thresh``, ``recognition`` and ``format`` attributes.
    """
    cli = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description="DocTR text detection",
    )

    # (flags, options) pairs, registered in the order they appear in --help.
    argument_specs = (
        (("path",), {"type": str, "help": "Path to process: PDF, image, directory"}),
        (
            ("--detection",),
            {"type": str, "default": "fast_base", "help": "Text detection model to use for analysis"},
        ),
        (
            ("--bin-thresh",),
            {"type": float, "default": 0.3, "help": "Binarization threshold for the detection model."},
        ),
        (
            ("--box-thresh",),
            {"type": float, "default": 0.1, "help": "Threshold for the detection boxes."},
        ),
        (
            ("--recognition",),
            {"type": str, "default": "crnn_vgg16_bn", "help": "Text recognition model to use for analysis"},
        ),
        (
            ("-f", "--format"),
            {"choices": ["txt", "json", "xml"], "default": "txt", "help": "Output format"},
        ),
    )
    for flags, options in argument_specs:
        cli.add_argument(*flags, **options)

    return cli.parse_args()
# Script entry point: parse the command line and run the analysis.
if __name__ == "__main__":
    main(parse_args())