File size: 2,164 Bytes
b28505d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0

import argparse

from nemotron_ocr.inference.pipeline import NemotronOCR


def main(image_path, merge_level, no_visualize, model_dir, lang):
    """Run the OCR pipeline on one image and print every prediction.

    Args:
        image_path: Image location handed to the pipeline call.
        merge_level: Forwarded to the pipeline as ``merge_level``.
        no_visualize: When true, the pipeline is called with ``visualize=False``.
        model_dir: If not ``None``, the pipeline is constructed with this
            ``model_dir`` and ``lang`` is not used.
        lang: Passed to the pipeline constructor only when ``model_dir`` is ``None``.
    """
    # Exactly one constructor keyword is supplied: a given model directory
    # takes precedence over the language selection.
    init_kwargs = {"lang": lang} if model_dir is None else {"model_dir": model_dir}
    engine = NemotronOCR(**init_kwargs)

    regions = engine(image_path, merge_level=merge_level, visualize=not no_visualize)

    print(f"Found {len(regions)} text regions.")
    for region in regions:
        print(
            f"  - Text: '{region['text']}', "
            f"Confidence: {region['confidence']:.2f}, "
            f"Bbox: [left={region['left']:.4f}, upper={region['upper']:.4f}, "
            f"right={region['right']:.4f}, lower={region['lower']:.4f}]"
        )


if __name__ == "__main__":
    # Command-line entry point: declare the arguments as a table, register
    # them in order (so the generated help is laid out the same), then call main().
    cli = argparse.ArgumentParser(description="Run OCR inference and annotate image.")

    argument_table = [
        (
            "image_path",
            {"type": str, "help": "Path to the input image."},
        ),
        (
            "--merge-level",
            {
                "type": str,
                "choices": ["word", "sentence", "paragraph"],
                "default": "paragraph",
                "help": "Merge level for OCR output (word, sentence, paragraph).",
            },
        ),
        (
            "--no-visualize",
            {"action": "store_true", "help": "Do not save the annotated image."},
        ),
        (
            "--model-dir",
            {
                "type": str,
                "default": None,
                "help": "Path to a directory with detector.pth, recognizer.pth, relational.pth, charset.txt. "
                "If omitted, weights are downloaded from Hugging Face (default: v2 multilingual).",
            },
        ),
        (
            "--lang",
            {
                "type": str,
                "choices": ["en", "multi", "v1"],
                "default": None,
                "help": "Hub checkpoint when --model-dir is omitted: en=v2 English, multi=v2 multilingual (default), v1=legacy.",
            },
        ),
    ]
    for flag, settings in argument_table:
        cli.add_argument(flag, **settings)

    opts = cli.parse_args()

    main(
        image_path=opts.image_path,
        merge_level=opts.merge_level,
        no_visualize=opts.no_visualize,
        model_dir=opts.model_dir,
        lang=opts.lang,
    )