# NVIDIA-Nemotron-Parse-v1.2 / vllm_example.py
# Uploaded by katerynaCh ("Upload folder using huggingface_hub").
# Commit 9591148 (verified).
import argparse
import base64
import os
from concurrent.futures import ThreadPoolExecutor, as_completed
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple
from openai import OpenAI
def _guess_mime(path: str) -> str:
    """Return the image MIME type implied by the file extension of *path*.

    The extension is compared case-insensitively. Any extension that is not
    in the table below (including a missing extension) falls back to
    ``"image/png"``.
    """
    mime_by_ext = {
        "jpg": "image/jpeg",
        "jpeg": "image/jpeg",
        "webp": "image/webp",
        "png": "image/png",
        # Formats a caller can still pass explicitly via --image; labelling
        # them as PNG would produce a data URL with the wrong media type.
        "gif": "image/gif",
        "bmp": "image/bmp",
        "tif": "image/tiff",
        "tiff": "image/tiff",
    }
    ext = Path(path).suffix.lower().lstrip(".")
    # default
    return mime_by_ext.get(ext, "image/png")
def _b64_image_data_url(path: str) -> str:
    """Read the file at *path* and return its bytes as a base64 ``data:`` URL.

    The media type placed in the URL is whatever ``_guess_mime(path)`` returns.
    """
    raw_bytes = Path(path).read_bytes()
    payload = base64.b64encode(raw_bytes).decode("utf-8")
    media_type = _guess_mime(path)
    return "data:" + media_type + ";base64," + payload
def _iter_images(paths: List[str], image_dir: Optional[str]) -> List[str]:
    """Collect the image paths to process, without duplicates.

    Args:
        paths: Explicit image paths; they come first, in the order given.
        image_dir: Optional directory to scan (non-recursively) for files
            whose names end in .png, .jpg, .jpeg or .webp. The match is
            case-insensitive, so e.g. ``IMG_0001.JPG`` is picked up too.

    Returns:
        The explicit paths followed by the discovered files, grouped by
        extension in the order png, jpg, jpeg, webp and sorted within each
        group. A path string that occurs more than once is kept only at its
        first position. A missing or non-directory *image_dir* contributes
        nothing.
    """
    found: List[str] = list(paths)
    if image_dir:
        root = Path(image_dir)
        # List the directory once; sorting up front keeps each extension
        # group sorted after filtering.
        entries = sorted(root.iterdir()) if root.is_dir() else []
        for ext in (".png", ".jpg", ".jpeg", ".webp"):
            found.extend(str(entry) for entry in entries if entry.name.lower().endswith(ext))
    # De-dupe, keep order (dicts preserve insertion order).
    return list(dict.fromkeys(found))
@dataclass(frozen=True)
class _ReqSpec:
    """Immutable description of one request: which image, and its batch index."""

    # Path of the image file to send.
    image_path: str
    # Position of this image in the input list; used to prefix output file names.
    request_idx: int
def _make_client(base_url: str) -> OpenAI:
    """Build an ``OpenAI`` client that talks to the server at *base_url*.

    The API key is read from the ``OPENAI_API_KEY`` environment variable and
    falls back to the placeholder ``"EMPTY"`` when that variable is not set.
    """
    # openai>=1.x requires an API key; vLLM ignores it by default.
    return OpenAI(
        api_key=os.getenv("OPENAI_API_KEY", "EMPTY"),
        base_url=base_url,
    )
def _run_one(
    req: _ReqSpec,
    *,
    base_url: str,
    model: str,
    prompt_text: str,
    max_tokens: int,
    temperature: float,
    extra_body: Dict[str, Any],
) -> Tuple[_ReqSpec, str]:
    """Send one image with the prompt as a chat completion and return the reply.

    Args:
        req: Which image to send.
        base_url: Base URL of the OpenAI-compatible server.
        model: Model name passed to the server.
        prompt_text: Text part of the single user message.
        max_tokens: Forwarded as ``max_tokens``.
        temperature: Forwarded as ``temperature``.
        extra_body: Forwarded as ``extra_body`` (extra request fields).

    Returns:
        ``(req, text)`` where *text* is the content of the first choice, or
        ``""`` when that content is empty or ``None``.
    """
    client = _make_client(base_url)
    # The image travels inline as a base64 data URL.
    image_part = {
        "type": "image_url",
        "image_url": {"url": _b64_image_data_url(req.image_path)},
    }
    text_part = {"type": "text", "text": prompt_text}
    user_message = {"role": "user", "content": [text_part, image_part]}

    completion = client.chat.completions.create(
        model=model,
        messages=[user_message],
        max_tokens=max_tokens,
        temperature=temperature,
        extra_body=extra_body,
    )
    reply = completion.choices[0].message.content
    if not reply:
        return req, ""
    return req, reply
def _maybe_annotate(image_path: str, generated_text: str, out_image_path: str) -> None:
    """Draw the boxes parsed from *generated_text* on the image and save it.

    Args:
        image_path: Source image to annotate.
        generated_text: Model output handed to ``extract_classes_bboxes``.
        out_image_path: Where the annotated RGB image is saved.

    Raises:
        ImportError: If Pillow or the ``postprocessing`` module is not
            importable (both are imported lazily inside this function).
    """
    # Optional visualization (similar to example_with_table_processor.py).
    from PIL import Image, ImageDraw  # local import so batching can run without pillow
    from postprocessing import extract_classes_bboxes, postprocess_text, transform_bbox_to_original

    # Open via a context manager so the source file handle is released
    # deterministically; convert() returns a separate in-memory image.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    classes, bboxes, texts = extract_classes_bboxes(generated_text)
    # NOTE(review): presumably rescales model-space boxes to pixel
    # coordinates of the original image -- confirm in postprocessing.
    bboxes = [transform_bbox_to_original(bbox, image.width, image.height) for bbox in bboxes]

    table_format = "HTML"  # latex | HTML | markdown
    text_format = "markdown"  # markdown | plain
    blank_text_in_figures = False
    # The post-processed text is not used by this function; the calls are
    # kept so any error they raise on this output still surfaces.
    # NOTE(review): confirm whether they can be dropped.
    for text, cls in zip(texts, classes):
        postprocess_text(
            text,
            cls=cls,
            table_format=table_format,
            text_format=text_format,
            blank_text_in_figures=blank_text_in_figures,
        )

    draw = ImageDraw.Draw(image)
    for bbox in bboxes:
        # Clamp the far corner so it is never left of / above the near corner.
        draw.rectangle(
            (bbox[0], bbox[1], max(bbox[0], bbox[2]), max(bbox[1], bbox[3])),
            outline="red",
            width=2,
        )
    image.save(out_image_path)
def main() -> None:
    """Run the batch: parse arguments, query the server, write the outputs.

    For every input image a ``<idx>_<name>.txt`` file with the model reply is
    written to ``--out-dir``, plus ``<idx>_<name>.annotated.jpg`` when
    annotation is enabled, and finally ``summary.txt`` mapping each processed
    image to its text file.

    Raises:
        SystemExit: If no images were given, or (after all remaining images
            have been processed and the summary written) if any request or
            annotation failed.
    """
    ap = argparse.ArgumentParser(description="vLLM OpenAI-compatible example (batch + .txt outputs).")
    ap.add_argument("--base-url", default="http://localhost:8000/v1")
    ap.add_argument("--model", default="nvidia/NVIDIA-Nemotron-Parse-v1.2")
    ap.add_argument("--image", action="append", default=[], help="Image path (repeatable).")
    ap.add_argument("--image-dir", default=None, help="Directory of images to run (png/jpg/jpeg/webp).")
    ap.add_argument("--out-dir", default="vllm_outputs", help="Where to write .txt outputs.")
    ap.add_argument("--concurrency", type=int, default=4, help="How many concurrent requests to send.")
    ap.add_argument("--max-tokens", type=int, default=8994)
    ap.add_argument("--temperature", type=float, default=0.0)
    ap.add_argument(
        "--annotate",
        action=argparse.BooleanOptionalAction,
        default=True,
        help="Write annotated images with boxes to --out-dir (default: enabled). Use --no-annotate to disable.",
    )
    args = ap.parse_args()

    image_paths = _iter_images(args.image, args.image_dir)
    if not image_paths:
        raise SystemExit("No images provided. Use --image PATH (repeatable) or --image-dir DIR.")

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    prompt_text = "</s><s><predict_bbox><predict_classes><output_markdown><predict_no_text_in_pic>"
    #prompt_text = "</s><s><predict_bbox><predict_classes><output_markdown><predict_text_in_pic>"
    extra_body = {
        "repetition_penalty": 1.1,
        "top_k": 1,
        "skip_special_tokens": False,
    }

    reqs: List[_ReqSpec] = [
        _ReqSpec(image_path=img, request_idx=idx) for idx, img in enumerate(image_paths)
    ]

    # Concurrency is the simplest way to make sure vLLM batches requests internally.
    summary_lines: List[str] = []
    failures: List[str] = []
    with ThreadPoolExecutor(max_workers=max(1, args.concurrency)) as ex:
        # Map each future back to its request so a failure can name the image.
        fut_to_req = {
            ex.submit(
                _run_one,
                r,
                base_url=args.base_url,
                model=args.model,
                prompt_text=prompt_text,
                max_tokens=args.max_tokens,
                temperature=args.temperature,
                extra_body=extra_body,
            ): r
            for r in reqs
        }
        for fut in as_completed(fut_to_req):
            req = fut_to_req[fut]
            try:
                _, text = fut.result()
            except Exception as exc:
                # One failed request must not discard the rest of the batch;
                # record it and report everything at the end.
                failures.append(f"{req.image_path}: request failed: {exc!r}")
                continue

            base = Path(req.image_path).name
            stem = f"{req.request_idx:04d}_{base}"
            out_txt = out_dir / f"{stem}.txt"
            out_txt.write_text(text, encoding="utf-8")
            summary_lines.append(f"{req.image_path}\t{out_txt}")

            if args.annotate:
                out_img = out_dir / f"{stem}.annotated.jpg"
                try:
                    _maybe_annotate(req.image_path, text, str(out_img))
                except Exception as exc:
                    # Annotation is optional (e.g. pillow may be missing);
                    # the text output above is already on disk.
                    failures.append(f"{req.image_path}: annotation failed: {exc!r}")

    (out_dir / "summary.txt").write_text("\n".join(sorted(summary_lines)) + "\n", encoding="utf-8")

    if failures:
        # SystemExit with a message prints it to stderr and exits with status 1.
        raise SystemExit(
            f"{len(failures)} problem(s) while processing {len(reqs)} image(s):\n" + "\n".join(failures)
        )


if __name__ == "__main__":
    main()