# HEIC → Markdown/LaTeX OCR command-line tool.
| from __future__ import annotations | |
| import argparse | |
| from pathlib import Path | |
| import pillow_heif | |
| import torch | |
| from PIL import Image | |
| from transformers import AutoModelForImageTextToText, AutoProcessor | |
| pillow_heif.register_heif_opener() | |
| DEFAULT_MODEL_ID = "THUDM/glm-4v-9b" | |
def parse_args() -> argparse.Namespace:
    """Build the CLI and parse ``sys.argv`` into a namespace.

    Returns:
        argparse.Namespace with ``input``, ``output``, ``format``,
        ``model_id``, ``max_new_tokens`` and ``device`` attributes.
    """
    description = (
        "Convert a HEIC image into Markdown or LaTeX using a GLM OCR-capable vision model "
        "from Hugging Face."
    )
    cli = argparse.ArgumentParser(description=description)

    # Positional: the image to transcribe.
    cli.add_argument("input", type=Path, help="Input .heic/.heif image path")

    # Optional output location; derived from the input stem when omitted.
    cli.add_argument(
        "--output",
        type=Path,
        help="Output file path. Defaults to input stem + .md/.tex based on --format.",
    )
    cli.add_argument(
        "--format",
        choices=["md", "latex"],
        default="md",
        help="Output format for OCR transcription.",
    )
    cli.add_argument(
        "--model-id",
        default=DEFAULT_MODEL_ID,
        help="Hugging Face model id for GLM OCR-style VLM inference.",
    )
    # Generation budget for the decoder.
    cli.add_argument("--max-new-tokens", type=int, default=2048)
    cli.add_argument(
        "--device",
        choices=["auto", "cpu", "cuda"],
        default="auto",
        help="Run inference on CPU/CUDA, or auto-detect.",
    )
    return cli.parse_args()
def build_prompt(target_format: str) -> str:
    """Return the OCR instruction prompt for the requested output format.

    Args:
        target_format: "latex" selects the LaTeX prompt; anything else
            (in practice "md") selects the Markdown prompt.
    """
    latex_prompt = (
        "You are an OCR engine. Read the image exactly and return clean LaTeX only. "
        "Keep math in proper LaTeX syntax and preserve document structure where possible. "
        "Do not add explanations."
    )
    markdown_prompt = (
        "You are an OCR engine. Read the image exactly and return clean Markdown only. "
        "Use standard markdown headings/lists/tables where appropriate and preserve equations "
        "using $...$ or $$...$$. Do not add explanations."
    )
    return latex_prompt if target_format == "latex" else markdown_prompt
def load_model(model_id: str, device: str):
    """Load the processor and image-text-to-text model for *model_id*.

    Args:
        model_id: Hugging Face Hub model identifier.
        device: "cpu", "cuda", or "auto" (use CUDA when available).

    Returns:
        Tuple of (processor, model).

    Raises:
        RuntimeError: if ``device == "cuda"`` but CUDA is unavailable.
    """
    if device == "cuda" and not torch.cuda.is_available():
        raise RuntimeError("--device cuda was requested but CUDA is not available.")

    # BUGFIX: dtype previously keyed on torch.cuda.is_available() alone, so
    # --device cpu on a CUDA machine loaded float16 weights on CPU (slow and
    # some CPU ops lack fp16 kernels). Use fp16 only when we will run on CUDA.
    use_cuda = device != "cpu" and torch.cuda.is_available()
    torch_dtype = torch.float16 if use_cuda else torch.float32

    # Pin everything to CPU when asked; otherwise let accelerate place shards.
    device_map = {"": "cpu"} if device == "cpu" else "auto"

    processor = AutoProcessor.from_pretrained(model_id, trust_remote_code=True)
    model = AutoModelForImageTextToText.from_pretrained(
        model_id,
        torch_dtype=torch_dtype,
        device_map=device_map,
        trust_remote_code=True,
    )
    return processor, model
def run_ocr(
    image_path: Path, target_format: str, model_id: str, max_new_tokens: int, device: str
) -> str:
    """Transcribe one image to Markdown/LaTeX via a chat-style vision model.

    Args:
        image_path: Path to the image file (HEIC/HEIF supported via pillow-heif).
        target_format: "md" or "latex"; selects the instruction prompt.
        model_id: Hugging Face model id to load.
        max_new_tokens: Generation budget for the decoder.
        device: "cpu", "cuda", or "auto" (forwarded to ``load_model``).

    Returns:
        The decoded transcription with surrounding whitespace stripped.
    """
    source_image = Image.open(image_path).convert("RGB")
    instruction = build_prompt(target_format)
    processor, model = load_model(model_id, device)

    # Single-turn chat: one image followed by the OCR instruction.
    conversation = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": source_image},
                {"type": "text", "text": instruction},
            ],
        }
    ]
    chat_text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    model_inputs = processor(text=[chat_text], images=[source_image], return_tensors="pt")
    # Move tensor entries to the model's device; pass anything else through unchanged.
    model_inputs = {
        key: value.to(model.device) if hasattr(value, "to") else value
        for key, value in model_inputs.items()
    }

    with torch.inference_mode():
        token_ids = model.generate(**model_inputs, max_new_tokens=max_new_tokens)

    # Trim the prompt tokens so only newly generated text is decoded.
    prompt_length = model_inputs["input_ids"].shape[-1]
    decoded = processor.batch_decode(token_ids[:, prompt_length:], skip_special_tokens=True)
    return decoded[0].strip()
| def resolve_output_path(image_path: Path, output: Path | None, target_format: str) -> Path: | |
| if output is not None: | |
| return output | |
| extension = ".md" if target_format == "md" else ".tex" | |
| return image_path.with_suffix(extension) | |
def main() -> None:
    """CLI entry point: validate the input, run OCR, write the transcription.

    Raises:
        FileNotFoundError: if the input image does not exist.
    """
    args = parse_args()
    source = args.input
    if not source.exists():
        raise FileNotFoundError(f"Input file not found: {source}")

    destination = resolve_output_path(source, args.output, args.format)
    # Create intermediate directories so write_text below cannot fail on them.
    destination.parent.mkdir(parents=True, exist_ok=True)

    transcription = run_ocr(
        image_path=source,
        target_format=args.format,
        model_id=args.model_id,
        max_new_tokens=args.max_new_tokens,
        device=args.device,
    )
    destination.write_text(transcription, encoding="utf-8")
    print(f"Saved {args.format} output to: {destination}")


if __name__ == "__main__":
    main()