Spaces:

libacc
/

claveros-extract

Sleeping

App Files Files Community

claveros-extract / app.py

libacc

Fix: remove allow_flagging, use dtype instead of torch_dtype

0fb262b verified about 1 month ago

raw

history blame contribute delete

8.61 kB

	"""
	Claveros 4-page extraction Space — ZeroGPU on H200.

	Processes 4-page slim claveros PDFs:
	Page 0 = Nivelación → votantes_e11, votos_urna, votos_incinerados
	Page 1 = Verde (3020) → verde_lista, cand_7, verde_total
	Page 2 = Especiales → votos_blancos, votos_nulos, votos_no_marcados
	Page 3 = Constancias → constancias text, hubo_recuento, firmas_count

	Call via Gradio Client:
	from gradio_client import Client
	client = Client("libacc/claveros-extract")
	result = client.predict(pdf_file, api_name="/extract")

	Co-Authored-By: Oz <oz-agent@warp.dev>
	"""

	import json
	import os
	import spaces
	import gradio as gr
	import torch
	import fitz # PyMuPDF
	from PIL import Image
	from transformers import AutoModelForImageTextToText, AutoProcessor
	from qwen_vl_utils import process_vision_info

	# ── Model (loaded at module level for ZeroGPU CUDA emulation) ─────────
	# Hub identifier handed to both from_pretrained() calls below.
	MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
	# Rasterization resolution; render_page() scales each PDF page by DPI / 72.
	DPI = 300

	# Loading happens once, as an import-time side effect of this module.
	print(f"Loading {MODEL_ID}...")
	# Weights are requested in bfloat16; device placement is delegated to
	# the loader via device_map="auto".
	model = AutoModelForImageTextToText.from_pretrained(
	MODEL_ID,
	dtype=torch.bfloat16,
	device_map="auto",
	)
	# NOTE(review): min_pixels / max_pixels look like lower and upper bounds on
	# image size expressed in 28x28-pixel units — confirm against the
	# processor documentation for this model.
	processor = AutoProcessor.from_pretrained(
	MODEL_ID, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
	)
	print("Model loaded.")

	# ── Prompts ───────────────────────────────────────────────────────────
	# One prompt per page of the 4-page PDF. Each asks the model to answer with
	# a single JSON object, which extract_form() feeds to parse_json().
	#
	# NOTE(review): several prompts contain `\|`. That is not a recognized
	# Python escape sequence, so the backslash is kept in the prompt text and
	# newer Python versions emit a warning for it — confirm whether a plain
	# `|` was intended.

	# Page 0 (Nivelación): three handwritten totals plus printed dept/muni/mesa.
	PROMPT_NIV = """\
	E-14 CLAVEROS SENADO — NIVELACIÓN page.
	Read handwritten digit boxes:
	1. "TOTAL VOTANTES FORMULARIO E-11" → votantes_e11
	2. "TOTAL VOTOS DE SENADO EN LA URNA" → votos_urna
	3. "TOTAL VOTOS INCINERADOS" → votos_incinerados (often 0)
	Also read printed: dept (2-digit), muni (3-digit), mesa.
	KIT/Form numbers at bottom are NOT votes.
	Each row: 3 boxes (hundreds\|tens\|ones). Empty=0.
	Return ONLY:
	{"votantes_e11": <int>, "votos_urna": <int>, "votos_incinerados": <int>, "dept": "<str>", "muni": "<str>", "mesa": "<str>"}"""

	# Page 1 (party 3020): list-only votes, candidate row 7, and the party total.
	PROMPT_VERDE = """\
	E-14 CLAVEROS SENADO — ALIANZA POR COLOMBIA (3020).
	Read 3 handwritten values from digit boxes (hundreds\|tens\|ones, empty=0):
	1) "VOTOS SOLO POR LA LISTA" (row 0) → verde_lista
	2) Row "7" — handwritten boxes RIGHT of printed "7" → cand_7
	3) "TOTAL AGRUPACIÓN POLÍTICA" (bottom) → verde_total
	Printed numbers 1-100 are ROW LABELS, not votes. KIT/Form numbers are NOT votes.
	VERIFY: verde_lista ≤ verde_total AND cand_7 ≤ verde_total.
	Return ONLY:
	{"verde_lista": <int>, "cand_7": <int>, "verde_total": <int>}"""

	# Second-pass prompt for page 1. extract_form() fills {prev} through
	# str.format(), which is why the literal JSON template uses doubled braces.
	PROMPT_VERDE_RETRY = """\
	Re-read. Previous: {prev}. Common errors: 1 misread as 7, 0 as 6, \
	printed row label "7" used as vote, KIT number used as total.
	Constraints: verde_lista ≤ verde_total, cand_7 ≤ verde_total.
	Return ONLY:
	{{"verde_lista": <int>, "cand_7": <int>, "verde_total": <int>}}"""

	# Page 2 (Especiales): blank, null and unmarked vote counts.
	PROMPT_ESP = """\
	E-14 CLAVEROS SENADO — VOTOS ESPECIALES.
	Read 3 rows (3 digit boxes each, empty=0):
	1) VOTOS EN BLANCO → votos_blancos
	2) VOTOS NULOS → votos_nulos
	3) VOTOS NO MARCADOS → votos_no_marcados
	Handwritten 0 can look like 6 — recheck if values seem high.
	Return ONLY:
	{"votos_blancos": <int>, "votos_nulos": <int>, "votos_no_marcados": <int>}"""

	# Page 3 (Constancias): free-text transcription, recount flag and signature
	# count. extract_form() sends this one with max_tokens=1500.
	PROMPT_CONST = """\
	E-14 CLAVEROS SENADO — CONSTANCIAS page.
	1) Transcribe ALL handwritten text in "CONSTANCIAS DE LOS JURADOS" box. \
	Preserve original Spanish exactly. Empty box = "".
	2) "¿HUBO RECUENTO DE VOTOS?" — "si", "no", or "unclear".
	3) Count signature boxes (FIRMA JURADO 1-6) that have signatures (0-6).
	Return ONLY:
	{"constancias": "<text>", "hubo_recuento": "si"\|"no"\|"unclear", "firmas_count": <int>}"""


	# ── Helpers ────────────────────────────────────────────────────────────

	def render_page(pdf_path, page_idx):
	    """Rasterize one page of a PDF into an RGB PIL image.

	    The page is rendered with a zoom factor of ``DPI / 72`` on both axes.
	    An index at or beyond the end of the document is clamped to the last
	    page. If the rendered image is wider than it is tall, it is rotated
	    by 90 degrees with ``expand=True``.

	    Args:
	        pdf_path: Location of the PDF, in any form ``fitz.open`` accepts.
	        page_idx: Zero-based index of the page to render.

	    Returns:
	        The rendered page as a ``PIL.Image.Image`` in ``"RGB"`` mode.

	    Raises:
	        ValueError: If the document contains no pages.
	    """
	    zoom = DPI / 72
	    # The context manager closes the document even when rendering raises;
	    # a plain open()/close() pair leaked the handle on any exception.
	    with fitz.open(pdf_path) as doc:
	        page_count = len(doc)
	        if page_count == 0:
	            raise ValueError(f"PDF has no pages: {pdf_path!r}")
	        if page_idx >= page_count:
	            # Best-effort: short PDFs fall back to their last page.
	            page_idx = page_count - 1
	        pix = doc[page_idx].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
	        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
	    if img.width > img.height:
	        img = img.rotate(90, expand=True)
	    return img


	def vlm_call(img, prompt, max_tokens=120):
	    """Ask the vision-language model one question about one image.

	    Builds a two-turn chat (fixed system instruction, then a user turn
	    holding the image and the prompt), runs generation with sampling
	    disabled, and decodes only the newly generated tokens.

	    Args:
	        img: Image placed in the user turn.
	        prompt: Text placed after the image in the user turn.
	        max_tokens: Passed to ``generate`` as ``max_new_tokens``.

	    Returns:
	        The decoded reply with special tokens skipped.
	    """
	    system_turn = {
	        "role": "system",
	        "content": [
	            {"type": "text", "text": "You are a careful OCR assistant. /no_think"},
	        ],
	    }
	    user_turn = {
	        "role": "user",
	        "content": [
	            {"type": "image", "image": img},
	            {"type": "text", "text": prompt},
	        ],
	    }
	    conversation = [system_turn, user_turn]

	    chat_text = processor.apply_chat_template(
	        conversation, tokenize=False, add_generation_prompt=True
	    )
	    images, videos = process_vision_info(conversation)
	    batch = processor(
	        text=[chat_text],
	        images=images,
	        videos=videos,
	        padding=True,
	        return_tensors="pt",
	    )
	    batch = batch.to(model.device)

	    with torch.no_grad():
	        generated = model.generate(
	            **batch, max_new_tokens=max_tokens, do_sample=False
	        )

	    # generate() returns prompt + completion; drop the prompt tokens.
	    prompt_len = batch["input_ids"].shape[1]
	    reply_ids = generated[0, prompt_len:]
	    return processor.decode(reply_ids, skip_special_tokens=True)


	def parse_json(text):
	    """Extract the first JSON object from a model reply.

	    Copes with wrappers around the payload: a ``<think>...</think>``
	    preamble (terminated or not), Markdown code fences on one or several
	    lines, and prose before or after the object.

	    Args:
	        text: Raw decoded model output.

	    Returns:
	        The first JSON value in the reply that decodes to a ``dict``.
	        When there is none, ``{"_parse_error": True, "_raw": text[:500]}``.
	        The result is always a dict, so callers may use ``.get`` on it.
	    """
	    think_open, think_close = "<think>", "</think>"
	    clean = text.strip()
	    if think_open in clean:
	        end = clean.find(think_close)
	        if end >= 0:
	            clean = clean[end + len(think_close):]
	        else:
	            # Unterminated reasoning block: keep whatever follows the tag.
	            clean = clean[clean.find(think_open) + len(think_open):]

	    # raw_decode() parses a single JSON value at the start of its input and
	    # ignores anything after it, so fences and trailing prose need no
	    # special stripping. Each "{" is tried in turn until one decodes to an
	    # object; scalars and arrays are rejected.
	    decoder = json.JSONDecoder()
	    start = clean.find("{")
	    while start >= 0:
	        try:
	            candidate, _ = decoder.raw_decode(clean[start:])
	        except json.JSONDecodeError:
	            candidate = None
	        if isinstance(candidate, dict):
	            return candidate
	        start = clean.find("{", start + 1)
	    return {"_parse_error": True, "_raw": text[:500]}


	def to_int(v):
	    """Coerce a value to ``int``, returning 0 when that is not possible.

	    Args:
	        v: Any value. Ints pass through, floats are truncated, and strings
	            are stripped and have commas removed before conversion.

	    Returns:
	        The integer value, or 0 for unparseable strings, NaN or infinite
	        floats, and every other type (including ``None``).
	    """
	    if isinstance(v, bool):
	        # bool is a subclass of int; normalize so callers get a plain int.
	        return int(v)
	    if isinstance(v, int):
	        return v
	    if isinstance(v, float):
	        try:
	            return int(v)
	        except (ValueError, OverflowError):
	            # int(nan) raises ValueError, int(inf) raises OverflowError.
	            return 0
	    if isinstance(v, str):
	        try:
	            return int(v.strip().replace(",", ""))
	        except ValueError:
	            return 0
	    return 0


	# ── Main extraction (single GPU burst for all 4 pages) ────────────────

	@spaces.GPU(duration=120)
	def extract_form(pdf_path):
	    """Run all four page extractions for one PDF and return them as JSON.

	    Pages 0-3 are rendered, sent to the model with their page-specific
	    prompt, and parsed. Each page is handled independently: an exception
	    on one page is recorded as ``{"_error": "<message>"}`` under that
	    page's key and the remaining pages are still processed.

	    Args:
	        pdf_path: Path of the PDF, forwarded to ``render_page``.

	    Returns:
	        A JSON string with the keys ``nivelacion``, ``verde``,
	        ``especiales``, ``constancias`` and ``elapsed_s`` (wall-clock
	        seconds, rounded to one decimal), in that order.
	    """
	    import time

	    started = time.time()
	    result = {}

	    def _ocr(page_idx, prompt, **gen_kwargs):
	        # Render a page, query the model, and return (image, parsed reply).
	        page_img = render_page(pdf_path, page_idx)
	        reply = vlm_call(page_img, prompt, **gen_kwargs)
	        return page_img, parse_json(reply)

	    def _guarded(section, reader):
	        # Store reader()'s value under `section`, or its error message.
	        try:
	            result[section] = reader()
	        except Exception as exc:
	            result[section] = {"_error": str(exc)}

	    def _nivelacion():
	        return _ocr(0, PROMPT_NIV)[1]

	    def _verde():
	        page_img, fields = _ocr(1, PROMPT_VERDE)

	        lista = to_int(fields.get("verde_lista", 0))
	        cand = to_int(fields.get("cand_7", 0))
	        total = to_int(fields.get("verde_total", 0))

	        # Ask again when a component exceeds a non-zero total, or when the
	        # candidate count reaches the hard-coded threshold of 50.
	        exceeds_total = total > 0 and (lista > total or cand > total)
	        if exceeds_total or cand >= 50:
	            retry_prompt = PROMPT_VERDE_RETRY.format(prev=json.dumps(fields))
	            second = parse_json(vlm_call(page_img, retry_prompt))
	            # Keep the first reading if the second one cannot be parsed.
	            if not second.get("_parse_error"):
	                fields = second
	        return fields

	    def _especiales():
	        return _ocr(2, PROMPT_ESP)[1]

	    def _constancias():
	        fields = _ocr(3, PROMPT_CONST, max_tokens=1500)[1]
	        note = str(fields.get("constancias", "")).lower()
	        keywords = ["alianza", "verde", "3020", "candidat"]
	        # Flag transcriptions that mention any of the keywords.
	        fields["constancia_relevant_verde"] = any(kw in note for kw in keywords)
	        return fields

	    # The call order fixes the key order of the emitted JSON.
	    _guarded("nivelacion", _nivelacion)
	    _guarded("verde", _verde)
	    _guarded("especiales", _especiales)
	    _guarded("constancias", _constancias)

	    result["elapsed_s"] = round(time.time() - started, 1)
	    return json.dumps(result, ensure_ascii=False)


	# ── Gradio Interface ──────────────────────────────────────────────────

	# Single-function UI: one PDF upload in, the JSON string returned by
	# extract_form() out, shown in a 20-line textbox.
	# NOTE(review): the module docstring shows clients calling
	# api_name="/extract", but no api_name is set on this Interface — confirm
	# which endpoint name Gradio actually exposes for it.
	demo = gr.Interface(
	fn=extract_form,
	inputs=gr.File(label="Slim 4-page claveros PDF", file_types=[".pdf"]),
	outputs=gr.Textbox(label="Extraction result (JSON)", lines=20),
	title="Claveros 4-Page Extraction",
	description="Upload a 4-page slim claveros PDF. Extracts nivelación, Verde votes, especiales, and constancias.",
	)

	# Called unconditionally (there is no __main__ guard), so the app starts
	# whenever this module is executed or imported.
	demo.launch()