# libacc's picture
# Fix: remove allow_flagging, use dtype instead of torch_dtype
# 0fb262b verified
"""
Claveros 4-page extraction Space — ZeroGPU on H200.

Processes 4-page slim claveros PDFs:
    Page 0 = Nivelación   → votantes_e11, votos_urna, votos_incinerados
    Page 1 = Verde (3020) → verde_lista, cand_7, verde_total
    Page 2 = Especiales   → votos_blancos, votos_nulos, votos_no_marcados
    Page 3 = Constancias  → constancias text, hubo_recuento, firmas_count

Call via Gradio Client:
    from gradio_client import Client
    client = Client("libacc/claveros-extract")
    result = client.predict(pdf_file, api_name="/extract")

Co-Authored-By: Oz <oz-agent@warp.dev>
"""
import json
import os
import spaces
import gradio as gr
import torch
import fitz # PyMuPDF
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info
# ── Model (loaded at module level for ZeroGPU CUDA emulation) ─────────
# Checkpoint identifier handed to both the model and the processor loaders.
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
# Resolution (dots per inch) used by render_page when rasterizing PDF pages.
DPI = 300

print(f"Loading {MODEL_ID}...")

# Weights are requested in bfloat16; device placement is left to device_map="auto".
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, dtype=torch.bfloat16, device_map="auto"
)

# Per-image pixel bounds forwarded to the processor, written as multiples of 28 * 28.
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=256 * 28 * 28,
    max_pixels=1280 * 28 * 28,
)

print("Model loaded.")
# ── Prompts ───────────────────────────────────────────────────────────
# NOTE: the prompt texts below previously contained mojibake (UTF-8 bytes
# decoded with the wrong code page) in place of "—", "→", "≤", "¿" and the
# accented Spanish letters; the intended characters are restored so the model
# sees the labels exactly as they are written here.

# Page 0: levelling counts plus the printed department/municipality/table ids.
PROMPT_NIV = """\
E-14 CLAVEROS SENADO — NIVELACIÓN page.
Read handwritten digit boxes:
1. "TOTAL VOTANTES FORMULARIO E-11" → votantes_e11
2. "TOTAL VOTOS DE SENADO EN LA URNA" → votos_urna
3. "TOTAL VOTOS INCINERADOS" → votos_incinerados (often 0)
Also read printed: dept (2-digit), muni (3-digit), mesa.
KIT/Form numbers at bottom are NOT votes.
Each row: 3 boxes (hundreds|tens|ones). Empty=0.
Return ONLY:
{"votantes_e11": <int>, "votos_urna": <int>, "votos_incinerados": <int>, "dept": "<str>", "muni": "<str>", "mesa": "<str>"}"""

# Page 1: list-only votes, candidate 7 votes and the group total.
PROMPT_VERDE = """\
E-14 CLAVEROS SENADO — ALIANZA POR COLOMBIA (3020).
Read 3 handwritten values from digit boxes (hundreds|tens|ones, empty=0):
1) "VOTOS SOLO POR LA LISTA" (row 0) → verde_lista
2) Row "7" — handwritten boxes RIGHT of printed "7" → cand_7
3) "TOTAL AGRUPACIÓN POLÍTICA" (bottom) → verde_total
Printed numbers 1-100 are ROW LABELS, not votes. KIT/Form numbers are NOT votes.
VERIFY: verde_lista ≤ verde_total AND cand_7 ≤ verde_total.
Return ONLY:
{"verde_lista": <int>, "cand_7": <int>, "verde_total": <int>}"""

# Follow-up for page 1. This one is a str.format template: "{prev}" is filled
# with the previous answer, and the literal JSON braces are doubled.
PROMPT_VERDE_RETRY = """\
Re-read. Previous: {prev}. Common errors: 1 misread as 7, 0 as 6, \
printed row label "7" used as vote, KIT number used as total.
Constraints: verde_lista ≤ verde_total, cand_7 ≤ verde_total.
Return ONLY:
{{"verde_lista": <int>, "cand_7": <int>, "verde_total": <int>}}"""

# Page 2: blank, null and unmarked vote counts.
PROMPT_ESP = """\
E-14 CLAVEROS SENADO — VOTOS ESPECIALES.
Read 3 rows (3 digit boxes each, empty=0):
1) VOTOS EN BLANCO → votos_blancos
2) VOTOS NULOS → votos_nulos
3) VOTOS NO MARCADOS → votos_no_marcados
Handwritten 0 can look like 6 — recheck if values seem high.
Return ONLY:
{"votos_blancos": <int>, "votos_nulos": <int>, "votos_no_marcados": <int>}"""

# Page 3: free-text remarks, recount flag and number of signatures.
PROMPT_CONST = """\
E-14 CLAVEROS SENADO — CONSTANCIAS page.
1) Transcribe ALL handwritten text in "CONSTANCIAS DE LOS JURADOS" box. \
Preserve original Spanish exactly. Empty box = "".
2) "¿HUBO RECUENTO DE VOTOS?" — "si", "no", or "unclear".
3) Count signature boxes (FIRMA JURADO 1-6) that have signatures (0-6).
Return ONLY:
{"constancias": "<text>", "hubo_recuento": "si"|"no"|"unclear", "firmas_count": <int>}"""
# ── Helpers ────────────────────────────────────────────────────────────
def render_page(pdf_path, page_idx):
    """Rasterize one page of a PDF into an RGB PIL image.

    Args:
        pdf_path: Path of the PDF file to open.
        page_idx: Zero-based page index. An index past the end of the
            document is clamped to the last page.

    Returns:
        The page rendered at ``DPI``. If the render is wider than it is tall
        it is rotated by 90 degrees (with canvas expansion) before returning.

    Raises:
        ValueError: If the document contains no pages.
    """
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        if page_count == 0:
            # Fail with a clear message instead of an obscure doc[-1] error.
            raise ValueError("PDF has no pages")
        if page_idx >= page_count:
            page_idx = page_count - 1
        # PDF coordinates are 72 units per inch, so DPI / 72 is the zoom factor.
        zoom = DPI / 72
        pix = doc[page_idx].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    finally:
        # Always release the document, including when rendering raises.
        doc.close()
    if img.width > img.height:
        img = img.rotate(90, expand=True)
    return img
def vlm_call(img, prompt, max_tokens=120):
    """Ask the vision-language model one question about one image.

    Args:
        img: Image placed in the user turn of the conversation.
        prompt: Instruction text sent together with the image.
        max_tokens: Upper bound on the number of newly generated tokens.

    Returns:
        The decoded completion only: the prompt tokens are sliced off and
        special tokens are skipped during decoding.
    """
    system_turn = {
        "role": "system",
        "content": [{"type": "text", "text": "You are a careful OCR assistant. /no_think"}],
    }
    user_turn = {
        "role": "user",
        "content": [
            {"type": "image", "image": img},
            {"type": "text", "text": prompt},
        ],
    }
    conversation = [system_turn, user_turn]

    chat_text = processor.apply_chat_template(
        conversation, tokenize=False, add_generation_prompt=True
    )
    images, videos = process_vision_info(conversation)
    batch = processor(
        text=[chat_text],
        images=images,
        videos=videos,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    # Greedy decoding (do_sample=False) with gradient tracking disabled.
    with torch.no_grad():
        generated = model.generate(**batch, max_new_tokens=max_tokens, do_sample=False)

    # generate() returns prompt + completion; keep only what follows the prompt.
    prompt_len = batch["input_ids"].shape[1]
    completion_ids = generated[0, prompt_len:]
    return processor.decode(completion_ids, skip_special_tokens=True)
def parse_json(text):
    """Extract a JSON object from raw model output.

    The text is cleaned in stages: a ``<think>...</think>`` preamble (or a
    lone closing/opening tag) is dropped, a surrounding Markdown code fence is
    removed, and the remainder is parsed. If that does not yield an object,
    every ``{`` in the text is tried in order as the start of a JSON value and
    the first one that decodes to an object wins, so surrounding prose,
    one-line fences and trailing text with stray braces are tolerated.

    Args:
        text: Raw string produced by the model.

    Returns:
        The decoded ``dict``. When no JSON object can be recovered (including
        when the output is valid JSON but not an object, e.g. ``123``), returns
        ``{"_parse_error": True, "_raw": <first 500 chars of text>}``.
    """
    clean = text.strip()

    # Drop the reasoning preamble. The closing tag alone is enough: some
    # outputs contain "</think>" without a matching opening tag.
    close_tag = "</think>"
    close_at = clean.find(close_tag)
    if close_at >= 0:
        clean = clean[close_at + len(close_tag):].strip()
    elif "<think>" in clean:
        clean = clean[clean.find("<think>") + len("<think>"):].strip()

    # Remove a ```lang ... ``` fence when the reply is wrapped in one.
    candidate = clean
    if candidate.startswith("```"):
        lines = candidate.split("\n")
        inner = lines[1:-1] if lines[-1].strip() == "```" else lines[1:]
        candidate = "\n".join(inner).strip()

    try:
        obj = json.loads(candidate)
    except json.JSONDecodeError:
        obj = None
    # Callers use .get(), so only a JSON object counts as success.
    if isinstance(obj, dict):
        return obj

    # Fallback: scan the un-fenced text so that a one-line fenced reply
    # (which the fence stripping above reduces to "") is still recovered.
    decoder = json.JSONDecoder()
    start = clean.find("{")
    while start >= 0:
        try:
            obj, _ = decoder.raw_decode(clean, start)
        except json.JSONDecodeError:
            obj = None
        if isinstance(obj, dict):
            return obj
        start = clean.find("{", start + 1)

    return {"_parse_error": True, "_raw": text[:500]}
def to_int(v):
    """Coerce a loosely-typed value to ``int``, falling back to 0.

    Args:
        v: Value to convert. Handled kinds:
            * ``bool`` -> 0 or 1 (as a plain ``int``);
            * ``int`` -> returned unchanged;
            * ``float`` -> truncated toward zero; NaN and infinities give 0;
            * ``str`` -> surrounding whitespace and commas are removed, then
              the text is parsed as an integer; a decimal spelling of a whole
              number (``"12.0"``) is also accepted.

    Returns:
        The converted integer, or 0 for anything that cannot be converted
        (other types, non-numeric text, non-integral decimal text).
    """
    if isinstance(v, bool):
        # bool is a subclass of int; normalise it so callers get a real int.
        return int(v)
    if isinstance(v, int):
        return v
    if isinstance(v, float):
        try:
            return int(v)
        except (ValueError, OverflowError):
            # int(nan) raises ValueError, int(+/-inf) raises OverflowError.
            return 0
    if isinstance(v, str):
        s = v.strip().replace(",", "")
        try:
            return int(s)
        except ValueError:
            pass
        try:
            as_float = float(s)
        except ValueError:
            return 0
        # Only whole numbers are accepted here: "1.234" stays 0 because it may
        # be a thousands-separated figure rather than a decimal.
        return int(as_float) if as_float.is_integer() else 0
    return 0
# ── Main extraction (single GPU burst for all 4 pages) ────────────────
def _safe_section(extract):
    """Run one page extractor, turning any exception into an ``{"_error": ...}`` record."""
    try:
        return extract()
    except Exception as exc:  # deliberate best effort: one bad page must not abort the rest
        return {"_error": str(exc)}


def _extract_verde(pdf_path):
    """Read the Verde page (index 1), asking the model once more if the values look inconsistent."""
    img = render_page(pdf_path, 1)
    parsed = parse_json(vlm_call(img, PROMPT_VERDE))
    lista = to_int(parsed.get("verde_lista", 0))
    cand = to_int(parsed.get("cand_7", 0))
    total = to_int(parsed.get("verde_total", 0))
    # A part larger than a non-zero total cannot be right.
    exceeds_total = total > 0 and (lista > total or cand > total)
    # cand_7 >= 50 also triggers a second read.
    # NOTE(review): the reason for the threshold 50 is not documented here - confirm.
    if exceeds_total or cand >= 50:
        retry_prompt = PROMPT_VERDE_RETRY.format(prev=json.dumps(parsed))
        retry = parse_json(vlm_call(img, retry_prompt))
        # The second answer replaces the first whenever it parsed at all.
        if not retry.get("_parse_error"):
            parsed = retry
    return parsed


def _extract_constancias(pdf_path):
    """Read the Constancias page (index 3) and flag remarks that mention the Verde list."""
    img = render_page(pdf_path, 3)
    parsed = parse_json(vlm_call(img, PROMPT_CONST, max_tokens=1500))
    remarks = str(parsed.get("constancias", "")).lower()
    parsed["constancia_relevant_verde"] = any(
        kw in remarks for kw in ("alianza", "verde", "3020", "candidat")
    )
    return parsed


@spaces.GPU(duration=120)
def extract_form(pdf_path):
    """Extract all 4 pages from a slim claveros PDF in one GPU burst.

    Args:
        pdf_path: Path of the uploaded PDF. ``None`` or an empty value (no
            file supplied) is reported as an error for every section instead
            of being passed on to the PDF renderer.

    Returns:
        A JSON string (non-ASCII characters kept as-is) with the keys
        ``nivelacion``, ``verde``, ``especiales`` and ``constancias`` - each
        either the parsed model answer or ``{"_error": <message>}`` - plus
        ``elapsed_s``, the processing time in seconds rounded to one decimal.
    """
    import time

    started = time.perf_counter()  # monotonic clock: safe for measuring durations

    if not pdf_path:
        result = {
            section: {"_error": "no PDF file provided"}
            for section in ("nivelacion", "verde", "especiales", "constancias")
        }
    else:
        result = {
            # Page 0: Nivelación
            "nivelacion": _safe_section(
                lambda: parse_json(vlm_call(render_page(pdf_path, 0), PROMPT_NIV))
            ),
            # Page 1: Verde (with one consistency retry)
            "verde": _safe_section(lambda: _extract_verde(pdf_path)),
            # Page 2: Especiales
            "especiales": _safe_section(
                lambda: parse_json(vlm_call(render_page(pdf_path, 2), PROMPT_ESP))
            ),
            # Page 3: Constancias
            "constancias": _safe_section(lambda: _extract_constancias(pdf_path)),
        }

    result["elapsed_s"] = round(time.perf_counter() - started, 1)
    return json.dumps(result, ensure_ascii=False)
# ── Gradio Interface ──────────────────────────────────────────────────
# Single-function UI: one PDF upload in, the extraction JSON string out.
# The description previously showed mojibake for "nivelación"; the intended
# accented character is restored so the page text renders correctly.
demo = gr.Interface(
    fn=extract_form,
    inputs=gr.File(label="Slim 4-page claveros PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extraction result (JSON)", lines=20),
    title="Claveros 4-Page Extraction",
    description="Upload a 4-page slim claveros PDF. Extracts nivelación, Verde votes, especiales, and constancias.",
)
# Started at import time, as in the original script.
demo.launch()