"""Claveros 4-page extraction Space — ZeroGPU on H200.

Processes 4-page slim claveros PDFs:
    Page 0 = Nivelación   → votantes_e11, votos_urna, votos_incinerados
    Page 1 = Verde (3020) → verde_lista, cand_7, verde_total
    Page 2 = Especiales   → votos_blancos, votos_nulos, votos_no_marcados
    Page 3 = Constancias  → constancias text, hubo_recuento, firmas_count

Call via Gradio Client:
    from gradio_client import Client
    client = Client("libacc/claveros-extract")
    result = client.predict(pdf_file, api_name="/extract")

Co-Authored-By: Oz
"""

import json
import os
import time

import spaces
import gradio as gr
import torch
import fitz  # PyMuPDF
from PIL import Image
from transformers import AutoModelForImageTextToText, AutoProcessor
from qwen_vl_utils import process_vision_info

# ── Model (loaded at module level for ZeroGPU CUDA emulation) ─────────
MODEL_ID = "Qwen/Qwen3-VL-8B-Instruct"
DPI = 300

print(f"Loading {MODEL_ID}...")
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID,
    dtype=torch.bfloat16,
    device_map="auto",
)
processor = AutoProcessor.from_pretrained(
    MODEL_ID, min_pixels=256 * 28 * 28, max_pixels=1280 * 28 * 28
)
print("Model loaded.")

# ── Prompts ───────────────────────────────────────────────────────────
# NOTE(review): the JSON templates below have empty value slots
# (e.g. `"votos_urna": ,`). They look like angle-bracket placeholders that
# were lost somewhere upstream — confirm against the deployed prompts before
# editing; the text is kept exactly as found.
PROMPT_NIV = """\
E-14 CLAVEROS SENADO — NIVELACIÓN page. Read handwritten digit boxes:
1. "TOTAL VOTANTES FORMULARIO E-11" → votantes_e11
2. "TOTAL VOTOS DE SENADO EN LA URNA" → votos_urna
3. "TOTAL VOTOS INCINERADOS" → votos_incinerados (often 0)
Also read printed: dept (2-digit), muni (3-digit), mesa.
KIT/Form numbers at bottom are NOT votes. Each row: 3 boxes (hundreds|tens|ones). Empty=0.
Return ONLY: {"votantes_e11": , "votos_urna": , "votos_incinerados": , "dept": "", "muni": "", "mesa": ""}"""

PROMPT_VERDE = """\
E-14 CLAVEROS SENADO — ALIANZA POR COLOMBIA (3020).
Read 3 handwritten values from digit boxes (hundreds|tens|ones, empty=0):
1) "VOTOS SOLO POR LA LISTA" (row 0) → verde_lista
2) Row "7" — handwritten boxes RIGHT of printed "7" → cand_7
3) "TOTAL AGRUPACIÓN POLÍTICA" (bottom) → verde_total
Printed numbers 1-100 are ROW LABELS, not votes. KIT/Form numbers are NOT votes.
VERIFY: verde_lista ≤ verde_total AND cand_7 ≤ verde_total.
Return ONLY: {"verde_lista": , "cand_7": , "verde_total": }"""

# Formatted with str.format(prev=...), hence the doubled braces.
PROMPT_VERDE_RETRY = """\
Re-read. Previous: {prev}. Common errors: 1 misread as 7, 0 as 6, \
printed row label "7" used as vote, KIT number used as total.
Constraints: verde_lista ≤ verde_total, cand_7 ≤ verde_total.
Return ONLY: {{"verde_lista": , "cand_7": , "verde_total": }}"""

PROMPT_ESP = """\
E-14 CLAVEROS SENADO — VOTOS ESPECIALES.
Read 3 rows (3 digit boxes each, empty=0):
1) VOTOS EN BLANCO → votos_blancos
2) VOTOS NULOS → votos_nulos
3) VOTOS NO MARCADOS → votos_no_marcados
Handwritten 0 can look like 6 — recheck if values seem high.
Return ONLY: {"votos_blancos": , "votos_nulos": , "votos_no_marcados": }"""

PROMPT_CONST = """\
E-14 CLAVEROS SENADO — CONSTANCIAS page.
1) Transcribe ALL handwritten text in "CONSTANCIAS DE LOS JURADOS" box. \
Preserve original Spanish exactly. Empty box = "".
2) "¿HUBO RECUENTO DE VOTOS?" — "si", "no", or "unclear".
3) Count signature boxes (FIRMA JURADO 1-6) that have signatures (0-6).
Return ONLY: {"constancias": "", "hubo_recuento": "si"|"no"|"unclear", "firmas_count": }"""

# ── Tunables used by extract_form ─────────────────────────────────────
# A cand_7 reading at or above this value triggers a second read of the page.
CAND_7_RETRY_THRESHOLD = 50
# Substrings (matched against lower-cased constancias text) that flag the
# jurors' notes as possibly relevant to the Verde / 3020 count.
VERDE_KEYWORDS = ("alianza", "verde", "3020", "candidat")

# Reasoning-block delimiters that may precede the JSON in a model reply.
_THINK_OPEN = "<think>"
_THINK_CLOSE = "</think>"


# ── Helpers ────────────────────────────────────────────────────────────
def render_page(pdf_path, page_idx):
    """Render one PDF page to a portrait-oriented RGB PIL image at ``DPI``.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.
        page_idx: Zero-based page index. An index past the end is clamped to
            the last page (so a short PDF re-renders its final page).

    Returns:
        A ``PIL.Image.Image``; landscape renders are rotated by 90° so the
        result is never wider than it is tall.

    Raises:
        ValueError: If the document has no pages.
    """
    doc = fitz.open(pdf_path)
    try:
        page_count = len(doc)
        if page_count == 0:
            raise ValueError("PDF has no pages")
        if page_idx >= page_count:
            page_idx = page_count - 1
        zoom = DPI / 72  # PDF user space is 72 units per inch
        pix = doc[page_idx].get_pixmap(matrix=fitz.Matrix(zoom, zoom))
        img = Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
    finally:
        # Close even when rendering fails so the file handle is not leaked.
        doc.close()

    if img.width > img.height:
        img = img.rotate(90, expand=True)
    return img


def vlm_call(img, prompt, max_tokens=120):
    """Run a single image+prompt through the vision-language model.

    Args:
        img: PIL image of the page to read.
        prompt: Instruction text sent as the user turn alongside the image.
        max_tokens: Upper bound on newly generated tokens.

    Returns:
        The decoded model reply (prompt tokens and special tokens removed).
    """
    messages = [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": "You are a careful OCR assistant. /no_think"}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "image", "image": img},
                {"type": "text", "text": prompt},
            ],
        },
    ]
    text_input = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)
    inputs = processor(
        text=[text_input],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)

    with torch.no_grad():
        # Greedy decoding: the same page should always yield the same reading.
        out = model.generate(**inputs, max_new_tokens=max_tokens, do_sample=False)

    # generate() returns prompt + completion; keep only the completion.
    prompt_len = inputs["input_ids"].shape[1]
    return processor.decode(out[0, prompt_len:], skip_special_tokens=True)


def parse_json(text):
    """Extract a JSON object from a raw model reply.

    Strips a leading reasoning block (``<think>…</think>``) and a Markdown
    code fence if present, then tries to parse the remainder; if that fails,
    tries the span from the first ``{`` to the last ``}``.

    Args:
        text: Raw model output.

    Returns:
        The parsed ``dict``. When no JSON object can be recovered, returns
        ``{"_parse_error": True, "_raw": <first 500 chars of text>}``.
    """
    clean = text.strip()

    # Drop the reasoning block. A closing tag may appear without an opening
    # one, and an unterminated block has only the opening tag.
    if _THINK_CLOSE in clean:
        clean = clean.partition(_THINK_CLOSE)[2].strip()
    elif _THINK_OPEN in clean:
        clean = clean.partition(_THINK_OPEN)[2].strip()

    # Drop a ```json … ``` fence (closing fence optional, e.g. on truncation).
    if clean.startswith("```"):
        lines = clean.split("\n")
        body = lines[1:-1] if lines[-1].strip() == "```" else lines[1:]
        clean = "\n".join(body).strip()

    candidates = [clean]
    start, stop = clean.find("{"), clean.rfind("}") + 1
    if start >= 0 and stop > start:
        candidates.append(clean[start:stop])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        # Callers use .get(), so only a JSON object is an acceptable result.
        if isinstance(parsed, dict):
            return parsed

    return {"_parse_error": True, "_raw": text[:500]}


def to_int(v):
    """Coerce a parsed JSON value to ``int``, falling back to 0.

    Accepts ints, bools, floats (truncated toward zero) and numeric strings
    (surrounding whitespace and thousands commas ignored). Anything else —
    including NaN/Infinity and non-numeric strings — yields 0.
    """
    if isinstance(v, bool):
        return int(v)
    if isinstance(v, int):
        return v
    if isinstance(v, float):
        try:
            return int(v)
        except (ValueError, OverflowError):  # NaN / ±Infinity
            return 0
    if isinstance(v, str):
        try:
            return int(v.strip().replace(",", ""))
        except ValueError:
            return 0
    return 0


# ── Main extraction (single GPU burst for all 4 pages) ────────────────
@spaces.GPU(duration=120)
def extract_form(pdf_path):
    """Extract all 4 pages from a slim claveros PDF in one GPU burst.

    Each page is processed independently: a failure on one page is recorded
    as ``{"_error": "<message>"}`` under that page's key and does not stop
    the remaining pages.

    Args:
        pdf_path: Path of the uploaded PDF.

    Returns:
        A JSON string with keys ``nivelacion``, ``verde``, ``especiales``,
        ``constancias`` and ``elapsed_s`` (wall-clock seconds, 1 decimal).
    """
    t0 = time.perf_counter()
    result = {}

    # Page 0: Nivelación
    try:
        img = render_page(pdf_path, 0)
        result["nivelacion"] = parse_json(vlm_call(img, PROMPT_NIV))
    except Exception as e:
        result["nivelacion"] = {"_error": str(e)}

    # Page 1: Verde
    try:
        img = render_page(pdf_path, 1)
        parsed = parse_json(vlm_call(img, PROMPT_VERDE))

        # Retry once if the arithmetic constraints fail or cand_7 looks
        # implausibly high.
        vl = to_int(parsed.get("verde_lista", 0))
        c7 = to_int(parsed.get("cand_7", 0))
        vt = to_int(parsed.get("verde_total", 0))
        inconsistent = vt > 0 and (vl > vt or c7 > vt)
        if inconsistent or c7 >= CAND_7_RETRY_THRESHOLD:
            retry_prompt = PROMPT_VERDE_RETRY.format(prev=json.dumps(parsed))
            second = parse_json(vlm_call(img, retry_prompt))
            # Keep the first reading if the retry could not be parsed.
            if not second.get("_parse_error"):
                parsed = second
        result["verde"] = parsed
    except Exception as e:
        result["verde"] = {"_error": str(e)}

    # Page 2: Especiales
    try:
        img = render_page(pdf_path, 2)
        result["especiales"] = parse_json(vlm_call(img, PROMPT_ESP))
    except Exception as e:
        result["especiales"] = {"_error": str(e)}

    # Page 3: Constancias (free text, so allow a much longer reply)
    try:
        img = render_page(pdf_path, 3)
        parsed = parse_json(vlm_call(img, PROMPT_CONST, max_tokens=1500))
        ctext = str(parsed.get("constancias", "")).lower()
        parsed["constancia_relevant_verde"] = any(
            kw in ctext for kw in VERDE_KEYWORDS
        )
        result["constancias"] = parsed
    except Exception as e:
        result["constancias"] = {"_error": str(e)}

    result["elapsed_s"] = round(time.perf_counter() - t0, 1)
    return json.dumps(result, ensure_ascii=False)


# ── Gradio Interface ──────────────────────────────────────────────────
# NOTE(review): the module docstring calls the endpoint as
# api_name="/extract", but no api_name is passed here — confirm which
# endpoint name the Space actually exposes and align one with the other.
demo = gr.Interface(
    fn=extract_form,
    inputs=gr.File(label="Slim 4-page claveros PDF", file_types=[".pdf"]),
    outputs=gr.Textbox(label="Extraction result (JSON)", lines=20),
    title="Claveros 4-Page Extraction",
    description=(
        "Upload a 4-page slim claveros PDF. "
        "Extracts nivelación, Verde votes, especiales, and constancias."
    ),
)

demo.launch()