"""
upload_to_labelstudio.py
────────────────────────
Uploads every file listed in batch_dataref_results.json into Label Studio
via its REST API. No local file serving, no env variables needed.

How it works
────────────
1. Reads batch_dataref_results.json
2. For each entry:
   - PDFs      → rasterised to PNG pages with pdf2image, then uploaded as images
   - PNGs/JPGs → uploaded directly
3. Each uploaded file gets a Label Studio task with:
   - "image" → the hosted URL Label Studio assigns after upload
   - "ocr"   → extracted fields text (required by LS OCR template)
4. All tasks are created in the specified project via the API

Usage
─────
    # First create a project in Label Studio UI, note its ID (shown in URL)
    python upload_to_labelstudio.py --api_token YOUR_TOKEN_HERE --project_id 1

    # Full options
    python upload_to_labelstudio.py ^
        --results_json batch_dataref_results.json ^
        --data_root C:\\Users\\azizmohamed.miladi_a\\Desktop\\GuichetOI_ML\\processed_dataref ^
        --ls_url http://localhost:8081 ^
        --api_token YOUR_TOKEN_HERE ^
        --project_id 1 ^
        --dpi 150

Getting your API token
──────────────────────
Label Studio → top-right avatar → Account & Settings → Access Token
"""

import argparse
import json
import logging
import mimetypes
import sys
import time
from io import BytesIO
from pathlib import Path, PureWindowsPath

# ── Third-party ───────────────────────────────────────────────────────────────
try:
    import requests
except ImportError:
    sys.exit("pip install requests")

try:
    from PIL import Image
except ImportError:
    sys.exit("pip install Pillow")

# ── Logging ───────────────────────────────────────────────────────────────────
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
log = logging.getLogger(__name__)

# File extensions that are uploaded as-is (everything else except .pdf is skipped).
_IMAGE_EXTENSIONS = frozenset({".png", ".jpg", ".jpeg"})


# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def get_api_token(ls_url: str, username: str, password: str) -> str:
    """
    Exchange Label Studio username + password for an API token.

    Use this only if you don't have a token yet.

    Args:
        ls_url: Label Studio base URL (no trailing slash).
        username: Account user name, sent in the JSON body.
        password: Account password, sent in the JSON body.

    Returns:
        The value of the "token" key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the response has no "token" key.
    """
    resp = requests.post(
        f"{ls_url}/api/token",
        json={"username": username, "password": password},
        timeout=15,
    )
    resp.raise_for_status()
    return resp.json()["token"]


def upload_image_bytes(
    ls_url: str,
    headers: dict,
    project_id: int,
    img_bytes: bytes,
    filename: str,
) -> str:
    """
    Upload raw image bytes to the project's import endpoint.

    Args:
        ls_url: Label Studio base URL (no trailing slash).
        headers: HTTP headers carrying the authorisation token.
        project_id: Target project ID.
        img_bytes: Encoded image content (PNG or JPEG).
        filename: File name reported to Label Studio; also used to guess
            the MIME type of the upload.

    Returns:
        The "image" URL of the first task in the response when the response
        is a non-empty list of tasks, otherwise an empty string.

    Raises:
        RuntimeError: If the server does not answer with HTTP 201.
    """
    # Send the real content type: JPEG files were previously labelled image/png.
    content_type = mimetypes.guess_type(filename)[0] or "application/octet-stream"
    resp = requests.post(
        f"{ls_url}/api/projects/{project_id}/import",
        headers=headers,
        files={"file": (filename, BytesIO(img_bytes), content_type)},
        timeout=60,
    )
    if resp.status_code != 201:
        raise RuntimeError(
            f"Upload failed ({resp.status_code}): {resp.text[:200]}"
        )

    # NOTE(review): this only extracts a URL when the response is a list of
    # tasks. If the server answers with a summary object instead, "" is
    # returned and callers fall back to a guessed /data/upload/ path —
    # confirm the response shape against the Label Studio version in use.
    tasks = resp.json()
    if isinstance(tasks, list) and tasks:
        return tasks[0].get("data", {}).get("image", "")
    return ""


def create_task(
    ls_url: str,
    headers: dict,
    project_id: int,
    image_url: str,
    ocr_text: str,
    meta: dict,
) -> int:
    """
    Create a single task in Label Studio and return its ID.

    Args:
        ls_url: Label Studio base URL (no trailing slash).
        headers: HTTP headers carrying the authorisation token.
        project_id: Project the task is attached to.
        image_url: URL stored under the task's "image" data key.
        ocr_text: Text stored under the task's "ocr" data key.
        meta: Optional extra fields ("doc_class", "doc_confidence",
            "ocr_source", "source_file", "page"); missing keys get defaults.

    Returns:
        The "id" of the created task, or -1 if the response carries none.

    Raises:
        RuntimeError: If the server answers with a status other than 200/201.
    """
    task_data = {
        "image": image_url,
        "ocr": ocr_text,  # required by LS OCR template
        "doc_class": meta.get("doc_class", ""),
        "doc_confidence": meta.get("doc_confidence", 0),
        "ocr_source": meta.get("ocr_source", ""),
        "source_file": meta.get("source_file", ""),
    }
    # Callers pass the page index for PDF pages; it used to be dropped.
    if "page" in meta:
        task_data["page"] = meta["page"]

    # The task must name its project, otherwise the API cannot attach it.
    payload = {"project": project_id, "data": task_data}

    resp = requests.post(
        f"{ls_url}/api/tasks",
        headers={**headers, "Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )
    if resp.status_code not in (200, 201):
        raise RuntimeError(
            f"Task creation failed ({resp.status_code}): {resp.text[:200]}"
        )
    return resp.json().get("id", -1)


def pil_to_png_bytes(img: Image.Image) -> bytes:
    """Convert a PIL image to PNG bytes in memory."""
    buf = BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()


def pdf_to_pil_pages(pdf_path: Path, dpi: int = 150) -> list[Image.Image]:
    """
    Rasterise a PDF to a list of PIL RGB images (one per page).

    Any failure (pdf2image missing, unreadable PDF, ...) is logged and an
    empty list is returned, so callers can treat the document as skipped.
    """
    try:
        # Imported lazily so image-only batches work without pdf2image.
        from pdf2image import convert_from_path

        pages = convert_from_path(str(pdf_path), dpi=dpi, fmt="png")
        return [p.convert("RGB") for p in pages]
    except Exception as exc:
        log.error(" PDF rasterise failed for %s: %s", pdf_path.name, exc)
        return []


def _fields_to_text(fields: dict) -> str:
    """Render extracted fields as one 'name: value (conf=x)' line per field."""
    return "\n".join(
        f"{name}: {info['value']} (conf={info['confidence']})"
        for name, info in fields.items()
    )


def _process_entry(
    entry: dict,
    position: str,
    data_root: Path,
    ls_url: str,
    headers: dict,
    project_id: int,
    dpi: int,
    max_pages: int,
) -> bool:
    """
    Upload one results entry (all selected pages for a PDF).

    Returns:
        True if the entry was uploaded, False if it was skipped (file
        missing, PDF without pages, unsupported extension).

    Raises:
        Exception: Any error from reading the entry or talking to Label
            Studio; the caller counts the entry as failed.
    """
    # Convert Windows backslash path → local absolute path
    rel_path = PureWindowsPath(entry["image"])
    local_path = data_root / rel_path
    log.info("%s %s (%s)", position, rel_path.name, entry["doc_class"])

    if not local_path.exists():
        log.warning(" File not found: %s — skipping", local_path)
        return False

    # Build OCR text from extracted fields
    fields_text = _fields_to_text(entry.get("fields", {}))
    meta = {
        "doc_class": entry["doc_class"],
        "doc_confidence": entry["doc_confidence"],
        "ocr_source": entry["ocr_source"],
        "source_file": rel_path.as_posix(),
    }
    ext = local_path.suffix.lower()

    # ── PDF: rasterise each page and upload separately ────────────────────
    if ext == ".pdf":
        pages = pdf_to_pil_pages(local_path, dpi=dpi)
        if not pages:
            log.warning(" No pages extracted — skipping")
            return False
        pages = pages[:max_pages]  # limit pages per document
        log.info(" %d page(s) to upload", len(pages))
        for p_idx, page_img in enumerate(pages):
            fname = f"{local_path.stem}_p{p_idx:03d}.png"
            img_url = upload_image_bytes(
                ls_url, headers, project_id, pil_to_png_bytes(page_img), fname
            )
            task_id = create_task(
                ls_url,
                headers,
                project_id,
                image_url=img_url or f"/data/upload/{fname}",
                ocr_text=fields_text,
                meta={**meta, "page": p_idx},
            )
            log.info(" Page %d → task %d", p_idx, task_id)
            time.sleep(0.1)  # be gentle with the local server
        return True

    # ── Image: upload directly ────────────────────────────────────────────
    if ext in _IMAGE_EXTENSIONS:
        fname = local_path.name
        img_url = upload_image_bytes(
            ls_url, headers, project_id, local_path.read_bytes(), fname
        )
        task_id = create_task(
            ls_url,
            headers,
            project_id,
            image_url=img_url or f"/data/upload/{fname}",
            ocr_text=fields_text,
            meta=meta,
        )
        log.info(" Uploaded → task %d", task_id)
        return True

    # Anything else used to fall through and be counted as uploaded.
    log.warning(" Unsupported file type '%s' — skipping", ext)
    return False


# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────
def run(
    results_json: Path,
    data_root: Path,
    ls_url: str,
    api_token: str,
    project_id: int,
    dpi: int,
    max_pages: int,
    start_from: int,
) -> None:
    """
    Upload every entry of *results_json* into a Label Studio project.

    Args:
        results_json: JSON file with a top-level "results" list.
        data_root: Folder the entries' relative "image" paths are resolved
            against.
        ls_url: Label Studio base URL; a trailing slash is stripped.
        api_token: Token sent as "Authorization: Token <api_token>".
        project_id: Target project ID.
        dpi: Resolution used when rasterising PDFs.
        max_pages: Maximum number of pages uploaded per PDF.
        start_from: Entries with an index below this value are ignored.

    Exits the process if the project cannot be fetched. Per-entry errors
    are logged and counted; they do not abort the batch.
    """
    ls_url = ls_url.rstrip("/")
    # NOTE(review): "Token" is the scheme this script has always sent;
    # confirm it matches the token type issued by your Label Studio version.
    headers = {"Authorization": f"Token {api_token}"}

    # ── Verify connection ─────────────────────────────────────────────────────
    try:
        r = requests.get(
            f"{ls_url}/api/projects/{project_id}", headers=headers, timeout=10
        )
        r.raise_for_status()
        proj_name = r.json().get("title", "?")
        log.info(
            "Connected to Label Studio — project %d: '%s'", project_id, proj_name
        )
    except Exception as exc:
        sys.exit(f"Cannot reach Label Studio at {ls_url}: {exc}")

    # ── Load results ──────────────────────────────────────────────────────────
    with open(results_json, encoding="utf-8") as f:
        data = json.load(f)
    results = data["results"]
    log.info("Loaded %d entries from %s", len(results), results_json)

    # ── Process each entry ────────────────────────────────────────────────────
    success = skipped = failed = 0
    total = len(results)
    for idx, entry in enumerate(results):
        if idx < start_from:
            continue
        try:
            uploaded = _process_entry(
                entry,
                f"[{idx + 1}/{total}]",
                data_root,
                ls_url,
                headers,
                project_id,
                dpi,
                max_pages,
            )
        except Exception as exc:
            # One bad entry (malformed record, HTTP error, unreadable file)
            # must not abort the remaining uploads.
            log.error(" FAILED: %s", exc)
            failed += 1
            continue
        if uploaded:
            success += 1
        else:
            skipped += 1

    # ── Summary ───────────────────────────────────────────────────────────────
    print("\n" + "═" * 48)
    print(f" Total entries : {total}")
    print(f" Uploaded : {success}")
    print(f" Skipped : {skipped} (file not found, empty PDF or unsupported type)")
    print(f" Failed : {failed}")
    print("═" * 48)
    print(f"\nOpen your project: {ls_url}/projects/{project_id}/")


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────
def _parse_args() -> argparse.Namespace:
    """Parse the command-line options of the uploader."""
    p = argparse.ArgumentParser(
        description="Upload DataRef files directly into Label Studio via API"
    )
    p.add_argument(
        "--results_json",
        type=Path,
        default=Path("batch_dataref_results.json"),
        help="Path to batch_dataref_results.json (default: ./batch_dataref_results.json)",
    )
    p.add_argument(
        "--data_root",
        type=Path,
        default=Path("C:/Users/azizmohamed.miladi_a/Desktop/GuichetOI_ML\\processed_dataref"),
        help="Root folder that contains the DataRef\\ sub-folders",
    )
    p.add_argument(
        "--ls_url",
        type=str,
        default="http://localhost:8081",
        help="Label Studio base URL (default: http://localhost:8081)",
    )
    # SECURITY: a real token used to be pasted here in place of the option
    # name. Never commit credentials; revoke that token and pass a new one
    # on the command line.
    p.add_argument(
        "--api_token",
        type=str,
        required=True,
        help=(
            "Label Studio API token. "
            "Find it at: LS → avatar (top right) → Account & Settings → Access Token"
        ),
    )
    p.add_argument(
        "--project_id",
        type=int,
        required=True,
        help="Label Studio project ID (visible in the URL when you open the project)",
    )
    p.add_argument(
        "--dpi",
        type=int,
        default=150,
        help="DPI for PDF rasterisation (default: 150 — lower = faster upload)",
    )
    p.add_argument(
        "--max_pages",
        type=int,
        default=3,
        help="Max pages to upload per PDF (default: 3 — avoids uploading 26-page docs)",
    )
    p.add_argument(
        "--start_from",
        type=int,
        default=0,
        help="Resume from this entry index if a previous run was interrupted",
    )
    return p.parse_args()


if __name__ == "__main__":
    args = _parse_args()
    run(
        results_json=args.results_json,
        data_root=args.data_root,
        ls_url=args.ls_url,
        api_token=args.api_token,
        project_id=args.project_id,
        dpi=args.dpi,
        max_pages=args.max_pages,
        start_from=args.start_from,
    )