FiberGate / scripts /label.py
AzizMiladi's picture
chore: git mv scripts, UI, dev tools, docs into folders
70c46cc
Raw
History Blame
14.8 kB
"""
upload_to_labelstudio.py
────────────────────────
Uploads every file from batch_dataref_results.json directly into Label Studio
via its REST API. No local file serving, no env variables needed.
How it works
────────────
1. Reads batch_dataref_results.json
2. For each entry:
- PDFs → rasterised to PNG pages with pdf2image, then uploaded as images
- PNGs/JPGs → uploaded directly
3. Each uploaded file gets a Label Studio task with:
- "image" → the hosted URL Label Studio assigns after upload
- "ocr" → extracted fields text (required by LS OCR template)
4. All tasks are created in the specified project via the API
Usage
─────
# First create a project in Label Studio UI, note its ID (shown in URL)
python upload_to_labelstudio.py --project_id 1
# Full options
python upload_to_labelstudio.py ^
--results_json batch_dataref_results.json ^
--data_root C:\\Users\\azizmohamed.miladi_a\\Desktop\\GuichetOI_ML\\processed_dataref ^
--ls_url http://localhost:8081 ^
--api_token YOUR_TOKEN_HERE ^
--project_id 1 ^
--dpi 150
Getting your API token
──────────────────────
Label Studio → top-right avatar → Account & Settings → Access Token
"""
import argparse
import json
import logging
import sys
import time
from io import BytesIO
from pathlib import Path, PureWindowsPath
# ── Third-party ───────────────────────────────────────────────────────────────
# Each third-party import is guarded so that a missing package aborts the
# script with the matching install command instead of a raw traceback.
try:
    import requests  # HTTP client used for every Label Studio API call below
except ImportError:
    sys.exit("pip install requests")
try:
    from PIL import Image  # image type referenced by the PNG/PDF helpers below
except ImportError:
    sys.exit("pip install Pillow")
# ── Logging ───────────────────────────────────────────────────────────────────
# Configure the root logger once at import time: INFO level, with a short
# time-of-day stamp and a padded level name in front of every message.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)
# Module-level logger shared by every function in this file.
log = logging.getLogger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def get_api_token(ls_url: str, username: str, password: str) -> str:
    """
    Log in with a username/password pair and return the account's API token.

    The credentials are sent as a JSON body to ``{ls_url}/api/token`` with a
    15-second timeout. Only needed when no token is available yet.

    Args:
        ls_url: Label Studio base URL (used as given, no normalisation).
        username: Account user name.
        password: Account password.

    Returns:
        The value of the ``"token"`` key in the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the response JSON has no ``"token"`` key.
    """
    endpoint = f"{ls_url}/api/token"
    credentials = {"username": username, "password": password}
    reply = requests.post(endpoint, json=credentials, timeout=15)
    reply.raise_for_status()
    body = reply.json()
    return body["token"]
def upload_image_bytes(
    ls_url: str,
    headers: dict,
    project_id: int,
    img_bytes: bytes,
    filename: str,
) -> str:
    """
    Upload raw image bytes to a Label Studio project and return the image URL.

    The bytes are sent as a multipart ``file`` field to
    ``{ls_url}/api/projects/{project_id}/import``. The multipart content type
    is derived from *filename* (so ``.jpg`` files are declared as
    ``image/jpeg``); anything that is not recognised as an image falls back
    to ``image/png``.

    Args:
        ls_url: Label Studio base URL without a trailing slash.
        headers: HTTP headers carrying the authorisation token.
        project_id: Target project ID.
        img_bytes: Encoded image content.
        filename: File name reported to the server.

    Returns:
        The ``data.image`` value of the first task when the server answers
        with a non-empty list of tasks, otherwise ``""``.

    Raises:
        RuntimeError: if the server does not answer with HTTP 201.
    """
    import mimetypes  # local import: only this helper needs it

    guessed_type, _ = mimetypes.guess_type(filename)
    if guessed_type and guessed_type.startswith("image/"):
        content_type = guessed_type
    else:
        content_type = "image/png"

    resp = requests.post(
        f"{ls_url}/api/projects/{project_id}/import",
        headers=headers,
        files={"file": (filename, BytesIO(img_bytes), content_type)},
        timeout=60,
    )
    if resp.status_code != 201:
        raise RuntimeError(
            f"Upload failed ({resp.status_code}): {resp.text[:200]}"
        )

    # NOTE(review): the documented import response is a summary object
    # (task_count, file_upload_ids, ...) rather than a task list; in that case
    # this returns "" and the caller falls back to a guessed URL. Confirm
    # against the Label Studio version in use.
    tasks = resp.json()
    if isinstance(tasks, list) and tasks and isinstance(tasks[0], dict):
        task_data = tasks[0].get("data") or {}
        if isinstance(task_data, dict):
            return task_data.get("image") or ""
    return ""
def create_task(
    ls_url: str,
    headers: dict,
    project_id: int,
    image_url: str,
    ocr_text: str,
    meta: dict,
) -> int:
    """
    Create a single task in a Label Studio project and return its ID.

    Args:
        ls_url: Label Studio base URL without a trailing slash.
        headers: HTTP headers carrying the authorisation token.
        project_id: Project the new task is attached to.
        image_url: URL stored under the task's ``image`` data key.
        ocr_text: Text stored under the task's ``ocr`` data key.
        meta: Optional extra values; the keys ``doc_class``,
            ``doc_confidence``, ``ocr_source``, ``source_file`` and (when
            present) ``page`` are copied into the task data.

    Returns:
        The ``id`` field of the JSON response, or ``-1`` if it is absent.

    Raises:
        RuntimeError: if the server answers with anything but HTTP 200/201.
    """
    task_data = {
        "image": image_url,
        "ocr": ocr_text,  # required by LS OCR template
        "doc_class": meta.get("doc_class", ""),
        "doc_confidence": meta.get("doc_confidence", 0),
        "ocr_source": meta.get("ocr_source", ""),
        "source_file": meta.get("source_file", ""),
    }
    # Callers pass the page index for rasterised PDFs; keep it on the task so
    # pages of the same document can be told apart.
    if "page" in meta:
        task_data["page"] = meta["page"]

    payload = {
        "data": task_data,
        # Without the project reference the task is not attached to the
        # project the caller asked for.
        "project": project_id,
    }
    resp = requests.post(
        # Trailing slash matches the documented endpoint and avoids a
        # redirect, which would turn the POST into a GET.
        f"{ls_url}/api/tasks/",
        headers={**headers, "Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )
    if resp.status_code not in (200, 201):
        raise RuntimeError(
            f"Task creation failed ({resp.status_code}): {resp.text[:200]}"
        )
    return resp.json().get("id", -1)
def pil_to_png_bytes(img: Image.Image) -> bytes:
    """Encode *img* as PNG entirely in memory and return the encoded bytes."""
    with BytesIO() as png_buffer:
        img.save(png_buffer, format="PNG")
        encoded = png_buffer.getvalue()
    return encoded
def pdf_to_pil_pages(pdf_path: Path, dpi: int = 150) -> list[Image.Image]:
    """
    Render every page of a PDF as an RGB PIL image.

    pdf2image is imported lazily inside the guarded section, so a missing
    package is handled the same way as a failed conversion: the error is
    logged and an empty list is returned.

    Args:
        pdf_path: Path of the PDF file to rasterise.
        dpi: Rendering resolution passed to pdf2image.

    Returns:
        One RGB image per page, or ``[]`` if anything went wrong.
    """
    rgb_pages: list[Image.Image] = []
    try:
        from pdf2image import convert_from_path

        for raw_page in convert_from_path(str(pdf_path), dpi=dpi, fmt="png"):
            rgb_pages.append(raw_page.convert("RGB"))
    except Exception as exc:
        log.error(" PDF rasterise failed for %s: %s", pdf_path.name, exc)
        # Discard any pages converted before the failure.
        return []
    return rgb_pages
# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────
def run(
    results_json: Path,
    data_root: Path,
    ls_url: str,
    api_token: str,
    project_id: int,
    dpi: int,
    max_pages: int,
    start_from: int,
) -> None:
    """
    Upload every entry of a batch-results JSON file to a Label Studio project.

    Steps:
      1. Check that the project is reachable with the given token; exit the
         process if it is not.
      2. Load ``results_json`` and iterate over its ``"results"`` list.
      3. For each entry, resolve the file under ``data_root``; PDFs are
         rasterised and uploaded page by page, PNG/JPG files are uploaded
         as-is, and one task per uploaded image is created.
      4. Print a summary of uploaded / skipped / failed entries.

    A failure on one entry (network error, malformed entry, ...) is logged
    and counted; it never aborts the remaining entries.

    Args:
        results_json: Path of the JSON file holding a ``"results"`` list.
        data_root: Folder the entries' relative ``image`` paths are joined to.
        ls_url: Label Studio base URL (a trailing slash is stripped).
        api_token: Token sent in the ``Authorization: Token ...`` header.
        project_id: Target Label Studio project ID.
        dpi: Resolution used when rasterising PDFs.
        max_pages: Maximum number of pages uploaded per PDF.
        start_from: Index of the first entry to process (earlier ones are
            ignored), for resuming an interrupted run.
    """
    ls_url = ls_url.rstrip("/")
    headers = {"Authorization": f"Token {api_token}"}

    # ── Verify connection ─────────────────────────────────────────────────────
    try:
        r = requests.get(f"{ls_url}/api/projects/{project_id}", headers=headers, timeout=10)
        r.raise_for_status()
        proj_name = r.json().get("title", "?")
        log.info("Connected to Label Studio — project %d: '%s'", project_id, proj_name)
    except Exception as exc:
        sys.exit(f"Cannot reach Label Studio at {ls_url}: {exc}")

    # ── Load results ──────────────────────────────────────────────────────────
    with open(results_json, encoding="utf-8") as f:
        data = json.load(f)
    results = data["results"]
    log.info("Loaded %d entries from %s", len(results), results_json)

    # ── Process each entry ────────────────────────────────────────────────────
    success = skipped = failed = 0
    for idx, entry in enumerate(results):
        if idx < start_from:
            continue
        # Everything that touches the entry lives inside the try so that one
        # malformed or unreachable entry is counted as failed instead of
        # aborting the whole batch.
        try:
            # Convert Windows backslash path → local absolute path
            rel_path = PureWindowsPath(entry["image"])
            local_path = data_root / rel_path
            log.info(
                "[%d/%d] %s (%s)",
                idx + 1, len(results), rel_path.name, entry["doc_class"]
            )
            if not local_path.exists():
                log.warning(" File not found: %s — skipping", local_path)
                skipped += 1
                continue

            # Build OCR text from extracted fields
            fields_text = "\n".join(
                f"{name}: {info['value']} (conf={info['confidence']})"
                for name, info in entry.get("fields", {}).items()
            )
            meta = {
                "doc_class": entry["doc_class"],
                "doc_confidence": entry["doc_confidence"],
                "ocr_source": entry["ocr_source"],
                "source_file": rel_path.as_posix(),
            }
            ext = local_path.suffix.lower()

            # ── PDF: rasterise each page and upload separately ────────────────
            if ext == ".pdf":
                pages = pdf_to_pil_pages(local_path, dpi=dpi)
                if not pages:
                    log.warning(" No pages extracted — skipping")
                    skipped += 1
                    continue
                pages = pages[:max_pages]  # limit pages per document
                log.info(" %d page(s) to upload", len(pages))
                for p_idx, page_img in enumerate(pages):
                    png_bytes = pil_to_png_bytes(page_img)
                    fname = f"{local_path.stem}_p{p_idx:03d}.png"
                    # Upload image file → get hosted URL
                    img_url = upload_image_bytes(
                        ls_url, headers, project_id, png_bytes, fname
                    )
                    # When the upload response carries no URL, fall back to
                    # the conventional upload location for this file name.
                    task_id = create_task(
                        ls_url, headers, project_id,
                        image_url=img_url or f"/data/upload/{fname}",
                        ocr_text=fields_text,
                        meta={**meta, "page": p_idx},
                    )
                    log.info(" Page %d → task %d", p_idx, task_id)
                    time.sleep(0.1)  # be gentle with the local server

            # ── Image: upload directly ────────────────────────────────────────
            elif ext in {".png", ".jpg", ".jpeg"}:
                img_bytes = local_path.read_bytes()
                fname = local_path.name
                img_url = upload_image_bytes(
                    ls_url, headers, project_id, img_bytes, fname
                )
                task_id = create_task(
                    ls_url, headers, project_id,
                    image_url=img_url or f"/data/upload/{fname}",
                    ocr_text=fields_text,
                    meta=meta,
                )
                log.info(" Uploaded → task %d", task_id)

            # ── Anything else: nothing was uploaded, so do not count it ───────
            else:
                log.warning(" Unsupported file type '%s' — skipping", ext)
                skipped += 1
                continue

            success += 1
        except Exception as exc:
            log.error(" FAILED: %s", exc)
            failed += 1
            continue

    # ── Summary ───────────────────────────────────────────────────────────────
    print("\n" + "═" * 48)
    print(f" Total entries : {len(results)}")
    print(f" Uploaded : {success}")
    print(f" Skipped : {skipped} (file not found, no pages, or unsupported type)")
    print(f" Failed : {failed}")
    print("═" * 48)
    print(f"\nOpen your project: {ls_url}/projects/{project_id}/")
# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────
def _parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """
    Parse the command-line options of the uploader.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Returns:
        A namespace with the attributes ``results_json``, ``data_root``,
        ``ls_url``, ``api_token``, ``project_id``, ``dpi``, ``max_pages`` and
        ``start_from``.

    ``--api_token`` and ``--project_id`` are mandatory; argparse exits with a
    usage message when either is missing.
    """
    p = argparse.ArgumentParser(
        description="Upload DataRef files directly into Label Studio via API"
    )
    p.add_argument(
        "--results_json",
        type=Path,
        default=Path("batch_dataref_results.json"),
        help="Path to batch_dataref_results.json (default: ./batch_dataref_results.json)",
    )
    p.add_argument(
        "--data_root",
        type=Path,
        default=Path("C:/Users/azizmohamed.miladi_a/Desktop/GuichetOI_ML\\processed_dataref"),
        help="Root folder that contains the DataRef\\ sub-folders",
    )
    p.add_argument(
        "--ls_url",
        type=str,
        default="http://localhost:8081",
        help="Label Studio base URL (default: http://localhost:8081)",
    )
    # The token must always be supplied on the command line; never paste a
    # real credential into this file.
    p.add_argument(
        "--api_token",
        type=str,
        required=True,
        help=(
            "Label Studio API token. "
            "Find it at: LS → avatar (top right) → Account & Settings → Access Token"
        ),
    )
    p.add_argument(
        "--project_id",
        type=int,
        required=True,
        help="Label Studio project ID (visible in the URL when you open the project)",
    )
    p.add_argument(
        "--dpi",
        type=int,
        default=150,
        help="DPI for PDF rasterisation (default: 150 — lower = faster upload)",
    )
    p.add_argument(
        "--max_pages",
        type=int,
        default=3,
        help="Max pages to upload per PDF (default: 3 — avoids uploading 26-page docs)",
    )
    p.add_argument(
        "--start_from",
        type=int,
        default=0,
        help="Resume from this entry index if a previous run was interrupted",
    )
    return p.parse_args(argv)
if __name__ == "__main__":
    # Script entry point: parse the command line once, then forward every
    # option to run() by keyword so the mapping stays explicit.
    cli = _parse_args()
    run(
        results_json=cli.results_json,
        data_root=cli.data_root,
        ls_url=cli.ls_url,
        api_token=cli.api_token,
        project_id=cli.project_id,
        dpi=cli.dpi,
        max_pages=cli.max_pages,
        start_from=cli.start_from,
    )