Spaces:

AzizMiladi
/

FiberGate

Sleeping

App Files Files

FiberGate / scripts /label.py

AzizMiladi

chore: git mv scripts, UI, dev tools, docs into folders

70c46cc about 1 month ago

Raw

History Blame

14.8 kB

	"""
	upload_to_labelstudio.py
	────────────────────────
	Uploads every file from batch_dataref_results.json directly into Label Studio
	via its REST API. No local file serving, no env variables needed.

	How it works
	────────────
	1. Reads batch_dataref_results.json
	2. For each entry:
	- PDFs → rasterised to PNG pages with pdf2image, then uploaded as images
	- PNGs/JPGs → uploaded directly
	3. Each uploaded file gets a Label Studio task with:
	- "image" → the hosted URL Label Studio assigns after upload
	- "ocr" → extracted fields text (required by LS OCR template)
	4. All tasks are created in the specified project via the API

	Usage
	─────
	# First create a project in Label Studio UI, note its ID (shown in URL)
	python upload_to_labelstudio.py --api_token YOUR_TOKEN_HERE --project_id 1

	# Full options
	python upload_to_labelstudio.py ^
	--results_json batch_dataref_results.json ^
	--data_root C:\\Users\\azizmohamed.miladi_a\\Desktop\\GuichetOI_ML\\processed_dataref ^
	--ls_url http://localhost:8081 ^
	--api_token YOUR_TOKEN_HERE ^
	--project_id 1 ^
	--dpi 150

	Getting your API token
	──────────────────────
	Label Studio → top-right avatar → Account & Settings → Access Token
	"""

	import argparse
	import json
	import logging
	import sys
	import time
	from io import BytesIO
	from pathlib import Path, PureWindowsPath

	# ── Third-party ───────────────────────────────────────────────────────────────
	try:
	import requests
	except ImportError:
	sys.exit("pip install requests")

	try:
	from PIL import Image
	except ImportError:
	sys.exit("pip install Pillow")

	# ── Logging ───────────────────────────────────────────────────────────────────
	# Configure the root logger once at import time: INFO level, lines formatted
	# as "HH:MM:SS LEVEL message". `log` is the module-level logger that the
	# functions in this file write to.
	logging.basicConfig(
	level=logging.INFO,
	format="%(asctime)s %(levelname)-8s %(message)s",
	datefmt="%H:%M:%S",
	)
	log = logging.getLogger(__name__)

	# ─────────────────────────────────────────────────────────────────────────────
	# HELPERS
	# ─────────────────────────────────────────────────────────────────────────────

	def get_api_token(ls_url: str, username: str, password: str) -> str:
	    """Trade a Label Studio username/password pair for an API token.

	    Intended only for the case where no access token is at hand yet.

	    Args:
	        ls_url: Base URL of the Label Studio server.
	        username: Account login.
	        password: Account password.

	    Returns:
	        The value of the ``"token"`` field of the JSON response.

	    Raises:
	        requests.HTTPError: the server answered with an error status.
	        KeyError: the JSON response has no ``"token"`` field.
	    """
	    credentials = {"username": username, "password": password}
	    token_endpoint = f"{ls_url}/api/token"
	    response = requests.post(token_endpoint, json=credentials, timeout=15)
	    response.raise_for_status()
	    body = response.json()
	    return body["token"]


	def upload_image_bytes(
	    ls_url: str,
	    headers: dict,
	    project_id: int,
	    img_bytes: bytes,
	    filename: str,
	) -> str:
	    """Upload raw image bytes to a Label Studio project and return the image URL.

	    The bytes are posted as a multipart file to the project's import
	    endpoint. The declared content type is derived from ``filename`` so that
	    JPEG uploads are no longer labelled as PNG; anything that is not
	    recognised as an image falls back to ``image/png``.

	    Args:
	        ls_url: Base URL of the Label Studio server (no trailing slash).
	        headers: HTTP headers to send (carries the authorisation header).
	        project_id: ID of the target project.
	        img_bytes: Encoded image content.
	        filename: File name reported to the server.

	    Returns:
	        The ``data.image`` value of the first task in the response, or ``""``
	        when the response is not a non-empty list of task objects. Callers
	        treat the empty string as "URL unknown".

	    Raises:
	        RuntimeError: the server did not answer with HTTP 201.
	    """
	    # Function-scope import keeps this block self-contained.
	    import mimetypes

	    content_type, _ = mimetypes.guess_type(filename)
	    if not content_type or not content_type.startswith("image/"):
	        content_type = "image/png"

	    resp = requests.post(
	        f"{ls_url}/api/projects/{project_id}/import",
	        headers=headers,
	        files={"file": (filename, BytesIO(img_bytes), content_type)},
	        timeout=60,
	    )
	    if resp.status_code != 201:
	        raise RuntimeError(
	            f"Upload failed ({resp.status_code}): {resp.text[:200]}"
	        )

	    # NOTE(review): only a list-of-tasks response is understood here; any
	    # other JSON shape yields "" — confirm what the deployed Label Studio
	    # version actually returns from the import endpoint.
	    tasks = resp.json()
	    if isinstance(tasks, list) and tasks:
	        first = tasks[0]
	        if isinstance(first, dict):
	            # `or {}` / `or ""` guard against explicit nulls in the payload.
	            return (first.get("data") or {}).get("image", "") or ""
	    return ""


	def create_task(
	    ls_url: str,
	    headers: dict,
	    project_id: int,
	    image_url: str,
	    ocr_text: str,
	    meta: dict,
	) -> int:
	    """Create a single task in a Label Studio project and return its ID.

	    Args:
	        ls_url: Base URL of the Label Studio server (no trailing slash).
	        headers: HTTP headers to send (carries the authorisation header).
	        project_id: ID of the project the task is attached to.
	        image_url: Value stored under the task's ``image`` data key.
	        ocr_text: Value stored under the task's ``ocr`` data key.
	        meta: Optional extra fields; ``doc_class``, ``doc_confidence``,
	            ``ocr_source``, ``source_file`` and (when present) ``page`` are
	            copied into the task data.

	    Returns:
	        The ``id`` field of the JSON response, or ``-1`` if it is absent.

	    Raises:
	        RuntimeError: the server answered with a status other than 200/201.
	    """
	    task_data = {
	        "image": image_url,
	        "ocr": ocr_text,  # required by LS OCR template
	        "doc_class": meta.get("doc_class", ""),
	        "doc_confidence": meta.get("doc_confidence", 0),
	        "ocr_source": meta.get("ocr_source", ""),
	        "source_file": meta.get("source_file", ""),
	    }
	    # Callers add a page index for rasterised PDFs; keep it instead of
	    # silently dropping it.
	    if "page" in meta:
	        task_data["page"] = meta["page"]

	    # The project ID was previously accepted but never sent, leaving the
	    # request without any link to the target project.
	    payload = {"project": project_id, "data": task_data}

	    resp = requests.post(
	        f"{ls_url}/api/tasks",
	        headers={**headers, "Content-Type": "application/json"},
	        json=payload,
	        timeout=30,
	    )
	    if resp.status_code not in (200, 201):
	        raise RuntimeError(
	            f"Task creation failed ({resp.status_code}): {resp.text[:200]}"
	        )
	    return resp.json().get("id", -1)


	def pil_to_png_bytes(img: Image.Image) -> bytes:
	    """Encode a PIL image as PNG entirely in memory and return the bytes."""
	    with BytesIO() as png_buffer:
	        img.save(png_buffer, format="PNG")
	        return png_buffer.getvalue()


	def pdf_to_pil_pages(pdf_path: Path, dpi: int = 150) -> list[Image.Image]:
	    """Render a PDF into RGB PIL images, one per page.

	    Best-effort: any failure (including a missing ``pdf2image`` package) is
	    logged and reported to the caller as an empty list.
	    """
	    rgb_pages = []
	    try:
	        # Imported lazily so the dependency is only needed for PDF inputs.
	        from pdf2image import convert_from_path

	        for raw_page in convert_from_path(str(pdf_path), dpi=dpi, fmt="png"):
	            rgb_pages.append(raw_page.convert("RGB"))
	    except Exception as exc:
	        log.error(" PDF rasterise failed for %s: %s", pdf_path.name, exc)
	        return []
	    return rgb_pages


	# ─────────────────────────────────────────────────────────────────────────────
	# MAIN
	# ─────────────────────────────────────────────────────────────────────────────

	def run(
	    results_json: Path,
	    data_root: Path,
	    ls_url: str,
	    api_token: str,
	    project_id: int,
	    dpi: int,
	    max_pages: int,
	    start_from: int,
	) -> None:
	    """Upload every entry of a batch results file into one Label Studio project.

	    For each entry from index ``start_from`` onwards the referenced file is
	    looked up under ``data_root``. PDFs are rasterised and at most
	    ``max_pages`` pages are uploaded; PNG/JPEG files are uploaded as-is. A
	    task carrying the extracted-field text is created for every upload, and
	    a summary is printed at the end.

	    Args:
	        results_json: JSON file with a top-level ``"results"`` list.
	        data_root: Folder the entries' relative ``image`` paths resolve against.
	        ls_url: Label Studio base URL (a trailing slash is tolerated).
	        api_token: Token sent in the ``Authorization`` header.
	        project_id: Target project ID.
	        dpi: Resolution used when rasterising PDFs.
	        max_pages: Upper bound on pages uploaded per PDF.
	        start_from: Index of the first entry to process (for resuming).

	    Raises:
	        SystemExit: the project could not be fetched from ``ls_url``.
	        KeyError: the file has no ``"results"`` key, or an entry has no
	            ``"image"`` key.
	    """
	    ls_url = ls_url.rstrip("/")
	    # NOTE(review): this sends the "Token <key>" scheme; confirm it matches
	    # the kind of token the target Label Studio version issues.
	    headers = {"Authorization": f"Token {api_token}"}

	    # ── Verify connection ─────────────────────────────────────────────────────
	    try:
	        r = requests.get(
	            f"{ls_url}/api/projects/{project_id}", headers=headers, timeout=10
	        )
	        r.raise_for_status()
	        proj_name = r.json().get("title", "?")
	        log.info("Connected to Label Studio — project %d: '%s'", project_id, proj_name)
	    except Exception as exc:
	        sys.exit(f"Cannot reach Label Studio at {ls_url}: {exc}")

	    # ── Load results ──────────────────────────────────────────────────────────
	    with open(results_json, encoding="utf-8") as f:
	        data = json.load(f)

	    results = data["results"]
	    log.info("Loaded %d entries from %s", len(results), results_json)

	    # ── Process each entry ────────────────────────────────────────────────────
	    success = skipped = failed = 0
	    image_exts = {".png", ".jpg", ".jpeg"}

	    for idx, entry in enumerate(results):
	        if idx < start_from:
	            continue

	        # Entries store Windows-style relative paths; re-root them locally.
	        rel_path = PureWindowsPath(entry["image"])
	        local_path = data_root / rel_path

	        log.info(
	            "[%d/%d] %s (%s)",
	            idx + 1, len(results), rel_path.name, entry.get("doc_class", "?")
	        )

	        if not local_path.exists():
	            log.warning(" File not found: %s — skipping", local_path)
	            skipped += 1
	            continue

	        ext = local_path.suffix.lower()
	        if ext != ".pdf" and ext not in image_exts:
	            # Previously such files fell through both branches and were
	            # still counted as uploaded.
	            log.warning(" Unsupported file type '%s' — skipping", ext)
	            skipped += 1
	            continue

	        try:
	            # Built inside the try so one malformed entry is counted as
	            # failed instead of aborting the whole batch.
	            fields_text = "\n".join(
	                f"{name}: {info['value']} (conf={info['confidence']})"
	                for name, info in (entry.get("fields") or {}).items()
	            )
	            meta = {
	                "doc_class": entry["doc_class"],
	                "doc_confidence": entry["doc_confidence"],
	                "ocr_source": entry["ocr_source"],
	                "source_file": rel_path.as_posix(),
	            }

	            # ── PDF: rasterise each page and upload separately ────────────────
	            if ext == ".pdf":
	                pages = pdf_to_pil_pages(local_path, dpi=dpi)
	                if not pages:
	                    log.warning(" No pages extracted — skipping")
	                    skipped += 1
	                    continue

	                pages = pages[:max_pages]  # limit pages per document
	                log.info(" %d page(s) to upload", len(pages))

	                for p_idx, page_img in enumerate(pages):
	                    fname = f"{local_path.stem}_p{p_idx:03d}.png"
	                    img_url = upload_image_bytes(
	                        ls_url, headers, project_id,
	                        pil_to_png_bytes(page_img), fname,
	                    )
	                    # Same fallback as the image branch when the upload
	                    # response carried no URL.
	                    task_id = create_task(
	                        ls_url, headers, project_id,
	                        image_url=img_url or f"/data/upload/{fname}",
	                        ocr_text=fields_text,
	                        meta={**meta, "page": p_idx},
	                    )
	                    log.info(" Page %d → task %d", p_idx, task_id)
	                    time.sleep(0.1)  # be gentle with the local server

	            # ── Image: upload directly ────────────────────────────────────────
	            else:
	                fname = local_path.name
	                img_url = upload_image_bytes(
	                    ls_url, headers, project_id, local_path.read_bytes(), fname
	                )
	                task_id = create_task(
	                    ls_url, headers, project_id,
	                    image_url=img_url or f"/data/upload/{fname}",
	                    ocr_text=fields_text,
	                    meta=meta,
	                )
	                log.info(" Uploaded → task %d", task_id)

	            success += 1

	        except Exception as exc:
	            # Per-entry boundary: record the failure and keep going.
	            log.error(" FAILED: %s", exc)
	            failed += 1

	    # ── Summary ───────────────────────────────────────────────────────────────
	    print("\n" + "═" * 48)
	    print(f" Total entries : {len(results)}")
	    print(f" Uploaded : {success}")
	    print(f" Skipped : {skipped} (missing, empty or unsupported)")
	    print(f" Failed : {failed}")
	    print("═" * 48)
	    print(f"\nOpen your project: {ls_url}/projects/{project_id}/")


	# ─────────────────────────────────────────────────────────────────────────────
	# CLI
	# ─────────────────────────────────────────────────────────────────────────────

	def _parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
	    """Build the command-line parser and parse ``argv``.

	    Args:
	        argv: Argument strings to parse. ``None`` (the default) makes
	            argparse read ``sys.argv[1:]``, which is what the script entry
	            point relies on.

	    Returns:
	        Namespace with ``results_json``, ``data_root``, ``ls_url``,
	        ``api_token``, ``project_id``, ``dpi``, ``max_pages`` and
	        ``start_from``.
	    """
	    p = argparse.ArgumentParser(
	        description="Upload DataRef files directly into Label Studio via API"
	    )
	    p.add_argument(
	        "--results_json",
	        type=Path,
	        default=Path("batch_dataref_results.json"),
	        help="Path to batch_dataref_results.json (default: ./batch_dataref_results.json)",
	    )
	    p.add_argument(
	        "--data_root",
	        type=Path,
	        default=Path("C:/Users/azizmohamed.miladi_a/Desktop/GuichetOI_ML\\processed_dataref"),
	        help="Root folder that contains the DataRef\\ sub-folders",
	    )
	    p.add_argument(
	        "--ls_url",
	        type=str,
	        default="http://localhost:8081",
	        help="Label Studio base URL (default: http://localhost:8081)",
	    )
	    # SECURITY: a real token had been pasted here in place of the flag name.
	    # It must never live in source — rotate that token and pass the new one
	    # on the command line.
	    p.add_argument(
	        "--api_token",
	        type=str,
	        required=True,
	        help=(
	            "Label Studio API token. "
	            "Find it at: LS → avatar (top right) → Account & Settings → Access Token"
	        ),
	    )
	    # The flag name had likewise been replaced by a project URL; argparse
	    # needs the option string so the value lands in `project_id`.
	    p.add_argument(
	        "--project_id",
	        type=int,
	        required=True,
	        help="Label Studio project ID (visible in the URL when you open the project)",
	    )
	    p.add_argument(
	        "--dpi",
	        type=int,
	        default=150,
	        help="DPI for PDF rasterisation (default: 150 — lower = faster upload)",
	    )
	    p.add_argument(
	        "--max_pages",
	        type=int,
	        default=3,
	        help="Max pages to upload per PDF (default: 3 — avoids uploading 26-page docs)",
	    )
	    p.add_argument(
	        "--start_from",
	        type=int,
	        default=0,
	        help="Resume from this entry index if a previous run was interrupted",
	    )
	    return p.parse_args(argv)


	if __name__ == "__main__":
	    # Script entry point: parse the command line, then forward every option
	    # to run() by keyword.
	    cli = _parse_args()
	    run(
	        results_json=cli.results_json,
	        data_root=cli.data_root,
	        ls_url=cli.ls_url,
	        api_token=cli.api_token,
	        project_id=cli.project_id,
	        dpi=cli.dpi,
	        max_pages=cli.max_pages,
	        start_from=cli.start_from,
	    )