Spaces:
Sleeping
Sleeping
| """ | |
| upload_to_labelstudio.py | |
| ββββββββββββββββββββββββ | |
| Uploads every file from batch_dataref_results.json directly into Label Studio | |
| via its REST API. No local file serving, no env variables needed. | |
| How it works | |
| ββββββββββββ | |
| 1. Reads batch_dataref_results.json | |
| 2. For each entry: | |
| - PDFs β rasterised to PNG pages with pdf2image, then uploaded as images | |
| - PNGs/JPGs β uploaded directly | |
| 3. Each uploaded file gets a Label Studio task with: | |
| - "image" β the hosted URL Label Studio assigns after upload | |
| - "ocr" β extracted fields text (required by LS OCR template) | |
| 4. All tasks are created in the specified project via the API | |
| Usage | |
| βββββ | |
| # First create a project in Label Studio UI, note its ID (shown in URL) | |
| python upload_to_labelstudio.py --project_id 1 | |
| # Full options | |
| python upload_to_labelstudio.py ^ | |
| --results_json batch_dataref_results.json ^ | |
| --data_root C:\\Users\\azizmohamed.miladi_a\\Desktop\\GuichetOI_ML\\processed_dataref ^ | |
| --ls_url http://localhost:8081 ^ | |
| --api_token YOUR_TOKEN_HERE ^ | |
| --project_id 1 ^ | |
| --dpi 150 | |
| Getting your API token | |
| ββββββββββββββββββββββ | |
| Label Studio β top-right avatar β Account & Settings β Access Token | |
| """ | |
import argparse
import json
import logging
import mimetypes
import sys
import time
from io import BytesIO
from pathlib import Path, PureWindowsPath

# ── Third-party ───────────────────────────────────────────────────────────────
try:
    import requests
except ImportError:
    sys.exit("pip install requests")

try:
    from PIL import Image
except ImportError:
    sys.exit("pip install Pillow")
# ── Logging ───────────────────────────────────────────────────────────────────
# Configure the root logger once at import time: INFO level, with a short
# HH:MM:SS timestamp and a left-aligned, 8-character level name per record.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────
def get_api_token(ls_url: str, username: str, password: str) -> str:
    """Exchange a Label Studio username and password for an API token.

    Use this only if you don't have a token yet.

    Args:
        ls_url: Label Studio base URL; a trailing slash is tolerated.
        username: Login name sent in the JSON body.
        password: Password sent in the JSON body.

    Returns:
        The value stored under the ``"token"`` key of the JSON response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        KeyError: If the JSON response has no ``"token"`` key.
    """
    # Strip a trailing slash so the request never targets "//api/token".
    base_url = ls_url.rstrip("/")
    resp = requests.post(
        f"{base_url}/api/token",
        json={"username": username, "password": password},
        timeout=15,
    )
    resp.raise_for_status()
    return resp.json()["token"]
def upload_image_bytes(
    ls_url: str,
    headers: dict,
    project_id: int,
    img_bytes: bytes,
    filename: str,
) -> str:
    """Upload raw image bytes to a Label Studio project and return the image URL.

    The bytes are posted as a multipart file to the project's import endpoint.
    The content type is derived from ``filename`` so that JPEG uploads are not
    mislabelled as PNG; anything that is not recognised as an image falls back
    to ``image/png``.

    Args:
        ls_url: Label Studio base URL without a trailing slash.
        headers: HTTP headers carrying the authorisation token.
        project_id: ID of the target project.
        img_bytes: Encoded image file content.
        filename: File name reported to Label Studio (also drives the MIME type).

    Returns:
        The ``data.image`` value of the first task in the response when the
        response is a non-empty list of task dicts, otherwise an empty string.
        NOTE(review): some Label Studio versions appear to answer with a
        summary object instead of a task list, in which case this returns ""
        — confirm against the deployed version.

    Raises:
        RuntimeError: If the server does not answer with HTTP 201.
    """
    guessed_type, _ = mimetypes.guess_type(filename)
    if guessed_type and guessed_type.startswith("image/"):
        content_type = guessed_type
    else:
        content_type = "image/png"

    resp = requests.post(
        f"{ls_url}/api/projects/{project_id}/import",
        headers=headers,
        files={"file": (filename, BytesIO(img_bytes), content_type)},
        timeout=60,
    )
    if resp.status_code != 201:
        raise RuntimeError(
            f"Upload failed ({resp.status_code}): {resp.text[:200]}"
        )

    # Extract the image URL from the first returned task, if there is one.
    tasks = resp.json()
    if isinstance(tasks, list) and tasks and isinstance(tasks[0], dict):
        task_data = tasks[0].get("data") or {}
        if isinstance(task_data, dict):
            return task_data.get("image") or ""
    return ""
def create_task(
    ls_url: str,
    headers: dict,
    project_id: int,
    image_url: str,
    ocr_text: str,
    meta: dict,
) -> int:
    """Create a single task in a Label Studio project and return its ID.

    Args:
        ls_url: Label Studio base URL without a trailing slash.
        headers: HTTP headers carrying the authorisation token.
        project_id: ID of the project the task belongs to; sent as the
            ``project`` field of the request body.
        image_url: Value stored under the task's ``image`` data key.
        ocr_text: Value stored under the task's ``ocr`` data key.
        meta: Optional extra fields. ``doc_class``, ``doc_confidence``,
            ``ocr_source`` and ``source_file`` are copied with defaults when
            missing; ``page`` is copied only when present.

    Returns:
        The ``id`` of the created task, or ``-1`` if the response has no ``id``.

    Raises:
        RuntimeError: If the server answers with a status other than 200/201.
    """
    task_data = {
        "image": image_url,
        "ocr": ocr_text,  # required by LS OCR template
        "doc_class": meta.get("doc_class", ""),
        "doc_confidence": meta.get("doc_confidence", 0),
        "ocr_source": meta.get("ocr_source", ""),
        "source_file": meta.get("source_file", ""),
    }
    # Keep the page index for multi-page PDFs so pages stay distinguishable.
    if "page" in meta:
        task_data["page"] = meta["page"]

    # The task must name its project; without it the server cannot attach it.
    payload = {"project": project_id, "data": task_data}

    resp = requests.post(
        f"{ls_url}/api/tasks",
        headers={**headers, "Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )
    if resp.status_code not in (200, 201):
        raise RuntimeError(
            f"Task creation failed ({resp.status_code}): {resp.text[:200]}"
        )
    return resp.json().get("id", -1)
def pil_to_png_bytes(img: Image.Image) -> bytes:
    """Encode a PIL image as PNG entirely in memory.

    Args:
        img: The image to encode.

    Returns:
        The PNG file content as bytes.
    """
    buf = BytesIO()
    img.save(buf, format="PNG")
    return buf.getvalue()
def pdf_to_pil_pages(
    pdf_path: Path,
    dpi: int = 150,
    max_pages: "int | None" = None,
) -> list[Image.Image]:
    """Rasterise a PDF to a list of PIL RGB images (one per page).

    Args:
        pdf_path: Path of the PDF file to convert.
        dpi: Rendering resolution passed to pdf2image.
        max_pages: If given, only pages 1..max_pages are rendered, which
            avoids rasterising a long document when only the first pages are
            needed. ``None`` (the default) renders every page.

    Returns:
        The rendered pages converted to RGB, or an empty list if pdf2image is
        unavailable or the conversion fails for any reason (the error is
        logged, not raised).
    """
    try:
        # Imported lazily so the script still works for image-only batches
        # when pdf2image is not installed.
        from pdf2image import convert_from_path

        pages = convert_from_path(
            str(pdf_path),
            dpi=dpi,
            fmt="png",
            last_page=max_pages,
        )
        return [page.convert("RGB") for page in pages]
    except Exception as exc:
        log.error(" PDF rasterise failed for %s: %s", pdf_path.name, exc)
        return []
# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────
def run(
    results_json: Path,
    data_root: Path,
    ls_url: str,
    api_token: str,
    project_id: int,
    dpi: int,
    max_pages: int,
    start_from: int,
) -> None:
    """Upload every entry of a results file to a Label Studio project.

    Each entry of ``data["results"]`` must provide ``image`` (a Windows-style
    path relative to ``data_root``), ``doc_class``, ``doc_confidence`` and
    ``ocr_source``; an optional ``fields`` mapping of
    ``name -> {"value": ..., "confidence": ...}`` is flattened into the OCR
    text. PDFs are rasterised page by page, PNG/JPG files are uploaded as-is,
    and any other file type is skipped.

    Args:
        results_json: Path of the JSON results file.
        data_root: Folder the entries' ``image`` paths are relative to.
        ls_url: Label Studio base URL; a trailing slash is tolerated.
        api_token: Label Studio API token used for the Authorization header.
        project_id: ID of the target project.
        dpi: Resolution used when rasterising PDFs.
        max_pages: Maximum number of pages uploaded per PDF.
        start_from: Index of the first entry to process (earlier entries are
            ignored), for resuming an interrupted run.

    Raises:
        SystemExit: If the project cannot be fetched from Label Studio.
    """
    ls_url = ls_url.rstrip("/")
    headers = {"Authorization": f"Token {api_token}"}

    # ── Verify connection ─────────────────────────────────────────────────────
    try:
        r = requests.get(
            f"{ls_url}/api/projects/{project_id}", headers=headers, timeout=10
        )
        r.raise_for_status()
        proj_name = r.json().get("title", "?")
        log.info(
            "Connected to Label Studio — project %d: '%s'", project_id, proj_name
        )
    except Exception as exc:
        sys.exit(f"Cannot reach Label Studio at {ls_url}: {exc}")

    # ── Load results ──────────────────────────────────────────────────────────
    with open(results_json, encoding="utf-8") as f:
        data = json.load(f)
    results = data["results"]
    total = len(results)
    log.info("Loaded %d entries from %s", total, results_json)

    image_exts = {".png", ".jpg", ".jpeg"}

    # ── Process each entry ────────────────────────────────────────────────────
    success = skipped = failed = 0
    for idx, entry in enumerate(results):
        if idx < start_from:
            continue

        # Convert Windows backslash path → local absolute path
        rel_path = PureWindowsPath(entry["image"])
        local_path = data_root / rel_path
        log.info(
            "[%d/%d] %s (%s)",
            idx + 1, total, rel_path.name, entry.get("doc_class", "?"),
        )

        if not local_path.exists():
            log.warning(" File not found: %s — skipping", local_path)
            skipped += 1
            continue

        ext = local_path.suffix.lower()
        if ext != ".pdf" and ext not in image_exts:
            # Without this guard the entry would be counted as uploaded
            # although nothing was sent.
            log.warning(" Unsupported file type '%s' — skipping", ext)
            skipped += 1
            continue

        try:
            # Built inside the try so a malformed entry is counted as failed
            # instead of aborting the whole batch.
            fields_text = "\n".join(
                f"{name}: {info['value']} (conf={info['confidence']})"
                for name, info in entry.get("fields", {}).items()
            )
            meta = {
                "doc_class": entry["doc_class"],
                "doc_confidence": entry["doc_confidence"],
                "ocr_source": entry["ocr_source"],
                "source_file": rel_path.as_posix(),
            }

            # NOTE(review): the import endpoint used by upload_image_bytes
            # presumably already creates a task for the uploaded file, so the
            # create_task call below may produce a second task per image —
            # confirm against the Label Studio version in use.

            # ── PDF: rasterise each page and upload separately ────────────────
            if ext == ".pdf":
                pages = pdf_to_pil_pages(local_path, dpi=dpi)
                if not pages:
                    log.warning(" No pages extracted — skipping")
                    skipped += 1
                    continue
                pages = pages[:max_pages]  # limit pages per document
                log.info(" %d page(s) to upload", len(pages))
                for p_idx, page_img in enumerate(pages):
                    png_bytes = pil_to_png_bytes(page_img)
                    fname = f"{local_path.stem}_p{p_idx:03d}.png"
                    # Upload image file → get hosted URL
                    img_url = upload_image_bytes(
                        ls_url, headers, project_id, png_bytes, fname
                    )
                    # Fall back to the conventional upload path when the
                    # upload response did not expose a URL.
                    task_id = create_task(
                        ls_url, headers, project_id,
                        image_url=img_url or f"/data/upload/{fname}",
                        ocr_text=fields_text,
                        meta={**meta, "page": p_idx},
                    )
                    log.info(" Page %d → task %d", p_idx, task_id)
                    time.sleep(0.1)  # be gentle with the local server

            # ── Image: upload directly ────────────────────────────────────────
            else:
                img_bytes = local_path.read_bytes()
                fname = local_path.name
                img_url = upload_image_bytes(
                    ls_url, headers, project_id, img_bytes, fname
                )
                task_id = create_task(
                    ls_url, headers, project_id,
                    image_url=img_url or f"/data/upload/{fname}",
                    ocr_text=fields_text,
                    meta=meta,
                )
                log.info(" Uploaded → task %d", task_id)

            success += 1
        except Exception as exc:
            log.error(" FAILED: %s", exc)
            failed += 1
            continue

    # ── Summary ───────────────────────────────────────────────────────────────
    print("\n" + "─" * 48)
    print(f" Total entries : {total}")
    print(f" Uploaded : {success}")
    print(f" Skipped : {skipped} (missing, unsupported or empty)")
    print(f" Failed : {failed}")
    print("─" * 48)
    print(f"\nOpen your project: {ls_url}/projects/{project_id}/")
# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────
def _parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse the command-line options of the uploader.

    The API token and the project ID are required options
    (``--api_token`` / ``--project_id``). They are always supplied by the
    caller at run time; credentials must never be written into this file.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Returns:
        A namespace with the attributes ``results_json``, ``data_root``,
        ``ls_url``, ``api_token``, ``project_id``, ``dpi``, ``max_pages`` and
        ``start_from``.
    """
    p = argparse.ArgumentParser(
        description="Upload DataRef files directly into Label Studio via API"
    )
    p.add_argument(
        "--results_json",
        type=Path,
        default=Path("batch_dataref_results.json"),
        help="Path to batch_dataref_results.json (default: ./batch_dataref_results.json)",
    )
    p.add_argument(
        "--data_root",
        type=Path,
        default=Path("C:/Users/azizmohamed.miladi_a/Desktop/GuichetOI_ML\\processed_dataref"),
        help="Root folder that contains the DataRef\\ sub-folders",
    )
    p.add_argument(
        "--ls_url",
        type=str,
        default="http://localhost:8081",
        help="Label Studio base URL (default: http://localhost:8081)",
    )
    # The first argument is the option NAME, never the secret itself.
    p.add_argument(
        "--api_token",
        type=str,
        required=True,
        help=(
            "Label Studio API token. "
            "Find it at: LS → avatar (top right) → Account & Settings → Access Token"
        ),
    )
    p.add_argument(
        "--project_id",
        type=int,
        required=True,
        help="Label Studio project ID (visible in the URL when you open the project)",
    )
    p.add_argument(
        "--dpi",
        type=int,
        default=150,
        help="DPI for PDF rasterisation (default: 150 — lower = faster upload)",
    )
    p.add_argument(
        "--max_pages",
        type=int,
        default=3,
        help="Max pages to upload per PDF (default: 3 — avoids uploading 26-page docs)",
    )
    p.add_argument(
        "--start_from",
        type=int,
        default=0,
        help="Resume from this entry index if a previous run was interrupted",
    )
    return p.parse_args(argv)
# Script entry point: parse the CLI options and hand them to run() by name.
if __name__ == "__main__":
    args = _parse_args()
    run(
        results_json=args.results_json,
        data_root=args.data_root,
        ls_url=args.ls_url,
        api_token=args.api_token,
        project_id=args.project_id,
        dpi=args.dpi,
        max_pages=args.max_pages,
        start_from=args.start_from,
    )