# Source: Hugging Face Spaces — AzizMiladi / FiberGate (status: Sleeping)
# File size: 14,839 Bytes
# Revision: 33ddb61

"""
upload_to_labelstudio.py
────────────────────────
Uploads every file from batch_dataref_results.json directly into Label Studio
via its REST API. No local file serving, no env variables needed.

How it works
────────────
1. Reads batch_dataref_results.json
2. For each entry:
   - PDFs  → rasterised to PNG pages with pdf2image, then uploaded as images
   - PNGs/JPGs → uploaded directly
3. Each uploaded file gets a Label Studio task with:
   - "image" → the hosted URL Label Studio assigns after upload
   - "ocr"   → extracted fields text (required by LS OCR template)
4. All tasks are created in the specified project via the API

Usage
─────
    # First create a project in Label Studio UI, note its ID (shown in URL)
    python upload_to_labelstudio.py --project_id 1

    # Full options
    python upload_to_labelstudio.py ^
        --results_json  batch_dataref_results.json ^
        --data_root     C:\\Users\\azizmohamed.miladi_a\\Desktop\\GuichetOI_ML\\processed_dataref ^
        --ls_url        http://localhost:8081 ^
        --api_token     YOUR_TOKEN_HERE ^
        --project_id    1 ^
        --dpi           150

Getting your API token
──────────────────────
    Label Studio → top-right avatar → Account & Settings → Access Token
"""

import argparse
import json
import logging
import sys
import time
from io import BytesIO
from pathlib import Path, PureWindowsPath

# ── Third-party ───────────────────────────────────────────────────────────────
try:
    import requests
except ImportError:
    sys.exit("pip install requests")

try:
    from PIL import Image
except ImportError:
    sys.exit("pip install Pillow")

# ── Logging ───────────────────────────────────────────────────────────────────
# Configure logging once at import time: INFO level, each record prefixed
# with the time of day (HH:MM:SS) and a left-aligned, 8-character level name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s  %(levelname)-8s  %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger, named after this module.
log = logging.getLogger(__name__)

# ─────────────────────────────────────────────────────────────────────────────
# HELPERS
# ─────────────────────────────────────────────────────────────────────────────

def get_api_token(ls_url: str, username: str, password: str) -> str:
    """
    Trade a Label Studio username/password pair for an API token.

    Intended only for the case where no token has been issued yet.

    Args:
        ls_url:   Label Studio base URL; "/api/token" is appended to it as-is.
        username: Account user name sent in the JSON body.
        password: Account password sent in the JSON body.

    Returns:
        The value of the "token" field of the JSON response.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (raised by ``raise_for_status``).
        KeyError: if the JSON response has no "token" field.
    """
    credentials = {"username": username, "password": password}
    token_endpoint = f"{ls_url}/api/token"

    response = requests.post(token_endpoint, json=credentials, timeout=15)
    response.raise_for_status()

    body = response.json()
    return body["token"]


def upload_image_bytes(
    ls_url: str,
    headers: dict,
    project_id: int,
    img_bytes: bytes,
    filename: str,
) -> str:
    """
    Upload raw image bytes to a Label Studio project's import endpoint.

    Args:
        ls_url:     Label Studio base URL (no trailing slash expected).
        headers:    HTTP headers to send, e.g. the Authorization header.
        project_id: ID of the project that receives the file.
        img_bytes:  Encoded image content (PNG or JPEG bytes).
        filename:   File name reported to the server; its extension also
                    determines the Content-Type of the multipart part.

    Returns:
        The "image" URL of the first task in the response when the response
        is a non-empty list of tasks; otherwise an empty string.

    Raises:
        RuntimeError: if the server does not answer with HTTP 201.
    """
    import mimetypes

    # Derive the MIME type from the file name so JPEG uploads are not
    # mislabelled as PNG; keep "image/png" as the fallback for unknown names.
    content_type = mimetypes.guess_type(filename)[0] or "image/png"

    resp = requests.post(
        f"{ls_url}/api/projects/{project_id}/import",
        headers=headers,
        files={"file": (filename, BytesIO(img_bytes), content_type)},
        timeout=60,
    )
    if resp.status_code != 201:
        raise RuntimeError(
            f"Upload failed ({resp.status_code}): {resp.text[:200]}"
        )

    # NOTE(review): this assumes the endpoint answers with a list of created
    # tasks. If the server returns a summary object instead, no URL can be
    # extracted and "" is returned — confirm against the deployed version.
    tasks = resp.json()
    if isinstance(tasks, list) and tasks:
        return tasks[0].get("data", {}).get("image", "")
    return ""


def create_task(
    ls_url: str,
    headers: dict,
    project_id: int,
    image_url: str,
    ocr_text: str,
    meta: dict,
) -> int:
    """
    Create a single task in Label Studio and return its ID.

    Args:
        ls_url:     Label Studio base URL (no trailing slash expected).
        headers:    HTTP headers to send, e.g. the Authorization header.
        project_id: ID of the project the task is created in.
        image_url:  URL stored under the task's "image" data key.
        ocr_text:   Text stored under the task's "ocr" data key.
        meta:       Optional extra fields; "doc_class", "doc_confidence",
                    "ocr_source" and "source_file" are copied with defaults,
                    and "page" is copied only when present.

    Returns:
        The "id" field of the JSON response, or -1 if it has none.

    Raises:
        RuntimeError: if the server answers with a status other than 200/201.
    """
    task_data = {
        "image":           image_url,
        "ocr":             ocr_text,     # required by LS OCR template
        "doc_class":       meta.get("doc_class", ""),
        "doc_confidence":  meta.get("doc_confidence", 0),
        "ocr_source":      meta.get("ocr_source", ""),
        "source_file":     meta.get("source_file", ""),
    }
    # Multi-page PDFs pass the page index in meta; keep it so pages of the
    # same document remain distinguishable in the project.
    if "page" in meta:
        task_data["page"] = meta["page"]

    # The task must name its project explicitly — the endpoint is not
    # project-scoped, so project_id has to travel in the request body.
    payload = {"project": project_id, "data": task_data}

    resp = requests.post(
        # Trailing slash matches the documented endpoint, so the POST does
        # not depend on a server-side redirect.
        f"{ls_url}/api/tasks/",
        headers={**headers, "Content-Type": "application/json"},
        json=payload,
        timeout=30,
    )
    if resp.status_code not in (200, 201):
        raise RuntimeError(
            f"Task creation failed ({resp.status_code}): {resp.text[:200]}"
        )
    return resp.json().get("id", -1)


def pil_to_png_bytes(img: Image.Image) -> bytes:
    """Encode a PIL image as PNG entirely in memory and return the bytes."""
    with BytesIO() as stream:
        img.save(stream, format="PNG")
        encoded = stream.getvalue()
    return encoded


def pdf_to_pil_pages(pdf_path: Path, dpi: int = 150) -> list[Image.Image]:
    """
    Render every page of a PDF as a PIL image in RGB mode.

    Args:
        pdf_path: Location of the PDF file.
        dpi:      Rendering resolution handed to pdf2image.

    Returns:
        One RGB image per page, in page order. Any failure — including
        pdf2image not being importable — is logged and yields an empty list.
    """
    try:
        # Imported lazily inside the try so a missing pdf2image install is
        # reported like any other rasterisation failure.
        from pdf2image import convert_from_path

        rendered = convert_from_path(str(pdf_path), dpi=dpi, fmt="png")
        rgb_pages = []
        for page in rendered:
            rgb_pages.append(page.convert("RGB"))
        return rgb_pages
    except Exception as exc:
        log.error("  PDF rasterise failed for %s: %s", pdf_path.name, exc)
        return []


# ─────────────────────────────────────────────────────────────────────────────
# MAIN
# ─────────────────────────────────────────────────────────────────────────────

# File suffixes (lower-case) that are uploaded as-is, without rasterising.
_IMAGE_SUFFIXES = frozenset({".png", ".jpg", ".jpeg"})


def _upload_and_create_task(
    ls_url: str,
    headers: dict,
    project_id: int,
    img_bytes: bytes,
    fname: str,
    ocr_text: str,
    meta: dict,
) -> int:
    """Upload one image, then create its metadata-bearing task; return the task ID."""
    img_url = upload_image_bytes(ls_url, headers, project_id, img_bytes, fname)
    # NOTE(review): create_task always creates a NEW task; it does not update
    # whatever task the upload itself may have created — check the project
    # for duplicates.
    return create_task(
        ls_url, headers, project_id,
        # The upload may not report a hosted URL; fall back to the
        # conventional upload path built from the file name.
        image_url=img_url or f"/data/upload/{fname}",
        ocr_text=ocr_text,
        meta=meta,
    )


def run(
    results_json: Path,
    data_root:    Path,
    ls_url:       str,
    api_token:    str,
    project_id:   int,
    dpi:          int,
    max_pages:    int,
    start_from:   int,
) -> None:
    """
    Upload every entry of a results JSON file into a Label Studio project.

    Args:
        results_json: JSON file whose top-level "results" list holds the entries.
        data_root:    Folder that each entry's Windows-style "image" path is
                      resolved against.
        ls_url:       Label Studio base URL; a trailing slash is stripped.
        api_token:    Token sent as "Authorization: Token <api_token>".
        project_id:   Target Label Studio project.
        dpi:          Resolution used when rasterising PDFs.
        max_pages:    Upper bound on pages uploaded per PDF.
        start_from:   Entries with an index below this value are ignored.

    Behaviour:
        Exits the process (``sys.exit``) if the project cannot be fetched.
        An entry is counted as *skipped* when its file is missing, its PDF
        yields no pages, or its extension is unsupported; as *failed* when
        any exception occurs while handling it (including a missing key in
        the entry); and as *uploaded* otherwise. A summary is printed at
        the end.
    """
    ls_url = ls_url.rstrip("/")
    headers = {"Authorization": f"Token {api_token}"}

    # ── Verify connection ─────────────────────────────────────────────────────
    try:
        r = requests.get(f"{ls_url}/api/projects/{project_id}", headers=headers, timeout=10)
        r.raise_for_status()
        proj_name = r.json().get("title", "?")
        log.info("Connected to Label Studio — project %d: '%s'", project_id, proj_name)
    except Exception as exc:
        sys.exit(f"Cannot reach Label Studio at {ls_url}: {exc}")

    # ── Load results ──────────────────────────────────────────────────────────
    with open(results_json, encoding="utf-8") as f:
        data = json.load(f)

    results = data["results"]
    total = len(results)
    log.info("Loaded %d entries from %s", total, results_json)

    # ── Process each entry ────────────────────────────────────────────────────
    success = skipped = failed = 0

    for idx, entry in enumerate(results):
        if idx < start_from:
            continue

        # Everything that touches the entry lives inside the try so that one
        # malformed entry is counted as failed instead of aborting the batch.
        try:
            # Convert Windows backslash path → local absolute path
            rel_path   = PureWindowsPath(entry["image"])
            local_path = data_root / rel_path

            log.info(
                "[%d/%d] %s  (%s)",
                idx + 1, total, rel_path.name, entry["doc_class"]
            )

            if not local_path.exists():
                log.warning("  File not found: %s — skipping", local_path)
                skipped += 1
                continue

            # Build OCR text from extracted fields
            fields_text = "\n".join(
                f"{name}: {info['value']} (conf={info['confidence']})"
                for name, info in entry.get("fields", {}).items()
            )

            meta = {
                "doc_class":      entry["doc_class"],
                "doc_confidence": entry["doc_confidence"],
                "ocr_source":     entry["ocr_source"],
                "source_file":    rel_path.as_posix(),
            }

            ext = local_path.suffix.lower()

            # ── PDF: rasterise each page and upload separately ────────────────
            if ext == ".pdf":
                pages = pdf_to_pil_pages(local_path, dpi=dpi)
                if not pages:
                    log.warning("  No pages extracted — skipping")
                    skipped += 1
                    continue

                pages = pages[:max_pages]   # limit pages per document
                log.info("  %d page(s) to upload", len(pages))

                for p_idx, page_img in enumerate(pages):
                    task_id = _upload_and_create_task(
                        ls_url, headers, project_id,
                        img_bytes=pil_to_png_bytes(page_img),
                        fname=f"{local_path.stem}_p{p_idx:03d}.png",
                        ocr_text=fields_text,
                        meta={**meta, "page": p_idx},
                    )
                    log.info("    Page %d → task %d", p_idx, task_id)
                    time.sleep(0.1)   # be gentle with the local server

            # ── Image: upload directly ────────────────────────────────────────
            elif ext in _IMAGE_SUFFIXES:
                with open(local_path, "rb") as f:
                    img_bytes = f.read()

                task_id = _upload_and_create_task(
                    ls_url, headers, project_id,
                    img_bytes=img_bytes,
                    fname=local_path.name,
                    ocr_text=fields_text,
                    meta=meta,
                )
                log.info("  Uploaded → task %d", task_id)

            # ── Anything else: nothing was uploaded, so do not count it ───────
            else:
                log.warning("  Unsupported file type '%s' — skipping", ext)
                skipped += 1
                continue

            success += 1

        except Exception as exc:
            log.error("  FAILED: %s", exc)
            failed += 1

    # ── Summary ───────────────────────────────────────────────────────────────
    print("\n" + "═" * 48)
    print(f"  Total entries : {total}")
    print(f"  Uploaded      : {success}")
    print(f"  Skipped       : {skipped}  (missing, empty, or unsupported)")
    print(f"  Failed        : {failed}")
    print("═" * 48)
    print(f"\nOpen your project: {ls_url}/projects/{project_id}/")


# ─────────────────────────────────────────────────────────────────────────────
# CLI
# ─────────────────────────────────────────────────────────────────────────────

def _parse_args(argv=None) -> argparse.Namespace:
    """
    Parse the command-line options of the uploader.

    Args:
        argv: Optional list of argument strings to parse instead of
              ``sys.argv[1:]`` (the default when ``None``).

    Returns:
        A namespace with the attributes ``results_json``, ``data_root``,
        ``ls_url``, ``api_token``, ``project_id``, ``dpi``, ``max_pages``
        and ``start_from``. ``--api_token`` and ``--project_id`` are
        required; argparse exits with a usage error when either is missing.
    """
    p = argparse.ArgumentParser(
        description="Upload DataRef files directly into Label Studio via API"
    )
    p.add_argument(
        "--results_json",
        type=Path,
        default=Path("batch_dataref_results.json"),
        help="Path to batch_dataref_results.json (default: ./batch_dataref_results.json)",
    )
    p.add_argument(
        "--data_root",
        type=Path,
        default=Path("C:/Users/azizmohamed.miladi_a/Desktop/GuichetOI_ML\\processed_dataref"),
        help="Root folder that contains the DataRef\\ sub-folders",
    )
    p.add_argument(
        "--ls_url",
        type=str,
        default="http://localhost:8081",
        help="Label Studio base URL (default: http://localhost:8081)",
    )
    # The token is a secret: it is supplied on the command line and must
    # never be written into this file as an option name or default value.
    p.add_argument(
        "--api_token",
        type=str,
        required=True,
        help=(
            "Label Studio API token. "
            "Find it at: LS → avatar (top right) → Account & Settings → Access Token"
        ),
    )
    # Only the numeric ID goes here (e.g. 9), not the project's full URL.
    p.add_argument(
        "--project_id",
        type=int,
        required=True,
        help="Label Studio project ID (visible in the URL when you open the project)",
    )
    p.add_argument(
        "--dpi",
        type=int,
        default=150,
        help="DPI for PDF rasterisation (default: 150 — lower = faster upload)",
    )
    p.add_argument(
        "--max_pages",
        type=int,
        default=3,
        help="Max pages to upload per PDF (default: 3 — avoids uploading 26-page docs)",
    )
    p.add_argument(
        "--start_from",
        type=int,
        default=0,
        help="Resume from this entry index if a previous run was interrupted",
    )
    return p.parse_args(argv)


if __name__ == "__main__":
    # Script entry point: parse the CLI once, then forward each option to
    # run() by keyword.
    cli = _parse_args()
    run(
        results_json=cli.results_json,
        data_root=cli.data_root,
        ls_url=cli.ls_url,
        api_token=cli.api_token,
        project_id=cli.project_id,
        dpi=cli.dpi,
        max_pages=cli.max_pages,
        start_from=cli.start_from,
    )