File size: 4,760 Bytes
9894d76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
from dotenv import load_dotenv
from pathlib import Path
import os
import json
import pandas as pd
from huggingface_hub import InferenceClient
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn

# Pull environment variables (notably HF_TOKEN) from the local .env file.
load_dotenv()

# Project layout: <repo>/data holds annotations, images, and results.
DATA_PATH = Path(__file__).parent.parent / "data"
RESULTS_PATH = DATA_PATH / "results"
IMAGES_PATH = DATA_PATH / "imgs"

# Load the Label Studio annotation export (a JSON list of tasks).
annotations = json.loads((DATA_PATH / "annotations.json").read_text())

print(f"[bold]Loaded {len(annotations)} annotations from Label Studio[/bold]")

# Map each annotation onto the image file it references, keeping the first
# recorded label choice (if any) so model predictions can be compared to it.
# Files are expected in data/imgs/ under their Label Studio file_upload names.
annotated_images = []
for ann in annotations:
    file_upload = ann.get("file_upload")
    if not file_upload:
        continue

    # First choice of the first result of the first annotation, or None
    # when the task was never labelled.
    choice = None
    ann_list = ann.get("annotations")
    if ann_list:
        results = ann_list[0].get("result", [])
        if results:
            choice_values = results[0].get("value", {}).get("choices", [])
            if choice_values:
                choice = choice_values[0]

    annotated_images.append({
        "file_upload": file_upload,
        "file_path": IMAGES_PATH / file_upload,
        "choice": choice,
        "annotation_id": ann.get("id")
    })

# Report how many of the referenced image files are actually present on disk.
existing_files = [entry for entry in annotated_images
                  if entry["file_path"].exists()]
print(
    f"[bold]Found {len(existing_files)}/{len(annotated_images)} annotated image files[/bold]")

# Fail fast if the Hugging Face API token is missing — the original code
# constructed the client with a possibly-None key before validating.
if not os.environ.get("HF_TOKEN"):
    raise ValueError(
        "HF_TOKEN environment variable not set. Please set it in .env file.")

# Initialize the hosted-inference client used for image classification.
client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ.get("HF_TOKEN"),
)

# Restrict processing to annotations whose image file is on disk. Reuse the
# filter computed above (``existing_files``) instead of stat-ing every file
# a second time, as the original did.
images_to_process = existing_files

if not images_to_process:
    print("[red]✗ No annotated image files found![/red]")
    print("[yellow]Please copy files from Label Studio media directory first.[/yellow]")
    print(f"[dim]Expected location: {IMAGES_PATH}[/dim]")

# Accumulators for per-image model outputs and per-image failures.
predictions = []
errors = []

print(f"[bold]Processing {len(images_to_process)} annotated images...[/bold]")

# Classify each annotated image with the hosted NSFW detector, tracking
# progress, and recording successes in ``predictions`` and failures in
# ``errors``. The job is best-effort: one bad image never aborts the run.
with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    console=None,
) as progress:
    task = progress.add_task("Classifying images",
                             total=len(images_to_process))

    for ann_img in images_to_process:
        file_path = ann_img["file_path"]
        file_upload = ann_img["file_upload"]
        choice = ann_img.get("choice")
        annotation_id = ann_img.get("annotation_id")

        try:
            if not file_path or not file_path.exists():
                errors.append({
                    "annotation_id": annotation_id,
                    "file_upload": file_upload,
                    "error": "File not found"
                })
                # BUGFIX: the original also called progress.update() here,
                # which double-advanced the bar on this path because the
                # ``finally`` clause below runs on ``continue`` too.
                continue

            # Call the hosted classifier; returns a ranked list of
            # {label, score} predictions.
            output = client.image_classification(
                str(file_path),
                model="Falconsai/nsfw_image_detection"
            )

            # Flatten the ranked predictions into wide columns
            # (label_0/score_0, label_1/score_1, ...) plus metadata so the
            # result maps cleanly onto a DataFrame row.
            result = {
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "actual_filename": file_path.name,
                "label_studio_choice": choice,
                **{f"label_{i}": pred["label"] for i, pred in enumerate(output)},
                **{f"score_{i}": pred["score"] for i, pred in enumerate(output)}
            }
            predictions.append(result)

        except Exception as e:
            # Record the failure and keep going; the error CSV is written
            # at the end of the run.
            errors.append({
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "error": str(e)
            })
            print(f"[red]Error processing {file_upload}: {e}[/red]")

        finally:
            # Advance exactly once per image, whatever the outcome.
            progress.update(task, advance=1)

# Persist predictions (if any) for downstream evaluation.
if predictions:
    # Ensure the results directory exists — DataFrame.to_csv raises
    # OSError if the parent directory is missing.
    RESULTS_PATH.mkdir(parents=True, exist_ok=True)
    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(RESULTS_PATH / "baseline.csv", index=False)
    print(
        f"[green]✓ Saved {len(predictions)} predictions to baseline.csv[/green]")
else:
    print("[red]✗ No predictions generated[/red]")

# Save errors if any
if errors:
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(DATA_PATH / "falcons_errors.csv", index=False)
    print(
        f"[yellow]⚠ Saved {len(errors)} errors to falcons_errors.csv[/yellow]")