"""Baseline NSFW classification of Label Studio-annotated images.

Reads data/annotations.json, locates each annotated image under data/imgs/
(by its Label Studio ``file_upload`` name), classifies it with the
``Falconsai/nsfw_image_detection`` model via the Hugging Face Inference API,
and writes predictions to data/results/baseline.csv. Per-image failures are
collected and written to data/falcons_errors.csv.
"""
import json
import os
import sys
from pathlib import Path

import pandas as pd
from dotenv import load_dotenv
from huggingface_hub import InferenceClient
from rich import print
from rich.progress import BarColumn, Progress, SpinnerColumn, TextColumn

load_dotenv()

DATA_PATH = Path(__file__).parent.parent / "data"
RESULTS_PATH = DATA_PATH / "results"
IMAGES_PATH = DATA_PATH / "imgs"

with open(DATA_PATH / "annotations.json", "r") as f:
    annotations = json.load(f)

print(f"[bold]Loaded {len(annotations)} annotations from Label Studio[/bold]")

# Extract annotated images with their labels.
# Files should now be in data/imgs/ with their file_upload names.
annotated_images = []
for ann in annotations:
    file_upload = ann.get("file_upload")
    if not file_upload:
        continue

    # Label Studio nests the label three levels deep:
    # annotations[0] -> result[0] -> value.choices[0]. Take the first of each
    # when present; otherwise leave choice as None.
    choice = None
    if ann.get("annotations"):
        result = ann["annotations"][0].get("result", [])
        if result:
            choices = result[0].get("value", {}).get("choices", [])
            if choices:
                choice = choices[0]

    annotated_images.append({
        "file_upload": file_upload,
        "file_path": IMAGES_PATH / file_upload,  # expected on-disk location
        "choice": choice,
        "annotation_id": ann.get("id"),
    })

# Filter once to the files that actually exist on disk.
# (Originally this identical comprehension was evaluated twice.)
images_to_process = [
    img for img in annotated_images if img["file_path"].exists()]
print(
    f"[bold]Found {len(images_to_process)}/{len(annotated_images)} annotated image files[/bold]")

# Fail fast on a missing token BEFORE building the client.
# (Originally the client was constructed first, so a missing token produced a
# client with api_key=None and only raised afterwards.)
if not os.environ.get("HF_TOKEN"):
    raise ValueError(
        "HF_TOKEN environment variable not set. Please set it in .env file.")

client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ.get("HF_TOKEN"),
)

if not images_to_process:
    print("[red]✗ No annotated image files found![/red]")
    print("[yellow]Please copy files from Label Studio media directory first.[/yellow]")
    print(f"[dim]Expected location: {IMAGES_PATH}[/dim]")
    # Nothing to classify — exit instead of falling through to an empty run.
    sys.exit(1)

predictions = []
errors = []

print(f"[bold]Processing {len(images_to_process)} annotated images...[/bold]")

with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    console=None,
) as progress:
    task = progress.add_task(
        "Classifying images", total=len(images_to_process))

    for ann_img in images_to_process:
        file_path = ann_img["file_path"]
        file_upload = ann_img["file_upload"]
        choice = ann_img.get("choice")
        annotation_id = ann_img.get("annotation_id")

        try:
            # Re-check existence in case the file vanished since filtering.
            # NOTE: the bar is advanced only in `finally` — the original also
            # advanced it here before `continue`, double-counting missing files.
            if not file_path or not file_path.exists():
                errors.append({
                    "annotation_id": annotation_id,
                    "file_upload": file_upload,
                    "error": "File not found"
                })
                continue

            # Classify image via the hosted inference endpoint.
            output = client.image_classification(
                str(file_path),
                model="Falconsai/nsfw_image_detection"
            )

            # Flatten the output (list of {label, score} dicts) and add metadata.
            result = {
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "actual_filename": file_path.name,
                "label_studio_choice": choice,
                **{f"label_{i}": pred["label"] for i, pred in enumerate(output)},
                **{f"score_{i}": pred["score"] for i, pred in enumerate(output)}
            }
            predictions.append(result)

        except Exception as e:
            # Best-effort batch run: record the failure and keep going.
            errors.append({
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "error": str(e)
            })
            print(f"[red]Error processing {file_upload}: {e}[/red]")
        finally:
            progress.update(task, advance=1)

# Save predictions (create the results directory if it doesn't exist yet).
if predictions:
    RESULTS_PATH.mkdir(parents=True, exist_ok=True)
    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(RESULTS_PATH / "baseline.csv", index=False)
    print(
        f"[green]✓ Saved {len(predictions)} predictions to baseline.csv[/green]")
else:
    print("[red]✗ No predictions generated[/red]")

# Save errors if any
if errors:
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(DATA_PATH / "falcons_errors.csv", index=False)
    print(
        f"[yellow]⚠ Saved {len(errors)} errors to falcons_errors.csv[/yellow]")