# DISCO-v0.1 / src/baseline.py
# (Uploaded via huggingface_hub by younissk, commit 9894d76)
from dotenv import load_dotenv
from pathlib import Path
import os
import json
import pandas as pd
from huggingface_hub import InferenceClient
from rich import print
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
load_dotenv()

# Project data layout: Label Studio annotation export plus the image files
# it refers to, relative to the repository root.
DATA_PATH = Path(__file__).parent.parent / "data"
RESULTS_PATH = DATA_PATH / "results"
IMAGES_PATH = DATA_PATH / "imgs"

# Load the Label Studio export (a JSON list of task dicts).
# Explicit UTF-8: JSON exports are UTF-8 by spec, and relying on the
# platform-default encoding breaks on non-UTF-8 locales (e.g. Windows cp1252).
with open(DATA_PATH / "annotations.json", "r", encoding="utf-8") as f:
    annotations = json.load(f)

print(f"[bold]Loaded {len(annotations)} annotations from Label Studio[/bold]")
# Extract annotated images with their labels.
# Files should now be in data/imgs/ with their file_upload names.


def _first_choice(task):
    """Return the choice label from the first result of the task's first
    annotation, or None when no annotation/result/choice is present."""
    task_annotations = task.get("annotations")
    if not task_annotations:
        return None
    results = task_annotations[0].get("result", [])
    if not results:
        return None
    # An empty "choices" list also yields None.
    return (results[0].get("value", {}).get("choices") or [None])[0]


annotated_images = []
for task in annotations:
    file_upload = task.get("file_upload")
    if not file_upload:
        continue
    annotated_images.append({
        "file_upload": file_upload,
        # Expected on-disk location: data/imgs/<file_upload>.
        "file_path": IMAGES_PATH / file_upload,
        "choice": _first_choice(task),
        "annotation_id": task.get("id"),
    })
# Filter once to the annotated images whose files actually exist on disk.
# (Previously this .exists() scan was performed twice — once for the count
# and once for the processing list.)
images_to_process = [
    img for img in annotated_images if img["file_path"].exists()]
existing_files = images_to_process  # kept as an alias for backward compatibility
print(
    f"[bold]Found {len(images_to_process)}/{len(annotated_images)} annotated image files[/bold]")

# Fail fast on a missing token BEFORE constructing the client, so the error
# surfaces immediately instead of on the first inference request.
if not os.environ.get("HF_TOKEN"):
    raise ValueError(
        "HF_TOKEN environment variable not set. Please set it in .env file.")

# Initialize the hosted-inference client.
client = InferenceClient(
    provider="hf-inference",
    api_key=os.environ.get("HF_TOKEN"),
)

if not images_to_process:
    print("[red]✗ No annotated image files found![/red]")
    print("[yellow]Please copy files from Label Studio media directory first.[/yellow]")
    print(f"[dim]Expected location: {IMAGES_PATH}[/dim]")

predictions = []
errors = []

print(f"[bold]Processing {len(images_to_process)} annotated images...[/bold]")
with Progress(
    SpinnerColumn(),
    TextColumn("[progress.description]{task.description}"),
    BarColumn(),
    TextColumn("[progress.percentage]{task.percentage:>3.0f}%"),
    console=None,
) as progress:
    task = progress.add_task("Classifying images",
                             total=len(images_to_process))

    for ann_img in images_to_process:
        file_path = ann_img["file_path"]
        file_upload = ann_img["file_upload"]
        choice = ann_img.get("choice")
        annotation_id = ann_img.get("annotation_id")
        try:
            if not file_path or not file_path.exists():
                errors.append({
                    "annotation_id": annotation_id,
                    "file_upload": file_upload,
                    "error": "File not found"
                })
                # BUG FIX: do not call progress.update() here. The finally
                # clause below already advances once per item; advancing in
                # both places double-counted every missing file.
                continue

            # Classify the image with the hosted NSFW detector.
            output = client.image_classification(
                str(file_path),
                model="Falconsai/nsfw_image_detection"
            )

            # Flatten the output (a list of {label, score} dicts) into a
            # single row alongside the annotation metadata.
            result = {
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "actual_filename": file_path.name,
                "label_studio_choice": choice,
                **{f"label_{i}": pred["label"] for i, pred in enumerate(output)},
                **{f"score_{i}": pred["score"] for i, pred in enumerate(output)}
            }
            predictions.append(result)
        except Exception as e:
            # Record the failure and keep going; one bad image should not
            # abort the whole baseline run.
            errors.append({
                "annotation_id": annotation_id,
                "file_upload": file_upload,
                "error": str(e)
            })
            print(f"[red]Error processing {file_upload}: {e}[/red]")
        finally:
            # Exactly one advance per item, on every path (success, error,
            # or missing file).
            progress.update(task, advance=1)
# Save predictions
if predictions:
    # Ensure the results directory exists: DataFrame.to_csv raises
    # OSError if the parent directory is missing.
    RESULTS_PATH.mkdir(parents=True, exist_ok=True)
    predictions_df = pd.DataFrame(predictions)
    predictions_df.to_csv(RESULTS_PATH / "baseline.csv", index=False)
    print(
        f"[green]✓ Saved {len(predictions)} predictions to baseline.csv[/green]")
else:
    print("[red]✗ No predictions generated[/red]")

# Save errors if any
if errors:
    errors_df = pd.DataFrame(errors)
    errors_df.to_csv(DATA_PATH / "falcons_errors.csv", index=False)
    print(
        f"[yellow]⚠ Saved {len(errors)} errors to falcons_errors.csv[/yellow]")