Spaces:
Running
Running
| import json | |
| from pathlib import Path | |
| from PIL import Image | |
| import torch | |
| from torch.utils.data import Dataset, DataLoader | |
| from torchvision import transforms | |
class AuctionDatasetFromJSON(Dataset):
    """Image + text dataset backed by a JSON file of auction records.

    The JSON document is expected to be an array of objects (it is sliced
    and indexed by position). ``__getitem__`` requires the keys
    ``folder_path``, ``platform`` and ``id`` on every record and tolerates
    missing ``images``, ``title``, ``description`` and ``label``.
    """

    # Size of the black placeholder returned when no image can be loaded.
    _FALLBACK_SIZE = (224, 224)

    def __init__(self, json_path: str, root_dir: str, transform=None, max_samples=None):
        """Load the auction records from *json_path*.

        Args:
            json_path: Path to the JSON file, e.g. ``dataset/dataset.json``.
            root_dir: Directory the records' ``folder_path`` values are
                relative to, e.g. ``dataset/raw_data``.
            transform: Optional callable applied to the loaded PIL image.
            max_samples: When truthy, keep only the first ``max_samples``
                records. ``None`` (and ``0``) keep every record.
        """
        with open(json_path, 'r', encoding='utf-8') as f:
            self.data = json.load(f)
        if max_samples:
            self.data = self.data[:max_samples]
        self.root_dir = Path(root_dir)
        self.transform = transform

    def __len__(self):
        """Return the number of loaded records."""
        return len(self.data)

    def _load_image(self, auction):
        """Return the record's first image as RGB, or a black placeholder.

        Any failure (missing/empty ``images``, missing file, corrupt data)
        is reported on stdout and replaced by the placeholder so a single
        bad record does not abort a whole epoch.
        """
        # ``folder_path`` is a required key: let a KeyError surface here.
        folder = self.root_dir / auction['folder_path']
        img_path = folder
        try:
            # Indexing happens inside the try so that a record without
            # images degrades to the placeholder like an unreadable file.
            img_path = folder / auction['images'][0]
            # The context manager closes the file handle as soon as the
            # pixels are decoded; convert() returns an independent copy.
            with Image.open(img_path) as src:
                return src.convert('RGB')
        except Exception as e:
            print(f"Błąd wczytywania {img_path}: {e}")
            # Fallback: black image
            return Image.new('RGB', self._FALLBACK_SIZE, color='black')

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Keys: ``image`` (transformed if a transform was given), ``text``
        (title and description joined by a space), ``platform``, ``title``,
        ``id``, ``label`` (``torch.long`` tensor, 0 when absent) and
        ``folder_path``.

        Raises:
            KeyError: If the record lacks ``folder_path``, ``platform`` or
                ``id``.
        """
        auction = self.data[idx]

        img = self._load_image(auction)
        if self.transform:
            img = self.transform(img)

        # Missing or null text fields become empty strings, so the literal
        # "None" never leaks into ``text`` and ``title`` stays a string.
        title = auction.get('title') or ''
        description = auction.get('description') or ''

        # Text: title + description
        text = f"{title} {description}"

        return {
            'image': img,
            'text': text,
            'platform': auction['platform'],
            'title': title,
            'id': auction['id'],
            'label': torch.tensor(auction.get('label', 0), dtype=torch.long),
            'folder_path': auction['folder_path'],
        }
# Image preprocessing
def get_transforms(image_size=224):
    """Build the image preprocessing pipeline.

    The pipeline resizes to a fixed size, converts to a tensor and
    normalizes each channel with the mean/std values below (these match the
    ImageNet statistics commonly used with torchvision models).

    Args:
        image_size: Edge length for a square output, or a
            ``(height, width)`` pair. Defaults to 224.

    Returns:
        A ``transforms.Compose`` instance.
    """
    # Resize() given a bare int only rescales the shorter edge; always pass
    # an explicit pair so the output has a fixed shape.
    if isinstance(image_size, int):
        size = (image_size, image_size)
    else:
        size = tuple(image_size)

    return transforms.Compose([
        transforms.Resize(size),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ])
if __name__ == '__main__':
    # Manual smoke test: load a few records and inspect the first batch.
    print("Testowanie DataLoadera...")

    sample_set = AuctionDatasetFromJSON(
        json_path='../dataset/dataset.json',
        root_dir='../dataset/raw_data',
        transform=get_transforms(),
        max_samples=5,
    )
    print(f"✓ Dataset załadowany: {len(sample_set)} próbek")

    batches = DataLoader(sample_set, batch_size=2, shuffle=True, num_workers=0)

    # Only the first batch is inspected; an empty loader prints nothing.
    first = next(iter(batches), None)
    if first is not None:
        print("\nBatch:")
        print(f" - Image shape: {first['image'].shape}")
        print(f" - Texts: {len(first['text'])}")
        print(f" - Platforms: {first['platform']}")
        print(f" - Labels: {first['label']}")
        print(f" - Example text: {first['text'][0][:100]}...")