# Page header captured together with the file (not part of the program):
# Spaces: Running · File size: 4,269 Bytes · 718c4ae
import os
import json
from pathlib import Path
from typing import Dict, List
def _first_line_after(content: str, marker: str) -> str:
    """Return the stripped rest of the line following the first *marker*."""
    start = content.find(marker) + len(marker)
    end = content.find('\n', start)
    if end == -1:
        # Marker is on the last line and the file has no trailing newline;
        # slicing with -1 would silently drop the final character.
        end = len(content)
    return content[start:end].strip()


def parse_info_txt(info_path: str) -> Dict:
    """Parse an auction ``info.txt`` file into a metadata dictionary.

    The file is read as UTF-8 and searched for the markers ``TITLE:``,
    ``LINK:``, ``PARAMETERS:`` and ``DESCRIPTION:`` (first occurrence of
    each).

    Args:
        info_path: Path to the ``info.txt`` file.

    Returns:
        A dict with the keys, in this order:

        * ``title`` - text after ``TITLE:`` up to the end of that line,
          or ``'Unknown'`` when the marker is absent.
        * ``link`` - text after ``LINK:`` up to the end of that line,
          or ``''`` when the marker is absent.
        * ``parameters`` - dict built from the ``* key: value`` lines that
          follow ``PARAMETERS:``; empty when the marker is absent.
        * ``description`` - everything after ``DESCRIPTION:``, stripped,
          or ``''`` when the marker is absent.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(info_path, 'r', encoding='utf-8') as f:
        content = f.read()

    metadata = {}

    # TITLE
    if 'TITLE:' in content:
        metadata['title'] = _first_line_after(content, 'TITLE:')
    else:
        metadata['title'] = 'Unknown'

    # LINK
    if 'LINK:' in content:
        metadata['link'] = _first_line_after(content, 'LINK:')
    else:
        metadata['link'] = ''

    # PARAMETERS
    parameters = {}
    metadata['parameters'] = parameters
    if 'PARAMETERS:' in content:
        params_start = content.find('PARAMETERS:') + len('PARAMETERS:')
        # The section ends at the '----' separator, else at the description
        # marker, else at the end of the file.
        params_end = content.find('----', params_start)
        if params_end == -1:
            params_end = content.find('DESCRIPTION:', params_start)
        if params_end == -1:
            params_end = len(content)

        for raw_line in content[params_start:params_end].split('\n'):
            line = raw_line.strip()
            if not line.startswith('*'):
                continue
            # Drop the bullet whether or not a space follows it, so that
            # '*Key: v' keeps its first letter.
            key, separator, value = line.lstrip('*').partition(':')
            if separator:
                parameters[key.strip()] = value.strip()

    # DESCRIPTION
    if 'DESCRIPTION:' in content:
        desc_start = content.find('DESCRIPTION:') + len('DESCRIPTION:')
        metadata['description'] = content[desc_start:].strip()
    else:
        metadata['description'] = ''

    return metadata
def organize_dataset(root_dir: str, output_json: str = 'dataset/dataset.json'):
    """Scan ``root_dir`` and write a JSON index of the auctions found in it.

    The expected layout is ``root_dir/<platform>/<auction>/``. Each auction
    directory must contain an ``info.txt`` file and at least one ``*.jpg`` or
    ``*.png`` image; directories missing either are reported and skipped, as
    are auctions whose ``info.txt`` fails to parse.

    Args:
        root_dir: Directory holding one sub-directory per platform.
        output_json: Destination of the JSON file. Its parent directory is
            created when missing; a bare filename writes to the current
            directory.

    Returns:
        The list of entry dicts that was written to ``output_json``. Each
        entry holds ``id``, ``platform``, ``folder_path`` (relative to
        ``root_dir``), ``image_count``, ``images``, ``label``,
        ``label_confidence`` and the keys returned by ``parse_info_txt``.
    """
    root = Path(root_dir)
    dataset = []

    for platform_dir in sorted(root.iterdir()):
        if not platform_dir.is_dir():
            continue
        platform_name = platform_dir.name
        print(f"\n📁 Platform: {platform_name}")

        for auction_dir in sorted(platform_dir.iterdir()):
            if not auction_dir.is_dir():
                continue
            auction_id = auction_dir.name

            info_txt = auction_dir / 'info.txt'
            if not info_txt.exists():
                print(f" ⚠️ {auction_id} - brak info.txt")
                continue

            # Best effort: one unreadable info.txt must not abort the scan.
            try:
                metadata = parse_info_txt(str(info_txt))
            except Exception as e:
                print(f" ❌ {auction_id} - błąd: {e}")
                continue

            # Collect images: all JPGs (sorted) followed by all PNGs (sorted).
            images = sorted(img.name for img in auction_dir.glob('*.jpg'))
            images += sorted(img.name for img in auction_dir.glob('*.png'))
            if not images:
                print(f" ⚠️ {auction_id} - brak zdjęć")
                continue

            entry = {
                'id': f"{platform_name}_{auction_id}",
                'platform': platform_name,
                'folder_path': str(auction_dir.relative_to(root)),
                'image_count': len(images),
                'images': images,
                'label': 0,  # Default: authentic
                'label_confidence': 0.0,  # To be filled in manually
                **metadata,
            }
            dataset.append(entry)
            print(f" ✓ {auction_id} ({len(images)} zdjęć)")

    # Save. Path.parent of a bare filename is '.', which mkdir accepts, whereas
    # os.makedirs(os.path.dirname('file.json')) would be called with ''.
    Path(output_json).parent.mkdir(parents=True, exist_ok=True)
    with open(output_json, 'w', encoding='utf-8') as f:
        json.dump(dataset, f, ensure_ascii=False, indent=2)

    print(f"\n✅ Dataset wczytany: {len(dataset)} aukcji")
    print(f"💾 Zapisano: {output_json}")
    return dataset
if __name__ == '__main__':
    # Script entry point: index the raw data and preview the first entry.
    dataset = organize_dataset('dataset/raw_data')
    if dataset:
        separator = "=" * 60
        print("\n" + separator)
        print("PRZYKŁAD PIERWSZEJ AUKCJI:")
        print(separator)
        # Cap the preview at 800 characters so a long description does not
        # flood the terminal.
        print(json.dumps(dataset[0], indent=2, ensure_ascii=False)[:800])