| | import subprocess |
| | import json |
| | import os |
| | import requests |
| |
|
| | |
# datasets-server "rows" endpoint and paging configuration for the
# cat-state/mscoco-1st-caption dataset.
base_url = "https://datasets-server.huggingface.co/rows"
dataset_path = "cat-state/mscoco-1st-caption"
config = "default"
split = "train"
offset = 0          # starting row offset; advanced by the download loop
length = 100        # rows fetched per API request
total_data = 1000   # total number of rows to fetch
iterations = total_data // length

# Directory the downloaded images are written into.
image_dir = "../images_large"
# exist_ok=True makes this idempotent — replaces the exists()+makedirs pair,
# which is race-prone between the check and the create.
os.makedirs(image_dir, exist_ok=True)

# Caption text keyed by "<row_idx>_row_image"; dumped to JSON at the end.
text_data = {}
|
| | |
# Page through the datasets-server /rows API: download each row's image into
# image_dir and record its caption in text_data under "<row_idx>_row_image".
for _ in range(iterations):
    url = (
        f"{base_url}?dataset={dataset_path}&config={config}"
        f"&split={split}&offset={offset}&length={length}"
    )

    # Fetch the page with requests rather than shelling out to curl: no
    # subprocess per page, a real timeout, and transport errors surface
    # as exceptions instead of empty stdout.
    try:
        page_response = requests.get(url, timeout=30)
        output = page_response.text
    except requests.RequestException as exc:
        print(f"无法将输出转换为字典。输出内容: {exc}")
        offset += length  # still advance so the remaining pages are not lost
        continue

    try:
        data_dict = json.loads(output)
    except json.JSONDecodeError:
        print(f"无法将输出转换为字典。输出内容: {output}")
        # Bug fix: the original `continue` skipped the offset increment, so a
        # bad page was refetched next iteration and the data tail never reached.
        offset += length
        continue

    if 'rows' in data_dict:
        for item in data_dict['rows']:
            row_idx = item['row_idx']
            row = item['row']
            image_url = row.get('url')
            text = row.get('caption')

            if image_url:
                image_filename = f"{image_dir}/{row_idx}_row_image.jpg"
                try:
                    # The context manager closes the streamed connection (the
                    # original leaked it); timeout avoids hanging on one image.
                    with requests.get(image_url, stream=True, timeout=30) as image_response:
                        if image_response.status_code == 200:
                            with open(image_filename, 'wb') as f:
                                for chunk in image_response.iter_content(chunk_size=8192):
                                    f.write(chunk)
                except requests.RequestException:
                    pass  # best effort: one failed download should not abort the run

            # The caption is recorded even when the image is missing or its
            # download failed, matching the original behavior.
            text_data[f"{row_idx}_row_image"] = text

    offset += length
| |
|
| | |
# Persist the collected captions next to the downloaded images.
json_filename = "../data/row_image_texts_large.json"
# Ensure the output directory exists, otherwise open() raises FileNotFoundError.
os.makedirs(os.path.dirname(json_filename), exist_ok=True)
with open(json_filename, 'w', encoding='utf-8') as f:
    # ensure_ascii=False keeps any non-ASCII caption text readable in the file.
    json.dump(text_data, f, indent=4, ensure_ascii=False)

# Bug fix: the completion message named the wrong file ("row_image_texts.json");
# report the path actually written.
print(f"图像下载并保存完成,文本信息已保存到 {json_filename}")
| |
|
| |
|