Spaces:
Build error
Build error
| from datasets import load_dataset | |
| import os | |
| import subprocess | |
| from PIL import Image | |
| import json | |
def generate_meta_json(base_dir='Paper2Poster-data'):
    """Record each poster's pixel dimensions in a ``meta.json`` next to it.

    Every entry of ``base_dir`` that is a directory is inspected. If it
    contains ``poster.png``, the image is opened and its size is written as
    ``{"width": ..., "height": ...}`` to ``meta.json`` in that same directory.
    A directory without ``poster.png`` is reported on stdout. Any exception
    raised while reading the image or writing the file is printed and the scan
    continues with the next entry. Nothing is returned.
    """
    for folder_name in os.listdir(base_dir):
        paper_dir = os.path.join(base_dir, folder_name)
        if not os.path.isdir(paper_dir):
            # Plain files sitting in the base directory are skipped silently.
            continue

        image_file = os.path.join(paper_dir, 'poster.png')
        if not os.path.exists(image_file):
            print(f"No poster.png found in folder '{folder_name}'.")
            continue

        try:
            # Only the size is needed, so the image is closed again right away.
            with Image.open(image_file) as poster:
                poster_width, poster_height = poster.size

            target = os.path.join(paper_dir, 'meta.json')
            with open(target, 'w') as out:
                json.dump({'width': poster_width, 'height': poster_height}, out)

            print(f"Metadata for '{folder_name}' saved successfully.")
        except Exception as e:
            print(f"Error processing image in folder '{folder_name}': {e}")
def _safe_dir_name(title):
    """Return ``title`` with path separators replaced so it names one folder.

    A title such as ``"A/B Testing"`` would otherwise create a nested
    ``A/B Testing`` tree; ``generate_meta_json`` only looks at immediate
    subfolders of the dataset directory and would never find that poster.
    """
    separators = {'/', os.sep}
    if os.altsep:
        separators.add(os.altsep)
    for sep in separators:
        title = title.replace(sep, '_')
    return title


def _download(url, output_path):
    """Fetch ``url`` into ``output_path`` with ``wget``; return True on success.

    Failures are printed rather than raised so one bad URL does not stop the
    whole run. ``OSError`` is caught as well as ``CalledProcessError`` because
    ``subprocess.run`` raises ``FileNotFoundError`` when ``wget`` itself is not
    installed.
    """
    try:
        subprocess.run(['wget', url, '-O', output_path], check=True)
    except (subprocess.CalledProcessError, OSError) as e:
        print(f"Error downloading {url}: {e}")
        # ``wget -O`` creates/truncates the target before transferring, so a
        # failed fetch leaves an empty or partial file that would later be
        # mistaken for a real paper/poster. Drop it.
        if os.path.exists(output_path):
            os.remove(output_path)
        return False
    print(f"Downloaded {url} into {output_path}")
    return True


if __name__ == "__main__":
    # Materialise the Paper2Poster train split on disk: one folder per paper
    # holding o3_qa.json, paper.pdf and poster.png, then add meta.json files.
    data_dir = 'Paper2Poster-data'
    dataset = load_dataset("Paper2Poster/Paper2Poster", split="train")
    os.makedirs(data_dir, exist_ok=True)

    for data in dataset:
        paper_title = data['title']
        paper_dir = os.path.join(data_dir, _safe_dir_name(paper_title))
        os.makedirs(paper_dir, exist_ok=True)

        # The 'qa' field is parsed as a JSON string and re-written
        # pretty-printed next to the paper.
        qa_path = os.path.join(paper_dir, 'o3_qa.json')
        qa_dict = json.loads(data['qa'])
        with open(qa_path, 'w') as f:
            json.dump(qa_dict, f, indent=4)
        print(f"Saved QA for {paper_title} into {qa_path}")

        # Each file is attempted independently, so a broken paper link no
        # longer prevents the poster from being downloaded (and vice versa).
        _download(data['paper_url'], os.path.join(paper_dir, 'paper.pdf'))
        _download(data['image_url'], os.path.join(paper_dir, 'poster.png'))

    # Runs once after all downloads so every fetched poster gets a meta.json.
    generate_meta_json(data_dir)