Spaces:
Runtime error
Runtime error
| import os | |
| import glob | |
| import json | |
| import pandas as pd | |
class DataExtractor:
    """Collect image metadata from JSON files and match it to images on disk.

    Metadata is read from every ``*.json`` file directly inside
    ``json_folder_path``; image files are discovered recursively as
    ``*.jpg`` under ``image_root_directory``. The two sources are joined on
    the ``id`` column.
    """

    # Explicit schemas so that empty results still carry their columns;
    # without them an empty frame has no 'id' column and the merge in
    # ``concat_data`` raises KeyError.
    _JSON_COLUMNS = ['Class', 'id', 'Image_URL', 'Title', 'Page_URL']
    _IMAGE_COLUMNS = ['Class', 'id', 'Image_Path']

    def __init__(self, json_folder_path, image_root_directory):
        """Store the locations to scan.

        Args:
            json_folder_path: Directory containing the ``*.json`` metadata
                files (only its top level is scanned).
            image_root_directory: Root directory searched recursively for
                ``*.jpg`` files.
        """
        self.json_folder_path = json_folder_path
        self.image_root_directory = image_root_directory

    def extract_json_data(self):
        """Return one row per image entry found in the JSON metadata files.

        A file contributes rows only when its top-level object contains both
        a ``'query'`` and an ``'images'`` key; other files are skipped. Each
        element of ``'images'`` must provide the keys ``'Id'``, ``'url'``,
        ``'title'`` and ``'page_url'``.

        Returns:
            pandas.DataFrame with columns ``Class`` (the file's query),
            ``id``, ``Image_URL``, ``Title`` and ``Page_URL``. The frame is
            empty, but keeps these columns, when nothing was found.

        Raises:
            KeyError: If an image entry lacks one of the required keys.
            json.JSONDecodeError: If a ``.json`` file is not valid JSON.
        """
        rows = []
        # Sorted so that row order does not depend on directory listing order.
        for filename in sorted(os.listdir(self.json_folder_path)):
            if not filename.endswith(".json"):
                continue
            json_path = os.path.join(self.json_folder_path, filename)
            # JSON text is UTF-8 by specification; do not rely on the locale.
            with open(json_path, 'r', encoding='utf-8') as json_file:
                data = json.load(json_file)
            if 'query' not in data or 'images' not in data:
                continue
            query = data['query']
            for image_data in data['images']:
                rows.append({
                    'Class': query,
                    'id': image_data['Id'],
                    'Image_URL': image_data['url'],
                    'Title': image_data['title'],
                    'Page_URL': image_data['page_url'],
                })
        return pd.DataFrame(rows, columns=self._JSON_COLUMNS)

    def extract_image_paths(self):
        """Return one row per ``*.jpg`` file under the image root directory.

        Returns:
            pandas.DataFrame with columns ``Class`` (name of the directory
            that directly contains the file), ``id`` (file name without its
            extension) and ``Image_Path`` (the path as returned by ``glob``).
            The frame is empty, but keeps these columns, when no image
            exists.
        """
        pattern = os.path.join(self.image_root_directory, '**', '*.jpg')
        rows = []
        # Sorted so that row order does not depend on filesystem order.
        for image_file in sorted(glob.glob(pattern, recursive=True)):
            rows.append({
                'Class': os.path.basename(os.path.dirname(image_file)),
                'id': os.path.splitext(os.path.basename(image_file))[0],
                'Image_Path': image_file,
            })
        return pd.DataFrame(rows, columns=self._IMAGE_COLUMNS)

    def concat_data(self):
        """Join JSON metadata with the image files that share the same ``id``.

        The join is an inner merge on ``id`` only. Both inputs have a
        ``Class`` column, so pandas' default suffixes apply and the result
        holds ``Class_x`` (from the JSON query) and ``Class_y`` (from the
        image's parent directory).

        Returns:
            A ``(combined_data, paths)`` tuple: the merged DataFrame and its
            ``Image_Path`` column as a Series.
        """
        json_data = self.extract_json_data()
        image_data = self.extract_image_paths()
        # NOTE(review): image ids are file-name stems (strings); this assumes
        # the JSON 'Id' values are strings too - confirm against the data.
        combined_data = pd.merge(json_data, image_data, on=['id'], how='inner')
        paths = combined_data['Image_Path']
        print(paths)
        return combined_data, paths