Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| from PIL import Image, ImageFile | |
| import os | |
| from tqdm import tqdm | |
| ImageFile.LOAD_TRUNCATED_IMAGES = True | |
def _image_issue(img_path):
    """Return the issue category for ``img_path``, or ``None`` if it is usable.

    The category is one of ``'missing'``, ``'truncated'``, ``'corrupted'`` or
    ``'other'``.
    """
    # Empty CSV cells arrive as NaN (a float); passing a non-path value to
    # os.path.exists would raise TypeError, so treat those rows as missing.
    if not isinstance(img_path, (str, os.PathLike)) or not os.path.exists(img_path):
        return 'missing'
    try:
        with Image.open(img_path) as img:
            img.verify()
        # verify() leaves the image object unusable, so reopen the file before
        # forcing a full decode through the RGB conversion.
        with Image.open(img_path) as img:
            img.convert('RGB')
    except FileNotFoundError:
        # The file vanished between the existence check and the open.
        return 'missing'
    except OSError as exc:
        # NOTE(review): the file sets ImageFile.LOAD_TRUNCATED_IMAGES = True,
        # which per Pillow's documentation lets truncated files load instead
        # of raising, so this branch may rarely fire - confirm that is intended.
        if 'truncated' in str(exc).lower():
            return 'truncated'
        return 'corrupted'
    except Exception:
        # Best-effort scan: one undecodable file must not abort the whole run,
        # so any other decoder failure is counted rather than propagated.
        return 'other'
    return None


def verify_and_clean_csv(csv_path, output_path=None):
    """Drop rows whose image cannot be opened and write the rest to a new CSV.

    Each row's ``image_path`` must exist, pass Pillow's ``Image.verify()``
    check and decode through an RGB conversion; rows failing any step are
    left out. A one-line summary of the failure counts is printed at the end.

    Args:
        csv_path: Path of the input CSV; it must have an ``image_path`` column.
        output_path: Destination of the cleaned CSV. When ``None``, the file
            is written next to the input as ``<name>_cleaned<ext>``.

    Returns:
        A DataFrame with only the rows whose image passed every check,
        keeping the input's columns, dtypes and index labels.

    Raises:
        KeyError: If the CSV has no ``image_path`` column.
    """
    if output_path is None:
        # splitext only touches the final extension, so directory names that
        # contain ".csv" are left alone and an input without a lowercase
        # ".csv" suffix can no longer map onto itself and be overwritten.
        root, ext = os.path.splitext(csv_path)
        output_path = root + '_cleaned' + ext

    df = pd.read_csv(csv_path)
    issues = {
        'missing': 0,
        'corrupted': 0,
        'truncated': 0,
        'other': 0,
    }

    keep = []
    for img_path in tqdm(df['image_path'], total=len(df), desc="Verifying images"):
        issue = _image_issue(img_path)
        if issue is not None:
            issues[issue] += 1
        keep.append(issue is None)

    # Filter with a boolean mask instead of rebuilding the frame from row
    # Series: this keeps column dtypes intact and preserves the header even
    # when no row survives.
    mask = pd.Series(keep, index=df.index, dtype=bool)
    cleaned_df = df.loc[mask]
    cleaned_df.to_csv(output_path, index=False)

    print(
        f"Kept {len(cleaned_df)} of {len(df)} rows "
        f"(missing: {issues['missing']}, corrupted: {issues['corrupted']}, "
        f"truncated: {issues['truncated']}, other: {issues['other']})"
    )
    return cleaned_df
if __name__ == "__main__":
    # Script entry point: clean the phone manifest first, then the laptop one.
    # Each call reads the raw CSV and writes the filtered copy to output_path.
    phone_cleaned = verify_and_clean_csv(
        csv_path=r'E:\fortransferee\mlproject7\aws_cust\training_data_phone213.csv',
        output_path=r'E:\fortransferee\mlproject7\aws_cust\data\phone\phone_training_data2134.csv',
    )

    laptop_cleaned = verify_and_clean_csv(
        csv_path=r'E:\fortransferee\mlproject7\aws_cust\training_data_laptop213.csv',
        output_path=r'E:\fortransferee\mlproject7\aws_cust\data\laptop\laptop_training_data2134.csv',
    )