# device_price_detector / data /clean_dataset.py
# palakmathur's picture
# first
# 8dc12f7
import pandas as pd
from PIL import Image, ImageFile
import os
from tqdm import tqdm
# Process-wide Pillow switch, set as a side effect of importing this module.
# Per the Pillow docs it lets truncated image files be loaded instead of
# raising an error while decoding.
# NOTE(review): with this enabled, truncated files presumably pass the checks
# in verify_and_clean_csv and are kept as valid rows -- confirm that is intended.
ImageFile.LOAD_TRUNCATED_IMAGES = True
def _check_image(img_path):
    """Classify one image path as 'ok', 'missing', 'truncated', 'corrupted' or 'other'."""
    # Blank CSV cells are read by pandas as NaN (a float); os.path.exists()
    # raises TypeError on those, so anything that is not path-like counts
    # as a missing image instead of aborting the whole run.
    if not isinstance(img_path, (str, bytes, os.PathLike)):
        return 'missing'
    if not os.path.exists(img_path):
        return 'missing'
    try:
        with Image.open(img_path) as img:
            img.verify()
        # Pillow documents that an image must be reopened after verify()
        # before it can be loaded, hence the second open for the full decode.
        with Image.open(img_path) as img:
            img.convert('RGB').load()
    except OSError as e:
        # Pillow reports both truncated and otherwise unreadable files as
        # OSError; the message text is the only way to tell them apart here.
        return 'truncated' if 'truncated' in str(e).lower() else 'corrupted'
    except Exception:
        # Deliberate best-effort: any other decoder failure just drops the row.
        return 'other'
    return 'ok'


def verify_and_clean_csv(csv_path, output_path=None):
    """Write a copy of a CSV that keeps only rows whose image can be decoded.

    Every value in the ``image_path`` column is checked: the file must exist,
    pass ``Image.verify()``, and load fully after conversion to RGB. Rows that
    fail any check are dropped and counted as 'missing', 'truncated',
    'corrupted' or 'other'; the counts are printed once the run finishes.

    NOTE(review): this module sets ``ImageFile.LOAD_TRUNCATED_IMAGES = True``,
    which presumably makes Pillow tolerate truncated files, so the 'truncated'
    bucket may rarely be hit -- confirm that is intended.

    Args:
        csv_path: Path of the CSV to read. It must contain an ``image_path``
            column.
        output_path: Where to write the cleaned CSV. Defaults to ``csv_path``
            with ``_cleaned`` inserted before the file extension.

    Returns:
        The DataFrame of surviving rows, with the original columns and index
        labels.

    Raises:
        KeyError: If the CSV has no ``image_path`` column.
    """
    if output_path is None:
        # splitext only touches the final extension. str.replace('.csv', ...)
        # would rewrite every '.csv' in the path and, for names not ending in
        # lowercase '.csv', return the input path unchanged (overwriting it).
        root, ext = os.path.splitext(csv_path)
        output_path = f"{root}_cleaned{ext or '.csv'}"

    df = pd.read_csv(csv_path)
    issues = {
        'missing': 0,
        'corrupted': 0,
        'truncated': 0,
        'other': 0,
    }
    valid_positions = []

    image_paths = df['image_path']
    progress = tqdm(image_paths, total=len(df), desc="Verifying images")
    for position, img_path in enumerate(progress):
        status = _check_image(img_path)
        if status == 'ok':
            valid_positions.append(position)
        else:
            issues[status] += 1

    # Select by position from the original frame so columns and dtypes
    # survive even when no row is valid (the CSV then still gets its header).
    # copy() hands the caller an independent frame.
    cleaned_df = df.iloc[valid_positions].copy()
    cleaned_df.to_csv(output_path, index=False)

    print(f"Kept {len(cleaned_df)} of {len(df)} rows; dropped: {issues}")
    return cleaned_df
if __name__ == "__main__":
    # Each job pairs a source CSV with the destination for its cleaned copy.
    phone_job = (
        r'E:\fortransferee\mlproject7\aws_cust\training_data_phone213.csv',
        r'E:\fortransferee\mlproject7\aws_cust\data\phone\phone_training_data2134.csv',
    )
    laptop_job = (
        r'E:\fortransferee\mlproject7\aws_cust\training_data_laptop213.csv',
        r'E:\fortransferee\mlproject7\aws_cust\data\laptop\laptop_training_data2134.csv',
    )

    # Run the phone dataset first, then the laptop dataset.
    phone_cleaned = verify_and_clean_csv(*phone_job)
    laptop_cleaned = verify_and_clean_csv(*laptop_job)