Doctor_Handwriting_Text_Detection / RemoveNotCorrectImages..py
Umer2762's picture
Upload folder using huggingface_hub
5626a1a verified
import os
import pandas as pd
def delete_images_with_patterns(directory: str, patterns: list):
"""
Deletes image files in the given directory if their filenames contain any of the specified patterns.
Args:
directory (str): The path to the directory containing images.
patterns (list): A list of substrings to check in filenames.
"""
if not os.path.exists(directory):
print(f"Directory '{directory}' does not exist.")
return
for filename in os.listdir(directory):
file_path = os.path.join(directory, filename)
# Check if the filename contains any of the specified patterns
if any("dr"+pattern in filename for pattern in patterns):
try:
os.remove(file_path)
print(f"Deleted: {file_path}")
except Exception as e:
print(f"Error deleting {file_path}: {e}")
def clean_csv(csv_path: str, patterns: list):
"""
Removes rows from the CSV if the first column contains filenames matching any pattern (e.g., "1_1" -> "dr1_1").
Ensures that there are no additional digits after the pattern unless separated by an underscore `_`.
"""
if not os.path.exists(csv_path):
print(f"CSV file '{csv_path}' does not exist.")
return
# Load CSV into a DataFrame
df = pd.read_csv(csv_path)
# Ensure the first column is treated as a string
df.iloc[:, 0] = df.iloc[:, 0].astype(str)
# Create modified patterns to match filenames
modified_patterns = [f"dr{p}" for p in patterns]
# Build a regex pattern to match filenames exactly or with an underscore and additional digits
regex_patterns = []
for pattern in modified_patterns:
# Match the pattern exactly or with an underscore and additional digits
regex_patterns.append(f"^{pattern}(_\\d+)?$")
# Combine all regex patterns into a single pattern
combined_regex = '|'.join(regex_patterns)
# Filter out rows where the first column matches any of the regex patterns
df = df[~df.iloc[:, 0].str.match(combined_regex, na=False)]
# Remove duplicates
df.drop_duplicates(inplace=True)
# Save cleaned data back to CSV
df.to_csv(csv_path, index=False)
print(f"Updated CSV saved: {csv_path}")
# List of text patterns to match in filenames
patterns_to_delete = [
"1_1",
"4_1",
"4_2",
"4_3",
"4_4",
"4_5",
"4_6",
"5_1",
"5_2",
"7_1",
"10_1",
"24_1",
"24_2",
"25_1",
"25_2",
"29_1",
"30_1",
"33_1",
"36_1",
"36_4",
"36_5",
"36_6",
"38_1",
"38_2",
"38_3",
"38_4",
"38_5",
"38_6",
"38_7",
"38_8",
"38_9",
"42_1",
"42_2",
"42_4",
"43_1",
"43_2",
"43_3",
"43_4",
"43_5",
"44_1",
"44_2",
"44_3",
"44_4",
"44_6",
"45_1",
"47_1",
"50_1",
"57_1",
"57_2",
"63_1",
"64_1",
"64_2",
"64_3",
"64_4",
"64_5",
"64_6",
"64_7",
"64_8",
"64_9",
"65_1",
"65_2",
"66_1",
"66_2",
"66_3",
"66_4",
"66_5",
"66_6",
"66_7",
"66_8",
"69_1",
"69_2",
"69_3",
"69_4",
"69_5",
"69_6",
"69_7",
"69_8",
"69_9",
"71_1",
"71_2",
"71_3",
"71_4",
"71_5",
"73_1",
"74_1",
"75_1",
"75_2",
"75_3",
"75_4",
"75_5",
"75_6",
"77_1",
"77_2",
"77_3",
"76_1",
"76_2",
"76_3",
"76_4",
"76_5",
"80_1",
"80_2",
"82_1",
"86_1",
"86_2",
"86_3",
"86_4",
"86_5",
"87_1",
"87_2",
"87_3",
"87_4",
"87_5",
"87_6",
"89_1",
"92_1",
"92_2",
"93_1",
"94_2",
"94_1",
"95_1",
"97_1",
"97_2",
"102_1",
"104_1",
"108_1",
"109_1",
"112_1",
"114_1",
"114_2",
"114_3",
"114_4",
"114_5",
"114_6",
"114_7",
"114_8",
"114_9",
"115_1",
"115_2",
"116_1",
"116_2",
"116_3",
"117_1",
"128_1",
"130_1",
"132_1",
"132_2",
"132_3",
"137_1",
"137_2",
"137_3",
"137_4",
"137_5",
"137_6",
"137_7",
"137_8",
"137_9",
"140_5",
"146_1",
"146_2",
"146_3",
"151_1",
"151_2",
"163_1",
"169_1",
"173_1",
"173_2",
"100_1"
]
# Specify your target directory
target_directory = "./cropped_images" # Change this to your actual directory
# Run the deletion function
# delete_images_with_patterns(target_directory, patterns_to_delete)
patterns_to_delete = [
"dr80_2",
"dr80_3",
"dr81_1",
"dr81_1",
"dr81_2",
"dr83_1",
"dr86_1",
"dr86_2",
"dr86_3",
"dr86_4",
"dr86_5",
"dr87_1",
"dr87_2",
"dr87_3",
"dr87_4",
"dr87_5",
"dr87_6",
"dr88_1",
"dr89_1",
"dr89_2",
"dr9_1",
"dr90_1",
"dr92_1",
"dr92_1",
"dr92_2",
"dr92_3",
"dr93_1",
"dr93_2",
"dr94_1",
"dr94_2",
"dr94_3",
"dr95_1",
"dr95_2",
"dr96_1",
"dr97_1",
"dr97_2",
"dr97_3",
"dr98_1",
]
clean_csv("all_cropped_data.csv",patterns=patterns_to_delete)