Spaces:

Umer2762
/

Doctor_Handwriting_Text_Detection

Running

File size: 5,634 Bytes

5626a1a

import os
import pandas as pd
def delete_images_with_patterns(directory: str, patterns: list):
    """

    Deletes image files in the given directory if their filenames contain any of the specified patterns.



    Args:

        directory (str): The path to the directory containing images.

        patterns (list): A list of substrings to check in filenames.

    """
    if not os.path.exists(directory):
        print(f"Directory '{directory}' does not exist.")
        return
    
    for filename in os.listdir(directory):
        file_path = os.path.join(directory, filename)
        
        # Check if the filename contains any of the specified patterns
        if any("dr"+pattern in filename for pattern in patterns):
            try:
                os.remove(file_path)
                print(f"Deleted: {file_path}")
            except Exception as e:
                print(f"Error deleting {file_path}: {e}")

def clean_csv(csv_path: str, patterns: list):
    """

    Removes rows from the CSV if the first column contains filenames matching any pattern (e.g., "1_1" -> "dr1_1").

    Ensures that there are no additional digits after the pattern unless separated by an underscore `_`.

    """
    if not os.path.exists(csv_path):
        print(f"CSV file '{csv_path}' does not exist.")
        return
    
    # Load CSV into a DataFrame
    df = pd.read_csv(csv_path)

    # Ensure the first column is treated as a string
    df.iloc[:, 0] = df.iloc[:, 0].astype(str)

    # Create modified patterns to match filenames
    modified_patterns = [f"dr{p}" for p in patterns]

    # Build a regex pattern to match filenames exactly or with an underscore and additional digits
    regex_patterns = []
    for pattern in modified_patterns:
        # Match the pattern exactly or with an underscore and additional digits
        regex_patterns.append(f"^{pattern}(_\\d+)?$")

    # Combine all regex patterns into a single pattern
    combined_regex = '|'.join(regex_patterns)

    # Filter out rows where the first column matches any of the regex patterns
    df = df[~df.iloc[:, 0].str.match(combined_regex, na=False)]

    # Remove duplicates
    df.drop_duplicates(inplace=True)

    # Save cleaned data back to CSV
    df.to_csv(csv_path, index=False)
    print(f"Updated CSV saved: {csv_path}")

# List of text patterns to match in filenames
patterns_to_delete = [
    "1_1", 
    "4_1", 
    "4_2", 
    "4_3", 
    "4_4", 
    "4_5", 
    "4_6",
    "5_1", 
    "5_2", 
    "7_1", 
    "10_1", 
    "24_1", 
    "24_2", 
    "25_1", 
    "25_2", 
    "29_1", 
    "30_1", 
    "33_1", 
    "36_1", 
    "36_4", 
    "36_5", 
    "36_6", 
    "38_1", 
    "38_2", 
    "38_3", 
    "38_4", 
    "38_5", 
    "38_6", 
    "38_7", 
    "38_8", 
    "38_9", 
    "42_1", 
    "42_2", 
    "42_4", 
    "43_1", 
    "43_2", 
    "43_3", 
    "43_4", 
    "43_5", 
    "44_1", 
    "44_2", 
    "44_3", 
    "44_4", 
    "44_6", 
    "45_1", 
    "47_1", 
    "50_1", 
    "57_1", 
    "57_2", 
    "63_1", 
    "64_1", 
    "64_2", 
    "64_3", 
    "64_4", 
    "64_5", 
    "64_6", 
    "64_7", 
    "64_8", 
    "64_9", 
    "65_1", 
    "65_2", 
    "66_1", 
    "66_2", 
    "66_3", 
    "66_4", 
    "66_5", 
    "66_6", 
    "66_7", 
    "66_8", 
    "69_1", 
    "69_2", 
    "69_3", 
    "69_4", 
    "69_5", 
    "69_6", 
    "69_7", 
    "69_8", 
    "69_9", 
    "71_1", 
    "71_2", 
    "71_3", 
    "71_4", 
    "71_5", 
    "73_1", 
    "74_1", 
    "75_1", 
    "75_2", 
    "75_3", 
    "75_4", 
    "75_5", 
    "75_6", 
    "77_1", 
    "77_2", 
    "77_3", 
    "76_1", 
    "76_2", 
    "76_3", 
    "76_4", 
    "76_5",
    "80_1", 
    "80_2", 
    "82_1", 
    "86_1", 
    "86_2", 
    "86_3", 
    "86_4", 
    "86_5",
    "87_1", 
    "87_2", 
    "87_3", 
    "87_4", 
    "87_5", 
    "87_6",
    "89_1", 
    "92_1", 
    "92_2", 
    "93_1", 
    "94_2", 
    "94_1", 
    "95_1",
    "97_1", 
    "97_2",
    "102_1",
    "104_1",
    "108_1",
    "109_1",
    "112_1",
    "114_1",
    "114_2",
    "114_3",
    "114_4",
    "114_5",
    "114_6",
    "114_7",
    "114_8",
    "114_9",
    "115_1",
    "115_2",
    "116_1",
    "116_2",
    "116_3",
    "117_1",
    "128_1",
    "130_1",
    "132_1",
    "132_2",
    "132_3",
    "137_1",
    "137_2",
    "137_3",
    "137_4",
    "137_5",
    "137_6",
    "137_7",
    "137_8",
    "137_9",
    "140_5",
    "146_1",
    "146_2",
    "146_3",
    "151_1",
    "151_2",
    "163_1",
    "169_1",
    "173_1",
    "173_2",
    "100_1"
]

# Specify your target directory
target_directory = "./cropped_images"  # Change this to your actual directory

# Run the deletion function
# delete_images_with_patterns(target_directory, patterns_to_delete)
patterns_to_delete = [
    "dr80_2",
    "dr80_3",
    "dr81_1",
    "dr81_1",
    "dr81_2",
    "dr83_1",
    "dr86_1",
    "dr86_2",
    "dr86_3",
    "dr86_4",
    "dr86_5",
    "dr87_1",
    "dr87_2",
    "dr87_3",
    "dr87_4",
    "dr87_5",
    "dr87_6",
    "dr88_1",
    "dr89_1",
    "dr89_2",
    "dr9_1",
    "dr90_1",
    "dr92_1",
    "dr92_1",
    "dr92_2",
    "dr92_3",
    "dr93_1",
    "dr93_2",
    "dr94_1",
    "dr94_2",
    "dr94_3",
    "dr95_1",
    "dr95_2",
    "dr96_1",
    "dr97_1",
    "dr97_2",
    "dr97_3",
    "dr98_1",
]
clean_csv("all_cropped_data.csv",patterns=patterns_to_delete)