Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import gradio as gr | |
| import re | |
| from datetime import timedelta | |
def process_data(files_mindbody, files_medserv, tollerance, progress=gr.Progress()):
    """Write an Excel report of MindBody rows that have no MedServ counterpart.

    A MindBody row counts as matched when some MedServ row has a date of
    service within ``tollerance`` days (inclusive, either direction) and the
    same first name or the same last name, compared case-insensitively.

    Args:
        files_mindbody: Uploaded MindBody spreadsheet(s), passed to ``load_data``.
        files_medserv: Uploaded MedServ spreadsheet(s), passed to ``load_data``.
        tollerance: Number of days of slack allowed between the two dates;
            truncated to an integer.
        progress: Gradio progress tracker used to report loop progress.

    Returns:
        Path of the generated workbook, ``'Comparison Results.xlsx'``.

    Raises:
        gr.Error: If a required column is missing from either data set.
        ValueError: If a MindBody date does not follow ``%d/%m/%Y``.
    """
    mindbody = load_data(files_mindbody)
    medserv = load_data(files_medserv)

    columns_to_include = ['DOS', 'Client ID', 'Client', 'Sale ID', 'Item name', 'Location', 'Item Total']
    _require_columns(mindbody, columns_to_include, 'MindBody')
    _require_columns(medserv, ['DOS', 'Client'], 'MedServ')

    mindbody = _split_client_names(mindbody)
    medserv = _split_client_names(medserv)

    mindbody['DOS'] = pd.to_datetime(mindbody['DOS'], format='%d/%m/%Y')

    if not pd.api.types.is_datetime64_any_dtype(medserv['DOS']):
        # A single MedServ cell may list several comma-separated dates:
        # give every date its own row before parsing.
        medserv['DOS'] = medserv['DOS'].astype(str).str.split(',')
        medserv = medserv.explode('DOS').reset_index(drop=True)
    medserv['DOS'] = _parse_dates(medserv['DOS'], ['%d/%m/%Y', '%Y-%m-%d'])

    # Numeric UI inputs may deliver a float; range/timedelta maths below
    # is defined in whole days.
    slack = timedelta(days=int(tollerance))

    # Loop-invariant views of the MedServ side, computed once.
    medserv_day = medserv['DOS'].dt.normalize()
    medserv_first = medserv['First Name'].str.lower()
    medserv_last = medserv['Last Name'].str.lower()

    mindbody_day = mindbody['DOS'].dt.normalize()
    window_start = (mindbody_day - slack).tolist()
    window_end = (mindbody_day + slack).tolist()
    first_names = mindbody['First Name'].str.lower().tolist()
    last_names = mindbody['Last Name'].str.lower().tolist()

    unmatched_flags = []
    for idx in progress.tqdm(range(len(mindbody)), desc='Analyzing files...'):
        in_window = medserv_day.between(window_start[idx], window_end[idx])
        # NOTE(review): a match needs only the first OR the last name to
        # agree, as in the original logic — confirm this leniency is intended.
        # Missing names compare unequal to everything, so they never match.
        same_name = medserv_first.eq(first_names[idx]) | medserv_last.eq(last_names[idx])
        unmatched_flags.append(not (in_window & same_name).any())

    # Selecting with a boolean mask keeps the datetime dtype of 'DOS' even
    # when nothing is unmatched, so the .dt accessor below always works.
    unmatched_df = mindbody.loc[unmatched_flags].copy()
    unmatched_df['DOS'] = unmatched_df['DOS'].dt.strftime('%d-%m-%Y')

    output_file_path = 'Comparison Results.xlsx'
    unmatched_df[columns_to_include].to_excel(output_file_path, index=False)
    return output_file_path


def _require_columns(df, columns, label):
    """Raise a user-facing error if any of *columns* is absent from *df*."""
    missing = [column for column in columns if column not in df.columns]
    if missing:
        raise gr.Error(f"{label} data is missing required column(s): {', '.join(missing)}")


def _split_client_names(df):
    """Return a copy of *df* with 'Last Name' / 'First Name' taken from 'Client'."""
    df = df.copy()
    # Collapse runs of commas so "Smith,, John" splits like "Smith, John".
    df['Client'] = df['Client'].str.replace(r',+', ',', regex=True)
    # n=1 keeps any further commas inside the first-name part; reindex
    # guarantees both columns exist even if no value contains a comma.
    parts = df['Client'].str.split(',', n=1, expand=True).reindex(columns=[0, 1])
    df['Last Name'] = parts[0].astype(object).str.strip()
    df['First Name'] = parts[1].astype(object).str.strip()
    return df


def _parse_dates(values, formats):
    """Parse *values* trying each format per cell; unparseable cells become NaT."""
    if pd.api.types.is_datetime64_any_dtype(values):
        return values
    text = values.astype(str).str.strip()
    parsed = None
    for fmt in formats:
        attempt = pd.to_datetime(text, format=fmt, errors='coerce')
        parsed = attempt if parsed is None else parsed.fillna(attempt)
    return parsed
def load_data(files):
    """Load one or more Excel uploads into a single DataFrame.

    The detected date column is renamed to ``'DOS'`` and the detected
    client-name column to ``'Client'`` (see ``find_date_column`` and
    ``find_name_column``); a column that cannot be detected is left as is.

    Args:
        files: A single upload or a list of uploads. Each item is either an
            object exposing the file path as ``.name`` or a plain path string.

    Returns:
        The concatenated DataFrame with normalised column names.

    Raises:
        gr.Error: If nothing was uploaded or a file is not ``.xls``/``.xlsx``.
    """
    if not files:
        raise gr.Error("No file provided: Please provide a .xls or .xlsx file")
    if not isinstance(files, (list, tuple)):
        files = [files]

    dfs = []
    for file in files:
        # Plain path strings have no .name attribute, so fall back to the item.
        filepath = getattr(file, 'name', file)
        # Compare case-insensitively so "REPORT.XLSX" is accepted too.
        if not str(filepath).lower().endswith(('.xlsx', '.xls')):
            raise gr.Error("Unsupported file format: Please provide a .xls or .xlsx file")
        dfs.append(pd.read_excel(filepath))

    df = pd.concat(dfs, ignore_index=True) if len(dfs) > 1 else dfs[0]

    # Find and rename the date column to 'DOS'
    date_column = find_date_column(df)
    if date_column:
        df = df.rename(columns={date_column: 'DOS'})

    # Find and rename the name column to 'Client'
    name_column = find_name_column(df)
    if name_column:
        df = df.rename(columns={name_column: 'Client'})

    return df
def find_name_column(df):
    """Return the column whose cells most often look like "Last, First [Middle ...]".

    Every cell is converted to text and tested against the name pattern from
    its first character. The column with the highest number of matching cells
    is returned; on ties the left-most column wins. ``None`` is returned when
    no cell in any column matches.
    """
    name_regex = re.compile(r"^[A-Za-z'-]+,\s[A-Za-z'-]+(?:\s[A-Za-z'-]+)*$")

    best_column = None
    best_hits = 0
    for candidate in df.columns:
        cells = df[candidate].astype(str)
        hits = sum(1 for cell in cells if name_regex.match(cell))
        # Strict comparison: an earlier column keeps the lead on equal counts.
        if hits > best_hits:
            best_column, best_hits = candidate, hits
    return best_column
def find_date_column(df):
    """Return the name of the column that most likely holds dates, or ``None``.

    A column literally named ``'Treatment dates'`` is returned outright.
    Otherwise every column is rendered as text and the one with the most cells
    containing a date-like token (``05/03/2024``, ``5/3/24``, ``2024-03-05``)
    is chosen; on ties the left-most column wins. ``None`` is returned when no
    cell in any column contains such a token.
    """
    if 'Treatment dates' in df.columns:
        return 'Treatment dates'

    # The leading group accepts 1-4 digits so that non-zero-padded days such
    # as "5/03/2024" are recognised alongside ISO years such as "2024-03-05".
    date_pattern = r"\b\d{1,4}[-/]\d{1,2}[-/]\d{2,4}\b"

    max_count = 0
    date_column = None
    for column in df.columns:
        matches = df[column].astype(str).str.contains(date_pattern, na=False)
        valid_count = matches.sum()
        # Strict comparison: an earlier column keeps the lead on equal counts.
        if valid_count > max_count:
            max_count = valid_count
            date_column = column
    return date_column