Spaces:

Sambit20030731
/

dedup

Runtime error

App Files Files Community

Sambit20030731 committed on Mar 22, 2024

Commit

6747401

verified ·

1 Parent(s): 8112a45

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile +11 -0
app.py +451 -0
output/readme.txt.txt +1 -0
requirement.txt +9 -0
static/script.js +64 -0
static/styles.css +119 -0
templates/index.html +60 -0
uploads/readme.txt.txt +1 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,11 @@

+# Container image for the Space; the web app listens on port 7860.
+FROM python:3.9
+WORKDIR /code
+# The dependency list is committed as requirement.txt (singular).
+COPY ./requirement.txt /code/requirement.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirement.txt
+COPY . .
+# app.py defines a Flask (WSGI) object named `app`, so it is served with the
+# Flask CLI rather than uvicorn, which expects an ASGI app at app/main.py.
+CMD ["python", "-m", "flask", "--app", "app", "run", "--host", "0.0.0.0", "--port", "7860"]

app.py ADDED Viewed

	@@ -0,0 +1,451 @@

+#install dependencies
+from flask import Flask, render_template, request, redirect, url_for
+import os
+import shutil
+import webview
+import tkinter as tk
+from tkinter import filedialog
+import openpyxl
+import pandas as pd
+import requests
+from fuzzywuzzy import fuzz
+from openpyxl.styles import PatternFill
+from openpyxl.styles.alignment import Alignment
+import google.generativeai as genai
+app = Flask(__name__, static_folder='./static', template_folder='./templates')
+app.config['UPLOAD_FOLDER'] = 'uploads'
+app.config['OUTPUT_FOLDER'] = 'output'
+output_file = None
+window = webview.create_window('DeDuplicae-Vendor', app)
+#connect to google gemini API key
+GOOGLE_API_KEY='AIzaSyCtACPu9EOnEa1_iAWsv_u__PQRpaCT564'
+genai.configure(api_key=GOOGLE_API_KEY)
+#Load the gemini model
+model = genai.GenerativeModel('gemini-pro')
+# Function to apply to df1 to create the cont_person_name column
+def process_fuzzy_ratios(rows_dict):
+    fuzz_data = {}
+    for key, row in enumerate(rows_dict):
+        if key == 0:
+            # For the first row, delete specified columns
+            del row["address_fuzzy_ratio"]
+            del row["bank_fuzzy_ratio"]
+            del row["name_fuzzy_ratio"]
+            del row["accgrp_fuzzy_ratio"]
+            del row["tax_fuzzy_ratio"]
+            del row["postal_fuzzy_ratio"]
+        else:
+            # For subsequent rows, store data in fuzz_data dictionary
+            fuzz_data["row_" + str(key + 1)] = {
+                "address_fuzzy_ratio": row.pop("address_fuzzy_ratio"),
+                "bank_fuzzy_ratio": row.pop("bank_fuzzy_ratio"),
+                "name_fuzzy_ratio": row.pop("name_fuzzy_ratio"),
+                "accgrp_fuzzy_ratio": row.pop("accgrp_fuzzy_ratio"),
+                "tax_fuzzy_ratio": row.pop("tax_fuzzy_ratio"),
+                "postal_fuzzy_ratio": row.pop("postal_fuzzy_ratio")
+            }
+    return fuzz_data, rows_dict
+# Code to perform gemini analysis
+def gemini_analysis(dataframe):
+    prev_row_duplicate = False
+    prev_row_number = None
+    for index, row in dataframe.iterrows():
+        # Find duplicate pairs
+        if row['Remarks'] == 'Duplicate':
+            if prev_row_duplicate:
+                duplicate_pairs=[]
+                row1 = dataframe.loc[index-1].to_dict()
+                row2 = row.to_dict()
+                duplicate_pairs.append(row1)
+                duplicate_pairs.append(row2)
+                fuzzy_ratios, duplicate_pairs = process_fuzzy_ratios(duplicate_pairs)
+                for dictionary in duplicate_pairs:
+                    for _ in range(12):
+                        if dictionary:
+                            dictionary.popitem()
+                main_data_str = "[{}]".format(', '.join([str(d) for d in duplicate_pairs]))
+                fuzzy_data_str = "{}".format(fuzzy_ratios)
+                qs="I have the data",main_data_str,"The corresponding fuzzy ratios are here: ",fuzzy_data_str,"Give a concise explanation why these two rows are duplicate based on analyzing the main data and explaining which column values are same and which column values are different?"
+                # Ask gemini to analyse the data
+                try:
+                    response = model.generate_content(qs)
+                    dataframe.at[index-1, 'Explanation'] = response.text
+                except requests.HTTPError:
+                    dataframe.at[index-1, 'Explanation'] = 'An error occured'
+                except ValueError:
+                    dataframe.at[index-1, 'Explanation'] = 'An error occured'
+                except Exception:
+                    dataframe.at[index-1, 'Explanation'] = 'An error occured'
+            prev_row_duplicate = True
+        else:
+            prev_row_duplicate = False
+# The logic to find duplicacy
+def process_csv(file, check=['Tax','Bank','Address','Name','PostCode','AccGrp']):
+    def calculate_tax_duplicacy(df):
+        df.sort_values(['Tax'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'tax_fuzzy_ratio'] = 100
+        last_row_index = len(df) - 1
+        df.at[last_row_index, 'tax_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_tax = df['Tax'].iloc[i]
+            previous_tax = df['Tax'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_tax, current_tax)
+            df.at[i, 'tax_fuzzy_ratio'] = fuzzy_ratio
+        df['tax_fuzzy_ratio'] = pd.to_numeric(df['tax_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups based on tax column
+        group_counter = 1
+        df.at[0, 'tax_based_group'] = group_counter
+        for i in range(1, len(df)):
+            if df.at[i, 'tax_fuzzy_ratio'] > 90:
+                df.at[i, 'tax_based_group'] = df.at[i - 1, 'tax_based_group']
+            else:
+                group_counter += 1
+                df.at[i, 'tax_based_group'] = group_counter
+        return df
+    def calculate_bank_duplicacy(df):
+        df.sort_values(['Group_tax', 'Bank'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'bank_fuzzy_ratio'] = 100
+        df.at[last_row_index, 'bank_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_address = df['Bank'].iloc[i]
+            previous_address = df['Bank'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_address, current_address)
+            df.at[i, 'bank_fuzzy_ratio'] = fuzzy_ratio
+        df['bank_fuzzy_ratio'] = pd.to_numeric(df['bank_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups for bank column
+        bank_group_counter = 1
+        df.at[0, 'bank_based_group'] = str(bank_group_counter)
+        group = df.at[0, 'tax_based_group']
+        for i in range(1, len(df)):
+            if df.at[i, 'bank_fuzzy_ratio'] >= 100:
+                df.at[i, 'bank_based_group'] = df.at[i - 1, 'bank_based_group']
+            else:
+                if df.at[i, 'tax_based_group'] != group:
+                    bank_group_counter = 1
+                    group = df.at[i, 'tax_based_group']
+                else:
+                    bank_group_counter += 1
+            df.at[i, 'bank_based_group'] = str(bank_group_counter)
+        return df
+    def calculate_address_duplicacy(df):
+        df.sort_values(['Group_tax_bank', 'Address'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'address_fuzzy_ratio'] = 100
+        df.at[last_row_index, 'address_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_address = df['Address'].iloc[i]
+            previous_address = df['Address'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_address, current_address)
+            df.at[i, 'address_fuzzy_ratio'] = fuzzy_ratio
+        df['address_fuzzy_ratio'] = pd.to_numeric(df['address_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups for address column
+        address_group_counter = 1
+        df.at[0, 'address_based_group'] = str(address_group_counter)
+        group = df.at[0, 'Group_tax_bank']
+        for i in range(1, len(df)):
+            if df.at[i, 'address_fuzzy_ratio'] > 70:
+                df.at[i, 'address_based_group'] = df.at[i - 1, 'address_based_group']
+            else:
+                if df.at[i, 'Group_tax_bank'] != group:
+                    address_group_counter = 1
+                    group = df.at[i, 'Group_tax_bank']
+                else:
+                    address_group_counter += 1
+            df.at[i, 'address_based_group'] = str(address_group_counter)
+        return df
+    def calculate_name_duplicacy(df):
+        df.sort_values(['Group_tax_bank_add', 'Name'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'name_fuzzy_ratio'] = 100
+        df.at[last_row_index, 'name_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_address = df['Name'].iloc[i]
+            previous_address = df['Name'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_address, current_address)
+            df.at[i, 'name_fuzzy_ratio'] = fuzzy_ratio
+        df['name_fuzzy_ratio'] = pd.to_numeric(df['name_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups for name column
+        name_group_counter = 1
+        df.at[0, 'name_based_group'] = str(name_group_counter)
+        group = df.at[0, 'Group_tax_bank_add']
+        for i in range(1, len(df)):
+            if df.at[i, 'name_fuzzy_ratio'] > 80:
+                df.at[i, 'name_based_group'] = df.at[i - 1, 'name_based_group']
+            else:
+                if df.at[i, 'Group_tax_bank_add'] != group:
+                    name_group_counter = 1
+                    group = df.at[i, 'Group_tax_bank_add']
+                else:
+                    name_group_counter += 1
+            df.at[i, 'name_based_group'] = str(name_group_counter)
+        return df
+    def calculate_postcode_duplicacy(df):
+        df.sort_values(['Group_tax_bank_add_name', 'POSTCODE1'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'postal_fuzzy_ratio'] = 100
+        df.at[last_row_index, 'postal_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_address = df['POSTCODE1'].iloc[i]
+            previous_address = df['POSTCODE1'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_address, current_address)
+            df.at[i, 'postal_fuzzy_ratio'] = fuzzy_ratio
+        df['postal_fuzzy_ratio'] = pd.to_numeric(df['postal_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups for postcode column
+        postcode_group_counter = 1
+        df.at[0, 'postal_based_group'] = str(postcode_group_counter)
+        group = df.at[0, 'Group_tax_bank_add_name']
+        for i in range(1, len(df)):
+            if df.at[i, 'postal_fuzzy_ratio'] > 90:
+                df.at[i, 'postal_based_group'] = df.at[i - 1, 'postal_based_group']
+            else:
+                if df.at[i, 'Group_tax_bank_add_name'] != group:
+                    postcode_group_counter = 1
+                    group = df.at[i, 'Group_tax_bank_add_name']
+                else:
+                    postcode_group_counter += 1
+            df.at[i, 'postal_based_group'] = str(postcode_group_counter)
+        return df
+    def calculate_accgrp_duplicacy(df):
+        df.sort_values(['Group_tax_bank_add_name_post', 'KTOKK'], inplace=True)
+        df = df.reset_index(drop=True)
+        df.at[0, 'accgrp_fuzzy_ratio'] = 100
+        df.at[last_row_index, 'accgrp_fuzzy_ratio'] = 100
+        for i in range(1, last_row_index):
+            current_address = df['KTOKK'].iloc[i]
+            previous_address = df['KTOKK'].iloc[i - 1]
+            fuzzy_ratio = fuzz.ratio(previous_address, current_address)
+            df.at[i, 'accgrp_fuzzy_ratio'] = fuzzy_ratio
+        df['accgrp_fuzzy_ratio'] = pd.to_numeric(df['accgrp_fuzzy_ratio'], errors='coerce')
+        # Calculate the duplicate groups for accgrp column
+        accgrp_group_counter = 1
+        df.at[0, 'accgrp_based_group'] = str(accgrp_group_counter)
+        group = df.at[0, 'Group_tax_bank_add_name_post']
+        for i in range(1, len(df)):
+            if df.at[i, 'accgrp_fuzzy_ratio'] >= 100:
+                df.at[i, 'accgrp_based_group'] = df.at[i - 1, 'accgrp_based_group']
+            else:
+                if df.at[i, 'Group_tax_bank_add_name_post'] != group:
+                    accgrp_group_counter = 1
+                    group = df.at[i, 'Group_tax_bank_add_name_post']
+                else:
+                    accgrp_group_counter += 1
+            df.at[i, 'accgrp_based_group'] = str(accgrp_group_counter)
+        return df
+    # Search for the header row
+    def find_header_row(file_path, specified_headers, sheet_name):
+        workbook = openpyxl.load_workbook(file_path)
+        sheet = workbook[sheet_name]
+        header_row = None
+        temp_values = []
+        for row in sheet.iter_rows():
+            for cell in row:
+                if cell.value in specified_headers:
+                    header_row = cell.row
+                    break
+            if header_row is not None:
+                break
+        if header_row is None:
+            return
+        # Store values in temporary variable
+        for row in range(1, header_row):
+            for cell in sheet[row]:
+                temp_values.append(cell.value)
+        # Read DataFrame below the header row using pandas
+        df = pd.DataFrame(sheet.iter_rows(min_row=header_row + 1, values_only=True),
+                          columns=[cell.value for cell in next(sheet.iter_rows(min_row=header_row))])
+        return header_row, temp_values, df
+    sheet_name1 = 'General Data '
+    specified_headers = ["LIFNR",	"KTOKK",	"NAMEFIRST",	"NAMELAST",	"NAME3",	"NAME4",	"STREET",	"POSTCODE1",	"CITY1",	"COUNTRY",	"REGION",	"SMTPADDR",	"BANKL",	"BANKN",	"TAXTYPE",	"TAXNUM", "Unnamed: 16", "Unnamed: 17", "Unnamed: 18"]
+    header_row, temp_values, df = find_header_row(file, specified_headers, sheet_name1)
+    # Replace null values with a blank space
+    df = df.fillna(" ")
+    # Creating new columns by concatenating original columns
+    df['Address'] = df['STREET'].astype(str) + '-' + df['CITY1'].astype(str) + '-' + df['COUNTRY'].astype(str) + '-' + \
+                    df['REGION'].astype(str)
+    df['Name'] = df['NAMEFIRST'].astype(str) + '-' + df['NAMELAST'].astype(str) + '-' + df['NAME3'].astype(str) + '-' + \
+                 df['NAME4'].astype(str)
+    df['Bank'] = df['BANKL'].astype(str) + '-' + df['BANKN'].astype(str)
+    df['Tax'] = df['TAXTYPE'].astype(str) + '-' + df['TAXNUM'].astype(str)
+    # Converting all concatenated columns to lowercase
+    df['Name'] = df['Name'].str.lower()
+    df['Address'] = df['Address'].str.lower()
+    df['Bank'] = df['Bank'].str.lower()
+    df['Tax'] = df['Tax'].str.lower()
+    # Create new columns with the following names for fuzzy ratio
+    df['name_fuzzy_ratio'] = ''
+    df['accgrp_fuzzy_ratio'] = ''
+    df['address_fuzzy_ratio'] = ''
+    df['bank_fuzzy_ratio'] = ''
+    df['tax_fuzzy_ratio'] = ''
+    df['postal_fuzzy_ratio'] = ''
+    # Create new columns with the following names for crearing groups
+    df['name_based_group'] = ''
+    df['accgrp_based_group'] = ''
+    df['address_based_group'] = ''
+    df['bank_based_group'] = ''
+    df['tax_based_group'] = ''
+    df['postal_based_group'] = ''
+    # Calculate last row index value
+    last_row_index = len(df) - 1
+    # Calculate the fuzzy ratios for tax column
+    if 'Tax' in check:
+        df = calculate_tax_duplicacy(df)
+    df['Group_tax'] = df.apply(lambda row: '{}'.format(row['tax_based_group']), axis=1)
+    # Calculate the fuzzy ratios for bank column
+    if 'Bank' in check:
+        df = calculate_bank_duplicacy(df)
+    df['Group_tax_bank'] = df.apply(lambda row: '{}_{}'.format(row['tax_based_group'], row['bank_based_group']), axis=1)
+    # Calculate the fuzzy ratios for address column
+    if 'Address' in check:
+        df = calculate_address_duplicacy(df)
+    df['Group_tax_bank_add'] = df.apply(lambda row: '{}_{}'.format(row['Group_tax_bank'], row['address_based_group']),
+                                        axis=1)
+    # Calculate the fuzzy ratios for name column
+    if 'Name' in check:
+        df = calculate_name_duplicacy(df)
+    df['Group_tax_bank_add_name'] = df.apply(
+        lambda row: '{}_{}'.format(row['Group_tax_bank_add'], row['name_based_group']), axis=1)
+    # Calculate the fuzzy ratios for postcode column
+    if 'PostCode' in check:
+        df = calculate_postcode_duplicacy(df)
+    df['Group_tax_bank_add_name_post'] = df.apply(
+        lambda row: '{}_{}'.format(row['Group_tax_bank_add_name'], row['postal_based_group']), axis=1)
+    # Calculate the fuzzy ratios for accgrp column
+    if 'AccGrp' in check:
+        df = calculate_accgrp_duplicacy(df)
+    df['Group_tax_bank_add_name_post_accgrp'] = df.apply(
+        lambda row: '{}_{}'.format(row['Group_tax_bank_add_name_post'], row['accgrp_based_group']), axis=1)
+    # Find the final duplicate groups in AND condition
+    duplicate_groups = df['Group_tax_bank_add_name_post_accgrp'].duplicated(keep=False)
+    df['Remarks'] = ['Duplicate' if is_duplicate else 'Unique' for is_duplicate in duplicate_groups]
+    # Ask gemini to analyse the duplicate columns
+    gemini_analysis(df)
+    # Drop the columns related to fuzzy ratios and groups
+    columns_to_drop = ['name_fuzzy_ratio', 'accgrp_fuzzy_ratio', 'address_fuzzy_ratio', 'bank_fuzzy_ratio',
+                       'tax_fuzzy_ratio', 'postal_fuzzy_ratio', 'name_based_group', 'accgrp_based_group',
+                       'address_based_group', 'bank_based_group', 'tax_based_group', 'postal_based_group',
+                       'Group_tax_bank', 'Group_tax_bank_add', 'Group_tax_bank_add_name',
+                       'Group_tax_bank_add_name_post', 'Group_tax', 'Group_tax_bank_add_name_post_accgrp']
+    df = df.drop(columns=columns_to_drop, axis=1)
+    df.to_excel('output/output.xlsx', index=False)
+    excel_writer = pd.ExcelWriter('output/output.xlsx', engine='openpyxl')
+    df.to_excel(excel_writer, index=False, sheet_name='Sheet1')
+    # Access the workbook
+    workbook = excel_writer.book
+    worksheet = workbook['Sheet1']
+    # Apply row coloring based on the value in the 'Remarks' column and also wrap the texts
+    duplicate_fill = PatternFill(start_color="FFFF00", end_color="FFFF00", fill_type="solid")
+    for idx, row in df.iterrows():
+        if row['Remarks'] == 'Duplicate':
+            for cell in worksheet[idx + 2]:
+                cell.alignment = Alignment(wrap_text=True)
+                cell.fill = duplicate_fill
+    # Iterate over columns and set their width
+    for col in worksheet.columns:
+        col_letter = col[0].column_letter
+        worksheet.column_dimensions[col_letter].width = 28
+    # Iterate over rows and set their height
+    for row in worksheet.iter_rows():
+        worksheet.row_dimensions[row[0].row].height = 20
+    # Save the changes
+    excel_writer.close()
+    output_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.xlsx')
+    return output_path
+def save_error_message(error_message):
+    with open('static/error.txt', 'w') as f:
+        f.write(error_message)
+@app.route('/', methods=['GET', 'POST'])
+def upload_file():
+    global output_file
+    error_message = None
+    if request.method == 'POST':
+        file = request.files['file']
+        selected_options = request.form.getlist('option')
+        if file:
+            try:
+                file_path = os.path.join(app.config['UPLOAD_FOLDER'], file.filename)
+                file.save(file_path)
+                output_file = process_csv(file_path)
+                return redirect(url_for('upload_file'))
+            except Exception as e:
+                error_message = str(e)
+                save_error_message(error_message)
+    return render_template('index.html', output_file=output_file, error_message=error_message)
+def save_file_dialog(default_filename="output.xlsx", filetypes=(("XLSX files", ".xlsx"), ("All files", ".*"))):
+    root = tk.Tk()
+    root.withdraw()
+    file_path = filedialog.asksaveasfilename(initialfile=default_filename, filetypes=filetypes, defaultextension=".xlsx")
+    return file_path
+@app.route('/downloads/output.xlsx')
+def download_file():
+    output_file_path = os.path.join(app.config['OUTPUT_FOLDER'], 'output.xlsx')
+    selected_path = save_file_dialog()
+    if selected_path:
+        shutil.copyfile(output_file_path, selected_path)
+    return redirect(url_for('upload_file'))
+# Start Flask's built-in server (debug mode on) when this file is run as a script.
+if __name__ == '__main__':
+    app.run(debug=True)

output/readme.txt.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Deduplication

requirement.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# os, shutil and tkinter ship with Python and cannot be installed with pip.
+flask
+openpyxl
+pandas
+requests
+fuzzywuzzy
+google-generativeai
+# Provides the `webview` module imported by app.py
+pywebview

static/script.js ADDED Viewed

	@@ -0,0 +1,64 @@

+function submitForm() {
+    var fileInput = document.getElementById('csvFile');
+    var processingMsg = document.getElementById('processingMsg');
+    if (fileInput.files.length === 0) {
+        alert('Please select a CSV file.');
+        return;
+    }
+    var formData = new FormData();
+    formData.append('csvFile', fileInput.files[0]);
+    // Show processing message
+    document.getElementById('uploadForm').classList.add('hidden');
+    processingMsg.classList.remove('hidden');
+    // Simulate backend processing (replace with actual AJAX call)
+    setTimeout(function() {
+        // After processing (simulated with setTimeout), show success message
+        processingMsg.innerHTML = '<p>File processed successfully. <a href="#" onclick="downloadProcessedFile()">Download processed file</a></p>';
+    }, 2000);
+}
+// Placeholder used by the link that submitForm() renders: it only shows an
+// alert and does not download anything.
+function downloadProcessedFile() {
+    // Here you can add code to download the processed file
+    alert('Downloading processed file...');
+    // Replace this alert with your actual download logic
+}
+document.getElementById('submitBtn').addEventListener('click', function() {
+    var fileInput = document.getElementById('csvFile');
+    var file = fileInput.files[0];
+    if (file) {
+         var formData = new FormData();
+         formData.append('file', file);
+         // Capture checkbox values
+         var checkboxes = document.querySelectorAll('input[name="option"]:checked');
+         checkboxes.forEach(function(checkbox) {
+               formData.append('option', checkbox.value);
+         });
+         var xhr = new XMLHttpRequest();
+         xhr.open('POST', '/');
+         xhr.upload.onprogress = function(event) {
+         if (event.lengthComputable) {
+               var percentComplete = (event.loaded / event.total) * 100;
+               document.getElementById('progressBar').style.width = percentComplete + '%';
+         }
+    };
+    xhr.onloadstart = function() {
+           document.getElementById('processingMsg').classList.remove('hidden');
+    };
+    xhr.onloadend = function() {
+           document.getElementById('processingMsg').classList.add('hidden');
+           document.getElementById('downloadBtn').classList.remove('hidden');
+           var response = JSON.parse(xhr.responseText);
+           document.getElementById('downloadBtn').addEventListener('click', function() {
+                  window.location.href = '/downloads/output.xlsx';
+           });
+    };
+    xhr.send(formData);
+    }
+});

static/styles.css ADDED Viewed

	@@ -0,0 +1,119 @@

+/* Page background and the centred white card that holds the tool */
+body {
+    font-family: Arial, sans-serif;
+    background-color: #f0f0f0;
+    margin: 0;
+    padding: 100px 20px;
+}
+.container {
+    max-width: 600px;
+    margin: 0 auto;
+    background-color: #fff;
+    padding: 20px;
+    border-radius: 5px;
+    box-shadow: 0 2px 5px rgba(0, 0, 0, 0.1);
+    display: flex;
+    flex-direction: column;
+    align-items: center;
+    justify-content: center;
+}
+h1 {
+    text-align: center;
+    color: #333;
+}
+/* Upload form and buttons */
+form {
+    display: flex;
+    flex-direction: column;
+}
+input[type="file"] {
+    margin-bottom: 10px;
+}
+button {
+    padding: 10px 20px;
+    background-color: #007bff;
+    color: #fff;
+    border: none;
+    cursor: pointer;
+}
+button:hover {
+    background-color: #0056b3;
+}
+#processingMsg {
+    text-align: center;
+}
+/* Utility class toggled from script.js to show or hide elements */
+.hidden {
+    display: none;
+}
+/* Download button wrapper; `border-box` is not a CSS property, so the
+   rounded corners are set with border-radius as on #submitBtn. */
+#downloadBtn {
+    border-radius: 5px;
+    margin-top: 20px;
+}
+#downloadBtn button {
+    border-radius: 5px;
+    padding: 10px 20px;
+}
+/* Checkbox area listing the available duplicate checks */
+.options-container {
+    margin-top: 20px;
+    display: flex;
+    flex-wrap: wrap;
+    justify-content: center;
+}
+.option {
+    margin-right: 20px;
+    margin-bottom: 10px;
+}
+.option label {
+    margin-left: 5px;
+}
+.options-wrapper {
+    background-color: #f2f2f2;
+    border-radius: 8px;
+    padding: 20px;
+    margin-top: 20px;
+}
+#checkbox-heading {
+    text-align: center;
+    font-size: 16px;
+    margin-bottom: 10px;
+}
+#explanation-note {
+    text-align: center;
+    margin-top: 20px;
+    font-style: italic;
+}
+#submitBtn {
+    margin-top: 20px;
+    border-radius: 5px;
+}
+/* Rotating ring shown inside #processingMsg */
+.spinner {
+     border: 4px solid rgba(0, 0, 0, 0.1);
+     border-left-color: #333;
+     border-radius: 50%;
+     width: 50px;
+     height: 50px;
+     animation: spin 1s linear infinite;
+     margin: 20px auto;
+}
+@keyframes spin {
+     0% { transform: rotate(0deg); }
+     100% { transform: rotate(360deg); }
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,60 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>Vendor Master De-Duplication Tool</title>
+    <link rel="stylesheet" href="{{ url_for('static', filename='styles.css') }}">
+</head>
+<body>
+    <div class="container">
+        <h1>Vendor Master De-Duplication Tool</h1>
+        <!-- The file is sent by script.js; the form itself is never submitted -->
+        <form id="uploadForm" enctype="multipart/form-data">
+            <input type="file" name="file" id="csvFile" accept=".xlsx">
+        </form>
+        <!-- Each checked box is posted as an "option" value -->
+        <div class="options-wrapper">
+            <div id="checkbox-heading">Select the options based on which duplication check will be performed and submit</div>
+            <div class="options-container">
+                <div class="option">
+                    <input type="checkbox" name="option" value="Tax" id="option1" checked>
+                    <label for="option1">Tax</label>
+                </div>
+                <div class="option">
+                    <input type="checkbox" name="option" value="Bank" id="option2" checked>
+                    <label for="option2">Bank</label>
+                </div>
+                <div class="option">
+                    <input type="checkbox" name="option" value="Address" id="option3" checked>
+                    <label for="option3">Address</label>
+                </div>
+                <div class="option">
+                    <input type="checkbox" name="option" value="Name" id="option4" checked>
+                    <label for="option4">Name</label>
+                </div>
+                <div class="option">
+                    <input type="checkbox" name="option" value="PostCode" id="option5" checked>
+                    <label for="option5">PostCode</label>
+                </div>
+                <div class="option">
+                    <input type="checkbox" name="option" value="AccGrp" id="option6" checked>
+                    <label for="option6">AccGrp</label>
+                </div>
+            </div>
+        </div>
+        <button type="button" id="submitBtn">Submit</button>
+        <div id="processingMsg" class="hidden">
+            <div class="spinner"></div>
+        </div>
+        <div id="progressBar"></div>
+        <!-- Revealed by script.js after a successful upload. The download route
+             takes no parameters, so none are passed to url_for. -->
+        <div id="downloadBtn" class="hidden">
+            <a id="downloadLink" href="{{ url_for('download_file') }}">
+                <button>Download Processed XLSX</button>
+            </a>
+        </div>
+        <div id="explanation-note">
+            Note: The last column titled 'Explanation' in the output file contains the analysis for potential duplicates with the following row.
+        </div>
+    </div>
+    <script src="{{ url_for('static', filename='script.js') }}"></script>
+</body>
+</html>

uploads/readme.txt.txt ADDED Viewed

	@@ -0,0 +1 @@


1	+ Deduplication