File size: 5,478 Bytes

d9198d9

# pip install virtualenv
# python -m virtualenv deepseek_env
# deepseek_env\Scripts\activate

import os
import pandas as pd
import ollama
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np


# Model configuration
desired_model = 'deepseek-r1:14b'

# Target columns for intelligent merging
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State","Pincode"]
root_directories = ["A","B","C","D","E","F","G","H","I"]


# Initialize Sentence-BERT model for semantic similarity
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def read_csv_files(root_directory):
    csv_files = []
    for subdir, _, files in os.walk(root_directory):
        for file in files:
            if file.endswith(".csv"):
                csv_files.append(os.path.join(subdir, file))
    return csv_files


def calculate_similarity(source_list, target_list):
    source_embeddings = semantic_model.encode(source_list)
    target_embeddings = semantic_model.encode(target_list)
    similarities = []
    for src_emb in source_embeddings:
        similarity_scores = np.dot(target_embeddings, src_emb) / (np.linalg.norm(target_embeddings, axis=1) * np.linalg.norm(src_emb))
        similarities.append(similarity_scores)
    return similarities


def get_header_similarity(headers):
    prompt = f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. Return a dictionary where keys are from the target list and values are the closest matches: {headers}"
    
    try:
        response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
        ollama_response = response.get('message', {}).get('content', 'No response content')
        print("Model Response:", ollama_response)
        
        try:
            mapped_headers = eval(ollama_response)
        except Exception as e:
            print("Error parsing model response:", e)
            mapped_headers = {col: [] for col in TARGET_COLUMNS}
    except Exception as e:
        print(f"Error in DeepSeek request: {e}")
        mapped_headers = {col: [] for col in TARGET_COLUMNS}
    
    similarities = calculate_similarity(headers, TARGET_COLUMNS)
    
    semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
    for idx, similarity_scores in enumerate(similarities):
        best_match_idx = np.argmax(similarity_scores)
        semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])
    
    for target_col in semantic_mapped_headers:
        if not semantic_mapped_headers[target_col]:
            semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])
    
    return semantic_mapped_headers


def is_valid_mobile(value):
    return str(value).isdigit() and 7 <= len(str(value)) <= 15


def merge_dataframes(file_paths):
    dataframes = []
    header_sets = set()

    for file_path in tqdm(file_paths, desc="Reading CSV files"):
        try:
            df = pd.read_csv(file_path)
            dataframes.append(df)
            header_sets.update(df.columns.tolist())
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    header_mapping = get_header_similarity(list(header_sets))

    merged_frames = []

    for df in tqdm(dataframes, desc="Merging DataFrames intelligently"):
        merged_dict = {target_col: pd.Series(dtype='object') for target_col in TARGET_COLUMNS}

        for target_col, mapped_cols in header_mapping.items():
            column_data = pd.Series(dtype='object')

            for col in mapped_cols:
                if col in df.columns:
                    clean_data = df[col].dropna()

                    # Apply additional filters for specific columns
                    if target_col == "Mobile":
                        clean_data = clean_data[clean_data.apply(is_valid_mobile)]  # Ensure valid mobile numbers
                    elif target_col == "Pincode":
                        clean_data = clean_data[clean_data.apply(lambda x: str(x).isdigit() and len(str(x)) == 6)]

                    column_data = column_data.combine_first(clean_data)

            merged_dict[target_col] = column_data

        merged_frames.append(pd.DataFrame(merged_dict))

    final_df = pd.concat(merged_frames, ignore_index=True)
    return final_df


def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
    num_parts = (len(df) // max_rows_per_file) + 1
    for part in range(num_parts):
        start_idx = part * max_rows_per_file
        end_idx = min((part + 1) * max_rows_per_file, len(df))
        part_df = df.iloc[start_idx:end_idx]

        if not part_df.empty:
            output_file = f"{base_filename}_part_{part + 1}.csv"
            part_df.to_csv(output_file, index=False)
            print(f"Saved {output_file} with {len(part_df)} rows.")


def main():
    for root_directory in root_directories:
        directory_name = os.path.basename(root_directory).replace(' ', '_')
        print(f"Processing directory: {root_directory}")
        csv_files = read_csv_files(root_directory)
        if not csv_files:
            print(f"No CSV files found in {root_directory}.")
            continue

        print(f"Found {len(csv_files)} CSV files in {root_directory}.")

        merged_df = merge_dataframes(csv_files)

        base_filename = f"{directory_name}"
        save_dataframe_in_parts(merged_df, base_filename)


if __name__ == "__main__":
    main()