File size: 5,478 Bytes
d9198d9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
# pip install virtualenv
# python -m virtualenv deepseek_env
# deepseek_env\Scripts\activate

import ast
import os

import numpy as np
import pandas as pd

import ollama
from sentence_transformers import SentenceTransformer
from tqdm import tqdm


# Model configuration
# Ollama model used to map raw CSV headers onto TARGET_COLUMNS (see
# get_header_similarity); must already be pulled in the local ollama install.
desired_model = 'deepseek-r1:14b'

# Target columns for intelligent merging
# Canonical output schema: every input CSV's columns are mapped onto these.
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State","Pincode"]
# Directory names (relative to the working directory) scanned for CSV files.
root_directories = ["A","B","C","D","E","F","G","H","I"]


# Initialize Sentence-BERT model for semantic similarity
# Loaded once at import time; downloads weights on first run.
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def read_csv_files(root_directory):
    """Recursively collect the paths of all ``.csv`` files under *root_directory*.

    Returns a list of full paths in ``os.walk`` traversal order.
    """
    found = []
    for current_dir, _dirs, filenames in os.walk(root_directory):
        found.extend(
            os.path.join(current_dir, name)
            for name in filenames
            if name.endswith(".csv")
        )
    return found


def calculate_similarity(source_list, target_list):
    """Cosine similarity of each source string against every target string.

    Returns a list with one entry per source string; each entry is an array
    of scores of length ``len(target_list)``.
    """
    src_vecs = semantic_model.encode(source_list)
    tgt_vecs = semantic_model.encode(target_list)
    # Target-side norms are loop-invariant, so compute them once.
    tgt_norms = np.linalg.norm(tgt_vecs, axis=1)
    return [
        np.dot(tgt_vecs, vec) / (tgt_norms * np.linalg.norm(vec))
        for vec in src_vecs
    ]


def get_header_similarity(headers):
    """Map raw CSV *headers* onto TARGET_COLUMNS.

    Two strategies are combined:
      1. The local ollama model is asked (best-effort) for a mapping dict.
      2. Sentence-BERT cosine similarity assigns each header to its
         closest target column.
    Semantic matches take priority; the LLM mapping only fills target
    columns that received no semantic match.

    Returns a dict ``{target_column: [matched headers]}``.
    """
    prompt = f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. Return a dictionary where keys are from the target list and values are the closest matches: {headers}"

    # Default to an empty mapping; overwritten only if the model returns a
    # parseable dict literal.
    mapped_headers = {col: [] for col in TARGET_COLUMNS}
    try:
        response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
        ollama_response = response.get('message', {}).get('content', 'No response content')
        print("Model Response:", ollama_response)

        try:
            # SECURITY: the model output is untrusted text and must never be
            # executed -- use ast.literal_eval instead of eval().
            parsed = ast.literal_eval(ollama_response)
            if isinstance(parsed, dict):
                mapped_headers = parsed
        except (ValueError, SyntaxError) as e:
            print("Error parsing model response:", e)
    except Exception as e:
        # Network/ollama failure: fall through with the empty default mapping.
        print(f"Error in DeepSeek request: {e}")

    similarities = calculate_similarity(headers, TARGET_COLUMNS)

    # Assign every header to its best-matching target column.
    semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
    for idx, similarity_scores in enumerate(similarities):
        best_match_idx = np.argmax(similarity_scores)
        semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])

    # LLM mapping is a fallback only for target columns with no semantic match.
    for target_col in semantic_mapped_headers:
        if not semantic_mapped_headers[target_col]:
            semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])

    return semantic_mapped_headers


def is_valid_mobile(value):
    """Return True if *value* looks like a plausible phone number (7-15 digits).

    pandas reads numeric columns that contain NaN as float64, so a phone
    number can arrive as e.g. ``9876543210.0`` -- integral floats are
    normalised to their integer form before the digit check. Surrounding
    whitespace on string values is ignored.
    """
    if isinstance(value, float) and value.is_integer():
        text = str(int(value))
    else:
        text = str(value).strip()
    return text.isdigit() and 7 <= len(text) <= 15


def merge_dataframes(file_paths):
    """Read every CSV in *file_paths* and merge them into TARGET_COLUMNS.

    Headers from all files are mapped onto TARGET_COLUMNS via
    get_header_similarity; Mobile and Pincode values are filtered for
    validity. Returns one concatenated DataFrame over TARGET_COLUMNS.
    """
    loaded = []
    all_headers = set()

    for path in tqdm(file_paths, desc="Reading CSV files"):
        try:
            frame = pd.read_csv(path)
        except Exception as exc:
            print(f"Error reading {path}: {exc}")
        else:
            loaded.append(frame)
            all_headers.update(frame.columns.tolist())

    header_mapping = get_header_similarity(list(all_headers))

    merged_frames = []

    for frame in tqdm(loaded, desc="Merging DataFrames intelligently"):
        merged_dict = {col: pd.Series(dtype='object') for col in TARGET_COLUMNS}

        for target_col, mapped_cols in header_mapping.items():
            collected = pd.Series(dtype='object')

            for source_col in mapped_cols:
                if source_col not in frame.columns:
                    continue
                values = frame[source_col].dropna()

                # Column-specific validity filters.
                if target_col == "Mobile":
                    values = values[values.apply(is_valid_mobile)]
                elif target_col == "Pincode":
                    values = values[values.apply(lambda v: str(v).isdigit() and len(str(v)) == 6)]

                # combine_first keeps already-collected values and only fills
                # index positions that are still missing.
                collected = collected.combine_first(values)

            merged_dict[target_col] = collected

        merged_frames.append(pd.DataFrame(merged_dict))

    return pd.concat(merged_frames, ignore_index=True)


def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
    """Write *df* to one or more CSVs of at most *max_rows_per_file* rows each.

    Files are named ``{base_filename}_part_{k}.csv`` starting at k=1 and are
    written to the current working directory. An empty DataFrame produces no
    files.
    """
    # Step through the frame directly; the previous `len // max + 1` part
    # count always scheduled one extra empty chunk when len was an exact
    # multiple of max_rows_per_file (harmless, but wasteful).
    for part, start_idx in enumerate(range(0, len(df), max_rows_per_file)):
        part_df = df.iloc[start_idx:start_idx + max_rows_per_file]
        output_file = f"{base_filename}_part_{part + 1}.csv"
        part_df.to_csv(output_file, index=False)
        print(f"Saved {output_file} with {len(part_df)} rows.")


def main():
    """For each configured root directory: find CSVs, merge them, save parts."""
    for root_directory in root_directories:
        print(f"Processing directory: {root_directory}")

        csv_files = read_csv_files(root_directory)
        if not csv_files:
            print(f"No CSV files found in {root_directory}.")
            continue
        print(f"Found {len(csv_files)} CSV files in {root_directory}.")

        merged_df = merge_dataframes(csv_files)

        # Output file prefix derived from the directory name.
        base_filename = os.path.basename(root_directory).replace(' ', '_')
        save_dataframe_in_parts(merged_df, base_filename)


if __name__ == "__main__":
    main()