neelnsoni13 commited on
Commit
d9198d9
·
verified ·
1 Parent(s): 884027d

Create LargeMultiFolderExcelClean.py

Browse files
Files changed (1) hide show
  1. LargeMultiFolderExcelClean.py +152 -0
LargeMultiFolderExcelClean.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # pip install virtualenv
2
+ # python -m virtualenv deepseek_env
3
+ # deepseek_env\Scripts\activate
4
+
5
+ import os
6
+ import pandas as pd
7
+ import ollama
8
+ from tqdm import tqdm
9
+ from sentence_transformers import SentenceTransformer
10
+ import numpy as np
11
+
12
+
13
# Model configuration
# Ollama model used to map messy CSV headers onto the target schema.
desired_model = 'deepseek-r1:14b'

# Target columns for intelligent merging
# Every input CSV's columns are mapped onto this fixed output schema.
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State","Pincode"]
# Top-level folders scanned for CSVs; each directory is merged and saved separately.
root_directories = ["A","B","C","D","E","F","G","H","I"]


# Initialize Sentence-BERT model for semantic similarity
# Loaded once at import time (downloads weights on first run); used by calculate_similarity().
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
23
+
24
+
25
def read_csv_files(root_directory):
    """Recursively collect the paths of all .csv files under root_directory."""
    return [
        os.path.join(dirpath, filename)
        for dirpath, _, filenames in os.walk(root_directory)
        for filename in filenames
        if filename.endswith(".csv")
    ]
32
+
33
+
34
def calculate_similarity(source_list, target_list):
    """Compute cosine similarity of every source string against every target string.

    Both lists are embedded with the module-level Sentence-BERT model.
    Returns a list with one entry per source string; each entry is a 1-D
    array of cosine scores, one per target string.
    """
    source_embeddings = semantic_model.encode(source_list)
    target_embeddings = semantic_model.encode(target_list)
    # Hoist the target norms out of the loop: they are loop-invariant, and the
    # original recomputed np.linalg.norm(target_embeddings, axis=1) per source.
    target_norms = np.linalg.norm(target_embeddings, axis=1)
    similarities = []
    for src_emb in source_embeddings:
        # cosine(src, target_i) = dot / (|target_i| * |src|)
        scores = np.dot(target_embeddings, src_emb) / (target_norms * np.linalg.norm(src_emb))
        similarities.append(scores)
    return similarities
42
+
43
+
44
def get_header_similarity(headers):
    """Map raw CSV header names onto TARGET_COLUMNS.

    Two strategies are combined:
      1. Ask the Ollama model for an explicit target->headers mapping.
      2. Compute Sentence-BERT cosine similarity and assign each header to
         its best-matching target column.
    Semantic matches take precedence; the LLM mapping only fills target
    columns that received no semantic match. Returns a dict keyed by
    TARGET_COLUMNS with lists of matched header names as values.
    """
    import ast  # local import: used only to parse the model response safely

    prompt = f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. Return a dictionary where keys are from the target list and values are the closest matches: {headers}"

    # Default to an empty mapping; overwritten only on a successful, parseable reply.
    mapped_headers = {col: [] for col in TARGET_COLUMNS}
    try:
        response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
        ollama_response = response.get('message', {}).get('content', 'No response content')
        print("Model Response:", ollama_response)

        try:
            # SECURITY FIX: the reply is untrusted LLM output. eval() would
            # execute arbitrary code; ast.literal_eval only accepts Python
            # literals (dicts, lists, strings, numbers, ...).
            parsed = ast.literal_eval(ollama_response)
            if isinstance(parsed, dict):
                mapped_headers = parsed
        except Exception as e:
            print("Error parsing model response:", e)
    except Exception as e:
        # Network/daemon failure: fall back to the semantic mapping alone.
        print(f"Error in DeepSeek request: {e}")

    similarities = calculate_similarity(headers, TARGET_COLUMNS)

    # Assign each header to the target column with the highest cosine score.
    semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
    for idx, similarity_scores in enumerate(similarities):
        best_match_idx = np.argmax(similarity_scores)
        semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])

    # Fall back to the LLM's suggestion for any target column left empty.
    for target_col in semantic_mapped_headers:
        if not semantic_mapped_headers[target_col]:
            semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])

    return semantic_mapped_headers
73
+
74
+
75
def is_valid_mobile(value):
    """Return True if value looks like a phone number: 7 to 15 digits.

    Accepts strings, ints, and integral floats. pandas frequently loads a
    numeric phone column as float64, so a value like 9876543210.0 would
    stringify to "9876543210.0" and fail isdigit(); strip that trailing
    ".0" before validating.
    """
    s = str(value)
    if s.endswith(".0"):
        s = s[:-2]
    # Single str() conversion (the original converted twice).
    return s.isdigit() and 7 <= len(s) <= 15
77
+
78
+
79
def merge_dataframes(file_paths):
    """Read every CSV in file_paths and merge them into one DataFrame
    with exactly the TARGET_COLUMNS schema.

    All column names seen across the files are pooled and mapped onto
    TARGET_COLUMNS via get_header_similarity(); each file's data is then
    gathered column by column, filtered, and concatenated row-wise.
    """
    dataframes = []
    header_sets = set()  # union of every column name seen across all files

    for file_path in tqdm(file_paths, desc="Reading CSV files"):
        try:
            df = pd.read_csv(file_path)
            dataframes.append(df)
            header_sets.update(df.columns.tolist())
        except Exception as e:
            # Best-effort: a malformed/unreadable file is reported and skipped.
            print(f"Error reading {file_path}: {e}")

    # target column -> list of raw header names mapped to it
    header_mapping = get_header_similarity(list(header_sets))

    merged_frames = []

    for df in tqdm(dataframes, desc="Merging DataFrames intelligently"):
        merged_dict = {target_col: pd.Series(dtype='object') for target_col in TARGET_COLUMNS}

        for target_col, mapped_cols in header_mapping.items():
            column_data = pd.Series(dtype='object')

            for col in mapped_cols:
                if col in df.columns:
                    clean_data = df[col].dropna()

                    # Apply additional filters for specific columns
                    if target_col == "Mobile":
                        clean_data = clean_data[clean_data.apply(is_valid_mobile)]  # Ensure valid mobile numbers
                    elif target_col == "Pincode":
                        # Exactly 6 digits (Indian PIN code format).
                        clean_data = clean_data[clean_data.apply(lambda x: str(x).isdigit() and len(str(x)) == 6)]

                    # NOTE(review): combine_first aligns on the row index, so when
                    # several raw columns map to the same target, a later column
                    # only fills rows missing from earlier ones — presumably the
                    # intent is "first non-null match per row"; confirm.
                    column_data = column_data.combine_first(clean_data)

            merged_dict[target_col] = column_data

        merged_frames.append(pd.DataFrame(merged_dict))

    # Stack all per-file frames; ignore_index renumbers rows 0..N-1.
    final_df = pd.concat(merged_frames, ignore_index=True)
    return final_df
119
+
120
+
121
def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
    """Write df to one or more CSVs of at most max_rows_per_file rows each.

    Files are named "<base_filename>_part_<k>.csv" starting at k=1.
    An empty DataFrame produces no files.
    """
    part_number = 1
    # Stride directly over row offsets; iloc slicing past the end is safe,
    # so no explicit min()/empty-part guard is needed.
    for start in range(0, len(df), max_rows_per_file):
        chunk = df.iloc[start:start + max_rows_per_file]
        output_file = f"{base_filename}_part_{part_number}.csv"
        chunk.to_csv(output_file, index=False)
        print(f"Saved {output_file} with {len(chunk)} rows.")
        part_number += 1
132
+
133
+
134
def main():
    """Process each configured root directory: find, merge, and save its CSVs."""
    for root_directory in root_directories:
        print(f"Processing directory: {root_directory}")

        csv_files = read_csv_files(root_directory)
        if not csv_files:
            print(f"No CSV files found in {root_directory}.")
            continue

        print(f"Found {len(csv_files)} CSV files in {root_directory}.")

        merged_df = merge_dataframes(csv_files)

        # Output files are named after the directory (spaces -> underscores).
        output_base = os.path.basename(root_directory).replace(' ', '_')
        save_dataframe_in_parts(merged_df, output_base)


if __name__ == "__main__":
    main()