File size: 5,478 Bytes
d9198d9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
# pip install virtualenv
# python -m virtualenv deepseek_env
# deepseek_env\Scripts\activate
import os
import pandas as pd
import ollama
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
# Model configuration
desired_model = 'deepseek-r1:14b'
# Target columns for intelligent merging
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State","Pincode"]
root_directories = ["A","B","C","D","E","F","G","H","I"]
# Initialize Sentence-BERT model for semantic similarity
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def read_csv_files(root_directory):
csv_files = []
for subdir, _, files in os.walk(root_directory):
for file in files:
if file.endswith(".csv"):
csv_files.append(os.path.join(subdir, file))
return csv_files
def calculate_similarity(source_list, target_list):
source_embeddings = semantic_model.encode(source_list)
target_embeddings = semantic_model.encode(target_list)
similarities = []
for src_emb in source_embeddings:
similarity_scores = np.dot(target_embeddings, src_emb) / (np.linalg.norm(target_embeddings, axis=1) * np.linalg.norm(src_emb))
similarities.append(similarity_scores)
return similarities
def get_header_similarity(headers):
prompt = f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. Return a dictionary where keys are from the target list and values are the closest matches: {headers}"
try:
response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
ollama_response = response.get('message', {}).get('content', 'No response content')
print("Model Response:", ollama_response)
try:
mapped_headers = eval(ollama_response)
except Exception as e:
print("Error parsing model response:", e)
mapped_headers = {col: [] for col in TARGET_COLUMNS}
except Exception as e:
print(f"Error in DeepSeek request: {e}")
mapped_headers = {col: [] for col in TARGET_COLUMNS}
similarities = calculate_similarity(headers, TARGET_COLUMNS)
semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
for idx, similarity_scores in enumerate(similarities):
best_match_idx = np.argmax(similarity_scores)
semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])
for target_col in semantic_mapped_headers:
if not semantic_mapped_headers[target_col]:
semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])
return semantic_mapped_headers
def is_valid_mobile(value):
return str(value).isdigit() and 7 <= len(str(value)) <= 15
def merge_dataframes(file_paths):
dataframes = []
header_sets = set()
for file_path in tqdm(file_paths, desc="Reading CSV files"):
try:
df = pd.read_csv(file_path)
dataframes.append(df)
header_sets.update(df.columns.tolist())
except Exception as e:
print(f"Error reading {file_path}: {e}")
header_mapping = get_header_similarity(list(header_sets))
merged_frames = []
for df in tqdm(dataframes, desc="Merging DataFrames intelligently"):
merged_dict = {target_col: pd.Series(dtype='object') for target_col in TARGET_COLUMNS}
for target_col, mapped_cols in header_mapping.items():
column_data = pd.Series(dtype='object')
for col in mapped_cols:
if col in df.columns:
clean_data = df[col].dropna()
# Apply additional filters for specific columns
if target_col == "Mobile":
clean_data = clean_data[clean_data.apply(is_valid_mobile)] # Ensure valid mobile numbers
elif target_col == "Pincode":
clean_data = clean_data[clean_data.apply(lambda x: str(x).isdigit() and len(str(x)) == 6)]
column_data = column_data.combine_first(clean_data)
merged_dict[target_col] = column_data
merged_frames.append(pd.DataFrame(merged_dict))
final_df = pd.concat(merged_frames, ignore_index=True)
return final_df
def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
num_parts = (len(df) // max_rows_per_file) + 1
for part in range(num_parts):
start_idx = part * max_rows_per_file
end_idx = min((part + 1) * max_rows_per_file, len(df))
part_df = df.iloc[start_idx:end_idx]
if not part_df.empty:
output_file = f"{base_filename}_part_{part + 1}.csv"
part_df.to_csv(output_file, index=False)
print(f"Saved {output_file} with {len(part_df)} rows.")
def main():
for root_directory in root_directories:
directory_name = os.path.basename(root_directory).replace(' ', '_')
print(f"Processing directory: {root_directory}")
csv_files = read_csv_files(root_directory)
if not csv_files:
print(f"No CSV files found in {root_directory}.")
continue
print(f"Found {len(csv_files)} CSV files in {root_directory}.")
merged_df = merge_dataframes(csv_files)
base_filename = f"{directory_name}"
save_dataframe_in_parts(merged_df, base_filename)
if __name__ == "__main__":
main()
|