LargeMultiExcelsCleanDeepSeek / LargeMultiFolderExcelClean.py
neelnsoni13's picture
Create LargeMultiFolderExcelClean.py
d9198d9 verified
# pip install virtualenv
# python -m virtualenv deepseek_env
# deepseek_env\Scripts\activate
import os
import pandas as pd
import ollama
from tqdm import tqdm
from sentence_transformers import SentenceTransformer
import numpy as np
# Model configuration
desired_model = 'deepseek-r1:14b'
# Target columns for intelligent merging
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State","Pincode"]
root_directories = ["A","B","C","D","E","F","G","H","I"]
# Initialize Sentence-BERT model for semantic similarity
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
def read_csv_files(root_directory):
csv_files = []
for subdir, _, files in os.walk(root_directory):
for file in files:
if file.endswith(".csv"):
csv_files.append(os.path.join(subdir, file))
return csv_files
def calculate_similarity(source_list, target_list):
source_embeddings = semantic_model.encode(source_list)
target_embeddings = semantic_model.encode(target_list)
similarities = []
for src_emb in source_embeddings:
similarity_scores = np.dot(target_embeddings, src_emb) / (np.linalg.norm(target_embeddings, axis=1) * np.linalg.norm(src_emb))
similarities.append(similarity_scores)
return similarities
def get_header_similarity(headers):
prompt = f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. Return a dictionary where keys are from the target list and values are the closest matches: {headers}"
try:
response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
ollama_response = response.get('message', {}).get('content', 'No response content')
print("Model Response:", ollama_response)
try:
mapped_headers = eval(ollama_response)
except Exception as e:
print("Error parsing model response:", e)
mapped_headers = {col: [] for col in TARGET_COLUMNS}
except Exception as e:
print(f"Error in DeepSeek request: {e}")
mapped_headers = {col: [] for col in TARGET_COLUMNS}
similarities = calculate_similarity(headers, TARGET_COLUMNS)
semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
for idx, similarity_scores in enumerate(similarities):
best_match_idx = np.argmax(similarity_scores)
semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])
for target_col in semantic_mapped_headers:
if not semantic_mapped_headers[target_col]:
semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])
return semantic_mapped_headers
def is_valid_mobile(value):
return str(value).isdigit() and 7 <= len(str(value)) <= 15
def merge_dataframes(file_paths):
dataframes = []
header_sets = set()
for file_path in tqdm(file_paths, desc="Reading CSV files"):
try:
df = pd.read_csv(file_path)
dataframes.append(df)
header_sets.update(df.columns.tolist())
except Exception as e:
print(f"Error reading {file_path}: {e}")
header_mapping = get_header_similarity(list(header_sets))
merged_frames = []
for df in tqdm(dataframes, desc="Merging DataFrames intelligently"):
merged_dict = {target_col: pd.Series(dtype='object') for target_col in TARGET_COLUMNS}
for target_col, mapped_cols in header_mapping.items():
column_data = pd.Series(dtype='object')
for col in mapped_cols:
if col in df.columns:
clean_data = df[col].dropna()
# Apply additional filters for specific columns
if target_col == "Mobile":
clean_data = clean_data[clean_data.apply(is_valid_mobile)] # Ensure valid mobile numbers
elif target_col == "Pincode":
clean_data = clean_data[clean_data.apply(lambda x: str(x).isdigit() and len(str(x)) == 6)]
column_data = column_data.combine_first(clean_data)
merged_dict[target_col] = column_data
merged_frames.append(pd.DataFrame(merged_dict))
final_df = pd.concat(merged_frames, ignore_index=True)
return final_df
def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
num_parts = (len(df) // max_rows_per_file) + 1
for part in range(num_parts):
start_idx = part * max_rows_per_file
end_idx = min((part + 1) * max_rows_per_file, len(df))
part_df = df.iloc[start_idx:end_idx]
if not part_df.empty:
output_file = f"{base_filename}_part_{part + 1}.csv"
part_df.to_csv(output_file, index=False)
print(f"Saved {output_file} with {len(part_df)} rows.")
def main():
for root_directory in root_directories:
directory_name = os.path.basename(root_directory).replace(' ', '_')
print(f"Processing directory: {root_directory}")
csv_files = read_csv_files(root_directory)
if not csv_files:
print(f"No CSV files found in {root_directory}.")
continue
print(f"Found {len(csv_files)} CSV files in {root_directory}.")
merged_df = merge_dataframes(csv_files)
base_filename = f"{directory_name}"
save_dataframe_in_parts(merged_df, base_filename)
if __name__ == "__main__":
main()