Create LargeMultiFolderExcelClean.py

LargeMultiFolderExcelClean.py (+152 -0, ADDED)

@@ -0,0 +1,152 @@
# pip install virtualenv
# python -m virtualenv deepseek_env
# deepseek_env\Scripts\activate
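
# Overview: recursively scan each root directory for CSV files, map their
# (often inconsistent) headers onto TARGET_COLUMNS using Sentence-BERT
# semantic similarity with an Ollama-hosted LLM as fallback for unmatched
# targets, clean the Mobile/Pincode values, and write the merged result
# out in capped-size CSV parts.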
import ast  # needed to safely parse the dict literal in the model's reply
import os

import numpy as np
import pandas as pd
import ollama
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Model configuration
desired_model = 'deepseek-r1:14b'

# Target columns for intelligent merging
TARGET_COLUMNS = ["Mobile", "Email", "Name", "City", "State", "Pincode"]

# Root folders to scan recursively for CSV files
root_directories = ["A", "B", "C", "D", "E", "F", "G", "H", "I"]

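# Assumes a local Ollama server is running and the model has already been
# pulled, e.g. with: ollama pull deepseek-r1:14b
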
# Initialize Sentence-BERT model for semantic similarity
semantic_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def read_csv_files(root_directory):
    """Recursively collect the paths of all .csv files under root_directory."""
    csv_files = []
    for subdir, _, files in os.walk(root_directory):
        for file in files:
            if file.endswith(".csv"):
                csv_files.append(os.path.join(subdir, file))
    return csv_files


def calculate_similarity(source_list, target_list):
    """Return, for each source string, its cosine similarities to every target string."""
    source_embeddings = semantic_model.encode(source_list)
    target_embeddings = semantic_model.encode(target_list)
    similarities = []
    for src_emb in source_embeddings:
        # Cosine similarity: dot product normalized by both vector norms
        similarity_scores = np.dot(target_embeddings, src_emb) / (
            np.linalg.norm(target_embeddings, axis=1) * np.linalg.norm(src_emb)
        )
        similarities.append(similarity_scores)
    return similarities


def get_header_similarity(headers):
    """Map raw CSV headers onto TARGET_COLUMNS, semantically first, then via the LLM."""
    prompt = (
        f"Match the following headers to the closest ones from this list: {TARGET_COLUMNS}. "
        f"Return a dictionary where keys are from the target list and values are "
        f"the closest matches: {headers}"
    )

    try:
        response = ollama.chat(model=desired_model, messages=[{'role': 'user', 'content': prompt}])
        ollama_response = response.get('message', {}).get('content', 'No response content')
        print("Model Response:", ollama_response)

        try:
            # Best-effort extraction of the dict literal from the reply;
            # ast.literal_eval is safe, unlike eval(), which would execute
            # arbitrary model output.
            dict_text = ollama_response[ollama_response.find('{'):ollama_response.rfind('}') + 1]
            mapped_headers = ast.literal_eval(dict_text)
        except Exception as e:
            print("Error parsing model response:", e)
            mapped_headers = {col: [] for col in TARGET_COLUMNS}
    except Exception as e:
        print(f"Error in DeepSeek request: {e}")
        mapped_headers = {col: [] for col in TARGET_COLUMNS}

    # Independent mapping via Sentence-BERT: assign each header to its most
    # similar target column
    similarities = calculate_similarity(headers, TARGET_COLUMNS)

    semantic_mapped_headers = {target_col: [] for target_col in TARGET_COLUMNS}
    for idx, similarity_scores in enumerate(similarities):
        best_match_idx = np.argmax(similarity_scores)
        semantic_mapped_headers[TARGET_COLUMNS[best_match_idx]].append(headers[idx])

    # Prefer the semantic mapping; fall back to the LLM mapping for any target
    # column the semantic pass left unmatched
    for target_col in semantic_mapped_headers:
        if not semantic_mapped_headers[target_col]:
            semantic_mapped_headers[target_col] = mapped_headers.get(target_col, [])

    return semantic_mapped_headers


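# For illustration only (hypothetical headers; the actual output depends on the
# data), get_header_similarity might return something like:
#   {"Mobile": ["phone_number"], "Email": ["email_id"], "Name": ["full_name"],
#    "City": ["city"], "State": ["state"], "Pincode": ["pin"]}
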
def is_valid_mobile(value):
    """A value counts as a valid mobile number if it is all digits and 7-15 characters long."""
    return str(value).isdigit() and 7 <= len(str(value)) <= 15


def merge_dataframes(file_paths):
    """Read every CSV, map its headers onto TARGET_COLUMNS, and concatenate the results."""
    dataframes = []
    all_headers = set()

    for file_path in tqdm(file_paths, desc="Reading CSV files"):
        try:
            df = pd.read_csv(file_path)
            dataframes.append(df)
            all_headers.update(df.columns.tolist())
        except Exception as e:
            print(f"Error reading {file_path}: {e}")

    header_mapping = get_header_similarity(list(all_headers))

    merged_frames = []

    for df in tqdm(dataframes, desc="Merging DataFrames intelligently"):
        merged_dict = {target_col: pd.Series(dtype='object') for target_col in TARGET_COLUMNS}

        for target_col, mapped_cols in header_mapping.items():
            column_data = pd.Series(dtype='object')

            for col in mapped_cols:
                if col in df.columns:
                    clean_data = df[col].dropna()

                    # Apply additional filters for specific columns
                    if target_col == "Mobile":
                        clean_data = clean_data[clean_data.apply(is_valid_mobile)]  # Keep only valid mobile numbers
                    elif target_col == "Pincode":
                        # Indian pincodes are exactly six digits
                        clean_data = clean_data[clean_data.apply(lambda x: str(x).isdigit() and len(str(x)) == 6)]

                    # Fill gaps from later mapped columns without overwriting
                    # values already taken from earlier ones
                    column_data = column_data.combine_first(clean_data)

            merged_dict[target_col] = column_data

        merged_frames.append(pd.DataFrame(merged_dict))

    final_df = pd.concat(merged_frames, ignore_index=True)
    return final_df


def save_dataframe_in_parts(df, base_filename, max_rows_per_file=500000):
    """Write df to CSV in parts of at most max_rows_per_file rows each."""
    # Ceiling division, so an exact multiple of max_rows_per_file does not
    # produce a trailing empty part (500k rows also stays under Excel's
    # 1,048,576-row sheet limit)
    num_parts = -(-len(df) // max_rows_per_file)
    for part in range(num_parts):
        start_idx = part * max_rows_per_file
        end_idx = min((part + 1) * max_rows_per_file, len(df))
        part_df = df.iloc[start_idx:end_idx]

        if not part_df.empty:
            output_file = f"{base_filename}_part_{part + 1}.csv"
            part_df.to_csv(output_file, index=False)
            print(f"Saved {output_file} with {len(part_df)} rows.")


def main():
    """Process each root directory independently and write its merged, cleaned CSV parts."""
    for root_directory in root_directories:
        directory_name = os.path.basename(root_directory).replace(' ', '_')
        print(f"Processing directory: {root_directory}")
        csv_files = read_csv_files(root_directory)
        if not csv_files:
            print(f"No CSV files found in {root_directory}.")
            continue

        print(f"Found {len(csv_files)} CSV files in {root_directory}.")

        merged_df = merge_dataframes(csv_files)

        base_filename = directory_name
        save_dataframe_in_parts(merged_df, base_filename)


if __name__ == "__main__":
    main()
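
# Example run, assuming folders A through I exist next to this script and the
# Ollama server is reachable on its default port:
#   python LargeMultiFolderExcelClean.py
# Each folder yields files like A_part_1.csv, A_part_2.csv, ... capped at
# 500,000 rows apiece.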