haileyhalimj@gmail.com committed on
Commit
8c95080
·
1 Parent(s): e943e9a

add separate function in excel to csv

Browse files
src/preprocess/excel_to_csv_converter.py CHANGED
@@ -2,124 +2,110 @@ import pandas as pd
2
  import os
3
  from pathlib import Path
4
 
5
def analyze_excel_structure(excel_path):
    """
    Analyze the structure of an Excel file and return sheet information.

    Prints a short report (row count, column count, column names) for every
    sheet while collecting the same data into the returned dictionary.

    Args:
        excel_path (str): Path to the Excel file

    Returns:
        dict: Maps each sheet name to a dict with the keys 'rows',
            'columns' and 'column_names'. Returns None when the workbook
            cannot be read; the error is printed instead of raised.
    """
    try:
        # Open the workbook once and parse every sheet from that handle
        # instead of re-opening the file per sheet. The context manager
        # closes the handle even if a sheet fails to parse.
        with pd.ExcelFile(excel_path) as excel_file:
            sheet_names = excel_file.sheet_names
            sheet_info = {}

            print(f"📊 Analyzing Excel file: {excel_path}")
            print(f"📋 Found {len(sheet_names)} sheets:")
            print("-" * 50)

            for i, sheet_name in enumerate(sheet_names, 1):
                df = excel_file.parse(sheet_name)
                column_names = list(df.columns)

                sheet_info[sheet_name] = {
                    'rows': len(df),
                    'columns': len(column_names),
                    'column_names': column_names,
                }

                print(f"{i}. Sheet: '{sheet_name}'")
                print(f" - Rows: {len(df)}")
                print(f" - Columns: {len(column_names)}")
                print(f" - Column names: {column_names}")

        return sheet_info

    except Exception as e:
        # Script-level boundary: report the failure and signal it with None,
        # which is what main() checks for.
        print(f"❌ Error analyzing Excel file: {e}")
        return None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
44
 
45
def convert_excel_to_csv(excel_path, output_dir=None):
    """
    Convert each sheet of an Excel file to a separate CSV file.

    Args:
        excel_path (str): Path to the Excel file
        output_dir (str): Output directory for CSV files. If None, uses same directory as Excel file

    Returns:
        list: Paths of the CSV files written, in sheet order. Returns None
            when the conversion fails; the error is printed instead of raised.
    """
    try:
        # Set up output directory
        if output_dir is None:
            output_dir = os.path.dirname(excel_path)

        # Create output directory if it doesn't exist
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        converted_files = []
        used_filenames = set()

        # One handle for the whole workbook: avoids re-opening the file for
        # every sheet and guarantees the handle is closed afterwards.
        with pd.ExcelFile(excel_path) as excel_file:
            print("🔄 Converting Excel sheets to CSV...")
            print(f"📁 Output directory: {output_dir}")
            print("-" * 50)

            for i, sheet_name in enumerate(excel_file.sheet_names, 1):
                df = excel_file.parse(sheet_name)

                # Create a safe filename for the CSV: keep alphanumerics,
                # spaces, '-' and '_', then turn spaces into underscores.
                safe_filename = "".join(
                    c for c in sheet_name if c.isalnum() or c in (' ', '-', '_')
                ).rstrip()
                safe_filename = safe_filename.replace(' ', '_')

                # A name that sanitizes to nothing, or to a name already used
                # by an earlier sheet, would otherwise produce ".csv" or
                # silently overwrite that sheet's output.
                if not safe_filename or safe_filename in used_filenames:
                    safe_filename = f"{safe_filename or 'sheet'}_{i}"
                used_filenames.add(safe_filename)

                csv_filename = f"{safe_filename}.csv"
                csv_path = os.path.join(output_dir, csv_filename)

                # Save as CSV
                df.to_csv(csv_path, index=False, encoding='utf-8')
                converted_files.append(csv_path)

                print(f"✅ {i}. '{sheet_name}' → {csv_filename}")
                print(f" - Saved {len(df)} rows, {len(df.columns)} columns")

        print(f"\n🎉 Successfully converted {len(converted_files)} sheets to CSV files!")
        return converted_files

    except Exception as e:
        # Script-level boundary: report the failure and signal it with None.
        print(f"❌ Error converting Excel to CSV: {e}")
        return None
93
-
94
def main(excel_path="data/real_data_excel/AI Project document.xlsx",
         output_dir="data/real_data_excel/converted_csv"):
    """
    Analyze an Excel workbook and convert each of its sheets to CSV.

    Args:
        excel_path (str): Path to the Excel file. Defaults to the project
            workbook this script was written for.
        output_dir (str): Directory that receives the CSV files.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    # Check if Excel file exists
    if not os.path.exists(excel_path):
        print(f"❌ Excel file not found: {excel_path}")
        return

    print("=" * 60)
    print("📊 EXCEL TO CSV CONVERTER")
    print("=" * 60)

    # Step 1: Analyze Excel structure (returns None if the workbook is unreadable)
    sheet_info = analyze_excel_structure(excel_path)
    if sheet_info is None:
        return

    # Step 2: Convert to CSV
    converted_files = convert_excel_to_csv(excel_path, output_dir)

    if converted_files:
        print("\n📂 Converted files:")
        for file_path in converted_files:
            print(f" - {file_path}")


if __name__ == "__main__":
    main()
 
2
  import os
3
  from pathlib import Path
4
 
5
class ExcelToCsvConverter:
    """
    Convert an Excel file to CSV files.

    The constructor only records the two paths; it does not touch the
    filesystem.

    Attributes:
        excel_path: Path to the Excel file, stored as given.
        output_dir: Output directory for CSV files, or None if not given.
    """

    def __init__(self, excel_path, output_dir=None):
        """
        Store the workbook path and the optional output directory.

        Args:
            excel_path (str): Path to the Excel file
            output_dir (str): Output directory for CSV files (default None)
        """
        self.excel_path = excel_path
        self.output_dir = output_dir
13
+
14
+
15
def convert_excel_to_csv(excel_path, output_dir=None):
    """
    Convert each sheet of an Excel file to a separate CSV file.

    Args:
        excel_path (str): Path to the Excel file
        output_dir (str): Output directory for CSV files. If None, uses same directory as Excel file

    Returns:
        list: Paths of the CSV files written, in sheet order. Returns None
            when the conversion fails; the error is printed instead of raised.
    """
    try:
        # Set up output directory
        if output_dir is None:
            output_dir = os.path.dirname(excel_path)

        # Create output directory if it doesn't exist
        Path(output_dir).mkdir(parents=True, exist_ok=True)

        converted_files = []
        used_filenames = set()

        # One handle for the whole workbook: avoids re-opening the file for
        # every sheet and guarantees the handle is closed afterwards.
        with pd.ExcelFile(excel_path) as excel_file:
            for i, sheet_name in enumerate(excel_file.sheet_names, 1):
                df = excel_file.parse(sheet_name)

                # Create a safe filename for the CSV: keep alphanumerics,
                # spaces, '-' and '_', then turn spaces into underscores.
                # NOTE: this function has no `self`; the earlier
                # `self.sheet_name = ...` assignments raised NameError, which
                # the handler below swallowed, so nothing was ever converted.
                safe_filename = "".join(
                    c for c in sheet_name if c.isalnum() or c in (' ', '-', '_')
                ).rstrip()
                safe_filename = safe_filename.replace(' ', '_')

                # A name that sanitizes to nothing, or to a name already used
                # by an earlier sheet, would otherwise produce ".csv" or
                # silently overwrite that sheet's output.
                if not safe_filename or safe_filename in used_filenames:
                    safe_filename = f"{safe_filename or 'sheet'}_{i}"
                used_filenames.add(safe_filename)

                csv_filename = f"{safe_filename}.csv"
                csv_path = os.path.join(output_dir, csv_filename)

                # Save as CSV
                df.to_csv(csv_path, index=False, encoding='utf-8')
                converted_files.append(csv_path)

                print(f"✅ {i}. '{sheet_name}' → {csv_filename}")
                print(f" - Saved {len(df)} rows, {len(df.columns)} columns")

        print(f"\n🎉 Successfully converted {len(converted_files)} sheets to CSV files!")
        return converted_files

    except Exception as e:
        # Script-level boundary: report the failure and signal it with None.
        print(f"❌ Error converting Excel to CSV: {e}")
        return None
61
 
62
+
63
def convert_specific_sheet_to_csv(excel_path, sheet_name, output_dir=None):
    """
    Convert a specific sheet of an Excel file to a CSV file.

    Args:
        excel_path (str): Path to the Excel file
        sheet_name (str | int): Sheet to convert, passed to pandas.read_excel
        output_dir (str): Output directory for the CSV file. If None, uses
            same directory as Excel file

    Returns:
        str: Path of the CSV file that was written.

    Errors from reading the workbook or writing the CSV are not caught here;
    they propagate to the caller.
    """
    if output_dir is None:
        output_dir = os.path.dirname(excel_path)

    df = pd.read_excel(excel_path, sheet_name=sheet_name)

    # Create the output directory only after the sheet was read, so a failed
    # read leaves no empty directory behind.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    # str() keeps integer sheet indices usable as a filename source.
    safe_filename = "".join(
        c for c in str(sheet_name) if c.isalnum() or c in (' ', '-', '_')
    ).rstrip()
    # Fall back to a fixed stem when nothing survives sanitizing.
    safe_filename = safe_filename.replace(' ', '_') or "sheet"

    csv_filename = f"{safe_filename}.csv"
    csv_path = os.path.join(output_dir, csv_filename)
    df.to_csv(csv_path, index=False, encoding='utf-8')
    print(f"✅ {sheet_name} → {csv_filename}")

    return csv_path
79
+
80
def main(excel_path="data/real_data_excel/AI Project document.xlsx",
         output_dir="data/real_data_excel/converted_csv"):
    """
    Convert each sheet of an Excel workbook to CSV and list the results.

    The former analysis step called analyze_excel_structure, which this
    revision of the module no longer defines, so main() raised NameError
    before converting anything. That call is dropped here.

    Args:
        excel_path (str): Path to the Excel file. Defaults to the project
            workbook this script was written for.
        output_dir (str): Directory that receives the CSV files.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    # Check if Excel file exists
    if not os.path.exists(excel_path):
        print(f"❌ Excel file not found: {excel_path}")
        return

    print("=" * 60)
    print("📊 EXCEL TO CSV CONVERTER")
    print("=" * 60)

    # Convert to CSV (returns None on failure, after printing the error)
    converted_files = convert_excel_to_csv(excel_path, output_dir)

    if converted_files:
        print("\n📂 Converted files:")
        for file_path in converted_files:
            print(f" - {file_path}")


if __name__ == "__main__":
    main()