SherinJosephRoy committed on
Commit
bd7821d
·
verified ·
1 Parent(s): 28eb761

Upload utils/exporters.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. utils/exporters.py +272 -0
utils/exporters.py ADDED
@@ -0,0 +1,272 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Data Export Utilities
3
+
4
+ Provides functions to export generated data in various formats.
5
+ """
6
+
7
+ import json
8
+ import csv
9
+ import io
10
+ import zipfile
11
+ from typing import Any, Dict, List, Optional, Union
12
+ import pandas as pd
13
+ import pyarrow as pa
14
+ import pyarrow.parquet as pq
15
+ from openpyxl import Workbook
16
+ from openpyxl.styles import Font, PatternFill
17
+
18
+
19
class DataExporter:
    """Handles data export in various formats.

    All methods are stateless static helpers that take a list of record
    dictionaries (``List[Dict[str, Any]]``) and return the serialized
    payload as ``str`` or ``bytes``.
    """

    @staticmethod
    def export_to_csv(data: List[Dict[str, Any]], filename: str = "synthetic_data.csv") -> str:
        """Export data to CSV format.

        Column order follows the keys of the first record. Keys missing from
        later records become empty cells; extra keys are ignored (robustness
        fix: the default ``DictWriter`` behavior raises ``ValueError`` on
        keys absent from the header).

        Args:
            data: Records to serialize.
            filename: Unused here; kept for interface compatibility.

        Returns:
            CSV text, or "" when *data* is empty.
        """
        if not data:
            return ""

        buffer = io.StringIO()
        writer = csv.DictWriter(
            buffer,
            fieldnames=list(data[0].keys()),
            restval="",             # missing key -> empty cell
            extrasaction="ignore",  # don't raise on keys absent from the header
        )
        writer.writeheader()
        writer.writerows(data)
        return buffer.getvalue()

    @staticmethod
    def export_to_json(data: List[Dict[str, Any]], format_type: str = "array") -> str:
        """Export data to JSON.

        Args:
            data: Records to serialize.
            format_type: "array" (default) for a pretty-printed JSON array,
                "lines" for newline-delimited JSON (one record per line).
                Any other value falls back to "array" behavior.

        Returns:
            JSON text; "[]" when *data* is empty.
        """
        if not data:
            return "[]"

        # default=str stringifies values json can't natively encode
        # (dates, Decimals, ...) rather than raising TypeError.
        if format_type == "lines":
            return "\n".join(json.dumps(record, default=str) for record in data)
        return json.dumps(data, indent=2, default=str)

    @staticmethod
    def export_to_parquet(data: List[Dict[str, Any]]) -> bytes:
        """Export data to Parquet bytes (via pandas -> pyarrow).

        Returns:
            Parquet file content, or b"" when *data* is empty.
        """
        if not data:
            return b""

        table = pa.Table.from_pandas(pd.DataFrame(data))
        buffer = io.BytesIO()
        pq.write_table(table, buffer)
        return buffer.getvalue()

    @staticmethod
    def export_to_excel(data: List[Dict[str, Any]], filename: str = "synthetic_data.xlsx") -> bytes:
        """Export data to an XLSX workbook.

        Fixes over the original implementation:
        - data cells are written by header name, so a record whose keys are
          ordered differently from the first record still lands in the
          correct column (the original wrote ``record.items()`` positionally);
        - the bare ``except:`` used while sizing columns is replaced by an
          explicit ``None`` check.

        Args:
            data: Records to serialize.
            filename: Unused here; kept for interface compatibility.

        Returns:
            Workbook bytes, or b"" when *data* is empty.
        """
        if not data:
            return b""

        workbook = Workbook()
        sheet = workbook.active
        sheet.title = "Synthetic Data"

        # Header row: bold text on a grey fill.
        headers = list(data[0].keys())
        for col_idx, header in enumerate(headers, 1):
            cell = sheet.cell(row=1, column=col_idx, value=header)
            cell.font = Font(bold=True)
            cell.fill = PatternFill(start_color="CCCCCC", end_color="CCCCCC", fill_type="solid")

        # Data rows, looked up by header so column order is stable.
        for row_idx, record in enumerate(data, 2):
            for col_idx, header in enumerate(headers, 1):
                sheet.cell(row=row_idx, column=col_idx, value=record.get(header))

        # Auto-size each column to its longest value, capped at 50 chars.
        for column in sheet.columns:
            longest = max(
                (len(str(cell.value)) for cell in column if cell.value is not None),
                default=0,
            )
            sheet.column_dimensions[column[0].column_letter].width = min(longest + 2, 50)

        buffer = io.BytesIO()
        workbook.save(buffer)
        return buffer.getvalue()

    @staticmethod
    def export_to_sql(data: List[Dict[str, Any]], table_name: str = "synthetic_data") -> str:
        """Export data as SQL INSERT statements (MySQL-style backtick quoting).

        Value rendering: None -> NULL; bool -> TRUE/FALSE (fix: ``bool`` is an
        ``int`` subclass, so the original emitted the Python literals
        ``True``/``False``); int/float -> bare literal; str and everything
        else -> single-quoted with embedded quotes doubled (fix: the original
        fallback branch did not escape quotes at all).

        Returns:
            One INSERT statement per record, newline-joined; "" for empty input.
        """
        if not data:
            return ""

        columns = list(data[0].keys())
        column_list = ", ".join(f"`{col}`" for col in columns)

        statements = []
        for record in data:
            rendered = []
            for col in columns:
                value = record.get(col)
                if value is None:
                    rendered.append("NULL")
                elif isinstance(value, bool):  # must precede the int check
                    rendered.append("TRUE" if value else "FALSE")
                elif isinstance(value, (int, float)):
                    rendered.append(str(value))
                elif isinstance(value, str):
                    rendered.append("'" + value.replace("'", "''") + "'")
                else:
                    rendered.append("'" + str(value).replace("'", "''") + "'")
            statements.append(
                f"INSERT INTO `{table_name}` ({column_list}) VALUES ({', '.join(rendered)});"
            )

        return "\n".join(statements)

    @staticmethod
    def export_to_pandas_code(data: List[Dict[str, Any]], variable_name: str = "df") -> str:
        """Export data as executable pandas DataFrame-construction code.

        Uses ``repr()`` for keys and values (fix: the original hand-rolled
        quote escaping produced syntactically invalid code for strings
        containing backslashes or newlines, and emitted unquoted ``str()``
        output for non-primitive values).

        Args:
            data: Records to embed in the generated code.
            variable_name: Name of the DataFrame variable in the output.

        Returns:
            Python source text assigning a DataFrame to *variable_name*.
        """
        if not data:
            return f"{variable_name} = pd.DataFrame()"

        lines = [f"{variable_name} = pd.DataFrame(["]
        for record in data:
            body = ", ".join(f"{key!r}: {value!r}" for key, value in record.items())
            lines.append("    {" + body + "},")
        lines.append("])")
        return "\n".join(lines)

    @staticmethod
    def create_zip_archive(files: Dict[str, Union[str, bytes]],
                           archive_name: str = "synthetic_data.zip") -> bytes:
        """Create a ZIP archive from a mapping of member name -> content.

        ``ZipFile.writestr`` accepts both str and bytes, so the original
        ``isinstance`` branch (whose two arms were identical) is collapsed.

        Args:
            files: Mapping of archive member name to its content.
            archive_name: Unused here; kept for interface compatibility.

        Returns:
            ZIP file content as bytes.
        """
        buffer = io.BytesIO()
        with zipfile.ZipFile(buffer, "w", zipfile.ZIP_DEFLATED) as archive:
            for member_name, content in files.items():
                archive.writestr(member_name, content)
        return buffer.getvalue()

    @staticmethod
    def export_with_compression(data: List[Dict[str, Any]],
                                format_type: str = "csv",
                                compress: bool = False) -> Union[str, bytes]:
        """Export *data* in *format_type*, optionally wrapped in a ZIP archive.

        Text formats (csv/json/sql/pandas) honor *compress*; parquet and
        excel are already compressed binary containers and ignore the flag
        (original behavior preserved).

        Raises:
            ValueError: If *format_type* is not supported.
        """
        # format -> (exporter, archive member name when zipped)
        text_exporters = {
            "csv": (DataExporter.export_to_csv, "synthetic_data.csv"),
            "json": (DataExporter.export_to_json, "synthetic_data.json"),
            "sql": (DataExporter.export_to_sql, "synthetic_data.sql"),
            "pandas": (DataExporter.export_to_pandas_code, "synthetic_data.py"),
        }

        if format_type in text_exporters:
            exporter, member_name = text_exporters[format_type]
            content = exporter(data)
            if compress:
                return DataExporter.create_zip_archive({member_name: content})
            return content
        if format_type == "parquet":
            return DataExporter.export_to_parquet(data)
        if format_type == "excel":
            return DataExporter.export_to_excel(data)
        raise ValueError(f"Unsupported format: {format_type}")

    @staticmethod
    def get_export_info(data: List[Dict[str, Any]], format_type: str) -> Dict[str, Any]:
        """Return metadata about an export: format, size, record/field counts.

        Fix: the empty-data branch now also carries ``size_mb``, so callers
        see a consistent key set regardless of input.
        """
        if not data:
            return {
                'format': format_type,
                'size_bytes': 0,
                'size_mb': 0.0,
                'record_count': 0,
                'field_count': 0
            }

        text_exporters = {
            "csv": DataExporter.export_to_csv,
            "json": DataExporter.export_to_json,
            "sql": DataExporter.export_to_sql,
            "pandas": DataExporter.export_to_pandas_code,
        }
        binary_exporters = {
            "parquet": DataExporter.export_to_parquet,
            "excel": DataExporter.export_to_excel,
        }

        # Serialize once to measure the payload size.
        if format_type in text_exporters:
            size_bytes = len(text_exporters[format_type](data).encode('utf-8'))
        elif format_type in binary_exporters:
            size_bytes = len(binary_exporters[format_type](data))
        else:
            size_bytes = 0  # unknown formats report zero, as before

        return {
            'format': format_type,
            'size_bytes': size_bytes,
            'size_mb': round(size_bytes / (1024 * 1024), 2),
            'record_count': len(data),
            'field_count': len(data[0])
        }