""" File Reader — support CSV, Excel, Parquet, JSON """ import io import pandas as pd SUPPORTED_EXTENSIONS = { ".csv" : "CSV", ".xlsx" : "Excel", ".xls" : "Excel (legacy)", ".parquet" : "Parquet", ".json" : "JSON", } def read_dataset(file_bytes: bytes, filename: str) -> pd.DataFrame: """ Baca file dataset dari bytes ke DataFrame. Support: CSV, Excel (.xlsx/.xls), Parquet, JSON """ ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else "" if ext not in SUPPORTED_EXTENSIONS: raise ValueError( f"Format file '{ext}' tidak didukung. " f"Format yang didukung: {', '.join(SUPPORTED_EXTENSIONS.keys())}" ) buf = io.BytesIO(file_bytes) if ext == ".csv": # Coba beberapa separator umum for sep in [",", ";", "\t", "|"]: try: buf.seek(0) df = pd.read_csv(buf, sep=sep) if df.shape[1] > 1: return df except Exception: continue buf.seek(0) return pd.read_csv(buf) elif ext in (".xlsx", ".xls"): return pd.read_excel(buf, engine="openpyxl" if ext == ".xlsx" else "xlrd") elif ext == ".parquet": return pd.read_parquet(buf) elif ext == ".json": buf.seek(0) try: return pd.read_json(buf, orient="records") except Exception: buf.seek(0) return pd.read_json(buf) raise ValueError(f"Format tidak dikenali: {ext}") def get_file_info(file_bytes: bytes, filename: str) -> dict: """Informasi singkat tentang file yang diupload.""" ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else "unknown" size = len(file_bytes) / 1024 # KB return { "filename" : filename, "format" : SUPPORTED_EXTENSIONS.get(ext, "Unknown"), "size_kb" : round(size, 2), }