Spaces:
Sleeping
Sleeping
File size: 1,968 Bytes
f78d8cf | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | """
File Reader — supports CSV, Excel, Parquet, and JSON.
"""
import io
import pandas as pd
# Accepted file extensions (lowercase, including the leading dot) mapped to a
# human-readable format label. Insertion order is preserved when the keys are
# joined into a message.
SUPPORTED_EXTENSIONS = {
    ".csv": "CSV",
    ".xlsx": "Excel",
    ".xls": "Excel (legacy)",
    ".parquet": "Parquet",
    ".json": "JSON",
}
def read_dataset(file_bytes: bytes, filename: str) -> pd.DataFrame:
    """
    Load a dataset from raw bytes into a DataFrame.

    The parser is selected from the text after the last "." in ``filename``
    (compared case-insensitively). Supported formats: CSV, Excel
    (.xlsx/.xls), Parquet and JSON.

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only its extension is inspected.

    Returns:
        The parsed DataFrame.

    Raises:
        ValueError: If the extension is not a key of SUPPORTED_EXTENSIONS.
        Exception: Whatever pandas raises from the final parse attempt of
            the selected format is propagated unchanged.
    """
    if "." in filename:
        ext = "." + filename.rsplit(".", 1)[-1].lower()
    else:
        ext = ""

    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format file '{ext}' tidak didukung. "
            f"Format yang didukung: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
        )

    stream = io.BytesIO(file_bytes)

    if ext == ".parquet":
        return pd.read_parquet(stream)

    if ext in (".xlsx", ".xls"):
        # Legacy .xls workbooks are read with xlrd, .xlsx with openpyxl.
        excel_engine = "xlrd" if ext == ".xls" else "openpyxl"
        return pd.read_excel(stream, engine=excel_engine)

    if ext == ".json":
        stream.seek(0)
        try:
            return pd.read_json(stream, orient="records")
        except Exception:
            # Not parseable as records: retry with pandas' default orient.
            stream.seek(0)
            return pd.read_json(stream)

    if ext == ".csv":
        # Try several common separators in order; the first one that yields
        # more than one column wins.
        for delimiter in (",", ";", "\t", "|"):
            try:
                stream.seek(0)
                candidate = pd.read_csv(stream, sep=delimiter)
                column_count = candidate.shape[1]
            except Exception:
                # This separator could not parse the data; try the next one.
                continue
            if column_count > 1:
                return candidate

        # No separator produced multiple columns: fall back to a plain parse
        # with pandas' default settings (errors surface to the caller here).
        stream.seek(0)
        return pd.read_csv(stream)

    # Defensive guard: only reached if SUPPORTED_EXTENSIONS gains a key that
    # has no matching branch above.
    raise ValueError(f"Format tidak dikenali: {ext}")
def get_file_info(file_bytes: bytes, filename: str) -> dict:
    """
    Return a short summary of an uploaded file.

    Args:
        file_bytes: Raw content of the file; only its length is used.
        filename: Name of the file; its extension selects the format label.

    Returns:
        A dict with the keys ``"filename"`` (as given), ``"format"`` (the
        label from SUPPORTED_EXTENSIONS, or ``"Unknown"``) and ``"size_kb"``
        (size in kilobytes, rounded to two decimals).
    """
    if "." in filename:
        suffix = "." + filename.rsplit(".", 1)[-1].lower()
    else:
        # Placeholder for names without any dot; it has no entry in
        # SUPPORTED_EXTENSIONS, so the lookup below yields "Unknown".
        suffix = "unknown"

    format_label = SUPPORTED_EXTENSIONS.get(suffix, "Unknown")
    kilobytes = len(file_bytes) / 1024  # bytes -> KB

    return {
        "filename": filename,
        "format": format_label,
        "size_kb": round(kilobytes, 2),
    }
|