File size: 1,968 Bytes
f78d8cf
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
File reader — supports CSV, Excel, Parquet, and JSON.
"""
import io
import pandas as pd


# Accepted file extensions (lower-case, including the leading dot) mapped to
# the human-readable format label shown to the user. Insertion order matters:
# it is the order in which the extensions are listed in error messages.
SUPPORTED_EXTENSIONS = {
    ".csv": "CSV",
    ".xlsx": "Excel",
    ".xls": "Excel (legacy)",
    ".parquet": "Parquet",
    ".json": "JSON",
}


def _read_csv(file_bytes: bytes) -> pd.DataFrame:
    """Parse CSV bytes, probing the common delimiters in a fixed order.

    The first delimiter (",", ";", tab, "|") that yields more than one column
    wins. If none does, the comma-separated parse is returned, which is the
    right result for a genuine single-column file.
    """
    comma_frame = None
    for sep in (",", ";", "\t", "|"):
        try:
            # Fresh buffer per attempt so no attempt depends on the stream
            # state left behind by a previous (possibly failed) parse.
            frame = pd.read_csv(io.BytesIO(file_bytes), sep=sep)
        except ValueError:
            # pandas' ParserError and EmptyDataError, as well as
            # UnicodeDecodeError, are all ValueError subclasses.
            continue
        if frame.shape[1] > 1:
            return frame
        if sep == ",":
            # Keep it: this is exactly what a default read_csv would return,
            # so the fallback below does not need to parse the file again.
            comma_frame = frame
    if comma_frame is not None:
        return comma_frame
    # The comma parse itself failed; run it once more so the caller receives
    # the genuine pandas error instead of a silent failure.
    return pd.read_csv(io.BytesIO(file_bytes))


def _read_json(file_bytes: bytes) -> pd.DataFrame:
    """Parse JSON bytes: records orient, then pandas' default, then JSON Lines."""
    try:
        return pd.read_json(io.BytesIO(file_bytes), orient="records")
    except ValueError:
        pass  # fall through to pandas' default orientation
    try:
        return pd.read_json(io.BytesIO(file_bytes))
    except ValueError as default_error:
        if not file_bytes.strip():
            # Blank input would parse as an *empty* JSON Lines document;
            # keep reporting it as an error instead.
            raise
        try:
            # Newline-delimited JSON (one object per line).
            return pd.read_json(io.BytesIO(file_bytes), lines=True)
        except ValueError:
            # Surface the error of the regular parse, not of the fallback.
            raise default_error from None


def read_dataset(file_bytes: bytes, filename: str) -> pd.DataFrame:
    """
    Read a dataset from raw bytes into a DataFrame.

    Supported formats: CSV, Excel (.xlsx/.xls), Parquet, JSON (regular JSON
    documents as well as newline-delimited JSON).

    Args:
        file_bytes: Raw content of the file.
        filename: Name of the file; only the text after its last "." is used
            (case-insensitively) to select the parser.

    Returns:
        The parsed data as a pandas DataFrame.

    Raises:
        ValueError: If the extension is missing or not listed in
            SUPPORTED_EXTENSIONS. Errors raised by the underlying pandas
            reader for malformed content propagate unchanged.
    """
    ext = "." + filename.rsplit(".", 1)[-1].lower() if "." in filename else ""

    if ext not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format file '{ext}' tidak didukung. "
            f"Format yang didukung: {', '.join(SUPPORTED_EXTENSIONS.keys())}"
        )

    if ext == ".csv":
        return _read_csv(file_bytes)

    if ext == ".json":
        return _read_json(file_bytes)

    buf = io.BytesIO(file_bytes)

    if ext in (".xlsx", ".xls"):
        return pd.read_excel(buf, engine="openpyxl" if ext == ".xlsx" else "xlrd")

    if ext == ".parquet":
        return pd.read_parquet(buf)

    # Guard for an extension that is listed in SUPPORTED_EXTENSIONS but has
    # no reader branch above.
    raise ValueError(f"Format tidak dikenali: {ext}")


def get_file_info(file_bytes: bytes, filename: str) -> dict:
    """Return a short summary of an uploaded file.

    The result has three keys: "filename" (the name as given), "format" (the
    label from SUPPORTED_EXTENSIONS for the lower-cased extension, or
    "Unknown" when there is no match) and "size_kb" (the byte length divided
    by 1024, rounded to two decimals).
    """
    _, dot, tail = filename.rpartition(".")
    # Without a "." in the name, fall back to a key that is never a supported
    # extension so the lookup below yields "Unknown".
    suffix = "." + tail.lower() if dot else "unknown"
    label = SUPPORTED_EXTENSIONS.get(suffix, "Unknown")
    kilobytes = len(file_bytes) / 1024
    return {
        "filename": filename,
        "format": label,
        "size_kb": round(kilobytes, 2),
    }