| import io |
| import csv |
| import os |
| import sqlite3 |
| import pandas as pd |
|
|
|
|
| |
# Raise the csv module's maximum accepted field size to one billion.
csv.field_size_limit(1_000_000_000)


# Every file extension the loader recognises (lower-case, leading dot included),
# listed one family per line.
SUPPORTED_EXTENSIONS = set().union(
    (".csv", ".tsv", ".txt", ".dat", ".tab", ".log"),  # delimited / plain text
    (".xlsx", ".xls", ".xlsm", ".ods"),  # spreadsheets
    (".json", ".jsonl", ".ndjson"),  # JSON and JSON lines
    (".parquet", ".feather", ".arrow", ".orc"),  # columnar binary formats
    (".dta", ".sas7bdat", ".sav", ".xpt"),  # Stata / SAS / SPSS
    (".xml",),
    (".html", ".htm"),
    (".db", ".sqlite", ".sqlite3"),  # SQLite databases
    (".pkl", ".pickle"),
    (".pdf",),
    (".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff", ".gif"),  # images
    (".md", ".markdown", ".rtf"),  # text documents
)


# The same extensions without the leading dot, in alphabetical order.
EXTENSION_LABELS = [dotted.lstrip(".") for dotted in sorted(SUPPORTED_EXTENSIONS)]
|
|
|
|
def _read_text_payload(target, filepath: str = None) -> str:
    """Return the text held by a file on disk or by an in-memory payload.

    Args:
        target: A file-like object (rewound with ``seek(0)`` when it supports
            it, then read), a ``bytes``/``bytearray``/``memoryview`` buffer,
            or any other object, which is converted with ``str``. Ignored
            when *filepath* is truthy.
        filepath: Optional path. When truthy, the file is opened as UTF-8
            text and its full content is returned.

    Returns:
        The decoded text. Binary payloads are decoded as UTF-8 with
        undecodable sequences replaced; a ``None`` payload yields ``""``.
    """
    if filepath:
        with open(filepath, "r", encoding="utf-8", errors="replace") as handle:
            return handle.read()

    if hasattr(target, "seek"):
        target.seek(0)
    raw = target.read() if hasattr(target, "read") else target

    # A missing payload has no text; str(None) would fabricate the word "None".
    if raw is None:
        return ""
    # bytearray and memoryview must be decoded as well, otherwise str() would
    # return their repr instead of their content.
    if isinstance(raw, (bytes, bytearray, memoryview)):
        return bytes(raw).decode("utf-8", errors="replace")
    return str(raw)
|
|
|
|
def _dataframe_from_text_document(text: str, source_name: str, segment_label: str = "line") -> pd.DataFrame:
    """Turn a text document into a DataFrame with one row per non-blank line.

    Args:
        text: Document text; NUL characters are removed and the whole text is
            stripped before splitting.
        source_name: Stored in ``source_file`` and used in the error message.
        segment_label: Stored in ``segment_type`` for every row.

    Returns:
        A DataFrame with columns ``source_file``, ``segment_type``,
        ``segment_index`` (1-based), ``text`` and ``text_length``.

    Raises:
        ValueError: If nothing but whitespace/NUL characters is left.
    """
    body = (text or "").replace("\x00", "").strip()
    if not body:
        raise ValueError(f"No readable text found in {source_name}.")

    # Keep each line that still has content after trimming.
    segments = []
    for raw_line in body.splitlines():
        trimmed = raw_line.strip()
        if trimmed:
            segments.append(trimmed)
    if not segments:
        segments = [body]

    columns = {
        "source_file": source_name,
        "segment_type": segment_label,
        "segment_index": range(1, len(segments) + 1),
        "text": segments,
        "text_length": list(map(len, segments)),
    }
    return pd.DataFrame(columns)
|
|
|
|
def _load_pdf_as_dataframe(
    target,
    filepath: str = None,
    source_name: str = "document.pdf",
    pdf_mode: str = "text",
) -> pd.DataFrame:
    """Load a PDF as a DataFrame of extracted tables or of per-page text.

    Args:
        target: Object handed to the PDF library when *filepath* is falsy.
        filepath: Optional path; used instead of *target* when truthy.
        source_name: Value written to the ``source_file`` column.
        pdf_mode: ``"tables"`` (compared after lower-casing and stripping)
            extracts tables with pdfplumber; every other value, including
            ``None``, extracts text page by page with PyPDF2.

    Returns:
        Tables mode: all non-empty tables concatenated, prefixed with
        ``source_file``, ``page`` and ``table_index`` columns.
        Text mode: one row per page with ``source_file``, ``page``, ``text``
        and ``text_length``.

    Raises:
        ValueError: If the required library is missing, no table is found
            (tables mode), the PDF has no pages, or no page yields text.
    """
    mode = (pdf_mode or "text").lower().strip()
    source = filepath or target

    if mode == "tables":
        try:
            import pdfplumber
        except ImportError as exc:
            raise ValueError("PDF table extraction requires pdfplumber to be installed.") from exc

        frames = []
        with pdfplumber.open(source) as pdf:
            for page_number, page in enumerate(pdf.pages, start=1):
                # A page whose extraction fails is treated as having no tables.
                try:
                    page_tables = page.extract_tables() or []
                except Exception:
                    page_tables = []

                for table_number, cells in enumerate(page_tables, start=1):
                    if not cells:
                        continue
                    frame = pd.DataFrame(cells)
                    frame = frame.dropna(how="all")
                    frame = frame.dropna(axis=1, how="all")
                    if frame.empty:
                        continue
                    frame.insert(0, "source_file", source_name)
                    frame.insert(1, "page", page_number)
                    frame.insert(2, "table_index", table_number)
                    frames.append(frame)

        if not frames:
            raise ValueError("No tables detected in PDF. Try plain-text mode instead.")
        return pd.concat(frames, ignore_index=True)

    try:
        from PyPDF2 import PdfReader
    except ImportError as exc:
        raise ValueError("PDF upload requires PyPDF2 to be installed.") from exc

    def _page_text(page) -> str:
        # Pages whose text extraction fails contribute an empty string.
        try:
            return (page.extract_text() or "").strip()
        except Exception:
            return ""

    records = []
    for page_number, page in enumerate(PdfReader(source).pages, start=1):
        content = _page_text(page)
        records.append(
            {
                "source_file": source_name,
                "page": page_number,
                "text": content,
                "text_length": len(content),
            }
        )

    if not records:
        raise ValueError("No pages found in PDF.")

    if all(not record["text"] for record in records):
        raise ValueError("PDF was loaded but no extractable text was found.")

    return pd.DataFrame(records)
|
|
|
|
def _load_image_as_dataframe(target, filepath: str = None, source_name: str = "image") -> pd.DataFrame:
    """Describe an image as a one-row DataFrame of metadata plus OCR text.

    Args:
        target: Object handed to ``PIL.Image.open`` when *filepath* is falsy.
        filepath: Optional path; used instead of *target* when truthy.
        source_name: Value written to the ``source_file`` column.

    Returns:
        A single-row DataFrame with ``source_file``, ``format``, ``mode``,
        ``width``, ``height``, ``aspect_ratio`` (``None`` when the height is
        zero), ``ocr_text`` and ``ocr_text_length``.

    Raises:
        ValueError: If Pillow is not installed.
    """
    try:
        from PIL import Image
    except ImportError as exc:
        raise ValueError("Image upload requires Pillow to be installed.") from exc

    # Open inside a context manager so the underlying file handle is released
    # even if reading metadata or running OCR fails.
    with Image.open(filepath or target) as image:
        width, height = image.size
        payload = {
            "source_file": source_name,
            "format": image.format,
            "mode": image.mode,
            "width": width,
            "height": height,
            "aspect_ratio": round(width / height, 6) if height else None,
        }

        # OCR is best-effort: any failure (including pytesseract not being
        # importable) leaves the OCR text empty instead of aborting the load.
        try:
            import pytesseract

            extracted_text = (pytesseract.image_to_string(image) or "").strip()
        except Exception:
            extracted_text = ""

    payload["ocr_text"] = extracted_text
    payload["ocr_text_length"] = len(extracted_text)
    return pd.DataFrame([payload])
|
|
|
|
def _sniff_delimiter(raw_bytes: bytes) -> str:
    """Guess the column delimiter from the first 4096 bytes of a payload.

    The sample is decoded as UTF-8 (undecodable bytes replaced) and passed to
    ``csv.Sniffer`` with comma, tab, pipe, semicolon, colon and space as the
    candidate delimiters.

    Returns:
        The detected delimiter, or ``","`` when the payload is empty/``None``
        or detection fails for any reason.
    """
    default = ","
    if not raw_bytes:
        return default

    try:
        head = raw_bytes[:4096].decode("utf-8", errors="replace")
        guessed = csv.Sniffer().sniff(head, delimiters=",\t|;: ").delimiter
    except Exception:
        return default
    return guessed
|
|
|
|
def _peek_bytes(contents, filepath, limit: int = 8192):
    """Return the bytes used for delimiter sniffing (file head or raw contents)."""
    if filepath:
        with open(filepath, "rb") as handle:
            return handle.read(limit)
    return contents


def _write_temp_copy(contents: bytes, suffix: str) -> str:
    """Write in-memory bytes to a named temporary file and return its path."""
    import tempfile

    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(contents)
        return tmp.name


def _load_sqlite_as_dataframe(contents, filepath) -> pd.DataFrame:
    """Return the first user table of a SQLite database as a DataFrame."""
    temp_path = None
    db_path = filepath
    if not db_path:
        # sqlite3 needs a real file, so uploads are spilled to disk first.
        temp_path = db_path = _write_temp_copy(contents, ".db")

    try:
        conn = sqlite3.connect(db_path)
        try:
            # Skip SQLite's internal bookkeeping tables (names starting with
            # "sqlite_"); the backslash escape keeps "_" from acting as a
            # LIKE wildcard.
            tables = conn.execute(
                "SELECT name FROM sqlite_master "
                "WHERE type='table' AND name NOT LIKE 'sqlite\\_%' ESCAPE '\\'"
            ).fetchall()
            if not tables:
                raise ValueError("No tables found in SQLite database.")

            # Standard SQL identifier quoting: wrap in double quotes and
            # double any embedded quote, so any table name is safe.
            quoted = '"' + tables[0][0].replace('"', '""') + '"'
            return pd.read_sql(f"SELECT * FROM {quoted}", conn)
        finally:
            # Always release the connection; an open handle would also keep
            # the temporary file from being deleted on Windows.
            conn.close()
    finally:
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)


def load_dataframe(
    contents: bytes = None,
    filename: str = None,
    filepath: str = None,
    pdf_mode: str = "text",
) -> pd.DataFrame:
    """Load a file from disk or from uploaded bytes into a DataFrame.

    The reader is chosen from the lower-cased file extension. Names without
    an extension are treated as delimited text, and unknown extensions get a
    last-resort delimited-text attempt.

    Args:
        contents: Raw file bytes; used when *filepath* is falsy.
        filename: Name that accompanies *contents* and supplies the
            extension; defaults to ``"data.csv"`` when missing.
        filepath: Path of a file on disk; takes precedence over *contents*.
        pdf_mode: Forwarded to the PDF loader (``"text"`` or ``"tables"``).

    Returns:
        The loaded DataFrame. For Excel the result is what ``pd.read_excel``
        returns by default, for HTML the table with the most rows, and for
        SQLite the first user table.

    Raises:
        ValueError: If neither input is given, the path does not exist, the
            file has no usable content for its format, an optional
            dependency is missing, or an unknown format cannot be parsed as
            delimited text.
    """
    if not filepath and contents is None:
        raise ValueError("Either contents or filepath must be provided.")

    if filepath:
        if not os.path.exists(filepath):
            raise ValueError(f"File path does not exist: {filepath}")
        target = filepath
        fname = os.path.basename(filepath).lower().strip()
    else:
        target = io.BytesIO(contents)
        fname = filename.lower().strip() if filename else "data.csv"

    ext = "." + fname.rsplit(".", 1)[-1] if "." in fname else ""

    # --- Delimited text -------------------------------------------------
    if ext in {".tsv", ".tab"}:
        return pd.read_csv(target, sep="\t", encoding_errors="replace")

    if ext in {".csv", ".txt", ".dat", ".log", ""}:
        # Sniffing is best-effort: an unreadable sample falls back to ",".
        try:
            sample_bytes = _peek_bytes(contents, filepath)
        except OSError:
            sample_bytes = None
        delim = _sniff_delimiter(sample_bytes)

        # Try the fast C parser first and retry with the more tolerant
        # Python parser when it rejects the input.
        try:
            return pd.read_csv(target, sep=delim, encoding_errors="replace", engine="c")
        except Exception:
            if hasattr(target, "seek"):
                target.seek(0)
            return pd.read_csv(target, sep=delim, encoding_errors="replace", engine="python")

    # --- Text documents -------------------------------------------------
    if ext in {".md", ".markdown", ".rtf"}:
        text = _read_text_payload(target, filepath=filepath)
        return _dataframe_from_text_document(text, fname, segment_label="block")

    # --- Spreadsheets ---------------------------------------------------
    if ext in {".xlsx", ".xlsm"}:
        return pd.read_excel(target, engine="openpyxl")
    if ext == ".xls":
        return pd.read_excel(target, engine="xlrd")
    if ext == ".ods":
        return pd.read_excel(target, engine="odf")

    # --- JSON -----------------------------------------------------------
    if ext in {".jsonl", ".ndjson"}:
        return pd.read_json(target, lines=True)
    if ext == ".json":
        # A ".json" file may actually hold JSON lines; retry in that mode.
        try:
            return pd.read_json(target)
        except Exception:
            if hasattr(target, "seek"):
                target.seek(0)
            return pd.read_json(target, lines=True)

    # --- Columnar binary formats ---------------------------------------
    if ext == ".parquet":
        return pd.read_parquet(target)
    if ext in {".feather", ".arrow"}:
        return pd.read_feather(target)
    if ext == ".orc":
        return pd.read_orc(target)

    # --- Statistical packages ------------------------------------------
    if ext == ".dta":
        return pd.read_stata(target)
    if ext in {".sas7bdat", ".xpt"}:
        return pd.read_sas(target, format="sas7bdat" if ext == ".sas7bdat" else "xport")

    if ext == ".sav":
        try:
            import pyreadstat
        except ImportError as exc:
            raise ValueError("SPSS (.sav) requires pyreadstat") from exc

        if filepath:
            df, _ = pyreadstat.read_sav(filepath)
            return df

        # pyreadstat reads from a file path, not from a stream, so uploaded
        # bytes are written to a temporary file first.
        sav_path = _write_temp_copy(contents, ".sav")
        try:
            df, _ = pyreadstat.read_sav(sav_path)
        finally:
            if os.path.exists(sav_path):
                os.unlink(sav_path)
        return df

    # --- Markup ---------------------------------------------------------
    if ext == ".xml":
        try:
            return pd.read_xml(target)
        except Exception as e:
            raise ValueError(f"Could not parse XML: {e}") from e

    if ext in {".html", ".htm"}:
        tables = pd.read_html(target)
        if not tables:
            raise ValueError("No tables found in HTML")
        return max(tables, key=len)

    # --- Documents and images ------------------------------------------
    if ext == ".pdf":
        return _load_pdf_as_dataframe(target, filepath=filepath, source_name=fname, pdf_mode=pdf_mode)

    if ext in {".png", ".jpg", ".jpeg", ".webp", ".bmp", ".tif", ".tiff", ".gif"}:
        return _load_image_as_dataframe(target, filepath=filepath, source_name=fname)

    # --- SQLite ---------------------------------------------------------
    if ext in {".db", ".sqlite", ".sqlite3"}:
        return _load_sqlite_as_dataframe(contents, filepath)

    # --- Pickle ---------------------------------------------------------
    if ext in {".pkl", ".pickle"}:
        # SECURITY: unpickling can execute arbitrary code. This branch must
        # only ever be fed files from trusted sources.
        try:
            obj = pd.read_pickle(target)
        except Exception as e:
            raise ValueError(
                "Pickle could not be read as a pandas DataFrame. "
                "If this came from an AutoML export bundle, the archive may be pointing at a model artifact instead of the dataset. "
                f"Original error: {e}"
            ) from e
        if isinstance(obj, pd.DataFrame):
            return obj
        raise ValueError(
            f"Pickle contains {type(obj).__name__}, not a pandas DataFrame."
        )

    # --- Unknown extension: last-resort delimited-text attempt ----------
    try:
        delim = _sniff_delimiter(_peek_bytes(contents, filepath))
        return pd.read_csv(
            target,
            sep=delim,
            encoding_errors="replace",
            engine="python",
        )
    except Exception as e:
        raise ValueError(
            f"Unsupported or unreadable file format '{ext}'. "
            f"Supported: {', '.join(sorted(SUPPORTED_EXTENSIONS))}. Error: {e}"
        ) from e
|
|