| | from typing import Dict, List, Optional |
| | from pathlib import Path |
| | import pandas as pd |
| |
|
class EnhancedCSVReader:
    """Turn a CSV file into a list of document dicts with file-level metadata.

    Every document is a dict with two keys: ``"content"`` (text) and
    ``"metadata"`` (filename, column names, dtypes, sample values and
    row count of the source file).
    """

    def __init__(self):
        """Initialize the CSV reader; it holds no state."""

    def load_data(
        self, file_path: str, max_rows: Optional[int] = 10
    ) -> List[Dict]:
        """Load a CSV file and return row documents plus one schema document.

        Args:
            file_path: Path of the CSV file, passed to ``pandas.read_csv``.
            max_rows: Number of leading rows to turn into documents
                (forwarded to ``DataFrame.head``). ``None`` emits every row.
                Defaults to 10.

        Returns:
            One document per selected row (``content`` is
            ``row.to_string()``), followed by a final document whose
            ``content`` is ``"CSV Schema: <comma-separated column names>"``.
            Each document carries its own independent copy of the metadata.

        Raises:
            Whatever ``pandas.read_csv`` raises for a missing or
            unparsable file.
        """
        # Function-scope import keeps this block self-contained.
        from copy import deepcopy

        # Parse the file once and derive both the metadata and the rows from
        # the same frame, so they can never disagree with each other.
        df = pd.read_csv(file_path)
        csv_metadata = self._build_metadata(df, file_path)

        preview = df if max_rows is None else df.head(max_rows)

        # The metadata holds nested lists/dicts; a shallow copy would make all
        # documents share them, so each document gets a deep copy.
        documents = [
            {"content": row.to_string(), "metadata": deepcopy(csv_metadata)}
            for _, row in preview.iterrows()
        ]

        documents.append(
            {
                # str() guards against non-string column labels.
                "content": f"CSV Schema: {', '.join(map(str, df.columns))}",
                "metadata": deepcopy(csv_metadata),
            }
        )
        return documents

    def _extract_metadata(self, file_path: str) -> Dict:
        """Read the CSV at ``file_path`` and return its metadata dict.

        The returned dict has the keys ``filename``, ``columns``, ``dtypes``,
        ``samples`` and ``row_count``.
        """
        return self._build_metadata(pd.read_csv(file_path), file_path)

    @staticmethod
    def _build_metadata(df, file_path: str) -> Dict:
        """Build the metadata dict from an already-loaded DataFrame."""
        columns = df.columns.tolist()

        # Up to three non-null values per column, stringified.
        samples = {
            col: [str(val) for val in df[col].dropna().head(3).tolist()]
            for col in columns
        }

        return {
            "filename": Path(file_path).name,
            "columns": columns,
            "dtypes": {col: str(df[col].dtype) for col in columns},
            "samples": samples,
            "row_count": len(df),
        }
| |
|