Spaces:
Running
Running
Commit ·
7501c0e
1
Parent(s): 2554712
Fix list/dict column crash and remove trust_remote_code
Browse files- src/__pycache__/cleaning.cpython-313.pyc +0 -0
- src/__pycache__/ingestion.cpython-313.pyc +0 -0
- src/cleaning.py +23 -2
- src/ingestion.py +2 -2
- verify_pipeline_mock.py +25 -3
src/__pycache__/cleaning.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/cleaning.cpython-313.pyc and b/src/__pycache__/cleaning.cpython-313.pyc differ
|
|
|
src/__pycache__/ingestion.cpython-313.pyc
CHANGED
|
Binary files a/src/__pycache__/ingestion.cpython-313.pyc and b/src/__pycache__/ingestion.cpython-313.pyc differ
|
|
|
src/cleaning.py
CHANGED
|
@@ -43,9 +43,30 @@ def clean_data(df):
|
|
| 43 |
log.append(f"Dropped {dropped_rows} empty rows.")
|
| 44 |
|
| 45 |
# 3. Handle Duplicate Rows
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
if duplicates > 0:
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
log.append(f"Removed {duplicates} duplicate rows.")
|
| 50 |
|
| 51 |
# 4. Fill Missing Values (Simple Strategy for Analysis)
|
|
|
|
| 43 |
log.append(f"Dropped {dropped_rows} empty rows.")
|
| 44 |
|
| 45 |
# 3. Handle Duplicate Rows
|
| 46 |
+
# Convert unhashable types (lists, dicts) to strings for duplicate checking
|
| 47 |
+
try:
|
| 48 |
+
duplicates = df.duplicated().sum()
|
| 49 |
+
except TypeError:
|
| 50 |
+
# Fallback: Convert object columns to string to handle unhashable types
|
| 51 |
+
temp_df = df.copy()
|
| 52 |
+
for col in temp_df.select_dtypes(include=['object']):
|
| 53 |
+
temp_df[col] = temp_df[col].astype(str)
|
| 54 |
+
duplicates = temp_df.duplicated().sum()
|
| 55 |
+
del temp_df
|
| 56 |
+
|
| 57 |
if duplicates > 0:
|
| 58 |
+
# To actually drop them, we need a similar approach or just rely on the try/except
|
| 59 |
+
try:
|
| 60 |
+
df = df.drop_duplicates()
|
| 61 |
+
except TypeError:
|
| 62 |
+
# If dropping fails naturally, we must use the string conversion method to identify the indices
|
| 63 |
+
# This is a bit expensive but robust
|
| 64 |
+
temp_df = df.copy()
|
| 65 |
+
for col in temp_df.select_dtypes(include=['object']):
|
| 66 |
+
temp_df[col] = temp_df[col].astype(str)
|
| 67 |
+
df = df.loc[~temp_df.duplicated()].copy()
|
| 68 |
+
del temp_df
|
| 69 |
+
|
| 70 |
log.append(f"Removed {duplicates} duplicate rows.")
|
| 71 |
|
| 72 |
# 4. Fill Missing Values (Simple Strategy for Analysis)
|
src/ingestion.py
CHANGED
|
@@ -133,9 +133,9 @@ def load_hf_dataset(dataset_name, subset=None, split='train', api_token=None):
|
|
| 133 |
# Load dataset
|
| 134 |
# If subset is provided, pass it as the second argument
|
| 135 |
if subset:
|
| 136 |
-
ds = load_dataset(dataset_name, subset, split=split, token=api_token
|
| 137 |
else:
|
| 138 |
-
ds = load_dataset(dataset_name, split=split, token=api_token
|
| 139 |
|
| 140 |
# Convert to pandas
|
| 141 |
df = ds.to_pandas()
|
|
|
|
| 133 |
# Load dataset
|
| 134 |
# If subset is provided, pass it as the second argument
|
| 135 |
if subset:
|
| 136 |
+
ds = load_dataset(dataset_name, subset, split=split, token=api_token)
|
| 137 |
else:
|
| 138 |
+
ds = load_dataset(dataset_name, split=split, token=api_token)
|
| 139 |
|
| 140 |
# Convert to pandas
|
| 141 |
df = ds.to_pandas()
|
verify_pipeline_mock.py
CHANGED
|
@@ -19,9 +19,31 @@ with patch('src.llm.get_insights', return_value="Mocked Insights") as mock_insig
|
|
| 19 |
def __init__(self, path):
|
| 20 |
self.name = path
|
| 21 |
|
| 22 |
-
print("Generating example dataset...")
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
mock_file = MockFile(example_path)
|
| 27 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|
|
|
|
| 19 |
def __init__(self, path):
|
| 20 |
self.name = path
|
| 21 |
|
| 22 |
+
print("Generating example dataset with complex types...")
|
| 23 |
+
# Create a dummy CSV with list columns to test unhashable type handling
|
| 24 |
+
complex_data = {
|
| 25 |
+
"Name": ["Alice", "Bob", "Charlie", "David", "Eve"],
|
| 26 |
+
"Age": [25, 30, 35, 40, 22],
|
| 27 |
+
"Tags": [["HR", "admin"], ["eng", "dev"], ["eng", "lead"], ["mgmt"], ["HR"]], # List type (unhashable)
|
| 28 |
+
"Details": [{"role": "staff"}, {"role": "dev"}, {"role": "lead"}, {"role": "manager"}, {"role": "staff"}] # Dict type (unhashable)
|
| 29 |
+
}
|
| 30 |
+
df_complex = pd.DataFrame(complex_data)
|
| 31 |
+
example_path = "complex_test_data.csv"
|
| 32 |
+
# Writing list/dict values to CSV may be tricky, as pandas read_csv usually reads them back as strings.
|
| 33 |
+
# But ingestion.py `load_file` reads CSV.
|
| 34 |
+
# If I want to test "unhashable type", I need `read_csv` to produce lists/dicts? No, read_csv produces string representations of lists.
|
| 35 |
+
# The error "unhashable type: dict" usually comes from parquet or HF datasets where real lists/dicts are preserved.
|
| 36 |
+
# So I should mock a parquet file or just manually create the dataframe and bypass load_file?
|
| 37 |
+
# verify_pipeline_mock uses `MockFile` which is passed to `analyze_dataset`.
|
| 38 |
+
# `analyze_dataset` calls `load_file`.
|
| 39 |
+
# `load_file` reads the file.
|
| 40 |
+
|
| 41 |
+
# To reproduce "unhashable type: dict", I need to pass a file that loads as dicts/lists.
|
| 42 |
+
# Parquet or HF Dataset is the way.
|
| 43 |
+
# Let's create a parquet file with complex types.
|
| 44 |
+
df_complex.to_parquet(example_path)
|
| 45 |
+
|
| 46 |
+
print(f"Example complex dataset created at: {example_path}")
|
| 47 |
|
| 48 |
mock_file = MockFile(example_path)
|
| 49 |
print(f"Running pipeline with MOCKED LLM...{os.path.basename(mock_file.name)}")
|