Commit ·
a49dc1b
1
Parent(s): 8daf9b5
fix: 5 bug fixes on tabular executor
Browse files
- top_n: remove fallback to sort_col (must use value_col only)
- coercion warning: log NaN introduced by numeric coercion in filters
- source_type filter: restrict tabular results to source_type=document
- raw fallback: cap columns to 20 to avoid oversized response
- add comment clarifying cross-file JOIN is out of scope for v1
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
src/query/executors/tabular.py
CHANGED
|
@@ -82,6 +82,9 @@ class TabularOperation(BaseModel):
|
|
| 82 |
|
| 83 |
def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
|
| 84 |
numeric = pd.to_numeric(df[col], errors="coerce")
|
|
|
|
|
|
|
|
|
|
| 85 |
if operator == "eq":
|
| 86 |
return df[col].astype(str) == str(value)
|
| 87 |
elif operator == "ne":
|
|
@@ -99,6 +102,9 @@ def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> p
|
|
| 99 |
|
| 100 |
def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
|
| 101 |
numeric = pd.to_numeric(df[col], errors="coerce")
|
|
|
|
|
|
|
|
|
|
| 102 |
if operator == "eq":
|
| 103 |
return df[df[col].astype(str) == str(value)]
|
| 104 |
elif operator == "ne":
|
|
@@ -162,7 +168,7 @@ def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.D
|
|
| 162 |
raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
|
| 163 |
return result.head(limit)
|
| 164 |
elif op.operation == "top_n":
|
| 165 |
-
col = op.value_col
|
| 166 |
if not col:
|
| 167 |
raise ValueError(f"top_n requires value_col, got {op}")
|
| 168 |
n = op.n or limit
|
|
@@ -206,7 +212,8 @@ class TabularExecutor(BaseExecutor):
|
|
| 206 |
) -> list[QueryResult]:
|
| 207 |
tabular = [
|
| 208 |
r for r in results
|
| 209 |
-
if r.
|
|
|
|
| 210 |
]
|
| 211 |
|
| 212 |
if not tabular:
|
|
@@ -270,6 +277,7 @@ class TabularExecutor(BaseExecutor):
|
|
| 270 |
)
|
| 271 |
return None
|
| 272 |
|
|
|
|
| 273 |
gathered = await asyncio.gather(*[
|
| 274 |
_process_group(doc_id, sheet_name, info)
|
| 275 |
for (doc_id, sheet_name), info in groups.items()
|
|
@@ -305,7 +313,7 @@ class TabularExecutor(BaseExecutor):
|
|
| 305 |
|
| 306 |
# Fallback: return raw rows
|
| 307 |
logger.warning("tabular agent failed after retries, returning raw rows")
|
| 308 |
-
return df.head(limit)
|
| 309 |
|
| 310 |
|
| 311 |
tabular_executor = TabularExecutor()
|
|
|
|
| 82 |
|
| 83 |
def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
|
| 84 |
numeric = pd.to_numeric(df[col], errors="coerce")
|
| 85 |
+
coerced_nulls = numeric.isnull() & df[col].notna()
|
| 86 |
+
if coerced_nulls.any():
|
| 87 |
+
logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
|
| 88 |
if operator == "eq":
|
| 89 |
return df[col].astype(str) == str(value)
|
| 90 |
elif operator == "ne":
|
|
|
|
| 102 |
|
| 103 |
def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
|
| 104 |
numeric = pd.to_numeric(df[col], errors="coerce")
|
| 105 |
+
coerced_nulls = numeric.isnull() & df[col].notna()
|
| 106 |
+
if coerced_nulls.any():
|
| 107 |
+
logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
|
| 108 |
if operator == "eq":
|
| 109 |
return df[df[col].astype(str) == str(value)]
|
| 110 |
elif operator == "ne":
|
|
|
|
| 168 |
raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
|
| 169 |
return result.head(limit)
|
| 170 |
elif op.operation == "top_n":
|
| 171 |
+
col = op.value_col
|
| 172 |
if not col:
|
| 173 |
raise ValueError(f"top_n requires value_col, got {op}")
|
| 174 |
n = op.n or limit
|
|
|
|
| 212 |
) -> list[QueryResult]:
|
| 213 |
tabular = [
|
| 214 |
r for r in results
|
| 215 |
+
if r.source_type == "document"
|
| 216 |
+
and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
|
| 217 |
]
|
| 218 |
|
| 219 |
if not tabular:
|
|
|
|
| 277 |
)
|
| 278 |
return None
|
| 279 |
|
| 280 |
+
# Each group runs independently — cross-file JOIN is out of scope for v1.
|
| 281 |
gathered = await asyncio.gather(*[
|
| 282 |
_process_group(doc_id, sheet_name, info)
|
| 283 |
for (doc_id, sheet_name), info in groups.items()
|
|
|
|
| 313 |
|
| 314 |
# Fallback: return raw rows
|
| 315 |
logger.warning("tabular agent failed after retries, returning raw rows")
|
| 316 |
+
return df.head(limit)[df.columns[:20]]
|
| 317 |
|
| 318 |
|
| 319 |
tabular_executor = TabularExecutor()
|