sofhiaazzhr Claude Sonnet 4.6 committed on
Commit
a49dc1b
·
1 Parent(s): 8daf9b5

fix: 5 bug fixes on tabular executor

Browse files

- top_n: remove fallback to sort_col (must use value_col only)
- coercion warning: log NaN introduced by numeric coercion in filters
- source_type filter: restrict tabular results to source_type=document
- raw fallback: cap columns to 20 to avoid oversized response
- add comment clarifying cross-file JOIN is out of scope for v1

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

Files changed (1) hide show
  1. src/query/executors/tabular.py +11 -3
src/query/executors/tabular.py CHANGED
@@ -82,6 +82,9 @@ class TabularOperation(BaseModel):
82
 
83
  def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
84
  numeric = pd.to_numeric(df[col], errors="coerce")
 
 
 
85
  if operator == "eq":
86
  return df[col].astype(str) == str(value)
87
  elif operator == "ne":
@@ -99,6 +102,9 @@ def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> p
99
 
100
  def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
101
  numeric = pd.to_numeric(df[col], errors="coerce")
 
 
 
102
  if operator == "eq":
103
  return df[df[col].astype(str) == str(value)]
104
  elif operator == "ne":
@@ -162,7 +168,7 @@ def _apply_operation(df: pd.DataFrame, op: TabularOperation, limit: int) -> pd.D
162
  raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
163
  return result.head(limit)
164
  elif op.operation == "top_n":
165
- col = op.value_col or op.sort_col
166
  if not col:
167
  raise ValueError(f"top_n requires value_col, got {op}")
168
  n = op.n or limit
@@ -206,7 +212,8 @@ class TabularExecutor(BaseExecutor):
206
  ) -> list[QueryResult]:
207
  tabular = [
208
  r for r in results
209
- if r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
 
210
  ]
211
 
212
  if not tabular:
@@ -270,6 +277,7 @@ class TabularExecutor(BaseExecutor):
270
  )
271
  return None
272
 
 
273
  gathered = await asyncio.gather(*[
274
  _process_group(doc_id, sheet_name, info)
275
  for (doc_id, sheet_name), info in groups.items()
@@ -305,7 +313,7 @@ class TabularExecutor(BaseExecutor):
305
 
306
  # Fallback: return raw rows
307
  logger.warning("tabular agent failed after retries, returning raw rows")
308
- return df.head(limit)
309
 
310
 
311
  tabular_executor = TabularExecutor()
 
82
 
83
  def _get_filter_mask(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.Series:
84
  numeric = pd.to_numeric(df[col], errors="coerce")
85
+ coerced_nulls = numeric.isnull() & df[col].notna()
86
+ if coerced_nulls.any():
87
+ logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
88
  if operator == "eq":
89
  return df[col].astype(str) == str(value)
90
  elif operator == "ne":
 
102
 
103
  def _apply_single_filter(df: pd.DataFrame, col: str, value: str, operator: str) -> pd.DataFrame:
104
  numeric = pd.to_numeric(df[col], errors="coerce")
105
+ coerced_nulls = numeric.isnull() & df[col].notna()
106
+ if coerced_nulls.any():
107
+ logger.warning("numeric coercion introduced NaN", col=col, count=int(coerced_nulls.sum()))
108
  if operator == "eq":
109
  return df[df[col].astype(str) == str(value)]
110
  elif operator == "ne":
 
168
  raise ValueError(f"filter requires filter_col/filter_value or filters or or_filters, got {op}")
169
  return result.head(limit)
170
  elif op.operation == "top_n":
171
+ col = op.value_col
172
  if not col:
173
  raise ValueError(f"top_n requires value_col, got {op}")
174
  n = op.n or limit
 
212
  ) -> list[QueryResult]:
213
  tabular = [
214
  r for r in results
215
+ if r.source_type == "document"
216
+ and r.metadata.get("data", {}).get("file_type") in _TABULAR_FILE_TYPES
217
  ]
218
 
219
  if not tabular:
 
277
  )
278
  return None
279
 
280
+ # Each group runs independently — cross-file JOIN is out of scope for v1.
281
  gathered = await asyncio.gather(*[
282
  _process_group(doc_id, sheet_name, info)
283
  for (doc_id, sheet_name), info in groups.items()
 
313
 
314
  # Fallback: return raw rows
315
  logger.warning("tabular agent failed after retries, returning raw rows")
316
+ return df.head(limit)[df.columns[:20]]
317
 
318
 
319
  tabular_executor = TabularExecutor()