jonathanagustin committed on
Commit
37d53b3
·
verified ·
1 Parent(s): fca3f00

Upload app.py with huggingface_hub

Browse files
Files changed (1) hide show
  1. app.py +20 -9
app.py CHANGED
@@ -41,6 +41,7 @@ def get_parquet_path(config: str, shard: int = 0) -> str:
41
  cache_key = f"{config}_{shard}"
42
  if cache_key not in _db_cache:
43
  filename = f"data/{config}/{config}-{shard:05d}.parquet"
 
44
  try:
45
  local_path = hf_hub_download(
46
  repo_id=DATASET_ID,
@@ -48,25 +49,35 @@ def get_parquet_path(config: str, shard: int = 0) -> str:
48
  repo_type="dataset",
49
  token=HF_TOKEN
50
  )
 
51
  _db_cache[cache_key] = local_path
52
  except Exception as e:
53
- raise HTTPException(status_code=404, detail=f"Parquet file not found: {filename}")
 
54
  return _db_cache[cache_key]
55
 
56
 
57
  def query_parquet(config: str, sql: str, params: dict = None) -> list:
58
  """Execute SQL query on parquet file."""
59
  path = get_parquet_path(config)
60
- conn = duckdb.connect(":memory:")
61
- conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{path}')")
 
 
 
62
 
63
- if params:
64
- result = conn.execute(sql, params).fetchdf()
65
- else:
66
- result = conn.execute(sql).fetchdf()
67
 
68
- conn.close()
69
- return result.to_dict(orient="records")
 
 
 
 
 
70
 
71
 
72
  @app.get("/")
 
41
  cache_key = f"{config}_{shard}"
42
  if cache_key not in _db_cache:
43
  filename = f"data/{config}/{config}-{shard:05d}.parquet"
44
+ print(f"Downloading: {filename}")
45
  try:
46
  local_path = hf_hub_download(
47
  repo_id=DATASET_ID,
 
49
  repo_type="dataset",
50
  token=HF_TOKEN
51
  )
52
+ print(f"Downloaded to: {local_path}")
53
  _db_cache[cache_key] = local_path
54
  except Exception as e:
55
+ print(f"Error downloading {filename}: {e}")
56
+ raise HTTPException(status_code=404, detail=f"Parquet file not found: {filename}. Error: {str(e)}")
57
  return _db_cache[cache_key]
58
 
59
 
60
  def query_parquet(config: str, sql: str, params: dict = None) -> list:
61
  """Execute SQL query on parquet file."""
62
  path = get_parquet_path(config)
63
+ print(f"Querying parquet: {path}")
64
+ print(f"SQL: {sql}")
65
+ try:
66
+ conn = duckdb.connect(":memory:")
67
+ conn.execute(f"CREATE VIEW data AS SELECT * FROM read_parquet('{path}')")
68
 
69
+ if params:
70
+ result = conn.execute(sql, params).fetchdf()
71
+ else:
72
+ result = conn.execute(sql).fetchdf()
73
 
74
+ conn.close()
75
+ # Convert NaN to None for JSON serialization
76
+ result = result.where(pd.notnull(result), None)
77
+ return result.to_dict(orient="records")
78
+ except Exception as e:
79
+ print(f"Query error: {e}")
80
+ raise HTTPException(status_code=500, detail=f"Query error: {str(e)}")
81
 
82
 
83
  @app.get("/")