Spaces:
Sleeping
Sleeping
Upload app.py with huggingface_hub
Browse files
app.py
CHANGED
|
@@ -41,6 +41,7 @@ def get_parquet_path(config: str, shard: int = 0) -> str:
|
|
| 41 |
cache_key = f"{config}_{shard}"
|
| 42 |
if cache_key not in _db_cache:
|
| 43 |
filename = f"data/{config}/{config}-{shard:05d}.parquet"
|
|
|
|
| 44 |
try:
|
| 45 |
local_path = hf_hub_download(
|
| 46 |
repo_id=DATASET_ID,
|
|
@@ -48,25 +49,35 @@ def get_parquet_path(config: str, shard: int = 0) -> str:
|
|
| 48 |
repo_type="dataset",
|
| 49 |
token=HF_TOKEN
|
| 50 |
)
|
|
|
|
| 51 |
_db_cache[cache_key] = local_path
|
| 52 |
except Exception as e:
|
| 53 |
-
|
|
|
|
| 54 |
return _db_cache[cache_key]
|
| 55 |
|
| 56 |
|
| 57 |
def query_parquet(config: str, sql: str, params: dict = None) -> list:
|
| 58 |
"""Execute SQL query on parquet file."""
|
| 59 |
path = get_parquet_path(config)
|
| 60 |
-
|
| 61 |
-
|
|
|
|
|
|
|
|
|
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 70 |
|
| 71 |
|
| 72 |
@app.get("/")
|
|
|
|
| 41 |
cache_key = f"{config}_{shard}"
|
| 42 |
if cache_key not in _db_cache:
|
| 43 |
filename = f"data/{config}/{config}-{shard:05d}.parquet"
|
| 44 |
+
print(f"Downloading: {filename}")
|
| 45 |
try:
|
| 46 |
local_path = hf_hub_download(
|
| 47 |
repo_id=DATASET_ID,
|
|
|
|
| 49 |
repo_type="dataset",
|
| 50 |
token=HF_TOKEN
|
| 51 |
)
|
| 52 |
+
print(f"Downloaded to: {local_path}")
|
| 53 |
_db_cache[cache_key] = local_path
|
| 54 |
except Exception as e:
|
| 55 |
+
print(f"Error downloading {filename}: {e}")
|
| 56 |
+
raise HTTPException(status_code=404, detail=f"Parquet file not found: {filename}. Error: {str(e)}")
|
| 57 |
return _db_cache[cache_key]
|
| 58 |
|
| 59 |
|
| 60 |
def query_parquet(config: str, sql: str, params: dict = None) -> list:
    """Run a SQL query against a config's parquet file and return rows as dicts.

    The parquet file resolved by ``get_parquet_path(config)`` (default shard)
    is exposed to the query as a view named ``data`` in a throwaway in-memory
    DuckDB database.

    Args:
        config: Dataset config name used to locate the parquet file.
        sql: SQL text to execute; it can reference the ``data`` view.
        params: Optional query parameters passed through to DuckDB. When
            falsy, the query is executed without parameters.

    Returns:
        A list with one dict per result row (column name -> value), with
        missing values (NaN/NaT) replaced by ``None`` so the rows are
        JSON-serializable.

    Raises:
        HTTPException: 500 if opening the file or running the query fails.
            Errors raised by ``get_parquet_path`` propagate unchanged.
    """
    path = get_parquet_path(config)
    print(f"Querying parquet: {path}")
    print(f"SQL: {sql}")
    try:
        conn = duckdb.connect(":memory:")
        try:
            # The path is embedded in a SQL string literal, so double any
            # single quotes to keep the statement well-formed.
            quoted_path = str(path).replace("'", "''")
            conn.execute(
                f"CREATE VIEW data AS SELECT * FROM read_parquet('{quoted_path}')"
            )
            if params:
                result = conn.execute(sql, params).fetchdf()
            else:
                result = conn.execute(sql).fetchdf()
        finally:
            # Always release the connection, including when the query fails.
            conn.close()

        # Convert NaN to None for JSON serialization. Cast to object first:
        # on numeric columns ``where(..., None)`` would otherwise coerce the
        # None back to NaN.
        result = result.astype(object).where(pd.notnull(result), None)
        return result.to_dict(orient="records")
    except Exception as e:
        print(f"Query error: {e}")
        raise HTTPException(status_code=500, detail=f"Query error: {str(e)}") from e
|
| 81 |
|
| 82 |
|
| 83 |
@app.get("/")
|