Spaces: Sleeping
Upload folder using huggingface_hub
Browse files
- app.py +57 -69
- requirements.txt +0 -1
app.py
CHANGED
|
@@ -9,13 +9,6 @@ import io
|
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
from typing import Optional, Tuple
|
| 11 |
import traceback
|
| 12 |
-
from fastapi import FastAPI, HTTPException
|
| 13 |
-
from fastapi.responses import StreamingResponse
|
| 14 |
-
import uuid
|
| 15 |
-
|
| 16 |
-
# Global dictionary to store DataFrames for serving as CSV
|
| 17 |
-
# Key: unique ID, Value: DataFrame
|
| 18 |
-
_dataframe_cache = {}
|
| 19 |
|
| 20 |
# Initialize Hugging Face Inference Client
|
| 21 |
def get_inference_client(token: Optional[str] = None) -> InferenceClient:
|
|
@@ -110,18 +103,29 @@ Error: {previous_error}
|
|
| 110 |
Make sure to address this error in your new specification.
|
| 111 |
"""
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
|
| 114 |
|
| 115 |
User Query: {query}
|
| 116 |
|
| 117 |
Data Schema:
|
| 118 |
{schema}
|
| 119 |
-
|
| 120 |
-
Data URL: {data_url}
|
| 121 |
{error_feedback}
|
| 122 |
-
Requirements:
|
| 123 |
-
1. Generate ONLY valid Vega-Lite JSON specification
|
| 124 |
-
2. Use the data URL provided in the "data" field with "url" property
|
| 125 |
3. Choose appropriate mark types and encodings based on the query
|
| 126 |
4. Include appropriate titles and labels
|
| 127 |
5. Make sure the field names match exactly with the column names from the schema
|
|
@@ -148,10 +152,11 @@ Generate the Vega-Lite specification now:"""
|
|
| 148 |
# Parse JSON
|
| 149 |
spec = json.loads(spec_text)
|
| 150 |
|
| 151 |
-
# Ensure the data URL is set correctly
|
| 152 |
-
if
|
| 153 |
-
|
| 154 |
-
|
|
|
|
| 155 |
|
| 156 |
return spec, None
|
| 157 |
|
|
@@ -431,8 +436,7 @@ def create_visualization(
|
|
| 431 |
data_url: str,
|
| 432 |
query: str,
|
| 433 |
token: Optional[str] = None,
|
| 434 |
-
max_retries: int = 5
|
| 435 |
-
app_base_url: str = ""
|
| 436 |
) -> Tuple[Optional[dict], Optional[str], str]:
|
| 437 |
"""
|
| 438 |
Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
|
|
@@ -442,7 +446,6 @@ def create_visualization(
|
|
| 442 |
query: User's visualization query
|
| 443 |
token: Optional HuggingFace token
|
| 444 |
max_retries: Maximum number of retry attempts
|
| 445 |
-
app_base_url: Base URL of the Gradio app for serving CSV data
|
| 446 |
|
| 447 |
Returns:
|
| 448 |
Tuple of (vega_lite_spec, error_message, log_message)
|
|
@@ -460,18 +463,8 @@ def create_visualization(
|
|
| 460 |
# Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
|
| 461 |
is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
|
| 462 |
|
| 463 |
-
# For parquet files, generate a unique ID and cache the DataFrame
|
| 464 |
-
csv_data_id = None
|
| 465 |
-
effective_data_url = data_url
|
| 466 |
if is_parquet:
|
| 467 |
-
|
| 468 |
-
_dataframe_cache[csv_data_id] = df
|
| 469 |
-
# Use app_base_url if provided, otherwise use relative path
|
| 470 |
-
if app_base_url:
|
| 471 |
-
effective_data_url = f"{app_base_url}/data/{csv_data_id}.csv"
|
| 472 |
-
else:
|
| 473 |
-
effective_data_url = f"/data/{csv_data_id}.csv"
|
| 474 |
-
log_messages.append(f" Note: Parquet file - serving as CSV at {effective_data_url}")
|
| 475 |
|
| 476 |
# Get schema
|
| 477 |
schema = get_data_schema(df)
|
|
@@ -482,7 +475,8 @@ def create_visualization(
|
|
| 482 |
for attempt in range(max_retries):
|
| 483 |
log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
|
| 484 |
|
| 485 |
-
|
|
|
|
| 486 |
|
| 487 |
if error:
|
| 488 |
log_messages.append(f"✗ Generation failed: {error}")
|
|
@@ -539,6 +533,33 @@ def create_visualization(
|
|
| 539 |
# Fix up schema in case the LLM hallucinated
|
| 540 |
spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
|
| 541 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 542 |
# Validate with Altair to catch rendering errors
|
| 543 |
log_messages.append(" Validating specification with Altair...")
|
| 544 |
try:
|
|
@@ -565,7 +586,7 @@ def create_visualization(
|
|
| 565 |
log_messages.append(f"\n✗ {error_msg}")
|
| 566 |
return None, error_msg, "\n".join(log_messages)
|
| 567 |
|
| 568 |
-
def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None
|
| 569 |
"""
|
| 570 |
Main function to create visualization for Gradio interface.
|
| 571 |
|
|
@@ -573,7 +594,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
|
|
| 573 |
data_url: URL to the data file
|
| 574 |
query: User's visualization query
|
| 575 |
oauth_token: OAuth token from Gradio (None if not logged in)
|
| 576 |
-
request: Gradio request object to get the base URL
|
| 577 |
|
| 578 |
Returns:
|
| 579 |
Tuple of (vega_lite_spec_dict, log_message, error_message)
|
|
@@ -591,11 +611,7 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
|
|
| 591 |
# Extract token from OAuth if user is logged in
|
| 592 |
token = oauth_token.token
|
| 593 |
|
| 594 |
-
|
| 595 |
-
# For Hugging Face Spaces, use the space URL; for local, use localhost
|
| 596 |
-
app_base_url = f"{request.url.scheme}://{request.url.netloc}"
|
| 597 |
-
|
| 598 |
-
spec, error, log = create_visualization(data_url.strip(), query.strip(), token, app_base_url=app_base_url)
|
| 599 |
|
| 600 |
if error:
|
| 601 |
return None, log, error
|
|
@@ -613,29 +629,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
|
|
| 613 |
print(error_msg, file=sys.stderr)
|
| 614 |
return None, log, error_msg
|
| 615 |
|
| 616 |
-
# FastAPI app for custom routes
|
| 617 |
-
custom_fastapi = FastAPI()
|
| 618 |
-
|
| 619 |
-
@custom_fastapi.get("/data/{data_id}.csv")
|
| 620 |
-
async def serve_csv_data(data_id: str):
|
| 621 |
-
"""Serve cached DataFrame as CSV for Vega-Lite visualization."""
|
| 622 |
-
if data_id not in _dataframe_cache:
|
| 623 |
-
raise HTTPException(status_code=404, detail="Data not found")
|
| 624 |
-
|
| 625 |
-
df = _dataframe_cache[data_id]
|
| 626 |
-
|
| 627 |
-
# Convert DataFrame to CSV
|
| 628 |
-
csv_buffer = io.StringIO()
|
| 629 |
-
df.to_csv(csv_buffer, index=False)
|
| 630 |
-
csv_buffer.seek(0)
|
| 631 |
-
|
| 632 |
-
# Return as streaming response
|
| 633 |
-
return StreamingResponse(
|
| 634 |
-
io.BytesIO(csv_buffer.getvalue().encode('utf-8')),
|
| 635 |
-
media_type="text/csv",
|
| 636 |
-
headers={"Content-Disposition": f"attachment; filename={data_id}.csv"}
|
| 637 |
-
)
|
| 638 |
-
|
| 639 |
# Create Gradio interface
|
| 640 |
def create_app():
|
| 641 |
with gr.Blocks(title="Visualizator") as app:
|
|
@@ -750,13 +743,8 @@ def create_app():
|
|
| 750 |
|
| 751 |
return app
|
| 752 |
|
| 753 |
-
# Create
|
| 754 |
-
|
| 755 |
-
|
| 756 |
-
# Mount Gradio on the custom FastAPI app for both local and deployment
|
| 757 |
-
demo = gr.mount_gradio_app(custom_fastapi, gradio_app, path="/")
|
| 758 |
|
| 759 |
if __name__ == "__main__":
|
| 760 |
-
|
| 761 |
-
import uvicorn
|
| 762 |
-
uvicorn.run(demo, host="0.0.0.0", port=7860)
|
|
|
|
| 9 |
from huggingface_hub import InferenceClient
|
| 10 |
from typing import Optional, Tuple
|
| 11 |
import traceback
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
# Initialize Hugging Face Inference Client
|
| 14 |
def get_inference_client(token: Optional[str] = None) -> InferenceClient:
|
|
|
|
| 103 |
Make sure to address this error in your new specification.
|
| 104 |
"""
|
| 105 |
|
| 106 |
+
# Build data instruction based on whether we have a data URL
|
| 107 |
+
data_instruction = ""
|
| 108 |
+
if data_url:
|
| 109 |
+
data_instruction = f"""
|
| 110 |
+
Data URL: {data_url}
|
| 111 |
+
|
| 112 |
+
Requirements:
|
| 113 |
+
1. Generate ONLY valid Vega-Lite JSON specification
|
| 114 |
+
2. Use the data URL provided in the "data" field with "url" property"""
|
| 115 |
+
else:
|
| 116 |
+
data_instruction = """
|
| 117 |
+
Requirements:
|
| 118 |
+
1. Generate ONLY valid Vega-Lite JSON specification
|
| 119 |
+
2. DO NOT include a "data" field - the data will be injected automatically"""
|
| 120 |
+
|
| 121 |
prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
|
| 122 |
|
| 123 |
User Query: {query}
|
| 124 |
|
| 125 |
Data Schema:
|
| 126 |
{schema}
|
| 127 |
+
{data_instruction}
|
|
|
|
| 128 |
{error_feedback}
|
|
|
|
|
|
|
|
|
|
| 129 |
3. Choose appropriate mark types and encodings based on the query
|
| 130 |
4. Include appropriate titles and labels
|
| 131 |
5. Make sure the field names match exactly with the column names from the schema
|
|
|
|
| 152 |
# Parse JSON
|
| 153 |
spec = json.loads(spec_text)
|
| 154 |
|
| 155 |
+
# Ensure the data URL is set correctly (only if we have a URL)
|
| 156 |
+
if data_url:
|
| 157 |
+
if 'data' not in spec:
|
| 158 |
+
spec['data'] = {}
|
| 159 |
+
spec['data']['url'] = data_url
|
| 160 |
|
| 161 |
return spec, None
|
| 162 |
|
|
|
|
| 436 |
data_url: str,
|
| 437 |
query: str,
|
| 438 |
token: Optional[str] = None,
|
| 439 |
+
max_retries: int = 5
|
|
|
|
| 440 |
) -> Tuple[Optional[dict], Optional[str], str]:
|
| 441 |
"""
|
| 442 |
Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
|
|
|
|
| 446 |
query: User's visualization query
|
| 447 |
token: Optional HuggingFace token
|
| 448 |
max_retries: Maximum number of retry attempts
|
|
|
|
| 449 |
|
| 450 |
Returns:
|
| 451 |
Tuple of (vega_lite_spec, error_message, log_message)
|
|
|
|
| 463 |
# Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
|
| 464 |
is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
|
| 465 |
|
|
|
|
|
|
|
|
|
|
| 466 |
if is_parquet:
|
| 467 |
+
log_messages.append(" Note: Parquet file - data will be embedded inline (max 5000 rows)")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 468 |
|
| 469 |
# Get schema
|
| 470 |
schema = get_data_schema(df)
|
|
|
|
| 475 |
for attempt in range(max_retries):
|
| 476 |
log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
|
| 477 |
|
| 478 |
+
# For parquet files, tell LLM not to include data URL
|
| 479 |
+
spec, error = generate_vega_lite_spec(query, schema, data_url if not is_parquet else "", token, previous_error)
|
| 480 |
|
| 481 |
if error:
|
| 482 |
log_messages.append(f"✗ Generation failed: {error}")
|
|
|
|
| 533 |
# Fix up schema in case the LLM hallucinated
|
| 534 |
spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
|
| 535 |
|
| 536 |
+
# For parquet files, inject the data as inline values
|
| 537 |
+
if is_parquet:
|
| 538 |
+
log_messages.append(" Embedding data inline for parquet file...")
|
| 539 |
+
# Sample data if it's too large (Vega-Lite can struggle with large datasets)
|
| 540 |
+
MAX_ROWS = 5000
|
| 541 |
+
data_to_inject = df
|
| 542 |
+
if len(df) > MAX_ROWS:
|
| 543 |
+
log_messages.append(f" Sampling {MAX_ROWS} rows from {len(df)} total rows")
|
| 544 |
+
data_to_inject = df.sample(n=MAX_ROWS, random_state=42)
|
| 545 |
+
|
| 546 |
+
# Prepare data for JSON serialization
|
| 547 |
+
data_to_inject = data_to_inject.copy()
|
| 548 |
+
|
| 549 |
+
# Convert datetime columns to ISO format strings
|
| 550 |
+
for col in data_to_inject.columns:
|
| 551 |
+
if pd.api.types.is_datetime64_any_dtype(data_to_inject[col]):
|
| 552 |
+
# Convert to string, handling NaT values
|
| 553 |
+
data_to_inject[col] = data_to_inject[col].astype(str).replace('NaT', None)
|
| 554 |
+
|
| 555 |
+
# Replace NaN and infinity values with None for JSON compatibility
|
| 556 |
+
data_to_inject = data_to_inject.replace([float('inf'), float('-inf')], None)
|
| 557 |
+
data_to_inject = data_to_inject.where(pd.notna(data_to_inject), None)
|
| 558 |
+
|
| 559 |
+
# Convert to records format (list of dicts)
|
| 560 |
+
spec['data'] = {'values': data_to_inject.to_dict('records')}
|
| 561 |
+
log_messages.append(f"✓ Embedded {len(data_to_inject)} rows of data")
|
| 562 |
+
|
| 563 |
# Validate with Altair to catch rendering errors
|
| 564 |
log_messages.append(" Validating specification with Altair...")
|
| 565 |
try:
|
|
|
|
| 586 |
log_messages.append(f"\n✗ {error_msg}")
|
| 587 |
return None, error_msg, "\n".join(log_messages)
|
| 588 |
|
| 589 |
+
def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None):
|
| 590 |
"""
|
| 591 |
Main function to create visualization for Gradio interface.
|
| 592 |
|
|
|
|
| 594 |
data_url: URL to the data file
|
| 595 |
query: User's visualization query
|
| 596 |
oauth_token: OAuth token from Gradio (None if not logged in)
|
|
|
|
| 597 |
|
| 598 |
Returns:
|
| 599 |
Tuple of (vega_lite_spec_dict, log_message, error_message)
|
|
|
|
| 611 |
# Extract token from OAuth if user is logged in
|
| 612 |
token = oauth_token.token
|
| 613 |
|
| 614 |
+
spec, error, log = create_visualization(data_url.strip(), query.strip(), token)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 615 |
|
| 616 |
if error:
|
| 617 |
return None, log, error
|
|
|
|
| 629 |
print(error_msg, file=sys.stderr)
|
| 630 |
return None, log, error_msg
|
| 631 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 632 |
# Create Gradio interface
|
| 633 |
def create_app():
|
| 634 |
with gr.Blocks(title="Visualizator") as app:
|
|
|
|
| 743 |
|
| 744 |
return app
|
| 745 |
|
| 746 |
+
# Create demo for Hugging Face Spaces
|
| 747 |
+
demo = create_app()
|
|
|
|
|
|
|
|
|
|
| 748 |
|
| 749 |
if __name__ == "__main__":
|
| 750 |
+
demo.launch()
|
|
|
|
|
|
requirements.txt
CHANGED
|
@@ -6,4 +6,3 @@ altair>=5.0.0
|
|
| 6 |
vega-datasets>=0.9.0
|
| 7 |
python-dotenv>=1.0.0
|
| 8 |
pyarrow>=10.0.0
|
| 9 |
-
uvicorn>=0.20.0
|
|
|
|
| 6 |
vega-datasets>=0.9.0
|
| 7 |
python-dotenv>=1.0.0
|
| 8 |
pyarrow>=10.0.0
|
|
|