znation HF Staff committed on
Commit
f06b780
·
verified ·
1 Parent(s): bb2a28c

Upload folder using huggingface_hub

Browse files
Files changed (2) hide show
  1. app.py +57 -69
  2. requirements.txt +0 -1
app.py CHANGED
@@ -9,13 +9,6 @@ import io
9
  from huggingface_hub import InferenceClient
10
  from typing import Optional, Tuple
11
  import traceback
12
- from fastapi import FastAPI, HTTPException
13
- from fastapi.responses import StreamingResponse
14
- import uuid
15
-
16
- # Global dictionary to store DataFrames for serving as CSV
17
- # Key: unique ID, Value: DataFrame
18
- _dataframe_cache = {}
19
 
20
  # Initialize Hugging Face Inference Client
21
  def get_inference_client(token: Optional[str] = None) -> InferenceClient:
@@ -110,18 +103,29 @@ Error: {previous_error}
110
  Make sure to address this error in your new specification.
111
  """
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
114
 
115
  User Query: {query}
116
 
117
  Data Schema:
118
  {schema}
119
-
120
- Data URL: {data_url}
121
  {error_feedback}
122
- Requirements:
123
- 1. Generate ONLY valid Vega-Lite JSON specification
124
- 2. Use the data URL provided in the "data" field with "url" property
125
  3. Choose appropriate mark types and encodings based on the query
126
  4. Include appropriate titles and labels
127
  5. Make sure the field names match exactly with the column names from the schema
@@ -148,10 +152,11 @@ Generate the Vega-Lite specification now:"""
148
  # Parse JSON
149
  spec = json.loads(spec_text)
150
 
151
- # Ensure the data URL is set correctly
152
- if 'data' not in spec:
153
- spec['data'] = {}
154
- spec['data']['url'] = data_url
 
155
 
156
  return spec, None
157
 
@@ -431,8 +436,7 @@ def create_visualization(
431
  data_url: str,
432
  query: str,
433
  token: Optional[str] = None,
434
- max_retries: int = 5,
435
- app_base_url: str = ""
436
  ) -> Tuple[Optional[dict], Optional[str], str]:
437
  """
438
  Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
@@ -442,7 +446,6 @@ def create_visualization(
442
  query: User's visualization query
443
  token: Optional HuggingFace token
444
  max_retries: Maximum number of retry attempts
445
- app_base_url: Base URL of the Gradio app for serving CSV data
446
 
447
  Returns:
448
  Tuple of (vega_lite_spec, error_message, log_message)
@@ -460,18 +463,8 @@ def create_visualization(
460
  # Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
461
  is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
462
 
463
- # For parquet files, generate a unique ID and cache the DataFrame
464
- csv_data_id = None
465
- effective_data_url = data_url
466
  if is_parquet:
467
- csv_data_id = str(uuid.uuid4())
468
- _dataframe_cache[csv_data_id] = df
469
- # Use app_base_url if provided, otherwise use relative path
470
- if app_base_url:
471
- effective_data_url = f"{app_base_url}/data/{csv_data_id}.csv"
472
- else:
473
- effective_data_url = f"/data/{csv_data_id}.csv"
474
- log_messages.append(f" Note: Parquet file - serving as CSV at {effective_data_url}")
475
 
476
  # Get schema
477
  schema = get_data_schema(df)
@@ -482,7 +475,8 @@ def create_visualization(
482
  for attempt in range(max_retries):
483
  log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
484
 
485
- spec, error = generate_vega_lite_spec(query, schema, effective_data_url, token, previous_error)
 
486
 
487
  if error:
488
  log_messages.append(f"✗ Generation failed: {error}")
@@ -539,6 +533,33 @@ def create_visualization(
539
  # Fix up schema in case the LLM hallucinated
540
  spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
541
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
542
  # Validate with Altair to catch rendering errors
543
  log_messages.append(" Validating specification with Altair...")
544
  try:
@@ -565,7 +586,7 @@ def create_visualization(
565
  log_messages.append(f"\n✗ {error_msg}")
566
  return None, error_msg, "\n".join(log_messages)
567
 
568
- def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, request: gr.Request):
569
  """
570
  Main function to create visualization for Gradio interface.
571
 
@@ -573,7 +594,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
573
  data_url: URL to the data file
574
  query: User's visualization query
575
  oauth_token: OAuth token from Gradio (None if not logged in)
576
- request: Gradio request object to get the base URL
577
 
578
  Returns:
579
  Tuple of (vega_lite_spec_dict, log_message, error_message)
@@ -591,11 +611,7 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
591
  # Extract token from OAuth if user is logged in
592
  token = oauth_token.token
593
 
594
- # Get the base URL from the request
595
- # For Hugging Face Spaces, use the space URL; for local, use localhost
596
- app_base_url = f"{request.url.scheme}://{request.url.netloc}"
597
-
598
- spec, error, log = create_visualization(data_url.strip(), query.strip(), token, app_base_url=app_base_url)
599
 
600
  if error:
601
  return None, log, error
@@ -613,29 +629,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
613
  print(error_msg, file=sys.stderr)
614
  return None, log, error_msg
615
 
616
- # FastAPI app for custom routes
617
- custom_fastapi = FastAPI()
618
-
619
- @custom_fastapi.get("/data/{data_id}.csv")
620
- async def serve_csv_data(data_id: str):
621
- """Serve cached DataFrame as CSV for Vega-Lite visualization."""
622
- if data_id not in _dataframe_cache:
623
- raise HTTPException(status_code=404, detail="Data not found")
624
-
625
- df = _dataframe_cache[data_id]
626
-
627
- # Convert DataFrame to CSV
628
- csv_buffer = io.StringIO()
629
- df.to_csv(csv_buffer, index=False)
630
- csv_buffer.seek(0)
631
-
632
- # Return as streaming response
633
- return StreamingResponse(
634
- io.BytesIO(csv_buffer.getvalue().encode('utf-8')),
635
- media_type="text/csv",
636
- headers={"Content-Disposition": f"attachment; filename={data_id}.csv"}
637
- )
638
-
639
  # Create Gradio interface
640
  def create_app():
641
  with gr.Blocks(title="Visualizator") as app:
@@ -750,13 +743,8 @@ def create_app():
750
 
751
  return app
752
 
753
- # Create the Gradio app
754
- gradio_app = create_app()
755
-
756
- # Mount Gradio on the custom FastAPI app for both local and deployment
757
- demo = gr.mount_gradio_app(custom_fastapi, gradio_app, path="/")
758
 
759
  if __name__ == "__main__":
760
- # For local development with uvicorn
761
- import uvicorn
762
- uvicorn.run(demo, host="0.0.0.0", port=7860)
 
9
  from huggingface_hub import InferenceClient
10
  from typing import Optional, Tuple
11
  import traceback
 
 
 
 
 
 
 
12
 
13
  # Initialize Hugging Face Inference Client
14
  def get_inference_client(token: Optional[str] = None) -> InferenceClient:
 
103
  Make sure to address this error in your new specification.
104
  """
105
 
106
+ # Build data instruction based on whether we have a data URL
107
+ data_instruction = ""
108
+ if data_url:
109
+ data_instruction = f"""
110
+ Data URL: {data_url}
111
+
112
+ Requirements:
113
+ 1. Generate ONLY valid Vega-Lite JSON specification
114
+ 2. Use the data URL provided in the "data" field with "url" property"""
115
+ else:
116
+ data_instruction = """
117
+ Requirements:
118
+ 1. Generate ONLY valid Vega-Lite JSON specification
119
+ 2. DO NOT include a "data" field - the data will be injected automatically"""
120
+
121
  prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
122
 
123
  User Query: {query}
124
 
125
  Data Schema:
126
  {schema}
127
+ {data_instruction}
 
128
  {error_feedback}
 
 
 
129
  3. Choose appropriate mark types and encodings based on the query
130
  4. Include appropriate titles and labels
131
  5. Make sure the field names match exactly with the column names from the schema
 
152
  # Parse JSON
153
  spec = json.loads(spec_text)
154
 
155
+ # Ensure the data URL is set correctly (only if we have a URL)
156
+ if data_url:
157
+ if 'data' not in spec:
158
+ spec['data'] = {}
159
+ spec['data']['url'] = data_url
160
 
161
  return spec, None
162
 
 
436
  data_url: str,
437
  query: str,
438
  token: Optional[str] = None,
439
+ max_retries: int = 5
 
440
  ) -> Tuple[Optional[dict], Optional[str], str]:
441
  """
442
  Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
 
446
  query: User's visualization query
447
  token: Optional HuggingFace token
448
  max_retries: Maximum number of retry attempts
 
449
 
450
  Returns:
451
  Tuple of (vega_lite_spec, error_message, log_message)
 
463
  # Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
464
  is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
465
 
 
 
 
466
  if is_parquet:
467
+ log_messages.append(" Note: Parquet file - data will be embedded inline (max 5000 rows)")
 
 
 
 
 
 
 
468
 
469
  # Get schema
470
  schema = get_data_schema(df)
 
475
  for attempt in range(max_retries):
476
  log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
477
 
478
+ # For parquet files, tell LLM not to include data URL
479
+ spec, error = generate_vega_lite_spec(query, schema, data_url if not is_parquet else "", token, previous_error)
480
 
481
  if error:
482
  log_messages.append(f"✗ Generation failed: {error}")
 
533
  # Fix up schema in case the LLM hallucinated
534
  spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
535
 
536
+ # For parquet files, inject the data as inline values
537
+ if is_parquet:
538
+ log_messages.append(" Embedding data inline for parquet file...")
539
+ # Sample data if it's too large (Vega-Lite can struggle with large datasets)
540
+ MAX_ROWS = 5000
541
+ data_to_inject = df
542
+ if len(df) > MAX_ROWS:
543
+ log_messages.append(f" Sampling {MAX_ROWS} rows from {len(df)} total rows")
544
+ data_to_inject = df.sample(n=MAX_ROWS, random_state=42)
545
+
546
+ # Prepare data for JSON serialization
547
+ data_to_inject = data_to_inject.copy()
548
+
549
+ # Convert datetime columns to ISO format strings
550
+ for col in data_to_inject.columns:
551
+ if pd.api.types.is_datetime64_any_dtype(data_to_inject[col]):
552
+ # Convert to string, handling NaT values
553
+ data_to_inject[col] = data_to_inject[col].astype(str).replace('NaT', None)
554
+
555
+ # Replace NaN and infinity values with None for JSON compatibility
556
+ data_to_inject = data_to_inject.replace([float('inf'), float('-inf')], None)
557
+ data_to_inject = data_to_inject.where(pd.notna(data_to_inject), None)
558
+
559
+ # Convert to records format (list of dicts)
560
+ spec['data'] = {'values': data_to_inject.to_dict('records')}
561
+ log_messages.append(f"✓ Embedded {len(data_to_inject)} rows of data")
562
+
563
  # Validate with Altair to catch rendering errors
564
  log_messages.append(" Validating specification with Altair...")
565
  try:
 
586
  log_messages.append(f"\n✗ {error_msg}")
587
  return None, error_msg, "\n".join(log_messages)
588
 
589
+ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None):
590
  """
591
  Main function to create visualization for Gradio interface.
592
 
 
594
  data_url: URL to the data file
595
  query: User's visualization query
596
  oauth_token: OAuth token from Gradio (None if not logged in)
 
597
 
598
  Returns:
599
  Tuple of (vega_lite_spec_dict, log_message, error_message)
 
611
  # Extract token from OAuth if user is logged in
612
  token = oauth_token.token
613
 
614
+ spec, error, log = create_visualization(data_url.strip(), query.strip(), token)
 
 
 
 
615
 
616
  if error:
617
  return None, log, error
 
629
  print(error_msg, file=sys.stderr)
630
  return None, log, error_msg
631
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
632
  # Create Gradio interface
633
  def create_app():
634
  with gr.Blocks(title="Visualizator") as app:
 
743
 
744
  return app
745
 
746
+ # Create demo for Hugging Face Spaces
747
+ demo = create_app()
 
 
 
748
 
749
  if __name__ == "__main__":
750
+ demo.launch()
 
 
requirements.txt CHANGED
@@ -6,4 +6,3 @@ altair>=5.0.0
6
  vega-datasets>=0.9.0
7
  python-dotenv>=1.0.0
8
  pyarrow>=10.0.0
9
- uvicorn>=0.20.0
 
6
  vega-datasets>=0.9.0
7
  python-dotenv>=1.0.0
8
  pyarrow>=10.0.0