Spaces:

znation
/

visualizator

Sleeping

App Files Files Community

znation HF Staff commited on Dec 23, 2025

Commit

f06b780

verified ·

1 Parent(s): bb2a28c

Upload folder using huggingface_hub

Browse files

Files changed (2) hide show

app.py +57 -69
requirements.txt +0 -1

app.py CHANGED Viewed

@@ -9,13 +9,6 @@ import io
 from huggingface_hub import InferenceClient
 from typing import Optional, Tuple
 import traceback
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import StreamingResponse
-import uuid
-# Global dictionary to store DataFrames for serving as CSV
-# Key: unique ID, Value: DataFrame
-_dataframe_cache = {}
 # Initialize Hugging Face Inference Client
 def get_inference_client(token: Optional[str] = None) -> InferenceClient:
@@ -110,18 +103,29 @@ Error: {previous_error}
 Make sure to address this error in your new specification.
 """
         prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
 User Query: {query}
 Data Schema:
 {schema}
-Data URL: {data_url}
 {error_feedback}
-Requirements:
-1. Generate ONLY valid Vega-Lite JSON specification
-2. Use the data URL provided in the "data" field with "url" property
 3. Choose appropriate mark types and encodings based on the query
 4. Include appropriate titles and labels
 5. Make sure the field names match exactly with the column names from the schema
@@ -148,10 +152,11 @@ Generate the Vega-Lite specification now:"""
         # Parse JSON
         spec = json.loads(spec_text)
-        # Ensure the data URL is set correctly
-        if 'data' not in spec:
-            spec['data'] = {}
-        spec['data']['url'] = data_url
         return spec, None
@@ -431,8 +436,7 @@ def create_visualization(
     data_url: str,
     query: str,
     token: Optional[str] = None,
-    max_retries: int = 5,
-    app_base_url: str = ""
 ) -> Tuple[Optional[dict], Optional[str], str]:
     """
     Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
@@ -442,7 +446,6 @@ def create_visualization(
         query: User's visualization query
         token: Optional HuggingFace token
         max_retries: Maximum number of retry attempts
-        app_base_url: Base URL of the Gradio app for serving CSV data
     Returns:
         Tuple of (vega_lite_spec, error_message, log_message)
@@ -460,18 +463,8 @@ def create_visualization(
     # Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
     is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
-    # For parquet files, generate a unique ID and cache the DataFrame
-    csv_data_id = None
-    effective_data_url = data_url
     if is_parquet:
-        csv_data_id = str(uuid.uuid4())
-        _dataframe_cache[csv_data_id] = df
-        # Use app_base_url if provided, otherwise use relative path
-        if app_base_url:
-            effective_data_url = f"{app_base_url}/data/{csv_data_id}.csv"
-        else:
-            effective_data_url = f"/data/{csv_data_id}.csv"
-        log_messages.append(f"  Note: Parquet file - serving as CSV at {effective_data_url}")
     # Get schema
     schema = get_data_schema(df)
@@ -482,7 +475,8 @@ def create_visualization(
     for attempt in range(max_retries):
         log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
-        spec, error = generate_vega_lite_spec(query, schema, effective_data_url, token, previous_error)
         if error:
             log_messages.append(f"✗ Generation failed: {error}")
@@ -539,6 +533,33 @@ def create_visualization(
             # Fix up schema in case the LLM hallucinated
             spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
             # Validate with Altair to catch rendering errors
             log_messages.append("  Validating specification with Altair...")
             try:
@@ -565,7 +586,7 @@ def create_visualization(
     log_messages.append(f"\n✗ {error_msg}")
     return None, error_msg, "\n".join(log_messages)
-def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, request: gr.Request):
     """
     Main function to create visualization for Gradio interface.
@@ -573,7 +594,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
         data_url: URL to the data file
         query: User's visualization query
         oauth_token: OAuth token from Gradio (None if not logged in)
-        request: Gradio request object to get the base URL
     Returns:
         Tuple of (vega_lite_spec_dict, log_message, error_message)
@@ -591,11 +611,7 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
     # Extract token from OAuth if user is logged in
     token = oauth_token.token
-    # Get the base URL from the request
-    # For Hugging Face Spaces, use the space URL; for local, use localhost
-    app_base_url = f"{request.url.scheme}://{request.url.netloc}"
-    spec, error, log = create_visualization(data_url.strip(), query.strip(), token, app_base_url=app_base_url)
     if error:
         return None, log, error
@@ -613,29 +629,6 @@ def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None, requ
         print(error_msg, file=sys.stderr)
         return None, log, error_msg
-# FastAPI app for custom routes
-custom_fastapi = FastAPI()
-@custom_fastapi.get("/data/{data_id}.csv")
-async def serve_csv_data(data_id: str):
-    """Serve cached DataFrame as CSV for Vega-Lite visualization."""
-    if data_id not in _dataframe_cache:
-        raise HTTPException(status_code=404, detail="Data not found")
-    df = _dataframe_cache[data_id]
-    # Convert DataFrame to CSV
-    csv_buffer = io.StringIO()
-    df.to_csv(csv_buffer, index=False)
-    csv_buffer.seek(0)
-    # Return as streaming response
-    return StreamingResponse(
-        io.BytesIO(csv_buffer.getvalue().encode('utf-8')),
-        media_type="text/csv",
-        headers={"Content-Disposition": f"attachment; filename={data_id}.csv"}
-    )
 # Create Gradio interface
 def create_app():
     with gr.Blocks(title="Visualizator") as app:
@@ -750,13 +743,8 @@ def create_app():
     return app
-# Create the Gradio app
-gradio_app = create_app()
-# Mount Gradio on the custom FastAPI app for both local and deployment
-demo = gr.mount_gradio_app(custom_fastapi, gradio_app, path="/")
 if __name__ == "__main__":
-    # For local development with uvicorn
-    import uvicorn
-    uvicorn.run(demo, host="0.0.0.0", port=7860)

 from huggingface_hub import InferenceClient
 from typing import Optional, Tuple
 import traceback
 # Initialize Hugging Face Inference Client
 def get_inference_client(token: Optional[str] = None) -> InferenceClient:
 Make sure to address this error in your new specification.
 """
+        # Build data instruction based on whether we have a data URL
+        data_instruction = ""
+        if data_url:
+            data_instruction = f"""
+Data URL: {data_url}
+Requirements:
+1. Generate ONLY valid Vega-Lite JSON specification
+2. Use the data URL provided in the "data" field with "url" property"""
+        else:
+            data_instruction = """
+Requirements:
+1. Generate ONLY valid Vega-Lite JSON specification
+2. DO NOT include a "data" field - the data will be injected automatically"""
         prompt = f"""You are a data visualization expert. Generate a valid Vega-Lite specification (JSON) based on the user's query and data schema.
 User Query: {query}
 Data Schema:
 {schema}
+{data_instruction}
 {error_feedback}
 3. Choose appropriate mark types and encodings based on the query
 4. Include appropriate titles and labels
 5. Make sure the field names match exactly with the column names from the schema
         # Parse JSON
         spec = json.loads(spec_text)
+        # Ensure the data URL is set correctly (only if we have a URL)
+        if data_url:
+            if 'data' not in spec:
+                spec['data'] = {}
+            spec['data']['url'] = data_url
         return spec, None
     data_url: str,
     query: str,
     token: Optional[str] = None,
+    max_retries: int = 5
 ) -> Tuple[Optional[dict], Optional[str], str]:
     """
     Create a visualization by loading data and generating Vega-Lite spec with auto-retry.
         query: User's visualization query
         token: Optional HuggingFace token
         max_retries: Maximum number of retry attempts
     Returns:
         Tuple of (vega_lite_spec, error_message, log_message)
     # Check if this is a parquet file (Vega-Lite doesn't support parquet URLs)
     is_parquet = data_url.endswith('.parquet') or data_url.endswith('.parq')
     if is_parquet:
+        log_messages.append("  Note: Parquet file - data will be embedded inline (max 5000 rows)")
     # Get schema
     schema = get_data_schema(df)
     for attempt in range(max_retries):
         log_messages.append(f"\nAttempt {attempt + 1}/{max_retries}: Generating Vega-Lite specification...")
+        # For parquet files, tell LLM not to include data URL
+        spec, error = generate_vega_lite_spec(query, schema, data_url if not is_parquet else "", token, previous_error)
         if error:
             log_messages.append(f"✗ Generation failed: {error}")
             # Fix up schema in case the LLM hallucinated
             spec['$schema'] = 'https://vega.github.io/schema/vega-lite/v5.json'
+            # For parquet files, inject the data as inline values
+            if is_parquet:
+                log_messages.append("  Embedding data inline for parquet file...")
+                # Sample data if it's too large (Vega-Lite can struggle with large datasets)
+                MAX_ROWS = 5000
+                data_to_inject = df
+                if len(df) > MAX_ROWS:
+                    log_messages.append(f"  Sampling {MAX_ROWS} rows from {len(df)} total rows")
+                    data_to_inject = df.sample(n=MAX_ROWS, random_state=42)
+                # Prepare data for JSON serialization
+                data_to_inject = data_to_inject.copy()
+                # Convert datetime columns to ISO format strings
+                for col in data_to_inject.columns:
+                    if pd.api.types.is_datetime64_any_dtype(data_to_inject[col]):
+                        # Convert to string, handling NaT values
+                        data_to_inject[col] = data_to_inject[col].astype(str).replace('NaT', None)
+                # Replace NaN and infinity values with None for JSON compatibility
+                data_to_inject = data_to_inject.replace([float('inf'), float('-inf')], None)
+                data_to_inject = data_to_inject.where(pd.notna(data_to_inject), None)
+                # Convert to records format (list of dicts)
+                spec['data'] = {'values': data_to_inject.to_dict('records')}
+                log_messages.append(f"✓ Embedded {len(data_to_inject)} rows of data")
             # Validate with Altair to catch rendering errors
             log_messages.append("  Validating specification with Altair...")
             try:
     log_messages.append(f"\n✗ {error_msg}")
     return None, error_msg, "\n".join(log_messages)
+def visualize(data_url: str, query: str, oauth_token: gr.OAuthToken | None):
     """
     Main function to create visualization for Gradio interface.
         data_url: URL to the data file
         query: User's visualization query
         oauth_token: OAuth token from Gradio (None if not logged in)
     Returns:
         Tuple of (vega_lite_spec_dict, log_message, error_message)
     # Extract token from OAuth if user is logged in
     token = oauth_token.token
+    spec, error, log = create_visualization(data_url.strip(), query.strip(), token)
     if error:
         return None, log, error
         print(error_msg, file=sys.stderr)
         return None, log, error_msg
 # Create Gradio interface
 def create_app():
     with gr.Blocks(title="Visualizator") as app:
     return app
+# Create demo for Hugging Face Spaces
+demo = create_app()
 if __name__ == "__main__":
+    demo.launch()

requirements.txt CHANGED Viewed

@@ -6,4 +6,3 @@ altair>=5.0.0
 vega-datasets>=0.9.0
 python-dotenv>=1.0.0
 pyarrow>=10.0.0
-uvicorn>=0.20.0

 vega-datasets>=0.9.0
 python-dotenv>=1.0.0
 pyarrow>=10.0.0