Spaces:

Labelbox
/

databricks_upload

Sleeping

App Files Files Community

ksaramout committed on Oct 16, 2023

Commit

a234a1a

1 Parent(s): 70a2533

Delete preview_mode_server.py

Browse files

Files changed (1) hide show

preview_mode_server.py +0 -270

preview_mode_server.py DELETED Viewed

@@ -1,270 +0,0 @@
-from flask import Flask, request, jsonify
-import threading
-import requests
-import json
-app = Flask(__name__)
# Upper bound, in seconds, on each Databricks REST call so that a stalled
# connection cannot block the caller indefinitely.
_DATABRICKS_TIMEOUT_SECONDS = 60


def _schedule_block(frequency):
    """Return the job-settings fragment that controls when the job runs.

    The literal string "continuous" selects a continuous job; any other value
    is passed through unchanged as a Quartz cron expression evaluated in UTC.
    """
    if frequency == "continuous":
        return {"continuous": {"pause_status": "UNPAUSED"}}
    return {
        "schedule": {
            "quartz_cron_expression": frequency,
            "timezone_id": "UTC",
            "pause_status": "UNPAUSED",
        }
    }


def _build_job_payload(job_name, notebook_path, base_parameters, frequency,
                       task_cluster, job_clusters=None):
    """Assemble the single-task job definition shared by both modes.

    Args:
        job_name: Used both as the job name and as the task key.
        notebook_path: Path of the notebook inside the labelspark repository.
        base_parameters: Parameters handed to the notebook.
        frequency: "continuous" or a Quartz cron expression.
        task_cluster: Task-level keys selecting where the task runs
            (``existing_cluster_id`` or ``job_cluster_key``).
        job_clusters: Optional job-level cluster definitions.

    Returns:
        A dict ready to be serialised as the body of the jobs/create request.
    """
    task = {
        "task_key": job_name,
        "run_if": "ALL_SUCCESS",
        "notebook_task": {
            "notebook_path": notebook_path,
            "base_parameters": base_parameters,
            "source": "GIT",
        },
        "libraries": [
            {"pypi": {"package": "labelspark"}},
            {"pypi": {"package": "labelbox==3.49.1"}},
            {"pypi": {"package": "numpy==1.25"}},
            {"pypi": {"package": "opencv-python==4.8.0.74"}},
        ],
        "timeout_seconds": 0,
        "email_notifications": {},
        "notification_settings": {
            "no_alert_for_skipped_runs": False,
            "no_alert_for_canceled_runs": False,
            "alert_on_last_attempt": False,
        },
    }
    task.update(task_cluster)

    payload = {
        "name": job_name,
        "email_notifications": {"no_alert_for_skipped_runs": False},
        "webhook_notifications": {},
        "timeout_seconds": 0,
        "max_concurrent_runs": 1,
        "tasks": [task],
        "git_source": {
            "git_url": "https://github.com/Labelbox/labelspark.git",
            "git_provider": "gitHub",
            "git_branch": "master",
        },
        "format": "MULTI_TASK",
    }
    if job_clusters is not None:
        payload["job_clusters"] = job_clusters
    # Merge the scheduling configuration into the main job payload.
    payload.update(_schedule_block(frequency))
    return payload


def _production_job_clusters():
    """Return the job-cluster definition used by the production job."""
    return [
        {
            "job_cluster_key": "Job_cluster",
            "new_cluster": {
                "cluster_name": "",
                "spark_version": "13.3.x-scala2.12",
                "gcp_attributes": {
                    "use_preemptible_executors": False,
                    "availability": "ON_DEMAND_GCP",
                    "zone_id": "HA",
                },
                "node_type_id": "n2-highmem-4",
                "enable_elastic_disk": True,
                "data_security_mode": "SINGLE_USER",
                "runtime_engine": "STANDARD",
                "autoscale": {"min_workers": 1, "max_workers": 10},
            },
        }
    ]


def _create_all_purpose_cluster(databricks_instance, headers):
    """Create the "Labelbox Worker" all-purpose cluster and return its ID.

    Raises:
        RuntimeError: If Databricks answers with a non-200 status.
    """
    url = f"https://{databricks_instance}/api/2.0/clusters/create"
    cluster_payload = {
        "autoscale": {"min_workers": 1, "max_workers": 10},
        "cluster_name": "Labelbox Worker",
        "spark_version": "11.3.x-scala2.12",
        "gcp_attributes": {
            "use_preemptible_executors": False,
            "availability": "PREEMPTIBLE_WITH_FALLBACK_GCP",
            "zone_id": "HA",
        },
        "node_type_id": "n2-highmem-4",
        "driver_node_type_id": "n2-highmem-4",
        "ssh_public_keys": [],
        "custom_tags": {},
        "cluster_log_conf": {"dbfs": {"destination": "dbfs:/cluster-logs"}},
        "spark_env_vars": {},
        "autotermination_minutes": 60,
        "enable_elastic_disk": False,
        "init_scripts": [],
        "enable_local_disk_encryption": False,
        "runtime_engine": "STANDARD",
    }
    response = requests.post(
        url,
        data=json.dumps(cluster_payload),
        headers=headers,
        timeout=_DATABRICKS_TIMEOUT_SECONDS,
    )
    if response.status_code != 200:
        raise RuntimeError(
            f"Failed to create all-purpose cluster. Error: {response.text}"
        )
    return response.json()['cluster_id']


def create_databricks_job(data):
    """Create a Databricks job that uploads a table to Labelbox.

    Args:
        data: Mapping with the request settings. Keys read here:
            ``mode`` ("preview" or "production"), ``databricks_instance``,
            ``databricks_api_key``, ``dataset_id``, ``table_path``,
            ``labelbox_api_key``, ``frequency`` ("continuous" or a Quartz
            cron expression), ``new_cluster``, ``cluster_id`` and
            ``schema_map``.

    Returns:
        A human-readable status string: a success or failure message that
        embeds the Databricks response text, ``"Invalid mode: ..."`` for an
        unknown mode, or an invalid-request message when ``data`` is not a
        mapping.

    Raises:
        RuntimeError: If the optional all-purpose cluster cannot be created.
        requests.exceptions.RequestException: On connection errors or when a
            Databricks call exceeds the request timeout.
    """
    if not isinstance(data, dict):
        return "Invalid request: expected a JSON object."

    # Validate the mode before any side effect so that a bad request can
    # never provision a cluster.
    mode = data.get('mode')
    if mode not in ("preview", "production"):
        return f"Invalid mode: {mode}"

    databricks_instance = data.get('databricks_instance')
    databricks_api_key = data.get('databricks_api_key')
    frequency = data.get('frequency')
    new_cluster = data.get('new_cluster')
    cluster_id = data.get('cluster_id')

    # NOTE(review): the Labelbox API key is forwarded as a plain notebook
    # parameter; consider whether a secret store should be used instead.
    base_parameters = {
        "dataset_id": data.get('dataset_id'),
        "table_path": data.get('table_path'),
        "labelbox_api_key": data.get('labelbox_api_key'),
        "schema_map": data.get('schema_map'),
    }

    # Authentication headers shared by every Databricks call.
    headers = {
        "Authorization": f"Bearer {databricks_api_key}",
        "Content-Type": "application/json",
    }

    # Deliberately an equality test rather than truthiness: a non-empty string
    # such as "false" must not trigger cluster creation.
    # NOTE(review): this also runs in production mode, where the job uses its
    # own job cluster and never references cluster_id - confirm it is intended.
    if new_cluster == True:  # noqa: E712
        cluster_id = _create_all_purpose_cluster(databricks_instance, headers)
        print(f"Created all-purpose cluster with ID: {cluster_id}")
    else:
        print(f"Using existing cluster with ID: {cluster_id}")

    if mode == "preview":
        payload = _build_job_payload(
            "PREVIEW_upload_to_labelbox",
            "notebooks/databricks_pipeline_creator/preview_upload_to_labelbox",
            base_parameters,
            frequency,
            {"existing_cluster_id": cluster_id},
        )
    else:
        payload = _build_job_payload(
            "upload_to_labelbox",
            "notebooks/databricks_pipeline_creator/upload_to_labelbox",
            base_parameters,
            frequency,
            {"job_cluster_key": "Job_cluster"},
            job_clusters=_production_job_clusters(),
        )

    # Send the job definition to the Databricks REST API.
    url = f"https://{databricks_instance}/api/2.0/jobs/create"
    response = requests.post(
        url,
        data=json.dumps(payload),
        headers=headers,
        timeout=_DATABRICKS_TIMEOUT_SECONDS,
    )
    if response.status_code == 200:
        return f"Job created successfully. {response.text}"
    return f"Failed to create job. Error: {response.text}"
@app.route('/create-databricks-job', methods=['POST'])
def api_create_databricks_job():
    """HTTP entry point: create a Databricks job from the posted JSON body.

    Returns:
        A JSON response ``{"message": <status string>}``. A body that parses
        as JSON but is not an object is answered with HTTP 400.
    """
    data = request.get_json()
    if not isinstance(data, dict):
        # create_databricks_job reads its settings with data.get(), so JSON
        # null, lists and scalars are rejected here as a client error.
        return jsonify({"message": "Request body must be a JSON object."}), 400
    result = create_databricks_job(data)
    return jsonify({"message": result})
def run():
    """Serve the Flask application ``app`` on port 5000."""
    listen_port = 5000
    app.run(port=listen_port)
# Start the API server on its own thread as soon as this module is loaded.
_server_thread = threading.Thread(target=run)
_server_thread.start()