Unknown-Geek committed on
Commit
5838d8d
·
1 Parent(s): c8766e0

Deployment 1

Browse files
Files changed (5) hide show
  1. Dockerfile +17 -20
  2. app.py +137 -96
  3. main.py +141 -0
  4. requirements.txt +8 -5
  5. synthetic_data_pipeline.py +4 -0
Dockerfile CHANGED
@@ -1,26 +1,23 @@
1
- # Use an official Python runtime as a parent image
2
- FROM python:3.8-slim
3
 
4
- # Set the working directory in the container
5
- WORKDIR /app
6
-
7
- # Copy the current directory contents into the container at /app
8
- COPY . /app
9
 
10
- # Create necessary directories with the correct permissions
11
- RUN mkdir -p /app/temp_uploads /app/output && chmod -R 777 /app/temp_uploads /app/output
 
12
 
13
- # Create a virtual environment
14
- RUN python -m venv venv
15
-
16
- # Activate the virtual environment and install any needed packages specified in requirements.txt
17
- RUN . venv/bin/activate && pip install --no-cache-dir -r requirements.txt
18
 
19
- # Make port 8080 available to the world outside this container
20
- EXPOSE 8080
 
 
 
 
21
 
22
- # Define environment variable
23
- ENV PORT=8080
24
 
25
- # Run app.py with Gunicorn when the container launches
26
- CMD ["venv/bin/gunicorn", "--bind", "0.0.0.0:8080", "--timeout", "300", "app:app"]
 
1
+ # Read the doc: https://huggingface.co/docs/hub/spaces-sdks-docker
2
+ # you will also find guides on how best to write your Dockerfile
3
 
4
+ FROM python:3.9
 
 
 
 
5
 
6
+ RUN useradd -m -u 1000 user
7
+ USER user
8
+ ENV PATH="/home/user/.local/bin:$PATH"
9
 
10
+ WORKDIR /app
 
 
 
 
11
 
12
+ COPY --chown=user ./requirements.txt requirements.txt
13
+ RUN pip install --no-cache-dir --upgrade pip && \
14
+ pip install --no-cache-dir numpy && \
15
+ pip install --no-cache-dir pandas && \
16
+ pip install --no-cache-dir torch && \
17
+ pip install --no-cache-dir -r requirements.txt
18
 
19
+ # Make sure directories exist for file operations
20
+ RUN mkdir -p temp_uploads output
21
 
22
+ COPY --chown=user . /app
23
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,31 +1,61 @@
1
  import os
2
- from flask import Flask, request, send_file, jsonify
3
- from flask_cors import CORS
4
- from werkzeug.utils import secure_filename
5
- import traceback
6
  import logging
7
- import shutil
8
- from synthetic_data_pipeline import SyntheticDataPipeline
9
- import time
10
 
11
- # Setup logging
12
- logging.basicConfig(level=logging.DEBUG)
13
  logger = logging.getLogger(__name__)
14
 
15
- app = Flask(__name__)
16
- # Setup CORS with Render domains
17
- CORS(app, origins=["*"],
18
- supports_credentials=True,
19
- methods=["GET", "POST"],
20
- allow_headers=["Content-Type"])
21
-
22
- app.config['UPLOAD_FOLDER'] = 'temp_uploads'
23
- app.config['OUTPUT_FOLDER'] = 'output'
24
- app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
 
26
  # Create necessary directories
27
- os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
28
- os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
29
 
30
  def cleanup_output_directory(directory):
31
  """Remove all files in the output directory except .gitkeep"""
@@ -47,95 +77,106 @@ def cleanup_output_directory(directory):
47
  except Exception as e:
48
  logger.error(f'Error deleting {file_path}: {e}')
49
 
50
- @app.route('/health', methods=['GET'])
 
 
 
 
51
  def health_check():
52
  try:
53
- return jsonify({"status": "healthy"}), 200
54
  except Exception as e:
55
  logger.error(f"Health check failed: {str(e)}")
56
- return jsonify({"status": "unhealthy", "error": str(e)}), 500
 
 
 
57
 
58
- @app.route('/generate', methods=['POST'])
59
- def generate_synthetic_data():
 
 
 
 
60
  try:
61
  # Clean up output directory before starting
62
- cleanup_output_directory(app.config['OUTPUT_FOLDER'])
63
  logger.info("Cleaned up output directory")
64
 
65
- if 'file' not in request.files:
66
- return jsonify({'error': 'No file provided'}), 400
 
 
 
 
 
67
 
68
- file = request.files['file']
69
- if file.filename == '':
70
- return jsonify({'error': 'No file selected'}), 400
71
-
72
- # Save uploaded file
73
- filename = secure_filename(file.filename)
74
- filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
75
- file.save(filepath)
76
-
77
- logger.info(f"File saved to {filepath}")
78
-
79
- # Get configuration from request
80
- config = request.form.to_dict()
81
- categorical_columns = [
82
- col.strip()
83
- for col in config.get('categorical_columns', '').replace('"', '').replace("'", '').split(',')
84
- if col.strip()
85
- ]
86
-
87
- if not categorical_columns:
88
- return jsonify({'error': 'No valid categorical columns provided'}), 400
89
 
90
- try:
91
- num_samples = int(config.get('num_samples', 1000))
92
- except ValueError:
93
- return jsonify({'error': 'Invalid number of samples'}), 400
94
-
95
- logger.info(f"Processing with categorical columns: {categorical_columns}")
96
- logger.info(f"Number of samples requested: {num_samples}")
97
-
98
- # Run pipeline with kwargs
99
- pipeline = SyntheticDataPipeline(
100
- input_file=filepath,
101
- categorical_columns=categorical_columns,
102
- output_dir=os.path.abspath(app.config['OUTPUT_FOLDER'])
103
- )
104
-
105
- pipeline.run_pipeline(
106
- num_samples=num_samples,
107
- epochs=100,
108
- chunk_size=10000
109
- )
110
-
111
- # Get latest generated file
112
- output_dir = pipeline.output_dir
113
- files = [f for f in os.listdir(output_dir) if f.startswith("synthetic_data_")]
114
-
115
- if not files:
116
- raise Exception("No output file generated")
117
 
118
- latest_file = sorted(files)[-1]
119
- output_path = os.path.join(output_dir, latest_file)
120
-
121
- logger.info(f"Sending file: {output_path}")
122
-
123
- # Clean up
124
- os.remove(filepath)
125
-
126
- return send_file(
127
- output_path,
128
- as_attachment=True,
129
- download_name='synthetic_data.csv'
130
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  except Exception as e:
133
  logger.error(f"Error in generate_synthetic_data: {str(e)}")
134
  logger.error(traceback.format_exc())
135
- return jsonify({'error': str(e)}), 500
 
 
 
136
 
137
- if __name__ == '__main__':
138
- # Use Render's PORT environment variable
139
- port = int(os.environ.get('PORT', 8080))
140
- logger.info(f"Starting Flask app on port {port}")
141
- app.run(host='0.0.0.0', port=port)
 
 
 
 
1
  import os
2
+ import sys
 
 
 
3
  import logging
 
 
 
4
 
5
+ # Set up logging first
6
+ logging.basicConfig(level=logging.INFO)
7
  logger = logging.getLogger(__name__)
8
 
9
+ try:
10
+ # Log the NumPy and pandas versions before importing other libraries
11
+ import numpy as np
12
+ import pandas as pd
13
+ logger.info(f"NumPy version: {np.__version__}")
14
+ logger.info(f"Pandas version: {pd.__version__}")
15
+
16
+ try:
17
+ import torch
18
+ logger.info(f"PyTorch version: {torch.__version__}")
19
+ except ImportError as e:
20
+ logger.warning(f"PyTorch import error: {str(e)}")
21
+
22
+ from fastapi import FastAPI, UploadFile, File, Form
23
+ from fastapi.responses import FileResponse, JSONResponse
24
+ from fastapi.middleware.cors import CORSMiddleware
25
+ import traceback
26
+ import shutil
27
+ import tempfile
28
+ import uvicorn # Import uvicorn for running the server
29
+
30
+ try:
31
+ from synthetic_data_pipeline import SyntheticDataPipeline
32
+ import time
33
+ except ImportError as e:
34
+ logger.error(f"Failed to import SyntheticDataPipeline: {str(e)}")
35
+ raise
36
+ except ImportError as e:
37
+ logger.error(f"Import error: {str(e)}")
38
+ logger.error("Try checking package compatibility or downgrading packages")
39
+ sys.exit(1)
40
+
41
+ app = FastAPI(title="Synthetic Data Generator")
42
+
43
+ # Setup CORS
44
+ app.add_middleware(
45
+ CORSMiddleware,
46
+ allow_origins=["*"],
47
+ allow_credentials=True,
48
+ allow_methods=["*"],
49
+ allow_headers=["*"],
50
+ )
51
+
52
+ # Configuration
53
+ UPLOAD_FOLDER = 'temp_uploads'
54
+ OUTPUT_FOLDER = 'output'
55
 
56
  # Create necessary directories
57
+ os.makedirs(UPLOAD_FOLDER, exist_ok=True)
58
+ os.makedirs(OUTPUT_FOLDER, exist_ok=True)
59
 
60
  def cleanup_output_directory(directory):
61
  """Remove all files in the output directory except .gitkeep"""
 
77
  except Exception as e:
78
  logger.error(f'Error deleting {file_path}: {e}')
79
 
80
+ @app.get("/")
81
+ def root():
82
+ return {"message": "Synthetic Data Generator API", "numpy_version": np.__version__}
83
+
84
+ @app.get("/health")
85
  def health_check():
86
  try:
87
+ return {"status": "healthy", "numpy_version": np.__version__}
88
  except Exception as e:
89
  logger.error(f"Health check failed: {str(e)}")
90
+ return JSONResponse(
91
+ status_code=500,
92
+ content={"status": "unhealthy", "error": str(e)}
93
+ )
94
 
95
+ @app.post("/generate")
96
+ async def generate_synthetic_data(
97
+ file: UploadFile = File(...),
98
+ categorical_columns: str = Form(...),
99
+ num_samples: int = Form(1000)
100
+ ):
101
  try:
102
  # Clean up output directory before starting
103
+ cleanup_output_directory(OUTPUT_FOLDER)
104
  logger.info("Cleaned up output directory")
105
 
106
+ # Save uploaded file to temp location
107
+ temp_file = tempfile.NamedTemporaryFile(delete=False, suffix='.csv')
108
+ try:
109
+ contents = await file.read()
110
+ temp_file.write(contents)
111
+ temp_file.close()
112
+ filepath = temp_file.name
113
 
114
+ logger.info(f"File saved to {filepath}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
 
116
+ # Parse categorical columns
117
+ categorical_columns_list = [
118
+ col.strip()
119
+ for col in categorical_columns.replace('"', '').replace("'", '').split(',')
120
+ if col.strip()
121
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
122
 
123
+ if not categorical_columns_list:
124
+ return JSONResponse(
125
+ status_code=400,
126
+ content={"error": "No valid categorical columns provided"}
127
+ )
128
+
129
+ logger.info(f"Processing with categorical columns: {categorical_columns_list}")
130
+ logger.info(f"Number of samples requested: {num_samples}")
131
+
132
+ # Run pipeline with kwargs
133
+ pipeline = SyntheticDataPipeline(
134
+ input_file=filepath,
135
+ categorical_columns=categorical_columns_list,
136
+ output_dir=os.path.abspath(OUTPUT_FOLDER)
137
+ )
138
+
139
+ pipeline.run_pipeline(
140
+ num_samples=num_samples,
141
+ epochs=100,
142
+ chunk_size=10000
143
+ )
144
+
145
+ # Get latest generated file
146
+ output_dir = pipeline.output_dir
147
+ files = [f for f in os.listdir(output_dir) if f.startswith("synthetic_data_")]
148
+
149
+ if not files:
150
+ raise Exception("No output file generated")
151
+
152
+ latest_file = sorted(files)[-1]
153
+ output_path = os.path.join(output_dir, latest_file)
154
+
155
+ logger.info(f"Sending file: {output_path}")
156
+
157
+ return FileResponse(
158
+ path=output_path,
159
+ filename="synthetic_data.csv",
160
+ media_type="text/csv"
161
+ )
162
+ finally:
163
+ # Clean up temp file
164
+ if os.path.exists(filepath):
165
+ os.unlink(filepath)
166
 
167
  except Exception as e:
168
  logger.error(f"Error in generate_synthetic_data: {str(e)}")
169
  logger.error(traceback.format_exc())
170
+ return JSONResponse(
171
+ status_code=500,
172
+ content={"error": str(e)}
173
+ )
174
 
175
+ # Run the server when the script is executed directly
176
+ if __name__ == "__main__":
177
+ # Get port from environment variable or use default
178
+ port = int(os.environ.get("PORT", 7860))
179
+ logger.info(f"Starting FastAPI server on port {port}")
180
+
181
+ # Start the server
182
+ uvicorn.run("app:app", host="0.0.0.0", port=port, reload=True)
main.py ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from flask import Flask, request, send_file, jsonify
3
+ from flask_cors import CORS
4
+ from werkzeug.utils import secure_filename
5
+ import traceback
6
+ import logging
7
+ import shutil
8
+ from synthetic_data_pipeline import SyntheticDataPipeline
9
+ import time
10
+
11
+ # Setup logging
12
+ logging.basicConfig(level=logging.DEBUG)
13
+ logger = logging.getLogger(__name__)
14
+
15
+ app = Flask(__name__)
16
+ # Setup CORS with Render domains
17
+ CORS(app, origins=["*"],
18
+ supports_credentials=True,
19
+ methods=["GET", "POST"],
20
+ allow_headers=["Content-Type"])
21
+
22
+ app.config['UPLOAD_FOLDER'] = 'temp_uploads'
23
+ app.config['OUTPUT_FOLDER'] = 'output'
24
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
25
+
26
+ # Create necessary directories
27
+ os.makedirs(app.config['UPLOAD_FOLDER'], exist_ok=True)
28
+ os.makedirs(app.config['OUTPUT_FOLDER'], exist_ok=True)
29
+
30
+ def cleanup_output_directory(directory):
31
+ """Remove all files in the output directory except .gitkeep"""
32
+ for filename in os.listdir(directory):
33
+ if filename != '.gitkeep':
34
+ file_path = os.path.join(directory, filename)
35
+ try:
36
+ if os.path.isfile(file_path):
37
+ # Add retry logic for locked files
38
+ max_retries = 3
39
+ for _ in range(max_retries):
40
+ try:
41
+ os.unlink(file_path)
42
+ break
43
+ except PermissionError:
44
+ time.sleep(1) # Wait before retry
45
+ elif os.path.isdir(file_path):
46
+ shutil.rmtree(file_path, ignore_errors=True)
47
+ except Exception as e:
48
+ logger.error(f'Error deleting {file_path}: {e}')
49
+
50
+ @app.route('/health', methods=['GET'])
51
+ def health_check():
52
+ try:
53
+ return jsonify({"status": "healthy"}), 200
54
+ except Exception as e:
55
+ logger.error(f"Health check failed: {str(e)}")
56
+ return jsonify({"status": "unhealthy", "error": str(e)}), 500
57
+
58
+ @app.route('/generate', methods=['POST'])
59
+ def generate_synthetic_data():
60
+ try:
61
+ # Clean up output directory before starting
62
+ cleanup_output_directory(app.config['OUTPUT_FOLDER'])
63
+ logger.info("Cleaned up output directory")
64
+
65
+ if 'file' not in request.files:
66
+ return jsonify({'error': 'No file provided'}), 400
67
+
68
+ file = request.files['file']
69
+ if file.filename == '':
70
+ return jsonify({'error': 'No file selected'}), 400
71
+
72
+ # Save uploaded file
73
+ filename = secure_filename(file.filename)
74
+ filepath = os.path.join(app.config['UPLOAD_FOLDER'], filename)
75
+ file.save(filepath)
76
+
77
+ logger.info(f"File saved to {filepath}")
78
+
79
+ # Get configuration from request
80
+ config = request.form.to_dict()
81
+ categorical_columns = [
82
+ col.strip()
83
+ for col in config.get('categorical_columns', '').replace('"', '').replace("'", '').split(',')
84
+ if col.strip()
85
+ ]
86
+
87
+ if not categorical_columns:
88
+ return jsonify({'error': 'No valid categorical columns provided'}), 400
89
+
90
+ try:
91
+ num_samples = int(config.get('num_samples', 1000))
92
+ except ValueError:
93
+ return jsonify({'error': 'Invalid number of samples'}), 400
94
+
95
+ logger.info(f"Processing with categorical columns: {categorical_columns}")
96
+ logger.info(f"Number of samples requested: {num_samples}")
97
+
98
+ # Run pipeline with kwargs
99
+ pipeline = SyntheticDataPipeline(
100
+ input_file=filepath,
101
+ categorical_columns=categorical_columns,
102
+ output_dir=os.path.abspath(app.config['OUTPUT_FOLDER'])
103
+ )
104
+
105
+ pipeline.run_pipeline(
106
+ num_samples=num_samples,
107
+ epochs=100,
108
+ chunk_size=10000
109
+ )
110
+
111
+ # Get latest generated file
112
+ output_dir = pipeline.output_dir
113
+ files = [f for f in os.listdir(output_dir) if f.startswith("synthetic_data_")]
114
+
115
+ if not files:
116
+ raise Exception("No output file generated")
117
+
118
+ latest_file = sorted(files)[-1]
119
+ output_path = os.path.join(output_dir, latest_file)
120
+
121
+ logger.info(f"Sending file: {output_path}")
122
+
123
+ # Clean up
124
+ os.remove(filepath)
125
+
126
+ return send_file(
127
+ output_path,
128
+ as_attachment=True,
129
+ download_name='synthetic_data.csv'
130
+ )
131
+
132
+ except Exception as e:
133
+ logger.error(f"Error in generate_synthetic_data: {str(e)}")
134
+ logger.error(traceback.format_exc())
135
+ return jsonify({'error': str(e)}), 500
136
+
137
+ if __name__ == '__main__':
138
+ # Use Render's PORT environment variable
139
+ port = int(os.environ.get('PORT', 8080))
140
+ logger.info(f"Starting Flask app on port {port}")
141
+ app.run(host='0.0.0.0', port=port)
requirements.txt CHANGED
@@ -1,12 +1,15 @@
1
  flask==2.0.1
2
  flask-cors==3.0.10
3
- pandas
4
- numpy
5
  --only-binary=:all: scikit-learn
6
- scikit-learn
7
  ctgan
8
  rdt==1.13.2
9
- torch==2.0.1
10
  werkzeug==2.0.3
11
  gunicorn==21.2.0
12
- psutil==5.9.4
 
 
 
 
1
  flask==2.0.1
2
  flask-cors==3.0.10
3
+ pandas==2.1.1
4
+ numpy<2.0.0
5
  --only-binary=:all: scikit-learn
6
+ scikit-learn<1.4.0
7
  ctgan
8
  rdt==1.13.2
9
+ torch>=2.2.0
10
  werkzeug==2.0.3
11
  gunicorn==21.2.0
12
+ psutil==5.9.4
13
+ fastapi
14
+ uvicorn[standard]
15
+ python-multipart
synthetic_data_pipeline.py CHANGED
@@ -10,6 +10,10 @@ from typing import List, Dict, Optional
10
  import shutil
11
  import tempfile
12
 
 
 
 
 
13
  class SyntheticDataPipeline:
14
  def __init__(
15
  self,
 
10
  import shutil
11
  import tempfile
12
 
13
+ # Log the pandas version in use
14
+ logger = logging.getLogger(__name__)
15
+ logger.info(f"Using pandas version: {pd.__version__}")
16
+
17
  class SyntheticDataPipeline:
18
  def __init__(
19
  self,