dhruv575 committed on
Commit
f6249c8
·
1 Parent(s): aa62fa3

Proper migration to GridFS

Browse files
app.py CHANGED
@@ -5,8 +5,6 @@ from dotenv import load_dotenv
5
  import logging
6
  from logging.handlers import RotatingFileHandler
7
  import pymongo
8
- import cloudinary
9
- import cloudinary.uploader
10
  import datetime
11
  import secrets
12
 
@@ -67,7 +65,7 @@ app.logger.info('App secrets configured.')
67
 
68
  # Verify essential environment variables
69
  missing_vars = []
70
- required_env_vars = ['MONGO_URI', 'CLOUDINARY_CLOUD_NAME', 'CLOUDINARY_API_KEY', 'CLOUDINARY_API_SECRET', 'OPENAI_API_KEY']
71
  for var in required_env_vars:
72
  if not os.environ.get(var):
73
  missing_vars.append(var)
@@ -88,17 +86,6 @@ except Exception as e:
88
  app.logger.error(f"Failed to connect to MongoDB: {str(e)}")
89
  # Depending on severity, you might want to exit or prevent app run
90
 
91
- # Configure Cloudinary
92
- try:
93
- cloudinary.config(
94
- cloud_name=os.environ.get('CLOUDINARY_CLOUD_NAME'),
95
- api_key=os.environ.get('CLOUDINARY_API_KEY'),
96
- api_secret=os.environ.get('CLOUDINARY_API_SECRET')
97
- )
98
- app.logger.info("Cloudinary configured.")
99
- except Exception as e:
100
- app.logger.error(f"Failed to configure Cloudinary: {str(e)}")
101
-
102
  # Configure OpenAI API key
103
  if 'OPENAI_API_KEY' in os.environ:
104
  try:
 
5
  import logging
6
  from logging.handlers import RotatingFileHandler
7
  import pymongo
 
 
8
  import datetime
9
  import secrets
10
 
 
65
 
66
  # Verify essential environment variables
67
  missing_vars = []
68
+ required_env_vars = ['MONGO_URI', 'OPENAI_API_KEY']
69
  for var in required_env_vars:
70
  if not os.environ.get(var):
71
  missing_vars.append(var)
 
86
  app.logger.error(f"Failed to connect to MongoDB: {str(e)}")
87
  # Depending on severity, you might want to exit or prevent app run
88
 
 
 
 
 
 
 
 
 
 
 
 
89
  # Configure OpenAI API key
90
  if 'OPENAI_API_KEY' in os.environ:
91
  try:
controllers/log_controller.py CHANGED
@@ -1,9 +1,8 @@
1
  from flask import jsonify, request
2
  import logging
3
  import os
4
- import cloudinary.uploader
5
- import uuid
6
  from datetime import datetime
 
7
  import pytesseract
8
  from PIL import Image
9
  import pdf2image
@@ -17,6 +16,8 @@ from models.workflow import Workflow
17
  from models.incident import Incident
18
  from utils.celery_tasks import process_log_document
19
  from utils.pdf_utils import pdf_to_text
 
 
20
 
21
  # Configure logging
22
  logger = logging.getLogger(__name__)
@@ -43,17 +44,22 @@ def upload_log(current_user):
43
  # Parse the date string
44
  log_date = datetime.strptime(log_date_str, '%Y-%m-%d').date()
45
 
46
- # Upload file to Cloudinary
47
- result = cloudinary.uploader.upload(
48
- file,
49
- resource_type="raw",
50
- folder=f"enflow/logs/{current_user.department_id}/{current_user._id}",
51
- public_id=f"{uuid.uuid4()}",
52
- format="pdf"
 
 
 
 
 
53
  )
54
 
55
- # Get the public URL
56
- log_file_url = result['secure_url']
57
 
58
  # Create new log entry
59
  log = Log(
@@ -72,6 +78,13 @@ def upload_log(current_user):
72
  'log': log.to_dict()
73
  }), 201
74
  else:
 
 
 
 
 
 
 
75
  return jsonify({'message': 'Failed to save log entry'}), 500
76
 
77
  except ValueError:
@@ -106,15 +119,16 @@ def delete_log(current_user, log_id):
106
  if str(log.user_id) != str(current_user._id) and current_user.permissions != 'Admin':
107
  return jsonify({'message': 'Only the log owner or department admin can delete logs'}), 403
108
 
109
- # Delete the log file from Cloudinary if it exists
110
- if log.log_file and log.log_file.startswith('http'):
111
  try:
112
- # Extract public_id from URL
113
- parts = log.log_file.split('/')
114
- public_id = parts[-1].split('.')[0]
115
- cloudinary.uploader.destroy(public_id)
 
116
  except Exception as e:
117
- logger.error(f"Error deleting log file from Cloudinary: {str(e)}")
118
 
119
  # Delete associated incidents if they exist
120
  for incident_id in log.incidents:
 
1
  from flask import jsonify, request
2
  import logging
3
  import os
 
 
4
  from datetime import datetime
5
+ import uuid
6
  import pytesseract
7
  from PIL import Image
8
  import pdf2image
 
16
  from models.incident import Incident
17
  from utils.celery_tasks import process_log_document
18
  from utils.pdf_utils import pdf_to_text
19
+ from db import get_gridfs
20
+ from bson.objectid import ObjectId
21
 
22
  # Configure logging
23
  logger = logging.getLogger(__name__)
 
44
  # Parse the date string
45
  log_date = datetime.strptime(log_date_str, '%Y-%m-%d').date()
46
 
47
+ # Upload file to GridFS
48
+ fs = get_gridfs()
49
+ file_id = fs.put(
50
+ file.read(),
51
+ filename=file.filename,
52
+ content_type='application/pdf',
53
+ metadata={
54
+ 'user_id': str(current_user._id),
55
+ 'department_id': str(current_user.department_id),
56
+ 'log_date': log_date_str,
57
+ 'upload_date': datetime.now()
58
+ }
59
  )
60
 
61
+ # Create the file URL for retrieval
62
+ log_file_url = f"/api/logs/files/{file_id}"
63
 
64
  # Create new log entry
65
  log = Log(
 
78
  'log': log.to_dict()
79
  }), 201
80
  else:
81
+ # Clean up GridFS file if log save fails
82
+ try:
83
+ fs.delete(file_id)
84
+ logger.info(f"Deleted file {file_id} from GridFS after failed log save")
85
+ except Exception as del_e:
86
+ logger.error(f"Failed to delete GridFS file {file_id} after DB error: {del_e}")
87
+
88
  return jsonify({'message': 'Failed to save log entry'}), 500
89
 
90
  except ValueError:
 
119
  if str(log.user_id) != str(current_user._id) and current_user.permissions != 'Admin':
120
  return jsonify({'message': 'Only the log owner or department admin can delete logs'}), 403
121
 
122
+ # Delete the log file from GridFS if it exists
123
+ if log.log_file and '/files/' in log.log_file:
124
  try:
125
+ # Extract file_id from URL
126
+ file_id = log.log_file.split('/')[-1]
127
+ fs = get_gridfs()
128
+ fs.delete(ObjectId(file_id))
129
+ logger.info(f"Deleted file {file_id} from GridFS")
130
  except Exception as e:
131
+ logger.error(f"Error deleting log file from GridFS: {str(e)}")
132
 
133
  # Delete associated incidents if they exist
134
  for incident_id in log.incidents:
controllers/workflow_controller.py CHANGED
@@ -3,8 +3,6 @@ from models.workflow import Workflow
3
  from models.department import Department
4
  import logging
5
  import os
6
- import cloudinary.uploader
7
- import uuid
8
  from db import get_gridfs
9
  from bson.objectid import ObjectId
10
  from datetime import datetime
@@ -138,15 +136,15 @@ def delete_workflow(current_user, workflow_id):
138
  # Delete all forms associated with the workflow
139
  for form_path in workflow.raw_forms:
140
  try:
141
- # If using Cloudinary
142
- if form_path.startswith('http'):
143
- # Extract public_id from URL - assuming Cloudinary URL format
144
- parts = form_path.split('/')
145
- public_id = parts[-1].split('.')[0]
146
- cloudinary.uploader.destroy(public_id)
147
- # If using local storage
148
- elif os.path.exists(form_path):
149
- os.remove(form_path)
150
  except Exception as e:
151
  logger.error(f"Error deleting form {form_path}: {str(e)}")
152
 
@@ -333,17 +331,9 @@ def remove_form(current_user, workflow_id):
333
  file_id = form_url.split('/')[-1]
334
 
335
  # Delete from GridFS
336
- from db import get_gridfs
337
  fs = get_gridfs()
338
  fs.delete(ObjectId(file_id))
339
  logger.info(f"Deleted file {file_id} from GridFS")
340
- # If using Cloudinary (legacy support)
341
- elif form_url.startswith('http'):
342
- # Extract public_id from URL - assuming Cloudinary URL format
343
- parts = form_url.split('/')
344
- public_id = parts[-1].split('.')[0]
345
- cloudinary.uploader.destroy(public_id)
346
- logger.info(f"Deleted file {public_id} from Cloudinary")
347
 
348
  # Remove form from workflow
349
  if workflow.remove_form(form_url):
 
3
  from models.department import Department
4
  import logging
5
  import os
 
 
6
  from db import get_gridfs
7
  from bson.objectid import ObjectId
8
  from datetime import datetime
 
136
  # Delete all forms associated with the workflow
137
  for form_path in workflow.raw_forms:
138
  try:
139
+ # If it's a GridFS file URL (starts with /api/workflows/{id}/files/{file_id})
140
+ if '/files/' in form_path:
141
+ # Extract file_id from URL
142
+ file_id = form_path.split('/')[-1]
143
+
144
+ # Delete from GridFS
145
+ fs = get_gridfs()
146
+ fs.delete(ObjectId(file_id))
147
+ logger.info(f"Deleted file {file_id} from GridFS")
148
  except Exception as e:
149
  logger.error(f"Error deleting form {form_path}: {str(e)}")
150
 
 
331
  file_id = form_url.split('/')[-1]
332
 
333
  # Delete from GridFS
 
334
  fs = get_gridfs()
335
  fs.delete(ObjectId(file_id))
336
  logger.info(f"Deleted file {file_id} from GridFS")
 
 
 
 
 
 
 
337
 
338
  # Remove form from workflow
339
  if workflow.remove_form(form_url):
migrate_cloudinary_to_gridfs.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import logging
3
+ import requests
4
+ from urllib.parse import urlparse
5
+ import cloudinary
6
+ import cloudinary.api
7
+ from dotenv import load_dotenv
8
+ from db import get_gridfs, Database, get_workflows_collection
9
+ from models.workflow import Workflow
10
+ from bson.objectid import ObjectId
11
+
12
+ # Load environment variables
13
+ load_dotenv()
14
+
15
+ # Configure logging
16
+ logging.basicConfig(level=logging.INFO)
17
+ logger = logging.getLogger(__name__)
18
+
19
+ # Configure Cloudinary
20
+ try:
21
+ cloudinary.config(
22
+ cloud_name=os.environ.get('CLOUDINARY_CLOUD_NAME'),
23
+ api_key=os.environ.get('CLOUDINARY_API_KEY'),
24
+ api_secret=os.environ.get('CLOUDINARY_API_SECRET')
25
+ )
26
+ logger.info("Cloudinary configured.")
27
+ except Exception as e:
28
+ logger.error(f"Failed to configure Cloudinary: {str(e)}")
29
+ exit(1)
30
+
31
def is_cloudinary_url(url):
    """Return True if *url* is an http(s) URL served from a Cloudinary host.

    The check is made against the parsed hostname (``cloudinary.com`` or any
    subdomain such as ``res.cloudinary.com``) rather than against the raw
    string, so a URL that merely mentions "cloudinary.com" in its path or
    query string is not mistaken for a Cloudinary asset.

    Args:
        url: Candidate form reference. Non-string and empty values are
            accepted and simply yield False.

    Returns:
        bool: True only for http/https URLs whose host is a Cloudinary domain.
    """
    if not isinstance(url, str) or not url:
        return False

    try:
        parsed = urlparse(url)
    except ValueError:
        # urlparse rejects some malformed inputs (e.g. an unbalanced IPv6 host)
        return False

    if parsed.scheme not in ('http', 'https'):
        return False

    host = (parsed.hostname or '').lower()
    return host == 'cloudinary.com' or host.endswith('.cloudinary.com')
34
+
35
def extract_cloudinary_public_id(url):
    """Extract the trailing public ID component from a Cloudinary URL.

    Takes the last path segment of *url*, percent-decodes it and strips the
    file extension. The query string and fragment are ignored.

    Args:
        url: A Cloudinary delivery URL.

    Returns:
        str: The decoded file name without its extension; an empty string
        when the URL path has no final segment.
    """
    # Local import keeps this block self-contained; only urlparse is imported
    # at module level.
    from urllib.parse import unquote

    path = urlparse(url).path
    # URL paths are percent-encoded ("my%20form.pdf"); the Cloudinary API
    # expects the decoded identifier ("my form").
    filename = unquote(os.path.basename(path))
    public_id = os.path.splitext(filename)[0]
    # NOTE(review): only the last path segment is returned, so any folder
    # prefix (e.g. "enflow/logs/<dept>/<user>/") is dropped. Cloudinary public
    # IDs normally include the folder, and for raw assets the extension too -
    # confirm before relying on this value for Admin/Upload API calls.
    return public_id
43
+
44
def migrate_cloudinary_to_gridfs():
    """Migrate all Cloudinary files referenced in workflows to GridFS.

    For every workflow returned by ``Workflow.get_all()``, each entry of
    ``raw_forms`` recognised by ``is_cloudinary_url`` is downloaded, stored in
    GridFS, and replaced in place (same list position) by a
    ``/api/workflows/<workflow_id>/files/<file_id>`` URL. After the workflow
    is saved successfully the Cloudinary asset is deleted on a best-effort
    basis.

    Failures are logged and counted, never raised. If the workflow cannot be
    saved, the in-memory ``raw_forms`` list is restored and the newly written
    GridFS file is removed, so a later save cannot persist a dangling URL.

    Returns:
        None. A summary line with migrated/error counts is logged.
    """
    # destroy() lives in the uploader submodule, which the module-level
    # imports (cloudinary, cloudinary.api) do not load explicitly.
    import cloudinary.uploader

    logger.info("Starting migration of Cloudinary files to GridFS")

    # Initialize database connection (called for its side effect only)
    Database.get_instance()
    fs = get_gridfs()

    # Get all workflows
    workflows = Workflow.get_all()
    logger.info(f"Found {len(workflows)} workflows to check")

    migration_count = 0
    error_count = 0

    for workflow in workflows:
        # De-duplicate while keeping order: a URL listed twice must only be
        # downloaded (and deleted from Cloudinary) once.
        cloudinary_forms = list(dict.fromkeys(
            form for form in workflow.raw_forms if is_cloudinary_url(form)
        ))

        if not cloudinary_forms:
            continue

        logger.info(f"Workflow {workflow._id} has {len(cloudinary_forms)} Cloudinary forms to migrate")

        for form_url in cloudinary_forms:
            file_id = None
            gridfs_url = None
            committed = False
            try:
                # Extract public_id from URL
                public_id = extract_cloudinary_public_id(form_url)
                logger.info(f"Migrating Cloudinary file {public_id} to GridFS")

                # Download straight from the stored delivery URL. An Admin API
                # lookup is unnecessary (the URL is already known) and would
                # fail for raw/foldered assets.
                response = requests.get(form_url, timeout=60)
                if response.status_code != 200:
                    logger.error(f"Failed to download file from Cloudinary: {response.status_code}")
                    error_count += 1
                    continue

                # Store in GridFS
                file_id = fs.put(
                    response.content,
                    filename=f"{public_id}.pdf",
                    content_type='application/pdf',
                    metadata={
                        'workflow_id': str(workflow._id),
                        'migrated_from_cloudinary': True,
                        'original_url': form_url
                    }
                )

                # Create a URL for retrieving the file
                gridfs_url = f"/api/workflows/{workflow._id}/files/{file_id}"

                # Swap the URL in place so the order of forms is preserved
                workflow.raw_forms[:] = [
                    gridfs_url if form == form_url else form
                    for form in workflow.raw_forms
                ]

                if workflow.save():
                    committed = True
                    logger.info(f"Successfully migrated {form_url} to {gridfs_url}")
                    migration_count += 1

                    # Best-effort cleanup of the Cloudinary copy
                    try:
                        result = cloudinary.uploader.destroy(public_id)
                        # NOTE(review): destroy() is expected to return a dict
                        # with a 'result' key ('ok' / 'not found') - confirm.
                        if isinstance(result, dict) and result.get('result') not in (None, 'ok'):
                            logger.warning(f"Could not delete {public_id} from Cloudinary: {result.get('result')}")
                        else:
                            logger.info(f"Deleted file {public_id} from Cloudinary")
                    except Exception as del_e:
                        logger.warning(f"Could not delete {public_id} from Cloudinary: {str(del_e)}")
                else:
                    logger.error(f"Failed to update workflow {workflow._id} with new GridFS URL")
                    error_count += 1

            except Exception as e:
                logger.error(f"Error migrating {form_url}: {str(e)}")
                error_count += 1

            finally:
                # Roll back anything that was done before a failed/raised save
                if file_id is not None and not committed:
                    if gridfs_url is not None:
                        workflow.raw_forms[:] = [
                            form_url if form == gridfs_url else form
                            for form in workflow.raw_forms
                        ]
                    try:
                        # Clean up the added GridFS file
                        fs.delete(file_id)
                    except Exception as cleanup_e:
                        logger.error(f"Failed to delete GridFS file {file_id} after failed migration: {str(cleanup_e)}")

    logger.info(f"Migration complete: {migration_count} files migrated, {error_count} errors")
124
+ logger.info(f"Migration complete: {migration_count} files migrated, {error_count} errors")
125
+
126
+ if __name__ == "__main__":
127
+ migrate_cloudinary_to_gridfs()
requirements.txt CHANGED
@@ -1,18 +1,18 @@
1
- Flask==2.2.3
2
- python-dotenv==1.0.0
3
- pymongo==4.5.0
4
- flask-cors==4.0.0
5
  bcrypt==4.0.1
6
  PyJWT==2.8.0
7
  gunicorn==21.2.0
8
- cloudinary==1.35.0
9
- openai==1.6.1
10
  pytesseract==0.3.10
11
- Pillow==10.1.0
12
  python-magic==0.4.27
13
  Flask-RESTful==0.3.10
14
  Werkzeug==2.2.3
15
- celery==5.3.4
16
  redis==5.0.1
17
- pdf2image==1.16.3
18
- requests==2.31.0
 
 
1
+ Flask==3.0.2
2
+ python-dotenv==1.0.1
3
+ pymongo==4.7.1
4
+ flask-cors==4.0.1
5
  bcrypt==4.0.1
6
  PyJWT==2.8.0
7
  gunicorn==21.2.0
8
+ openai==1.12.0
 
9
  pytesseract==0.3.10
10
+ Pillow==10.2.0
11
  python-magic==0.4.27
12
  Flask-RESTful==0.3.10
13
  Werkzeug==2.2.3
14
+ celery==5.3.6
15
  redis==5.0.1
16
+ pdf2image==1.17.0
17
+ requests==2.31.0
18
+ pytest==8.0.2
routes/log_routes.py CHANGED
@@ -1,12 +1,15 @@
1
- from flask import Blueprint
2
  from controllers.log_controller import (
3
  upload_log, get_log, delete_log, get_user_logs,
4
  get_department_logs, get_logs_by_date_range
5
  )
6
  from utils.auth import token_required, admin_required
 
 
 
7
 
8
  # Create blueprint
9
- log_bp = Blueprint('logs', __name__)
10
 
11
  # Routes that require authentication
12
  log_bp.route('/', methods=['POST'])(token_required(upload_log))
@@ -16,4 +19,37 @@ log_bp.route('/<log_id>', methods=['GET'])(token_required(get_log))
16
  log_bp.route('/<log_id>', methods=['DELETE'])(token_required(delete_log))
17
 
18
  # Routes that require admin permissions
19
- log_bp.route('/department', methods=['GET'])(admin_required(get_department_logs))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from flask import Blueprint, send_file, jsonify
2
  from controllers.log_controller import (
3
  upload_log, get_log, delete_log, get_user_logs,
4
  get_department_logs, get_logs_by_date_range
5
  )
6
  from utils.auth import token_required, admin_required
7
+ from db import get_gridfs
8
+ from bson.objectid import ObjectId
9
+ from io import BytesIO
10
 
11
  # Create blueprint
12
+ log_bp = Blueprint('log', __name__)
13
 
14
  # Routes that require authentication
15
  log_bp.route('/', methods=['POST'])(token_required(upload_log))
 
19
  log_bp.route('/<log_id>', methods=['DELETE'])(token_required(delete_log))
20
 
21
  # Routes that require admin permissions
22
+ log_bp.route('/department', methods=['GET'])(admin_required(get_department_logs))
23
+
24
@log_bp.route('/files/<file_id>', methods=['GET'])
@token_required
def get_log_file(current_user, file_id):
    """Serve a log PDF stored in GridFS.

    Args:
        current_user: Authenticated user injected by ``token_required``.
        file_id: String form of the GridFS file's ObjectId, taken from the URL.

    Returns:
        The PDF as an inline response, or a JSON error with:
        404 - ``file_id`` is malformed or no such file exists,
        403 - the file's ``department_id`` metadata differs from the user's,
        500 - any unexpected failure (details are logged, not returned).
    """
    # Imported locally so the handler does not rely on a module-level logger.
    from flask import current_app

    # A malformed id can never match a stored file; answer 404 instead of
    # letting ObjectId() raise and surface as a 500.
    if not ObjectId.is_valid(file_id):
        return jsonify({'message': 'File not found'}), 404

    try:
        # Get the file from GridFS
        fs = get_gridfs()
        object_id = ObjectId(file_id)

        # fs.get() raises for a missing file rather than returning a falsy
        # value, so existence has to be checked explicitly.
        if not fs.exists(object_id):
            return jsonify({'message': 'File not found'}), 404

        file_obj = fs.get(object_id)

        # Check if user has access to this file (belongs to their department).
        # NOTE(review): files without 'department_id' metadata are served to
        # any authenticated user - confirm this is intended.
        metadata = file_obj.metadata or {}
        if 'department_id' in metadata:
            file_department_id = metadata['department_id']
            if str(file_department_id) != str(current_user.department_id):
                return jsonify({'message': 'Access denied to files from other departments'}), 403

        # A fresh BytesIO is already positioned at offset 0
        return send_file(
            BytesIO(file_obj.read()),
            mimetype='application/pdf',
            as_attachment=False,
            download_name=file_obj.filename
        )

    except Exception:
        # Log the full traceback server-side; do not leak internals to clients
        current_app.logger.exception("Error retrieving file %s from GridFS", file_id)
        return jsonify({'message': 'Error retrieving file'}), 500