suvradeepp commited on
Commit
49e8d95
Β·
verified Β·
1 Parent(s): cea68af

Upload 34 files

Browse files
.gitattributes CHANGED
@@ -1,35 +1,11 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
1
+ # Git LFS Configuration for Hugging Face Hub
2
+ # Track large model files
3
+
4
+ *.pkl filter=lfs diff=lfs merge=lfs -text
5
+ *.pth filter=lfs diff=lfs merge=lfs -text
6
+ *.json filter=lfs diff=lfs merge=lfs -text
7
+ *.csv filter=lfs diff=lfs merge=lfs -text
8
+ *.png filter=lfs diff=lfs merge=lfs -text
9
+ *.jpg filter=lfs diff=lfs merge=lfs -text
10
+ *.jpeg filter=lfs diff=lfs merge=lfs -text
11
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Python
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+ *.so
6
+ .Python
7
+ env/
8
+ venv/
9
+ ENV/
10
+ build/
11
+ develop-eggs/
12
+ dist/
13
+ downloads/
14
+ eggs/
15
+ .eggs/
16
+ lib/
17
+ lib64/
18
+ parts/
19
+ sdist/
20
+ var/
21
+ wheels/
22
+ *.egg-info/
23
+ .installed.cfg
24
+ *.egg
25
+
26
+ # Virtual Environment
27
+ venv/
28
+ ENV/
29
+
30
+ # IDE
31
+ .vscode/
32
+ .idea/
33
+ *.swp
34
+ *.swo
35
+ *~
36
+
37
+ # OS
38
+ .DS_Store
39
+ Thumbs.db
40
+
41
+ # Streamlit
42
+ .streamlit/
43
+
44
+ # Logs
45
+ *.log
46
+
DEPLOYMENT_GUIDE.md ADDED
@@ -0,0 +1,233 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 🚀 Deployment Guide for Hugging Face Spaces
2
+
3
+ This guide will help you deploy the Crystallization Component Predictor to Hugging Face Spaces.
4
+
5
+ ## 📋 Prerequisites
6
+
7
+ 1. A Hugging Face account (sign up at https://huggingface.co/)
8
+ 2. Git installed on your computer
9
+ 3. Git LFS installed (`git lfs install`)
10
+
11
+ ## 🔧 Step-by-Step Deployment
12
+
13
+ ### Option 1: Web UI Upload (Easiest)
14
+
15
+ 1. **Create a new Space:**
16
+ - Go to https://huggingface.co/spaces
17
+ - Click "Create new Space"
18
+ - Choose a name (e.g., "crystallization-predictor")
19
+ - Select **Streamlit** as the SDK
20
+ - Choose visibility (Public or Private)
21
+ - Click "Create Space"
22
+
23
+ 2. **Upload files:**
24
+ - Click "Files" tab in your Space
25
+ - Click "Add file" → "Upload files"
26
+ - Drag and drop ALL files from this `huggingface_app` folder:
27
+ - `app.py`
28
+ - `requirements.txt`
29
+ - `README.md`
30
+ - `.gitattributes`
31
+ - `.gitignore`
32
+ - `models/` folder (with all subfolders)
33
+ - `visualizations/` folder (with all images)
34
+ - Click "Commit changes to main"
35
+
36
+ 3. **Wait for build:**
37
+ - Hugging Face will automatically build your Space
38
+ - Check the "Logs" tab to monitor progress
39
+ - Usually takes 2-5 minutes
40
+
41
+ 4. **Test your app:**
42
+ - Once built, click on the "App" tab
43
+ - Your Streamlit app should be running!
44
+
45
+ ### Option 2: Git Command Line (Advanced)
46
+
47
+ 1. **Initialize Git LFS:**
48
+ ```bash
49
+ # Run from the directory that CONTAINS huggingface_app/ (the clone in step 2 must sit next to it, not inside it)
50
+ git lfs install
51
+ ```
52
+
53
+ 2. **Clone your Space repository:**
54
+ ```bash
55
+ git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
56
+ cd YOUR_SPACE_NAME
57
+ ```
58
+
59
+ 3. **Copy files:**
60
+ ```bash
61
+ # Copy all files from huggingface_app to your cloned repo
62
+ # On Windows:
63
+ xcopy ..\huggingface_app\* . /E /H /Y
64
+
65
+ # On Linux/Mac:
66
+ cp -r ../huggingface_app/. .   # the trailing "/." also copies dotfiles (.gitattributes, .gitignore), which "*" skips
67
+ ```
68
+
69
+ 4. **Commit and push:**
70
+ ```bash
71
+ git add .
72
+ git commit -m "Initial deployment of crystallization predictor"
73
+ git push
74
+ ```
75
+
76
+ 5. **Check deployment:**
77
+ - Visit your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
78
+ - Monitor build logs in the "Logs" tab
79
+
80
+ ## 📁 Files Included
81
+
82
+ ```
83
+ huggingface_app/
84
+ ├── app.py # Main Streamlit application
85
+ ├── requirements.txt # Python dependencies
86
+ ├── README.md # Documentation (shown on Space page)
87
+ ├── .gitattributes # Git LFS configuration
88
+ ├── .gitignore # Files to ignore
89
+ ├── DEPLOYMENT_GUIDE.md # This file
90
+ ├── models/
91
+ │   ├── simple_baseline/
92
+ │   │   ├── model_component_name.pkl
93
+ │   │   ├── model_component_ph.pkl
94
+ │   │   ├── label_encoder_name.pkl
95
+ │   │   ├── scaler.pkl
96
+ │   │   ├── tfidf.pkl
97
+ │   │   └── training_results.json
98
+ │   └── advanced_baseline/
99
+ │       ├── model_component_name.pkl
100
+ │       ├── model_component_conc.pkl
101
+ │       ├── model_component_ph.pkl
102
+ │       ├── label_encoder_name.pkl
103
+ │       ├── scaler.pkl
104
+ │       ├── tfidf.pkl
105
+ │       └── training_results.json
106
+ └── visualizations/
107
+     ├── 01_component_name_comparison.png
108
+     ├── 02_component_conc_comparison.png
109
+     ├── 03_component_ph_comparison.png
110
+     ├── 04_all_approaches_heatmap.png
111
+     └── 05_complete_comparison.png
112
+ ```
113
+
114
+ ## 🔍 Troubleshooting
115
+
116
+ ### Build Fails
117
+
118
+ **Problem:** "Could not install packages due to an OSError"
119
+ - **Solution:** Check that all dependencies in `requirements.txt` are compatible
120
+ - Try pinning versions or using newer versions
121
+
122
+ **Problem:** "ModuleNotFoundError"
123
+ - **Solution:** Ensure the missing module is in `requirements.txt`
124
+
125
+ ### Model Loading Errors
126
+
127
+ **Problem:** "FileNotFoundError: [Errno 2] No such file or directory: 'models/...'"
128
+ - **Solution:** Verify all model files were uploaded correctly
129
+ - Check that folder structure is preserved
130
+
131
+ **Problem:** Large file upload fails
132
+ - **Solution:** Ensure Git LFS is properly configured
133
+ - Files over 10MB should use LFS (already configured in `.gitattributes`)
134
+
135
+ ### App Crashes
136
+
137
+ **Problem:** "Memory limit exceeded"
138
+ - **Solution:** Hugging Face Spaces have memory limits
139
+ - Consider using smaller models or optimizing loading
140
+
141
+ **Problem:** Slow loading
142
+ - **Solution:** Models are loaded on first prediction (not at startup)
143
+ - This is intentional for faster app startup
144
+
145
+ ## 🎨 Customization
146
+
147
+ ### Change App Title/Icon
148
+ Edit the `README.md` header:
149
+ ```yaml
150
+ ---
151
+ title: Your Custom Title
152
+ emoji: 🧬 # Change emoji
153
+ colorFrom: blue # Change colors
154
+ colorTo: purple
155
+ ---
156
+ ```
157
+
158
+ ### Modify the App
159
+ Edit `app.py` and commit changes. The Space will rebuild automatically.
160
+
161
+ ### Add More Models
162
+ 1. Add model files to `models/` folder
163
+ 2. Update `app.py` to load and use new models
164
+ 3. Update `README.md` to document changes
165
+
166
+ ## 📊 Monitoring
167
+
168
+ - **Logs**: Check the "Logs" tab in your Space
169
+ - **Analytics**: View usage statistics in Space settings
170
+ - **Updates**: Any push to the main branch triggers a rebuild
171
+
172
+ ## 🔒 Security & Privacy
173
+
174
+ - **Public Spaces**: Anyone can use your app and see the code
175
+ - **Private Spaces**: Only you and collaborators can access
176
+ - **No User Data**: The app doesn't collect or store user inputs
177
+ - **Model Files**: Ensure you have rights to distribute the models
178
+
179
+ ## 💰 Costs
180
+
181
+ - **Free Tier**:
182
+ - CPU: 2 vCPU, 16GB RAM
183
+ - Perfect for this app
184
+ - No credit card required
185
+
186
+ - **Paid Tiers**:
187
+ - Available for GPU or more resources
188
+ - Not needed for this application
189
+
190
+ ## 🔗 Useful Links
191
+
192
+ - Hugging Face Spaces Docs: https://huggingface.co/docs/hub/spaces
193
+ - Streamlit Docs: https://docs.streamlit.io/
194
+ - Git LFS: https://git-lfs.github.com/
195
+
196
+ ## 📞 Support
197
+
198
+ If you encounter issues:
199
+ 1. Check the "Logs" tab in your Space
200
+ 2. Review Hugging Face Spaces documentation
201
+ 3. Search Hugging Face forums
202
+ 4. Open an issue on the repository
203
+
204
+ ## ✅ Pre-Deployment Checklist
205
+
206
+ - [ ] All model files copied to `models/` folders
207
+ - [ ] Visualizations copied to `visualizations/` folder
208
+ - [ ] `requirements.txt` has all dependencies
209
+ - [ ] `README.md` header configured with title/emoji
210
+ - [ ] Tested app locally (`streamlit run app.py`)
211
+ - [ ] Git LFS installed and configured
212
+ - [ ] Hugging Face account created
213
+ - [ ] Space created on Hugging Face
214
+
215
+ ## 🎉 Post-Deployment
216
+
217
+ After successful deployment:
218
+ 1. Test all features in the live app
219
+ 2. Share your Space URL with others
220
+ 3. Monitor logs for any errors
221
+ 4. Consider adding:
222
+ - Example inputs/outputs
223
+ - Tutorial video
224
+ - Publication link
225
+ - Citation information
226
+
227
+ ---
228
+
229
+ **Good luck with your deployment! 🚀**
230
+
231
+ Your app will be accessible at:
232
+ `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
233
+
FILE_STRUCTURE.md ADDED
@@ -0,0 +1,333 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # 📁 Hugging Face Deployment - Complete File Structure
2
+
3
+ ## Overview
4
+ This folder contains everything needed to deploy the Crystallization Component Predictor to Hugging Face Spaces.
5
+
6
+ **Total Size:** ~46 MB
7
+ **Status:** ✅ Ready for deployment
8
+
9
+ ---
10
+
11
+ ## 📂 Directory Structure
12
+
13
+ ```
14
+ huggingface_app/
15
+ │
16
+ ├── 📄 Core Application Files
17
+ │   ├── app.py # Main Streamlit application (standalone)
18
+ │   ├── requirements.txt # Python dependencies for Hugging Face
19
+ │   └── README.md # Hugging Face Space documentation
20
+ │
21
+ ├── ⚙️ Configuration Files
22
+ │   ├── .gitattributes # Git LFS configuration for large files
23
+ │   └── .gitignore # Files to exclude from Git
24
+ │
25
+ ├── 📚 Documentation
26
+ │   ├── DEPLOYMENT_GUIDE.md # Step-by-step deployment instructions
27
+ │   ├── QUICKSTART.txt # Quick reference guide
28
+ │   └── FILE_STRUCTURE.md # This file
29
+ │
30
+ ├── 🔧 Utility Scripts
31
+ │   ├── verify_files.py # Verification script (check all files present)
32
+ │   ├── RUN_LOCAL.bat # Windows: Run app locally
33
+ │   └── run_local.sh # Linux/Mac: Run app locally
34
+ │
35
+ ├── 🤖 models/
36
+ │   │
37
+ │   ├── simple_baseline/ # Simple Baseline models
38
+ │   │   ├── model_component_name.pkl # Random Forest classifier (name)
39
+ │   │   ├── model_component_ph.pkl # XGBoost regressor (pH)
40
+ │   │   ├── label_encoder_name.pkl # Label encoder for component names
41
+ │   │   ├── scaler.pkl # StandardScaler for features
42
+ │   │   ├── tfidf.pkl # TF-IDF vectorizer for methods
43
+ │   │   └── training_results.json # Training metrics
44
+ │   │
45
+ │   └── advanced_baseline/ # Advanced Baseline models
46
+ │       ├── model_component_name.pkl # Ensemble classifier (name)
47
+ │       ├── model_component_conc.pkl # Ensemble regressor (concentration)
48
+ │       ├── model_component_ph.pkl # Ensemble regressor (pH)
49
+ │       ├── label_encoder_name.pkl # Label encoder for component names
50
+ │       ├── scaler.pkl # StandardScaler for features
51
+ │       ├── tfidf.pkl # TF-IDF vectorizer for methods
52
+ │       └── training_results.json # Training metrics
53
+ │
54
+ └── 📊 visualizations/ # Performance comparison charts
55
+     ├── 01_component_name_comparison.png
56
+     ├── 02_component_conc_comparison.png
57
+     ├── 03_component_ph_comparison.png
58
+     ├── 04_all_approaches_heatmap.png
59
+     ├── 05_complete_comparison.png
60
+     ├── eda_01_missing_values_matrix.png
61
+     ├── eda_02_missing_values_heatmap.png
62
+     ├── eda_03_target_distributions.png
63
+     ├── eda_04_feature_distributions.png
64
+     └── eda_05_correlation_matrix.png
65
+ ```
66
+
67
+ ---
68
+
69
+ ## 📋 File Descriptions
70
+
71
+ ### Core Application Files
72
+
73
+ #### `app.py` (Main Application)
74
+ - **Purpose:** Streamlit web application
75
+ - **Key Features:**
76
+ - Model selection (Simple vs Advanced Baseline)
77
+ - Interactive parameter input
78
+ - Real-time predictions
79
+ - Top-5 component predictions with probabilities
80
+ - Visual pH scale
81
+ - Downloadable results (CSV)
82
+ - Performance visualizations
83
+ - Model comparison charts
84
+ - **Dependencies:** All specified in `requirements.txt`
85
+ - **Entry Point:** Yes - Hugging Face will run this automatically
86
+
87
+ #### `requirements.txt`
88
+ - **Purpose:** Python package dependencies
89
+ - **Key Packages:**
90
+ - streamlit==1.29.0
91
+ - pandas==2.1.4
92
+ - numpy==1.26.2
93
+ - scikit-learn==1.3.2
94
+ - xgboost==2.0.3
95
+ - lightgbm==4.1.0
96
+ - catboost==1.2.2
97
+ - joblib==1.3.2
98
+ - **Note:** Versions pinned for reproducibility
99
+
100
+ #### `README.md`
101
+ - **Purpose:** Documentation displayed on Hugging Face Space page
102
+ - **Contains:**
103
+ - App description and features
104
+ - Model performance metrics
105
+ - Usage instructions
106
+ - Technical details
107
+ - Background information
108
+ - Acknowledgments
109
+ - **Special:** YAML header configures Space appearance
110
+
111
+ ---
112
+
113
+ ### Configuration Files
114
+
115
+ #### `.gitattributes`
116
+ - **Purpose:** Git LFS (Large File Storage) configuration
117
+ - **Tracks:**
118
+ - *.pkl (model files)
119
+ - *.pth (PyTorch models)
120
+ - *.json, *.csv (results and tabular data)
121
+ - *.png, *.jpg, *.jpeg (images)
122
+ - **Why:** Files >10MB need LFS on Hugging Face
123
+
124
+ #### `.gitignore`
125
+ - **Purpose:** Exclude unnecessary files from Git
126
+ - **Excludes:**
127
+ - Python cache (`__pycache__/`)
128
+ - Virtual environments
129
+ - IDE files
130
+ - OS files
131
+ - Logs
132
+
133
+ ---
134
+
135
+ ### Documentation Files
136
+
137
+ #### `DEPLOYMENT_GUIDE.md`
138
+ - **Purpose:** Complete deployment instructions
139
+ - **Sections:**
140
+ - Prerequisites
141
+ - Step-by-step deployment (Web UI & Git CLI)
142
+ - Troubleshooting
143
+ - Customization
144
+ - Monitoring
145
+ - Security & privacy
146
+
147
+ #### `QUICKSTART.txt`
148
+ - **Purpose:** Quick reference for common tasks
149
+ - **Format:** Plain text for easy viewing
150
+ - **Content:** Essential info at a glance
151
+
152
+ #### `FILE_STRUCTURE.md`
153
+ - **Purpose:** This document - complete file inventory
154
+
155
+ ---
156
+
157
+ ### Utility Scripts
158
+
159
+ #### `verify_files.py`
160
+ - **Purpose:** Pre-deployment verification
161
+ - **Checks:**
162
+ - All required files present
163
+ - Model files exist
164
+ - Folder structure correct
165
+ - Total size calculation
166
+ - **Usage:** `python verify_files.py`
167
+
168
+ #### `RUN_LOCAL.bat` (Windows)
169
+ - **Purpose:** Launch app locally for testing
170
+ - **Usage:** Double-click or run `RUN_LOCAL.bat`
171
+ - **Opens:** http://localhost:8501
172
+
173
+ #### `run_local.sh` (Linux/Mac)
174
+ - **Purpose:** Launch app locally for testing
175
+ - **Usage:** `bash run_local.sh`
176
+ - **Opens:** http://localhost:8501
177
+
178
+ ---
179
+
180
+ ### Model Files
181
+
182
+ #### Simple Baseline Models (6 files)
183
+ **Performance:**
184
+ - Name Accuracy: 61.12%
185
+ - pH R²: 95.58%
186
+ - Concentration: N/A
187
+
188
+ **Files:**
189
+ 1. `model_component_name.pkl` - Random Forest classifier
190
+ 2. `model_component_ph.pkl` - XGBoost regressor
191
+ 3. `label_encoder_name.pkl` - Encode component names
192
+ 4. `scaler.pkl` - Feature normalization
193
+ 5. `tfidf.pkl` - Text vectorization
194
+ 6. `training_results.json` - Performance metrics
195
+
196
+ #### Advanced Baseline Models (7 files)
197
+ **Performance:**
198
+ - Name Accuracy: 64.18% ⭐
199
+ - Concentration R²: 47.33%
200
+ - pH R²: 99.34% ⭐
201
+
202
+ **Files:**
203
+ 1. `model_component_name.pkl` - Ensemble (RF + XGB + LGB + Cat)
204
+ 2. `model_component_conc.pkl` - Ensemble concentration regressor
205
+ 3. `model_component_ph.pkl` - Ensemble pH regressor
206
+ 4. `label_encoder_name.pkl` - Encode component names
207
+ 5. `scaler.pkl` - Feature normalization
208
+ 6. `tfidf.pkl` - Text vectorization
209
+ 7. `training_results.json` - Performance metrics
210
+
211
+ ---
212
+
213
+ ### Visualization Files (10 images)
214
+
215
+ #### Model Comparison Charts
216
+ - `01_component_name_comparison.png` - Name accuracy comparison
217
+ - `02_component_conc_comparison.png` - Concentration R² comparison
218
+ - `03_component_ph_comparison.png` - pH R² comparison
219
+ - `04_all_approaches_heatmap.png` - Performance heatmap
220
+ - `05_complete_comparison.png` - Comprehensive comparison
221
+
222
+ #### EDA Visualizations
223
+ - `eda_01_missing_values_matrix.png` - Missing data patterns
224
+ - `eda_02_missing_values_heatmap.png` - Missing data heatmap
225
+ - `eda_03_target_distributions.png` - Target variable distributions
226
+ - `eda_04_feature_distributions.png` - Feature distributions
227
+ - `eda_05_correlation_matrix.png` - Feature correlations
228
+
229
+ ---
230
+
231
+ ## 🚀 Deployment Checklist
232
+
233
+ Before deploying to Hugging Face:
234
+
235
+ - [x] ✅ All core files present (app.py, requirements.txt, README.md)
236
+ - [x] ✅ Configuration files (.gitattributes, .gitignore)
237
+ - [x] ✅ Simple Baseline models (6 files)
238
+ - [x] ✅ Advanced Baseline models (7 files)
239
+ - [x] ✅ Visualizations (10 images)
240
+ - [x] ✅ Documentation complete
241
+ - [x] ✅ Verification script passes
242
+ - [x] ✅ Total size: 46.47 MB (within limits)
243
+ - [ ] ⏳ Test locally (run `streamlit run app.py`)
244
+ - [ ] ⏳ Deploy to Hugging Face
245
+ - [ ] ⏳ Test live deployment
246
+
247
+ ---
248
+
249
+ ## 💡 Key Features
250
+
251
+ ### What Makes This Deployment Special
252
+
253
+ 1. **Self-Contained**: No external dependencies or file paths
254
+ 2. **Production-Ready**: All error handling included
255
+ 3. **User-Friendly**: Beautiful UI with helpful tooltips
256
+ 4. **Well-Documented**: Comprehensive README and guides
257
+ 5. **Verified**: Includes verification script
258
+ 6. **Git LFS Ready**: Configured for large model files
259
+ 7. **Cross-Platform**: Works on Windows, Linux, Mac
260
+
261
+ ### App Capabilities
262
+
263
+ - ✅ Two model options (Simple & Advanced)
264
+ - ✅ Interactive parameter input
265
+ - ✅ Real-time predictions
266
+ - ✅ Top-5 component suggestions
267
+ - ✅ Confidence scores
268
+ - ✅ Visual pH scale
269
+ - ✅ Downloadable CSV results
270
+ - ✅ Performance visualizations
271
+ - ✅ Model comparison tables
272
+ - ✅ Responsive design
273
+
274
+ ---
275
+
276
+ ## 📊 Statistics
277
+
278
+ | Metric | Value |
279
+ |--------|-------|
280
+ | Total Files | 34 |
281
+ | Python Scripts | 2 |
282
+ | Model Files | 13 |
283
+ | Images | 10 |
284
+ | Documentation | 5 |
285
+ | Total Size | 46.47 MB |
286
+ | Largest File | model_component_name.pkl (~8 MB each) |
287
+
288
+ ---
289
+
290
+ ## 🔗 Next Steps
291
+
292
+ 1. **Test Locally:**
293
+ ```bash
294
+ streamlit run app.py
295
+ ```
296
+
297
+ 2. **Verify Files:**
298
+ ```bash
299
+ python verify_files.py
300
+ ```
301
+
302
+ 3. **Deploy to Hugging Face:**
303
+ - Follow `DEPLOYMENT_GUIDE.md`
304
+ - Or see `QUICKSTART.txt` for quick steps
305
+
306
+ 4. **Share Your Space:**
307
+ - URL: `https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME`
308
+
309
+ ---
310
+
311
+ ## ⚠️ Important Notes
312
+
313
+ - All paths in `app.py` are relative to the script location
314
+ - Models load on first prediction (not at startup)
315
+ - Git LFS is required for files >10MB
316
+ - Free tier on Hugging Face is sufficient
317
+ - No API keys or secrets required
318
+
319
+ ---
320
+
321
+ ## 📞 Support
322
+
323
+ - **Deployment Issues:** See `DEPLOYMENT_GUIDE.md`
324
+ - **File Issues:** Run `verify_files.py`
325
+ - **App Issues:** Check `app.py` comments
326
+ - **Hugging Face Help:** https://huggingface.co/docs/hub/spaces
327
+
328
+ ---
329
+
330
+ **Status:** ✅ **READY FOR DEPLOYMENT**
331
+
332
+ This folder is complete and ready to be uploaded to Hugging Face Spaces!
333
+
QUICKSTART.txt ADDED
@@ -0,0 +1,103 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ╔══════════════════════════════════════════════════════════════════╗
2
+ ║ CRYSTALLIZATION COMPONENT PREDICTOR - QUICK START GUIDE ║
3
+ ╚══════════════════════════════════════════════════════════════════╝
4
+
5
+ πŸ“ FOLDER CONTENTS:
6
+ ==================
7
+ βœ… All files ready for Hugging Face deployment!
8
+
9
+ 📦 WHAT'S INCLUDED:
10
+ ===================
11
+ • app.py - Main Streamlit application
12
+ • requirements.txt - Python dependencies
13
+ • README.md - Documentation for Hugging Face
14
+ • models/ - All trained ML models (Simple & Advanced Baseline)
15
+ • visualizations/ - Performance comparison charts
16
+ • .gitattributes - Git LFS configuration
17
+ • DEPLOYMENT_GUIDE.md - Detailed deployment instructions
18
+
19
+ 📊 FOLDER SIZE: ~46 MB
20
+
21
+ 🚀 DEPLOY TO HUGGING FACE:
22
+ ===========================
23
+
24
+ OPTION 1: Web Upload (Easiest)
25
+ -------------------------------
26
+ 1. Go to https://huggingface.co/spaces
27
+ 2. Click "Create new Space"
28
+ 3. Choose name, select "Streamlit" as SDK
29
+ 4. Upload ALL files from this folder
30
+ 5. Wait 2-5 minutes for build
31
+ 6. Done! Your app is live
32
+
33
+ OPTION 2: Git Command Line
34
+ ---------------------------
35
+ 1. git clone https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME
36
+ 2. Copy all files to cloned folder
37
+ 3. git add .
38
+ 4. git commit -m "Deploy crystallization predictor"
39
+ 5. git push
40
+
41
+ 📖 See DEPLOYMENT_GUIDE.md for detailed instructions!
42
+
43
+ 🧪 TEST LOCALLY FIRST:
44
+ =======================
45
+ Windows: Double-click RUN_LOCAL.bat
46
+ Linux/Mac: bash run_local.sh
47
+ OR: streamlit run app.py
48
+
49
+ Then open: http://localhost:8501
50
+
51
+ ✅ VERIFICATION:
52
+ ================
53
+ Run: python verify_files.py
54
+ All files present: ✅
55
+
56
+ 🎯 WHAT THE APP DOES:
57
+ ======================
58
+ Predicts optimal crystallization components:
59
+ • Component Name (chemical compound)
60
+ • Concentration (molarity)
61
+ • pH (acidity level)
62
+
63
+ Based on your input parameters:
64
+ • Crystallization method
65
+ • Temperature
66
+ • pH
67
+ • Matthews coefficient
68
+ • Solvent content
69
+
70
+ 📈 MODEL PERFORMANCE:
71
+ ======================
72
+ Advanced Baseline (Recommended):
73
+ • Name Accuracy: 64.18%
74
+ • Concentration R²: 47.33%
75
+ • pH R²: 99.34%
76
+
77
+ Simple Baseline:
78
+ • Name Accuracy: 61.12%
79
+ • pH R²: 95.58%
80
+ • No concentration prediction
81
+
82
+ ⚡ QUICK TIPS:
83
+ ==============
84
+ ✓ Use Advanced Baseline for complete predictions
85
+ ✓ Test locally before deploying
86
+ ✓ All dependencies are in requirements.txt
87
+ ✓ Git LFS is configured for large files
88
+ ✓ Models load on first prediction (intentional)
89
+ ✓ Free tier on Hugging Face is sufficient
90
+
91
+ 📞 NEED HELP?
92
+ =============
93
+ • Check DEPLOYMENT_GUIDE.md
94
+ • Visit https://huggingface.co/docs/hub/spaces
95
+ • Review app.py comments
96
+
97
+ 🎉 READY TO DEPLOY!
98
+ ===================
99
+ Your Space URL will be:
100
+ https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
101
+
102
+ Good luck! 🚀🔬
103
+
README.md CHANGED
@@ -1,19 +1,143 @@
1
- ---
2
- title: BTP 2026
3
- emoji: πŸš€
4
- colorFrom: red
5
- colorTo: red
6
- sdk: docker
7
- app_port: 8501
8
- tags:
9
- - streamlit
10
- pinned: false
11
- short_description: Streamlit template space
12
- ---
13
-
14
- # Welcome to Streamlit!
15
-
16
- Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
17
-
18
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
19
- forums](https://discuss.streamlit.io).
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Crystallization Component Predictor
3
+ emoji: 🔬
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: streamlit
7
+ sdk_version: 1.29.0
8
+ app_file: app.py
9
+ pinned: false
10
+ license: mit
11
+ ---
12
+
13
+ # 🔬 Crystallization Component Predictor
14
+
15
+ An interactive machine learning application for predicting optimal protein crystallization components based on experimental parameters.
16
+
17
+ ## 🎯 What Does This App Do?
18
+
19
+ This tool predicts three critical crystallization parameters:
20
+ 1. **Component Name**: The chemical compound most likely to produce crystals
21
+ 2. **Concentration**: The optimal molarity for the component
22
+ 3. **pH**: The ideal acidity/basicity level for crystallization
23
+
24
+ ## 🚀 Quick Start
25
+
26
+ 1. Select a model (Advanced Baseline recommended)
27
+ 2. Input your crystallization parameters:
28
+ - Crystallization method
29
+ - Temperature
30
+ - pH
31
+ - Matthews coefficient
32
+ - Solvent content
33
+ 3. Click "Predict Components"
34
+ 4. Review predictions and download results
35
+
36
+ ## 📊 Model Performance
37
+
38
+ | Model | Name Accuracy | Conc R² | pH R² |
39
+ |-------|--------------|---------|-------|
40
+ | Simple Baseline | 61.12% | N/A | 95.58% |
41
+ | **Advanced Baseline** ⭐ | **64.18%** | **47.33%** | **99.34%** |
42
+ | Transformer | 53.85% | 18.72% | 99.27% |
43
+
44
+ **Recommended:** Advanced Baseline for best overall performance
45
+
46
+ ## 🔬 Features
47
+
48
+ - **Two Model Approaches**: Choose between Simple and Advanced Baseline
49
+ - **Interactive UI**: Easy-to-use sliders and dropdowns
50
+ - **Top-5 Predictions**: View confidence scores for multiple candidates
51
+ - **Visual pH Scale**: Intuitive pH visualization
52
+ - **Downloadable Results**: Export predictions as CSV
53
+ - **Performance Charts**: Compare model accuracies
54
+
55
+ ## 🛠️ Technical Details
56
+
57
+ ### Simple Baseline
58
+ - Random Forest for component classification
59
+ - XGBoost for pH regression
60
+ - 4 numerical features + TF-IDF of crystallization method
61
+
62
+ ### Advanced Baseline (Recommended)
63
+ - Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
64
+ - 8 engineered features including interaction terms
65
+ - Separate models for name, concentration, and pH
66
+ - Log-transformed concentration predictions
67
+
68
+ ### Models Included
69
+ - `simple_baseline/`: Simple baseline models
70
+ - `model_component_name.pkl`: Component classifier
71
+ - `model_component_ph.pkl`: pH regressor
72
+ - `label_encoder_name.pkl`: Label encoder
73
+ - `scaler.pkl`: Feature scaler
74
+ - `tfidf.pkl`: TF-IDF vectorizer
75
+
76
+ - `advanced_baseline/`: Advanced baseline models
77
+ - `model_component_name.pkl`: Enhanced component classifier
78
+ - `model_component_conc.pkl`: Concentration regressor
79
+ - `model_component_ph.pkl`: Enhanced pH regressor
80
+ - `label_encoder_name.pkl`: Label encoder
81
+ - `scaler.pkl`: Feature scaler
82
+ - `tfidf.pkl`: TF-IDF vectorizer
83
+
84
+ ## 📦 Dependencies
85
+
86
+ - Python 3.9+
87
+ - Streamlit
88
+ - Scikit-learn
89
+ - XGBoost
90
+ - LightGBM
91
+ - CatBoost
92
+ - Pandas
93
+ - NumPy
94
+ - Joblib
95
+
96
+ ## 🎓 Use Cases
97
+
98
+ - **Structural Biology**: Plan crystallization experiments
99
+ - **Drug Discovery**: Optimize protein crystal conditions
100
+ - **Research**: Explore crystallization parameter space
101
+ - **Education**: Learn about protein crystallization
102
+
103
+ ## 📖 Background
104
+
105
+ Protein crystallization is essential for determining 3D protein structures via X-ray crystallography. This tool uses machine learning trained on historical crystallization data from the Protein Data Bank (PDB) to predict optimal conditions.
106
+
107
+ ### Input Parameters Explained
108
+
109
+ - **Crystallization Method**: Technique used (vapor diffusion, batch, etc.)
110
+ - **Temperature**: Affects protein stability and crystal growth (typically 277-298K)
111
+ - **pH**: Critical for protein solubility and crystal formation (0-14 scale)
112
+ - **Matthews Coefficient**: Unit cell volume to protein molecular weight ratio (Å³/Da)
113
+ - **Solvent Content**: Percentage of solvent in crystal lattice (typically 30-70%)
114
+
115
+ ## ⚠️ Important Notes
116
+
117
+ - **Validation Required**: Always validate predictions experimentally
118
+ - **Research Tool**: For research and educational purposes
119
+ - **Starting Point**: Use predictions as a guide, not absolute truth
120
+ - **Protein-Specific**: Results may vary based on your specific protein
121
+
122
+ ## 🤝 Contributing
123
+
124
+ This is a research project. Feedback and suggestions are welcome!
125
+
126
+ ## 📄 License
127
+
128
+ MIT License - free to use, modify, and redistribute, including for research and educational purposes
129
+
130
+ ## 🙏 Acknowledgments
131
+
132
+ - Training data derived from Protein Data Bank (PDB)
133
+ - Built with Streamlit and ensemble ML models
134
+ - Inspired by advances in computational structural biology
135
+
136
+ ## 📞 Contact & Support
137
+
138
+ For questions or issues, please open an issue on the repository.
139
+
140
+ ---
141
+
142
+ **Note**: This tool provides predictions based on historical data. Always conduct proper experimental validation. Crystallization is a complex process influenced by many factors not captured by these models alone.
143
+
RUN_LOCAL.bat ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
@echo off
rem Launch the Crystallization Predictor Streamlit app locally (Windows).
rem
rem The script switches to its own directory first so that "app.py" is found
rem regardless of the directory the batch file was started from, and it
rem verifies that the streamlit launcher is on PATH before trying to run it.
setlocal
cd /d "%~dp0"

where streamlit >nul 2>nul
if errorlevel 1 (
    echo ERROR: streamlit was not found on PATH.
    echo Install the dependencies first: pip install -r requirements.txt
    pause
    exit /b 1
)

echo ========================================
echo  Crystallization Predictor - Local Run
echo ========================================
echo.
echo Starting Streamlit app...
echo Press Ctrl+C to stop
echo.
echo App will open in your browser at:
echo http://localhost:8501
echo.
echo ----------------------------------------

streamlit run app.py

rem Keep the console window open so any error output stays readable.
pause
17
+
app.py ADDED
@@ -0,0 +1,479 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Interactive Crystallization Component Predictor
3
+ ===============================================
4
+ Streamlit app for Hugging Face Hub deployment
5
+ Predicts crystallization components using Simple Baseline and Advanced Baseline models
6
+ """
7
+
8
+ import streamlit as st
9
+ import pandas as pd
10
+ import numpy as np
11
+ import joblib
12
+ import json
13
+ import os
14
+ import warnings
15
+
16
+ warnings.filterwarnings('ignore')
17
+
18
+ # Page config
19
+ st.set_page_config(
20
+ page_title="Crystallization Predictor",
21
+ page_icon="πŸ”¬",
22
+ layout="wide",
23
+ initial_sidebar_state="expanded"
24
+ )
25
+
26
+ # Get the directory of this script
27
+ BASE_DIR = os.path.dirname(os.path.abspath(__file__))
28
+
29
+ # Title and Introduction
30
+ st.title("πŸ”¬ Crystallization Component Predictor")
31
+ st.markdown("""
32
+ ### Predict crystallization components using Machine Learning
33
+ This app uses trained machine learning models to predict the optimal components for protein crystallization
34
+ based on your experimental parameters.
35
+ """)
36
+ st.markdown("---")
37
+
38
+ # Sidebar
39
+ st.sidebar.header("βš™οΈ Model Selection")
40
+ approach = st.sidebar.radio(
41
+ "Choose Approach:",
42
+ ["Advanced Baseline (Recommended)", "Simple Baseline"],
43
+ help="Advanced has concentration parsing and better accuracy"
44
+ )
45
+
46
+ st.sidebar.markdown("---")
47
+ st.sidebar.markdown("### πŸ“Š Model Performance")
48
+
49
# Display performance metrics
#
# The figures shown here are fixed benchmark numbers for the two bundled
# models; nothing below reads them from disk.  They are therefore rendered
# unconditionally: previously two training_results.json files were parsed
# (and the parsed data discarded) inside a broad try/except, so a missing or
# malformed JSON file could suppress the metrics entirely.
if "Simple" in approach:
    st.sidebar.metric("Name Accuracy", "61.12%")
    st.sidebar.metric("pH R²", "95.58%")
    # The Simple Baseline has no concentration model.
    st.sidebar.warning("⚠️ Conc: N/A")
else:
    st.sidebar.metric("Name Accuracy", "64.18%")
    st.sidebar.metric("Conc R²", "47.33%")
    st.sidebar.metric("pH R²", "99.34%")
    st.sidebar.success("✅ All metrics working!")
73
+
74
+ st.sidebar.markdown("---")
75
+ st.sidebar.markdown("""
76
+ ### ℹ️ About
77
+ This tool predicts three key crystallization parameters:
78
+ - **Component Name**: The chemical compound
79
+ - **Concentration**: Amount in solution (M)
80
+ - **pH**: Acidity/basicity level
81
+
82
+ **Recommended:** Advanced Baseline for complete predictions
83
+ """)
84
+
85
+ # Input Form
86
+ st.header("🎯 Input Crystallization Parameters")
87
+
88
+ col1, col2 = st.columns(2)
89
+
90
+ with col1:
91
+ st.markdown("#### Crystallization Setup")
92
+ cryst_method = st.selectbox(
93
+ "Crystallization Method",
94
+ [
95
+ "VAPOR DIFFUSION, SITTING DROP",
96
+ "VAPOR DIFFUSION, HANGING DROP",
97
+ "VAPOR DIFFUSION",
98
+ "BATCH MODE",
99
+ "MICROBATCH"
100
+ ],
101
+ help="Select the crystallization technique you're using"
102
+ )
103
+
104
+ temp = st.slider(
105
+ "Temperature (K)",
106
+ 250.0, 320.0, 293.0, 1.0,
107
+ help="Typical room temperature is ~293K (20Β°C)"
108
+ )
109
+
110
+ ph = st.slider(
111
+ "pH",
112
+ 0.0, 14.0, 7.0, 0.1,
113
+ help="Initial pH of your crystallization solution"
114
+ )
115
+
116
+ with col2:
117
+ st.markdown("#### Crystal Properties")
118
+ matthews = st.slider(
119
+ "Matthews Coefficient",
120
+ 1.0, 4.5, 2.2, 0.1,
121
+ help="Ratio of unit cell volume to protein molecular weight (Γ…Β³/Da)"
122
+ )
123
+
124
+ solvent = st.slider(
125
+ "Percent Solvent Content (%)",
126
+ 0.0, 100.0, 45.0, 1.0,
127
+ help="Percentage of solvent in the crystal"
128
+ )
129
+
130
+ st.markdown("---")
131
+
132
# Predict button


@st.cache_resource(show_spinner=False)
def _load_artifacts(model_dir, with_concentration):
    """Load the pickled model artifacts stored in ``model_dir``.

    The result is cached by Streamlit, so the (multi-megabyte) pickles are
    read from disk once per process instead of on every button click.

    Args:
        model_dir: Directory containing the ``*.pkl`` files of one approach.
        with_concentration: Also load ``model_component_conc.pkl`` (only the
            Advanced Baseline ships a concentration model).

    Returns:
        dict with keys ``model_name``, ``model_ph``, ``label_encoder``,
        ``scaler``, ``tfidf`` and, when requested, ``model_conc``.

    Raises:
        FileNotFoundError: If any expected artifact is missing.
    """
    def _load(filename):
        return joblib.load(os.path.join(model_dir, filename))

    artifacts = {
        'model_name': _load('model_component_name.pkl'),
        'model_ph': _load('model_component_ph.pkl'),
        'label_encoder': _load('label_encoder_name.pkl'),
        'scaler': _load('scaler.pkl'),
        'tfidf': _load('tfidf.pkl'),
    }
    if with_concentration:
        artifacts['model_conc'] = _load('model_component_conc.pkl')
    return artifacts


if st.button("🚀 Predict Components", type="primary", use_container_width=True):

    try:
        with st.spinner("🔄 Loading models and making predictions..."):
            use_advanced = "Advanced" in approach
            model_dir = os.path.join(
                BASE_DIR, 'models',
                'advanced_baseline' if use_advanced else 'simple_baseline'
            )
            artifacts = _load_artifacts(model_dir, use_advanced)
            model_name = artifacts['model_name']
            model_ph = artifacts['model_ph']
            le = artifacts['label_encoder']
            scaler = artifacts['scaler']
            tfidf = artifacts['tfidf']

            if use_advanced:
                # Feature engineering (Advanced Baseline needs 8 features)
                temp_ph_int = temp * ph
                matthews_solvent_int = matthews * solvent
                ph_diff = 0  # Unknown for new prediction
                solvent_ratio = solvent / (matthews + 1e-6)

                numerical = np.array([[temp, ph, matthews, solvent,
                                       temp_ph_int, matthews_solvent_int,
                                       ph_diff, solvent_ratio]])
            else:
                # Simple baseline: only 4 features
                numerical = np.array([[temp, ph, matthews, solvent]])

            # Scale numerical features
            numerical_scaled = scaler.transform(numerical)

            # TF-IDF for crystallization method
            method_tfidf = tfidf.transform([cryst_method.upper()]).toarray()

            # Combine features
            X_pred = np.concatenate([numerical_scaled, method_tfidf], axis=1)

            # Make predictions
            pred_name_idx = model_name.predict(X_pred)[0]
            pred_name = le.inverse_transform([pred_name_idx])[0]
            pred_name_proba = model_name.predict_proba(X_pred)[0]

            # Rank at most five classes; a classifier with fewer classes
            # must not break the results table.
            top_k = min(5, len(pred_name_proba))
            top_cols = np.argsort(pred_name_proba)[-top_k:][::-1]

            # predict_proba columns follow the classifier's ``classes_``
            # order, which only coincides with the encoded label values when
            # every encoded class was seen during training.  Map columns to
            # labels explicitly and fall back to the positional assumption
            # only when ``classes_`` is unavailable or inconsistent.
            class_labels = getattr(model_name, 'classes_', None)
            if class_labels is None or len(class_labels) != len(pred_name_proba):
                class_labels = np.arange(len(pred_name_proba))
            top_names = le.inverse_transform(np.asarray(class_labels)[top_cols])
            top_proba = pred_name_proba[top_cols]

            pred_ph = model_ph.predict(X_pred)[0]

            pred_conc = None
            actual_molarity = None
            if use_advanced:
                pred_conc = artifacts['model_conc'].predict(X_pred)[0]
                # NOTE(review): assumes the concentration target was
                # log10-transformed during training — confirm against the
                # training pipeline.
                actual_molarity = 10 ** pred_conc

        # Display Results
        st.success("✅ Predictions Complete!")
        st.markdown("---")

        st.header("📊 Prediction Results")

        # Component Name
        st.subheader("1️⃣ Component_1_Name")
        st.markdown("**Most likely chemical component for crystallization:**")

        col1, col2 = st.columns([1, 2])

        with col1:
            st.metric("Predicted Component", pred_name)
            st.caption("Top prediction from the model")

        with col2:
            st.markdown("**Top 5 Predictions (with confidence):**")
            top5_df = pd.DataFrame({
                'Rank': range(1, top_k + 1),
                'Component': top_names,
                'Probability': [f"{p:.2%}" for p in top_proba]
            })
            st.dataframe(top5_df, hide_index=True, use_container_width=True)

        st.markdown("---")

        # Concentration
        st.subheader("2️⃣ Component_1_Conc")
        if use_advanced:
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Predicted Concentration (log-scale)", f"{pred_conc:.4f}")
            with col2:
                st.metric("Actual Molarity", f"{actual_molarity:.6f} M")

            st.info(f"💡 Use approximately **{actual_molarity:.6f} M** of {pred_name} in your crystallization trials")
        else:
            st.warning("⚠️ Not available in Simple Baseline - use Advanced Baseline for concentration predictions")

        st.markdown("---")

        # pH
        st.subheader("3️⃣ Component_1_pH")
        col1, col2 = st.columns([1, 2])

        # pH classification (shared by the caption and the coloured label)
        if pred_ph < 6:
            ph_class, ph_emoji, ph_color = "Acidic", "🔴", "red"
        elif pred_ph < 8:
            ph_class, ph_emoji, ph_color = "Neutral", "🟢", "green"
        else:
            ph_class, ph_emoji, ph_color = "Basic", "🔵", "blue"

        with col1:
            st.metric("Predicted pH", f"{pred_ph:.2f}")
            st.caption(f"{ph_emoji} {ph_class} solution")

        with col2:
            # pH visualization
            st.markdown(f"""
            <div style='background: linear-gradient(to right, red, yellow, green, cyan, blue);
            height: 40px; border-radius: 10px; margin: 10px 0; border: 2px solid #333;'></div>
            <div style='display: flex; justify-content: space-between; font-size: 14px;'>
            <span><b>0</b> (Acidic)</span>
            <span><b>7</b> (Neutral)</span>
            <span><b>14</b> (Basic)</span>
            </div>
            <div style='text-align: center; margin-top: 15px;'>
            <b style='font-size: 24px; color: {ph_color};'>pH = {pred_ph:.2f}</b>
            </div>
            """, unsafe_allow_html=True)

        st.info(f"💡 Adjust your buffer to maintain pH ≈ **{pred_ph:.2f}** for optimal crystallization")

        # Input Summary
        st.markdown("---")
        st.subheader("📥 Input Summary")
        input_df = pd.DataFrame({
            'Parameter': [
                'Crystallization Method',
                'Temperature',
                'Input pH',
                'Matthews Coefficient',
                'Solvent Content'
            ],
            'Value': [
                cryst_method,
                f"{temp:.1f} K ({temp-273.15:.1f}°C)",
                f"{ph:.1f}",
                f"{matthews:.2f} Å³/Da",
                f"{solvent:.1f}%"
            ]
        })
        st.table(input_df)

        # Download Results
        st.markdown("---")
        st.subheader("💾 Download Results")

        results_dict = {
            'Crystallization Method': cryst_method,
            'Temperature (K)': temp,
            'Temperature (°C)': temp - 273.15,
            'Input pH': ph,
            'Matthews Coefficient': matthews,
            'Solvent Content (%)': solvent,
            'Predicted Component': pred_name,
            'Component Probability': f"{top_proba[0]:.4f}",
            'Predicted pH': f"{pred_ph:.2f}",
        }

        if use_advanced:
            results_dict['Predicted Concentration (log)'] = f"{pred_conc:.4f}"
            results_dict['Predicted Concentration (M)'] = f"{actual_molarity:.6f}"

        results_df = pd.DataFrame([results_dict])
        csv = results_df.to_csv(index=False)

        st.download_button(
            label="📥 Download Predictions as CSV",
            data=csv,
            file_name="crystallization_predictions.csv",
            mime="text/csv",
        )

    except FileNotFoundError as e:
        st.error(
            "❌ **Model files not found!**\n\n"
            f"Error: {e}\n\n"
            "Please ensure model files are in the correct directory:\n"
            "- `models/simple_baseline/`\n"
            "- `models/advanced_baseline/`"
        )
    except Exception as e:
        # Surface the failure in the UI instead of crashing the app; the full
        # traceback stays available behind an expander for debugging.
        st.error(f"❌ **Prediction Error:** {e}")
        with st.expander("🔍 Show full error details"):
            import traceback
            st.code(traceback.format_exc())
338
+
339
+ # Model Comparison Section
340
+ st.markdown("---")
341
+ st.header("πŸ“ˆ Model Comparison")
342
+
343
+ comparison_df = pd.DataFrame({
344
+ 'Model': ['Simple Baseline', 'Advanced Baseline', 'Transformer'],
345
+ 'Name Accuracy': ['61.12%', '64.18% ⭐', '53.85%'],
346
+ 'Conc RΒ²': ['N/A', '47.33%', '18.72%'],
347
+ 'pH R²': ['95.58%', '99.34% ⭐', '99.27%'],
348
+ 'Speed': ['⚑ Fast', '⚑ Fast', '🐌 Slow'],
349
+ 'Recommendation': ['Basic use', 'βœ… Best overall', 'Research only']
350
+ })
351
+
352
+ st.dataframe(
353
+ comparison_df,
354
+ hide_index=True,
355
+ use_container_width=True,
356
+ column_config={
357
+ "Model": st.column_config.TextColumn("Model", width="medium"),
358
+ "Name Accuracy": st.column_config.TextColumn("Name Accuracy", width="medium"),
359
+ "Conc RΒ²": st.column_config.TextColumn("Concentration RΒ²", width="medium"),
360
+ "pH RΒ²": st.column_config.TextColumn("pH RΒ²", width="medium"),
361
+ }
362
+ )
363
+
364
+ st.markdown("""
365
+ **Model Selection Guide:**
366
+ - **Simple Baseline**: Fast predictions, no concentration. Good for quick pH and component estimates.
367
+ - **Advanced Baseline**: ⭐ Recommended for most users. Includes all three predictions with high accuracy.
368
+ - **Transformer**: Deep learning approach, requires more data for better performance.
369
+ """)
370
+
371
# Visualizations Section
st.markdown("---")
st.header("📊 Performance Visualizations")

viz_path = os.path.join(BASE_DIR, 'visualizations')

# One entry per tab: (tab label, image file inside viz_path, caption).
_VIZ_TABS = [
    ("📊 Name Accuracy",
     '01_component_name_comparison.png',
     "Comparison of component name prediction accuracy across all models"),
    ("📈 Concentration R²",
     '02_component_conc_comparison.png',
     "Concentration prediction performance (R² scores)"),
    ("🧪 pH R²",
     '03_component_ph_comparison.png',
     "pH prediction performance (R² scores)"),
    ("🎯 Complete Comparison",
     '05_complete_comparison.png',
     "Comprehensive comparison of all approaches and metrics"),
]

if os.path.isdir(viz_path):
    try:
        tabs = st.tabs([label for label, _, _ in _VIZ_TABS])
        for tab, (_, filename, caption) in zip(tabs, _VIZ_TABS):
            with tab:
                img_path = os.path.join(viz_path, filename)
                if os.path.exists(img_path):
                    st.image(img_path, use_column_width=True)
                    st.caption(caption)
                else:
                    # Say so explicitly rather than leaving an empty tab.
                    st.info(f"Image not available: {filename}")
    except Exception as e:
        # Rendering problems must not take down the rest of the page, but
        # they are errors, not a "still loading" state.
        st.warning(f"Could not load visualizations: {e}")
else:
    st.info("📊 Visualization files not found in this deployment")
413
+
414
+ # Information Section
415
+ st.markdown("---")
416
+ st.header("ℹ️ How It Works")
417
+
418
+ with st.expander("πŸ”¬ About Protein Crystallization"):
419
+ st.markdown("""
420
+ **Protein crystallization** is a crucial step in structural biology for determining 3D protein structures using X-ray crystallography.
421
+
422
+ **Key Parameters:**
423
+ - **Crystallization Method**: The technique used (e.g., vapor diffusion, batch mode)
424
+ - **Temperature**: Affects protein stability and crystal growth
425
+ - **pH**: Critical for protein solubility and crystal formation
426
+ - **Matthews Coefficient**: Indicates crystal packing density
427
+ - **Solvent Content**: Amount of solvent in the crystal lattice
428
+
429
+ This tool helps predict optimal conditions based on historical crystallization data.
430
+ """)
431
+
432
+ with st.expander("πŸ€– About the Models"):
433
+ st.markdown("""
434
+ **Simple Baseline:**
435
+ - Random Forest classifier for component name
436
+ - XGBoost regressor for pH
437
+ - Uses 4 numerical features + TF-IDF of method
438
+
439
+ **Advanced Baseline:**
440
+ - Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
441
+ - Includes concentration prediction with log-transformation
442
+ - Uses 8 engineered features including interactions
443
+ - Best overall performance: 64% name accuracy, 99% pH RΒ²
444
+
445
+ **Training Data:**
446
+ - Based on protein crystallization experiments from PDB
447
+ - Includes various crystallization methods and conditions
448
+ - Models trained on structured crystallization data
449
+ """)
450
+
451
+ with st.expander("πŸ“– How to Use"):
452
+ st.markdown("""
453
+ 1. **Select a model** in the sidebar (Advanced Baseline recommended)
454
+ 2. **Input your parameters**:
455
+ - Choose crystallization method
456
+ - Set temperature, pH, Matthews coefficient, and solvent content
457
+ 3. **Click "Predict Components"** to get predictions
458
+ 4. **Review results**:
459
+ - Component name with confidence scores
460
+ - Concentration (if using Advanced Baseline)
461
+ - Optimal pH for crystallization
462
+ 5. **Download** results as CSV for your records
463
+
464
+ πŸ’‘ **Tip:** Start with the recommended default values and adjust based on your specific protein and experimental setup.
465
+ """)
466
+
467
+ # Footer
468
+ st.markdown("---")
469
+ st.markdown("""
470
+ <div style='text-align: center; color: gray; padding: 20px;'>
471
+ <p><b>πŸ”¬ Crystallization Component Prediction System</b></p>
472
+ <p><i>Advanced Baseline achieves: 64% Name Accuracy | 47% Conc RΒ² | 99% pH RΒ²</i></p>
473
+ <p>Built with Scikit-learn, XGBoost, LightGBM, CatBoost & Streamlit</p>
474
+ <p style='font-size: 12px; margin-top: 10px;'>
475
+ For research and educational purposes. Validate predictions experimentally.
476
+ </p>
477
+ </div>
478
+ """, unsafe_allow_html=True)
479
+
models/advanced_baseline/label_encoder_name.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
3
+ size 116738
models/advanced_baseline/model_component_conc.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:88824da3404b10b0665cd9f854e79373b48b20fa26434599179c8c25b4076d7c
3
+ size 14164817
models/advanced_baseline/model_component_name.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51b4f1641a13b3bb649ffbb636c08b1b319243758c7baeb7b8c2ea90abea5561
3
+ size 11727850
models/advanced_baseline/model_component_ph.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f76b749b7b827e382b78e5401321671f45d937973b42b289ced7a03a7b598ba0
3
+ size 1452962
models/advanced_baseline/scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:73fb3f7d2e6bddf8275ef0ed30515699685867b00f7fdca5c80076d50c972a56
3
+ size 1287
models/advanced_baseline/tfidf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3122e3afec9af0916a8571b366e9a49b284adc6f72a4a63e3a2e5dab2c3d7a93
3
+ size 2552
models/advanced_baseline/training_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4541496759fff9ca683a388156eecf97c44839363e5a9948df380b388e7f9284
3
+ size 1112
models/simple_baseline/label_encoder_name.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
3
+ size 116738
models/simple_baseline/model_component_name.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a5b7b93b194da3be0b9a72c06f34cbd0f6810e043ab88ea83a25108de686c73
3
+ size 16985140
models/simple_baseline/model_component_ph.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f906964a6ce338aca61bd7b5d1ace1f67daf1ece52eb44658ff539d0201dee3
3
+ size 1239942
models/simple_baseline/scaler.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3c82727c7b6cf52a834e529b5fc01c43d28328465323f2945c331568b9e1095f
3
+ size 1079
models/simple_baseline/tfidf.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7bd17a22531b2aa52a6677af44fadc676329069d40614172fc64def0549bb207
3
+ size 2552
models/simple_baseline/training_results.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba3ee92aff285a69afb5edb8d648d2bbf2d550b37af791248c66688747970297
3
+ size 701
requirements.txt CHANGED
@@ -1,3 +1,17 @@
1
- altair
2
- pandas
3
- streamlit
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Crystallization Component Predictor - Requirements for Hugging Face Spaces
2
+ # Python 3.9+
3
+
4
+ # Core Dependencies
5
+ streamlit==1.29.0
6
+ pandas==2.1.4
7
+ numpy==1.26.2
8
+
9
+ # Machine Learning Models
10
+ scikit-learn==1.3.2
11
+ xgboost==2.0.3
12
+ lightgbm==4.1.0
13
+ catboost==1.2.2
14
+
15
+ # Utilities
16
+ joblib==1.3.2
17
+
run_local.sh ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#
# Launch the Crystallization Predictor Streamlit app locally.
#
# Usage: ./run_local.sh
#
# The script changes into its own directory before starting Streamlit so that
# "app.py" (and the models/ folder it loads) resolve no matter where the
# script is invoked from.
set -euo pipefail

script_dir="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
cd -- "${script_dir}"

# Fail early with an actionable message instead of "command not found".
if ! command -v streamlit >/dev/null 2>&1; then
  printf '%s\n' \
    "Error: streamlit was not found on PATH." \
    "Install the dependencies first: pip install -r requirements.txt" >&2
  exit 1
fi

echo "========================================"
echo " Crystallization Predictor - Local Run"
echo "========================================"
echo ""
echo "Starting Streamlit app..."
echo "Press Ctrl+C to stop"
echo ""
echo "App will open in your browser at:"
echo "http://localhost:8501"
echo ""
echo "----------------------------------------"

# exec replaces the shell so Ctrl+C and the exit status go straight to
# Streamlit.
exec streamlit run app.py
16
+
verify_files.py ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Verification script to check if all required files are present
3
+ Run this before deploying to Hugging Face
4
+ """
5
+
6
+ import os
7
+ from pathlib import Path
8
+
9
def check_file(filepath, required=True):
    """Report whether ``filepath`` is an existing regular file.

    Prints one status line: a check mark when the file is present, a cross
    when a required file is missing, or a warning sign when an optional file
    is missing.

    Args:
        filepath: Path to check, absolute or relative to the current
            working directory.
        required: Whether a missing file counts as an error (affects only
            the printed marker and label).

    Returns:
        bool: True if ``filepath`` exists and is a file, else False.
    """
    # isfile rather than exists: a directory that happens to carry the
    # expected name must not be accepted as the file.
    exists = os.path.isfile(filepath)
    if exists:
        status = "✅"
    elif required:
        status = "❌"
    else:
        status = "⚠️"
    req_text = "(required)" if required else "(optional)"
    print(f"{status} {filepath} {req_text}")
    return exists
16
+
17
def check_folder(folderpath, required=True):
    """Report whether ``folderpath`` is an existing directory.

    Prints one status line (check mark / cross / warning sign, as for
    ``check_file``) and, when the directory exists, a second line with the
    number of regular files found anywhere beneath it.

    Args:
        folderpath: Directory path to check.
        required: Whether a missing directory counts as an error (affects
            only the printed marker and label).

    Returns:
        bool: True if ``folderpath`` exists and is a directory, else False.
    """
    # isdir already implies existence, so no separate exists() call.
    exists = os.path.isdir(folderpath)
    if exists:
        status = "✅"
    elif required:
        status = "❌"
    else:
        status = "⚠️"
    req_text = "(required)" if required else "(optional)"
    print(f"{status} {folderpath}/ {req_text}")
    if exists:
        # Count lazily instead of materialising every path in a list first.
        file_count = sum(
            1 for entry in Path(folderpath).rglob('*') if entry.is_file()
        )
        print(f"   → Contains {file_count} file(s)")
    return exists
28
+
29
def main():
    """Verify that the deployment folder contains everything the app needs.

    Changes the working directory to the folder containing this script,
    prints a status line for every expected file and folder, then prints a
    summary and the total size of the folder.

    Returns:
        bool: True when every item marked as required is present.
    """
    print("=" * 60)
    print(" Hugging Face Deployment - File Verification")
    print("=" * 60)
    print()

    # All paths below are relative to the script's own directory.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(base_dir)

    all_required_present = True

    # Check essential files
    print("📄 Essential Files:")
    for name in ("app.py", "requirements.txt", "README.md"):
        all_required_present &= check_file(name, required=True)
    print()

    # Check configuration files
    print("⚙️ Configuration Files:")
    # .gitattributes is reported as required, so it has to count towards the
    # overall result as well (it carries the Git LFS rules for the models).
    all_required_present &= check_file(".gitattributes", required=True)
    check_file(".gitignore", required=False)
    print()

    # Check documentation
    print("📚 Documentation:")
    check_file("DEPLOYMENT_GUIDE.md", required=False)
    print()

    # Check model folders
    print("🤖 Model Files:")
    model_sets = (
        ("models/simple_baseline", (
            "model_component_name.pkl",
            "model_component_ph.pkl",
            "label_encoder_name.pkl",
            "scaler.pkl",
            "tfidf.pkl",
        )),
        ("models/advanced_baseline", (
            "model_component_name.pkl",
            "model_component_conc.pkl",
            "model_component_ph.pkl",
            "label_encoder_name.pkl",
            "scaler.pkl",
            "tfidf.pkl",
        )),
    )
    for folder, required_files in model_sets:
        folder_exists = check_folder(folder, required=True)
        all_required_present &= folder_exists

        # Individual files are only listed when their folder exists; a
        # missing folder has already failed the check above.
        if folder_exists:
            for filename in required_files:
                all_required_present &= check_file(
                    f"{folder}/{filename}", required=True
                )
            check_file(f"{folder}/training_results.json", required=False)

        print()

    # Check visualizations
    print("📊 Visualization Files:")
    viz_exists = check_folder("visualizations", required=False)
    if viz_exists:
        for filename in (
            "01_component_name_comparison.png",
            "02_component_conc_comparison.png",
            "03_component_ph_comparison.png",
            "05_complete_comparison.png",
        ):
            check_file(f"visualizations/{filename}", required=False)

    print()
    print("=" * 60)

    if all_required_present:
        print("✅ SUCCESS! All required files are present.")
        print(" You're ready to deploy to Hugging Face!")
        print()
        print("Next steps:")
        print("1. Test locally: streamlit run app.py")
        print("2. Follow DEPLOYMENT_GUIDE.md for deployment")
        print("3. Upload entire folder to Hugging Face Spaces")
    else:
        print("❌ ERROR! Some required files are missing.")
        print(" Please ensure all required files are present before deploying.")

    print("=" * 60)

    # Calculate total size
    total_size = 0
    for root, dirs, files in os.walk('.'):
        # Prune Git metadata in place: .git (including its local LFS object
        # cache) is not part of what gets deployed and would otherwise
        # inflate the reported size.
        dirs[:] = [d for d in dirs if d != '.git']
        for file in files:
            filepath = os.path.join(root, file)
            # Skip entries that vanish or are dangling symlinks.
            if os.path.exists(filepath):
                total_size += os.path.getsize(filepath)

    size_mb = total_size / (1024 * 1024)
    print(f"\n📦 Total folder size: {size_mb:.2f} MB")

    if size_mb > 500:
        print("⚠️ Warning: Folder is quite large. Consider Git LFS for files >10MB")

    return all_required_present
128
+
129
if __name__ == "__main__":
    # Exit status 0 when every required file is present, 1 otherwise.
    # SystemExit is raised directly because the bare ``exit`` helper is
    # installed by the ``site`` module for interactive use and is not
    # guaranteed to exist (e.g. under ``python -S`` or in frozen builds).
    raise SystemExit(0 if main() else 1)
132
+
visualizations/01_component_name_comparison.png ADDED

Git LFS Details

  • SHA256: f6e07675fa75bc2943255a7079d48b9596dd5ca19a0e10e92c39ad43ab547e89
  • Pointer size: 130 Bytes
  • Size of remote file: 91.2 kB
visualizations/02_component_conc_comparison.png ADDED

Git LFS Details

  • SHA256: a7de140359c1579e6b39ae0bd5dd91a76263240801ff6df8303a14dd8f601816
  • Pointer size: 130 Bytes
  • Size of remote file: 84.8 kB
visualizations/03_component_ph_comparison.png ADDED

Git LFS Details

  • SHA256: 5a105563fa7059faf74d33518eedaab458105a4830acc31997957c3ab7dd8228
  • Pointer size: 130 Bytes
  • Size of remote file: 95.1 kB
visualizations/04_all_approaches_heatmap.png ADDED

Git LFS Details

  • SHA256: 4a44b87157bd85448ab1327526fb5aae403868ee7010b096f960330b147a0022
  • Pointer size: 131 Bytes
  • Size of remote file: 154 kB
visualizations/05_complete_comparison.png ADDED

Git LFS Details

  • SHA256: aef6f2c2473d926f96c515800b5329360df238821eb42cf2f52858dd1f24d139
  • Pointer size: 131 Bytes
  • Size of remote file: 407 kB
visualizations/eda_01_missing_values_matrix.png ADDED

Git LFS Details

  • SHA256: c4056a00091add9b441ad638dae069ede36d423729b0ad07c4291459a61afd43
  • Pointer size: 131 Bytes
  • Size of remote file: 210 kB
visualizations/eda_02_missing_values_heatmap.png ADDED

Git LFS Details

  • SHA256: 6b745b27c50999589f8538c6759cf056b7fa189967256b3f776e47d7864c3fed
  • Pointer size: 131 Bytes
  • Size of remote file: 499 kB
visualizations/eda_03_target_distributions.png ADDED

Git LFS Details

  • SHA256: 621f00ea0976d0c2a6b9c21e278137e33a811f79d2a359f7e7438ffa243cc0bc
  • Pointer size: 131 Bytes
  • Size of remote file: 412 kB
visualizations/eda_04_feature_distributions.png ADDED

Git LFS Details

  • SHA256: bc04db2154a4082a0423beee918fe8e91ece369c341ac4f74e810a27605d5fa4
  • Pointer size: 131 Bytes
  • Size of remote file: 495 kB
visualizations/eda_05_correlation_matrix.png ADDED

Git LFS Details

  • SHA256: a434b7f336aa39c284508baceea73d407c9c6378bb6f0e8f71955de846348443
  • Pointer size: 131 Bytes
  • Size of remote file: 432 kB