Spaces:
Build error
Build error
Upload 34 files
Browse files- .gitattributes +11 -35
- .gitignore +46 -0
- DEPLOYMENT_GUIDE.md +233 -0
- FILE_STRUCTURE.md +333 -0
- QUICKSTART.txt +103 -0
- README.md +143 -19
- RUN_LOCAL.bat +17 -0
- app.py +479 -0
- models/advanced_baseline/label_encoder_name.pkl +3 -0
- models/advanced_baseline/model_component_conc.pkl +3 -0
- models/advanced_baseline/model_component_name.pkl +3 -0
- models/advanced_baseline/model_component_ph.pkl +3 -0
- models/advanced_baseline/scaler.pkl +3 -0
- models/advanced_baseline/tfidf.pkl +3 -0
- models/advanced_baseline/training_results.json +3 -0
- models/simple_baseline/label_encoder_name.pkl +3 -0
- models/simple_baseline/model_component_name.pkl +3 -0
- models/simple_baseline/model_component_ph.pkl +3 -0
- models/simple_baseline/scaler.pkl +3 -0
- models/simple_baseline/tfidf.pkl +3 -0
- models/simple_baseline/training_results.json +3 -0
- requirements.txt +17 -3
- run_local.sh +16 -0
- verify_files.py +132 -0
- visualizations/01_component_name_comparison.png +3 -0
- visualizations/02_component_conc_comparison.png +3 -0
- visualizations/03_component_ph_comparison.png +3 -0
- visualizations/04_all_approaches_heatmap.png +3 -0
- visualizations/05_complete_comparison.png +3 -0
- visualizations/eda_01_missing_values_matrix.png +3 -0
- visualizations/eda_02_missing_values_heatmap.png +3 -0
- visualizations/eda_03_target_distributions.png +3 -0
- visualizations/eda_04_feature_distributions.png +3 -0
- visualizations/eda_05_correlation_matrix.png +3 -0
.gitattributes
CHANGED
|
@@ -1,35 +1,11 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
*.
|
| 5 |
-
*.
|
| 6 |
-
*.
|
| 7 |
-
*.
|
| 8 |
-
*.
|
| 9 |
-
*.
|
| 10 |
-
*.
|
| 11 |
-
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
# Git LFS Configuration for Hugging Face Hub
|
| 2 |
+
# Track large model files
|
| 3 |
+
|
| 4 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.json filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.jpeg filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python
|
| 2 |
+
__pycache__/
|
| 3 |
+
*.py[cod]
|
| 4 |
+
*$py.class
|
| 5 |
+
*.so
|
| 6 |
+
.Python
|
| 7 |
+
env/
|
| 8 |
+
venv/
|
| 9 |
+
ENV/
|
| 10 |
+
build/
|
| 11 |
+
develop-eggs/
|
| 12 |
+
dist/
|
| 13 |
+
downloads/
|
| 14 |
+
eggs/
|
| 15 |
+
.eggs/
|
| 16 |
+
lib/
|
| 17 |
+
lib64/
|
| 18 |
+
parts/
|
| 19 |
+
sdist/
|
| 20 |
+
var/
|
| 21 |
+
wheels/
|
| 22 |
+
*.egg-info/
|
| 23 |
+
.installed.cfg
|
| 24 |
+
*.egg
|
| 25 |
+
|
| 26 |
+
# Virtual Environment
|
| 27 |
+
venv/
|
| 28 |
+
ENV/
|
| 29 |
+
|
| 30 |
+
# IDE
|
| 31 |
+
.vscode/
|
| 32 |
+
.idea/
|
| 33 |
+
*.swp
|
| 34 |
+
*.swo
|
| 35 |
+
*~
|
| 36 |
+
|
| 37 |
+
# OS
|
| 38 |
+
.DS_Store
|
| 39 |
+
Thumbs.db
|
| 40 |
+
|
| 41 |
+
# Streamlit
|
| 42 |
+
.streamlit/
|
| 43 |
+
|
| 44 |
+
# Logs
|
| 45 |
+
*.log
|
| 46 |
+
|
DEPLOYMENT_GUIDE.md
ADDED
|
@@ -0,0 +1,233 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π Deployment Guide for Hugging Face Spaces
|
| 2 |
+
|
| 3 |
+
This guide will help you deploy the Crystallization Component Predictor to Hugging Face Spaces.
|
| 4 |
+
|
| 5 |
+
## π Prerequisites
|
| 6 |
+
|
| 7 |
+
1. A Hugging Face account (sign up at https://huggingface.co/)
|
| 8 |
+
2. Git installed on your computer
|
| 9 |
+
3. Git LFS installed (`git lfs install`)
|
| 10 |
+
|
| 11 |
+
## 🔧 Step-by-Step Deployment
|
| 12 |
+
|
| 13 |
+
### Option 1: Web UI Upload (Easiest)
|
| 14 |
+
|
| 15 |
+
1. **Create a new Space:**
|
| 16 |
+
- Go to https://huggingface.co/spaces
|
| 17 |
+
- Click "Create new Space"
|
| 18 |
+
- Choose a name (e.g., "crystallization-predictor")
|
| 19 |
+
- Select **Streamlit** as the SDK
|
| 20 |
+
- Choose visibility (Public or Private)
|
| 21 |
+
- Click "Create Space"
|
| 22 |
+
|
| 23 |
+
2. **Upload files:**
|
| 24 |
+
- Click "Files" tab in your Space
|
| 25 |
+
- Click "Add file" → "Upload files"
|
| 26 |
+
- Drag and drop ALL files from this `huggingface_app` folder:
|
| 27 |
+
- `app.py`
|
| 28 |
+
- `requirements.txt`
|
| 29 |
+
- `README.md`
|
| 30 |
+
- `.gitattributes`
|
| 31 |
+
- `.gitignore`
|
| 32 |
+
- `models/` folder (with all subfolders)
|
| 33 |
+
- `visualizations/` folder (with all images)
|
| 34 |
+
- Click "Commit changes to main"
|
| 35 |
+
|
| 36 |
+
3. **Wait for build:**
|
| 37 |
+
- Hugging Face will automatically build your Space
|
| 38 |
+
- Check the "Logs" tab to monitor progress
|
| 39 |
+
- Usually takes 2-5 minutes
|
| 40 |
+
|
| 41 |
+
4. **Test your app:**
|
| 42 |
+
- Once built, click on the "App" tab
|
| 43 |
+
- Your Streamlit app should be running!
|
| 44 |
+
|
| 45 |
+
### Option 2: Git Command Line (Advanced)
|
| 46 |
+
|
| 47 |
+
1. **Initialize Git LFS:**
|
| 48 |
+
```bash
|
| 49 |
+
cd huggingface_app
|
| 50 |
+
git lfs install
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
2. **Clone your Space repository:**
|
| 54 |
+
```bash
|
| 55 |
+
git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 56 |
+
cd YOUR_SPACE_NAME
|
| 57 |
+
```
|
| 58 |
+
|
| 59 |
+
3. **Copy files:**
|
| 60 |
+
```bash
|
| 61 |
+
# Copy all files from huggingface_app to your cloned repo
|
| 62 |
+
# On Windows:
|
| 63 |
+
xcopy ..\huggingface_app\* . /E /H /Y
|
| 64 |
+
|
| 65 |
+
# On Linux/Mac:
|
| 66 |
+
cp -r ../huggingface_app/. .
|
| 67 |
+
```
|
| 68 |
+
|
| 69 |
+
4. **Commit and push:**
|
| 70 |
+
```bash
|
| 71 |
+
git add .
|
| 72 |
+
git commit -m "Initial deployment of crystallization predictor"
|
| 73 |
+
git push
|
| 74 |
+
```
|
| 75 |
+
|
| 76 |
+
5. **Check deployment:**
|
| 77 |
+
- Visit your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
|
| 78 |
+
- Monitor build logs in the "Logs" tab
|
| 79 |
+
|
| 80 |
+
## π Files Included
|
| 81 |
+
|
| 82 |
+
```
|
| 83 |
+
huggingface_app/
|
| 84 |
+
βββ app.py # Main Streamlit application
|
| 85 |
+
βββ requirements.txt # Python dependencies
|
| 86 |
+
βββ README.md # Documentation (shown on Space page)
|
| 87 |
+
βββ .gitattributes # Git LFS configuration
|
| 88 |
+
βββ .gitignore # Files to ignore
|
| 89 |
+
βββ DEPLOYMENT_GUIDE.md # This file
|
| 90 |
+
βββ models/
|
| 91 |
+
β βββ simple_baseline/
|
| 92 |
+
β β βββ model_component_name.pkl
|
| 93 |
+
β β βββ model_component_ph.pkl
|
| 94 |
+
β β βββ label_encoder_name.pkl
|
| 95 |
+
β β βββ scaler.pkl
|
| 96 |
+
β β βββ tfidf.pkl
|
| 97 |
+
β β βββ training_results.json
|
| 98 |
+
β βββ advanced_baseline/
|
| 99 |
+
β βββ model_component_name.pkl
|
| 100 |
+
β βββ model_component_conc.pkl
|
| 101 |
+
β βββ model_component_ph.pkl
|
| 102 |
+
β βββ label_encoder_name.pkl
|
| 103 |
+
β βββ scaler.pkl
|
| 104 |
+
β βββ tfidf.pkl
|
| 105 |
+
β βββ training_results.json
|
| 106 |
+
βββ visualizations/
|
| 107 |
+
βββ 01_component_name_comparison.png
|
| 108 |
+
βββ 02_component_conc_comparison.png
|
| 109 |
+
βββ 03_component_ph_comparison.png
|
| 110 |
+
βββ 04_all_approaches_heatmap.png
|
| 111 |
+
βββ 05_complete_comparison.png
|
| 112 |
+
```
|
| 113 |
+
|
| 114 |
+
## π Troubleshooting
|
| 115 |
+
|
| 116 |
+
### Build Fails
|
| 117 |
+
|
| 118 |
+
**Problem:** "Could not install packages due to an OSError"
|
| 119 |
+
- **Solution:** Check that all dependencies in `requirements.txt` are compatible
|
| 120 |
+
- Try pinning versions or using newer versions
|
| 121 |
+
|
| 122 |
+
**Problem:** "ModuleNotFoundError"
|
| 123 |
+
- **Solution:** Ensure the missing module is in `requirements.txt`
|
| 124 |
+
|
| 125 |
+
### Model Loading Errors
|
| 126 |
+
|
| 127 |
+
**Problem:** "FileNotFoundError: [Errno 2] No such file or directory: 'models/...'"
|
| 128 |
+
- **Solution:** Verify all model files were uploaded correctly
|
| 129 |
+
- Check that folder structure is preserved
|
| 130 |
+
|
| 131 |
+
**Problem:** Large file upload fails
|
| 132 |
+
- **Solution:** Ensure Git LFS is properly configured
|
| 133 |
+
- Files over 10MB should use LFS (already configured in `.gitattributes`)
|
| 134 |
+
|
| 135 |
+
### App Crashes
|
| 136 |
+
|
| 137 |
+
**Problem:** "Memory limit exceeded"
|
| 138 |
+
- **Solution:** Hugging Face Spaces have memory limits
|
| 139 |
+
- Consider using smaller models or optimizing loading
|
| 140 |
+
|
| 141 |
+
**Problem:** Slow loading
|
| 142 |
+
- **Solution:** Models are loaded on first prediction (not at startup)
|
| 143 |
+
- This is intentional for faster app startup
|
| 144 |
+
|
| 145 |
+
## 🎨 Customization
|
| 146 |
+
|
| 147 |
+
### Change App Title/Icon
|
| 148 |
+
Edit the `README.md` header:
|
| 149 |
+
```yaml
|
| 150 |
+
---
|
| 151 |
+
title: Your Custom Title
|
| 152 |
+
emoji: 🧬  # Change emoji
|
| 153 |
+
colorFrom: blue # Change colors
|
| 154 |
+
colorTo: purple
|
| 155 |
+
---
|
| 156 |
+
```
|
| 157 |
+
|
| 158 |
+
### Modify the App
|
| 159 |
+
Edit `app.py` and commit changes. The Space will rebuild automatically.
|
| 160 |
+
|
| 161 |
+
### Add More Models
|
| 162 |
+
1. Add model files to `models/` folder
|
| 163 |
+
2. Update `app.py` to load and use new models
|
| 164 |
+
3. Update `README.md` to document changes
|
| 165 |
+
|
| 166 |
+
## π Monitoring
|
| 167 |
+
|
| 168 |
+
- **Logs**: Check the "Logs" tab in your Space
|
| 169 |
+
- **Analytics**: View usage statistics in Space settings
|
| 170 |
+
- **Updates**: Any push to the main branch triggers a rebuild
|
| 171 |
+
|
| 172 |
+
## π Security & Privacy
|
| 173 |
+
|
| 174 |
+
- **Public Spaces**: Anyone can use your app and see the code
|
| 175 |
+
- **Private Spaces**: Only you and collaborators can access
|
| 176 |
+
- **No User Data**: The app doesn't collect or store user inputs
|
| 177 |
+
- **Model Files**: Ensure you have rights to distribute the models
|
| 178 |
+
|
| 179 |
+
## 💰 Costs
|
| 180 |
+
|
| 181 |
+
- **Free Tier**:
|
| 182 |
+
- CPU: 2 vCPU, 16GB RAM
|
| 183 |
+
- Perfect for this app
|
| 184 |
+
- No credit card required
|
| 185 |
+
|
| 186 |
+
- **Paid Tiers**:
|
| 187 |
+
- Available for GPU or more resources
|
| 188 |
+
- Not needed for this application
|
| 189 |
+
|
| 190 |
+
## π Useful Links
|
| 191 |
+
|
| 192 |
+
- Hugging Face Spaces Docs: https://huggingface.co/docs/hub/spaces
|
| 193 |
+
- Streamlit Docs: https://docs.streamlit.io/
|
| 194 |
+
- Git LFS: https://git-lfs.github.com/
|
| 195 |
+
|
| 196 |
+
## π Support
|
| 197 |
+
|
| 198 |
+
If you encounter issues:
|
| 199 |
+
1. Check the "Logs" tab in your Space
|
| 200 |
+
2. Review Hugging Face Spaces documentation
|
| 201 |
+
3. Search Hugging Face forums
|
| 202 |
+
4. Open an issue on the repository
|
| 203 |
+
|
| 204 |
+
## ✅ Pre-Deployment Checklist
|
| 205 |
+
|
| 206 |
+
- [ ] All model files copied to `models/` folders
|
| 207 |
+
- [ ] Visualizations copied to `visualizations/` folder
|
| 208 |
+
- [ ] `requirements.txt` has all dependencies
|
| 209 |
+
- [ ] `README.md` header configured with title/emoji
|
| 210 |
+
- [ ] Tested app locally (`streamlit run app.py`)
|
| 211 |
+
- [ ] Git LFS installed and configured
|
| 212 |
+
- [ ] Hugging Face account created
|
| 213 |
+
- [ ] Space created on Hugging Face
|
| 214 |
+
|
| 215 |
+
## π Post-Deployment
|
| 216 |
+
|
| 217 |
+
After successful deployment:
|
| 218 |
+
1. Test all features in the live app
|
| 219 |
+
2. Share your Space URL with others
|
| 220 |
+
3. Monitor logs for any errors
|
| 221 |
+
4. Consider adding:
|
| 222 |
+
- Example inputs/outputs
|
| 223 |
+
- Tutorial video
|
| 224 |
+
- Publication link
|
| 225 |
+
- Citation information
|
| 226 |
+
|
| 227 |
+
---
|
| 228 |
+
|
| 229 |
+
**Good luck with your deployment! π**
|
| 230 |
+
|
| 231 |
+
Your app will be accessible at:
|
| 232 |
+
`https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
|
| 233 |
+
|
FILE_STRUCTURE.md
ADDED
|
@@ -0,0 +1,333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# π Hugging Face Deployment - Complete File Structure
|
| 2 |
+
|
| 3 |
+
## Overview
|
| 4 |
+
This folder contains everything needed to deploy the Crystallization Component Predictor to Hugging Face Spaces.
|
| 5 |
+
|
| 6 |
+
**Total Size:** ~46 MB
|
| 7 |
+
**Status:** ✅ Ready for deployment
|
| 8 |
+
|
| 9 |
+
---
|
| 10 |
+
|
| 11 |
+
## π Directory Structure
|
| 12 |
+
|
| 13 |
+
```
|
| 14 |
+
huggingface_app/
|
| 15 |
+
β
|
| 16 |
+
βββ π Core Application Files
|
| 17 |
+
β βββ app.py # Main Streamlit application (standalone)
|
| 18 |
+
β βββ requirements.txt # Python dependencies for Hugging Face
|
| 19 |
+
β βββ README.md # Hugging Face Space documentation
|
| 20 |
+
β
|
| 21 |
+
βββ βοΈ Configuration Files
|
| 22 |
+
β βββ .gitattributes # Git LFS configuration for large files
|
| 23 |
+
β βββ .gitignore # Files to exclude from Git
|
| 24 |
+
β
|
| 25 |
+
βββ π Documentation
|
| 26 |
+
β βββ DEPLOYMENT_GUIDE.md # Step-by-step deployment instructions
|
| 27 |
+
β βββ QUICKSTART.txt # Quick reference guide
|
| 28 |
+
β βββ FILE_STRUCTURE.md # This file
|
| 29 |
+
β
|
| 30 |
+
βββ π§ Utility Scripts
|
| 31 |
+
β βββ verify_files.py # Verification script (check all files present)
|
| 32 |
+
β βββ RUN_LOCAL.bat # Windows: Run app locally
|
| 33 |
+
β βββ run_local.sh # Linux/Mac: Run app locally
|
| 34 |
+
β
|
| 35 |
+
βββ π€ models/
|
| 36 |
+
β β
|
| 37 |
+
β βββ simple_baseline/ # Simple Baseline models
|
| 38 |
+
β β βββ model_component_name.pkl # Random Forest classifier (name)
|
| 39 |
+
β β βββ model_component_ph.pkl # XGBoost regressor (pH)
|
| 40 |
+
β β βββ label_encoder_name.pkl # Label encoder for component names
|
| 41 |
+
β β βββ scaler.pkl # StandardScaler for features
|
| 42 |
+
β β βββ tfidf.pkl # TF-IDF vectorizer for methods
|
| 43 |
+
β β βββ training_results.json # Training metrics
|
| 44 |
+
β β
|
| 45 |
+
β βββ advanced_baseline/ # Advanced Baseline models
|
| 46 |
+
β βββ model_component_name.pkl # Ensemble classifier (name)
|
| 47 |
+
β βββ model_component_conc.pkl # Ensemble regressor (concentration)
|
| 48 |
+
β βββ model_component_ph.pkl # Ensemble regressor (pH)
|
| 49 |
+
β βββ label_encoder_name.pkl # Label encoder for component names
|
| 50 |
+
β βββ scaler.pkl # StandardScaler for features
|
| 51 |
+
β βββ tfidf.pkl # TF-IDF vectorizer for methods
|
| 52 |
+
β βββ training_results.json # Training metrics
|
| 53 |
+
β
|
| 54 |
+
βββ π visualizations/ # Performance comparison charts
|
| 55 |
+
βββ 01_component_name_comparison.png
|
| 56 |
+
βββ 02_component_conc_comparison.png
|
| 57 |
+
βββ 03_component_ph_comparison.png
|
| 58 |
+
βββ 04_all_approaches_heatmap.png
|
| 59 |
+
βββ 05_complete_comparison.png
|
| 60 |
+
βββ eda_01_missing_values_matrix.png
|
| 61 |
+
βββ eda_02_missing_values_heatmap.png
|
| 62 |
+
βββ eda_03_target_distributions.png
|
| 63 |
+
βββ eda_04_feature_distributions.png
|
| 64 |
+
βββ eda_05_correlation_matrix.png
|
| 65 |
+
```
|
| 66 |
+
|
| 67 |
+
---
|
| 68 |
+
|
| 69 |
+
## π File Descriptions
|
| 70 |
+
|
| 71 |
+
### Core Application Files
|
| 72 |
+
|
| 73 |
+
#### `app.py` (Main Application)
|
| 74 |
+
- **Purpose:** Streamlit web application
|
| 75 |
+
- **Key Features:**
|
| 76 |
+
- Model selection (Simple vs Advanced Baseline)
|
| 77 |
+
- Interactive parameter input
|
| 78 |
+
- Real-time predictions
|
| 79 |
+
- Top-5 component predictions with probabilities
|
| 80 |
+
- Visual pH scale
|
| 81 |
+
- Downloadable results (CSV)
|
| 82 |
+
- Performance visualizations
|
| 83 |
+
- Model comparison charts
|
| 84 |
+
- **Dependencies:** All specified in `requirements.txt`
|
| 85 |
+
- **Entry Point:** Yes - Hugging Face will run this automatically
|
| 86 |
+
|
| 87 |
+
#### `requirements.txt`
|
| 88 |
+
- **Purpose:** Python package dependencies
|
| 89 |
+
- **Key Packages:**
|
| 90 |
+
- streamlit==1.29.0
|
| 91 |
+
- pandas==2.1.4
|
| 92 |
+
- numpy==1.26.2
|
| 93 |
+
- scikit-learn==1.3.2
|
| 94 |
+
- xgboost==2.0.3
|
| 95 |
+
- lightgbm==4.1.0
|
| 96 |
+
- catboost==1.2.2
|
| 97 |
+
- joblib==1.3.2
|
| 98 |
+
- **Note:** Versions pinned for reproducibility
|
| 99 |
+
|
| 100 |
+
#### `README.md`
|
| 101 |
+
- **Purpose:** Documentation displayed on Hugging Face Space page
|
| 102 |
+
- **Contains:**
|
| 103 |
+
- App description and features
|
| 104 |
+
- Model performance metrics
|
| 105 |
+
- Usage instructions
|
| 106 |
+
- Technical details
|
| 107 |
+
- Background information
|
| 108 |
+
- Acknowledgments
|
| 109 |
+
- **Special:** YAML header configures Space appearance
|
| 110 |
+
|
| 111 |
+
---
|
| 112 |
+
|
| 113 |
+
### Configuration Files
|
| 114 |
+
|
| 115 |
+
#### `.gitattributes`
|
| 116 |
+
- **Purpose:** Git LFS (Large File Storage) configuration
|
| 117 |
+
- **Tracks:**
|
| 118 |
+
- *.pkl (model files)
|
| 119 |
+
- *.pth (PyTorch models)
|
| 120 |
+
- *.json (results)
|
| 121 |
+
- *.png (images)
|
| 122 |
+
- **Why:** Files >10MB need LFS on Hugging Face
|
| 123 |
+
|
| 124 |
+
#### `.gitignore`
|
| 125 |
+
- **Purpose:** Exclude unnecessary files from Git
|
| 126 |
+
- **Excludes:**
|
| 127 |
+
- Python cache (`__pycache__/`)
|
| 128 |
+
- Virtual environments
|
| 129 |
+
- IDE files
|
| 130 |
+
- OS files
|
| 131 |
+
- Logs
|
| 132 |
+
|
| 133 |
+
---
|
| 134 |
+
|
| 135 |
+
### Documentation Files
|
| 136 |
+
|
| 137 |
+
#### `DEPLOYMENT_GUIDE.md`
|
| 138 |
+
- **Purpose:** Complete deployment instructions
|
| 139 |
+
- **Sections:**
|
| 140 |
+
- Prerequisites
|
| 141 |
+
- Step-by-step deployment (Web UI & Git CLI)
|
| 142 |
+
- Troubleshooting
|
| 143 |
+
- Customization
|
| 144 |
+
- Monitoring
|
| 145 |
+
- Security & privacy
|
| 146 |
+
|
| 147 |
+
#### `QUICKSTART.txt`
|
| 148 |
+
- **Purpose:** Quick reference for common tasks
|
| 149 |
+
- **Format:** Plain text for easy viewing
|
| 150 |
+
- **Content:** Essential info at a glance
|
| 151 |
+
|
| 152 |
+
#### `FILE_STRUCTURE.md`
|
| 153 |
+
- **Purpose:** This document - complete file inventory
|
| 154 |
+
|
| 155 |
+
---
|
| 156 |
+
|
| 157 |
+
### Utility Scripts
|
| 158 |
+
|
| 159 |
+
#### `verify_files.py`
|
| 160 |
+
- **Purpose:** Pre-deployment verification
|
| 161 |
+
- **Checks:**
|
| 162 |
+
- All required files present
|
| 163 |
+
- Model files exist
|
| 164 |
+
- Folder structure correct
|
| 165 |
+
- Total size calculation
|
| 166 |
+
- **Usage:** `python verify_files.py`
|
| 167 |
+
|
| 168 |
+
#### `RUN_LOCAL.bat` (Windows)
|
| 169 |
+
- **Purpose:** Launch app locally for testing
|
| 170 |
+
- **Usage:** Double-click or run `RUN_LOCAL.bat`
|
| 171 |
+
- **Opens:** http://localhost:8501
|
| 172 |
+
|
| 173 |
+
#### `run_local.sh` (Linux/Mac)
|
| 174 |
+
- **Purpose:** Launch app locally for testing
|
| 175 |
+
- **Usage:** `bash run_local.sh`
|
| 176 |
+
- **Opens:** http://localhost:8501
|
| 177 |
+
|
| 178 |
+
---
|
| 179 |
+
|
| 180 |
+
### Model Files
|
| 181 |
+
|
| 182 |
+
#### Simple Baseline Models (6 files)
|
| 183 |
+
**Performance:**
|
| 184 |
+
- Name Accuracy: 61.12%
|
| 185 |
+
- pH R²: 95.58%
|
| 186 |
+
- Concentration: N/A
|
| 187 |
+
|
| 188 |
+
**Files:**
|
| 189 |
+
1. `model_component_name.pkl` - Random Forest classifier
|
| 190 |
+
2. `model_component_ph.pkl` - XGBoost regressor
|
| 191 |
+
3. `label_encoder_name.pkl` - Encode component names
|
| 192 |
+
4. `scaler.pkl` - Feature normalization
|
| 193 |
+
5. `tfidf.pkl` - Text vectorization
|
| 194 |
+
6. `training_results.json` - Performance metrics
|
| 195 |
+
|
| 196 |
+
#### Advanced Baseline Models (7 files)
|
| 197 |
+
**Performance:**
|
| 198 |
+
- Name Accuracy: 64.18% β
|
| 199 |
+
- Concentration R²: 47.33%
|
| 200 |
+
- pH RΒ²: 99.34% β
|
| 201 |
+
|
| 202 |
+
**Files:**
|
| 203 |
+
1. `model_component_name.pkl` - Ensemble (RF + XGB + LGB + Cat)
|
| 204 |
+
2. `model_component_conc.pkl` - Ensemble concentration regressor
|
| 205 |
+
3. `model_component_ph.pkl` - Ensemble pH regressor
|
| 206 |
+
4. `label_encoder_name.pkl` - Encode component names
|
| 207 |
+
5. `scaler.pkl` - Feature normalization
|
| 208 |
+
6. `tfidf.pkl` - Text vectorization
|
| 209 |
+
7. `training_results.json` - Performance metrics
|
| 210 |
+
|
| 211 |
+
---
|
| 212 |
+
|
| 213 |
+
### Visualization Files (10 images)
|
| 214 |
+
|
| 215 |
+
#### Model Comparison Charts
|
| 216 |
+
- `01_component_name_comparison.png` - Name accuracy comparison
|
| 217 |
+
- `02_component_conc_comparison.png` - Concentration R² comparison
|
| 218 |
+
- `03_component_ph_comparison.png` - pH R² comparison
|
| 219 |
+
- `04_all_approaches_heatmap.png` - Performance heatmap
|
| 220 |
+
- `05_complete_comparison.png` - Comprehensive comparison
|
| 221 |
+
|
| 222 |
+
#### EDA Visualizations
|
| 223 |
+
- `eda_01_missing_values_matrix.png` - Missing data patterns
|
| 224 |
+
- `eda_02_missing_values_heatmap.png` - Missing data heatmap
|
| 225 |
+
- `eda_03_target_distributions.png` - Target variable distributions
|
| 226 |
+
- `eda_04_feature_distributions.png` - Feature distributions
|
| 227 |
+
- `eda_05_correlation_matrix.png` - Feature correlations
|
| 228 |
+
|
| 229 |
+
---
|
| 230 |
+
|
| 231 |
+
## π Deployment Checklist
|
| 232 |
+
|
| 233 |
+
Before deploying to Hugging Face:
|
| 234 |
+
|
| 235 |
+
- [x] ✅ All core files present (app.py, requirements.txt, README.md)
|
| 236 |
+
- [x] ✅ Configuration files (.gitattributes, .gitignore)
|
| 237 |
+
- [x] ✅ Simple Baseline models (6 files)
|
| 238 |
+
- [x] ✅ Advanced Baseline models (7 files)
|
| 239 |
+
- [x] ✅ Visualizations (10 images)
|
| 240 |
+
- [x] ✅ Documentation complete
|
| 241 |
+
- [x] ✅ Verification script passes
|
| 242 |
+
- [x] ✅ Total size: 46.47 MB (within limits)
|
| 243 |
+
- [ ] ⏳ Test locally (run `streamlit run app.py`)
|
| 244 |
+
- [ ] ⏳ Deploy to Hugging Face
|
| 245 |
+
- [ ] ⏳ Test live deployment
|
| 246 |
+
|
| 247 |
+
---
|
| 248 |
+
|
| 249 |
+
## 💡 Key Features
|
| 250 |
+
|
| 251 |
+
### What Makes This Deployment Special
|
| 252 |
+
|
| 253 |
+
1. **Self-Contained**: No external dependencies or file paths
|
| 254 |
+
2. **Production-Ready**: All error handling included
|
| 255 |
+
3. **User-Friendly**: Beautiful UI with helpful tooltips
|
| 256 |
+
4. **Well-Documented**: Comprehensive README and guides
|
| 257 |
+
5. **Verified**: Includes verification script
|
| 258 |
+
6. **Git LFS Ready**: Configured for large model files
|
| 259 |
+
7. **Cross-Platform**: Works on Windows, Linux, Mac
|
| 260 |
+
|
| 261 |
+
### App Capabilities
|
| 262 |
+
|
| 263 |
+
- β
Two model options (Simple & Advanced)
|
| 264 |
+
- β
Interactive parameter input
|
| 265 |
+
- β
Real-time predictions
|
| 266 |
+
- β
Top-5 component suggestions
|
| 267 |
+
- β
Confidence scores
|
| 268 |
+
- β
Visual pH scale
|
| 269 |
+
- β
Downloadable CSV results
|
| 270 |
+
- β
Performance visualizations
|
| 271 |
+
- β
Model comparison tables
|
| 272 |
+
- β
Responsive design
|
| 273 |
+
|
| 274 |
+
---
|
| 275 |
+
|
| 276 |
+
## π Statistics
|
| 277 |
+
|
| 278 |
+
| Metric | Value |
|
| 279 |
+
|--------|-------|
|
| 280 |
+
| Total Files | 34 |
|
| 281 |
+
| Python Scripts | 2 |
|
| 282 |
+
| Model Files | 13 |
|
| 283 |
+
| Images | 10 |
|
| 284 |
+
| Documentation | 5 |
|
| 285 |
+
| Total Size | 46.47 MB |
|
| 286 |
+
| Largest File | model_component_name.pkl (~8 MB each) |
|
| 287 |
+
|
| 288 |
+
---
|
| 289 |
+
|
| 290 |
+
## π Next Steps
|
| 291 |
+
|
| 292 |
+
1. **Test Locally:**
|
| 293 |
+
```bash
|
| 294 |
+
streamlit run app.py
|
| 295 |
+
```
|
| 296 |
+
|
| 297 |
+
2. **Verify Files:**
|
| 298 |
+
```bash
|
| 299 |
+
python verify_files.py
|
| 300 |
+
```
|
| 301 |
+
|
| 302 |
+
3. **Deploy to Hugging Face:**
|
| 303 |
+
- Follow `DEPLOYMENT_GUIDE.md`
|
| 304 |
+
- Or see `QUICKSTART.txt` for quick steps
|
| 305 |
+
|
| 306 |
+
4. **Share Your Space:**
|
| 307 |
+
- URL: `https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME`
|
| 308 |
+
|
| 309 |
+
---
|
| 310 |
+
|
| 311 |
+
## ⚠️ Important Notes
|
| 312 |
+
|
| 313 |
+
- All paths in `app.py` are relative to the script location
|
| 314 |
+
- Models load on first prediction (not at startup)
|
| 315 |
+
- Git LFS is required for files >10MB
|
| 316 |
+
- Free tier on Hugging Face is sufficient
|
| 317 |
+
- No API keys or secrets required
|
| 318 |
+
|
| 319 |
+
---
|
| 320 |
+
|
| 321 |
+
## π Support
|
| 322 |
+
|
| 323 |
+
- **Deployment Issues:** See `DEPLOYMENT_GUIDE.md`
|
| 324 |
+
- **File Issues:** Run `verify_files.py`
|
| 325 |
+
- **App Issues:** Check `app.py` comments
|
| 326 |
+
- **Hugging Face Help:** https://huggingface.co/docs/hub/spaces
|
| 327 |
+
|
| 328 |
+
---
|
| 329 |
+
|
| 330 |
+
**Status:** ✅ **READY FOR DEPLOYMENT**
|
| 331 |
+
|
| 332 |
+
This folder is complete and ready to be uploaded to Hugging Face Spaces!
|
| 333 |
+
|
QUICKSTART.txt
ADDED
|
@@ -0,0 +1,103 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 2 |
+
β CRYSTALLIZATION COMPONENT PREDICTOR - QUICK START GUIDE β
|
| 3 |
+
ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ
|
| 4 |
+
|
| 5 |
+
π FOLDER CONTENTS:
|
| 6 |
+
==================
|
| 7 |
+
✅ All files ready for Hugging Face deployment!
|
| 8 |
+
|
| 9 |
+
📦 WHAT'S INCLUDED:
|
| 10 |
+
===================
|
| 11 |
+
β’ app.py - Main Streamlit application
|
| 12 |
+
β’ requirements.txt - Python dependencies
|
| 13 |
+
β’ README.md - Documentation for Hugging Face
|
| 14 |
+
β’ models/ - All trained ML models (Simple & Advanced Baseline)
|
| 15 |
+
β’ visualizations/ - Performance comparison charts
|
| 16 |
+
β’ .gitattributes - Git LFS configuration
|
| 17 |
+
β’ DEPLOYMENT_GUIDE.md - Detailed deployment instructions
|
| 18 |
+
|
| 19 |
+
π FOLDER SIZE: ~46 MB
|
| 20 |
+
|
| 21 |
+
π DEPLOY TO HUGGING FACE:
|
| 22 |
+
===========================
|
| 23 |
+
|
| 24 |
+
OPTION 1: Web Upload (Easiest)
|
| 25 |
+
-------------------------------
|
| 26 |
+
1. Go to https://huggingface.co/spaces
|
| 27 |
+
2. Click "Create new Space"
|
| 28 |
+
3. Choose name, select "Streamlit" as SDK
|
| 29 |
+
4. Upload ALL files from this folder
|
| 30 |
+
5. Wait 2-5 minutes for build
|
| 31 |
+
6. Done! Your app is live
|
| 32 |
+
|
| 33 |
+
OPTION 2: Git Command Line
|
| 34 |
+
---------------------------
|
| 35 |
+
1. git clone https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME
|
| 36 |
+
2. Copy all files to cloned folder
|
| 37 |
+
3. git add .
|
| 38 |
+
4. git commit -m "Deploy crystallization predictor"
|
| 39 |
+
5. git push
|
| 40 |
+
|
| 41 |
+
π See DEPLOYMENT_GUIDE.md for detailed instructions!
|
| 42 |
+
|
| 43 |
+
🧪 TEST LOCALLY FIRST:
|
| 44 |
+
=======================
|
| 45 |
+
Windows: Double-click RUN_LOCAL.bat
|
| 46 |
+
Linux/Mac: bash run_local.sh
|
| 47 |
+
OR: streamlit run app.py
|
| 48 |
+
|
| 49 |
+
Then open: http://localhost:8501
|
| 50 |
+
|
| 51 |
+
✅ VERIFICATION:
|
| 52 |
+
================
|
| 53 |
+
Run: python verify_files.py
|
| 54 |
+
All files present: β
|
| 55 |
+
|
| 56 |
+
🎯 WHAT THE APP DOES:
|
| 57 |
+
======================
|
| 58 |
+
Predicts optimal crystallization components:
|
| 59 |
+
β’ Component Name (chemical compound)
|
| 60 |
+
β’ Concentration (molarity)
|
| 61 |
+
β’ pH (acidity level)
|
| 62 |
+
|
| 63 |
+
Based on your input parameters:
|
| 64 |
+
β’ Crystallization method
|
| 65 |
+
β’ Temperature
|
| 66 |
+
β’ pH
|
| 67 |
+
β’ Matthews coefficient
|
| 68 |
+
β’ Solvent content
|
| 69 |
+
|
| 70 |
+
π MODEL PERFORMANCE:
|
| 71 |
+
======================
|
| 72 |
+
Advanced Baseline (Recommended):
|
| 73 |
+
• Name Accuracy: 64.18%
|
| 74 |
+
• Concentration R²: 47.33%
|
| 75 |
+
• pH R²: 99.34%
|
| 76 |
+
|
| 77 |
+
Simple Baseline:
|
| 78 |
+
• Name Accuracy: 61.12%
|
| 79 |
+
• pH R²: 95.58%
|
| 80 |
+
• No concentration prediction
|
| 81 |
+
|
| 82 |
+
⚡ QUICK TIPS:
|
| 83 |
+
==============
|
| 84 |
+
β Use Advanced Baseline for complete predictions
|
| 85 |
+
β Test locally before deploying
|
| 86 |
+
β All dependencies are in requirements.txt
|
| 87 |
+
β Git LFS is configured for large files
|
| 88 |
+
β Models load on first prediction (intentional)
|
| 89 |
+
β Free tier on Hugging Face is sufficient
|
| 90 |
+
|
| 91 |
+
π NEED HELP?
|
| 92 |
+
=============
|
| 93 |
+
β’ Check DEPLOYMENT_GUIDE.md
|
| 94 |
+
β’ Visit https://huggingface.co/docs/hub/spaces
|
| 95 |
+
β’ Review app.py comments
|
| 96 |
+
|
| 97 |
+
π READY TO DEPLOY!
|
| 98 |
+
===================
|
| 99 |
+
Your Space URL will be:
|
| 100 |
+
https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
|
| 101 |
+
|
| 102 |
+
Good luck! ππ¬
|
| 103 |
+
|
README.md
CHANGED
|
@@ -1,19 +1,143 @@
|
|
| 1 |
-
---
|
| 2 |
-
title:
|
| 3 |
-
emoji:
|
| 4 |
-
colorFrom:
|
| 5 |
-
colorTo:
|
| 6 |
-
sdk:
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
title: Crystallization Component Predictor
|
| 3 |
+
emoji: 🔬
|
| 4 |
+
colorFrom: blue
|
| 5 |
+
colorTo: purple
|
| 6 |
+
sdk: streamlit
|
| 7 |
+
sdk_version: 1.29.0
|
| 8 |
+
app_file: app.py
|
| 9 |
+
pinned: false
|
| 10 |
+
license: mit
|
| 11 |
+
---
|
| 12 |
+
|
| 13 |
+
# 🔬 Crystallization Component Predictor
|
| 14 |
+
|
| 15 |
+
An interactive machine learning application for predicting optimal protein crystallization components based on experimental parameters.
|
| 16 |
+
|
| 17 |
+
## 🎯 What Does This App Do?
|
| 18 |
+
|
| 19 |
+
This tool predicts three critical crystallization parameters:
|
| 20 |
+
1. **Component Name**: The chemical compound most likely to produce crystals
|
| 21 |
+
2. **Concentration**: The optimal molarity for the component
|
| 22 |
+
3. **pH**: The ideal acidity/basicity level for crystallization
|
| 23 |
+
|
| 24 |
+
## 🚀 Quick Start
|
| 25 |
+
|
| 26 |
+
1. Select a model (Advanced Baseline recommended)
|
| 27 |
+
2. Input your crystallization parameters:
|
| 28 |
+
- Crystallization method
|
| 29 |
+
- Temperature
|
| 30 |
+
- pH
|
| 31 |
+
- Matthews coefficient
|
| 32 |
+
- Solvent content
|
| 33 |
+
3. Click "Predict Components"
|
| 34 |
+
4. Review predictions and download results
|
| 35 |
+
|
| 36 |
+
## 📊 Model Performance
|
| 37 |
+
|
| 38 |
+
| Model | Name Accuracy | Conc R² | pH R² |
|
| 39 |
+
|-------|--------------|---------|-------|
|
| 40 |
+
| Simple Baseline | 61.12% | N/A | 95.58% |
|
| 41 |
+
| **Advanced Baseline** ⭐ | **64.18%** | **47.33%** | **99.34%** |
|
| 42 |
+
| Transformer | 53.85% | 18.72% | 99.27% |
|
| 43 |
+
|
| 44 |
+
**Recommended:** Advanced Baseline for best overall performance
|
| 45 |
+
|
| 46 |
+
## 🔬 Features
|
| 47 |
+
|
| 48 |
+
- **Two Model Approaches**: Choose between Simple and Advanced Baseline
|
| 49 |
+
- **Interactive UI**: Easy-to-use sliders and dropdowns
|
| 50 |
+
- **Top-5 Predictions**: View confidence scores for multiple candidates
|
| 51 |
+
- **Visual pH Scale**: Intuitive pH visualization
|
| 52 |
+
- **Downloadable Results**: Export predictions as CSV
|
| 53 |
+
- **Performance Charts**: Compare model accuracies
|
| 54 |
+
|
| 55 |
+
## 🛠️ Technical Details
|
| 56 |
+
|
| 57 |
+
### Simple Baseline
|
| 58 |
+
- Random Forest for component classification
|
| 59 |
+
- XGBoost for pH regression
|
| 60 |
+
- 4 numerical features + TF-IDF of crystallization method
|
| 61 |
+
|
| 62 |
+
### Advanced Baseline (Recommended)
|
| 63 |
+
- Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
|
| 64 |
+
- 8 engineered features including interaction terms
|
| 65 |
+
- Separate models for name, concentration, and pH
|
| 66 |
+
- Log-transformed concentration predictions
|
| 67 |
+
|
| 68 |
+
### Models Included
|
| 69 |
+
- `simple_baseline/`: Simple baseline models
|
| 70 |
+
- `model_component_name.pkl`: Component classifier
|
| 71 |
+
- `model_component_ph.pkl`: pH regressor
|
| 72 |
+
- `label_encoder_name.pkl`: Label encoder
|
| 73 |
+
- `scaler.pkl`: Feature scaler
|
| 74 |
+
- `tfidf.pkl`: TF-IDF vectorizer
|
| 75 |
+
|
| 76 |
+
- `advanced_baseline/`: Advanced baseline models
|
| 77 |
+
- `model_component_name.pkl`: Enhanced component classifier
|
| 78 |
+
- `model_component_conc.pkl`: Concentration regressor
|
| 79 |
+
- `model_component_ph.pkl`: Enhanced pH regressor
|
| 80 |
+
- `label_encoder_name.pkl`: Label encoder
|
| 81 |
+
- `scaler.pkl`: Feature scaler
|
| 82 |
+
- `tfidf.pkl`: TF-IDF vectorizer
|
| 83 |
+
|
| 84 |
+
## 📦 Dependencies
|
| 85 |
+
|
| 86 |
+
- Python 3.9+
|
| 87 |
+
- Streamlit
|
| 88 |
+
- Scikit-learn
|
| 89 |
+
- XGBoost
|
| 90 |
+
- LightGBM
|
| 91 |
+
- CatBoost
|
| 92 |
+
- Pandas
|
| 93 |
+
- NumPy
|
| 94 |
+
- Joblib
|
| 95 |
+
|
| 96 |
+
## 🎓 Use Cases
|
| 97 |
+
|
| 98 |
+
- **Structural Biology**: Plan crystallization experiments
|
| 99 |
+
- **Drug Discovery**: Optimize protein crystal conditions
|
| 100 |
+
- **Research**: Explore crystallization parameter space
|
| 101 |
+
- **Education**: Learn about protein crystallization
|
| 102 |
+
|
| 103 |
+
## 📚 Background
|
| 104 |
+
|
| 105 |
+
Protein crystallization is essential for determining 3D protein structures via X-ray crystallography. This tool uses machine learning trained on historical crystallization data from the Protein Data Bank (PDB) to predict optimal conditions.
|
| 106 |
+
|
| 107 |
+
### Input Parameters Explained
|
| 108 |
+
|
| 109 |
+
- **Crystallization Method**: Technique used (vapor diffusion, batch, etc.)
|
| 110 |
+
- **Temperature**: Affects protein stability and crystal growth (typically 277-298K)
|
| 111 |
+
- **pH**: Critical for protein solubility and crystal formation (0-14 scale)
|
| 112 |
+
- **Matthews Coefficient**: Unit cell volume to protein molecular weight ratio (Å³/Da)
|
| 113 |
+
- **Solvent Content**: Percentage of solvent in crystal lattice (typically 30-70%)
|
| 114 |
+
|
| 115 |
+
## ⚠️ Important Notes
|
| 116 |
+
|
| 117 |
+
- **Validation Required**: Always validate predictions experimentally
|
| 118 |
+
- **Research Tool**: For research and educational purposes
|
| 119 |
+
- **Starting Point**: Use predictions as a guide, not absolute truth
|
| 120 |
+
- **Protein-Specific**: Results may vary based on your specific protein
|
| 121 |
+
|
| 122 |
+
## 🤝 Contributing
|
| 123 |
+
|
| 124 |
+
This is a research project. Feedback and suggestions are welcome!
|
| 125 |
+
|
| 126 |
+
## 📄 License
|
| 127 |
+
|
| 128 |
+
MIT License - Free to use for research and educational purposes
|
| 129 |
+
|
| 130 |
+
## 🙏 Acknowledgments
|
| 131 |
+
|
| 132 |
+
- Training data derived from Protein Data Bank (PDB)
|
| 133 |
+
- Built with Streamlit and ensemble ML models
|
| 134 |
+
- Inspired by advances in computational structural biology
|
| 135 |
+
|
| 136 |
+
## 📞 Contact & Support
|
| 137 |
+
|
| 138 |
+
For questions or issues, please open an issue on the repository.
|
| 139 |
+
|
| 140 |
+
---
|
| 141 |
+
|
| 142 |
+
**Note**: This tool provides predictions based on historical data. Always conduct proper experimental validation. Crystallization is a complex process influenced by many factors not captured by these models alone.
|
| 143 |
+
|
RUN_LOCAL.bat
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
@echo off
rem Launch the Crystallization Predictor locally with Streamlit.
rem Switch to the folder containing this script so that "app.py" is found even
rem when the script is started from a different working directory.
cd /d "%~dp0"

echo ========================================
echo Crystallization Predictor - Local Run
echo ========================================
echo.
echo Starting Streamlit app...
echo Press Ctrl+C to stop
echo.
echo App will open in your browser at:
echo http://localhost:8501
echo.
echo ----------------------------------------

streamlit run app.py

rem Keep the console window open so any error output stays readable.
pause
|
| 17 |
+
|
app.py
ADDED
|
@@ -0,0 +1,479 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Interactive Crystallization Component Predictor
|
| 3 |
+
===============================================
|
| 4 |
+
Streamlit app for Hugging Face Hub deployment
|
| 5 |
+
Predicts crystallization components using Simple Baseline and Advanced Baseline models
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import streamlit as st
|
| 9 |
+
import pandas as pd
|
| 10 |
+
import numpy as np
|
| 11 |
+
import joblib
|
| 12 |
+
import json
|
| 13 |
+
import os
|
| 14 |
+
import warnings
|
| 15 |
+
|
| 16 |
+
warnings.filterwarnings('ignore')
|
| 17 |
+
|
| 18 |
+
# Page config
|
| 19 |
+
# Streamlit requires set_page_config to run before any other st.* call.
st.set_page_config(
    page_title="Crystallization Predictor",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded",
)

# Directory containing this script; model and image paths are resolved against
# it so the app does not depend on the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))

# Title and introduction
st.title("🔬 Crystallization Component Predictor")
st.markdown("""
### Predict crystallization components using Machine Learning
This app uses trained machine learning models to predict the optimal components for protein crystallization
based on your experimental parameters.
""")
st.markdown("---")
|
| 37 |
+
|
| 38 |
+
# Sidebar
|
| 39 |
+
st.sidebar.header("⚙️ Model Selection")
approach = st.sidebar.radio(
    "Choose Approach:",
    ["Advanced Baseline (Recommended)", "Simple Baseline"],
    help="Advanced has concentration parsing and better accuracy",
)

st.sidebar.markdown("---")
st.sidebar.markdown("### 📊 Model Performance")

# Headline metrics for the selected approach. The figures are fixed values; the
# training_results.json files shipped next to the models were previously read
# here but never used, and a read failure hid these metrics entirely, so the
# dead load has been removed.
# NOTE(review): if the JSON files are meant to drive these numbers, wire them
# in once their schema is confirmed.
if "Simple" in approach:
    st.sidebar.metric("Name Accuracy", "61.12%")
    st.sidebar.metric("pH R²", "95.58%")
    st.sidebar.warning("⚠️ Conc: N/A")
else:
    st.sidebar.metric("Name Accuracy", "64.18%")
    st.sidebar.metric("Conc R²", "47.33%")
    st.sidebar.metric("pH R²", "99.34%")
    st.sidebar.success("✅ All metrics working!")

st.sidebar.markdown("---")
st.sidebar.markdown("""
### ℹ️ About
This tool predicts three key crystallization parameters:
- **Component Name**: The chemical compound
- **Concentration**: Amount in solution (M)
- **pH**: Acidity/basicity level

**Recommended:** Advanced Baseline for complete predictions
""")
|
| 84 |
+
|
| 85 |
+
# Input Form
|
| 86 |
+
st.header("🎯 Input Crystallization Parameters")

# Two-column form: experimental setup on the left, crystal properties on the right.
col1, col2 = st.columns(2)

with col1:
    st.markdown("#### Crystallization Setup")
    cryst_method = st.selectbox(
        "Crystallization Method",
        [
            "VAPOR DIFFUSION, SITTING DROP",
            "VAPOR DIFFUSION, HANGING DROP",
            "VAPOR DIFFUSION",
            "BATCH MODE",
            "MICROBATCH",
        ],
        help="Select the crystallization technique you're using",
    )

    temp = st.slider(
        "Temperature (K)",
        min_value=250.0, max_value=320.0, value=293.0, step=1.0,
        help="Typical room temperature is ~293K (20°C)",
    )

    ph = st.slider(
        "pH",
        min_value=0.0, max_value=14.0, value=7.0, step=0.1,
        help="Initial pH of your crystallization solution",
    )

with col2:
    st.markdown("#### Crystal Properties")
    matthews = st.slider(
        "Matthews Coefficient",
        min_value=1.0, max_value=4.5, value=2.2, step=0.1,
        help="Ratio of unit cell volume to protein molecular weight (Å³/Da)",
    )

    solvent = st.slider(
        "Percent Solvent Content (%)",
        min_value=0.0, max_value=100.0, value=45.0, step=1.0,
        help="Percentage of solvent in the crystal",
    )

st.markdown("---")
|
| 131 |
+
|
| 132 |
+
# Predict button
|
| 133 |
+
@st.cache_resource(show_spinner=False)
def _load_models(model_dir, with_concentration):
    """Load the fitted artifacts stored in *model_dir*, once per process.

    Cached so repeated clicks on the predict button do not re-read the pickles
    from disk. Returns a dict with keys 'name', 'ph', 'label_encoder',
    'scaler', 'tfidf' and, when *with_concentration* is true, 'conc'.
    Raises FileNotFoundError if an expected file is missing.
    """
    def _load(filename):
        return joblib.load(os.path.join(model_dir, filename))

    artifacts = {
        'name': _load('model_component_name.pkl'),
        'ph': _load('model_component_ph.pkl'),
        'label_encoder': _load('label_encoder_name.pkl'),
        'scaler': _load('scaler.pkl'),
        'tfidf': _load('tfidf.pkl'),
    }
    if with_concentration:
        artifacts['conc'] = _load('model_component_conc.pkl')
    return artifacts


def _classify_ph(value):
    """Return (label, emoji, CSS colour) for a pH value using the 6 / 8 cut-offs."""
    if value < 6:
        return "Acidic", "🔴", "red"
    if value < 8:
        return "Neutral", "🟢", "green"
    return "Basic", "🔵", "blue"


if st.button("🚀 Predict Components", type="primary", use_container_width=True):

    use_advanced = "Advanced" in approach

    try:
        with st.spinner("🔄 Loading models and making predictions..."):
            model_dir = os.path.join(
                BASE_DIR, 'models',
                'advanced_baseline' if use_advanced else 'simple_baseline',
            )
            models = _load_models(model_dir, use_advanced)
            model_name = models['name']
            le = models['label_encoder']

            if use_advanced:
                # Advanced Baseline needs 8 features: the 4 raw inputs plus
                # engineered interaction terms.
                numerical = np.array([[
                    temp, ph, matthews, solvent,
                    temp * ph,                    # temperature x pH interaction
                    matthews * solvent,           # Matthews x solvent interaction
                    0,                            # pH difference: unknown for a new prediction
                    solvent / (matthews + 1e-6),  # epsilon guards against division by zero
                ]])
            else:
                # Simple baseline: only the 4 raw features
                numerical = np.array([[temp, ph, matthews, solvent]])

            # Scale the numerical features and append the TF-IDF encoding of the method
            numerical_scaled = models['scaler'].transform(numerical)
            method_tfidf = models['tfidf'].transform([cryst_method.upper()]).toarray()
            X_pred = np.concatenate([numerical_scaled, method_tfidf], axis=1)

            # Component name: best class plus the five most probable candidates
            pred_name_idx = model_name.predict(X_pred)[0]
            pred_name = le.inverse_transform([pred_name_idx])[0]
            pred_name_proba = model_name.predict_proba(X_pred)[0]
            top_idx = np.argsort(pred_name_proba)[-5:][::-1]
            # predict_proba columns follow model.classes_, which need not equal
            # 0..n-1 (e.g. when some encoded labels were absent during fitting),
            # so map column positions to encoded labels before decoding them.
            class_labels = getattr(model_name, 'classes_', None)
            top_labels = top_idx if class_labels is None else np.asarray(class_labels)[top_idx]
            top_5_names = le.inverse_transform(top_labels)
            top_5_proba = pred_name_proba[top_idx]

            pred_ph = models['ph'].predict(X_pred)[0]

            if use_advanced:
                pred_conc = models['conc'].predict(X_pred)[0]

        # Display results
        st.success("✅ Predictions Complete!")
        st.markdown("---")

        st.header("📊 Prediction Results")

        # Component name
        st.subheader("1️⃣ Component_1_Name")
        st.markdown("**Most likely chemical component for crystallization:**")

        col1, col2 = st.columns([1, 2])

        with col1:
            st.metric("Predicted Component", pred_name)
            st.caption("Top prediction from the model")

        with col2:
            st.markdown("**Top 5 Predictions (with confidence):**")
            top5_df = pd.DataFrame({
                # Sized from the data so a model with fewer than 5 classes still renders
                'Rank': range(1, len(top_5_names) + 1),
                'Component': top_5_names,
                'Probability': [f"{p:.2%}" for p in top_5_proba],
            })
            st.dataframe(top5_df, hide_index=True, use_container_width=True)

        st.markdown("---")

        # Concentration
        st.subheader("2️⃣ Component_1_Conc")
        if use_advanced:
            # NOTE(review): assumes the regressor was trained on log10(molarity);
            # confirm against the training script (a natural-log or log1p target
            # would need a different inverse transform).
            actual_molarity = 10**pred_conc
            col1, col2 = st.columns(2)
            with col1:
                st.metric("Predicted Concentration (log-scale)", f"{pred_conc:.4f}")
            with col2:
                st.metric("Actual Molarity", f"{actual_molarity:.6f} M")

            st.info(f"💡 Use approximately **{actual_molarity:.6f} M** of {pred_name} in your crystallization trials")
        else:
            st.warning("⚠️ Not available in Simple Baseline - use Advanced Baseline for concentration predictions")

        st.markdown("---")

        # pH
        st.subheader("3️⃣ Component_1_pH")
        ph_class, ph_emoji, ph_color = _classify_ph(pred_ph)
        col1, col2 = st.columns([1, 2])

        with col1:
            st.metric("Predicted pH", f"{pred_ph:.2f}")
            st.caption(f"{ph_emoji} {ph_class} solution")

        with col2:
            # Static colour scale with the predicted value printed underneath
            st.markdown(f"""
            <div style='background: linear-gradient(to right, red, yellow, green, cyan, blue);
            height: 40px; border-radius: 10px; margin: 10px 0; border: 2px solid #333;'></div>
            <div style='display: flex; justify-content: space-between; font-size: 14px;'>
            <span><b>0</b> (Acidic)</span>
            <span><b>7</b> (Neutral)</span>
            <span><b>14</b> (Basic)</span>
            </div>
            <div style='text-align: center; margin-top: 15px;'>
            <b style='font-size: 24px; color: {ph_color};'>pH = {pred_ph:.2f}</b>
            </div>
            """, unsafe_allow_html=True)

        st.info(f"💡 Adjust your buffer to maintain pH ≈ **{pred_ph:.2f}** for optimal crystallization")

        # Input summary
        st.markdown("---")
        st.subheader("📥 Input Summary")
        input_df = pd.DataFrame({
            'Parameter': [
                'Crystallization Method',
                'Temperature',
                'Input pH',
                'Matthews Coefficient',
                'Solvent Content',
            ],
            'Value': [
                cryst_method,
                f"{temp:.1f} K ({temp-273.15:.1f}°C)",
                f"{ph:.1f}",
                f"{matthews:.2f} Å³/Da",
                f"{solvent:.1f}%",
            ],
        })
        st.table(input_df)

        # Download results
        st.markdown("---")
        st.subheader("💾 Download Results")

        results_dict = {
            'Crystallization Method': cryst_method,
            'Temperature (K)': temp,
            'Temperature (°C)': temp - 273.15,
            'Input pH': ph,
            'Matthews Coefficient': matthews,
            'Solvent Content (%)': solvent,
            'Predicted Component': pred_name,
            'Component Probability': f"{top_5_proba[0]:.4f}",
            'Predicted pH': f"{pred_ph:.2f}",
        }

        if use_advanced:
            results_dict['Predicted Concentration (log)'] = f"{pred_conc:.4f}"
            results_dict['Predicted Concentration (M)'] = f"{actual_molarity:.6f}"

        results_df = pd.DataFrame([results_dict])
        csv = results_df.to_csv(index=False)

        st.download_button(
            label="📥 Download Predictions as CSV",
            data=csv,
            file_name="crystallization_predictions.csv",
            mime="text/csv",
        )

    except FileNotFoundError as e:
        st.error(f"""
        ❌ **Model files not found!**

        Error: {e}

        Please ensure model files are in the correct directory:
        - `models/simple_baseline/`
        - `models/advanced_baseline/`
        """)
    except Exception as e:
        # Top-level UI boundary: show the failure instead of crashing the page
        st.error(f"❌ **Prediction Error:** {e}")
        with st.expander("🔍 Show full error details"):
            import traceback
            st.code(traceback.format_exc())
|
| 338 |
+
|
| 339 |
+
# Model Comparison Section
|
| 340 |
+
st.markdown("---")
st.header("📈 Model Comparison")

# Static comparison table; ⭐ marks the best value in a column.
comparison_df = pd.DataFrame({
    'Model': ['Simple Baseline', 'Advanced Baseline', 'Transformer'],
    'Name Accuracy': ['61.12%', '64.18% ⭐', '53.85%'],
    'Conc R²': ['N/A', '47.33%', '18.72%'],
    'pH R²': ['95.58%', '99.34% ⭐', '99.27%'],
    'Speed': ['⚡ Fast', '⚡ Fast', '🐌 Slow'],
    'Recommendation': ['Basic use', '✅ Best overall', 'Research only'],
})

st.dataframe(
    comparison_df,
    hide_index=True,
    use_container_width=True,
    # Keys must match the DataFrame column names above exactly
    column_config={
        "Model": st.column_config.TextColumn("Model", width="medium"),
        "Name Accuracy": st.column_config.TextColumn("Name Accuracy", width="medium"),
        "Conc R²": st.column_config.TextColumn("Concentration R²", width="medium"),
        "pH R²": st.column_config.TextColumn("pH R²", width="medium"),
    },
)

st.markdown("""
**Model Selection Guide:**
- **Simple Baseline**: Fast predictions, no concentration. Good for quick pH and component estimates.
- **Advanced Baseline**: ⭐ Recommended for most users. Includes all three predictions with high accuracy.
- **Transformer**: Deep learning approach, requires more data for better performance.
""")
|
| 370 |
+
|
| 371 |
+
# Visualizations Section
|
| 372 |
+
st.markdown("---")
st.header("📈 Performance Visualizations")

viz_path = os.path.join(BASE_DIR, 'visualizations')

# One entry per tab: (tab label, image file name, caption)
viz_tabs = [
    ("📊 Name Accuracy", '01_component_name_comparison.png',
     "Comparison of component name prediction accuracy across all models"),
    ("📈 Concentration R²", '02_component_conc_comparison.png',
     "Concentration prediction performance (R² scores)"),
    ("🧪 pH R²", '03_component_ph_comparison.png',
     "pH prediction performance (R² scores)"),
    ("🎯 Complete Comparison", '05_complete_comparison.png',
     "Comprehensive comparison of all approaches and metrics"),
]

if os.path.exists(viz_path):
    try:
        tabs = st.tabs([label for label, _, _ in viz_tabs])
        for tab, (_, filename, caption) in zip(tabs, viz_tabs):
            with tab:
                img_path = os.path.join(viz_path, filename)
                if os.path.exists(img_path):
                    st.image(img_path, use_column_width=True)
                    st.caption(caption)
                else:
                    # Say so explicitly rather than leaving the tab empty
                    st.info(f"Image not available: {filename}")
    except Exception as e:
        # Rendering failure is not fatal for the rest of the page; report it
        # as what it is instead of claiming the images are still loading.
        st.warning(f"Could not display visualizations: {e}")
else:
    st.info("📊 Visualization files not found in this deployment")
|
| 413 |
+
|
| 414 |
+
# Information Section
|
| 415 |
+
st.markdown("---")
st.header("ℹ️ How It Works")

# Collapsible help sections; st.markdown dedents the indented text blocks.
with st.expander("🔬 About Protein Crystallization"):
    st.markdown("""
    **Protein crystallization** is a crucial step in structural biology for determining 3D protein structures using X-ray crystallography.

    **Key Parameters:**
    - **Crystallization Method**: The technique used (e.g., vapor diffusion, batch mode)
    - **Temperature**: Affects protein stability and crystal growth
    - **pH**: Critical for protein solubility and crystal formation
    - **Matthews Coefficient**: Indicates crystal packing density
    - **Solvent Content**: Amount of solvent in the crystal lattice

    This tool helps predict optimal conditions based on historical crystallization data.
    """)

with st.expander("🤖 About the Models"):
    st.markdown("""
    **Simple Baseline:**
    - Random Forest classifier for component name
    - XGBoost regressor for pH
    - Uses 4 numerical features + TF-IDF of method

    **Advanced Baseline:**
    - Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
    - Includes concentration prediction with log-transformation
    - Uses 8 engineered features including interactions
    - Best overall performance: 64% name accuracy, 99% pH R²

    **Training Data:**
    - Based on protein crystallization experiments from PDB
    - Includes various crystallization methods and conditions
    - Models trained on structured crystallization data
    """)

with st.expander("📖 How to Use"):
    st.markdown("""
    1. **Select a model** in the sidebar (Advanced Baseline recommended)
    2. **Input your parameters**:
       - Choose crystallization method
       - Set temperature, pH, Matthews coefficient, and solvent content
    3. **Click "Predict Components"** to get predictions
    4. **Review results**:
       - Component name with confidence scores
       - Concentration (if using Advanced Baseline)
       - Optimal pH for crystallization
    5. **Download** results as CSV for your records

    💡 **Tip:** Start with the recommended default values and adjust based on your specific protein and experimental setup.
    """)

# Footer
st.markdown("---")
st.markdown("""
<div style='text-align: center; color: gray; padding: 20px;'>
<p><b>🔬 Crystallization Component Prediction System</b></p>
<p><i>Advanced Baseline achieves: 64% Name Accuracy | 47% Conc R² | 99% pH R²</i></p>
<p>Built with Scikit-learn, XGBoost, LightGBM, CatBoost & Streamlit</p>
<p style='font-size: 12px; margin-top: 10px;'>
For research and educational purposes. Validate predictions experimentally.
</p>
</div>
""", unsafe_allow_html=True)
|
| 479 |
+
|
models/advanced_baseline/label_encoder_name.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
|
| 3 |
+
size 116738
|
models/advanced_baseline/model_component_conc.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:88824da3404b10b0665cd9f854e79373b48b20fa26434599179c8c25b4076d7c
|
| 3 |
+
size 14164817
|
models/advanced_baseline/model_component_name.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51b4f1641a13b3bb649ffbb636c08b1b319243758c7baeb7b8c2ea90abea5561
|
| 3 |
+
size 11727850
|
models/advanced_baseline/model_component_ph.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f76b749b7b827e382b78e5401321671f45d937973b42b289ced7a03a7b598ba0
|
| 3 |
+
size 1452962
|
models/advanced_baseline/scaler.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:73fb3f7d2e6bddf8275ef0ed30515699685867b00f7fdca5c80076d50c972a56
|
| 3 |
+
size 1287
|
models/advanced_baseline/tfidf.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3122e3afec9af0916a8571b366e9a49b284adc6f72a4a63e3a2e5dab2c3d7a93
|
| 3 |
+
size 2552
|
models/advanced_baseline/training_results.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4541496759fff9ca683a388156eecf97c44839363e5a9948df380b388e7f9284
|
| 3 |
+
size 1112
|
models/simple_baseline/label_encoder_name.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
|
| 3 |
+
size 116738
|
models/simple_baseline/model_component_name.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:9a5b7b93b194da3be0b9a72c06f34cbd0f6810e043ab88ea83a25108de686c73
|
| 3 |
+
size 16985140
|
models/simple_baseline/model_component_ph.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5f906964a6ce338aca61bd7b5d1ace1f67daf1ece52eb44658ff539d0201dee3
|
| 3 |
+
size 1239942
|
models/simple_baseline/scaler.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:3c82727c7b6cf52a834e529b5fc01c43d28328465323f2945c331568b9e1095f
|
| 3 |
+
size 1079
|
models/simple_baseline/tfidf.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7bd17a22531b2aa52a6677af44fadc676329069d40614172fc64def0549bb207
|
| 3 |
+
size 2552
|
models/simple_baseline/training_results.json
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ba3ee92aff285a69afb5edb8d648d2bbf2d550b37af791248c66688747970297
|
| 3 |
+
size 701
|
requirements.txt
CHANGED
|
@@ -1,3 +1,17 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Crystallization Component Predictor - Requirements for Hugging Face Spaces
|
| 2 |
+
# Python 3.9+
|
| 3 |
+
|
| 4 |
+
# Core Dependencies
|
| 5 |
+
streamlit==1.29.0
|
| 6 |
+
pandas==2.1.4
|
| 7 |
+
numpy==1.26.2
|
| 8 |
+
|
| 9 |
+
# Machine Learning Models
|
| 10 |
+
scikit-learn==1.3.2
|
| 11 |
+
xgboost==2.0.3
|
| 12 |
+
lightgbm==4.1.0
|
| 13 |
+
catboost==1.2.2
|
| 14 |
+
|
| 15 |
+
# Utilities
|
| 16 |
+
joblib==1.3.2
|
| 17 |
+
|
run_local.sh
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
|
| 2 |
+
|
| 3 |
+
echo "========================================"
|
| 4 |
+
echo " Crystallization Predictor - Local Run"
|
| 5 |
+
echo "========================================"
|
| 6 |
+
echo ""
|
| 7 |
+
echo "Starting Streamlit app..."
|
| 8 |
+
echo "Press Ctrl+C to stop"
|
| 9 |
+
echo ""
|
| 10 |
+
echo "App will open in your browser at:"
|
| 11 |
+
echo "http://localhost:8501"
|
| 12 |
+
echo ""
|
| 13 |
+
echo "----------------------------------------"
|
| 14 |
+
|
| 15 |
+
streamlit run app.py
|
| 16 |
+
|
verify_files.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Verification script to check if all required files are present
|
| 3 |
+
Run this before deploying to Hugging Face
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
def check_file(filepath, required=True):
|
| 10 |
+
"""Check if a file exists and return status"""
|
| 11 |
+
exists = os.path.exists(filepath)
|
| 12 |
+
status = "✅" if exists else ("❌" if required else "⚠️")
|
| 13 |
+
req_text = "(required)" if required else "(optional)"
|
| 14 |
+
print(f"{status} {filepath} {req_text}")
|
| 15 |
+
return exists
|
| 16 |
+
|
| 17 |
+
def check_folder(folderpath, required=True):
|
| 18 |
+
"""Check if a folder exists and return status"""
|
| 19 |
+
exists = os.path.exists(folderpath) and os.path.isdir(folderpath)
|
| 20 |
+
status = "✅" if exists else ("❌" if required else "⚠️")
|
| 21 |
+
req_text = "(required)" if required else "(optional)"
|
| 22 |
+
print(f"{status} {folderpath}/ {req_text}")
|
| 23 |
+
if exists:
|
| 24 |
+
files = list(Path(folderpath).rglob('*'))
|
| 25 |
+
file_count = len([f for f in files if f.is_file()])
|
| 26 |
+
print(f" β Contains {file_count} file(s)")
|
| 27 |
+
return exists
|
| 28 |
+
|
| 29 |
+
def main():
|
| 30 |
+
print("=" * 60)
|
| 31 |
+
print(" Hugging Face Deployment - File Verification")
|
| 32 |
+
print("=" * 60)
|
| 33 |
+
print()
|
| 34 |
+
|
| 35 |
+
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 36 |
+
os.chdir(base_dir)
|
| 37 |
+
|
| 38 |
+
all_required_present = True
|
| 39 |
+
|
| 40 |
+
# Check essential files
|
| 41 |
+
print("π Essential Files:")
|
| 42 |
+
all_required_present &= check_file("app.py", required=True)
|
| 43 |
+
all_required_present &= check_file("requirements.txt", required=True)
|
| 44 |
+
all_required_present &= check_file("README.md", required=True)
|
| 45 |
+
print()
|
| 46 |
+
|
| 47 |
+
# Check configuration files
|
| 48 |
+
print("⚙️ Configuration Files:")
|
| 49 |
+
check_file(".gitattributes", required=True)
|
| 50 |
+
check_file(".gitignore", required=False)
|
| 51 |
+
print()
|
| 52 |
+
|
| 53 |
+
# Check documentation
|
| 54 |
+
print("π Documentation:")
|
| 55 |
+
check_file("DEPLOYMENT_GUIDE.md", required=False)
|
| 56 |
+
print()
|
| 57 |
+
|
| 58 |
+
# Check model folders
|
| 59 |
+
print("🤖 Model Files:")
|
| 60 |
+
simple_exists = check_folder("models/simple_baseline", required=True)
|
| 61 |
+
all_required_present &= simple_exists
|
| 62 |
+
|
| 63 |
+
if simple_exists:
|
| 64 |
+
all_required_present &= check_file("models/simple_baseline/model_component_name.pkl", required=True)
|
| 65 |
+
all_required_present &= check_file("models/simple_baseline/model_component_ph.pkl", required=True)
|
| 66 |
+
all_required_present &= check_file("models/simple_baseline/label_encoder_name.pkl", required=True)
|
| 67 |
+
all_required_present &= check_file("models/simple_baseline/scaler.pkl", required=True)
|
| 68 |
+
all_required_present &= check_file("models/simple_baseline/tfidf.pkl", required=True)
|
| 69 |
+
check_file("models/simple_baseline/training_results.json", required=False)
|
| 70 |
+
|
| 71 |
+
print()
|
| 72 |
+
|
| 73 |
+
advanced_exists = check_folder("models/advanced_baseline", required=True)
|
| 74 |
+
all_required_present &= advanced_exists
|
| 75 |
+
|
| 76 |
+
if advanced_exists:
|
| 77 |
+
all_required_present &= check_file("models/advanced_baseline/model_component_name.pkl", required=True)
|
| 78 |
+
all_required_present &= check_file("models/advanced_baseline/model_component_conc.pkl", required=True)
|
| 79 |
+
all_required_present &= check_file("models/advanced_baseline/model_component_ph.pkl", required=True)
|
| 80 |
+
all_required_present &= check_file("models/advanced_baseline/label_encoder_name.pkl", required=True)
|
| 81 |
+
all_required_present &= check_file("models/advanced_baseline/scaler.pkl", required=True)
|
| 82 |
+
all_required_present &= check_file("models/advanced_baseline/tfidf.pkl", required=True)
|
| 83 |
+
check_file("models/advanced_baseline/training_results.json", required=False)
|
| 84 |
+
|
| 85 |
+
print()
|
| 86 |
+
|
| 87 |
+
# Check visualizations
|
| 88 |
+
print("π Visualization Files:")
|
| 89 |
+
viz_exists = check_folder("visualizations", required=False)
|
| 90 |
+
if viz_exists:
|
| 91 |
+
check_file("visualizations/01_component_name_comparison.png", required=False)
|
| 92 |
+
check_file("visualizations/02_component_conc_comparison.png", required=False)
|
| 93 |
+
check_file("visualizations/03_component_ph_comparison.png", required=False)
|
| 94 |
+
check_file("visualizations/05_complete_comparison.png", required=False)
|
| 95 |
+
|
| 96 |
+
print()
|
| 97 |
+
print("=" * 60)
|
| 98 |
+
|
| 99 |
+
if all_required_present:
|
| 100 |
+
print("✅ SUCCESS! All required files are present.")
|
| 101 |
+
print(" You're ready to deploy to Hugging Face!")
|
| 102 |
+
print()
|
| 103 |
+
print("Next steps:")
|
| 104 |
+
print("1. Test locally: streamlit run app.py")
|
| 105 |
+
print("2. Follow DEPLOYMENT_GUIDE.md for deployment")
|
| 106 |
+
print("3. Upload entire folder to Hugging Face Spaces")
|
| 107 |
+
else:
|
| 108 |
+
print("❌ ERROR! Some required files are missing.")
|
| 109 |
+
print(" Please ensure all required files are present before deploying.")
|
| 110 |
+
|
| 111 |
+
print("=" * 60)
|
| 112 |
+
|
| 113 |
+
# Calculate total size
|
| 114 |
+
total_size = 0
|
| 115 |
+
for root, dirs, files in os.walk('.'):
|
| 116 |
+
for file in files:
|
| 117 |
+
filepath = os.path.join(root, file)
|
| 118 |
+
if os.path.exists(filepath):
|
| 119 |
+
total_size += os.path.getsize(filepath)
|
| 120 |
+
|
| 121 |
+
size_mb = total_size / (1024 * 1024)
|
| 122 |
+
print(f"\n📦 Total folder size: {size_mb:.2f} MB")
|
| 123 |
+
|
| 124 |
+
if size_mb > 500:
|
| 125 |
+
print("⚠️ Warning: Folder is quite large. Consider Git LFS for files >10MB")
|
| 126 |
+
|
| 127 |
+
return all_required_present
|
| 128 |
+
|
| 129 |
+
if __name__ == "__main__":
|
| 130 |
+
success = main()
|
| 131 |
+
exit(0 if success else 1)
|
| 132 |
+
|
visualizations/01_component_name_comparison.png
ADDED
|
Git LFS Details
|
visualizations/02_component_conc_comparison.png
ADDED
|
Git LFS Details
|
visualizations/03_component_ph_comparison.png
ADDED
|
Git LFS Details
|
visualizations/04_all_approaches_heatmap.png
ADDED
|
Git LFS Details
|
visualizations/05_complete_comparison.png
ADDED
|
Git LFS Details
|
visualizations/eda_01_missing_values_matrix.png
ADDED
|
Git LFS Details
|
visualizations/eda_02_missing_values_heatmap.png
ADDED
|
Git LFS Details
|
visualizations/eda_03_target_distributions.png
ADDED
|
Git LFS Details
|
visualizations/eda_04_feature_distributions.png
ADDED
|
Git LFS Details
|
visualizations/eda_05_correlation_matrix.png
ADDED
|
Git LFS Details
|