Spaces:

suvradeepp
/

BTP_2026

Build error

App Files Files Community

suvradeepp committed on Nov 14, 2025

Commit

49e8d95

verified ·

1 Parent(s): cea68af

Upload 34 files

Browse files

Files changed (34) hide show

.gitattributes +11 -35
.gitignore +46 -0
DEPLOYMENT_GUIDE.md +233 -0
FILE_STRUCTURE.md +333 -0
QUICKSTART.txt +103 -0
README.md +143 -19
RUN_LOCAL.bat +17 -0
app.py +479 -0
models/advanced_baseline/label_encoder_name.pkl +3 -0
models/advanced_baseline/model_component_conc.pkl +3 -0
models/advanced_baseline/model_component_name.pkl +3 -0
models/advanced_baseline/model_component_ph.pkl +3 -0
models/advanced_baseline/scaler.pkl +3 -0
models/advanced_baseline/tfidf.pkl +3 -0
models/advanced_baseline/training_results.json +3 -0
models/simple_baseline/label_encoder_name.pkl +3 -0
models/simple_baseline/model_component_name.pkl +3 -0
models/simple_baseline/model_component_ph.pkl +3 -0
models/simple_baseline/scaler.pkl +3 -0
models/simple_baseline/tfidf.pkl +3 -0
models/simple_baseline/training_results.json +3 -0
requirements.txt +17 -3
run_local.sh +16 -0
verify_files.py +132 -0
visualizations/01_component_name_comparison.png +3 -0
visualizations/02_component_conc_comparison.png +3 -0
visualizations/03_component_ph_comparison.png +3 -0
visualizations/04_all_approaches_heatmap.png +3 -0
visualizations/05_complete_comparison.png +3 -0
visualizations/eda_01_missing_values_matrix.png +3 -0
visualizations/eda_02_missing_values_heatmap.png +3 -0
visualizations/eda_03_target_distributions.png +3 -0
visualizations/eda_04_feature_distributions.png +3 -0
visualizations/eda_05_correlation_matrix.png +3 -0

.gitattributes CHANGED Viewed

@@ -1,35 +1,11 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

+# Git LFS Configuration for Hugging Face Hub
+# Track model artifacts, JSON/CSV data files, and images with Git LFS
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.json filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
+*.png filter=lfs diff=lfs merge=lfs -text
+*.jpg filter=lfs diff=lfs merge=lfs -text
+*.jpeg filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,46 @@

+# Python
+__pycache__/
+*.py[cod]
+*$py.class
+*.so
+.Python
+env/
+venv/
+ENV/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+# Virtual Environment
+venv/
+ENV/
+# IDE
+.vscode/
+.idea/
+*.swp
+*.swo
+*~
+# OS
+.DS_Store
+Thumbs.db
+# Streamlit
+.streamlit/
+# Logs
+*.log

DEPLOYMENT_GUIDE.md ADDED Viewed

	@@ -0,0 +1,233 @@

+# 🚀 Deployment Guide for Hugging Face Spaces
+This guide will help you deploy the Crystallization Component Predictor to Hugging Face Spaces.
+## 📋 Prerequisites
+1. A Hugging Face account (sign up at https://huggingface.co/)
+2. Git installed on your computer
+3. Git LFS installed (`git lfs install`)
+## 🔧 Step-by-Step Deployment
+### Option 1: Web UI Upload (Easiest)
+1. **Create a new Space:**
+   - Go to https://huggingface.co/spaces
+   - Click "Create new Space"
+   - Choose a name (e.g., "crystallization-predictor")
+   - Select **Streamlit** as the SDK
+   - Choose visibility (Public or Private)
+   - Click "Create Space"
+2. **Upload files:**
+   - Click "Files" tab in your Space
+   - Click "Add file" → "Upload files"
+   - Drag and drop ALL files from this `huggingface_app` folder:
+     - `app.py`
+     - `requirements.txt`
+     - `README.md`
+     - `.gitattributes`
+     - `.gitignore`
+     - `models/` folder (with all subfolders)
+     - `visualizations/` folder (with all images)
+   - Click "Commit changes to main"
+3. **Wait for build:**
+   - Hugging Face will automatically build your Space
+   - Check the "Logs" tab to monitor progress
+   - Usually takes 2-5 minutes
+4. **Test your app:**
+   - Once built, click on the "App" tab
+   - Your Streamlit app should be running!
+### Option 2: Git Command Line (Advanced)
+1. **Initialize Git LFS:**
+```bash
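+# Run from the directory that CONTAINS huggingface_app/ (do not cd into it), so the ../huggingface_app paths in step 3 resolve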
+git lfs install
+```
+2. **Clone your Space repository:**
+```bash
+git clone https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+cd YOUR_SPACE_NAME
+```
+3. **Copy files:**
+```bash
+# Copy all files from huggingface_app to your cloned repo
+# On Windows:
+xcopy ..\huggingface_app\* . /E /H /Y
+# On Linux/Mac:
+cp -r ../huggingface_app/. .   # the trailing "/." also copies dotfiles (.gitattributes, .gitignore)
+```
+4. **Commit and push:**
+```bash
+git add .
+git commit -m "Initial deployment of crystallization predictor"
+git push
+```
+5. **Check deployment:**
+   - Visit your Space URL: `https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`
+   - Monitor build logs in the "Logs" tab
+## 📁 Files Included
+```
+huggingface_app/
+├── app.py                          # Main Streamlit application
+├── requirements.txt                # Python dependencies
+├── README.md                       # Documentation (shown on Space page)
+├── .gitattributes                  # Git LFS configuration
+├── .gitignore                      # Files to ignore
+├── DEPLOYMENT_GUIDE.md            # This file
+├── models/
+│   ├── simple_baseline/
+│   │   ├── model_component_name.pkl
+│   │   ├── model_component_ph.pkl
+│   │   ├── label_encoder_name.pkl
+│   │   ├── scaler.pkl
+│   │   ├── tfidf.pkl
+│   │   └── training_results.json
+│   └── advanced_baseline/
+│       ├── model_component_name.pkl
+│       ├── model_component_conc.pkl
+│       ├── model_component_ph.pkl
+│       ├── label_encoder_name.pkl
+│       ├── scaler.pkl
+│       ├── tfidf.pkl
+│       └── training_results.json
+└── visualizations/
+    ├── 01_component_name_comparison.png
+    ├── 02_component_conc_comparison.png
+    ├── 03_component_ph_comparison.png
+    ├── 04_all_approaches_heatmap.png
+    └── 05_complete_comparison.png
+```
+## 🔍 Troubleshooting
+### Build Fails
+**Problem:** "Could not install packages due to an OSError"
+- **Solution:** Check that all dependencies in `requirements.txt` are compatible
+- Try pinning versions or using newer versions
+**Problem:** "ModuleNotFoundError"
+- **Solution:** Ensure the missing module is in `requirements.txt`
+### Model Loading Errors
+**Problem:** "FileNotFoundError: [Errno 2] No such file or directory: 'models/...'"
+- **Solution:** Verify all model files were uploaded correctly
+- Check that folder structure is preserved
+**Problem:** Large file upload fails
+- **Solution:** Ensure Git LFS is properly configured
+- Files over 10MB should use LFS (already configured in `.gitattributes`)
+### App Crashes
+**Problem:** "Memory limit exceeded"
+- **Solution:** Hugging Face Spaces have memory limits
+- Consider using smaller models or optimizing loading
+**Problem:** Slow loading
+- **Solution:** Models are loaded when a prediction is requested (not at startup)
+- This is intentional for faster app startup
+## 🎨 Customization
+### Change App Title/Icon
+Edit the `README.md` header:
+```yaml
+---
+title: Your Custom Title
+emoji: 🧬  # Change emoji
+colorFrom: blue  # Change colors
+colorTo: purple
+---
+```
+### Modify the App
+Edit `app.py` and commit changes. The Space will rebuild automatically.
+### Add More Models
+1. Add model files to `models/` folder
+2. Update `app.py` to load and use new models
+3. Update `README.md` to document changes
+## 📊 Monitoring
+- **Logs**: Check the "Logs" tab in your Space
+- **Analytics**: View usage statistics in Space settings
+- **Updates**: Any push to the main branch triggers a rebuild
+## 🔒 Security & Privacy
+- **Public Spaces**: Anyone can use your app and see the code
+- **Private Spaces**: Only you and collaborators can access
+- **No User Data**: The app doesn't collect or store user inputs
+- **Model Files**: Ensure you have rights to distribute the models
+## 💰 Costs
+- **Free Tier**:
+  - CPU: 2 vCPU, 16GB RAM
+  - Perfect for this app
+  - No credit card required
+- **Paid Tiers**:
+  - Available for GPU or more resources
+  - Not needed for this application
+## 🔗 Useful Links
+- Hugging Face Spaces Docs: https://huggingface.co/docs/hub/spaces
+- Streamlit Docs: https://docs.streamlit.io/
+- Git LFS: https://git-lfs.github.com/
+## 📞 Support
+If you encounter issues:
+1. Check the "Logs" tab in your Space
+2. Review Hugging Face Spaces documentation
+3. Search Hugging Face forums
+4. Open an issue on the repository
+## ✅ Pre-Deployment Checklist
+- [ ] All model files copied to `models/` folders
+- [ ] Visualizations copied to `visualizations/` folder
+- [ ] `requirements.txt` has all dependencies
+- [ ] `README.md` header configured with title/emoji
+- [ ] Tested app locally (`streamlit run app.py`)
+- [ ] Git LFS installed and configured
+- [ ] Hugging Face account created
+- [ ] Space created on Hugging Face
+## 🎉 Post-Deployment
+After successful deployment:
+1. Test all features in the live app
+2. Share your Space URL with others
+3. Monitor logs for any errors
+4. Consider adding:
+   - Example inputs/outputs
+   - Tutorial video
+   - Publication link
+   - Citation information
+---
+**Good luck with your deployment! 🚀**
+Your app will be accessible at:
+`https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME`

FILE_STRUCTURE.md ADDED Viewed

	@@ -0,0 +1,333 @@

+# 📁 Hugging Face Deployment - Complete File Structure
+## Overview
+This folder contains everything needed to deploy the Crystallization Component Predictor to Hugging Face Spaces.
+**Total Size:** ~46 MB
+**Status:** ✅ Ready for deployment
+---
+## 📂 Directory Structure
+```
+huggingface_app/
+│
+├── 📄 Core Application Files
+│   ├── app.py                          # Main Streamlit application (standalone)
+│   ├── requirements.txt                # Python dependencies for Hugging Face
+│   └── README.md                       # Hugging Face Space documentation
+│
+├── ⚙️ Configuration Files
+│   ├── .gitattributes                  # Git LFS configuration for large files
+│   └── .gitignore                      # Files to exclude from Git
+│
+├── 📚 Documentation
+│   ├── DEPLOYMENT_GUIDE.md             # Step-by-step deployment instructions
+│   ├── QUICKSTART.txt                  # Quick reference guide
+│   └── FILE_STRUCTURE.md               # This file
+│
+├── 🔧 Utility Scripts
+│   ├── verify_files.py                 # Verification script (check all files present)
+│   ├── RUN_LOCAL.bat                   # Windows: Run app locally
+│   └── run_local.sh                    # Linux/Mac: Run app locally
+│
+├── 🤖 models/
+│   │
+│   ├── simple_baseline/                # Simple Baseline models
+│   │   ├── model_component_name.pkl    # Random Forest classifier (name)
+│   │   ├── model_component_ph.pkl      # XGBoost regressor (pH)
+│   │   ├── label_encoder_name.pkl      # Label encoder for component names
+│   │   ├── scaler.pkl                  # StandardScaler for features
+│   │   ├── tfidf.pkl                   # TF-IDF vectorizer for methods
+│   │   └── training_results.json       # Training metrics
+│   │
+│   └── advanced_baseline/              # Advanced Baseline models
+│       ├── model_component_name.pkl    # Ensemble classifier (name)
+│       ├── model_component_conc.pkl    # Ensemble regressor (concentration)
+│       ├── model_component_ph.pkl      # Ensemble regressor (pH)
+│       ├── label_encoder_name.pkl      # Label encoder for component names
+│       ├── scaler.pkl                  # StandardScaler for features
+│       ├── tfidf.pkl                   # TF-IDF vectorizer for methods
+│       └── training_results.json       # Training metrics
+│
+└── 📊 visualizations/                  # Performance comparison charts
+    ├── 01_component_name_comparison.png
+    ├── 02_component_conc_comparison.png
+    ├── 03_component_ph_comparison.png
+    ├── 04_all_approaches_heatmap.png
+    ├── 05_complete_comparison.png
+    ├── eda_01_missing_values_matrix.png
+    ├── eda_02_missing_values_heatmap.png
+    ├── eda_03_target_distributions.png
+    ├── eda_04_feature_distributions.png
+    └── eda_05_correlation_matrix.png
+```
+---
+## 📋 File Descriptions
+### Core Application Files
+#### `app.py` (Main Application)
+- **Purpose:** Streamlit web application
+- **Key Features:**
+  - Model selection (Simple vs Advanced Baseline)
+  - Interactive parameter input
+  - Real-time predictions
+  - Top-5 component predictions with probabilities
+  - Visual pH scale
+  - Downloadable results (CSV)
+  - Performance visualizations
+  - Model comparison charts
+- **Dependencies:** All specified in `requirements.txt`
+- **Entry Point:** Yes - Hugging Face will run this automatically
+#### `requirements.txt`
+- **Purpose:** Python package dependencies
+- **Key Packages:**
+  - streamlit==1.29.0
+  - pandas==2.1.4
+  - numpy==1.26.2
+  - scikit-learn==1.3.2
+  - xgboost==2.0.3
+  - lightgbm==4.1.0
+  - catboost==1.2.2
+  - joblib==1.3.2
+- **Note:** Versions pinned for reproducibility
+#### `README.md`
+- **Purpose:** Documentation displayed on Hugging Face Space page
+- **Contains:**
+  - App description and features
+  - Model performance metrics
+  - Usage instructions
+  - Technical details
+  - Background information
+  - Acknowledgments
+- **Special:** YAML header configures Space appearance
+---
+### Configuration Files
+#### `.gitattributes`
+- **Purpose:** Git LFS (Large File Storage) configuration
+- **Tracks:**
+  - *.pkl (model files)
+  - *.pth (PyTorch models)
+  - *.json (results)
+  - *.csv (data files)
+  - *.png, *.jpg, *.jpeg (images)
+- **Why:** Files >10MB need LFS on Hugging Face
+#### `.gitignore`
+- **Purpose:** Exclude unnecessary files from Git
+- **Excludes:**
+  - Python cache (`__pycache__/`)
+  - Virtual environments
+  - IDE files
+  - OS files
+  - Logs
+---
+### Documentation Files
+#### `DEPLOYMENT_GUIDE.md`
+- **Purpose:** Complete deployment instructions
+- **Sections:**
+  - Prerequisites
+  - Step-by-step deployment (Web UI & Git CLI)
+  - Troubleshooting
+  - Customization
+  - Monitoring
+  - Security & privacy
+#### `QUICKSTART.txt`
+- **Purpose:** Quick reference for common tasks
+- **Format:** Plain text for easy viewing
+- **Content:** Essential info at a glance
+#### `FILE_STRUCTURE.md`
+- **Purpose:** This document - complete file inventory
+---
+### Utility Scripts
+#### `verify_files.py`
+- **Purpose:** Pre-deployment verification
+- **Checks:**
+  - All required files present
+  - Model files exist
+  - Folder structure correct
+  - Total size calculation
+- **Usage:** `python verify_files.py`
+#### `RUN_LOCAL.bat` (Windows)
+- **Purpose:** Launch app locally for testing
+- **Usage:** Double-click or run `RUN_LOCAL.bat`
+- **Opens:** http://localhost:8501
+#### `run_local.sh` (Linux/Mac)
+- **Purpose:** Launch app locally for testing
+- **Usage:** `bash run_local.sh`
+- **Opens:** http://localhost:8501
+---
+### Model Files
+#### Simple Baseline Models (6 files)
+**Performance:**
+- Name Accuracy: 61.12%
+- pH R²: 95.58%
+- Concentration: N/A
+**Files:**
+1. `model_component_name.pkl` - Random Forest classifier
+2. `model_component_ph.pkl` - XGBoost regressor
+3. `label_encoder_name.pkl` - Encode component names
+4. `scaler.pkl` - Feature normalization
+5. `tfidf.pkl` - Text vectorization
+6. `training_results.json` - Performance metrics
+#### Advanced Baseline Models (7 files)
+**Performance:**
+- Name Accuracy: 64.18% ⭐
+- Concentration R²: 47.33%
+- pH R²: 99.34% ⭐
+**Files:**
+1. `model_component_name.pkl` - Ensemble (RF + XGB + LGB + Cat)
+2. `model_component_conc.pkl` - Ensemble concentration regressor
+3. `model_component_ph.pkl` - Ensemble pH regressor
+4. `label_encoder_name.pkl` - Encode component names
+5. `scaler.pkl` - Feature normalization
+6. `tfidf.pkl` - Text vectorization
+7. `training_results.json` - Performance metrics
+---
+### Visualization Files (10 images)
+#### Model Comparison Charts
+- `01_component_name_comparison.png` - Name accuracy comparison
+- `02_component_conc_comparison.png` - Concentration R² comparison
+- `03_component_ph_comparison.png` - pH R² comparison
+- `04_all_approaches_heatmap.png` - Performance heatmap
+- `05_complete_comparison.png` - Comprehensive comparison
+#### EDA Visualizations
+- `eda_01_missing_values_matrix.png` - Missing data patterns
+- `eda_02_missing_values_heatmap.png` - Missing data heatmap
+- `eda_03_target_distributions.png` - Target variable distributions
+- `eda_04_feature_distributions.png` - Feature distributions
+- `eda_05_correlation_matrix.png` - Feature correlations
+---
+## 🚀 Deployment Checklist
+Before deploying to Hugging Face:
+- [x] ✅ All core files present (app.py, requirements.txt, README.md)
+- [x] ✅ Configuration files (.gitattributes, .gitignore)
+- [x] ✅ Simple Baseline models (6 files)
+- [x] ✅ Advanced Baseline models (7 files)
+- [x] ✅ Visualizations (10 images)
+- [x] ✅ Documentation complete
+- [x] ✅ Verification script passes
+- [x] ✅ Total size: 46.47 MB (within limits)
+- [ ] ⏳ Test locally (run `streamlit run app.py`)
+- [ ] ⏳ Deploy to Hugging Face
+- [ ] ⏳ Test live deployment
+---
+## 💡 Key Features
+### What Makes This Deployment Special
+1. **Self-Contained**: No external services or absolute file paths; models and images are bundled in this folder
+2. **Production-Ready**: All error handling included
+3. **User-Friendly**: Beautiful UI with helpful tooltips
+4. **Well-Documented**: Comprehensive README and guides
+5. **Verified**: Includes verification script
+6. **Git LFS Ready**: Configured for large model files
+7. **Cross-Platform**: Works on Windows, Linux, Mac
+### App Capabilities
+- ✅ Two model options (Simple & Advanced)
+- ✅ Interactive parameter input
+- ✅ Real-time predictions
+- ✅ Top-5 component suggestions
+- ✅ Confidence scores
+- ✅ Visual pH scale
+- ✅ Downloadable CSV results
+- ✅ Performance visualizations
+- ✅ Model comparison tables
+- ✅ Responsive design
+---
+## 📊 Statistics
+| Metric | Value |
+|--------|-------|
+| Total Files | 34 |
+| Python Scripts | 2 |
+| Model Files | 13 |
+| Images | 10 |
+| Documentation | 5 |
+| Total Size | 46.47 MB |
+| Largest File | model_component_name.pkl (~8 MB each) |
+---
+## 🔗 Next Steps
+1. **Test Locally:**
+   ```bash
+   streamlit run app.py
+   ```
+2. **Verify Files:**
+   ```bash
+   python verify_files.py
+   ```
+3. **Deploy to Hugging Face:**
+   - Follow `DEPLOYMENT_GUIDE.md`
+   - Or see `QUICKSTART.txt` for quick steps
+4. **Share Your Space:**
+   - URL: `https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME`
+---
+## ⚠️ Important Notes
+- All paths in `app.py` are relative to the script location
+- Models load when a prediction is requested (not at startup)
+- Git LFS is required for files >10MB
+- Free tier on Hugging Face is sufficient
+- No API keys or secrets required
+---
+## 📞 Support
+- **Deployment Issues:** See `DEPLOYMENT_GUIDE.md`
+- **File Issues:** Run `verify_files.py`
+- **App Issues:** Check `app.py` comments
+- **Hugging Face Help:** https://huggingface.co/docs/hub/spaces
+---
+**Status:** ✅ **READY FOR DEPLOYMENT**
+This folder is complete and ready to be uploaded to Hugging Face Spaces!

QUICKSTART.txt ADDED Viewed

	@@ -0,0 +1,103 @@

+╔══════════════════════════════════════════════════════════════════╗
+║    CRYSTALLIZATION COMPONENT PREDICTOR - QUICK START GUIDE      ║
+╚══════════════════════════════════════════════════════════════════╝
+📁 FOLDER CONTENTS:
+==================
+✅ All files ready for Hugging Face deployment!
+📦 WHAT'S INCLUDED:
+===================
+• app.py - Main Streamlit application
+• requirements.txt - Python dependencies
+• README.md - Documentation for Hugging Face
+• models/ - All trained ML models (Simple & Advanced Baseline)
+• visualizations/ - Performance comparison charts
+• .gitattributes - Git LFS configuration
+• DEPLOYMENT_GUIDE.md - Detailed deployment instructions
+📊 FOLDER SIZE: ~46 MB
+🚀 DEPLOY TO HUGGING FACE:
+===========================
+OPTION 1: Web Upload (Easiest)
+-------------------------------
+1. Go to https://huggingface.co/spaces
+2. Click "Create new Space"
+3. Choose name, select "Streamlit" as SDK
+4. Upload ALL files from this folder
+5. Wait 2-5 minutes for build
+6. Done! Your app is live
+OPTION 2: Git Command Line
+---------------------------
+1. git clone https://huggingface.co/spaces/YOUR_USERNAME/SPACE_NAME
+2. Copy all files to cloned folder
+3. git add .
+4. git commit -m "Deploy crystallization predictor"
+5. git push
+📖 See DEPLOYMENT_GUIDE.md for detailed instructions!
+🧪 TEST LOCALLY FIRST:
+=======================
+Windows:   Double-click RUN_LOCAL.bat
+Linux/Mac: bash run_local.sh
+OR:        streamlit run app.py
+Then open: http://localhost:8501
+✅ VERIFICATION:
+================
+Run: python verify_files.py
+All files present: ✅
+🎯 WHAT THE APP DOES:
+======================
+Predicts optimal crystallization components:
+• Component Name (chemical compound)
+• Concentration (molarity)
+• pH (acidity level)
+Based on your input parameters:
+• Crystallization method
+• Temperature
+• pH
+• Matthews coefficient
+• Solvent content
+📈 MODEL PERFORMANCE:
+======================
+Advanced Baseline (Recommended):
+• Name Accuracy: 64.18%
+• Concentration R²: 47.33%
+• pH R²: 99.34%
+Simple Baseline:
+• Name Accuracy: 61.12%
+• pH R²: 95.58%
+• No concentration prediction
+⚡ QUICK TIPS:
+==============
+✓ Use Advanced Baseline for complete predictions
+✓ Test locally before deploying
+✓ All dependencies are in requirements.txt
+✓ Git LFS is configured for large files
+✓ Models load when you request a prediction, not at startup (intentional)
+✓ Free tier on Hugging Face is sufficient
+📞 NEED HELP?
+=============
+• Check DEPLOYMENT_GUIDE.md
+• Visit https://huggingface.co/docs/hub/spaces
+• Review app.py comments
+🎉 READY TO DEPLOY!
+===================
+Your Space URL will be:
+https://huggingface.co/spaces/YOUR_USERNAME/YOUR_SPACE_NAME
+Good luck! 🚀🔬

README.md CHANGED Viewed

@@ -1,19 +1,143 @@
----
-title: BTP 2026
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Streamlit template space
----
-# Welcome to Streamlit!
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).

+---
+title: Crystallization Component Predictor
+emoji: 🔬
+colorFrom: blue
+colorTo: purple
+sdk: streamlit
+sdk_version: 1.29.0
+app_file: app.py
+pinned: false
+license: mit
+---
+# 🔬 Crystallization Component Predictor
+An interactive machine learning application for predicting optimal protein crystallization components based on experimental parameters.
+## 🎯 What Does This App Do?
+This tool predicts three critical crystallization parameters:
+1. **Component Name**: The chemical compound most likely to produce crystals
+2. **Concentration**: The optimal molarity for the component
+3. **pH**: The ideal acidity/basicity level for crystallization
+## 🚀 Quick Start
+1. Select a model (Advanced Baseline recommended)
+2. Input your crystallization parameters:
+   - Crystallization method
+   - Temperature
+   - pH
+   - Matthews coefficient
+   - Solvent content
+3. Click "Predict Components"
+4. Review predictions and download results
+## 📊 Model Performance
+| Model | Name Accuracy | Conc R² | pH R² |
+|-------|--------------|---------|-------|
+| Simple Baseline | 61.12% | N/A | 95.58% |
+| **Advanced Baseline** ⭐ | **64.18%** | **47.33%** | **99.34%** |
+| Transformer | 53.85% | 18.72% | 99.27% |
+**Recommended:** Advanced Baseline for best overall performance
+## 🔬 Features
+- **Two Model Approaches**: Choose between Simple and Advanced Baseline
+- **Interactive UI**: Easy-to-use sliders and dropdowns
+- **Top-5 Predictions**: View confidence scores for multiple candidates
+- **Visual pH Scale**: Intuitive pH visualization
+- **Downloadable Results**: Export predictions as CSV
+- **Performance Charts**: Compare model accuracies
+## 🛠️ Technical Details
+### Simple Baseline
+- Random Forest for component classification
+- XGBoost for pH regression
+- 4 numerical features + TF-IDF of crystallization method
+### Advanced Baseline (Recommended)
+- Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
+- 8 engineered features including interaction terms
+- Separate models for name, concentration, and pH
+- Log-transformed concentration predictions
+### Models Included
+- `simple_baseline/`: Simple baseline models
+  - `model_component_name.pkl`: Component classifier
+  - `model_component_ph.pkl`: pH regressor
+  - `label_encoder_name.pkl`: Label encoder
+  - `scaler.pkl`: Feature scaler
+  - `tfidf.pkl`: TF-IDF vectorizer
+- `advanced_baseline/`: Advanced baseline models
+  - `model_component_name.pkl`: Enhanced component classifier
+  - `model_component_conc.pkl`: Concentration regressor
+  - `model_component_ph.pkl`: Enhanced pH regressor
+  - `label_encoder_name.pkl`: Label encoder
+  - `scaler.pkl`: Feature scaler
+  - `tfidf.pkl`: TF-IDF vectorizer
+## 📦 Dependencies
+- Python 3.9+
+- Streamlit
+- Scikit-learn
+- XGBoost
+- LightGBM
+- CatBoost
+- Pandas
+- NumPy
+- Joblib
+## 🎓 Use Cases
+- **Structural Biology**: Plan crystallization experiments
+- **Drug Discovery**: Optimize protein crystal conditions
+- **Research**: Explore crystallization parameter space
+- **Education**: Learn about protein crystallization
+## 📖 Background
+Protein crystallization is essential for determining 3D protein structures via X-ray crystallography. This tool uses machine learning trained on historical crystallization data from the Protein Data Bank (PDB) to predict optimal conditions.
+### Input Parameters Explained
+- **Crystallization Method**: Technique used (vapor diffusion, batch, etc.)
+- **Temperature**: Affects protein stability and crystal growth (typically 277-298K)
+- **pH**: Critical for protein solubility and crystal formation (0-14 scale)
+- **Matthews Coefficient**: Unit cell volume to protein molecular weight ratio (Å³/Da)
+- **Solvent Content**: Percentage of solvent in crystal lattice (typically 30-70%)
+## ⚠️ Important Notes
+- **Validation Required**: Always validate predictions experimentally
+- **Research Tool**: For research and educational purposes
+- **Starting Point**: Use predictions as a guide, not absolute truth
+- **Protein-Specific**: Results may vary based on your specific protein
+## 🤝 Contributing
+This is a research project. Feedback and suggestions are welcome!
+## 📄 License
+MIT License - Free to use for research and educational purposes
+## 🙏 Acknowledgments
+- Training data derived from Protein Data Bank (PDB)
+- Built with Streamlit and ensemble ML models
+- Inspired by advances in computational structural biology
+## 📞 Contact & Support
+For questions or issues, please open an issue on the repository.
+---
+**Note**: This tool provides predictions based on historical data. Always conduct proper experimental validation. Crystallization is a complex process influenced by many factors not captured by these models alone.

RUN_LOCAL.bat ADDED Viewed

	@@ -0,0 +1,17 @@

+@echo off
+echo ========================================
+echo  Crystallization Predictor - Local Run
+echo ========================================
+echo.
+echo Starting Streamlit app...
+echo Press Ctrl+C to stop
+echo.
+echo App will open in your browser at:
+echo http://localhost:8501
+echo.
+echo ----------------------------------------
+streamlit run app.py
+pause

app.py ADDED Viewed

	@@ -0,0 +1,479 @@

+"""
+Interactive Crystallization Component Predictor
+===============================================
+Streamlit app for Hugging Face Hub deployment
+Predicts crystallization components using Simple Baseline and Advanced Baseline models
+"""
+import streamlit as st
+import pandas as pd
+import numpy as np
+import joblib
+import json
+import os
+import warnings
+warnings.filterwarnings('ignore')
+# Page config
+st.set_page_config(
+    page_title="Crystallization Predictor",
+    page_icon="🔬",
+    layout="wide",
+    initial_sidebar_state="expanded"
+)
+# Get the directory of this script
+BASE_DIR = os.path.dirname(os.path.abspath(__file__))
+# Title and Introduction
+st.title("🔬 Crystallization Component Predictor")
+st.markdown("""
+### Predict crystallization components using Machine Learning
+This app uses trained machine learning models to predict the optimal components for protein crystallization
+based on your experimental parameters.
+""")
+st.markdown("---")
+# Sidebar
+st.sidebar.header("⚙️ Model Selection")
+approach = st.sidebar.radio(
+    "Choose Approach:",
+    ["Advanced Baseline (Recommended)", "Simple Baseline"],
+    help="Advanced has concentration parsing and better accuracy"
+)
+st.sidebar.markdown("---")
+st.sidebar.markdown("### 📊 Model Performance")
+# Display performance metrics
+try:
+    simple_results_path = os.path.join(BASE_DIR, 'models', 'simple_baseline', 'training_results.json')
+    advanced_results_path = os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'training_results.json')
+    if os.path.exists(simple_results_path):
+        with open(simple_results_path, 'r') as f:
+            simple_results = json.load(f)
+    if os.path.exists(advanced_results_path):
+        with open(advanced_results_path, 'r') as f:
+            advanced_results = json.load(f)
+    if "Simple" in approach:
+        st.sidebar.metric("Name Accuracy", "61.12%")
+        st.sidebar.metric("pH R²", "95.58%")
+        st.sidebar.warning("⚠️ Conc: N/A")
+    else:
+        st.sidebar.metric("Name Accuracy", "64.18%")
+        st.sidebar.metric("Conc R²", "47.33%")
+        st.sidebar.metric("pH R²", "99.34%")
+        st.sidebar.success("✅ All metrics working!")
+except Exception as e:
+    st.sidebar.info(f"Using default metrics")
+st.sidebar.markdown("---")
+st.sidebar.markdown("""
+### ℹ️ About
+This tool predicts three key crystallization parameters:
+- **Component Name**: The chemical compound
+- **Concentration**: Amount in solution (M)
+- **pH**: Acidity/basicity level
+**Recommended:** Advanced Baseline for complete predictions
+""")
+# Input Form
+st.header("🎯 Input Crystallization Parameters")
+col1, col2 = st.columns(2)
+with col1:
+    st.markdown("#### Crystallization Setup")
+    cryst_method = st.selectbox(
+        "Crystallization Method",
+        [
+            "VAPOR DIFFUSION, SITTING DROP",
+            "VAPOR DIFFUSION, HANGING DROP",
+            "VAPOR DIFFUSION",
+            "BATCH MODE",
+            "MICROBATCH"
+        ],
+        help="Select the crystallization technique you're using"
+    )
+    temp = st.slider(
+        "Temperature (K)",
+        250.0, 320.0, 293.0, 1.0,
+        help="Typical room temperature is ~293K (20°C)"
+    )
+    ph = st.slider(
+        "pH",
+        0.0, 14.0, 7.0, 0.1,
+        help="Initial pH of your crystallization solution"
+    )
+with col2:
+    st.markdown("#### Crystal Properties")
+    matthews = st.slider(
+        "Matthews Coefficient",
+        1.0, 4.5, 2.2, 0.1,
+        help="Ratio of unit cell volume to protein molecular weight (Å³/Da)"
+    )
+    solvent = st.slider(
+        "Percent Solvent Content (%)",
+        0.0, 100.0, 45.0, 1.0,
+        help="Percentage of solvent in the crystal"
+    )
+st.markdown("---")
+# Predict button
+if st.button("🚀 Predict Components", type="primary", use_container_width=True):
+    try:
+        with st.spinner("🔄 Loading models and making predictions..."):
+            if "Advanced" in approach:
+                # Load advanced models
+                model_name = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'model_component_name.pkl'))
+                model_conc = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'model_component_conc.pkl'))
+                model_ph = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'model_component_ph.pkl'))
+                le = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'label_encoder_name.pkl'))
+                scaler = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'scaler.pkl'))
+                tfidf = joblib.load(os.path.join(BASE_DIR, 'models', 'advanced_baseline', 'tfidf.pkl'))
+                # Feature engineering (Advanced Baseline needs 8 features)
+                temp_ph_int = temp * ph
+                matthews_solvent_int = matthews * solvent
+                ph_diff = 0  # Unknown for new prediction
+                solvent_ratio = solvent / (matthews + 1e-6)
+                numerical = np.array([[temp, ph, matthews, solvent,
+                                      temp_ph_int, matthews_solvent_int,
+                                      ph_diff, solvent_ratio]])
+            else:
+                # Load simple models
+                model_name = joblib.load(os.path.join(BASE_DIR, 'models', 'simple_baseline', 'model_component_name.pkl'))
+                model_ph = joblib.load(os.path.join(BASE_DIR, 'models', 'simple_baseline', 'model_component_ph.pkl'))
+                le = joblib.load(os.path.join(BASE_DIR, 'models', 'simple_baseline', 'label_encoder_name.pkl'))
+                scaler = joblib.load(os.path.join(BASE_DIR, 'models', 'simple_baseline', 'scaler.pkl'))
+                tfidf = joblib.load(os.path.join(BASE_DIR, 'models', 'simple_baseline', 'tfidf.pkl'))
+                # Simple baseline: only 4 features
+                numerical = np.array([[temp, ph, matthews, solvent]])
+            # Scale numerical features
+            numerical_scaled = scaler.transform(numerical)
+            # TF-IDF for crystallization method
+            method_tfidf = tfidf.transform([cryst_method.upper()]).toarray()
+            # Combine features
+            X_pred = np.concatenate([numerical_scaled, method_tfidf], axis=1)
+            # Make predictions
+            pred_name_idx = model_name.predict(X_pred)[0]
+            pred_name = le.inverse_transform([pred_name_idx])[0]
+            pred_name_proba = model_name.predict_proba(X_pred)[0]
+            top_5_idx = np.argsort(pred_name_proba)[-5:][::-1]
+            top_5_names = le.inverse_transform(top_5_idx)
+            top_5_proba = pred_name_proba[top_5_idx]
+            pred_ph = model_ph.predict(X_pred)[0]
+            if "Advanced" in approach:
+                pred_conc = model_conc.predict(X_pred)[0]
+        # Display Results
+        st.success("✅ Predictions Complete!")
+        st.markdown("---")
+        st.header("📊 Prediction Results")
+        # Component Name
+        st.subheader("1️⃣ Component_1_Name")
+        st.markdown("**Most likely chemical component for crystallization:**")
+        col1, col2 = st.columns([1, 2])
+        with col1:
+            st.metric("Predicted Component", pred_name)
+            st.caption("Top prediction from the model")
+        with col2:
+            st.markdown("**Top 5 Predictions (with confidence):**")
+            top5_df = pd.DataFrame({
+                'Rank': range(1, 6),
+                'Component': top_5_names,
+                'Probability': [f"{p:.2%}" for p in top_5_proba]
+            })
+            st.dataframe(top5_df, hide_index=True, use_container_width=True)
+        st.markdown("---")
+        # Concentration
+        st.subheader("2️⃣ Component_1_Conc")
+        if "Advanced" in approach:
+            col1, col2 = st.columns(2)
+            with col1:
+                st.metric("Predicted Concentration (log-scale)", f"{pred_conc:.4f}")
+            with col2:
+                actual_molarity = 10**pred_conc
+                st.metric("Actual Molarity", f"{actual_molarity:.6f} M")
+            st.info(f"💡 Use approximately **{actual_molarity:.6f} M** of {pred_name} in your crystallization trials")
+        else:
+            st.warning("⚠️ Not available in Simple Baseline - use Advanced Baseline for concentration predictions")
+        st.markdown("---")
+        # pH
+        st.subheader("3️⃣ Component_1_pH")
+        col1, col2 = st.columns([1, 2])
+        with col1:
+            st.metric("Predicted pH", f"{pred_ph:.2f}")
+            # pH classification
+            if pred_ph < 6:
+                ph_class = "Acidic"
+                ph_emoji = "🔴"
+            elif pred_ph < 8:
+                ph_class = "Neutral"
+                ph_emoji = "🟢"
+            else:
+                ph_class = "Basic"
+                ph_emoji = "🔵"
+            st.caption(f"{ph_emoji} {ph_class} solution")
+        with col2:
+            # pH visualization
+            ph_percent = (pred_ph / 14) * 100
+            ph_color = "red" if pred_ph < 6 else ("green" if pred_ph < 8 else "blue")
+            st.markdown(f"""
+            <div style='background: linear-gradient(to right, red, yellow, green, cyan, blue);
+                        height: 40px; border-radius: 10px; margin: 10px 0; border: 2px solid #333;'></div>
+            <div style='display: flex; justify-content: space-between; font-size: 14px;'>
+                <span><b>0</b> (Acidic)</span>
+                <span><b>7</b> (Neutral)</span>
+                <span><b>14</b> (Basic)</span>
+            </div>
+            <div style='text-align: center; margin-top: 15px;'>
+                <b style='font-size: 24px; color: {ph_color};'>pH = {pred_ph:.2f}</b>
+            </div>
+            """, unsafe_allow_html=True)
+        st.info(f"💡 Adjust your buffer to maintain pH ≈ **{pred_ph:.2f}** for optimal crystallization")
+        # Input Summary
+        st.markdown("---")
+        st.subheader("📥 Input Summary")
+        input_df = pd.DataFrame({
+            'Parameter': [
+                'Crystallization Method',
+                'Temperature',
+                'Input pH',
+                'Matthews Coefficient',
+                'Solvent Content'
+            ],
+            'Value': [
+                cryst_method,
+                f"{temp:.1f} K ({temp-273.15:.1f}°C)",
+                f"{ph:.1f}",
+                f"{matthews:.2f} Ų/Da",
+                f"{solvent:.1f}%"
+            ]
+        })
+        st.table(input_df)
+        # Download Results
+        st.markdown("---")
+        st.subheader("💾 Download Results")
+        results_dict = {
+            'Crystallization Method': cryst_method,
+            'Temperature (K)': temp,
+            'Temperature (°C)': temp - 273.15,
+            'Input pH': ph,
+            'Matthews Coefficient': matthews,
+            'Solvent Content (%)': solvent,
+            'Predicted Component': pred_name,
+            'Component Probability': f"{top_5_proba[0]:.4f}",
+            'Predicted pH': f"{pred_ph:.2f}",
+        }
+        if "Advanced" in approach:
+            results_dict['Predicted Concentration (log)'] = f"{pred_conc:.4f}"
+            results_dict['Predicted Concentration (M)'] = f"{10**pred_conc:.6f}"
+        results_df = pd.DataFrame([results_dict])
+        csv = results_df.to_csv(index=False)
+        st.download_button(
+            label="📥 Download Predictions as CSV",
+            data=csv,
+            file_name="crystallization_predictions.csv",
+            mime="text/csv",
+        )
+    except FileNotFoundError as e:
+        st.error(f"""
+        ❌ **Model files not found!**
+        Error: {e}
+        Please ensure model files are in the correct directory:
+        - `models/simple_baseline/`
+        - `models/advanced_baseline/`
+        """)
+    except Exception as e:
+        st.error(f"❌ **Prediction Error:** {e}")
+        with st.expander("🔍 Show full error details"):
+            import traceback
+            st.code(traceback.format_exc())
+# Model Comparison Section
+st.markdown("---")
+st.header("📈 Model Comparison")
+comparison_df = pd.DataFrame({
+    'Model': ['Simple Baseline', 'Advanced Baseline', 'Transformer'],
+    'Name Accuracy': ['61.12%', '64.18% ⭐', '53.85%'],
+    'Conc R²': ['N/A', '47.33%', '18.72%'],
+    'pH R²': ['95.58%', '99.34% ⭐', '99.27%'],
+    'Speed': ['⚡ Fast', '⚡ Fast', '🐌 Slow'],
+    'Recommendation': ['Basic use', '✅ Best overall', 'Research only']
+})
+st.dataframe(
+    comparison_df,
+    hide_index=True,
+    use_container_width=True,
+    column_config={
+        "Model": st.column_config.TextColumn("Model", width="medium"),
+        "Name Accuracy": st.column_config.TextColumn("Name Accuracy", width="medium"),
+        "Conc R²": st.column_config.TextColumn("Concentration R²", width="medium"),
+        "pH R²": st.column_config.TextColumn("pH R²", width="medium"),
+    }
+)
+st.markdown("""
+**Model Selection Guide:**
+- **Simple Baseline**: Fast predictions, no concentration. Good for quick pH and component estimates.
+- **Advanced Baseline**: ⭐ Recommended for most users. Includes all three predictions with high accuracy.
+- **Transformer**: Deep learning approach, requires more data for better performance.
+""")
+# Visualizations Section
+st.markdown("---")
+st.header("📊 Performance Visualizations")
+viz_path = os.path.join(BASE_DIR, 'visualizations')
+if os.path.exists(viz_path):
+    try:
+        tab1, tab2, tab3, tab4 = st.tabs([
+            "📊 Name Accuracy",
+            "📈 Concentration R²",
+            "🧪 pH R²",
+            "🎯 Complete Comparison"
+        ])
+        with tab1:
+            img_path = os.path.join(viz_path, '01_component_name_comparison.png')
+            if os.path.exists(img_path):
+                st.image(img_path, use_column_width=True)
+                st.caption("Comparison of component name prediction accuracy across all models")
+        with tab2:
+            img_path = os.path.join(viz_path, '02_component_conc_comparison.png')
+            if os.path.exists(img_path):
+                st.image(img_path, use_column_width=True)
+                st.caption("Concentration prediction performance (R² scores)")
+        with tab3:
+            img_path = os.path.join(viz_path, '03_component_ph_comparison.png')
+            if os.path.exists(img_path):
+                st.image(img_path, use_column_width=True)
+                st.caption("pH prediction performance (R² scores)")
+        with tab4:
+            img_path = os.path.join(viz_path, '05_complete_comparison.png')
+            if os.path.exists(img_path):
+                st.image(img_path, use_column_width=True)
+                st.caption("Comprehensive comparison of all approaches and metrics")
+    except Exception as e:
+        st.info(f"Visualizations are being loaded... {e}")
+else:
+    st.info("📊 Visualization files not found in this deployment")
+# Information Section
+st.markdown("---")
+st.header("ℹ️ How It Works")
+with st.expander("🔬 About Protein Crystallization"):
+    st.markdown("""
+    **Protein crystallization** is a crucial step in structural biology for determining 3D protein structures using X-ray crystallography.
+    **Key Parameters:**
+    - **Crystallization Method**: The technique used (e.g., vapor diffusion, batch mode)
+    - **Temperature**: Affects protein stability and crystal growth
+    - **pH**: Critical for protein solubility and crystal formation
+    - **Matthews Coefficient**: Indicates crystal packing density
+    - **Solvent Content**: Amount of solvent in the crystal lattice
+    This tool helps predict optimal conditions based on historical crystallization data.
+    """)
+with st.expander("🤖 About the Models"):
+    st.markdown("""
+    **Simple Baseline:**
+    - Random Forest classifier for component name
+    - XGBoost regressor for pH
+    - Uses 4 numerical features + TF-IDF of method
+    **Advanced Baseline:**
+    - Ensemble of Random Forest, XGBoost, LightGBM, and CatBoost
+    - Includes concentration prediction with log-transformation
+    - Uses 8 engineered features including interactions
+    - Best overall performance: 64% name accuracy, 99% pH R²
+    **Training Data:**
+    - Based on protein crystallization experiments from PDB
+    - Includes various crystallization methods and conditions
+    - Models trained on structured crystallization data
+    """)
+with st.expander("📖 How to Use"):
+    st.markdown("""
+    1. **Select a model** in the sidebar (Advanced Baseline recommended)
+    2. **Input your parameters**:
+       - Choose crystallization method
+       - Set temperature, pH, Matthews coefficient, and solvent content
+    3. **Click "Predict Components"** to get predictions
+    4. **Review results**:
+       - Component name with confidence scores
+       - Concentration (if using Advanced Baseline)
+       - Optimal pH for crystallization
+    5. **Download** results as CSV for your records
+    💡 **Tip:** Start with the recommended default values and adjust based on your specific protein and experimental setup.
+    """)
+# Footer
+st.markdown("---")
+st.markdown("""
+<div style='text-align: center; color: gray; padding: 20px;'>
+    <p><b>🔬 Crystallization Component Prediction System</b></p>
+    <p><i>Advanced Baseline achieves: 64% Name Accuracy | 47% Conc R² | 99% pH R²</i></p>
+    <p>Built with Scikit-learn, XGBoost, LightGBM, CatBoost & Streamlit</p>
+    <p style='font-size: 12px; margin-top: 10px;'>
+        For research and educational purposes. Validate predictions experimentally.
+    </p>
+</div>
+""", unsafe_allow_html=True)

models/advanced_baseline/label_encoder_name.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
+size 116738

models/advanced_baseline/model_component_conc.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:88824da3404b10b0665cd9f854e79373b48b20fa26434599179c8c25b4076d7c
+size 14164817

models/advanced_baseline/model_component_name.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:51b4f1641a13b3bb649ffbb636c08b1b319243758c7baeb7b8c2ea90abea5561
+size 11727850

models/advanced_baseline/model_component_ph.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f76b749b7b827e382b78e5401321671f45d937973b42b289ced7a03a7b598ba0
+size 1452962

models/advanced_baseline/scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:73fb3f7d2e6bddf8275ef0ed30515699685867b00f7fdca5c80076d50c972a56
+size 1287

models/advanced_baseline/tfidf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3122e3afec9af0916a8571b366e9a49b284adc6f72a4a63e3a2e5dab2c3d7a93
+size 2552

models/advanced_baseline/training_results.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:4541496759fff9ca683a388156eecf97c44839363e5a9948df380b388e7f9284
+size 1112

models/simple_baseline/label_encoder_name.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a3bf8ed7fff50ffa1d456f097575742f082e715975d8bd5c5e5e78c0c72f10a0
+size 116738

models/simple_baseline/model_component_name.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9a5b7b93b194da3be0b9a72c06f34cbd0f6810e043ab88ea83a25108de686c73
+size 16985140

models/simple_baseline/model_component_ph.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5f906964a6ce338aca61bd7b5d1ace1f67daf1ece52eb44658ff539d0201dee3
+size 1239942

models/simple_baseline/scaler.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3c82727c7b6cf52a834e529b5fc01c43d28328465323f2945c331568b9e1095f
+size 1079

models/simple_baseline/tfidf.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7bd17a22531b2aa52a6677af44fadc676329069d40614172fc64def0549bb207
+size 2552

models/simple_baseline/training_results.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:ba3ee92aff285a69afb5edb8d648d2bbf2d550b37af791248c66688747970297
+size 701

requirements.txt CHANGED Viewed

@@ -1,3 +1,17 @@
-altair
-pandas
-streamlit

+# Crystallization Component Predictor - Requirements for Hugging Face Spaces
+# Python 3.9+
+# Core Dependencies
+streamlit==1.29.0
+pandas==2.1.4
+numpy==1.26.2
+# Machine Learning Models
+scikit-learn==1.3.2
+xgboost==2.0.3
+lightgbm==4.1.0
+catboost==1.2.2
+# Utilities
+joblib==1.3.2

run_local.sh ADDED Viewed

	@@ -0,0 +1,16 @@

+#!/bin/bash
+echo "========================================"
+echo " Crystallization Predictor - Local Run"
+echo "========================================"
+echo ""
+echo "Starting Streamlit app..."
+echo "Press Ctrl+C to stop"
+echo ""
+echo "App will open in your browser at:"
+echo "http://localhost:8501"
+echo ""
+echo "----------------------------------------"
+streamlit run app.py

verify_files.py ADDED Viewed

	@@ -0,0 +1,132 @@

+"""
+Verification script to check if all required files are present
+Run this before deploying to Hugging Face
+"""
+import os
+from pathlib import Path
+def check_file(filepath, required=True):
+    """Check if a file exists and return status"""
+    exists = os.path.exists(filepath)
+    status = "✅" if exists else ("❌" if required else "⚠️")
+    req_text = "(required)" if required else "(optional)"
+    print(f"{status} {filepath} {req_text}")
+    return exists
+def check_folder(folderpath, required=True):
+    """Check if a folder exists and return status"""
+    exists = os.path.exists(folderpath) and os.path.isdir(folderpath)
+    status = "✅" if exists else ("❌" if required else "⚠️")
+    req_text = "(required)" if required else "(optional)"
+    print(f"{status} {folderpath}/ {req_text}")
+    if exists:
+        files = list(Path(folderpath).rglob('*'))
+        file_count = len([f for f in files if f.is_file()])
+        print(f"   → Contains {file_count} file(s)")
+    return exists
+def main():
+    print("=" * 60)
+    print("  Hugging Face Deployment - File Verification")
+    print("=" * 60)
+    print()
+    base_dir = os.path.dirname(os.path.abspath(__file__))
+    os.chdir(base_dir)
+    all_required_present = True
+    # Check essential files
+    print("📄 Essential Files:")
+    all_required_present &= check_file("app.py", required=True)
+    all_required_present &= check_file("requirements.txt", required=True)
+    all_required_present &= check_file("README.md", required=True)
+    print()
+    # Check configuration files
+    print("⚙️ Configuration Files:")
+    check_file(".gitattributes", required=True)
+    check_file(".gitignore", required=False)
+    print()
+    # Check documentation
+    print("📚 Documentation:")
+    check_file("DEPLOYMENT_GUIDE.md", required=False)
+    print()
+    # Check model folders
+    print("🤖 Model Files:")
+    simple_exists = check_folder("models/simple_baseline", required=True)
+    all_required_present &= simple_exists
+    if simple_exists:
+        all_required_present &= check_file("models/simple_baseline/model_component_name.pkl", required=True)
+        all_required_present &= check_file("models/simple_baseline/model_component_ph.pkl", required=True)
+        all_required_present &= check_file("models/simple_baseline/label_encoder_name.pkl", required=True)
+        all_required_present &= check_file("models/simple_baseline/scaler.pkl", required=True)
+        all_required_present &= check_file("models/simple_baseline/tfidf.pkl", required=True)
+        check_file("models/simple_baseline/training_results.json", required=False)
+    print()
+    advanced_exists = check_folder("models/advanced_baseline", required=True)
+    all_required_present &= advanced_exists
+    if advanced_exists:
+        all_required_present &= check_file("models/advanced_baseline/model_component_name.pkl", required=True)
+        all_required_present &= check_file("models/advanced_baseline/model_component_conc.pkl", required=True)
+        all_required_present &= check_file("models/advanced_baseline/model_component_ph.pkl", required=True)
+        all_required_present &= check_file("models/advanced_baseline/label_encoder_name.pkl", required=True)
+        all_required_present &= check_file("models/advanced_baseline/scaler.pkl", required=True)
+        all_required_present &= check_file("models/advanced_baseline/tfidf.pkl", required=True)
+        check_file("models/advanced_baseline/training_results.json", required=False)
+    print()
+    # Check visualizations
+    print("📊 Visualization Files:")
+    viz_exists = check_folder("visualizations", required=False)
+    if viz_exists:
+        check_file("visualizations/01_component_name_comparison.png", required=False)
+        check_file("visualizations/02_component_conc_comparison.png", required=False)
+        check_file("visualizations/03_component_ph_comparison.png", required=False)
+        check_file("visualizations/05_complete_comparison.png", required=False)
+    print()
+    print("=" * 60)
+    if all_required_present:
+        print("✅ SUCCESS! All required files are present.")
+        print("   You're ready to deploy to Hugging Face!")
+        print()
+        print("Next steps:")
+        print("1. Test locally: streamlit run app.py")
+        print("2. Follow DEPLOYMENT_GUIDE.md for deployment")
+        print("3. Upload entire folder to Hugging Face Spaces")
+    else:
+        print("❌ ERROR! Some required files are missing.")
+        print("   Please ensure all required files are present before deploying.")
+    print("=" * 60)
+    # Calculate total size
+    total_size = 0
+    for root, dirs, files in os.walk('.'):
+        for file in files:
+            filepath = os.path.join(root, file)
+            if os.path.exists(filepath):
+                total_size += os.path.getsize(filepath)
+    size_mb = total_size / (1024 * 1024)
+    print(f"\n📦 Total folder size: {size_mb:.2f} MB")
+    if size_mb > 500:
+        print("⚠️  Warning: Folder is quite large. Consider Git LFS for files >10MB")
+    return all_required_present
+if __name__ == "__main__":
+    success = main()
+    exit(0 if success else 1)