manhteky123 committed
Commit 643395e · verified · 1 parent: b181820

Upload 23 files

Files changed (2)
  1. Dockerfile +67 -156
  2. README.md +1 -208
Dockerfile CHANGED
@@ -1,156 +1,67 @@
- # Start with NVIDIA CUDA base image
- FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
-
- # Set environment variables
- ENV PYTHONDONTWRITEBYTECODE=1
- ENV PYTHONUNBUFFERED=1
- ENV DEBIAN_FRONTEND=noninteractive
-
- # Set working directory
- WORKDIR /app
-
- # Install system dependencies
- RUN apt-get update && apt-get install -y \
- git \
- wget \
- python3-pip \
- python3-dev \
- && rm -rf /var/lib/apt/lists/*
-
- # Create symlink for python
- RUN ln -sf /usr/bin/python3 /usr/bin/python
-
- # Copy requirements files
- COPY requirements_lavis.txt /app/
- COPY requirements_emo.txt /app/
-
- # Install Python dependencies
- RUN pip3 install --no-cache-dir --upgrade pip
- RUN pip3 install --no-cache-dir -r requirements_lavis.txt
- RUN pip3 install --no-cache-dir gradio
-
- # Install PyTorch with CUDA 11.8
- RUN pip3 install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
-
- # Clone LAVIS repository
- RUN git clone https://github.com/salesforce/LAVIS.git
- WORKDIR /app/LAVIS
- RUN pip3 install -e .
- WORKDIR /app
-
- # Create directories for model weights
- RUN mkdir -p /app/LAVIS/lavis/weight/vicuna-7b-2/
- RUN mkdir -p /app/LAVIS/lavis/models/blip2_models/
-
- # Copy model files
- COPY blip2_vicuna_instruct.py /app/LAVIS/lavis/models/blip2_models/
- COPY FT.yaml /app/LAVIS/
-
- # Download trained weights
- RUN mkdir -p /app/weights
-
- # Create Gradio app
- COPY app.py /app/app.py
-
- # Create start script with model setup
- RUN echo '#!/bin/bash\n\
- # Download Vicuna model if not present\n\
- MODEL_PATH="/app/LAVIS/lavis/weight/vicuna-7b-2"\n\
- WEIGHTS_URL="https://drive.google.com/file/d/1zaYOSlt3mLVMdiNfAKdJcwvVc-4LHfdr/view?usp=drive_link"\n\
- \n\
- # Check if we need to download Vicuna model weights\n\
- if [ ! -f "$MODEL_PATH/config.json" ]; then\n\
- echo "Downloading Vicuna-7b model weights..."\n\
- apt-get update && apt-get install -y git-lfs && rm -rf /var/lib/apt/lists/*\n\
- git lfs install\n\
- git clone https://huggingface.co/lmsys/vicuna-7b-v1.1 $MODEL_PATH\n\
- echo "Vicuna model downloaded successfully!"\n\
- fi\n\
- \n\
- # Download EmoVIT trained weights if not present\n\
- if [ ! -f "/app/weights/model_weights1.pth" ]; then\n\
- echo "Downloading EmoVIT trained weights..."\n\
- apt-get update && apt-get install -y curl gdown && rm -rf /var/lib/apt/lists/*\n\
- gdown --id 1zaYOSlt3mLVMdiNfAKdJcwvVc-4LHfdr -O /app/weights/model_weights1.pth\n\
- echo "EmoVIT weights downloaded successfully!"\n\
- fi\n\
- \n\
- # Start the app\n\
- python /app/app.py\n'\
- > /app/start.sh
-
- RUN chmod +x /app/start.sh
-
- # Create a proper app.py with Gradio interface
- RUN echo 'import gradio as gr\n\
- import torch\n\
- import os\n\
- from PIL import Image\n\
- from lavis.models import load_model_and_preprocess\n\
- \n\
- # Set device\n\
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")\n\
- print(f"Using device: {device}")\n\
- \n\
- # Set model path\n\
- os.environ["TORCH_HOME"] = "/app/weights"\n\
- \n\
- # Load the model\n\
- print("Loading EmoVIT model...")\n\
- model, vis_processors, txt_processors = load_model_and_preprocess(\n\
- name="blip2_vicuna_instruct",\n\
- model_type="vicuna7b",\n\
- is_eval=True,\n\
- device=device\n\
- )\n\
- \n\
- # Load the fine-tuned weights\n\
- if os.path.exists("/app/weights/model_weights1.pth"):\n\
- print("Loading fine-tuned weights...")\n\
- model.load_state_dict(torch.load("/app/weights/model_weights1.pth", map_location=device))\n\
- print("Fine-tuned weights loaded successfully!")\n\
- else:\n\
- print("Warning: Fine-tuned weights not found!")\n\
- \n\
- print("Model initialization complete!")\n\
- \n\
- def predict(image, prompt):\n\
- if image is None:\n\
- return "Please upload an image."\n\
- \n\
- # Process the image\n\
- image_tensor = vis_processors["eval"](image).unsqueeze(0).to(device)\n\
- \n\
- # For emotion reasoning, format prompt if needed\n\
- if "reason" in prompt.lower() and not prompt.lower().startswith("predicted emotion"):\n\
- prompt = f"Predicted emotion: [emotion]. Reason: [explanation]. {prompt}"\n\
- \n\
- # Generate response\n\
- with torch.no_grad():\n\
- response = model.generate({{"image": image_tensor, "prompt": prompt}})\n\
- \n\
- return response[0]\n\
- \n\
- # Define Gradio interface with examples\n\
- examples = [\n\
- ["example_image.jpg", "What emotion is expressed in this image?"],\n\
- ["example_image.jpg", "Predicted emotion: [emotion]. Reason: [explanation]."],\n\
- ]\n\
- \n\
- demo = gr.Interface(\n\
- fn=predict,\n\
- inputs=[\n\
- gr.Image(type="pil", label="Upload Image"),\n\
- gr.Textbox(lines=2, placeholder="Enter your prompt here...", label="Prompt")\n\
- ],\n\
- outputs=gr.Textbox(label="Model Response"),\n\
- title="EmoVIT: Visual Emotion Analysis with Instruction Tuning",\n\
- description="Upload an image and enter a prompt to analyze emotions. For emotion reasoning, format your prompt as: \\"Predicted emotion: [emotion]. Reason: [explanation].\\"")\n\
- \n\
- # Launch the app\n\
- if __name__ == "__main__":\n\
- demo.launch(server_name="0.0.0.0", server_port=7860)\n'\
- > /app/app.py
-
- # Set the entry point
- ENTRYPOINT ["/app/start.sh"]
 
+ FROM nvidia/cuda:11.8.0-cudnn8-runtime-ubuntu20.04
+
+ WORKDIR /app
+
+ # Set environment variables
+ ENV PYTHONDONTWRITEBYTECODE=1 \
+ PYTHONUNBUFFERED=1 \
+ DEBIAN_FRONTEND=noninteractive \
+ TRANSFORMERS_CACHE=/app/.cache/transformers \
+ HF_HOME=/app/.cache/huggingface \
+ TORCH_HOME=/app/.cache/torch \
+ HF_DATASETS_CACHE=/app/.cache/datasets
+
+ # Install basic dependencies
+ RUN apt-get update && apt-get install -y --no-install-recommends \
+ python3.8 \
+ python3.8-dev \
+ python3-pip \
+ python3-setuptools \
+ git \
+ wget \
+ ca-certificates \
+ && apt-get clean \
+ && rm -rf /var/lib/apt/lists/*
+
+ # Create symbolic link for python
+ RUN ln -sf /usr/bin/python3.8 /usr/bin/python
+
+ # Upgrade pip
+ RUN pip install --no-cache-dir --upgrade pip
+
+ # Create cache directories
+ RUN mkdir -p /app/.cache/transformers \
+ /app/.cache/huggingface \
+ /app/.cache/torch \
+ /app/.cache/datasets
+
+ # Clone LAVIS repository to temp directory for installation
+ RUN git clone https://github.com/salesforce/LAVIS.git /tmp/LAVIS \
+ && cd /tmp/LAVIS \
+ && sed -i '/open3d/d' requirements.txt \
+ && pip install --no-cache-dir -e . \
+ && cd / \
+ && cp -r /tmp/LAVIS/lavis /app/LAVIS/ \
+ && rm -rf /tmp/LAVIS
+
+ # Install PyTorch with CUDA support
+ RUN pip install --no-cache-dir torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --extra-index-url https://download.pytorch.org/whl/cu118
+
+ # Copy requirements files and install dependencies
+ COPY requirements_lavis.txt requirements_emo.txt ./
+ RUN pip install --no-cache-dir -r requirements_lavis.txt -r requirements_emo.txt
+
+ # Copy model and application files
+ COPY app.py blip2_vicuna_instruct.py ./
+ COPY static/ ./static/
+ COPY templates/ ./templates/
+ COPY LAVIS/ ./LAVIS/
+
+ # Create directory for model weights (to be mounted or downloaded at runtime)
+ RUN mkdir -p ./LAVIS/lavis/weight/vicuna-7b-2/
+
+ # Set up a volume for persistent cache
+ VOLUME /app/.cache
+
+ # Set the default command to run the Flask app
+ CMD ["python", "app.py"]
 
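Unlike the old image, the new one defers all model downloads to runtime: `app.py` (copied above but not included in this diff) is expected to load BLIP2-Vicuna at startup, with Hugging Face and Torch downloads landing under the `/app/.cache` volume. Below is a minimal sketch of that bootstrap, adapted from the Gradio app embedded in the old Dockerfile; the fine-tuned weight path is the one the removed `start.sh` used and is an assumption here.

```python
import os

import torch
from lavis.models import load_model_and_preprocess

# Caches resolve under /app/.cache thanks to the ENV lines above, so mounting
# that volume keeps the Vicuna/EVA-CLIP downloads across container restarts.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model, vis_processors, txt_processors = load_model_and_preprocess(
    name="blip2_vicuna_instruct",
    model_type="vicuna7b",
    is_eval=True,
    device=device,
)

# Fine-tuned EmoVIT weights are expected to be mounted or downloaded at runtime
# (path assumed from the removed start.sh, not confirmed by this commit).
weights_path = "/app/weights/model_weights1.pth"
if os.path.exists(weights_path):
    model.load_state_dict(torch.load(weights_path, map_location=device))
```

Declaring `/app/.cache` as a `VOLUME` is what makes this runtime-download approach practical: the large checkpoint downloads persist across container restarts instead of being re-fetched on every boot.
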
README.md CHANGED
@@ -1,211 +1,4 @@
- ---
- title: EmoVIT
- emoji: 😻
- colorFrom: gray
- colorTo: purple
- sdk: docker
- pinned: false
- ---
-
- # EmoVIT - Emotion Detection with BLIP2-Vicuna
-
- 🚀 **AI-Powered Emotion Detection Web Application**
-
- EmoVIT is a sophisticated emotion detection application that leverages the power of BLIP2-Vicuna model to analyze emotions in images through natural language understanding.
-
- ## 🌟 Features
-
- - **🖼️ Image Upload**: Easy drag-and-drop or click-to-upload interface
- - **🧠 AI Analysis**: Advanced emotion detection using BLIP2-Vicuna model
- - **💬 Custom Prompts**: Personalize your analysis with custom text prompts
- - **🎨 Beautiful UI**: Modern, responsive design with smooth animations
- - **⚡ Real-time Processing**: Fast inference with optimized model loading
- - **📱 Mobile Friendly**: Works seamlessly on all devices
-
- ## 🛠️ Technology Stack
-
- - **Backend**: Flask (Python web framework)
- - **AI Model**: BLIP2-Vicuna (Vision-Language model)
- - **Frontend**: HTML5, CSS3, JavaScript, Bootstrap 5
- - **Deployment**: Docker + Hugging Face Spaces
-
- ## 🚀 Quick Start
-
- ### Local Development
-
- 1. **Clone the repository**
- ```bash
- git clone <your-repo-url>
- cd EmoVIT
- ```
-
- 2. **Install dependencies**
- ```bash
- pip install -r requirements.txt
- ```
-
- 3. **Run the application**
- ```bash
- python app.py
- ```
-
- 4. **Open in browser**
- Navigate to `http://localhost:7860`
-
- ### Docker Deployment
-
- 1. **Build the Docker image**
- ```bash
- docker build -t emovit .
- ```
-
- 2. **Run the container**
- ```bash
- docker run -p 7860:7860 emovit
- ```
-
- ## 🌐 Hugging Face Spaces Deployment
-
- This application is configured for seamless deployment on Hugging Face Spaces:
-
- 1. **Create a new Space** on [Hugging Face Spaces](https://huggingface.co/spaces)
- 2. **Select Docker** as the SDK
- 3. **Upload your files** to the Space repository
- 4. **The app will automatically deploy** using the provided Dockerfile
-
- ### Required Files for HF Spaces:
- - `app.py` - Main Flask application
- - `Dockerfile` - Container configuration
- - `requirements.txt` - Python dependencies
- - `templates/` - HTML templates
- - `static/` - CSS and static assets
- - `blip2_vicuna_instruct.py` - Model implementation
-
- ## 📁 Project Structure
-
- ```
- EmoVIT/
- ├── app.py                     # Main Flask application
- ├── blip2_vicuna_instruct.py   # BLIP2-Vicuna model implementation
- ├── requirements.txt           # Python dependencies
- ├── Dockerfile                 # Docker configuration
- ├── README.md                  # This file
- ├── templates/
- │   └── index.html             # Main HTML template
- ├── static/
- │   └── css/
- │       └── style.css          # Custom CSS styles
- └── emo/                       # Emotion datasets and utilities
-     ├── train.json
-     ├── val.json
-     └── test.json
- ```
-
- ## 🎯 How It Works
-
- 1. **Upload Image**: Users upload an image through the web interface
- 2. **Enter Prompt**: Optionally customize the analysis prompt
- 3. **AI Processing**: The BLIP2-Vicuna model processes the image and prompt
- 4. **Results Display**: Emotion analysis results are displayed with the original image
-
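A minimal sketch of how these four steps could map onto a single Flask route; the `/predict` endpoint, form field names, and fallback prompt are illustrative assumptions, while the LAVIS calls mirror the Gradio app removed in this commit.

```python
import torch
from flask import Flask, jsonify, request
from lavis.models import load_model_and_preprocess
from PIL import Image

app = Flask(__name__)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Loaded once at startup, as in the Gradio app removed by this commit.
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_vicuna_instruct", model_type="vicuna7b", is_eval=True, device=device
)


@app.route("/predict", methods=["POST"])  # endpoint name is an illustrative assumption
def predict():
    # 1. Upload Image: read the uploaded file into a PIL image.
    image = Image.open(request.files["image"].stream).convert("RGB")
    # 2. Enter Prompt: fall back to a generic emotion question if none is given.
    prompt = request.form.get("prompt", "What emotion is expressed in this image?")
    # 3. AI Processing: preprocess the image and run BLIP2-Vicuna.
    tensor = vis_processors["eval"](image).unsqueeze(0).to(device)
    with torch.no_grad():
        answer = model.generate({"image": tensor, "prompt": prompt})[0]
    # 4. Results Display: the front end renders this next to the original image.
    return jsonify({"result": answer})
```
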
- ## 🔧 Configuration
-
- ### Model Configuration
- The model can be configured in `app.py`:
-
- ```python
- model_config = {
-     "vit_model": "eva_clip_g",
-     "img_size": 224,
-     "num_query_token": 32,
-     "llm_model": "vicuna-7b-v1.1",
-     "max_txt_len": 128,
-     "max_output_txt_len": 256,
-     # ... other configurations
- }
- ```
-
- ### Environment Variables
- - `PORT`: Application port (default: 7860)
- - `FLASK_ENV`: Flask environment (production/development)
-
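A minimal sketch of how `app.py` might read these variables at startup; the actual startup code is not part of this diff, so treat the details as assumptions.

```python
import os

from flask import Flask  # the README lists Flask as the backend

app = Flask(__name__)

if __name__ == "__main__":
    # PORT defaults to 7860, matching the Docker and Spaces examples above.
    port = int(os.environ.get("PORT", 7860))
    # FLASK_ENV=development enables debug mode; production leaves it off.
    debug = os.environ.get("FLASK_ENV") == "development"
    app.run(host="0.0.0.0", port=port, debug=debug)
```
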
- ## 🤖 Model Details
-
- **BLIP2-Vicuna** combines:
- - **Vision Encoder**: EVA-CLIP for image understanding
- - **Q-Former**: Querying transformer for cross-modal alignment
- - **Language Model**: Vicuna (LLaMA-based) for text generation
-
- This architecture enables sophisticated vision-language understanding for emotion detection tasks.
-
- ## 📊 Performance & Optimization
-
- - **GPU Support**: Automatic CUDA detection and utilization
- - **Memory Efficient**: Optimized model loading and inference
- - **Caching**: Smart caching for improved response times
- - **Error Handling**: Robust error handling and user feedback
-
- ## 🎨 UI/UX Features
-
- - **Responsive Design**: Works on desktop, tablet, and mobile
- - **Modern Aesthetics**: Clean, professional interface
- - **Smooth Animations**: Engaging user interactions
- - **Loading States**: Clear feedback during processing
- - **Error Handling**: User-friendly error messages
-
- ## 🔒 Security Features
-
- - **File Size Limits**: 16MB maximum upload size
- - **File Type Validation**: Only image files accepted
- - **Input Sanitization**: Secure handling of user inputs
- - **CORS Protection**: Appropriate cross-origin policies
-
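A minimal sketch of enforcing these limits in Flask; the 16 MB figure comes from the list above, while the route name and extension set are illustrative assumptions.

```python
from flask import Flask, abort, request

app = Flask(__name__)
# Flask rejects request bodies above this size with a 413 error.
app.config["MAX_CONTENT_LENGTH"] = 16 * 1024 * 1024

ALLOWED_EXTENSIONS = {"png", "jpg", "jpeg", "gif", "bmp"}  # illustrative set


def allowed_file(filename: str) -> bool:
    """Accept only filenames whose extension looks like an image."""
    return "." in filename and filename.rsplit(".", 1)[1].lower() in ALLOWED_EXTENSIONS


@app.route("/predict", methods=["POST"])  # route name is an assumption
def predict():
    file = request.files.get("image")
    if file is None or not allowed_file(file.filename):
        abort(400, description="Please upload a valid image file.")
    # ... hand the validated image to the model ...
    return {"status": "ok"}
```
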
- ## 🚀 Deployment Options
-
- ### 1. Hugging Face Spaces (Recommended)
- - Zero-configuration deployment
- - Automatic scaling
- - Free tier available
- - Built-in GPU support
-
- ### 2. Docker
- - Consistent environments
- - Easy scaling
- - Platform independent
-
- ### 3. Local Development
- - Quick testing
- - Development workflow
- - Custom configurations
-
- ## 🛠️ Development
-
- ### Adding New Features
- 1. Update `app.py` for backend changes
- 2. Modify `templates/index.html` for UI changes
- 3. Update `static/css/style.css` for styling
- 4. Test locally before deployment
-
- ### Model Updates
- 1. Update `blip2_vicuna_instruct.py`
- 2. Adjust configuration in `app.py`
- 3. Update requirements if needed
-
- ## 📄 License
-
- This project is open-source and available under the MIT License.
-
- ## 🤝 Contributing
-
- Contributions are welcome! Please feel free to submit a Pull Request.
-
- ## 📞 Support
-
- For questions or support, please open an issue in the repository.
-
- ---
-
- **Built with ❤️ using BLIP2-Vicuna and modern web technologies**
+ # EmoVIT
  Official code for the paper **"EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning"** | CVPR 2024

  ## 🔄 Update Log – 2025/04/07