manhteky123 commited on
Commit
3ed5c27
·
verified ·
1 Parent(s): a0ca14f

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ emo/train.json filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,176 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ pip-wheel-metadata/
24
+ share/python-wheels/
25
+ *.egg-info/
26
+ .installed.cfg
27
+ *.egg
28
+ MANIFEST
29
+
30
+ # PyInstaller
31
+ # Usually these files are written by a python script from a template
32
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
33
+ *.manifest
34
+ *.spec
35
+
36
+ # Installer logs
37
+ pip-log.txt
38
+ pip-delete-this-directory.txt
39
+
40
+ # Unit test / coverage reports
41
+ htmlcov/
42
+ .tox/
43
+ .nox/
44
+ .coverage
45
+ .coverage.*
46
+ .cache
47
+ nosetests.xml
48
+ coverage.xml
49
+ *.cover
50
+ *.py,cover
51
+ .hypothesis/
52
+ .pytest_cache/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ target/
76
+
77
+ # Jupyter Notebook
78
+ .ipynb_checkpoints
79
+
80
+ # IPython
81
+ profile_default/
82
+ ipython_config.py
83
+
84
+ # pyenv
85
+ .python-version
86
+
87
+ # pipenv
88
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
89
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
90
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
91
+ # install all needed dependencies.
92
+ #Pipfile.lock
93
+
94
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow
95
+ __pypackages__/
96
+
97
+ # Celery stuff
98
+ celerybeat-schedule
99
+ celerybeat.pid
100
+
101
+ # SageMath parsed files
102
+ *.sage.py
103
+
104
+ # Environments
105
+ .env
106
+ .venv
107
+ env/
108
+ venv/
109
+ ENV/
110
+ env.bak/
111
+ venv.bak/
112
+
113
+ # Spyder project settings
114
+ .spyderproject
115
+ .spyproject
116
+
117
+ # Rope project settings
118
+ .ropeproject
119
+
120
+ # mkdocs documentation
121
+ /site
122
+
123
+ # mypy
124
+ .mypy_cache/
125
+ .dmypy.json
126
+ dmypy.json
127
+
128
+ # Pyre type checker
129
+ .pyre/
130
+
131
+ # Model files and checkpoints
132
+ *.pth
133
+ *.pt
134
+ *.bin
135
+ *.safetensors
136
+ checkpoints/
137
+ models/
138
+
139
+ # Data files
140
+ *.csv
141
+ *.json
142
+ *.txt
143
+ !requirements*.txt
144
+ !README.txt
145
+
146
+ # Uploaded files
147
+ uploads/
148
+ temp/
149
+
150
+ # IDE files
151
+ .vscode/
152
+ .idea/
153
+ *.swp
154
+ *.swo
155
+ *~
156
+
157
+ # OS files
158
+ .DS_Store
159
+ .DS_Store?
160
+ ._*
161
+ .Spotlight-V100
162
+ .Trashes
163
+ ehthumbs.db
164
+ Thumbs.db
165
+
166
+ # Logs
167
+ logs/
168
+ *.log
169
+
170
+ # Temporary files
171
+ tmp/
172
+ temp/
173
+ *.tmp
174
+
175
+ # Hugging Face cache
176
+ .cache/
Dockerfile ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ FROM python:3.9-slim
2
+
3
+ # Set working directory
4
+ WORKDIR /app
5
+
6
+ # Install system dependencies
7
+ RUN apt-get update && apt-get install -y \
8
+ git \
9
+ wget \
10
+ curl \
11
+ build-essential \
12
+ libgl1-mesa-glx \
13
+ libglib2.0-0 \
14
+ libsm6 \
15
+ libxext6 \
16
+ libxrender-dev \
17
+ libgomp1 \
18
+ && rm -rf /var/lib/apt/lists/*
19
+
20
+ # Copy requirements first to leverage Docker cache
21
+ COPY requirements.txt .
22
+
23
+ # Install Python dependencies
24
+ RUN pip install --no-cache-dir --upgrade pip && \
25
+ pip install --no-cache-dir -r requirements.txt
26
+
27
+ # Copy application code
28
+ COPY . .
29
+
30
+ # Create necessary directories
31
+ RUN mkdir -p static/css templates
32
+
33
+ # Set environment variables for Hugging Face Spaces
34
+ ENV PYTHONPATH=/app
35
+ ENV FLASK_APP=app.py
36
+ ENV FLASK_ENV=production
37
+
38
+ # Expose port for Hugging Face Spaces
39
+ EXPOSE 7860
40
+
41
+ # Health check
42
+ HEALTHCHECK --interval=30s --timeout=30s --start-period=5s --retries=3 \
43
+ CMD curl -f http://localhost:7860/health || exit 1
44
+
45
+ # Command to run the application
46
+ CMD ["python", "app.py"]
FT.yaml ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright (c) 2022, salesforce.com, inc.
2
+ # All rights reserved.
3
+ # SPDX-License-Identifier: BSD-3-Clause
4
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
5
+
6
+ model:
7
+ arch: blip2_opt
8
+ model_type: caption_coco_opt2.7b
9
+ load_finetuned: False
10
+ use_grad_checkpoint: True
11
+ #freeze_vit: False
12
+ freeze_vit: True
13
+
14
+ datasets:
15
+ coco_vqa: # name of the dataset builder
16
+ vis_processor:
17
+ train:
18
+ name: "blip2_image_train"
19
+ image_size: 224
20
+ eval:
21
+ name: "blip_image_eval"
22
+ image_size: 224
23
+ text_processor:
24
+ train:
25
+ name: "blip_caption"
26
+ prompt: " "
27
+ eval:
28
+ name: "blip_caption"
29
+ # build_info:
30
+ # images:
31
+ # storage: '/export/share/datasets/vision/coco/images/'
32
+
33
+ run:
34
+ task: vqa
35
+ # optimizer
36
+ lr_sched: "linear_warmup_cosine_lr"
37
+ init_lr: 1e-5 #8e-6 #1e-5
38
+ min_lr: 1e-8
39
+ warmup_lr: 1e-8
40
+ warmup_steps: 30000
41
+ weight_decay: 0.005 #0.05 #0.00005
42
+ max_epoch: 4
43
+ batch_size_train: 1
44
+ batch_size_eval: 1
45
+ num_workers: 4
46
+ accum_grad_iters: 1
47
+
48
+ max_len: 1000
49
+ min_len: 1 #8
50
+ num_beams: 1 #5 #3
51
+
52
+ seed: 42
53
+ output_dir: "output"
54
+
55
+ amp: True
56
+ resume_ckpt_path: null
57
+
58
+ evaluate: False
59
+ train_splits: ["train"]
60
+ valid_splits: ["val"]
61
+ test_splits: ["test"]
62
+
63
+ device: "cuda"
64
+ world_size: 1
65
+ dist_url: "env://"
66
+ distributed: True
README.md CHANGED
@@ -1,10 +1,343 @@
1
- ---
2
- title: EmoVIT
3
- emoji: 😻
4
- colorFrom: gray
5
- colorTo: purple
6
- sdk: docker
7
- pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: EmoVIT
3
+ emoji: 😻
4
+ colorFrom: gray
5
+ colorTo: purple
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ # EmoVIT - Emotion Detection with BLIP2-Vicuna
11
+
12
+ 🚀 **AI-Powered Emotion Detection Web Application**
13
+
14
+ EmoVIT is a sophisticated emotion detection application that leverages the power of BLIP2-Vicuna model to analyze emotions in images through natural language understanding.
15
+
16
+ ## 🌟 Features
17
+
18
+ - **🖼️ Image Upload**: Easy drag-and-drop or click-to-upload interface
19
+ - **🧠 AI Analysis**: Advanced emotion detection using BLIP2-Vicuna model
20
+ - **💬 Custom Prompts**: Personalize your analysis with custom text prompts
21
+ - **🎨 Beautiful UI**: Modern, responsive design with smooth animations
22
+ - **⚡ Real-time Processing**: Fast inference with optimized model loading
23
+ - **📱 Mobile Friendly**: Works seamlessly on all devices
24
+
25
+ ## 🛠️ Technology Stack
26
+
27
+ - **Backend**: Flask (Python web framework)
28
+ - **AI Model**: BLIP2-Vicuna (Vision-Language model)
29
+ - **Frontend**: HTML5, CSS3, JavaScript, Bootstrap 5
30
+ - **Deployment**: Docker + Hugging Face Spaces
31
+
32
+ ## 🚀 Quick Start
33
+
34
+ ### Local Development
35
+
36
+ 1. **Clone the repository**
37
+ ```bash
38
+ git clone <your-repo-url>
39
+ cd EmoVIT
40
+ ```
41
+
42
+ 2. **Install dependencies**
43
+ ```bash
44
+ pip install -r requirements.txt
45
+ ```
46
+
47
+ 3. **Run the application**
48
+ ```bash
49
+ python app.py
50
+ ```
51
+
52
+ 4. **Open in browser**
53
+ Navigate to `http://localhost:7860`
54
+
55
+ ### Docker Deployment
56
+
57
+ 1. **Build the Docker image**
58
+ ```bash
59
+ docker build -t emovit .
60
+ ```
61
+
62
+ 2. **Run the container**
63
+ ```bash
64
+ docker run -p 7860:7860 emovit
65
+ ```
66
+
67
+ ## 🌐 Hugging Face Spaces Deployment
68
+
69
+ This application is configured for seamless deployment on Hugging Face Spaces:
70
+
71
+ 1. **Create a new Space** on [Hugging Face Spaces](https://huggingface.co/spaces)
72
+ 2. **Select Docker** as the SDK
73
+ 3. **Upload your files** to the Space repository
74
+ 4. **The app will automatically deploy** using the provided Dockerfile
75
+
76
+ ### Required Files for HF Spaces:
77
+ - `app.py` - Main Flask application
78
+ - `Dockerfile` - Container configuration
79
+ - `requirements.txt` - Python dependencies
80
+ - `templates/` - HTML templates
81
+ - `static/` - CSS and static assets
82
+ - `blip2_vicuna_instruct.py` - Model implementation
83
+
84
+ ## 📁 Project Structure
85
+
86
+ ```
87
+ EmoVIT/
88
+ ├── app.py # Main Flask application
89
+ ├── blip2_vicuna_instruct.py # BLIP2-Vicuna model implementation
90
+ ├── requirements.txt # Python dependencies
91
+ ├── Dockerfile # Docker configuration
92
+ ├── README.md # This file
93
+ ├── templates/
94
+ │ └── index.html # Main HTML template
95
+ ├── static/
96
+ │ └── css/
97
+ │ └── style.css # Custom CSS styles
98
+ └── emo/ # Emotion datasets and utilities
99
+ ├── train.json
100
+ ├── val.json
101
+ └── test.json
102
+ ```
103
+
104
+ ## 🎯 How It Works
105
+
106
+ 1. **Upload Image**: Users upload an image through the web interface
107
+ 2. **Enter Prompt**: Optionally customize the analysis prompt
108
+ 3. **AI Processing**: The BLIP2-Vicuna model processes the image and prompt
109
+ 4. **Results Display**: Emotion analysis results are displayed with the original image
110
+
111
+ ## 🔧 Configuration
112
+
113
+ ### Model Configuration
114
+ The model can be configured in `app.py`:
115
+
116
+ ```python
117
+ model_config = {
118
+ "vit_model": "eva_clip_g",
119
+ "img_size": 224,
120
+ "num_query_token": 32,
121
+ "llm_model": "vicuna-7b-v1.1",
122
+ "max_txt_len": 128,
123
+ "max_output_txt_len": 256,
124
+ # ... other configurations
125
+ }
126
+ ```
127
+
128
+ ### Environment Variables
129
+ - `PORT`: Application port (default: 7860)
130
+ - `FLASK_ENV`: Flask environment (production/development)
131
+
132
+ ## 🤖 Model Details
133
+
134
+ **BLIP2-Vicuna** combines:
135
+ - **Vision Encoder**: EVA-CLIP for image understanding
136
+ - **Q-Former**: Querying transformer for cross-modal alignment
137
+ - **Language Model**: Vicuna (LLaMA-based) for text generation
138
+
139
+ This architecture enables sophisticated vision-language understanding for emotion detection tasks.
140
+
141
+ ## 📊 Performance & Optimization
142
+
143
+ - **GPU Support**: Automatic CUDA detection and utilization
144
+ - **Memory Efficient**: Optimized model loading and inference
145
+ - **Caching**: Smart caching for improved response times
146
+ - **Error Handling**: Robust error handling and user feedback
147
+
148
+ ## 🎨 UI/UX Features
149
+
150
+ - **Responsive Design**: Works on desktop, tablet, and mobile
151
+ - **Modern Aesthetics**: Clean, professional interface
152
+ - **Smooth Animations**: Engaging user interactions
153
+ - **Loading States**: Clear feedback during processing
154
+ - **Error Handling**: User-friendly error messages
155
+
156
+ ## 🔒 Security Features
157
+
158
+ - **File Size Limits**: 16MB maximum upload size
159
+ - **File Type Validation**: Only image files accepted
160
+ - **Input Sanitization**: Secure handling of user inputs
161
+ - **CORS Protection**: Appropriate cross-origin policies
162
+
163
+ ## 🚀 Deployment Options
164
+
165
+ ### 1. Hugging Face Spaces (Recommended)
166
+ - Zero-configuration deployment
167
+ - Automatic scaling
168
+ - Free tier available
169
+ - Built-in GPU support
170
+
171
+ ### 2. Docker
172
+ - Consistent environments
173
+ - Easy scaling
174
+ - Platform independent
175
+
176
+ ### 3. Local Development
177
+ - Quick testing
178
+ - Development workflow
179
+ - Custom configurations
180
+
181
+ ## 🛠️ Development
182
+
183
+ ### Adding New Features
184
+ 1. Update `app.py` for backend changes
185
+ 2. Modify `templates/index.html` for UI changes
186
+ 3. Update `static/css/style.css` for styling
187
+ 4. Test locally before deployment
188
+
189
+ ### Model Updates
190
+ 1. Update `blip2_vicuna_instruct.py`
191
+ 2. Adjust configuration in `app.py`
192
+ 3. Update requirements if needed
193
+
194
+ ## 📄 License
195
+
196
+ This project is open-source and available under the MIT License.
197
+
198
+ ## 🤝 Contributing
199
+
200
+ Contributions are welcome! Please feel free to submit a Pull Request.
201
+
202
+ ## 📞 Support
203
+
204
+ For questions or support, please open an issue in the repository.
205
+
206
+ ---
207
+
208
+ **Built with ❤️ using BLIP2-Vicuna and modern web technologies**
209
+ Official code for the paper **"EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning"** | CVPR 2024
210
+
211
+ ## 🔄 Update Log – 2025/04/07
212
+
213
+ ### 📄 Dataset Update
214
+
215
+ - The originally provided `train.json` was incomplete.
216
+ ✅ The latest version now contains the full dataset and can be downloaded here:
217
+ [📎 Download `train.json`](https://drive.google.com/file/d/1OV3X7BJyEDYXTGaDbu7E8rGgGzIlnwVq/view?usp=drive_link)
218
+
219
+ ### ✅ Bug Fixes & Configuration Updates
220
+
221
+ - Fixed incorrect version parameters previously used during inference.
222
+ - Updated several related parameter files—please **replace** the original files with the latest versions found in the root directory.
223
+
224
+ ### 🛠️ Configuration Changes
225
+
226
+ - `FT.yaml` and `blip2_vicuna_instruct` have been modified to incorporate the correct parameter settings.
227
+ - 📁 **Note:** `blip2_vicuna_instruct` should be placed in:
228
+ `LAVIS/lavis/models/blip2_models/`
229
+
230
+ ### 💾 Trained Weights
231
+
232
+ - Weights trained using the **corrected parameters** are now available.
233
+ [📥 Download Trained Weights](https://drive.google.com/file/d/1zaYOSlt3mLVMdiNfAKdJcwvVc-4LHfdr/view?usp=drive_link)
234
+
235
+
236
+ ## 🧠 Emotion Reasoning Support
237
+
238
+ To enable **emotion reasoning output** from the model, format the input prompt as:
239
+
240
+ `Predicted emotion: [emotion]. Reason: [explanation].`
241
+
242
+ 🔍 This usage is described in **Section 4.4.1 (Affective Reasoning)** of our paper.
243
+
244
+ ---
245
+
246
+ ## Setting up the environment
247
+
248
+ ```bash
249
+ git clone https://github.com/aimmemotion/EmoVIT.git
250
+ conda create --name emovit python=3.8
251
+ conda activate emovit
252
+
253
+ cd Emovit
254
+ pip install -r requirements_lavis.txt
255
+ ```
256
+ ## Install the corresponding version of PyTorch
257
+
258
+ ```bash
259
+ #Using CUDA 11.8 as an example
260
+ pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu118
261
+ ```
262
+
263
+ ## Install LAVIS
264
+
265
+ ```bash
266
+ pip install salesforce-lavis
267
+ # If not work, please proceed as follows.
268
+ cd ..
269
+ git clone https://github.com/salesforce/LAVIS.git
270
+ cd LAVIS
271
+ pip install -e . # Please remove 'open3d' from the 'requirements.txt' file to avoid version conflicts.
272
+ cd ../
273
+ # Cut the 'lavis' folder and paste it into the 'lib' folder.
274
+ ```
275
+
276
+ ## Dataset Preparation
277
+
278
+ Download EmoSet from
279
+ https://vcc.tech/EmoSet
280
+
281
+ Extract the downloaded EmoSet files
282
+ (annotation, image, info.json, test.json, train.json, val.json)
283
+ and place them into the emo folder.
284
+
285
+
286
+ ## Model Preparation
287
+
288
+ Download lavis_with_weight.zip https://drive.google.com/file/d/1vZa7C6rxxsq51VQ73ESGQ0S8zEI2dnq_/view?usp=drive_link
289
+ (If you prefer to train it yourself, you can download lavis_without_weight.zip instead https://drive.google.com/file/d/1Re_lzyrQehuL1SjP4GmgPCMPf5jHg3hs/view?usp=drive_link)
290
+ Extract the zip file and place it in the emovit folder.
291
+
292
+ Download all files from this Hugging Face page
293
+ https://huggingface.co/lmsys/vicuna-7b-v1.1/tree/main
294
+ Place the downloaded files into ./Emovit/LAVIS/lavis/weight/vicuna-7b-2/
295
+
296
+ ## Emotion Instruction Data Generation
297
+
298
+ 1. Run `python ./emo/caption.py` to obtain image captions. Select the 'path' based on the class to be processed.
299
+ 2. Run `python ./emo/cap-anno.py` to write the attributes and captions of the image into a file. Select the 'path' based on the class to be processed.
300
+ 3. Run `python ./emo/gpt4_reasoning.py` or `python ./emo/gpt4_conversation.py` to instruct GPT-4 to generate questions using the above file as input data.
301
+ - Remember to change the key.
302
+ - If you wish to adjust the prompt, you can go to the 'prompt' folder.
303
+ 4. Run `python ./emo/all.py` to integrate the results of reasoning, conversation, and classification.
304
+
305
+ Following these steps, you can create instructions. If you want to skip this step, you can use the instructions we created using EmoSet. (However, image data must still be downloaded from EmoSet's official website.)
306
+
307
+ - Conversation: [Download](https://drive.google.com/file/d/1E8UEH09y0CiAT4Hg7rm975AR3JCjEHeM/view?usp=drive_link)
308
+ - Reasoning: [Download](https://drive.google.com/file/d/1MTNHFzasCb0F921P0itaH-x8vN2OvxEu/view?usp=drive_link)
309
+
310
+ The generation method of categorical data does not need to rely on GPT for creation; it can be directly produced (you can observe the prompt in `all.py`).
311
+
312
+ #### Training
313
+
314
+ ```bash
315
+ cd LAVIS
316
+ python train.py --cfg-path FT.yaml
317
+ ```
318
+
319
+ ### Parameter Settings
320
+
321
+ - `LAVIS/FT.yaml`: Setting of hyperparameters
322
+ - `LAVIS/lavis/configs/models/blip2/blip2_instruct_vicuna7b.yaml`: Select the location of LLM weight
323
+ - `LAVIS/lavis/configs/datasets/coco/defaults_vqa.yaml`: Select the location of your data
324
+ LAVIS/lavis/runners/runner_base.py (Change the name of the weight file to be saved.)
325
+
326
+ ## Inference EmoVIT
327
+ If you haven't trained your own weights yet, you can use the `model_weights1.pth` provided in the `LAVIS` folder.
328
+ ```bash
329
+ python ./LAVIS/test.py
330
+ ```
331
+
332
+ ## Citation
333
+
334
+ If you found this paper is helpful, please consider cite our paper:
335
+
336
+ ```bibtex
337
+ @inproceedings{Xie2024EmoVIT,
338
+ title={EmoVIT: Revolutionizing Emotion Insights with Visual Instruction Tuning},
339
+ author={Hongxia Xie and Chu-Jun Peng and Yu-Wen Tseng and Hung-Jen Chen and Chan-Feng Hsu and Hong-Han Shuai and Wen-Huang Cheng},
340
+ booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
341
+ year={2024}
342
+ }
343
+ ```
app.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import torch
4
+ from flask import Flask, render_template, request, jsonify, url_for
5
+ from PIL import Image
6
+ import base64
7
+ from transformers import AutoTokenizer
8
+ import logging
9
+
10
+ # Import model từ file hiện tại
11
+ from blip2_vicuna_instruct import Blip2VicunaInstruct
12
+
13
+ app = Flask(__name__)
14
+ app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 # 16MB max file size
15
+
16
+ # Global variables cho model
17
+ model = None
18
+ device = None
19
+
20
+ def load_model():
21
+ """Load BLIP2 Vicuna model"""
22
+ global model, device
23
+
24
+ try:
25
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
26
+ logging.info(f"Using device: {device}")
27
+
28
+ # Cấu hình model - có thể cần điều chỉnh theo config thực tế
29
+ model_config = {
30
+ "vit_model": "eva_clip_g",
31
+ "img_size": 224,
32
+ "drop_path_rate": 0,
33
+ "use_grad_checkpoint": False,
34
+ "vit_precision": "fp16",
35
+ "freeze_vit": True,
36
+ "num_query_token": 32,
37
+ "llm_model": "vicuna-7b-v1.1", # Có thể cần thay đổi path
38
+ "prompt": "",
39
+ "max_txt_len": 128,
40
+ "max_output_txt_len": 256,
41
+ "apply_lemmatizer": False,
42
+ "qformer_text_input": True,
43
+ }
44
+
45
+ # Khởi tạo model
46
+ model = Blip2VicunaInstruct(**model_config)
47
+ model.to(device)
48
+ model.eval()
49
+
50
+ logging.info("Model loaded successfully!")
51
+
52
+ except Exception as e:
53
+ logging.error(f"Error loading model: {str(e)}")
54
+ model = None
55
+
56
+ def preprocess_image(image):
57
+ """Preprocess image for model"""
58
+ try:
59
+ # Resize và normalize image
60
+ if image.mode != 'RGB':
61
+ image = image.convert('RGB')
62
+
63
+ # Resize to model input size
64
+ image = image.resize((224, 224))
65
+
66
+ # Convert to tensor
67
+ import torchvision.transforms as transforms
68
+ transform = transforms.Compose([
69
+ transforms.ToTensor(),
70
+ transforms.Normalize(mean=[0.485, 0.456, 0.406],
71
+ std=[0.229, 0.224, 0.225])
72
+ ])
73
+
74
+ image_tensor = transform(image).unsqueeze(0)
75
+ return image_tensor
76
+
77
+ except Exception as e:
78
+ logging.error(f"Error preprocessing image: {str(e)}")
79
+ return None
80
+
81
+ def predict_emotion(image_tensor, prompt="What emotion is shown in this image?"):
82
+ """Predict emotion từ image"""
83
+ global model, device
84
+
85
+ if model is None:
86
+ return "Model not loaded"
87
+
88
+ try:
89
+ with torch.no_grad():
90
+ # Prepare samples
91
+ samples = {
92
+ "image": image_tensor.to(device),
93
+ "text_input": [prompt]
94
+ }
95
+
96
+ # Generate prediction
97
+ result = model.generate(
98
+ samples,
99
+ use_nucleus_sampling=False,
100
+ num_beams=3,
101
+ max_length=50,
102
+ min_length=1,
103
+ temperature=0.1,
104
+ repetition_penalty=1.1
105
+ )
106
+
107
+ return result[0] if result else "Unable to predict emotion"
108
+
109
+ except Exception as e:
110
+ logging.error(f"Error predicting emotion: {str(e)}")
111
+ return f"Error: {str(e)}"
112
+
113
+ @app.route('/')
114
+ def index():
115
+ """Home page"""
116
+ return render_template('index.html')
117
+
118
+ @app.route('/predict', methods=['POST'])
119
+ def predict():
120
+ """Handle image upload and prediction"""
121
+ try:
122
+ if 'image' not in request.files:
123
+ return jsonify({'error': 'No image file provided'}), 400
124
+
125
+ file = request.files['image']
126
+ if file.filename == '':
127
+ return jsonify({'error': 'No image selected'}), 400
128
+
129
+ # Đọc và xử lý image
130
+ image = Image.open(io.BytesIO(file.read()))
131
+
132
+ # Preprocess image
133
+ image_tensor = preprocess_image(image)
134
+ if image_tensor is None:
135
+ return jsonify({'error': 'Failed to process image'}), 400
136
+
137
+ # Get custom prompt if provided
138
+ custom_prompt = request.form.get('prompt', 'What emotion is shown in this image?')
139
+
140
+ # Predict emotion
141
+ emotion_result = predict_emotion(image_tensor, custom_prompt)
142
+
143
+ # Convert image to base64 for display
144
+ buffered = io.BytesIO()
145
+ image.save(buffered, format="PNG")
146
+ img_str = base64.b64encode(buffered.getvalue()).decode()
147
+
148
+ return jsonify({
149
+ 'success': True,
150
+ 'emotion': emotion_result,
151
+ 'image': img_str,
152
+ 'prompt': custom_prompt
153
+ })
154
+
155
+ except Exception as e:
156
+ logging.error(f"Error in prediction: {str(e)}")
157
+ return jsonify({'error': f'Prediction failed: {str(e)}'}), 500
158
+
159
+ @app.route('/health')
160
+ def health():
161
+ """Health check endpoint"""
162
+ return jsonify({
163
+ 'status': 'healthy',
164
+ 'model_loaded': model is not None,
165
+ 'device': str(device) if device else 'unknown'
166
+ })
167
+
168
+ if __name__ == '__main__':
169
+ # Setup logging
170
+ logging.basicConfig(level=logging.INFO)
171
+
172
+ # Load model
173
+ logging.info("Loading model...")
174
+ load_model()
175
+
176
+ # Determine port for Hugging Face Spaces
177
+ port = int(os.environ.get("PORT", 7860))
178
+
179
+ # Run app
180
+ app.run(host="0.0.0.0", port=port, debug=False)
blip2_vicuna_instruct.py ADDED
@@ -0,0 +1,747 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Requires Transformer 4.28 and above, implementation may change according the Llama implementation
3
+ """
4
+ import logging
5
+ import string
6
+ from packaging import version
7
+
8
+ import torch
9
+ from torch.cuda.amp import autocast as autocast
10
+ import torch.nn as nn
11
+
12
+ import transformers
13
+
14
+ from lavis.common.registry import registry
15
+ from lavis.models.blip2_models.blip2 import Blip2Base, disabled_train
16
+
17
+ @registry.register_model("blip2_vicuna_instruct")
18
+ class Blip2VicunaInstruct(Blip2Base):
19
+ """
20
+ BLIP2 Vicuna model.
21
+ Supported model types:
22
+ - vicuna7b
23
+ - vicuna13b
24
+ Usage:
25
+ >>> from lavis.models import load_model
26
+ >>> model = load_model("blip2_vicuna_instruct", "vicuna7b")
27
+ """
28
+
29
+ PRETRAINED_MODEL_CONFIG_DICT = {
30
+ "vicuna7b": "configs/models/blip2/blip2_instruct_vicuna7b.yaml",
31
+ "vicuna13b": "configs/models/blip2/blip2_instruct_vicuna13b.yaml",
32
+ }
33
+
34
+ def __init__(
35
+ self,
36
+ vit_model="eva_clip_g",
37
+ img_size=224,
38
+ drop_path_rate=0,
39
+ use_grad_checkpoint=False,
40
+ vit_precision="fp16",
41
+ freeze_vit=True,
42
+ num_query_token=32,
43
+ llm_model="",
44
+ prompt="",
45
+ max_txt_len=128,
46
+ max_output_txt_len=256,
47
+ apply_lemmatizer=False,
48
+ qformer_text_input=True,
49
+ ):
50
+ super().__init__()
51
+ transformers_version = version.parse(transformers.__version__)
52
+ assert transformers_version >= version.parse("4.28"), "BLIP-2 Vicuna requires transformers>=4.28"
53
+ from transformers import LlamaTokenizer
54
+ from lavis.models.blip2_models.modeling_llama import LlamaForCausalLM
55
+
56
+ self.tokenizer = self.init_tokenizer(truncation_side="left")
57
+
58
+ self.visual_encoder, self.ln_vision = self.init_vision_encoder(
59
+ vit_model, img_size, drop_path_rate, use_grad_checkpoint, vit_precision
60
+ )
61
+ if freeze_vit:
62
+ for name, param in self.visual_encoder.named_parameters():
63
+ param.requires_grad = False
64
+ self.visual_encoder = self.visual_encoder.eval()
65
+ self.visual_encoder.train = disabled_train
66
+ logging.info("freeze vision encoder")
67
+
68
+ self.Qformer, self.query_tokens = self.init_Qformer(
69
+ num_query_token, self.visual_encoder.num_features
70
+ )
71
+
72
+ if not qformer_text_input:
73
+ self.Qformer.bert.embeddings.word_embeddings = None
74
+ self.Qformer.bert.embeddings.position_embeddings = None
75
+ for layer in self.Qformer.bert.encoder.layer:
76
+ layer.output = None
77
+ layer.intermediate = None
78
+ else:
79
+ self.Qformer.resize_token_embeddings(len(self.tokenizer))
80
+ self.Qformer.cls = None
81
+
82
+ self.llm_tokenizer = LlamaTokenizer.from_pretrained(llm_model, use_fast=False, truncation_side="left")
83
+ self.llm_model = LlamaForCausalLM.from_pretrained(
84
+ llm_model, torch_dtype=torch.float16
85
+ )
86
+ self.llm_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
87
+ self.llm_tokenizer.add_special_tokens({'bos_token': '</s>'})
88
+ self.llm_tokenizer.add_special_tokens({'eos_token': '</s>'})
89
+ self.llm_tokenizer.add_special_tokens({'unk_token': '</s>'})
90
+ # self.llm_tokenizer.pad_token = self.llm_tokenizer.unk_token
91
+
92
+ self.llm_model.resize_token_embeddings(len(self.llm_tokenizer))
93
+
94
+ # self.eos_token_id = self.llm_tokenizer(
95
+ # self.llm_tokenizer.eos_token, add_special_tokens=False
96
+ # ).input_ids[0]
97
+
98
+ for name, param in self.llm_model.named_parameters():
99
+ param.requires_grad = False
100
+
101
+ self.llm_proj = nn.Linear(
102
+ self.Qformer.config.hidden_size, self.llm_model.config.hidden_size
103
+ )
104
+
105
+ self.max_txt_len = max_txt_len
106
+ self.max_output_txt_len = max_output_txt_len
107
+ self.prompt = prompt
108
+ prompt_tokens = self.llm_tokenizer(self.prompt, return_tensors="pt")
109
+ self.prompt_length = prompt_tokens.attention_mask.sum(1)
110
+
111
+ self._lemmatizer = None
112
+
113
+ self.qformer_text_input = qformer_text_input
114
+
115
+ def concat_text_input_output(self, input_ids, input_atts, output_ids, output_atts):
116
+ input_part_targets_len = []
117
+ llm_tokens = {"input_ids": [], "attention_mask": []}
118
+ for i in range(input_ids.size(0)):
119
+ this_input_ones = input_atts[i].sum()
120
+ input_part_targets_len.append(this_input_ones)
121
+ llm_tokens['input_ids'].append(
122
+ torch.cat([
123
+ input_ids[i][:this_input_ones],
124
+ output_ids[i][1:],
125
+ input_ids[i][this_input_ones:]
126
+ ])
127
+ )
128
+ llm_tokens['attention_mask'].append(
129
+ torch.cat([
130
+ input_atts[i][:this_input_ones],
131
+ output_atts[i][1:],
132
+ input_atts[i][this_input_ones:]
133
+ ])
134
+ )
135
+ llm_tokens['input_ids'] = torch.stack(llm_tokens['input_ids'])
136
+ llm_tokens['attention_mask'] = torch.stack(llm_tokens['attention_mask'])
137
+ return llm_tokens, input_part_targets_len
138
+
139
    def forward(self, samples):
        """Training step: encode image, fuse with Q-Former queries, project to
        the LLM embedding space, and compute the language-modeling loss on the
        answer tokens only.

        Args:
            samples: dict with "image" (batched image tensor), "text_input"
                (list of instruction strings) and "answer" (list of target
                strings). NOTE: "answer" is mutated in place below.

        Returns:
            dict with key "loss" (scalar LM loss).
        """
        image = samples["image"]
        with self.maybe_autocast():
            image_embeds = self.ln_vision(self.visual_encoder(image))
        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)

        bs = image.size(0)  # NOTE(review): unused in this method

        # Strip the literal marker 'Complex' from targets; mutates the caller's
        # samples dict in place.
        for i in range(len(samples["text_input"])):
            samples["answer"][i] = samples["answer"][i].replace('Complex','')

        query_tokens = self.query_tokens.expand(image_embeds.shape[0], -1, -1)
        if self.qformer_text_input:
            # Instruction-aware Q-Former: text tokens attend alongside queries.
            text_Qformer = self.tokenizer(
                samples["text_input"],
                padding='longest',
                truncation=True,
                max_length=self.max_txt_len,
                return_tensors="pt",
            ).to(image.device)
            query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
            Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask],dim=1)

            query_output = self.Qformer.bert(
                text_Qformer.input_ids,
                attention_mask=Qformer_atts,
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )
        else:
            query_output = self.Qformer.bert(
                query_embeds=query_tokens,
                encoder_hidden_states=image_embeds,
                encoder_attention_mask=image_atts,
                return_dict=True,
            )

        # Keep only the query positions and project into the LLM hidden size.
        inputs_llm = self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:])
        atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device)

        # Instructions truncated from the left, answers from the right.
        self.llm_tokenizer.padding_side = "right"
        self.llm_tokenizer.truncation_side = 'left'
        text_input_tokens = self.llm_tokenizer(
            samples['text_input'],
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=self.max_txt_len,
        ).to(image.device)

        self.llm_tokenizer.truncation_side = 'right'
        text_output_tokens = self.llm_tokenizer(
            [t + self.llm_tokenizer.eos_token for t in samples['answer']],
            return_tensors="pt",
            padding="longest",
            truncation=True,
            max_length=self.max_output_txt_len,
        ).to(image.device)

        llm_tokens, input_part_targets_len = self.concat_text_input_output(
            text_input_tokens.input_ids,
            text_input_tokens.attention_mask,
            text_output_tokens.input_ids,
            text_output_tokens.attention_mask,
        )

        # do not apply loss to the padding
        targets = llm_tokens['input_ids'].masked_fill(
            llm_tokens['input_ids'] == self.llm_tokenizer.pad_token_id, -100
        )

        # do not apply loss to the text input (i.e., instruction)
        for i, l in enumerate(input_part_targets_len):
            targets[i][:l] = -100

        # do not apply loss to the query tokens
        empty_targets = (
            torch.ones(atts_llm.size(), dtype=torch.long).to(image.device).fill_(-100)
        )
        targets = torch.cat([empty_targets, targets], dim=1)

        # Prepend the projected query embeddings to the token embeddings.
        inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens['input_ids'])
        inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
        attention_mask = torch.cat([atts_llm, llm_tokens['attention_mask']], dim=1)

        with self.maybe_autocast():
            outputs = self.llm_model(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                return_dict=True,
                labels=targets,
            )

        loss = outputs.loss

        return {"loss": loss}
254
+
255
    @torch.no_grad()
    def generate(
        self,
        samples,
        use_nucleus_sampling=False,
        num_beams=1, #5
        max_length=256, #256
        min_length=1,
        top_p=0.9,
        repetition_penalty=5.0, #1.5
        length_penalty=1,
        num_captions=1,
        temperature=0, #1
    ):
        """Generate text conditioned on images (or videos) and a prompt.

        Args:
            samples: dict with "image" (4-D image batch or 5-D video batch);
                optional "prompt" (str or list of str, overrides self.prompt)
                and "ocr_tokens" (formatted into prompts containing "{}").
            use_nucleus_sampling: sample instead of beam search when True.
            num_captions: number of sequences returned per sample.

        Returns:
            list of generated strings, stripped of surrounding whitespace.
        """
        self.llm_tokenizer.padding_side = "left"

        if "prompt" in samples.keys():
            prompt = samples["prompt"]
        else:
            prompt = self.prompt

        image = samples["image"]

        bs = image.size(0)

        if isinstance(prompt, str):
            prompt = [prompt] * bs
        else:
            assert len(prompt) == bs, "The number of prompts must be equal to the batch size."

        # For TextCaps
        if "ocr_tokens" in samples.keys() and "{}" in prompt[0]:
            prompt = [p.format(', '.join(samples['ocr_tokens'][i][:30])) for i, p in enumerate(prompt)]

        query_tokens = self.query_tokens.expand(bs, -1, -1)
        if self.qformer_text_input:
            text_Qformer = self.tokenizer(
                prompt,
                padding='longest',
                truncation=True,
                max_length=self.max_txt_len,
                return_tensors="pt",
            ).to(image.device)
            query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
            Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1)

        # For video data: 5-D input (B, C, T, H, W); each frame is encoded
        # separately and the per-frame query outputs are concatenated.
        if image.dim() == 5:
            inputs_llm, atts_llm = [], []
            for j in range(image.size(2)):
                this_frame = image[:,:,j,:,:]
                with self.maybe_autocast():
                    frame_embeds = self.ln_vision(self.visual_encoder(this_frame))
                frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device)

                if self.qformer_text_input:
                    frame_query_output = self.Qformer.bert(
                        text_Qformer.input_ids,
                        attention_mask=Qformer_atts,
                        query_embeds=query_tokens,
                        encoder_hidden_states=frame_embeds,
                        encoder_attention_mask=frame_atts,
                        return_dict=True,
                    )
                else:
                    frame_query_output = self.Qformer.bert(
                        query_embeds=query_tokens,
                        encoder_hidden_states=frame_embeds,
                        encoder_attention_mask=frame_atts,
                        return_dict=True,
                    )
                frame_inputs_llm = self.llm_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:])
                frame_atts_llm = torch.ones(frame_inputs_llm.size()[:-1], dtype=torch.long).to(image.device)
                inputs_llm.append(frame_inputs_llm)
                atts_llm.append(frame_atts_llm)
            inputs_llm = torch.cat(inputs_llm, dim=1)
            atts_llm = torch.cat(atts_llm, dim=1)
        else:
            with self.maybe_autocast():
                image_embeds = self.ln_vision(self.visual_encoder(image))
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)

            if self.qformer_text_input:
                query_output = self.Qformer.bert(
                    text_Qformer.input_ids,
                    attention_mask=Qformer_atts,
                    query_embeds=query_tokens,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )
            else:
                query_output = self.Qformer.bert(
                    query_embeds=query_tokens,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )

            inputs_llm = self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:])
            atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device)

        llm_tokens = self.llm_tokenizer(
            prompt,
            padding="longest",
            return_tensors="pt"
        ).to(image.device)

        with self.maybe_autocast():
            # Visual tokens are prepended to the prompt embeddings.
            inputs_embeds = self.llm_model.get_input_embeddings()(llm_tokens.input_ids)
            inputs_embeds = torch.cat([inputs_llm, inputs_embeds], dim=1)
            attention_mask = torch.cat([atts_llm, llm_tokens.attention_mask], dim=1)

            outputs = self.llm_model.generate(
                inputs_embeds=inputs_embeds,
                attention_mask=attention_mask,
                do_sample=use_nucleus_sampling,
                top_p=top_p,
                temperature=temperature,
                num_beams=num_beams,
                max_length=max_length,
                min_length=min_length,
                # eos_token_id=self.eos_token_id,
                repetition_penalty=repetition_penalty,
                length_penalty=length_penalty,
                num_return_sequences=num_captions,
            )

        outputs[outputs == 0] = 2 # convert output id 0 to 2 (eos_token_id)
        output_text = self.llm_tokenizer.batch_decode(outputs, skip_special_tokens=True)
        output_text = [text.strip() for text in output_text]

        return output_text
393
+
394
    def predict_answers(
        self,
        samples,
        num_beams=5,
        inference_method="generate",
        max_len=10,
        min_len=1,
        num_ans_candidates=128,
        answer_list=None,
        prompt="",
        length_penalty=0,
        **kwargs
    ):
        """VQA-style inference: format questions into `prompt` and generate.

        `inference_method`, `num_ans_candidates` and `answer_list` are accepted
        for interface compatibility but not used here.

        Returns:
            list of answer strings (optionally lemmatized when
            samples["apply_lemmatizer"] is truthy).
        """
        if isinstance(samples["text_input"], str):
            samples["text_input"] = [samples["text_input"]]

        if prompt:
            if prompt.count("{}") == 2:
                # NOTE(review): if the prompt has two "{}" but samples contain
                # neither 'ocr_tokens' nor 'choices', text_input is never
                # assigned and a NameError follows — confirm callers always
                # provide one of the two.
                if 'ocr_tokens' in samples:
                    text_input = [
                        prompt.format(', '.join(samples['ocr_tokens'][i][:30]), samples["text_input"][i])
                    for i in range(len(samples["text_input"]))]
                elif 'choices' in samples:
                    # multiple-choice: render options as "(a) .. (b) .."
                    text_input = []
                    for i in range(len(samples["text_input"])):
                        this_choices = [f"({string.ascii_lowercase[j]}) {ch}" for j, ch in enumerate(samples["choices"][i])]
                        this_choices = " ".join(this_choices)
                        text_input.append(prompt.format(samples["text_input"][i], this_choices))
            else:
                text_input = [prompt.format(question) for question in samples["text_input"]]
        else:
            text_input = samples["text_input"]

        samples["prompt"] = text_input

        output_text = self.generate(
            samples,
            num_beams=num_beams,
            max_length=max_len,
            min_length=min_len,
            length_penalty=length_penalty
        )

        if "apply_lemmatizer" in samples.keys() and samples["apply_lemmatizer"]:
            output_text = self._lemmatize(output_text)

        return output_text
441
+
442
+ def predict_class(
443
+ self,
444
+ samples,
445
+ candidates,
446
+ n_segments=1,
447
+ ):
448
+ self.llm_tokenizer.padding_side = "left"
449
+
450
+ # If candidates is a list of lists, each sample has its candidates, then we need to iterate one by one
451
+ if type(candidates[0]) == list:
452
+ results = []
453
+
454
+ for i in range(samples["image"].size(0)):
455
+ this_sample = {
456
+ "image": samples["image"][i].unsqueeze(0),
457
+ "prompt": samples["prompt"],
458
+ }
459
+
460
+ if "text_input" in samples.keys():
461
+ this_sample["text_input"] = [samples["text_input"][i]]
462
+
463
+ if 'context' in samples.keys():
464
+ this_sample['context'] = [samples["context"][i]]
465
+
466
+ if 'history' in samples.keys():
467
+ this_sample['history'] = [samples["history"][i]]
468
+
469
+ if 'caption' in samples.keys():
470
+ this_sample['caption'] = [samples["caption"][i]]
471
+
472
+ this_result = self._predict_class(this_sample, candidates[i], n_segments)
473
+ results.append(this_result)
474
+
475
+ try:
476
+ results = torch.cat(results, dim=0)
477
+ except:
478
+ results = [res.tolist()[0] for res in results]
479
+
480
+ return results
481
+
482
+ return self._predict_class(samples, candidates, n_segments)
483
+
484
    def _predict_class(
        self,
        samples,
        candidates,
        n_segments=1,
    ):
        """Score every candidate string against every sample and return, per
        sample, candidate indices sorted by ascending LM loss.

        Candidates are processed in `n_segments` chunks to bound memory: each
        chunk is tokenized once, repeated across the batch, spliced after the
        prompt tokens, and scored with per-sequence (reduction="none") loss.
        """
        image = samples["image"]
        prompt = samples["prompt"]

        bs = image.size(0)

        if isinstance(prompt, str):
            prompt = [prompt] * bs
        else:
            assert len(prompt) == bs, "The number of prompts must be equal to the batch size."

        # Fill "{}" placeholders in the prompt from text_input (per-sample
        # lists expand to multiple placeholders).
        if "text_input" in samples.keys():
            if type(samples["text_input"][0]) == list:
                prompt = [prompt[i].format(*samples["text_input"][i]) for i in range(len(prompt))]
            else:
                prompt = [prompt[i].format(samples["text_input"][i]) for i in range(len(prompt))]

        # scienceqa
        if 'context' in samples.keys() and samples['context'] != '':
            prompt = [f'context: {samples["context"][i]}. {prompt[i]}' for i in range(len(prompt))]

        # visual dialog
        if 'history' in samples.keys() and samples['history'][0] != '':
            prompt = [f'dialog history: {samples["history"][i]}\n{prompt[i]}' for i in range(len(prompt))]

        if 'caption' in samples.keys() and samples['caption'][0] != '':
            prompt = [f'This image has the caption "{samples["caption"][i]}". {prompt[i]}' for i in range(len(prompt))]

        query_tokens = self.query_tokens.expand(bs, -1, -1)
        if self.qformer_text_input:
            text_Qformer = self.tokenizer(
                prompt,
                padding='longest',
                truncation=True,
                max_length=self.max_txt_len,
                return_tensors="pt"
            ).to(image.device)
            query_atts = torch.ones(query_tokens.size()[:-1], dtype=torch.long).to(image.device)
            Qformer_atts = torch.cat([query_atts, text_Qformer.attention_mask], dim=1)

        # 5-D input is treated as video: encode frames independently and
        # concatenate their projected query tokens along the sequence axis.
        if image.dim() == 5:
            inputs_llm, atts_llm = [], []
            for j in range(image.size(2)):
                this_frame = image[:,:,j,:,:]
                with self.maybe_autocast():
                    frame_embeds = self.ln_vision(self.visual_encoder(this_frame))
                    frame_atts = torch.ones(frame_embeds.size()[:-1], dtype=torch.long).to(image.device)

                if self.qformer_text_input:
                    frame_query_output = self.Qformer.bert(
                        text_Qformer.input_ids,
                        attention_mask=Qformer_atts,
                        query_embeds=query_tokens,
                        encoder_hidden_states=frame_embeds,
                        encoder_attention_mask=frame_atts,
                        return_dict=True,
                    )
                else:
                    frame_query_output = self.Qformer.bert(
                        query_embeds=query_tokens,
                        encoder_hidden_states=frame_embeds,
                        encoder_attention_mask=frame_atts,
                        return_dict=True,
                    )

                frame_inputs_llm = self.llm_proj(frame_query_output.last_hidden_state[:,:query_tokens.size(1),:])
                frame_atts_llm = torch.ones(frame_inputs_llm.size()[:-1], dtype=torch.long).to(image.device)
                inputs_llm.append(frame_inputs_llm)
                atts_llm.append(frame_atts_llm)
            inputs_llm = torch.cat(inputs_llm, dim=1)
            atts_llm = torch.cat(atts_llm, dim=1)
        else:
            with self.maybe_autocast():
                image_embeds = self.ln_vision(self.visual_encoder(image))
            image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long).to(image.device)

            if self.qformer_text_input:
                query_output = self.Qformer.bert(
                    text_Qformer.input_ids,
                    attention_mask=Qformer_atts,
                    query_embeds=query_tokens,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )
            else:
                query_output = self.Qformer.bert(
                    query_embeds=query_tokens,
                    encoder_hidden_states=image_embeds,
                    encoder_attention_mask=image_atts,
                    return_dict=True,
                )

            inputs_llm = self.llm_proj(query_output.last_hidden_state[:,:query_tokens.size(1),:])
            atts_llm = torch.ones(inputs_llm.size()[:-1], dtype=torch.long).to(image.device)

        self.llm_tokenizer.padding_side = "right"
        self.llm_tokenizer.truncation_side = 'left'
        text_input_tokens = self.llm_tokenizer(
            prompt,
            return_tensors="pt",
            padding="longest",
            # truncation=True,
            # max_length=self.max_txt_len,
        ).to(image.device)

        # query-token positions carry no loss
        empty_targets = torch.ones(atts_llm.size(), dtype=torch.long).to(image.device).fill_(-100)

        # self.llm_tokenizer.padding_side = "right"
        self.llm_tokenizer.truncation_side = 'right'
        n_cands = len(candidates)
        with self.maybe_autocast(dtype=torch.bfloat16):
            all_losses = []
            for n in range(n_segments):
                seg_len = n_cands // n_segments
                # last segment absorbs the remainder
                if n == (n_segments - 1):
                    seg_len = n_cands - seg_len * (n_segments - 1)

                start_i = n * (n_cands // n_segments)
                end_i = start_i + seg_len

                this_output_tokens = self.llm_tokenizer(
                    candidates[start_i:end_i],
                    return_tensors="pt",
                    padding="longest",
                    # truncation=True,
                    # max_length=self.max_output_txt_len,
                ).to(image.device)

                # Cross every sample with every candidate in this segment:
                # inputs repeat_interleave (s1 s1 .. s2 s2 ..), outputs repeat
                # (c1 c2 .. c1 c2 ..).
                this_input_tokens_ids = text_input_tokens.input_ids.repeat_interleave(seg_len, dim=0)
                this_input_tokens_atts = text_input_tokens.attention_mask.repeat_interleave(seg_len, dim=0)

                this_output_tokens_ids = this_output_tokens.input_ids.repeat(bs, 1)
                this_output_tokens_atts = this_output_tokens.attention_mask.repeat(bs, 1)

                this_llm_tokens, this_input_targets_len = self.concat_text_input_output(
                    this_input_tokens_ids,
                    this_input_tokens_atts,
                    this_output_tokens_ids,
                    this_output_tokens_atts
                )

                this_llm_input_ids = this_llm_tokens['input_ids']
                this_llm_atts = this_llm_tokens['attention_mask']
                # this_llm_input_ids = torch.cat([this_input_tokens_ids, this_output_tokens_ids], dim=1)
                # this_llm_atts = torch.cat([this_input_tokens_atts, this_output_tokens_atts], dim=1)

                inputs_embeds = self.llm_model.get_input_embeddings()(this_llm_input_ids)
                inputs_embeds = torch.cat([inputs_llm.repeat_interleave(seg_len, dim=0), inputs_embeds], dim=1)
                attention_mask = torch.cat([atts_llm.repeat_interleave(seg_len, dim=0), this_llm_atts], dim=1)

                # mask padding and the prompt part from the loss
                this_targets = this_llm_input_ids.masked_fill(this_llm_input_ids == self.llm_tokenizer.pad_token_id, -100)
                # this_targets[:, :this_input_tokens_ids.size(1)] = -100
                for i, l in enumerate(this_input_targets_len):
                    this_targets[i][:l] = -100

                this_targets = torch.cat([empty_targets.repeat_interleave(seg_len, dim=0), this_targets], dim=1)

                # NOTE(review): reduction="none" is a non-standard kwarg — this
                # relies on a patched LlamaForCausalLM.forward that returns
                # per-sequence losses; confirm against the project's model code.
                outputs = self.llm_model(
                    inputs_embeds=inputs_embeds,
                    attention_mask=attention_mask,
                    return_dict=True,
                    labels=this_targets,
                    reduction="none",
                )

                loss = outputs.loss

                loss = loss.reshape(bs, seg_len)
                # output_class_ranks = torch.argsort(loss, dim=-1)
                all_losses.append(loss)

            all_losses = torch.cat(all_losses, dim=-1)
            output_class_ranks = torch.argsort(all_losses, dim=-1)

        return output_class_ranks
665
+
666
+ def _lemmatize(self, answers):
667
+ def apply(answer):
668
+ doc = self.lemmatizer(answer)
669
+
670
+ words = []
671
+ for token in doc:
672
+ if token.pos_ in ["NOUN", "VERB"]:
673
+ words.append(token.lemma_)
674
+ else:
675
+ words.append(token.text)
676
+ answer = " ".join(words)
677
+
678
+ return answer
679
+
680
+ return [apply(answer) for answer in answers]
681
+
682
+ @property
683
+ def lemmatizer(self):
684
+ if self._lemmatizer is None:
685
+ try:
686
+ import spacy
687
+
688
+ self._lemmatizer = spacy.load("en_core_web_sm")
689
+ except ImportError:
690
+ logging.error(
691
+ """
692
+ Please install spacy and en_core_web_sm model to apply lemmatization.
693
+ python -m spacy download en_core_web_sm
694
+ OR
695
+ import spacy.cli
696
+ spacy.cli.download("en_core_web_sm")
697
+ """
698
+ )
699
+ exit(1)
700
+
701
+ return self._lemmatizer
702
+
703
    @classmethod
    def from_config(cls, cfg):
        """Build the model from a config object supporting `cfg.get(key, default)`.

        Required keys: "image_size", "num_query_token", "llm_model".
        Everything else falls back to the defaults shown below. Loads the
        checkpoint referenced by the config before returning.
        """
        vit_model = cfg.get("vit_model", "eva_clip_g")
        img_size = cfg.get("image_size")
        num_query_token = cfg.get("num_query_token")
        llm_model = cfg.get("llm_model")

        drop_path_rate = cfg.get("drop_path_rate", 0)
        use_grad_checkpoint = cfg.get("use_grad_checkpoint", False)
        vit_precision = cfg.get("vit_precision", "fp16")
        freeze_vit = cfg.get("freeze_vit", True)

        prompt = cfg.get("prompt", "")
        max_txt_len = cfg.get("max_txt_len", 128)
        max_output_txt_len = cfg.get("max_output_txt_len", 256)

        apply_lemmatizer = cfg.get("apply_lemmatizer", False)

        qformer_text_input = cfg.get("qformer_text_input", True)

        model = cls(
            vit_model=vit_model,
            img_size=img_size,
            drop_path_rate=drop_path_rate,
            use_grad_checkpoint=use_grad_checkpoint,
            vit_precision=vit_precision,
            freeze_vit=freeze_vit,
            num_query_token=num_query_token,
            llm_model=llm_model,
            prompt=prompt,
            max_txt_len=max_txt_len,
            max_output_txt_len=max_output_txt_len,
            apply_lemmatizer=apply_lemmatizer,
            qformer_text_input=qformer_text_input,
        )

        # if qformer_text_input:
        #     # Hard-coded to load from BLIP-2 stage-1 pre-trained model (not ideal)
        #     model.load_from_pretrained(
        #         url_or_filename="https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/BLIP2/blip2_pretrained.pth"
        #     )

        model.load_checkpoint_from_config(cfg)

        return model
emo/all.py ADDED
@@ -0,0 +1,138 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os
import re
import shutil
import random


# Builds ./emo/train.json from three sources: reasoning Q/A files,
# conversation Q/A files, and classification samples derived from the
# existing train.json split. WARNING: train.json is read below and then
# OVERWRITTEN at the end of this script.
out = []

# ---- reasoning: one training record per parsed (question, answer) pair ----
folder_path_reasoning = './emo/reasoning/'
filelist_reasoning = os.listdir(folder_path_reasoning)

for class_name in filelist_reasoning:
    path = os.path.join(folder_path_reasoning, class_name)
    item = os.listdir(path)

    for name in item:
        with open(folder_path_reasoning + class_name + '/' + name, 'r', encoding='utf-8') as file:
            text = file.read()
        pattern = r"(?i)Question\s*:(.*?)\s*Answer\s*:(.*?)(?=\s*(Question\s*:|Answer\s*:|$))"

        matches = re.findall(pattern, text, re.DOTALL)
        reasoning = []

        for match in matches:
            question = match[0].strip()
            answer = match[1].strip()
            reasoning.append({"from": "human", "value": question})
            reasoning.append({"from": "gpt", "value": answer})

        for i in range(len(reasoning) // 2):  # was int(len(...)/2)
            out.append({"id": name.split('_')[1][:5], "image": name.split('.')[0] + '.jpg', 'conversations': reasoning[2 * i:2 * i + 2]})


# ---- conversation: same structure, plus copying the image to train_image ---
folder_path = './emo/conversation/'
filelist = os.listdir(folder_path)
for class_name in filelist:
    path = os.path.join(folder_path, class_name)
    item = os.listdir(path)

    for name in item:
        with open(folder_path + class_name + '/' + name, 'r', encoding='utf-8') as file:
            text = file.read()

        pattern = r"(?i)Question\s*\d*:(.*?)\s*Answer\s*\d*:(.*?)\s*(?=(Question:\d*|Complex Question:\d*|Complex question:\d*|$))"
        matches = re.findall(pattern, text, re.DOTALL)
        conversations = []

        for match in matches:
            question = match[0].strip()
            answer = match[1].strip()
            conversations.append({"from": "human", "value": question})
            conversations.append({"from": "gpt", "value": answer})

        if not conversations:
            # Nothing parsed from this file; the original code raised
            # IndexError on conversations[0] here. Skip it instead.
            continue

        #conversations[0]['value'] = conversations[0]['value'] + '\n<image>'
        conversations[0]['value'] = conversations[0]['value']

        #out.append({"id": name.split('_')[1][:5], "image": name.split('.')[0] + '.jpg', 'conversations': conversations})

        for i in range(len(conversations) // 2):  # was int(len(...)/2)
            out.append({"id": name.split('_')[1][:5], "image": name.split('.')[0] + '.jpg', 'conversations': conversations[2 * i:2 * i + 2]})

        shutil.copy('./emo/image/' + class_name + '/' + name[:-3] + 'jpg', './emo/image/train_image')


##### classification
with open('./emo/train.json', 'r') as json_file:
    json_data = json.load(json_file)

amusement_data = []
anger_data = []
awe_data = []
contentment_data = []
disgust_data = []
excitement_data = []
fear_data = []
sadness_data = []

# category name -> destination bucket (replaces the original if/elif chain)
buckets = {
    'amusement': amusement_data,
    'anger': anger_data,
    'awe': awe_data,
    'contentment': contentment_data,
    'disgust': disgust_data,
    'excitement': excitement_data,
    'fear': fear_data,
    'sadness': sadness_data,
}

for item in json_data:
    category = item[0]
    if category in buckets:
        # item[1] looks like './<dir>/<file>.jpg'; keep the file stem only
        buckets[category].append(item[1].split('/')[2][:-4])

all_data = [amusement_data, anger_data, awe_data, contentment_data, disgust_data, excitement_data, fear_data, sadness_data]
emo = ['amusement', 'anger', 'awe', 'contentment', 'disgust', 'excitement', 'fear', 'sadness']

for i in range(8):
    # NOTE(review): assumes every bucket holds at least 5600 entries; an
    # IndexError follows otherwise — confirm against the dataset split.
    for j in range(1000, 5600):
        # The backslash continuations embed the following lines' leading
        # spaces inside the prompt string — preserved as in the original.
        word = [
            {
                "from": "human",
                "value": "Please select the emotion closest to the image from the following options:\
                amusement, \
                anger, \
                awe, \
                contentment, \
                disgust, \
                excitement, \
                fear and sadness \
                (Do not provide answers outside of the candidates options.) Please answer in the following format: Predict emotion:"
            },
            {
                "from": "gpt",
                "value": 'Predict emotion: ' + emo[i]
            }
        ]
        temp = {'id': all_data[i][j][-5:], 'image': all_data[i][j] + '.jpg', 'conversations': word}

        out.append(temp)

        shutil.copy('./emo/image/' + emo[i] + '/' + all_data[i][j] + '.jpg', './emo/image/train_image')
#####

random.shuffle(out)
with open('./emo/train.json', 'w') as json_file:
    json.dump(out, json_file, indent=2)
138
+
emo/cap-anno.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import json

# Merge, per image, the generated caption with selected annotation fields
# into one text file under ./emo/cap-ano/<class_name>/.
class_name = 'sadness'

path = './emo/caption/' + class_name + '/'
filelist = os.listdir(path)

caption_path = './emo/caption/' + class_name + '/'
annotation_path = './emo/annotation/' + class_name + '/'
for name in filelist:
    print(name)
    with open(caption_path + name, 'r', encoding='utf-8') as file:
        caption = file.read()
    # annotation file shares the stem: foo.txt -> foo.json
    with open(annotation_path + name.split('txt')[0] + 'json', 'r') as json_file:
        annotation = json.load(json_file)

    # Output layout: caption, blank line, then one "key: value" line per
    # annotation field that is present ('emotion' is required).
    out = caption
    out = out + '\n\n'
    out = out + 'emotion: ' + str(annotation['emotion'])
    for key in ('brightness', 'colorfulness', 'object', 'facial_expression', 'human_action'):
        if key in annotation:
            out = out + '\n' + key + ': ' + str(annotation[key])

    out_path = "./emo/cap-ano/" + class_name + "/" + name
    # context manager closes the handle even if the write fails (the
    # original used bare open()/close() and the platform-default encoding)
    with open(out_path, 'w', encoding='utf-8') as f:
        f.write(out)
emo/caption.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from PIL import Image
import requests
from lavis.models import load_model_and_preprocess
import os

# Caption every image in one emotion folder with BLIP-2 (OPT-2.7b) and write
# one .txt file per image under ./emo/caption/sadness/.
device = torch.device("cuda") if torch.cuda.is_available() else "cpu"
model, vis_processors, _ = load_model_and_preprocess(
    name="blip2_opt", model_type="pretrain_opt2.7b", is_eval=True, device=device
)

path = './emo/image/sadness/'
filelist = os.listdir(path)

for name in filelist:
    print('-----------')
    print(name)
    out_path = './emo/caption/sadness/' + name.split('.')[0] + '.txt'
    raw_image = Image.open(path + name)
    image = vis_processors["eval"](raw_image).unsqueeze(0).to(device)

    caption = model.generate({"image": image})
    print(caption[0])
    # Bug fix: the original opened (and truncated) the output file *before*
    # generation and never closed it on failure; open it only once the
    # caption exists and let the context manager close it.
    with open(out_path, 'w') as f:
        f.write(caption[0])
emo/desktop.ini ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [LocalizedFileNames]
2
+ EmoSet-118K.zip=@EmoSet-118K,0
emo/gpt4_conversation.py ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import openai
4
+ import os
5
+
6
+ #openai.api_key need to change to your own key
7
+ #Search "Need change!!!" in this script
8
+ #Change the number in range
9
+
10
+
11
def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
    """Send `messages` to the OpenAI chat-completions API and return the reply text.

    Bug fix: the original hard-coded model/temperature/max_tokens inside the
    API call, silently ignoring these keyword arguments; they are now
    forwarded. The defaults reproduce the original behavior exactly.

    Args:
        messages: list of {"role": ..., "content": ...} dicts.
        model: chat model name.
        temperature: sampling temperature.
        max_tokens: completion length cap (None = API default).

    Returns:
        The content string of the first choice.
    """
    openai.api_key = ""  # key — NOTE(review): supply your own API key

    response = openai.ChatCompletion.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=messages)

    return response["choices"][0]["message"]["content"]
22
+
23
+
24
+ #####
25
# Bucket training image stems by emotion label, then ask GPT-4 for a
# conversation about each captioned+annotated image of one class.
with open('./emo/train.json', 'r') as json_file:
    json_data = json.load(json_file)

amusement_data = []
anger_data = []
awe_data = []
contentment_data = []
disgust_data = []
excitement_data = []
fear_data = []
sadness_data = []

# category name -> destination bucket (replaces the original if/elif chain)
buckets = {
    'amusement': amusement_data,
    'anger': anger_data,
    'awe': awe_data,
    'contentment': contentment_data,
    'disgust': disgust_data,
    'excitement': excitement_data,
    'fear': fear_data,
    'sadness': sadness_data,
}

for item in json_data:
    category = item[0]
    if category in buckets:
        # item[1] looks like './<dir>/<file>.jpg'; keep the file stem only
        buckets[category].append(item[1].split('/')[2][:-4])

#####

prompt_path = "./emo/prompt/conversation.txt"
with open(prompt_path, 'r', encoding='utf-8') as file:
    content = file.read()


#Need change!!!
class_name = 'sadness'
filelist = sadness_data

path = './emo/cap-ano/' + class_name + '/'
for i in range(0, 1000):
    print(i)
    name = filelist[i]
    caption_path = "./emo/cap-ano/" + class_name + "/" + name + '.txt'
    with open(caption_path, 'r', encoding='utf-8') as file:
        caption = file.read()

    messages = [
        {"role": "system", "content": content},
        {"role": "user", "content": caption}
    ]

    response_text = generate_chat_completion(messages)

    out_path = "./emo/conversation/" + class_name + "/" + name + '.txt'
    # context manager replaces the original open()/close() pair so the
    # handle is released even if the write fails
    with open(out_path, 'w') as f:
        f.write(response_text)
emo/gpt4_reasoning.py ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ import json
3
+ import openai
4
+ import os
5
+
6
+ #openai.api_key need to change to your own key
7
+ #Search "Need change!!!" in this script
8
+ #Change the number in range
9
+
10
+
11
def generate_chat_completion(messages, model="gpt-4", temperature=1, max_tokens=None):
    """Send a chat request to the OpenAI API and return the reply text.

    Args:
        messages: list of {"role": ..., "content": ...} dicts forming the
            chat history (system prompt + user caption).
        model: chat model name.  The original hard-coded "gpt-4" and
            silently ignored this argument; it is now honoured.
        temperature: sampling temperature, forwarded to the API
            (previously ignored, always 1).
        max_tokens: response length cap, or None for the API default
            (previously ignored).

    Returns:
        The assistant message content of the first choice.
    """
    # Read the key from the environment so it is not committed to git.
    # Need change!!!  export OPENAI_API_KEY=... before running.
    openai.api_key = os.environ.get("OPENAI_API_KEY", "")

    response = openai.ChatCompletion.create(
        model=model,
        max_tokens=max_tokens,
        temperature=temperature,
        messages=messages)

    return response["choices"][0]["message"]["content"]
22
+
23
+
24
##### Bucket training image ids by emotion category. #####
with open('./emo/train.json', 'r') as json_file:
    json_data = json.load(json_file)

# One list of image names (file stem, extension stripped) per emotion.
# Replaces the original eight parallel lists + if/elif dispatch chain.
categories = ('amusement', 'anger', 'awe', 'contentment',
              'disgust', 'excitement', 'fear', 'sadness')
category_data = {name: [] for name in categories}

for item in json_data:
    category = item[0]
    if category in category_data:
        # item[1] looks like '<split>/<category>/<name>.jpg';
        # keep only the bare file name without its extension.
        category_data[category].append(item[1].split('/')[2][:-4])

# Preserve the original per-category variable names for compatibility.
amusement_data = category_data['amusement']
anger_data = category_data['anger']
awe_data = category_data['awe']
contentment_data = category_data['contentment']
disgust_data = category_data['disgust']
excitement_data = category_data['excitement']
fear_data = category_data['fear']
sadness_data = category_data['sadness']

#####

# System prompt instructing GPT-4 to produce a reasoning Q&A pair.
prompt_path = "./emo/prompt/reasoning.txt"
with open(prompt_path, 'r', encoding='utf-8') as file:
    content = file.read()

# Need change!!!  Pick the emotion class to process.
class_name = 'fear'
filelist = category_data[class_name]

out_dir = "./emo/reasoning/" + class_name + "/"
os.makedirs(out_dir, exist_ok=True)  # ensure the target folder exists

# Process (at most) the first 100 captions.  The original indexed
# range(0, 100) blindly and could raise IndexError on short lists.
for i, name in enumerate(filelist[:100]):
    print(i)
    caption_path = "./emo/cap-ano/" + class_name + "/" + name + '.txt'
    with open(caption_path, 'r', encoding='utf-8') as file:
        caption = file.read()

    messages = [
        {"role": "system", "content": content},
        {"role": "user", "content": caption}
    ]
    # print(caption)
    # assert 0

    response_text = generate_chat_completion(messages)

    # One output file per image, keyed by the caption's file name.
    out_path = out_dir + name + '.txt'
    with open(out_path, 'w') as f:
        f.write(response_text)
emo/prompt/conversation.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI visual assistant, and you are seeing a single image. What you see is provided with one caption and some emotion-related attributes, describing the same image you are looking at. Answer all questions as if you are seeing the image.
2
+ The range of brightness is from 0 (darkest) to 1 (brightest), and the range of colorfulness is from 0 (black-and-white) to 1 (the most colorful).
3
+
4
+ Design two questions for a conversation between you and a person asking about this photo. The answers should be in a tone that a visual AI assistant is seeing the image and answering the question.
5
+ Ask diverse questions and give corresponding answers.
6
+
7
+ Include questions asking about the visual content of the image, including the object types, object actions, relationship among objects, etc. Only include questions that have definite answers:
8
+ (1) one can see the content in the image that the question asks about and can answer confidently;
9
+ (2) one can determine confidently from the image that it is not in the image.
10
+ Do not ask any question that cannot be answered confidently.
11
+ Please answer with the format
12
+ Question:
13
+ Answer:
14
+
15
+ Also include one complex question that is relevant to the content in the image, for example, asking about background knowledge of the objects in the image, asking to discuss about events happening in the image, etc. Again, do not ask about uncertain details.
16
+ Provide detailed answers when answering complex questions. For example, give detailed examples or reasoning steps to make the content more convincing and well-organized. You can include multiple paragraphs if necessary.
emo/prompt/description.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ You are an AI visual assistant that can analyze a single image. What you see is provided with one caption and some emotion-related attributes, describing the same image you are looking at.
2
+
3
+ Using the provided caption and attributes, describe the scene in a detailed manner.
4
+
5
+ When using the information from the caption and attributes, directly explain the scene, and do not mention that the information source is the caption or the attributes. Always answer as if you are directly looking at the image.
6
+ Please answer with the format
7
+ Question:
8
+ Answer:
emo/prompt/reasoning.txt ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ You are an AI visual assistant that can analyze a single image. You receive one caption and some emotion related attributes, describing the same image you are looking at.
2
+
3
+ The task is to use the provided caption and attributes, create a plausible question about the image, and provide the answer in detail.
4
+
5
+ Create one complex question beyond describing the scene.
6
+ To answer such question, one should require first understanding the visual content, then based on the background knowledge or reasoning, either explain why the things are happening that way, or provide guides and help to user's request. Make the question challenging by not including the visual content details in the question so that the user needs to reason about that first.
7
+
8
+ When using the information from the caption and attributes, directly explain the scene, and do not mention that the information source is the caption or the attributes. Always answer as if you are directly looking at the image.
9
+
10
+ Following are examples for your information:
11
+
12
+ caption & attributes:
13
+
14
+ a young girl wearing a cheerleading outfit at a restaurant
15
+
16
+ emotion: amusement
17
+ brightness: 0.5
18
+ colorfulness: 0.6
19
+ facial_expression: happy
20
+
21
+ Question: Considering the dining location and the girl's attire, what type of event might have led her to be in this situation, and why she could be feeling amused?
22
+
23
+ Answer: It is probable that the girl is coming from a cheerleading competition or performance since she's dressed in a cheerleading outfit. This could be a school game, cheer clubs event or community celebration where cheerleading was part of the festivities. Often such events foster a sense of camaraderie and joy - elements that could contribute to her amusement. Perhaps her team was successful, or maybe she simply enjoyed performing. Moreover, it is common for teams to celebrate together after events, which would explain their choice to go to a restaurant. Eating together becomes part of the shared experience of the day. Her happy facial expression implies that she is enjoying the current moment, possibly sharing exciting recounts of the day or joking around with her teammates or family. Despite the moderate brightness and colorfulness of the restaurant, the overall mood is upbeat and cheerful.
24
+
25
+
26
+ caption & attributes:
27
+ a figurine of an asian woman riding on top of a yellow machine
28
+
29
+ emotion: amusement
30
+ brightness: 0.8
31
+ colorfulness: 0.7
32
+
33
+ Question: Given the peculiar nature of the figurine with an Asian woman on a yellow machine, what cultural or pop culture references could this object possibly be alluding to which may serve as a source of amusement?
34
+
35
+ Answer: The figurine could be indicative of various cultural or pop culture elements. For instance, considering the bright and colorful nature of the figurine, it might be referencing the vibrant Asian pop culture. The yellow machine may be a nod towards the popularity of mecha (giant robots) in Asian cartoons and anime. The woman riding the machine might be a playful take on the trope of strong female characters seen in many of these works. As such, the amusement might arise from the recognition of these references and the whimsical portrayal of these themes in a form as innocent as a figurine. Sometimes, these figurines are designed in a playful and exaggerated manner to depict everyday or fictional scenarios with a touch of humor, which might also lead to the experienced amusement. The bright colors and the high brightness level add to the cheerfulness and comical nature of the scene.
36
+
37
+ caption & attributes:
38
+ a man standing in front of a display of powerpuff dolls
39
+
40
+ emotion: amusement
41
+ brightness: 0.6
42
+ colorfulness: 0.8
43
+ object: ['Toy']
44
+ facial_expression: happy
45
+
46
+ Question: Taking into account the scene with the man and the display of Powerpuff dolls, why might this scenario be amusing and evoke happiness in him, considering the age and gender stereotypes often associated with such collectibles?
47
+
48
+ Answer: This image could be amusing due to the contrast between the man and the Powerpuff dolls, which are typically associated with a younger, female demographic. The man's amused expression and the vibrant colors and brightness of the scene suggest a playful and light-hearted atmosphere. Perhaps the man used to watch the Powerpuff Girls show when he was younger and the dolls took him down the memory lane, sparking a sense of nostalgia, a factor that could certainly contribute to his happiness. Maybe he is a collector who appreciates these toys for their design and the cultural significance they hold. Alternatively, he could be a parent or an uncle shopping for a young relative and the sight of the familiar characters from his past brought a smile to his face. Regardless, the image challenges the conventional stereotype regarding who should enjoy toys and cartoon characters, showing that amusement can be found in unexpected places and situations.
49
+
emo/test.json ADDED
The diff for this file is too large to render. See raw diff
 
emo/train.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b3043ba41320d9309513d9661e64c1f89d2c40370302596718b774f5917e772f
3
+ size 10750390
emo/val.json ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Flask and Web Framework
2
+ Flask==2.3.3
3
+ gunicorn==21.2.0
4
+
5
+ # Core ML Libraries
6
+ torch>=1.10.0
7
+ torchvision>=0.11.0
8
+ transformers>=4.28.0
9
+ pillow>=10.0.0
10
+
11
+ # Image Processing
12
+ opencv-python-headless>=4.5.0
13
+
14
+ # LAVIS dependencies (for BLIP2)
15
+ salesforce-lavis
16
+ omegaconf>=2.3.0
17
+
18
+ # Data handling
19
+ numpy>=1.24.0
20
+ pandas>=2.0.0
21
+
22
+ # Other utilities
23
+ requests>=2.31.0
24
+ tqdm>=4.66.0
25
+ safetensors>=0.4.0
26
+
27
+ # For Hugging Face compatibility
28
+ huggingface-hub>=0.20.0
29
+
30
+ # Additional utilities that might be needed
31
+ einops>=0.7.0
32
+ timm>=0.4.12
33
+ sentencepiece>=0.1.99
34
+
35
+ # For better logging
36
+ loguru
37
+
38
+ # For environment variables
39
+ python-dotenv
requirements_emo.txt ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ absl-py==2.1.0
2
+ altair==5.1.2
3
+ annotated-types==0.6.0
4
+ antlr4-python3-runtime==4.9.3
5
+ asttokens==2.4.1
6
+ attrs==23.1.0
7
+ backcall==0.2.0
8
+ backports.zoneinfo==0.2.1
9
+ bleach==6.1.0
10
+ blinker==1.7.0
11
+ blis==0.7.11
12
+ braceexpand==0.1.7
13
+ cachetools==5.3.3
14
+ catalogue==2.0.10
15
+ certifi==2023.7.22
16
+ cfgv==3.4.0
17
+ charset-normalizer==3.3.2
18
+ click==8.1.7
19
+ cloudpathlib==0.16.0
20
+ confection==0.1.3
21
+ contexttimer==0.3.3
22
+ contourpy==1.1.1
23
+ cycler==0.12.1
24
+ cymem==2.0.8
25
+ Cython==3.0.10
26
+ decorator==5.1.1
27
+ decord==0.6.0
28
+ diffusers==0.16.0
29
+ distlib==0.3.7
30
+ einops==0.7.0
31
+ executing==2.0.1
32
+ fairscale==0.4.4
33
+ filelock==3.13.1
34
+ fonttools==4.44.0
35
+ fsspec==2023.10.0
36
+ ftfy==6.1.1
37
+ gitdb==4.0.11
38
+ GitPython==3.1.40
39
+ google-auth==2.29.0
40
+ google-auth-oauthlib==1.0.0
41
+ grpcio==1.62.1
42
+ huggingface-hub==0.20.2
43
+ identify==2.5.31
44
+ idna==3.4
45
+ imageio==2.32.0
46
+ importlib-resources==6.1.1
47
+ importlib_metadata==7.1.0
48
+ iopath==0.1.10
49
+ ipython==8.12.3
50
+ jedi==0.19.1
51
+ Jinja2==3.1.2
52
+ jsonschema==4.19.2
53
+ jsonschema-specifications==2023.7.1
54
+ kaggle==1.5.16
55
+ kiwisolver==1.4.5
56
+ langcodes==3.3.0
57
+ lazy_loader==0.3
58
+ Markdown==3.6
59
+ markdown-it-py==3.0.0
60
+ MarkupSafe==2.1.5
61
+ matplotlib==3.7.3
62
+ matplotlib-inline==0.1.7
63
+ mdurl==0.1.2
64
+ mpmath==1.3.0
65
+ murmurhash==1.0.10
66
+ networkx==3.1
67
+ nodeenv==1.8.0
68
+ numpy==1.24.4
69
+ nvidia-cublas-cu12==12.1.3.1
70
+ nvidia-cuda-cupti-cu12==12.1.105
71
+ nvidia-cuda-nvrtc-cu12==12.1.105
72
+ nvidia-cuda-runtime-cu12==12.1.105
73
+ nvidia-cudnn-cu12==8.9.2.26
74
+ nvidia-cufft-cu12==11.0.2.54
75
+ nvidia-curand-cu12==10.3.2.106
76
+ nvidia-cusolver-cu12==11.4.5.107
77
+ nvidia-cusparse-cu12==12.1.0.106
78
+ nvidia-nccl-cu12==2.18.1
79
+ nvidia-nvjitlink-cu12==12.3.52
80
+ nvidia-nvtx-cu12==12.1.105
81
+ omegaconf==2.3.0
82
+ opencv-python-headless==4.5.5.64
83
+ opendatasets==0.1.22
84
+ packaging==24.0
85
+ pandas==2.0.3
86
+ parso==0.8.4
87
+ pexpect==4.8.0
88
+ pickleshare==0.7.5
89
+ pillow==10.3.0
90
+ pkgutil_resolve_name==1.3.10
91
+ plotly==5.18.0
92
+ portalocker==2.8.2
93
+ pre-commit==3.5.0
94
+ preshed==3.0.9
95
+ prompt-toolkit==3.0.43
96
+ protobuf==5.26.1
97
+ ptyprocess==0.7.0
98
+ pure-eval==0.2.2
99
+ pyarrow==14.0.1
100
+ pycocoevalcap==1.2
101
+ pycocotools==2.0.7
102
+ pydantic==2.5.0
103
+ pydantic_core==2.14.1
104
+ pydeck==0.8.1b0
105
+ Pygments==2.17.2
106
+ pyparsing==3.1.1
107
+ python-dateutil==2.8.2
108
+ python-magic==0.4.27
109
+ python-slugify==8.0.1
110
+ pytz==2024.1
111
+ PyWavelets==1.4.1
112
+ PyYAML==6.0.1
113
+ referencing==0.30.2
114
+ regex==2023.10.3
115
+ requests==2.31.0
116
+ requests-oauthlib==2.0.0
117
+ rich==13.6.0
118
+ rpds-py==0.12.0
119
+ rsa==4.9
120
+ safetensors==0.4.0
121
+ scikit-image==0.21.0
122
+ scipy==1.10.1
123
+ sentencepiece==0.1.99
124
+ six==1.16.0
125
+ smart-open==6.4.0
126
+ smmap==5.0.1
127
+ spacy==3.7.2
128
+ spacy-legacy==3.0.12
129
+ spacy-loggers==1.0.5
130
+ srsly==2.4.8
131
+ stack-data==0.6.3
132
+ streamlit==1.28.2
133
+ sympy==1.12
134
+ tenacity==8.2.3
135
+ tensorboard==2.14.0
136
+ tensorboard-data-server==0.7.2
137
+ text-unidecode==1.3
138
+ thinc==8.2.1
139
+ tifffile==2023.7.10
140
+ timm==0.4.12
141
+ tokenizers==0.13.3
142
+ toml==0.10.2
143
+ toolz==0.12.0
144
+ torch==1.10.2+cu111
145
+ torch-tb-profiler==0.4.3
146
+ torchaudio==0.10.2+cu111
147
+ torchvision==0.11.3+cu111
148
+ tqdm==4.66.1
149
+ traitlets==5.14.3
150
+ transformers==4.28.0
151
+ triton==2.1.0
152
+ typer==0.9.0
153
+ typing_extensions==4.11.0
154
+ tzdata==2024.1
155
+ tzlocal==5.2
156
+ urllib3==1.26.18
157
+ validators==0.22.0
158
+ virtualenv==20.24.6
159
+ wasabi==1.1.2
160
+ watchdog==3.0.0
161
+ wcwidth==0.2.13
162
+ weasel==0.3.4
163
+ webdataset==0.2.75
164
+ webencodings==0.5.1
165
+ Werkzeug==3.0.1
166
+ zipp==3.17.0
requirements_lavis.txt ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ altair==5.4.1
2
+ annotated-types==0.7.0
3
+ antlr4-python3-runtime==4.9.3
4
+ asttokens==2.4.1
5
+ attrs==24.2.0
6
+ backcall==0.2.0
7
+ bleach==6.1.0
8
+ blinker==1.8.2
9
+ blis==0.7.11
10
+ braceexpand==0.1.7
11
+ cachetools==5.5.0
12
+ catalogue==2.0.10
13
+ certifi==2024.8.30
14
+ cfgv==3.4.0
15
+ charset-normalizer==3.3.2
16
+ click==8.1.7
17
+ cloudpathlib==0.19.0
18
+ cmake==3.30.3
19
+ confection==0.1.5
20
+ contexttimer==0.3.3
21
+ contourpy==1.1.1
22
+ cycler==0.12.1
23
+ cymem==2.0.8
24
+ decorator==5.1.1
25
+ decord==0.6.0
26
+ diffusers==0.16.0
27
+ distlib==0.3.8
28
+ einops==0.8.0
29
+ executing==2.1.0
30
+ fairscale==0.4.4
31
+ filelock==3.16.1
32
+ fonttools==4.53.1
33
+ fsspec==2024.9.0
34
+ ftfy==6.2.3
35
+ gitdb==4.0.11
36
+ GitPython==3.1.43
37
+ huggingface-hub==0.25.0
38
+ identify==2.6.1
39
+ idna==3.10
40
+ imageio==2.35.1
41
+ importlib_metadata==8.5.0
42
+ importlib_resources==6.4.5
43
+ iopath==0.1.10
44
+ ipython==8.12.3
45
+ jedi==0.19.1
46
+ Jinja2==3.1.4
47
+ jsonschema==4.23.0
48
+ jsonschema-specifications==2023.12.1
49
+ kaggle==1.6.17
50
+ kiwisolver==1.4.7
51
+ langcodes==3.4.0
52
+ language_data==1.2.0
53
+ lazy_loader==0.4
54
+ lit==18.1.8
55
+ marisa-trie==1.2.0
56
+ markdown-it-py==3.0.0
57
+ MarkupSafe==2.1.5
58
+ matplotlib==3.7.5
59
+ matplotlib-inline==0.1.7
60
+ mdurl==0.1.2
61
+ mpmath==1.3.0
62
+ murmurhash==1.0.10
63
+ narwhals==1.8.2
64
+ networkx==3.1
65
+ nodeenv==1.9.1
66
+ numpy==1.24.4
67
+ nvidia-cublas-cu11==11.10.3.66
68
+ nvidia-cuda-cupti-cu11==11.7.101
69
+ nvidia-cuda-nvrtc-cu11==11.7.99
70
+ nvidia-cuda-runtime-cu11==11.7.99
71
+ nvidia-cudnn-cu11==8.5.0.96
72
+ nvidia-cufft-cu11==10.9.0.58
73
+ nvidia-curand-cu11==10.2.10.91
74
+ nvidia-cusolver-cu11==11.4.0.1
75
+ nvidia-cusparse-cu11==11.7.4.91
76
+ nvidia-nccl-cu11==2.14.3
77
+ nvidia-nvtx-cu11==11.7.91
78
+ omegaconf==2.3.0
79
+ opencv-python-headless==4.5.5.64
80
+ opendatasets==0.1.22
81
+ packaging==24.1
82
+ pandas==2.0.3
83
+ parso==0.8.4
84
+ pexpect==4.9.0
85
+ pickleshare==0.7.5
86
+ pillow==10.4.0
87
+ pkgutil_resolve_name==1.3.10
88
+ platformdirs==4.3.6
89
+ plotly==5.24.1
90
+ portalocker==2.10.1
91
+ pre-commit==3.5.0
92
+ preshed==3.0.9
93
+ prompt_toolkit==3.0.47
94
+ protobuf==5.28.2
95
+ ptyprocess==0.7.0
96
+ pure_eval==0.2.3
97
+ pyarrow==17.0.0
98
+ pycocoevalcap==1.2
99
+ pycocotools==2.0.7
100
+ pydantic==2.9.2
101
+ pydantic_core==2.23.4
102
+ pydeck==0.9.1
103
+ Pygments==2.18.0
104
+ pyparsing==3.1.4
105
+ python-dateutil==2.9.0.post0
106
+ python-magic==0.4.27
107
+ python-slugify==8.0.4
108
+ pytz==2024.2
109
+ PyWavelets==1.4.1
110
+ PyYAML==6.0.2
111
+ referencing==0.35.1
112
+ regex==2024.9.11
113
+ requests==2.32.3
114
+ rich==13.8.1
115
+ rpds-py==0.20.0
116
+ safetensors==0.4.5
117
+ scikit-image==0.21.0
118
+ scipy==1.10.1
119
+ sentencepiece==0.2.0
120
+ shellingham==1.5.4
121
+ six==1.16.0
122
+ smart-open==7.0.4
123
+ smmap==5.0.1
124
+ spacy==3.7.6
125
+ spacy-legacy==3.0.12
126
+ spacy-loggers==1.0.5
127
+ srsly==2.4.8
128
+ stack-data==0.6.3
129
+ streamlit==1.38.0
130
+ sympy==1.13.3
131
+ tenacity==8.5.0
132
+ text-unidecode==1.3
133
+ thinc==8.2.5
134
+ tifffile==2023.7.10
135
+ timm==0.4.12
136
+ tokenizers==0.13.3
137
+ toml==0.10.2
138
+ torch==2.0.0
139
+ torchaudio==2.0.1
140
+ torchvision==0.15.1
141
+ tornado==6.4.1
142
+ tqdm==4.66.5
143
+ traitlets==5.14.3
144
+ transformers==4.31.0
145
+ triton==2.0.0
146
+ typer==0.12.5
147
+ typing_extensions==4.12.2
148
+ tzdata==2024.1
149
+ urllib3==2.2.3
150
+ virtualenv==20.26.5
151
+ wasabi==1.1.3
152
+ watchdog==4.0.2
153
+ wcwidth==0.2.13
154
+ weasel==0.4.1
155
+ webdataset==0.2.100
156
+ webencodings==0.5.1
157
+ wrapt==1.16.0
158
+ zipp==3.20.2
static/css/style.css ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /* Custom CSS for EmoVIT Application */
2
+
3
+ :root {
4
+ --primary-color: #4f46e5;
5
+ --primary-dark: #3730a3;
6
+ --secondary-color: #06b6d4;
7
+ --success-color: #10b981;
8
+ --warning-color: #f59e0b;
9
+ --danger-color: #ef4444;
10
+ --light-color: #f8fafc;
11
+ --dark-color: #1e293b;
12
+ }
13
+
14
+ body {
15
+ font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
16
+ line-height: 1.6;
17
+ }
18
+
19
+ /* Background Gradient */
20
+ .bg-gradient-primary {
21
+ background: linear-gradient(135deg, var(--primary-color) 0%, var(--secondary-color) 100%);
22
+ min-height: 100vh;
23
+ }
24
+
25
+ /* Custom Card Styling */
26
+ .card {
27
+ border: none;
28
+ box-shadow: 0 20px 25px -5px rgba(0, 0, 0, 0.1), 0 10px 10px -5px rgba(0, 0, 0, 0.04);
29
+ transition: all 0.3s ease;
30
+ }
31
+
32
+ .card:hover {
33
+ transform: translateY(-2px);
34
+ box-shadow: 0 25px 30px -5px rgba(0, 0, 0, 0.15), 0 15px 15px -5px rgba(0, 0, 0, 0.06);
35
+ }
36
+
37
+ /* Button Styling */
38
+ .btn {
39
+ border-radius: 12px;
40
+ font-weight: 600;
41
+ text-transform: uppercase;
42
+ letter-spacing: 0.5px;
43
+ transition: all 0.3s ease;
44
+ border: 2px solid transparent;
45
+ }
46
+
47
+ .btn-primary {
48
+ background: linear-gradient(135deg, var(--primary-color), var(--primary-dark));
49
+ border: none;
50
+ box-shadow: 0 4px 15px rgba(79, 70, 229, 0.3);
51
+ }
52
+
53
+ .btn-primary:hover {
54
+ transform: translateY(-2px);
55
+ box-shadow: 0 8px 25px rgba(79, 70, 229, 0.4);
56
+ background: linear-gradient(135deg, var(--primary-dark), var(--primary-color));
57
+ }
58
+
59
+ .btn-outline-primary {
60
+ border-color: var(--primary-color);
61
+ color: var(--primary-color);
62
+ }
63
+
64
+ .btn-outline-primary:hover {
65
+ background: var(--primary-color);
66
+ border-color: var(--primary-color);
67
+ transform: translateY(-2px);
68
+ }
69
+
70
+ /* Form Styling */
71
+ .form-control {
72
+ border-radius: 10px;
73
+ border: 2px solid #e2e8f0;
74
+ padding: 12px 16px;
75
+ font-size: 16px;
76
+ transition: all 0.3s ease;
77
+ }
78
+
79
+ .form-control:focus {
80
+ border-color: var(--primary-color);
81
+ box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.1);
82
+ transform: translateY(-1px);
83
+ }
84
+
85
+ .form-control-lg {
86
+ padding: 16px 20px;
87
+ font-size: 18px;
88
+ }
89
+
90
+ /* Alert Styling */
91
+ .alert {
92
+ border-radius: 12px;
93
+ border: none;
94
+ padding: 20px;
95
+ font-weight: 500;
96
+ }
97
+
98
+ .alert-info {
99
+ background: linear-gradient(135deg, #e0f2fe, #b3e5fc);
100
+ color: #0277bd;
101
+ }
102
+
103
+ .alert-success {
104
+ background: linear-gradient(135deg, #e8f5e8, #c8e6c9);
105
+ color: #2e7d32;
106
+ }
107
+
108
+ .alert-danger {
109
+ background: linear-gradient(135deg, #ffebee, #ffcdd2);
110
+ color: #c62828;
111
+ }
112
+
113
+ /* Loading Spinner */
114
+ .spinner-border {
115
+ width: 3rem;
116
+ height: 3rem;
117
+ border-width: 0.3em;
118
+ }
119
+
120
+ /* Image Preview */
121
+ #previewImage {
122
+ max-width: 100%;
123
+ border-radius: 10px;
124
+ transition: transform 0.3s ease;
125
+ }
126
+
127
+ #previewImage:hover {
128
+ transform: scale(1.02);
129
+ }
130
+
131
+ /* Upload Section */
132
+ .upload-section {
133
+ position: relative;
134
+ }
135
+
136
+ .upload-section::before {
137
+ content: '';
138
+ position: absolute;
139
+ top: -20px;
140
+ left: 50%;
141
+ transform: translateX(-50%);
142
+ width: 60px;
143
+ height: 4px;
144
+ background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
145
+ border-radius: 2px;
146
+ }
147
+
148
+ /* Results Section */
149
+ .emotion-result {
150
+ text-align: center;
151
+ }
152
+
153
+ .emotion-result h4 {
154
+ margin-bottom: 20px;
155
+ }
156
+
157
+ /* Icon Styling */
158
+ .fas, .far {
159
+ transition: transform 0.3s ease;
160
+ }
161
+
162
+ .btn:hover .fas,
163
+ .btn:hover .far {
164
+ transform: scale(1.1);
165
+ }
166
+
167
+ /* Card Headers */
168
+ .card-header {
169
+ background: linear-gradient(135deg, #f8fafc, #e2e8f0) !important;
170
+ border-bottom: 2px solid #e2e8f0;
171
+ border-radius: 12px 12px 0 0 !important;
172
+ }
173
+
174
+ /* Responsive Design */
175
+ @media (max-width: 768px) {
176
+ .container {
177
+ padding: 15px;
178
+ }
179
+
180
+ .card-body {
181
+ padding: 30px 20px;
182
+ }
183
+
184
+ .display-4 {
185
+ font-size: 2.5rem;
186
+ }
187
+
188
+ .btn-lg {
189
+ padding: 12px 30px;
190
+ font-size: 16px;
191
+ }
192
+ }
193
+
194
+ /* Animations */
195
+ @keyframes fadeIn {
196
+ from {
197
+ opacity: 0;
198
+ transform: translateY(20px);
199
+ }
200
+ to {
201
+ opacity: 1;
202
+ transform: translateY(0);
203
+ }
204
+ }
205
+
206
+ .card {
207
+ animation: fadeIn 0.6s ease-out;
208
+ }
209
+
210
+ /* Custom Utilities */
211
+ .rounded-4 {
212
+ border-radius: 1.5rem !important;
213
+ }
214
+
215
+ .text-white-50 {
216
+ color: rgba(255, 255, 255, 0.75) !important;
217
+ }
218
+
219
+ /* File Input Styling */
220
+ input[type="file"] {
221
+ cursor: pointer;
222
+ }
223
+
224
+ input[type="file"]::-webkit-file-upload-button {
225
+ background: linear-gradient(135deg, var(--primary-color), var(--primary-dark));
226
+ color: white;
227
+ border: none;
228
+ padding: 8px 16px;
229
+ border-radius: 8px;
230
+ margin-right: 10px;
231
+ cursor: pointer;
232
+ font-weight: 600;
233
+ transition: all 0.3s ease;
234
+ }
235
+
236
+ input[type="file"]::-webkit-file-upload-button:hover {
237
+ transform: translateY(-1px);
238
+ box-shadow: 0 4px 12px rgba(79, 70, 229, 0.3);
239
+ }
240
+
241
+ /* Progress Bar (for future use) */
242
+ .progress {
243
+ height: 8px;
244
+ border-radius: 4px;
245
+ background-color: #e2e8f0;
246
+ }
247
+
248
+ .progress-bar {
249
+ background: linear-gradient(90deg, var(--primary-color), var(--secondary-color));
250
+ border-radius: 4px;
251
+ }
252
+
253
+ /* Tooltip Styling */
254
+ .tooltip {
255
+ font-size: 12px;
256
+ }
257
+
258
+ .tooltip-inner {
259
+ background-color: var(--dark-color);
260
+ border-radius: 6px;
261
+ }
262
+
263
+ /* Focus States for Accessibility */
264
+ .btn:focus,
265
+ .form-control:focus {
266
+ outline: none;
267
+ box-shadow: 0 0 0 3px rgba(79, 70, 229, 0.25);
268
+ }
269
+
270
+ /* Error Styling */
271
+ .is-invalid {
272
+ border-color: var(--danger-color);
273
+ }
274
+
275
+ .invalid-feedback {
276
+ color: var(--danger-color);
277
+ font-weight: 500;
278
+ }
templates/index.html ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>EmoVIT - Emotion Detection</title>
    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/css/bootstrap.min.css" rel="stylesheet">
    <link href="https://cdnjs.cloudflare.com/ajax/libs/font-awesome/6.0.0/css/all.min.css" rel="stylesheet">
    <link href="{{ url_for('static', filename='css/style.css') }}" rel="stylesheet">
</head>
<body>
    <div class="container-fluid bg-gradient-primary min-vh-100">
        <div class="container py-5">
            <div class="row justify-content-center">
                <div class="col-lg-8">
                    <!-- Header -->
                    <div class="text-center mb-5">
                        <h1 class="display-4 text-white mb-3">
                            <i class="fas fa-smile text-warning me-3"></i>
                            EmoVIT
                        </h1>
                        <p class="lead text-white-50">
                            AI-Powered Emotion Detection using BLIP2-Vicuna
                        </p>
                    </div>

                    <!-- Main Card -->
                    <div class="card shadow-lg border-0 rounded-4">
                        <div class="card-body p-5">
                            <!-- Upload Section -->
                            <div class="upload-section mb-4">
                                <h3 class="text-center mb-4">
                                    <i class="fas fa-upload text-primary me-2"></i>
                                    Upload Image for Emotion Analysis
                                </h3>

                                <form id="uploadForm" enctype="multipart/form-data">
                                    <!-- Custom Prompt -->
                                    <div class="mb-4">
                                        <label for="promptInput" class="form-label fw-bold">
                                            <i class="fas fa-comment-dots me-2"></i>
                                            Custom Prompt (Optional)
                                        </label>
                                        <input type="text"
                                               class="form-control form-control-lg"
                                               id="promptInput"
                                               name="prompt"
                                               placeholder="What emotion is shown in this image?"
                                               value="What emotion is shown in this image?">
                                    </div>

                                    <!-- File Upload -->
                                    <div class="mb-4">
                                        <label for="imageInput" class="form-label fw-bold">
                                            <i class="fas fa-image me-2"></i>
                                            Select Image
                                        </label>
                                        <input type="file"
                                               class="form-control form-control-lg"
                                               id="imageInput"
                                               name="image"
                                               accept="image/*"
                                               required>
                                    </div>

                                    <!-- Submit Button -->
                                    <div class="text-center">
                                        <button type="submit"
                                                class="btn btn-primary btn-lg px-5 py-3"
                                                id="analyzeBtn">
                                            <i class="fas fa-brain me-2"></i>
                                            Analyze Emotion
                                        </button>
                                    </div>
                                </form>
                            </div>

                            <!-- Loading Spinner -->
                            <div id="loadingSpinner" class="text-center d-none">
                                <div class="spinner-border text-primary" role="status">
                                    <span class="visually-hidden">Loading...</span>
                                </div>
                                <p class="mt-3 text-muted">Analyzing emotion...</p>
                            </div>

                            <!-- Results Section -->
                            <div id="resultsSection" class="d-none">
                                <hr class="my-5">
                                <h3 class="text-center mb-4">
                                    <i class="fas fa-chart-line text-success me-2"></i>
                                    Analysis Results
                                </h3>

                                <div class="row">
                                    <!-- Image Preview -->
                                    <div class="col-md-6 mb-4">
                                        <div class="card h-100">
                                            <div class="card-header bg-light">
                                                <h5 class="mb-0">
                                                    <i class="fas fa-image me-2"></i>
                                                    Uploaded Image
                                                </h5>
                                            </div>
                                            <div class="card-body text-center">
                                                <!-- src is assigned by the page script after analysis;
                                                     an empty src="" attribute is invalid HTML, so it
                                                     is omitted here. -->
                                                <img id="previewImage"
                                                     alt="Uploaded image"
                                                     class="img-fluid rounded shadow-sm"
                                                     style="max-height: 300px;">
                                            </div>
                                        </div>
                                    </div>

                                    <!-- Results -->
                                    <div class="col-md-6 mb-4">
                                        <div class="card h-100">
                                            <div class="card-header bg-light">
                                                <h5 class="mb-0">
                                                    <i class="fas fa-brain me-2"></i>
                                                    Detected Emotion
                                                </h5>
                                            </div>
                                            <div class="card-body">
                                                <div class="alert alert-info" role="alert">
                                                    <strong>Prompt:</strong>
                                                    <p id="usedPrompt" class="mb-0 mt-2"></p>
                                                </div>

                                                <div class="emotion-result">
                                                    <h4 class="text-primary mb-3">
                                                        <i class="fas fa-smile-beam me-2"></i>
                                                        Result:
                                                    </h4>
                                                    <div class="alert alert-success" role="alert">
                                                        <p id="emotionResult" class="mb-0 fs-5 fw-bold"></p>
                                                    </div>
                                                </div>
                                            </div>
                                        </div>
                                    </div>
                                </div>

                                <!-- Try Again Button -->
                                <div class="text-center mt-4">
                                    <button type="button"
                                            class="btn btn-outline-primary btn-lg"
                                            id="tryAgainBtn">
                                        <i class="fas fa-redo me-2"></i>
                                        Try Another Image
                                    </button>
                                </div>
                            </div>

                            <!-- Error Section -->
                            <div id="errorSection" class="d-none">
                                <div class="alert alert-danger" role="alert">
                                    <h4 class="alert-heading">
                                        <i class="fas fa-exclamation-triangle me-2"></i>
                                        Error
                                    </h4>
                                    <p id="errorMessage" class="mb-0"></p>
                                </div>
                            </div>
                        </div>
                    </div>

                    <!-- Footer -->
                    <div class="text-center mt-5">
                        <p class="text-white-50">
                            <i class="fas fa-robot me-2"></i>
                            Powered by BLIP2-Vicuna AI Model
                        </p>
                    </div>
                </div>
            </div>
        </div>
    </div>
178
+
179
+ <!-- Scripts -->
180
+ <script src="https://cdn.jsdelivr.net/npm/bootstrap@5.1.3/dist/js/bootstrap.bundle.min.js"></script>
181
+ <script>
182
+ document.addEventListener('DOMContentLoaded', function() {
183
+ const uploadForm = document.getElementById('uploadForm');
184
+ const loadingSpinner = document.getElementById('loadingSpinner');
185
+ const resultsSection = document.getElementById('resultsSection');
186
+ const errorSection = document.getElementById('errorSection');
187
+ const tryAgainBtn = document.getElementById('tryAgainBtn');
188
+ const analyzeBtn = document.getElementById('analyzeBtn');
189
+
190
+ uploadForm.addEventListener('submit', async function(e) {
191
+ e.preventDefault();
192
+
193
+ // Hide previous results
194
+ resultsSection.classList.add('d-none');
195
+ errorSection.classList.add('d-none');
196
+
197
+ // Show loading
198
+ loadingSpinner.classList.remove('d-none');
199
+ analyzeBtn.disabled = true;
200
+
201
+ try {
202
+ const formData = new FormData(uploadForm);
203
+
204
+ const response = await fetch('/predict', {
205
+ method: 'POST',
206
+ body: formData
207
+ });
208
+
209
+ const result = await response.json();
210
+
211
+ if (result.success) {
212
+ // Display results
213
+ document.getElementById('previewImage').src = 'data:image/png;base64,' + result.image;
214
+ document.getElementById('emotionResult').textContent = result.emotion;
215
+ document.getElementById('usedPrompt').textContent = result.prompt;
216
+
217
+ resultsSection.classList.remove('d-none');
218
+ } else {
219
+ throw new Error(result.error || 'Unknown error occurred');
220
+ }
221
+ } catch (error) {
222
+ console.error('Error:', error);
223
+ document.getElementById('errorMessage').textContent = error.message;
224
+ errorSection.classList.remove('d-none');
225
+ } finally {
226
+ // Hide loading
227
+ loadingSpinner.classList.add('d-none');
228
+ analyzeBtn.disabled = false;
229
+ }
230
+ });
231
+
232
+ tryAgainBtn.addEventListener('click', function() {
233
+ resultsSection.classList.add('d-none');
234
+ errorSection.classList.add('d-none');
235
+ uploadForm.reset();
236
+ document.getElementById('promptInput').value = 'What emotion is shown in this image?';
237
+ });
238
+
239
+ // Preview image on selection
240
+ document.getElementById('imageInput').addEventListener('change', function(e) {
241
+ const file = e.target.files[0];
242
+ if (file) {
243
+ const reader = new FileReader();
244
+ reader.onload = function(e) {
245
+ // Could add image preview here if needed
246
+ };
247
+ reader.readAsDataURL(file);
248
+ }
249
+ });
250
+ });
251
+ </script>
252
+ </body>
253
+ </html>