# NOTE(review): the three lines below this comment block were GitHub web-UI
# residue captured during extraction (author avatar text, commit subject,
# short hash). They are kept as commented-out provenance so the script parses;
# ideally they should be removed entirely so the shebang is the file's first line.
# Author: Aryanaideveloper
# commit 5cc48fc — feat: initial project setup with audio/video detection modules and fusion
#!/bin/bash
# Setup script for Audio Detection (Nes2Net) environment.
# Run this from the project root: Multimodal Deepfake Detection/
#
# Creates the `deepfake-audio` conda env, installs a pinned PyTorch/CUDA
# stack plus audio dependencies (librosa, s3prl, transformers, ...).
set -euo pipefail   # abort on any failed step, unset var, or failed pipe stage

echo "=== Setting up Audio Detection Environment ==="

# `conda activate` only works in a non-interactive script after the shell
# hook is sourced; without this line the script dies with
# "CommandNotFoundError: Your shell has not been properly configured".
eval "$(conda shell.bash hook)"

# Create conda environment
# Using Python 3.9 for better compatibility (asvspoof5 branch approach)
conda create -n deepfake-audio python=3.9 -y
conda activate deepfake-audio

# Install PyTorch with CUDA support (adjust CUDA version for your GPU)
# NOTE(review): these wheels are built for CUDA 11.7 (+cu117), but the
# comment below claims the target GPU needs CUDA 12.x — a Blackwell-class
# RTX 5050 will not run cu117 binaries. Confirm the intended GPU/toolkit
# pairing before relying on GPU inference.
# RTX 5050 supports CUDA 12.x
pip install torch==1.13.1+cu117 torchaudio==0.13.1+cu117 torchvision==0.14.1+cu117 \
  -f https://download.pytorch.org/whl/torch_stable.html

# Install core dependencies (versions pinned to match the authors' setup)
pip install librosa==0.9.1 soundfile==0.12.1 numpy==1.23.5 scipy==1.9.3
pip install transformers==4.30.2 s3prl==0.4.15
pip install tqdm scikit-learn pandas
# Install fairseq (required for the wav2vec2 frontend).
# The authors pin a specific commit, so clone and check out that exact revision.
echo "=== Installing fairseq ==="

# Guard the cd: if audio_detection/ is missing (script run from the wrong
# directory), the original would have cloned fairseq into the wrong place.
cd audio_detection || { echo "ERROR: audio_detection/ not found — run this script from the project root" >&2; exit 1; }

if [ ! -d "fairseq" ]; then
  git clone https://github.com/facebookresearch/fairseq.git || { echo "ERROR: git clone of fairseq failed" >&2; exit 1; }
  cd fairseq || exit 1
  # Pin to the commit used by the authors so the wav2vec2 frontend behaves
  # reproducibly regardless of upstream fairseq changes.
  git checkout a54021305d6b3c4c5959ac9395135f63202db8f1 || { echo "ERROR: could not check out pinned fairseq commit" >&2; exit 1; }
  pip install --editable ./ || { echo "ERROR: editable install of fairseq failed" >&2; exit 1; }
  cd ..
else
  # NOTE: an existing directory is assumed to be a completed install; a
  # half-finished earlier run should be removed manually before re-running.
  echo "fairseq already exists, skipping clone"
fi
# Final summary: manual follow-up steps the user must do after setup.
# Quoted delimiter ('EOF') keeps every line literal — no expansion.
cat <<'EOF'

=== Setup Complete ===
Next steps:
1. Download Nes2Net checkpoint from Google Drive (see README in Nes2Net_ASVspoof_ITW)
2. Download wav2vec 2.0 XLSR model (will auto-download on first run via fairseq)
3. Run inference: python Nes2Net_ASVspoof_ITW/easy_inference_demo.py --model_path <checkpoint> --file_to_test <audio_file> --model_name wav2vec2_Nes2Net_X
EOF