Ishu8904 committed on
Commit
f5cf0fb
·
1 Parent(s): 40b0aa5

FIX: Handle NLTK download reliably with nltk.txt

Browse files
Files changed (3) hide show
  1. build.sh +2 -1
  2. build_vocab.py +0 -7
  3. nltk.txt +1 -0
build.sh CHANGED
@@ -4,6 +4,7 @@ set -e
4
  # Install all the python packages
5
  echo "--- Installing dependencies ---"
6
  pip install -r requirements.txt
 
7
 
8
  # Download ONLY the large model files from the GitHub Release
9
  echo "--- Downloading model files ---"
@@ -11,7 +12,7 @@ wget -O decoder-model.pth "https://github.com/Ishu-Kaur/Image-Caption-AI/release
11
  wget -O encoder-model.pth "https://github.com/Ishu-Kaur/Image-Caption-AI/releases/download/v1.0.1/encoder-model.pth"
12
  echo "--- Model files downloaded successfully ---"
13
 
14
- # CRITICAL STEP: Build the vocabulary file directly on the server
15
  echo "--- Building vocabulary file ---"
16
  python build_vocab.py
17
  echo "--- Vocabulary file built successfully ---"
 
4
  # Install all the python packages
5
  echo "--- Installing dependencies ---"
6
  pip install -r requirements.txt
7
+ python -m nltk.downloader -d /opt/render/project/src/nltk_data punkt  # download the NLTK data listed in nltk.txt (pip cannot install NLTK corpora)
8
 
9
  # Download ONLY the large model files from the GitHub Release
10
  echo "--- Downloading model files ---"
 
12
  wget -O encoder-model.pth "https://github.com/Ishu-Kaur/Image-Caption-AI/releases/download/v1.0.1/encoder-model.pth"
13
  echo "--- Model files downloaded successfully ---"
14
 
15
+ # Build the vocabulary file directly on the server
16
  echo "--- Building vocabulary file ---"
17
  python build_vocab.py
18
  echo "--- Vocabulary file built successfully ---"
build_vocab.py CHANGED
@@ -56,13 +56,6 @@ class Vocabulary:
56
  if __name__ == "__main__":
57
  print("Starting vocabulary creation process...")
58
 
59
- # Download the NLTK tokenizer model (only needs to be done once)
60
- try:
61
- nltk.data.find('tokenizers/punkt')
62
- except LookupError: # <-- This is a more robust way to check
63
- print("Downloading NLTK 'punkt' model...")
64
- nltk.download('punkt')
65
-
66
  # Load the Flickr8k training data from Hugging Face
67
  print("Loading Flickr8k dataset from Hugging Face...")
68
  train_dataset = load_dataset("jxie/flickr8k", split="train")
 
56
  if __name__ == "__main__":
57
  print("Starting vocabulary creation process...")
58
 
 
 
 
 
 
 
 
59
  # Load the Flickr8k training data from Hugging Face
60
  print("Loading Flickr8k dataset from Hugging Face...")
61
  train_dataset = load_dataset("jxie/flickr8k", split="train")
nltk.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ punkt