bluewhale2025 committed on
Commit
23e4091
·
1 Parent(s): a118576

Fix NLTK data installation and path handling

Browse files
Files changed (2) hide show
  1. Dockerfile +10 -1
  2. app.py +30 -4
Dockerfile CHANGED
@@ -48,16 +48,25 @@ RUN mkdir -p /usr/local/share/nltk_data \
48
  # Install Python dependencies
49
  RUN pip install --no-cache-dir -r requirements.txt
50
 
51
- # Download NLTK data as root
 
 
 
 
 
52
  RUN python -c "import nltk; nltk.download('punkt', download_dir='/usr/local/share/nltk_data')" \
53
  && python -c "import nltk; nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')" \
54
  && python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')" \
55
  && python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/share/nltk_data')" \
 
56
  && chmod -R 755 /usr/local/share/nltk_data
57
 
58
  # Set NLTK_DATA environment variable
59
  ENV NLTK_DATA=/usr/local/share/nltk_data
60
 
 
 
 
61
  # Switch to non-root user
62
  USER user
63
 
 
48
  # Install Python dependencies
49
  RUN pip install --no-cache-dir -r requirements.txt
50
 
51
+ # Install system dependencies for NLTK
52
+ RUN apt-get update && apt-get install -y --no-install-recommends \
53
+ unzip \
54
+ && rm -rf /var/lib/apt/lists/*
55
+
56
+ # Download and install NLTK data as root
57
  RUN python -c "import nltk; nltk.download('punkt', download_dir='/usr/local/share/nltk_data')" \
58
  && python -c "import nltk; nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')" \
59
  && python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')" \
60
  && python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/share/nltk_data')" \
61
+ && python -c "import nltk; nltk.download('punkt_tab', download_dir='/usr/local/share/nltk_data')" \
62
  && chmod -R 755 /usr/local/share/nltk_data
63
 
64
  # Set NLTK_DATA environment variable
65
  ENV NLTK_DATA=/usr/local/share/nltk_data
66
 
67
+ # Verify NLTK data is accessible
68
+ RUN python -c "import nltk; nltk.data.path.append('/usr/local/share/nltk_data'); nltk.data.find('tokenizers/punkt')"
69
+
70
  # Switch to non-root user
71
  USER user
72
 
app.py CHANGED
@@ -66,18 +66,35 @@ try:
66
  from summarizer import document_summarizer
67
  from vector_store import vector_store
68
 
69
- # Initialize NLTK data
70
  import nltk
71
 
72
  # Set NLTK data path - system path first, then user path
73
- nltk.data.path = [str(NLTK_DATA_DIR)] + nltk.data.path
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  # Verify NLTK data is available
76
  required_nltk_data = [
77
  'tokenizers/punkt',
78
  'corpora/stopwords',
79
  'corpora/wordnet',
80
- 'taggers/averaged_perceptron_tagger'
 
81
  ]
82
 
83
  for resource in required_nltk_data:
@@ -86,7 +103,16 @@ try:
86
  logger.info(f"NLTK resource found: {resource}")
87
  except LookupError as e:
88
  logger.warning(f"NLTK resource not found: {resource}")
89
- # Don't try to download at runtime - should be handled in Dockerfile
 
 
 
 
 
 
 
 
 
90
 
91
  except ImportError as e:
92
  logger.error(f"Failed to import required modules: {e}")
 
66
  from summarizer import document_summarizer
67
  from vector_store import vector_store
68
 
69
+ # Initialize NLTK data
70
  import nltk
71
 
72
  # Set NLTK data path - system path first, then user path
73
+ nltk_data_paths = [
74
+ str(NLTK_DATA_DIR),
75
+ '/usr/local/share/nltk_data',
76
+ '/usr/share/nltk_data',
77
+ '/usr/local/nltk_data',
78
+ '/usr/local/share/nltk_data',
79
+ '/usr/local/lib/nltk_data',
80
+ '/usr/share/nltk_data',
81
+ '/usr/local/share/nltk_data',
82
+ '/usr/lib/nltk_data',
83
+ '/usr/local/lib/nltk_data',
84
+ '/root/nltk_data',
85
+ '/home/user/nltk_data'
86
+ ]
87
+
88
+ # Add all possible NLTK data paths
89
+ nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
90
 
91
  # Verify NLTK data is available
92
  required_nltk_data = [
93
  'tokenizers/punkt',
94
  'corpora/stopwords',
95
  'corpora/wordnet',
96
+ 'taggers/averaged_perceptron_tagger',
97
+ 'tokenizers/punkt_tab/english'
98
  ]
99
 
100
  for resource in required_nltk_data:
 
103
  logger.info(f"NLTK resource found: {resource}")
104
  except LookupError as e:
105
  logger.warning(f"NLTK resource not found: {resource}")
106
+ # Try to download the resource if not found
107
+ try:
108
+ resource_name = resource.split('/')[-1].split('.')[0]
109
+ logger.info(f"Attempting to download NLTK resource: {resource_name}")
110
+ nltk.download(resource_name, download_dir=str(NLTK_DATA_DIR))
111
+ nltk.data.path.append(str(NLTK_DATA_DIR))
112
+ nltk.data.find(resource) # Try to find it again after download
113
+ logger.info(f"Successfully downloaded NLTK resource: {resource}")
114
+ except Exception as download_error:
115
+ logger.error(f"Failed to download NLTK resource {resource}: {str(download_error)}")
116
 
117
  except ImportError as e:
118
  logger.error(f"Failed to import required modules: {e}")