Spaces:
Build error
Build error
Commit
·
23e4091
1
Parent(s):
a118576
Fix NLTK data installation and path handling
Browse files
- Dockerfile +10 -1
- app.py +30 -4
Dockerfile
CHANGED
|
@@ -48,16 +48,25 @@ RUN mkdir -p /usr/local/share/nltk_data \
|
|
| 48 |
# Install Python dependencies
|
| 49 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 50 |
|
| 51 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 52 |
RUN python -c "import nltk; nltk.download('punkt', download_dir='/usr/local/share/nltk_data')" \
|
| 53 |
&& python -c "import nltk; nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')" \
|
| 54 |
&& python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')" \
|
| 55 |
&& python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/share/nltk_data')" \
|
|
|
|
| 56 |
&& chmod -R 755 /usr/local/share/nltk_data
|
| 57 |
|
| 58 |
# Set NLTK_DATA environment variable
|
| 59 |
ENV NLTK_DATA=/usr/local/share/nltk_data
|
| 60 |
|
|
|
|
|
|
|
|
|
|
| 61 |
# Switch to non-root user
|
| 62 |
USER user
|
| 63 |
|
|
|
|
| 48 |
# Install Python dependencies
|
| 49 |
RUN pip install --no-cache-dir -r requirements.txt
|
| 50 |
|
| 51 |
+
# Install system dependencies for NLTK
|
| 52 |
+
RUN apt-get update && apt-get install -y --no-install-recommends \
|
| 53 |
+
unzip \
|
| 54 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 55 |
+
|
| 56 |
+
# Download and install NLTK data as root
|
| 57 |
RUN python -c "import nltk; nltk.download('punkt', download_dir='/usr/local/share/nltk_data')" \
|
| 58 |
&& python -c "import nltk; nltk.download('stopwords', download_dir='/usr/local/share/nltk_data')" \
|
| 59 |
&& python -c "import nltk; nltk.download('wordnet', download_dir='/usr/local/share/nltk_data')" \
|
| 60 |
&& python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/share/nltk_data')" \
|
| 61 |
+
&& python -c "import nltk; nltk.download('punkt_tab', download_dir='/usr/local/share/nltk_data')" \
|
| 62 |
&& chmod -R 755 /usr/local/share/nltk_data
|
| 63 |
|
| 64 |
# Set NLTK_DATA environment variable
|
| 65 |
ENV NLTK_DATA=/usr/local/share/nltk_data
|
| 66 |
|
| 67 |
+
# Verify NLTK data is accessible
|
| 68 |
+
RUN python -c "import nltk; nltk.data.path.append('/usr/local/share/nltk_data'); nltk.data.find('tokenizers/punkt')"
|
| 69 |
+
|
| 70 |
# Switch to non-root user
|
| 71 |
USER user
|
| 72 |
|
app.py
CHANGED
|
@@ -66,18 +66,35 @@ try:
|
|
| 66 |
from summarizer import document_summarizer
|
| 67 |
from vector_store import vector_store
|
| 68 |
|
| 69 |
-
|
| 70 |
import nltk
|
| 71 |
|
| 72 |
# Set NLTK data path - system path first, then user path
|
| 73 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Verify NLTK data is available
|
| 76 |
required_nltk_data = [
|
| 77 |
'tokenizers/punkt',
|
| 78 |
'corpora/stopwords',
|
| 79 |
'corpora/wordnet',
|
| 80 |
-
'taggers/averaged_perceptron_tagger'
|
|
|
|
| 81 |
]
|
| 82 |
|
| 83 |
for resource in required_nltk_data:
|
|
@@ -86,7 +103,16 @@ try:
|
|
| 86 |
logger.info(f"NLTK resource found: {resource}")
|
| 87 |
except LookupError as e:
|
| 88 |
logger.warning(f"NLTK resource not found: {resource}")
|
| 89 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 90 |
|
| 91 |
except ImportError as e:
|
| 92 |
logger.error(f"Failed to import required modules: {e}")
|
|
|
|
| 66 |
from summarizer import document_summarizer
|
| 67 |
from vector_store import vector_store
|
| 68 |
|
| 69 |
+
# Initialize NLTK data
|
| 70 |
import nltk
|
| 71 |
|
| 72 |
# Set NLTK data path - system path first, then user path
|
| 73 |
+
nltk_data_paths = [
|
| 74 |
+
str(NLTK_DATA_DIR),
|
| 75 |
+
'/usr/local/share/nltk_data',
|
| 76 |
+
'/usr/share/nltk_data',
|
| 77 |
+
'/usr/local/nltk_data',
|
| 78 |
+
'/usr/local/share/nltk_data',
|
| 79 |
+
'/usr/local/lib/nltk_data',
|
| 80 |
+
'/usr/share/nltk_data',
|
| 81 |
+
'/usr/local/share/nltk_data',
|
| 82 |
+
'/usr/lib/nltk_data',
|
| 83 |
+
'/usr/local/lib/nltk_data',
|
| 84 |
+
'/root/nltk_data',
|
| 85 |
+
'/home/user/nltk_data'
|
| 86 |
+
]
|
| 87 |
+
|
| 88 |
+
# Add all possible NLTK data paths
|
| 89 |
+
nltk.data.path = list(dict.fromkeys(nltk_data_paths + nltk.data.path))
|
| 90 |
|
| 91 |
# Verify NLTK data is available
|
| 92 |
required_nltk_data = [
|
| 93 |
'tokenizers/punkt',
|
| 94 |
'corpora/stopwords',
|
| 95 |
'corpora/wordnet',
|
| 96 |
+
'taggers/averaged_perceptron_tagger',
|
| 97 |
+
'tokenizers/punkt_tab/english'
|
| 98 |
]
|
| 99 |
|
| 100 |
for resource in required_nltk_data:
|
|
|
|
| 103 |
logger.info(f"NLTK resource found: {resource}")
|
| 104 |
except LookupError as e:
|
| 105 |
logger.warning(f"NLTK resource not found: {resource}")
|
| 106 |
+
# Try to download the resource if not found
|
| 107 |
+
try:
|
| 108 |
+
resource_name = resource.split('/')[-1].split('.')[0]
|
| 109 |
+
logger.info(f"Attempting to download NLTK resource: {resource_name}")
|
| 110 |
+
nltk.download(resource_name, download_dir=str(NLTK_DATA_DIR))
|
| 111 |
+
nltk.data.path.append(str(NLTK_DATA_DIR))
|
| 112 |
+
nltk.data.find(resource) # Try to find it again after download
|
| 113 |
+
logger.info(f"Successfully downloaded NLTK resource: {resource}")
|
| 114 |
+
except Exception as download_error:
|
| 115 |
+
logger.error(f"Failed to download NLTK resource {resource}: {str(download_error)}")
|
| 116 |
|
| 117 |
except ImportError as e:
|
| 118 |
logger.error(f"Failed to import required modules: {e}")
|