Spaces:
Build error
Build error
Commit
·
544d677
1
Parent(s):
5daea2d
Fix NLTK data handling and Dockerfile configuration
Browse files
- Dockerfile +17 -6
- app.py +17 -54
Dockerfile
CHANGED
|
@@ -38,15 +38,26 @@ RUN useradd -m -u 1000 user && \
|
|
| 38 |
# Copy requirements first to leverage Docker cache
|
| 39 |
COPY --chown=user:user requirements.txt .
|
| 40 |
|
| 41 |
-
# Install Python dependencies
|
| 42 |
-
|
| 43 |
|
| 44 |
-
#
|
| 45 |
-
RUN
|
| 46 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
|
| 48 |
# Set NLTK_DATA environment variable
|
| 49 |
-
ENV NLTK_DATA=/
|
|
|
|
|
|
|
|
|
|
| 50 |
|
| 51 |
# Copy application files
|
| 52 |
COPY --chown=user:user . .
|
|
|
|
| 38 |
# Copy requirements first to leverage Docker cache
|
| 39 |
COPY --chown=user:user requirements.txt .
|
| 40 |
|
| 41 |
+
# Install Python dependencies and NLTK data as root
|
| 42 |
+
USER root
|
| 43 |
|
| 44 |
+
# Create NLTK data directory with proper permissions
|
| 45 |
+
RUN mkdir -p /usr/share/nltk_data/tokenizers \
|
| 46 |
+
&& chmod -R 777 /usr/share/nltk_data
|
| 47 |
+
|
| 48 |
+
# Install NLTK and download data
|
| 49 |
+
RUN pip install --no-cache-dir -r requirements.txt \
|
| 50 |
+
&& python -c "import nltk; nltk.download('punkt', download_dir='/usr/share/nltk_data')" \
|
| 51 |
+
&& python -c "import nltk; nltk.download('stopwords', download_dir='/usr/share/nltk_data')" \
|
| 52 |
+
&& python -c "import nltk; nltk.download('wordnet', download_dir='/usr/share/nltk_data')" \
|
| 53 |
+
&& python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/share/nltk_data')" \
|
| 54 |
+
&& chmod -R 755 /usr/share/nltk_data
|
| 55 |
|
| 56 |
# Set NLTK_DATA environment variable
|
| 57 |
+
ENV NLTK_DATA=/usr/share/nltk_data
|
| 58 |
+
|
| 59 |
+
# Switch back to non-root user
|
| 60 |
+
USER user
|
| 61 |
|
| 62 |
# Copy application files
|
| 63 |
COPY --chown=user:user . .
|
app.py
CHANGED
|
@@ -42,7 +42,8 @@ app.add_middleware(
|
|
| 42 |
BASE_DIR = Path("/home/user/app/data")
|
| 43 |
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 44 |
PROCESSED_DIR = BASE_DIR / "processed"
|
| 45 |
-
|
|
|
|
| 46 |
|
| 47 |
# Ensure directories exist with proper permissions
|
| 48 |
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
|
|
@@ -67,63 +68,25 @@ try:
|
|
| 67 |
|
| 68 |
# Initialize NLTK data
|
| 69 |
import nltk
|
| 70 |
-
|
|
|
|
|
|
|
| 71 |
|
| 72 |
# Verify NLTK data is available
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
except LookupError as e:
|
| 80 |
-
logger.warning(f"NLTK data missing: {e}")
|
| 81 |
-
try:
|
| 82 |
-
nltk.download('punkt', download_dir=str(NLTK_DATA_DIR))
|
| 83 |
-
nltk.data.path.append(str(NLTK_DATA_DIR))
|
| 84 |
-
logger.info("Successfully downloaded NLTK punkt data")
|
| 85 |
-
except Exception as download_error:
|
| 86 |
-
logger.error(f"Failed to download NLTK data: {download_error}")
|
| 87 |
-
# Continue without NLTK data if download fails
|
| 88 |
-
|
| 89 |
-
try:
|
| 90 |
-
nltk.data.find('corpora/stopwords')
|
| 91 |
-
logger.info("NLTK stopwords is available")
|
| 92 |
-
except LookupError as e:
|
| 93 |
-
logger.warning(f"NLTK data missing: {e}")
|
| 94 |
-
try:
|
| 95 |
-
nltk.download('stopwords', download_dir=str(NLTK_DATA_DIR))
|
| 96 |
-
nltk.data.path.append(str(NLTK_DATA_DIR))
|
| 97 |
-
logger.info("Successfully downloaded NLTK stopwords data")
|
| 98 |
-
except Exception as download_error:
|
| 99 |
-
logger.error(f"Failed to download NLTK data: {download_error}")
|
| 100 |
-
# Continue without NLTK data if download fails
|
| 101 |
-
|
| 102 |
-
try:
|
| 103 |
-
nltk.data.find('corpora/wordnet')
|
| 104 |
-
logger.info("NLTK wordnet is available")
|
| 105 |
-
except LookupError as e:
|
| 106 |
-
logger.warning(f"NLTK data missing: {e}")
|
| 107 |
-
try:
|
| 108 |
-
nltk.download('wordnet', download_dir=str(NLTK_DATA_DIR))
|
| 109 |
-
nltk.data.path.append(str(NLTK_DATA_DIR))
|
| 110 |
-
logger.info("Successfully downloaded NLTK wordnet data")
|
| 111 |
-
except Exception as download_error:
|
| 112 |
-
logger.error(f"Failed to download NLTK data: {download_error}")
|
| 113 |
-
# Continue without NLTK data if download fails
|
| 114 |
|
| 115 |
-
|
| 116 |
-
nltk.data.find('taggers/averaged_perceptron_tagger')
|
| 117 |
-
logger.info("NLTK averaged_perceptron_tagger is available")
|
| 118 |
-
except LookupError as e:
|
| 119 |
-
logger.warning(f"NLTK data missing: {e}")
|
| 120 |
try:
|
| 121 |
-
nltk.
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
# Continue without NLTK data if download fails
|
| 127 |
|
| 128 |
except ImportError as e:
|
| 129 |
logger.error(f"Failed to import required modules: {e}")
|
|
|
|
| 42 |
BASE_DIR = Path("/home/user/app/data")
|
| 43 |
UPLOAD_DIR = BASE_DIR / "uploads"
|
| 44 |
PROCESSED_DIR = BASE_DIR / "processed"
|
| 45 |
+
# Use system NLTK data directory that we'll populate in the Dockerfile
|
| 46 |
+
NLTK_DATA_DIR = Path("/usr/share/nltk_data")
|
| 47 |
|
| 48 |
# Ensure directories exist with proper permissions
|
| 49 |
for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
|
|
|
|
| 68 |
|
| 69 |
# Initialize NLTK data
|
| 70 |
import nltk
|
| 71 |
+
|
| 72 |
+
# Set NLTK data path - system path first, then user path
|
| 73 |
+
nltk.data.path = [str(NLTK_DATA_DIR)] + nltk.data.path
|
| 74 |
|
| 75 |
# Verify NLTK data is available
|
| 76 |
+
required_nltk_data = [
|
| 77 |
+
'tokenizers/punkt',
|
| 78 |
+
'corpora/stopwords',
|
| 79 |
+
'corpora/wordnet',
|
| 80 |
+
'taggers/averaged_perceptron_tagger'
|
| 81 |
+
]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
+
for resource in required_nltk_data:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
try:
|
| 85 |
+
nltk.data.find(resource)
|
| 86 |
+
logger.info(f"NLTK resource found: {resource}")
|
| 87 |
+
except LookupError as e:
|
| 88 |
+
logger.warning(f"NLTK resource not found: {resource}")
|
| 89 |
+
# Don't try to download at runtime - should be handled in Dockerfile
|
|
|
|
| 90 |
|
| 91 |
except ImportError as e:
|
| 92 |
logger.error(f"Failed to import required modules: {e}")
|