bluewhale2025 committed on
Commit
544d677
·
1 Parent(s): 5daea2d

Fix NLTK data handling and Dockerfile configuration

Browse files
Files changed (2) hide show
  1. Dockerfile +17 -6
  2. app.py +17 -54
Dockerfile CHANGED
@@ -38,15 +38,26 @@ RUN useradd -m -u 1000 user && \
38
  # Copy requirements first to leverage Docker cache
39
  COPY --chown=user:user requirements.txt .
40
 
41
- # Install Python dependencies
42
- RUN pip install --no-cache-dir -r requirements.txt
43
 
44
- # Download NLTK data during build
45
- RUN python -c "import nltk; nltk.download('punkt', download_dir='/home/user/app/nltk_data')"
46
- RUN python -c "import nltk; nltk.download('stopwords', download_dir='/home/user/app/nltk_data')"
 
 
 
 
 
 
 
 
47
 
48
  # Set NLTK_DATA environment variable
49
- ENV NLTK_DATA=/home/user/app/nltk_data
 
 
 
50
 
51
  # Copy application files
52
  COPY --chown=user:user . .
 
38
  # Copy requirements first to leverage Docker cache
39
  COPY --chown=user:user requirements.txt .
40
 
41
+ # Install Python dependencies and NLTK data as root
42
+ USER root
43
 
44
+ # Create NLTK data directory with proper permissions
45
+ RUN mkdir -p /usr/share/nltk_data/tokenizers \
46
+ && chmod -R 777 /usr/share/nltk_data
47
+
48
+ # Install NLTK and download data
49
+ RUN pip install --no-cache-dir -r requirements.txt \
50
+ && python -c "import nltk; nltk.download('punkt', download_dir='/usr/share/nltk_data')" \
51
+ && python -c "import nltk; nltk.download('stopwords', download_dir='/usr/share/nltk_data')" \
52
+ && python -c "import nltk; nltk.download('wordnet', download_dir='/usr/share/nltk_data')" \
53
+ && python -c "import nltk; nltk.download('averaged_perceptron_tagger', download_dir='/usr/share/nltk_data')" \
54
+ && chmod -R 755 /usr/share/nltk_data
55
 
56
  # Set NLTK_DATA environment variable
57
+ ENV NLTK_DATA=/usr/share/nltk_data
58
+
59
+ # Switch back to non-root user
60
+ USER user
61
 
62
  # Copy application files
63
  COPY --chown=user:user . .
app.py CHANGED
@@ -42,7 +42,8 @@ app.add_middleware(
42
  BASE_DIR = Path("/home/user/app/data")
43
  UPLOAD_DIR = BASE_DIR / "uploads"
44
  PROCESSED_DIR = BASE_DIR / "processed"
45
- NLTK_DATA_DIR = Path(os.getenv("NLTK_DATA", "/app/nltk_data"))
 
46
 
47
  # Ensure directories exist with proper permissions
48
  for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
@@ -67,63 +68,25 @@ try:
67
 
68
  # Initialize NLTK data
69
  import nltk
70
- nltk.data.path.append(str(NLTK_DATA_DIR))
 
 
71
 
72
  # Verify NLTK data is available
73
- try:
74
- nltk.data.find('tokenizers/punkt')
75
- nltk.data.find('corpora/stopwords')
76
- nltk.data.find('corpora/wordnet')
77
- nltk.data.find('taggers/averaged_perceptron_tagger')
78
- logger.info("NLTK data verified successfully")
79
- except LookupError as e:
80
- logger.warning(f"NLTK data missing: {e}")
81
- try:
82
- nltk.download('punkt', download_dir=str(NLTK_DATA_DIR))
83
- nltk.data.path.append(str(NLTK_DATA_DIR))
84
- logger.info("Successfully downloaded NLTK punkt data")
85
- except Exception as download_error:
86
- logger.error(f"Failed to download NLTK data: {download_error}")
87
- # Continue without NLTK data if download fails
88
-
89
- try:
90
- nltk.data.find('corpora/stopwords')
91
- logger.info("NLTK stopwords is available")
92
- except LookupError as e:
93
- logger.warning(f"NLTK data missing: {e}")
94
- try:
95
- nltk.download('stopwords', download_dir=str(NLTK_DATA_DIR))
96
- nltk.data.path.append(str(NLTK_DATA_DIR))
97
- logger.info("Successfully downloaded NLTK stopwords data")
98
- except Exception as download_error:
99
- logger.error(f"Failed to download NLTK data: {download_error}")
100
- # Continue without NLTK data if download fails
101
-
102
- try:
103
- nltk.data.find('corpora/wordnet')
104
- logger.info("NLTK wordnet is available")
105
- except LookupError as e:
106
- logger.warning(f"NLTK data missing: {e}")
107
- try:
108
- nltk.download('wordnet', download_dir=str(NLTK_DATA_DIR))
109
- nltk.data.path.append(str(NLTK_DATA_DIR))
110
- logger.info("Successfully downloaded NLTK wordnet data")
111
- except Exception as download_error:
112
- logger.error(f"Failed to download NLTK data: {download_error}")
113
- # Continue without NLTK data if download fails
114
 
115
- try:
116
- nltk.data.find('taggers/averaged_perceptron_tagger')
117
- logger.info("NLTK averaged_perceptron_tagger is available")
118
- except LookupError as e:
119
- logger.warning(f"NLTK data missing: {e}")
120
  try:
121
- nltk.download('averaged_perceptron_tagger', download_dir=str(NLTK_DATA_DIR))
122
- nltk.data.path.append(str(NLTK_DATA_DIR))
123
- logger.info("Successfully downloaded NLTK averaged_perceptron_tagger data")
124
- except Exception as download_error:
125
- logger.error(f"Failed to download NLTK data: {download_error}")
126
- # Continue without NLTK data if download fails
127
 
128
  except ImportError as e:
129
  logger.error(f"Failed to import required modules: {e}")
 
42
  BASE_DIR = Path("/home/user/app/data")
43
  UPLOAD_DIR = BASE_DIR / "uploads"
44
  PROCESSED_DIR = BASE_DIR / "processed"
45
+ # Use system NLTK data directory that we'll populate in the Dockerfile
46
+ NLTK_DATA_DIR = Path("/usr/share/nltk_data")
47
 
48
  # Ensure directories exist with proper permissions
49
  for directory in [BASE_DIR, UPLOAD_DIR, PROCESSED_DIR]:
 
68
 
69
  # Initialize NLTK data
70
  import nltk
71
+
72
+ # Set NLTK data path - system path first, then user path
73
+ nltk.data.path = [str(NLTK_DATA_DIR)] + nltk.data.path
74
 
75
  # Verify NLTK data is available
76
+ required_nltk_data = [
77
+ 'tokenizers/punkt',
78
+ 'corpora/stopwords',
79
+ 'corpora/wordnet',
80
+ 'taggers/averaged_perceptron_tagger'
81
+ ]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
 
83
+ for resource in required_nltk_data:
 
 
 
 
84
  try:
85
+ nltk.data.find(resource)
86
+ logger.info(f"NLTK resource found: {resource}")
87
+ except LookupError as e:
88
+ logger.warning(f"NLTK resource not found: {resource}")
89
+ # Don't try to download at runtime - should be handled in Dockerfile
 
90
 
91
  except ImportError as e:
92
  logger.error(f"Failed to import required modules: {e}")