HimanshuGoyal2004 committed on
Commit
fcbf7d5
·
1 Parent(s): e6dab59
Files changed (3) hide show
  1. Dockerfile +27 -2
  2. app.py +19 -0
  3. requirements.txt +4 -1
Dockerfile CHANGED
@@ -12,8 +12,13 @@ WORKDIR /app
12
  RUN apt-get update && apt-get install -y \
13
  gcc \
14
  g++ \
 
15
  && rm -rf /var/lib/apt/lists/*
16
 
 
 
 
 
17
  # Copy requirements first for better Docker layer caching
18
  COPY requirements.txt .
19
 
@@ -21,11 +26,31 @@ COPY requirements.txt .
21
  RUN pip install --no-cache-dir --upgrade pip && \
22
  pip install --no-cache-dir -r requirements.txt
23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
24
  # Copy application code
25
  COPY app.py .
26
 
27
- # Create a non-root user for security (optional but recommended)
28
- RUN useradd -m -u 1000 appuser && chown -R appuser:appuser /app
 
 
29
  USER appuser
30
 
31
  # Expose the port
 
12
  RUN apt-get update && apt-get install -y \
13
  gcc \
14
  g++ \
15
+ curl \
16
  && rm -rf /var/lib/apt/lists/*
17
 
18
+ # Create NLTK data directory with proper permissions
19
+ RUN mkdir -p /usr/local/nltk_data && chmod 755 /usr/local/nltk_data
20
+ ENV NLTK_DATA=/usr/local/nltk_data
21
+
22
  # Copy requirements first for better Docker layer caching
23
  COPY requirements.txt .
24
 
 
26
  RUN pip install --no-cache-dir --upgrade pip && \
27
  pip install --no-cache-dir -r requirements.txt
28
 
29
+ # Download all potentially needed NLTK data during build
30
+ # This ensures we have permissions and avoids runtime download issues
31
+ RUN python -c "import nltk; \
32
+ nltk.download('punkt', download_dir='/usr/local/nltk_data', quiet=True); \
33
+ nltk.download('punkt_tab', download_dir='/usr/local/nltk_data', quiet=True); \
34
+ nltk.download('stopwords', download_dir='/usr/local/nltk_data', quiet=True); \
35
+ nltk.download('averaged_perceptron_tagger', download_dir='/usr/local/nltk_data', quiet=True); \
36
+ print('NLTK data download completed successfully')"
37
+
38
+ # Verify NLTK data was downloaded correctly
39
+ RUN python -c "import nltk; \
40
+ try: \
41
+ nltk.data.find('tokenizers/punkt'); \
42
+ print('NLTK punkt tokenizer found successfully'); \
43
+ except LookupError: \
44
+ print('Warning: NLTK punkt tokenizer not found'); \
45
+ exit(1)"
46
+
47
  # Copy application code
48
  COPY app.py .
49
 
50
+ # Create a non-root user for security but ensure they can access NLTK data
51
+ RUN useradd -m -u 1000 appuser && \
52
+ chown -R appuser:appuser /app && \
53
+ chmod -R 755 /usr/local/nltk_data
54
  USER appuser
55
 
56
  # Expose the port
app.py CHANGED
@@ -1,5 +1,24 @@
1
  import os
2
  import gradio as gr
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3
  from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
4
  from llama_index.embeddings.cohere import CohereEmbedding
5
  from llama_index.llms.groq import Groq
 
1
  import os
2
  import gradio as gr
3
+
4
+ # Handle NLTK setup early with proper error handling
5
+ try:
6
+ import nltk
7
+ # Ensure NLTK data is available, try to download if missing
8
+ try:
9
+ nltk.data.find('tokenizers/punkt')
10
+ except LookupError:
11
+ print("NLTK punkt tokenizer not found, attempting to download...")
12
+ try:
13
+ nltk.download('punkt', quiet=True)
14
+ nltk.download('punkt_tab', quiet=True)
15
+ except Exception as e:
16
+ print(f"Warning: Could not download NLTK data: {e}")
17
+ print("This may cause issues with text processing")
18
+ except ImportError:
19
+ print("NLTK not available, continuing without it")
20
+
21
+ # Now import LlamaIndex components
22
  from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
23
  from llama_index.embeddings.cohere import CohereEmbedding
24
  from llama_index.llms.groq import Groq
requirements.txt CHANGED
@@ -13,4 +13,7 @@ transformers>=4.30.0,<5.0.0
13
 
14
  # Add explicit dependencies that might be causing issues
15
  pydantic>=2.0.0,<3.0.0
16
- fastapi>=0.100.0,<1.0.0
 
 
 
 
13
 
14
  # Add explicit dependencies that might be causing issues
15
  pydantic>=2.0.0,<3.0.0
16
+ fastapi>=0.100.0,<1.0.0
17
+
18
+ # Explicitly include NLTK with a compatible version
19
+ nltk>=3.8,<4.0