ayushmodi001 committed
Commit: caff036
Parent(s): 65509a0

final fixes
Dockerfile CHANGED
@@ -4,6 +4,11 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /app
 
+# Set environment variables for optimization
+ENV CACHE_BASE_DIR="/tmp/opportunity_t5_model_cache"
+ENV ENABLE_LAZY_LOADING="True"
+ENV USE_ONNX_RUNTIME="True"
+
 # Create a virtual environment
 RUN python -m venv /opt/venv
 
@@ -17,9 +22,9 @@ COPY requirements.txt .
 RUN pip install --upgrade -r requirements.txt
 
 # Create cache directories for models
-RUN mkdir -p ./cache/transformers/optimized_model
-RUN mkdir -p ./cache/transformers/optimized_tokenizer
 # Create a temp cache directory that will be writable by appuser
+RUN mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_model
+RUN mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_tokenizer
 RUN mkdir -p /tmp/opportunity_t5_model_cache/contexts
 RUN chmod 777 /tmp/opportunity_t5_model_cache -R
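The ENV defaults and the mkdir layout above are what app/config.py reads back at runtime. A minimal sketch (illustrative, not part of this commit) that verifies the cache tree is writable before the app starts; the paths mirror the RUN mkdir -p lines, and check_cache_tree is a hypothetical helper name:

```python
# Illustrative startup check, assuming the directory layout from the Dockerfile.
import os

CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", "/tmp/opportunity_t5_model_cache")

def check_cache_tree(base: str) -> None:
    """Hypothetical helper: ensure each cache subdirectory exists and is writable."""
    for sub in ("transformers/optimized_model",
                "transformers/optimized_tokenizer",
                "contexts"):
        path = os.path.join(base, sub)
        os.makedirs(path, exist_ok=True)  # no-op when the image already created it
        if not os.access(path, os.W_OK):
            raise PermissionError(f"Cache directory is not writable: {path}")

if __name__ == "__main__":
    check_cache_tree(CACHE_BASE_DIR)
    print("cache tree OK")
```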
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes)

app/__pycache__/api.cpython-312.pyc ADDED
Binary file (7.54 kB)

app/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.93 kB)

app/__pycache__/context_generator.cpython-312.pyc ADDED
Binary file (17.9 kB)

app/__pycache__/mcq_pipeline.cpython-312.pyc ADDED
Binary file (8.96 kB)

app/__pycache__/router.cpython-312.pyc ADDED
Binary file (7.71 kB)

app/__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.46 kB)
 
app/api.py CHANGED
@@ -7,34 +7,42 @@ from .utils import get_logger
 from .question_generator import load_t5_pipeline
 import asyncio  # Add asyncio import for request queue
 import time  # Add time import for timing requests
-from .config import WARMUP_SAMPLE_SIZE
+from .config import WARMUP_SAMPLE_SIZE, ENABLE_LAZY_LOADING
 
 logger = get_logger(__name__)
 
 @asynccontextmanager
-async def lifespan(app: FastAPI): # --- Startup ---
+async def lifespan(app: FastAPI):
+    # --- Startup ---
     with ProcessPoolExecutor(max_workers=2) as executor:  # Use both cores
         app.state.process_executor = executor
         # Process one request at a time but fully utilize both cores
         app.state.request_semaphore = asyncio.Semaphore(1)  # Process one request at a time for T5
         app.state.request_queue = asyncio.Queue()  # Create a request queue
         logger.info("Application startup: ProcessPoolExecutor created with 2 workers.")
-        # Load models in the main process during startup
-        logger.info("Application startup: Loading models in main process...")
 
-        load_t5_pipeline() # This is synchronous logger.info("Application startup: Models loaded in main process.")
-        # Warm up the model with a simple inference
-        logger.info("Warming up model with sample inference...")
-        from .question_generator import generate_questions
-        # Create a more substantial warm-up with varied examples
-        warm_up_candidates = [
-            {"answer": "compiler", "context": "A compiler translates high-level programming language code to machine code.", "context_id": "warm_up_1"},
-            {"answer": "algorithm", "context": "An algorithm is a step-by-step procedure for solving a problem or accomplishing a task.", "context_id": "warm_up_2"}
-        ]
-        # Run warm-up in the executor to ensure it's done in the same way as actual requests
-        loop = asyncio.get_event_loop()
-        await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
-        logger.info("Model warm-up completed successfully.")
+        # Load models in the main process during startup
+        logger.info("Application startup: Loading model with optimized startup...")
+
+        # With lazy loading, this will only load the tokenizer at startup
+        load_t5_pipeline()
+        logger.info("Application startup: Initial model loading complete.")
+
+        # For lazy loading, skip the warm-up at startup to reduce startup time
+        # The warm-up will happen naturally on the first request
+        if not ENABLE_LAZY_LOADING and WARMUP_SAMPLE_SIZE > 0:
+            logger.info("Warming up model with sample inference...")
+            from .question_generator import generate_questions
+            # Create a simplified warm-up with minimal examples
+            warm_up_candidates = [
+                {"answer": "compiler", "context": "A compiler translates code.", "context_id": "warm_up"}
+            ]
+            # Run warm-up in the executor to ensure it's done in the same way as actual requests
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
+            logger.info("Model warm-up completed successfully.")
+        else:
+            logger.info("Model warm-up skipped with lazy loading enabled.")
 
         # Start the request processor
         app.state.processor_task = asyncio.create_task(process_request_queue(app))
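The core pattern above, awaiting a blocking warm-up call through loop.run_in_executor so the event loop stays responsive, can be reproduced outside FastAPI. A self-contained sketch with a stand-in generate_questions (the real one lives in app.question_generator), using asyncio.get_running_loop(), the modern equivalent of the get_event_loop() call in the diff:

```python
# Self-contained sketch of the run_in_executor warm-up pattern used in lifespan().
import asyncio
from concurrent.futures import ProcessPoolExecutor

def generate_questions(candidates):
    # Stand-in for the real blocking model call in app.question_generator.
    return [f"What does a {c['answer']} do?" for c in candidates]

async def main():
    warm_up_candidates = [
        {"answer": "compiler", "context": "A compiler translates code.", "context_id": "warm_up"}
    ]
    with ProcessPoolExecutor(max_workers=2) as executor:
        loop = asyncio.get_running_loop()
        # The blocking call runs in a worker process; the event loop keeps serving.
        questions = await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
        print(questions)

if __name__ == "__main__":
    asyncio.run(main())
```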
app/config.py CHANGED
@@ -23,10 +23,17 @@ T5_MODEL_NAME = "Ayush472/Technical_mcq_model"
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
 
-# --- Question Generator Configuration ---
-OPTIMIZED_TOKENIZER_PATH = "./cache/transformers/optimized_tokenizer"
-OPTIMIZED_MODEL_PATH = "./cache/transformers/optimized_model"
-TRACED_MODEL_PATH = "./cache/transformers/optimized_model/traced_model.pt"
+# Model warm-up sample size (smaller is faster for startup)
+WARMUP_SAMPLE_SIZE = 1
+
+# Enable parallel processing within a request
+ENABLE_PARALLEL_PROCESSING = True
+
+# Enable lazy loading to reduce startup time - only loads tokenizer at startup, loads model on first use
+ENABLE_LAZY_LOADING = os.getenv("ENABLE_LAZY_LOADING", "True").lower() in ("true", "1", "t")
+
+# Use ONNX Runtime for faster inference when available
+USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")
 
 QUESTION_GENERATION_PARAMS = {
     "num_beams": 6,  # Reduced for faster processing
@@ -59,17 +66,12 @@ CONTEXT_CACHE_EXPIRY = 60 * 60 * 24 * 7  # 7 days in seconds
 # In production, this will use a temp directory that the container user has access to
 CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", settings.cache_base_dir)
 
-# Model warm-up sample size (smaller is faster for startup)
-WARMUP_SAMPLE_SIZE = 1
-
-# Enable parallel processing within a request
-ENABLE_PARALLEL_PROCESSING = True
-
-# Enable lazy loading to reduce startup time - only loads tokenizer at startup, loads model on first use
-ENABLE_LAZY_LOADING = os.getenv("ENABLE_LAZY_LOADING", "True").lower() in ("true", "1", "t")
-
-# Use ONNX Runtime for faster inference when available
-USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")
+# Define paths based on the CACHE_BASE_DIR for consistency
+CACHE_DIR = os.path.join(CACHE_BASE_DIR, "transformers")
+OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
 
 # --- Distractor Generator Configuration ---
 # Number of QA pairs per batch for distractor generation.
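One subtlety of the flag-parsing idiom used for ENABLE_LAZY_LOADING and USE_ONNX_RUNTIME: only "true", "1", and "t" (case-insensitive) count as enabled, so values like "yes" or "on" silently disable a feature. A quick demonstration (env_flag is an illustrative name, not one from the config):

```python
# Demonstrates the exact boolean parsing used for the two env flags above.
def env_flag(value: str) -> bool:
    return value.lower() in ("true", "1", "t")

for raw in ("True", "1", "t", "False", "yes", "on"):
    print(f"{raw!r:>8} -> {env_flag(raw)}")
# 'True', '1', 't' parse as True; 'False', 'yes', 'on' all parse as False
```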
app/question_generator.py CHANGED
@@ -2,7 +2,7 @@ import torch
 from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM
 from .config import (
     T5_MODEL_NAME, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH, TRACED_MODEL_PATH,
-    QUESTION_GENERATION_PARAMS, ENABLE_LAZY_LOADING, USE_ONNX_RUNTIME
+    QUESTION_GENERATION_PARAMS, ENABLE_LAZY_LOADING, USE_ONNX_RUNTIME, ONNX_MODEL_PATH
 )
 from .utils import get_logger
 import re
@@ -27,9 +27,6 @@ if USE_ONNX_RUNTIME:
 else:
     ONNX_AVAILABLE = False
 
-# Define path for ONNX model
-ONNX_MODEL_PATH = os.path.join(OPTIMIZED_MODEL_PATH, "model.onnx")
-
 logger = get_logger(__name__)
 
 # --- Global Model and Tokenizer ---
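ENABLE_LAZY_LOADING defers the heavy model load to the first request while only the tokenizer loads at startup. The module's actual implementation is not shown in this diff; a minimal sketch of the idea, with hypothetical names throughout:

```python
# Hedged sketch of lazy loading; get_model, load_model_from_disk, and _model
# are assumptions, not names taken from app/question_generator.py.
from typing import Optional

_model: Optional[object] = None

def load_model_from_disk():
    # Placeholder for the expensive T5/ONNX load so the sketch runs standalone.
    return object()

def get_model():
    """Materialize the model on first use instead of at application startup."""
    global _model
    if _model is None:
        _model = load_model_from_disk()
    return _model
```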
scripts/OPTIMIZATION.md CHANGED
@@ -18,7 +18,8 @@ python -m scripts.setup_optimization
 
 This script will:
 1. Install required dependencies for ONNX runtime
-2. Run model optimization to create:
+2. Create necessary directories with appropriate permissions
+3. Run model optimization to create:
    - An optimized standard model
    - A JIT traced model
    - An ONNX model (if ONNX runtime is available)
@@ -29,6 +30,7 @@ The following environment variables control optimization behavior:
 
 - `ENABLE_LAZY_LOADING`: Set to "True" (default) to enable lazy loading, "False" to disable
 - `USE_ONNX_RUNTIME`: Set to "True" (default) to use ONNX runtime when available, "False" to disable
+- `CACHE_BASE_DIR`: Set to override the default cache directory (defaults to "/tmp/opportunity_t5_model_cache")
 
 Example:
 ```bash
@@ -37,6 +39,9 @@ $env:ENABLE_LAZY_LOADING="False"
 
 # Disable ONNX runtime
 $env:USE_ONNX_RUNTIME="False"
+
+# Use a custom cache directory
+$env:CACHE_BASE_DIR="/path/to/writable/directory"
 ```
 
 ## Performance Comparison
@@ -49,6 +54,22 @@ $env:USE_ONNX_RUNTIME="False"
 | ONNX | ~20s | ~1s | ~0.6s |
 | Lazy + ONNX | ~5s | ~15s | ~0.6s |
 
+## Hugging Face Spaces Compatibility
+
+When running in Hugging Face Spaces, the optimized models are stored in the `/tmp/opportunity_t5_model_cache` directory, which is created with appropriate permissions during container startup. This directory is writable by the application user and persists across application restarts.
+
+If you encounter permission errors when optimizing models, you can:
+
+1. Make sure the `CACHE_BASE_DIR` environment variable is set to a writable location
+2. Run the setup script with appropriate permissions
+3. Manually create the directories with the right permissions:
+
+```bash
+mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_tokenizer
+mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_model
+chmod -R 777 /tmp/opportunity_t5_model_cache
+```
+
 ## Troubleshooting
 
 If you encounter issues:
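The `USE_ONNX_RUNTIME` flag documented above only takes effect when onnxruntime can actually be imported; app/question_generator.py gates this with an ONNX_AVAILABLE flag. A standalone sketch of that probe; the try/except import inside the if branch is an assumption inferred from the visible else branch in the diff:

```python
# Mirrors the availability check around ONNX_AVAILABLE in app/question_generator.py.
import os

USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")

if USE_ONNX_RUNTIME:
    try:
        import onnxruntime  # noqa: F401  (presence check only)
        ONNX_AVAILABLE = True
    except ImportError:
        ONNX_AVAILABLE = False
else:
    ONNX_AVAILABLE = False

print(f"ONNX runtime available: {ONNX_AVAILABLE}")
```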
scripts/optimize_model.py CHANGED
@@ -15,12 +15,22 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Paths for optimized models
-CACHE_DIR = "./cache/transformers"
-OPTIMIZED_TOKENIZER_PATH = f"{CACHE_DIR}/optimized_tokenizer"
-OPTIMIZED_MODEL_PATH = f"{CACHE_DIR}/optimized_model"
-TRACED_MODEL_PATH = f"{CACHE_DIR}/optimized_model/traced_model.pt"
-ONNX_MODEL_PATH = f"{CACHE_DIR}/optimized_model/model.onnx"
+# Import CACHE_BASE_DIR from config if possible, or use a safe default
+try:
+    # First try to import from the app config
+    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    from app.config import CACHE_BASE_DIR
+except ImportError:
+    # Fallback to a safe location if config can't be imported
+    CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", "/tmp/opportunity_t5_model_cache")
+    logger.info(f"Could not import from config, using fallback cache directory: {CACHE_BASE_DIR}")
+
+# Paths for optimized models - using the writable CACHE_BASE_DIR from config
+CACHE_DIR = os.path.join(CACHE_BASE_DIR, "transformers")
+OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
 
 def create_dummy_input(tokenizer, device):
     """Create a dummy input for tracing"""
@@ -37,10 +47,27 @@ def optimize_t5_model(model_name, output_dir=CACHE_DIR, use_jit=True, use_onnx=T
     3. Creating an ONNX version for even faster inference
     """
     try:
-        # Ensure directory exists
-        os.makedirs(output_dir, exist_ok=True)
+        # Use output_dir if provided (for CLI compatibility)
+        global CACHE_DIR, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH, TRACED_MODEL_PATH, ONNX_MODEL_PATH
+        if output_dir != CACHE_DIR:
+            CACHE_DIR = output_dir
+            OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+            OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+            TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+            ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
+
+        logger.info(f"Creating directories: {CACHE_DIR}, {OPTIMIZED_TOKENIZER_PATH}, {OPTIMIZED_MODEL_PATH}")
+        os.makedirs(CACHE_DIR, exist_ok=True)
         os.makedirs(OPTIMIZED_TOKENIZER_PATH, exist_ok=True)
         os.makedirs(OPTIMIZED_MODEL_PATH, exist_ok=True)
+        try:
+            import stat
+            mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
+            os.chmod(CACHE_DIR, mode)
+            os.chmod(OPTIMIZED_TOKENIZER_PATH, mode)
+            os.chmod(OPTIMIZED_MODEL_PATH, mode)
+        except Exception as e:
+            logger.warning(f"Could not set directory permissions: {e}. This may be normal in some environments.")
 
         # Determine device
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
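The "JIT traced model" this script saves to TRACED_MODEL_PATH comes from torch.jit.trace. A minimal, runnable illustration on a toy module; this is not the repo's actual tracing code, which uses the real tokenizer and the dummy inputs from create_dummy_input:

```python
# Toy illustration of the tracing step; TinyModule stands in for the T5 model.
import torch

class TinyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)

model = TinyModule().eval()
example = torch.randn(1, 8)                # analogous to create_dummy_input(...)
traced = torch.jit.trace(model, example)   # records the ops run for these inputs
torch.jit.save(traced, "traced_model.pt")  # analogous to TRACED_MODEL_PATH
```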
scripts/setup_optimization.py CHANGED
@@ -44,6 +44,32 @@ def run_optimization():
     script_dir = os.path.dirname(os.path.abspath(__file__))
     sys.path.insert(0, os.path.dirname(script_dir))
 
+    # First, import config to get the proper cache paths
+    try:
+        from app.config import CACHE_BASE_DIR, CACHE_DIR, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH
+
+        # Create all necessary directories with appropriate permissions
+        os.makedirs(CACHE_BASE_DIR, exist_ok=True)
+        os.makedirs(CACHE_DIR, exist_ok=True)
+        os.makedirs(OPTIMIZED_TOKENIZER_PATH, exist_ok=True)
+        os.makedirs(OPTIMIZED_MODEL_PATH, exist_ok=True)
+
+        # Set permissions to ensure they're writable
+        try:
+            import stat
+            # Add write permissions to user, group and others
+            mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
+            os.chmod(CACHE_BASE_DIR, mode)
+            os.chmod(CACHE_DIR, mode)
+            os.chmod(OPTIMIZED_TOKENIZER_PATH, mode)
+            os.chmod(OPTIMIZED_MODEL_PATH, mode)
+            logger.info(f"Set permissions on directories: {CACHE_BASE_DIR}, {CACHE_DIR}, etc.")
+        except Exception as e:
+            logger.warning(f"Could not set directory permissions: {e}. This may be normal in some environments.")
+
+    except ImportError as e:
+        logger.warning(f"Could not import config: {e}. Will rely on optimize_model.py defaults.")
+
     from scripts.optimize_model import optimize_t5_model
 
     # Run optimization with both JIT and ONNX
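The stat mode built in both scripts, S_IRWXU | S_IRWXG | S_IRWXO, is exactly 0o777 (read, write, and execute for user, group, and others), matching the Dockerfile's chmod 777. A one-liner to confirm:

```python
# Verifies the permission arithmetic used by optimize_model.py and this script.
import stat

mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
assert mode == 0o777
print(oct(mode))  # prints 0o777
```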
space.json ADDED
Added as an empty file.