ayushmodi001 committed
Commit: caff036
Parent(s): 65509a0

final fixes
Dockerfile CHANGED
@@ -4,6 +4,11 @@ FROM python:3.11-slim
 # Set the working directory in the container
 WORKDIR /app
 
+# Set environment variables for optimization
+ENV CACHE_BASE_DIR="/tmp/opportunity_t5_model_cache"
+ENV ENABLE_LAZY_LOADING="True"
+ENV USE_ONNX_RUNTIME="True"
+
 # Create a virtual environment
 RUN python -m venv /opt/venv
 
@@ -17,9 +22,9 @@ COPY requirements.txt .
 RUN pip install --upgrade -r requirements.txt
 
 # Create cache directories for models
-RUN mkdir -p ./cache/transformers/optimized_model
-RUN mkdir -p ./cache/transformers/optimized_tokenizer
 # Create a temp cache directory that will be writable by appuser
+RUN mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_model
+RUN mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_tokenizer
 RUN mkdir -p /tmp/opportunity_t5_model_cache/contexts
 RUN chmod 777 /tmp/opportunity_t5_model_cache -R
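The ENV defaults and the mkdir layout above are what app/config.py reads back at runtime. A minimal sketch (illustrative, not part of this commit) that verifies the cache tree is writable before the app starts; the paths mirror the RUN mkdir -p lines, and check_cache_tree is a hypothetical helper name:

```python
# Illustrative startup check, assuming the directory layout from the Dockerfile.
import os

CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", "/tmp/opportunity_t5_model_cache")

def check_cache_tree(base: str) -> None:
    """Hypothetical helper: ensure each cache subdirectory exists and is writable."""
    for sub in ("transformers/optimized_model",
                "transformers/optimized_tokenizer",
                "contexts"):
        path = os.path.join(base, sub)
        os.makedirs(path, exist_ok=True)  # no-op when the image already created it
        if not os.access(path, os.W_OK):
            raise PermissionError(f"Cache directory is not writable: {path}")

if __name__ == "__main__":
    check_cache_tree(CACHE_BASE_DIR)
    print("cache tree OK")
```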
 
app/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (155 Bytes)

app/__pycache__/api.cpython-312.pyc ADDED
Binary file (7.54 kB)

app/__pycache__/config.cpython-312.pyc ADDED
Binary file (1.93 kB)

app/__pycache__/context_generator.cpython-312.pyc ADDED
Binary file (17.9 kB)

app/__pycache__/mcq_pipeline.cpython-312.pyc ADDED
Binary file (8.96 kB)

app/__pycache__/router.cpython-312.pyc ADDED
Binary file (7.71 kB)

app/__pycache__/utils.cpython-312.pyc ADDED
Binary file (3.46 kB)
 
app/api.py CHANGED
@@ -7,34 +7,42 @@ from .utils import get_logger
 from .question_generator import load_t5_pipeline
 import asyncio  # Add asyncio import for request queue
 import time  # Add time import for timing requests
-from .config import WARMUP_SAMPLE_SIZE
+from .config import WARMUP_SAMPLE_SIZE, ENABLE_LAZY_LOADING
 
 logger = get_logger(__name__)
 
 @asynccontextmanager
-async def lifespan(app: FastAPI): # --- Startup ---
+async def lifespan(app: FastAPI):
+    # --- Startup ---
     with ProcessPoolExecutor(max_workers=2) as executor:  # Use both cores
         app.state.process_executor = executor
         # Process one request at a time but fully utilize both cores
         app.state.request_semaphore = asyncio.Semaphore(1)  # Process one request at a time for T5
         app.state.request_queue = asyncio.Queue()  # Create a request queue
         logger.info("Application startup: ProcessPoolExecutor created with 2 workers.")
-        # Load models in the main process during startup
-        logger.info("Application startup: Loading models in main process...")
 
-        load_t5_pipeline() # This is synchronous logger.info("Application startup: Models loaded in main process.")
-        # Warm up the model with a simple inference
-        logger.info("Warming up model with sample inference...")
-        from .question_generator import generate_questions
-        # Create a more substantial warm-up with varied examples
-        warm_up_candidates = [
-            {"answer": "compiler", "context": "A compiler translates high-level programming language code to machine code.", "context_id": "warm_up_1"},
-            {"answer": "algorithm", "context": "An algorithm is a step-by-step procedure for solving a problem or accomplishing a task.", "context_id": "warm_up_2"}
-        ]
-        # Run warm-up in the executor to ensure it's done in the same way as actual requests
-        loop = asyncio.get_event_loop()
-        await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
-        logger.info("Model warm-up completed successfully.")
+        # Load models in the main process during startup
+        logger.info("Application startup: Loading model with optimized startup...")
+
+        # With lazy loading, this will only load the tokenizer at startup
+        load_t5_pipeline()
+        logger.info("Application startup: Initial model loading complete.")
+
+        # For lazy loading, skip the warm-up at startup to reduce startup time
+        # The warm-up will happen naturally on the first request
+        if not ENABLE_LAZY_LOADING and WARMUP_SAMPLE_SIZE > 0:
+            logger.info("Warming up model with sample inference...")
+            from .question_generator import generate_questions
+            # Create a simplified warm-up with minimal examples
+            warm_up_candidates = [
+                {"answer": "compiler", "context": "A compiler translates code.", "context_id": "warm_up"}
+            ]
+            # Run warm-up in the executor to ensure it's done in the same way as actual requests
+            loop = asyncio.get_event_loop()
+            await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
+            logger.info("Model warm-up completed successfully.")
+        else:
+            logger.info("Model warm-up skipped with lazy loading enabled.")
 
         # Start the request processor
         app.state.processor_task = asyncio.create_task(process_request_queue(app))
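The core pattern above, awaiting a blocking warm-up call through loop.run_in_executor so the event loop stays responsive, can be reproduced outside FastAPI. A self-contained sketch with a stand-in generate_questions (the real one lives in app.question_generator), using asyncio.get_running_loop(), the modern equivalent of the get_event_loop() call in the diff:

```python
# Self-contained sketch of the run_in_executor warm-up pattern used in lifespan().
import asyncio
from concurrent.futures import ProcessPoolExecutor

def generate_questions(candidates):
    # Stand-in for the real blocking model call in app.question_generator.
    return [f"What does a {c['answer']} do?" for c in candidates]

async def main():
    warm_up_candidates = [
        {"answer": "compiler", "context": "A compiler translates code.", "context_id": "warm_up"}
    ]
    with ProcessPoolExecutor(max_workers=2) as executor:
        loop = asyncio.get_running_loop()
        # The blocking call runs in a worker process; the event loop keeps serving.
        questions = await loop.run_in_executor(executor, generate_questions, warm_up_candidates)
        print(questions)

if __name__ == "__main__":
    asyncio.run(main())
```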
app/config.py CHANGED
@@ -23,10 +23,17 @@ T5_MODEL_NAME = "Ayush472/Technical_mcq_model"
 GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 GEMINI_MODEL_NAME = os.getenv("GEMINI_MODEL_NAME", "gemini-2.5-flash")
 
-# --- Question Generator Configuration ---
-OPTIMIZED_TOKENIZER_PATH = "./cache/transformers/optimized_tokenizer"
-OPTIMIZED_MODEL_PATH = "./cache/transformers/optimized_model"
-TRACED_MODEL_PATH = "./cache/transformers/optimized_model/traced_model.pt"
+# Model warm-up sample size (smaller is faster for startup)
+WARMUP_SAMPLE_SIZE = 1
+
+# Enable parallel processing within a request
+ENABLE_PARALLEL_PROCESSING = True
+
+# Enable lazy loading to reduce startup time - only loads tokenizer at startup, loads model on first use
+ENABLE_LAZY_LOADING = os.getenv("ENABLE_LAZY_LOADING", "True").lower() in ("true", "1", "t")
+
+# Use ONNX Runtime for faster inference when available
+USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")
 
 QUESTION_GENERATION_PARAMS = {
     "num_beams": 6,  # Reduced for faster processing
@@ -59,17 +66,12 @@ CONTEXT_CACHE_EXPIRY = 60 * 60 * 24 * 7  # 7 days in seconds
 # In production, this will use a temp directory that the container user has access to
 CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", settings.cache_base_dir)
 
-# Model warm-up sample size (smaller is faster for startup)
-WARMUP_SAMPLE_SIZE = 1
-
-# Enable parallel processing within a request
-ENABLE_PARALLEL_PROCESSING = True
-
-# Enable lazy loading to reduce startup time - only loads tokenizer at startup, loads model on first use
-ENABLE_LAZY_LOADING = os.getenv("ENABLE_LAZY_LOADING", "True").lower() in ("true", "1", "t")
-
-# Use ONNX Runtime for faster inference when available
-USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")
+# Define paths based on the CACHE_BASE_DIR for consistency
+CACHE_DIR = os.path.join(CACHE_BASE_DIR, "transformers")
+OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
 
 # --- Distractor Generator Configuration ---
 # Number of QA pairs per batch for distractor generation.
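One subtlety of the flag-parsing idiom used for ENABLE_LAZY_LOADING and USE_ONNX_RUNTIME: only "true", "1", and "t" (case-insensitive) count as enabled, so values like "yes" or "on" silently disable a feature. A quick demonstration (env_flag is an illustrative name, not one from the config):

```python
# Demonstrates the exact boolean parsing used for the two env flags above.
def env_flag(value: str) -> bool:
    return value.lower() in ("true", "1", "t")

for raw in ("True", "1", "t", "False", "yes", "on"):
    print(f"{raw!r:>8} -> {env_flag(raw)}")
# 'True', '1', 't' parse as True; 'False', 'yes', 'on' all parse as False
```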
app/question_generator.py CHANGED
@@ -2,7 +2,7 @@ import torch
 from transformers import T5ForConditionalGeneration, T5Tokenizer, AutoModelForSeq2SeqLM
 from .config import (
     T5_MODEL_NAME, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH, TRACED_MODEL_PATH,
-    QUESTION_GENERATION_PARAMS, ENABLE_LAZY_LOADING, USE_ONNX_RUNTIME
+    QUESTION_GENERATION_PARAMS, ENABLE_LAZY_LOADING, USE_ONNX_RUNTIME, ONNX_MODEL_PATH
 )
 from .utils import get_logger
 import re
@@ -27,9 +27,6 @@ if USE_ONNX_RUNTIME:
 else:
     ONNX_AVAILABLE = False
 
-# Define path for ONNX model
-ONNX_MODEL_PATH = os.path.join(OPTIMIZED_MODEL_PATH, "model.onnx")
-
 logger = get_logger(__name__)
 
 # --- Global Model and Tokenizer ---
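ENABLE_LAZY_LOADING defers the heavy model load to the first request while only the tokenizer loads at startup. The module's actual implementation is not shown in this diff; a minimal sketch of the idea, with hypothetical names throughout:

```python
# Hedged sketch of lazy loading; get_model, load_model_from_disk, and _model
# are assumptions, not names taken from app/question_generator.py.
from typing import Optional

_model: Optional[object] = None

def load_model_from_disk():
    # Placeholder for the expensive T5/ONNX load so the sketch runs standalone.
    return object()

def get_model():
    """Materialize the model on first use instead of at application startup."""
    global _model
    if _model is None:
        _model = load_model_from_disk()
    return _model
```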
scripts/OPTIMIZATION.md CHANGED
@@ -18,7 +18,8 @@ python -m scripts.setup_optimization
 
 This script will:
 1. Install required dependencies for ONNX runtime
-2. Run model optimization to create:
+2. Create necessary directories with appropriate permissions
+3. Run model optimization to create:
    - An optimized standard model
    - A JIT traced model
    - An ONNX model (if ONNX runtime is available)
@@ -29,6 +30,7 @@ The following environment variables control optimization behavior:
 
 - `ENABLE_LAZY_LOADING`: Set to "True" (default) to enable lazy loading, "False" to disable
 - `USE_ONNX_RUNTIME`: Set to "True" (default) to use ONNX runtime when available, "False" to disable
+- `CACHE_BASE_DIR`: Set to override the default cache directory (defaults to "/tmp/opportunity_t5_model_cache")
 
 Example:
 ```bash
@@ -37,6 +39,9 @@ $env:ENABLE_LAZY_LOADING="False"
 
 # Disable ONNX runtime
 $env:USE_ONNX_RUNTIME="False"
+
+# Use a custom cache directory
+$env:CACHE_BASE_DIR="/path/to/writable/directory"
 ```
 
 ## Performance Comparison
@@ -49,6 +54,22 @@ $env:USE_ONNX_RUNTIME="False"
 | ONNX | ~20s | ~1s | ~0.6s |
 | Lazy + ONNX | ~5s | ~15s | ~0.6s |
 
+## Hugging Face Spaces Compatibility
+
+When running in Hugging Face Spaces, the optimized models are stored in the `/tmp/opportunity_t5_model_cache` directory, which is created with appropriate permissions during container startup. This directory is writable by the application user and persists across application restarts.
+
+If you encounter permission errors when optimizing models, you can:
+
+1. Make sure the `CACHE_BASE_DIR` environment variable is set to a writable location
+2. Run the setup script with appropriate permissions
+3. Manually create the directories with the right permissions:
+
+```bash
+mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_tokenizer
+mkdir -p /tmp/opportunity_t5_model_cache/transformers/optimized_model
+chmod -R 777 /tmp/opportunity_t5_model_cache
+```
+
 ## Troubleshooting
 
 If you encounter issues:
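The `USE_ONNX_RUNTIME` flag documented above only takes effect when onnxruntime can actually be imported; app/question_generator.py gates this with an ONNX_AVAILABLE flag. A standalone sketch of that probe; the try/except import inside the if branch is an assumption inferred from the visible else branch in the diff:

```python
# Mirrors the availability check around ONNX_AVAILABLE in app/question_generator.py.
import os

USE_ONNX_RUNTIME = os.getenv("USE_ONNX_RUNTIME", "True").lower() in ("true", "1", "t")

if USE_ONNX_RUNTIME:
    try:
        import onnxruntime  # noqa: F401  (presence check only)
        ONNX_AVAILABLE = True
    except ImportError:
        ONNX_AVAILABLE = False
else:
    ONNX_AVAILABLE = False

print(f"ONNX runtime available: {ONNX_AVAILABLE}")
```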
scripts/optimize_model.py CHANGED
@@ -15,12 +15,22 @@ logging.basicConfig(
 )
 logger = logging.getLogger(__name__)
 
-# Paths for optimized models
-CACHE_DIR = "./cache/transformers"
-OPTIMIZED_TOKENIZER_PATH = f"{CACHE_DIR}/optimized_tokenizer"
-OPTIMIZED_MODEL_PATH = f"{CACHE_DIR}/optimized_model"
-TRACED_MODEL_PATH = f"{CACHE_DIR}/optimized_model/traced_model.pt"
-ONNX_MODEL_PATH = f"{CACHE_DIR}/optimized_model/model.onnx"
+# Import CACHE_BASE_DIR from config if possible, or use a safe default
+try:
+    # First try to import from the app config
+    sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+    from app.config import CACHE_BASE_DIR
+except ImportError:
+    # Fallback to a safe location if config can't be imported
+    CACHE_BASE_DIR = os.getenv("CACHE_BASE_DIR", "/tmp/opportunity_t5_model_cache")
+    logger.info(f"Could not import from config, using fallback cache directory: {CACHE_BASE_DIR}")
+
+# Paths for optimized models - using the writable CACHE_BASE_DIR from config
+CACHE_DIR = os.path.join(CACHE_BASE_DIR, "transformers")
+OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
 
 def create_dummy_input(tokenizer, device):
     """Create a dummy input for tracing"""
@@ -37,10 +47,27 @@ def optimize_t5_model(model_name, output_dir=CACHE_DIR, use_jit=True, use_onnx=T
     3. Creating an ONNX version for even faster inference
     """
     try:
-        # Ensure directory exists
-        os.makedirs(output_dir, exist_ok=True)
+        # Use output_dir if provided (for CLI compatibility)
+        global CACHE_DIR, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH, TRACED_MODEL_PATH, ONNX_MODEL_PATH
+        if output_dir != CACHE_DIR:
+            CACHE_DIR = output_dir
+            OPTIMIZED_TOKENIZER_PATH = os.path.join(CACHE_DIR, "optimized_tokenizer")
+            OPTIMIZED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model")
+            TRACED_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "traced_model.pt")
+            ONNX_MODEL_PATH = os.path.join(CACHE_DIR, "optimized_model", "model.onnx")
+
+        logger.info(f"Creating directories: {CACHE_DIR}, {OPTIMIZED_TOKENIZER_PATH}, {OPTIMIZED_MODEL_PATH}")
+        os.makedirs(CACHE_DIR, exist_ok=True)
         os.makedirs(OPTIMIZED_TOKENIZER_PATH, exist_ok=True)
         os.makedirs(OPTIMIZED_MODEL_PATH, exist_ok=True)
+        try:
+            import stat
+            mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
+            os.chmod(CACHE_DIR, mode)
+            os.chmod(OPTIMIZED_TOKENIZER_PATH, mode)
+            os.chmod(OPTIMIZED_MODEL_PATH, mode)
+        except Exception as e:
+            logger.warning(f"Could not set directory permissions: {e}. This may be normal in some environments.")
 
         # Determine device
         device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
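The "JIT traced model" this script saves to TRACED_MODEL_PATH comes from torch.jit.trace. A minimal, runnable illustration on a toy module; this is not the repo's actual tracing code, which uses the real tokenizer and the dummy inputs from create_dummy_input:

```python
# Toy illustration of the tracing step; TinyModule stands in for the T5 model.
import torch

class TinyModule(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.proj = torch.nn.Linear(8, 8)

    def forward(self, x):
        return self.proj(x)

model = TinyModule().eval()
example = torch.randn(1, 8)                # analogous to create_dummy_input(...)
traced = torch.jit.trace(model, example)   # records the ops run for these inputs
torch.jit.save(traced, "traced_model.pt")  # analogous to TRACED_MODEL_PATH
```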
scripts/setup_optimization.py CHANGED
@@ -44,6 +44,32 @@ def run_optimization():
     script_dir = os.path.dirname(os.path.abspath(__file__))
     sys.path.insert(0, os.path.dirname(script_dir))
 
+    # First, import config to get the proper cache paths
+    try:
+        from app.config import CACHE_BASE_DIR, CACHE_DIR, OPTIMIZED_TOKENIZER_PATH, OPTIMIZED_MODEL_PATH
+
+        # Create all necessary directories with appropriate permissions
+        os.makedirs(CACHE_BASE_DIR, exist_ok=True)
+        os.makedirs(CACHE_DIR, exist_ok=True)
+        os.makedirs(OPTIMIZED_TOKENIZER_PATH, exist_ok=True)
+        os.makedirs(OPTIMIZED_MODEL_PATH, exist_ok=True)
+
+        # Set permissions to ensure they're writable
+        try:
+            import stat
+            # Add write permissions to user, group and others
+            mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
+            os.chmod(CACHE_BASE_DIR, mode)
+            os.chmod(CACHE_DIR, mode)
+            os.chmod(OPTIMIZED_TOKENIZER_PATH, mode)
+            os.chmod(OPTIMIZED_MODEL_PATH, mode)
+            logger.info(f"Set permissions on directories: {CACHE_BASE_DIR}, {CACHE_DIR}, etc.")
+        except Exception as e:
+            logger.warning(f"Could not set directory permissions: {e}. This may be normal in some environments.")
+
+    except ImportError as e:
+        logger.warning(f"Could not import config: {e}. Will rely on optimize_model.py defaults.")
+
     from scripts.optimize_model import optimize_t5_model
 
     # Run optimization with both JIT and ONNX
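The stat mode built in both scripts, S_IRWXU | S_IRWXG | S_IRWXO, is exactly 0o777 (read, write, and execute for user, group, and others), matching the Dockerfile's chmod 777. A one-liner to confirm:

```python
# Verifies the permission arithmetic used by optimize_model.py and this script.
import stat

mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
assert mode == 0o777
print(oct(mode))  # prints 0o777
```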
space.json ADDED
Added as an empty file.