Testsdft committed
Commit 42fb1be · verified · 1 Parent(s): 2eaea30

Upload 4 files

Files changed (4):
  1. Dockerfile +27 -0
  2. main.py +139 -0
  3. modeling_rx_codex_v3.py +104 -0
  4. requirements.txt +7 -0
Dockerfile ADDED
@@ -0,0 +1,27 @@
+ # Use an official Python base image
+ FROM python:3.11-slim
+
+ # Set the working directory inside the container
+ WORKDIR /code
+
+ # Set environment variable for Hugging Face cache
+ ENV HF_HOME=/code/.cache
+
+ # Create the cache directory and make it fully writable.
+ RUN mkdir -p /code/.cache && chmod -R 777 /code/.cache
+
+ # Copy and install requirements
+ COPY ./requirements.txt /code/requirements.txt
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # --- CRITICAL: Copy our custom model's code ---
+ COPY ./modeling_rx_codex_v3.py /code/modeling_rx_codex_v3.py
+
+ # Copy application code (renamed to app.py to match the "app:app" target below)
+ COPY ./main.py /code/app.py
+
+ # Expose the port Hugging Face Spaces expects
+ EXPOSE 7860
+
+ # Run the application
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
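A quick way to sanity-check the container is the root endpoint, which doubles as a health check. The sketch below is illustrative only, assuming the image was built and started locally (for example: docker build -t rx-codex-api . followed by docker run -p 7860:7860 rx-codex-api) and that the requests package is available on the host; neither the image tag nor requests is part of this commit.

    import requests

    # The root endpoint reports whether the model finished loading.
    resp = requests.get("http://localhost:7860/")
    resp.raise_for_status()
    print(resp.json())  # e.g. {"message": "...", "model_status": "loaded"}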
main.py ADDED
@@ -0,0 +1,139 @@
+ # main.py (Corrected)
+
+ import logging
+ from contextlib import asynccontextmanager
+ import torch
+ from fastapi import FastAPI, HTTPException
+ from pydantic import BaseModel
+ from transformers import AutoTokenizer, GPT2Config
+ from huggingface_hub import hf_hub_download
+
+ # --- IMPORTANT: We must import our custom model class directly ---
+ # This assumes 'modeling_rx_codex_v3.py' is in the same directory
+ from modeling_rx_codex_v3 import Rx_Codex_V3_Custom_Model_Class
+
+ # --- Configuration ---
+ HF_REPO_ID = "rxmha125/Rx_Codex_V1_Tiny_V3"
+ MODEL_LOAD_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+ # --- Logging Setup ---
+ logging.basicConfig(level=logging.INFO)
+ logger = logging.getLogger(__name__)
+
+ # --- Global variables ---
+ model = None
+ tokenizer = None
+
+ # --- Application Lifespan (Model Loading) ---
+ @asynccontextmanager
+ async def lifespan(app: FastAPI):
+     global model, tokenizer
+     logger.info(f"API Startup: Explicitly loading model '{HF_REPO_ID}' to device '{MODEL_LOAD_DEVICE}'...")
+
+     try:
+         # Load tokenizer as before
+         tokenizer = AutoTokenizer.from_pretrained(HF_REPO_ID)
+         logger.info("✅ Tokenizer loaded successfully.")
+
+         # --- EXPLICIT MODEL LOADING ---
+         # 1. Load the configuration file
+         config = GPT2Config.from_pretrained(HF_REPO_ID)
+         logger.info("✅ Config loaded successfully.")
+
+         # 2. Instantiate our custom model with the config
+         model = Rx_Codex_V3_Custom_Model_Class(config)
+         logger.info("✅ Custom model architecture instantiated.")
+
+         # 3. Download the model weights file specifically
+         weights_path = hf_hub_download(repo_id=HF_REPO_ID, filename="pytorch_model.bin")
+         logger.info("✅ Model weights downloaded successfully.")
+
+         # 4. Load the state dictionary into our custom model
+         state_dict = torch.load(weights_path, map_location=MODEL_LOAD_DEVICE)
+         model.load_state_dict(state_dict)
+         logger.info("✅ Weights loaded into custom model successfully.")
+
+         # 5. Move to device and set to evaluation mode
+         model.to(MODEL_LOAD_DEVICE)
+         model.eval()
+         logger.info("✅ Model is fully loaded and ready on the target device.")
+
+     except Exception as e:
+         logger.error(f"❌ FATAL: An error occurred during model loading: {e}", exc_info=True)
+         # Set model to None to ensure API returns "not ready"
+         model = None
+         tokenizer = None
+
+     yield
+
+     # --- Code below this line runs on shutdown ---
+     logger.info("API Shutting down.")
+     model = None
+     tokenizer = None
+
+ # --- Initialize FastAPI ---
+ app = FastAPI(
+     title="Rx Codex V1-Tiny-V3 API",
+     description="An API for generating text with the Rx_Codex_V1_Tiny_V3 model.",
+     lifespan=lifespan
+ )
+
+ # --- Pydantic Models for API Data Validation ---
+ class GenerationRequest(BaseModel):
+     prompt: str
+     max_new_tokens: int = 150
+     temperature: float = 0.7
+     top_k: int = 50
+
+ class GenerationResponse(BaseModel):
+     generated_text: str
+
+ # --- API Endpoints ---
+ @app.get("/")
+ def root():
+     """A simple endpoint to check if the API is running."""
+     status = "loaded" if model and tokenizer else "not loaded"
+     return {"message": "Rx Codex V1-Tiny-V3 API is running", "model_status": status}
+
+ @app.post("/generate", response_model=GenerationResponse)
+ async def generate_text(request: GenerationRequest):
+     """The main endpoint to generate text from a prompt."""
+     if not model or not tokenizer:
+         raise HTTPException(status_code=503, detail="Model is not ready. Please try again later.")
+
+     logger.info(f"Received generation request for prompt: '{request.prompt}'")
+
+     formatted_prompt = f"### Human:\n{request.prompt}\n\n### Assistant:"
+     inputs = tokenizer(formatted_prompt, return_tensors="pt").to(MODEL_LOAD_DEVICE)
+
+     # --- NOTE: Our custom model does not have a .generate() method ---
+     # We must use our manual generation loop
+     output_ids = inputs["input_ids"]
+     with torch.no_grad():
+         for _ in range(request.max_new_tokens):
+             outputs = model(output_ids)
+             next_token_logits = outputs['logits'][:, -1, :]
+
+             # Apply temperature
+             if request.temperature > 0:
+                 next_token_logits = next_token_logits / request.temperature
+
+             # Apply top-k
+             if request.top_k > 0:
+                 v, _ = torch.topk(next_token_logits, min(request.top_k, next_token_logits.size(-1)))
+                 next_token_logits[next_token_logits < v[:, [-1]]] = -float('Inf')
+
+             probs = torch.nn.functional.softmax(next_token_logits, dim=-1)
+             next_token_id = torch.multinomial(probs, num_samples=1)
+
+             # Stop if EOS token is generated
+             if next_token_id == tokenizer.eos_token_id:
+                 break
+
+             output_ids = torch.cat((output_ids, next_token_id), dim=1)
+
+     full_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+     generated_text = full_text[len(formatted_prompt):].strip()
+
+     logger.info("Generation complete.")
+     return GenerationResponse(generated_text=generated_text)
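With the server up, the /generate endpoint can be exercised end to end. A minimal client sketch, again assuming a local deployment on port 7860 and the requests package (both assumptions, not part of this commit); the field names mirror GenerationRequest above:

    import requests

    # Hypothetical request body; all fields besides "prompt" have defaults.
    payload = {
        "prompt": "Write a short poem about containers.",
        "max_new_tokens": 64,
        "temperature": 0.7,
        "top_k": 50,
    }
    resp = requests.post("http://localhost:7860/generate", json=payload, timeout=300)
    resp.raise_for_status()
    print(resp.json()["generated_text"])

The generous timeout is deliberate: the manual loop re-runs a full forward pass over the whole sequence for every new token and the custom model keeps no KV cache, so CPU generation is slow for large max_new_tokens values.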
modeling_rx_codex_v3.py ADDED
@@ -0,0 +1,104 @@
+ # modeling_rx_codex_v3.py
+
+ import torch
+ import torch.nn as nn
+ from torch.nn import functional as F
+ from transformers import GPT2Config, PreTrainedModel
+ from transformers.modeling_outputs import CausalLMOutputWithPast
+
+ # --- Helper Functions for RoPE ---
+ def precompute_freqs_cis(dim: int, end: int, theta: float = 10000.0):
+     freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim))
+     t = torch.arange(end, device=freqs.device)
+     freqs = torch.outer(t, freqs).float()
+     freqs_cis = torch.polar(torch.ones_like(freqs), freqs)
+     return freqs_cis
+
+ def apply_rotary_emb(xq: torch.Tensor, xk: torch.Tensor, freqs_cis: torch.Tensor):
+     xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2))
+     xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2))
+     freqs_cis = freqs_cis.unsqueeze(0).unsqueeze(0)
+     xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3)
+     xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3)
+     return xq_out.type_as(xq), xk_out.type_as(xk)
+
+ # --- Model Modules ---
+ class CausalSelfAttention(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         assert config.n_embd % config.n_head == 0
+         self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
+         self.c_proj = nn.Linear(config.n_embd, config.n_embd)
+         self.n_head = config.n_head
+         self.n_embd = config.n_embd
+
+     def forward(self, x, freqs_cis):
+         B, T, C = x.size()
+         q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
+         k = k.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q = q.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         v = v.view(B, T, self.n_head, C // self.n_head).transpose(1, 2)
+         q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis)
+         y = F.scaled_dot_product_attention(q, k, v, attn_mask=None, dropout_p=0.0, is_causal=True)
+         y = y.transpose(1, 2).contiguous().view(B, T, C)
+         y = self.c_proj(y)
+         return y
+
+ class SwiGLU_MLP(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         hidden_dim = int(2/3 * 4 * config.n_embd)
+         hidden_dim = (hidden_dim + 127) // 128 * 128
+         self.w1 = nn.Linear(config.n_embd, hidden_dim, bias=False)
+         self.w3 = nn.Linear(config.n_embd, hidden_dim, bias=False)
+         self.w2 = nn.Linear(hidden_dim, config.n_embd, bias=False)
+
+     def forward(self, x):
+         return self.w2(F.silu(self.w1(x)) * self.w3(x))
+
+ class Block(nn.Module):
+     def __init__(self, config):
+         super().__init__()
+         self.ln_1 = nn.LayerNorm(config.n_embd)
+         self.attn = CausalSelfAttention(config)
+         self.ln_2 = nn.LayerNorm(config.n_embd)
+         self.mlp = SwiGLU_MLP(config)
+
+     def forward(self, x, freqs_cis):
+         x = x + self.attn(self.ln_1(x), freqs_cis=freqs_cis)
+         x = x + self.mlp(self.ln_2(x))
+         return x
+
+ # --- Main Model Class ---
+ class Rx_Codex_V3_Custom_Model_Class(PreTrainedModel):
+     config_class = GPT2Config
+
+     def __init__(self, config):
+         super().__init__(config)
+         self.config = config
+         self.transformer = nn.ModuleDict(dict(
+             wte = nn.Embedding(config.vocab_size, config.n_embd),
+             h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),
+             ln_f = nn.LayerNorm(config.n_embd),
+         ))
+         self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False)
+         self.transformer.wte.weight = self.lm_head.weight
+         head_dim = config.n_embd // config.n_head
+         freqs_cis = precompute_freqs_cis(head_dim, self.config.n_positions * 2)
+         self.register_buffer("freqs_cis", freqs_cis)
+
+     def forward(self, input_ids, labels=None, **kwargs):
+         b, t = input_ids.size()
+         tok_emb = self.transformer.wte(input_ids)
+         x = tok_emb
+         freqs_cis = self.freqs_cis[:t]
+         for block in self.transformer.h:
+             x = block(x, freqs_cis=freqs_cis)
+         x = self.transformer.ln_f(x)
+         logits = self.lm_head(x)
+
+         loss = None
+         if labels is not None:
+             shift_logits = logits[..., :-1, :].contiguous()
+             shift_labels = labels[..., 1:].contiguous()
+             loss_fct = nn.CrossEntropyLoss()
+             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+         return CausalLMOutputWithPast(loss=loss, logits=logits)
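Because the file is self-contained, the architecture can be smoke-tested without downloading the published weights. A sketch with an arbitrary tiny configuration (hypothetical hyperparameters chosen only to exercise the forward pass; the real checkpoint's config will differ):

    import torch
    from transformers import GPT2Config
    from modeling_rx_codex_v3 import Rx_Codex_V3_Custom_Model_Class

    # Tiny, made-up config: n_embd must be divisible by n_head,
    # and head_dim (n_embd // n_head) must be even for RoPE.
    config = GPT2Config(vocab_size=100, n_positions=64, n_embd=32, n_layer=2, n_head=4)
    model = Rx_Codex_V3_Custom_Model_Class(config).eval()

    input_ids = torch.randint(0, config.vocab_size, (1, 10))
    with torch.no_grad():
        out = model(input_ids, labels=input_ids)
    print(out.logits.shape)  # torch.Size([1, 10, 100])
    print(out.loss)          # scalar cross-entropy on the shifted labels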
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ fastapi
+ uvicorn
+ # Use the CPU build of torch for wider compatibility on free hardware.
+ # Note: pip does not accept --index-url as a per-requirement option, so it
+ # goes on its own line; --extra-index-url keeps PyPI usable for the rest.
+ --extra-index-url https://download.pytorch.org/whl/cpu
+ torch
+ transformers
+ sentencepiece
+ accelerate
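After pip install -r requirements.txt completes, it is easy to confirm that the CPU wheel was actually selected. A small check, assuming the Linux wheels from the index above (whose version strings are tagged accordingly):

    import torch

    print(torch.__version__)          # CPU wheels typically carry a "+cpu" suffix
    print(torch.cuda.is_available())  # expected False on the CPU-only build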