Dockerfile CHANGED
@@ -1,33 +1,22 @@
1
- # Use a Python base image
2
- FROM python:3.9-slim
3
 
4
- # Set environment variables to prevent bytecode generation and buffer output for logging
5
- ENV PYTHONDONTWRITEBYTECODE=1
6
- ENV PYTHONUNBUFFERED=1
 
7
 
8
- # Install system dependencies
9
- RUN apt-get update && apt-get install -y \
10
- gcc \
11
- libgl1-mesa-glx \
12
- libglib2.0-0 \
13
- && rm -rf /var/lib/apt/lists/*
14
 
15
- # Set the working directory in the container
16
  WORKDIR /app
 
17
 
18
- # Copy requirements.txt and install Python dependencies
19
- COPY requirements.txt .
20
- RUN pip install --no-cache-dir -r requirements.txt
21
- RUN pip install accelerate
22
 
23
- COPY model /app/model
24
-
25
-
26
- # Copy the application code into the container
27
- COPY . .
28
-
29
- # Expose the port your application will run on
30
  EXPOSE 7860
31
 
32
- # Define the command to run your FastAPI application
33
- CMD ["python", "app.py"]
 
1
+ # Base image
2
+ FROM python:3.10-slim
3
 
4
+ # Install system dependencies\ nRUN apt-get update && \
5
+ RUN apt-get update && \
6
+ apt-get install -y git && \
7
+ rm -rf /var/lib/apt/lists/*
8
 
9
+ # Copy and install Python dependencies
10
+ COPY requirements.txt /app/requirements.txt
11
+ RUN pip install --no-cache-dir -r /app/requirements.txt
 
 
 
12
 
13
+ # Set working directory
14
  WORKDIR /app
15
+ COPY app.py /app/app.py
16
 
 
 
 
 
17
 
18
+ # Expose port (Gradio/Streamlit default)
 
 
 
 
 
 
19
  EXPOSE 7860
20
 
21
+ # Launch
22
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,26 +1,38 @@
1
  import os
2
  import torch
3
- import uvicorn
4
  from fastapi import FastAPI, File, UploadFile, HTTPException, Body
5
  from fastapi.responses import JSONResponse
6
  from transformers import AutoTokenizer, AutoModelForCausalLM
7
  from transformers.cache_utils import DynamicCache , StaticCache
8
  from pydantic import BaseModel
9
  from typing import Optional
 
10
  import tempfile
11
  from time import time
12
- from fastapi.responses import RedirectResponse
13
 
14
  # Add necessary serialization safety
15
  torch.serialization.add_safe_globals([DynamicCache])
16
  torch.serialization.add_safe_globals([set])
 
 
17
 
18
- def generate(model, input_ids, past_key_values, max_new_tokens=50):
 
 
 
 
 
 
 
 
 
 
19
  device = model.model.embed_tokens.weight.device
20
- origin_len = input_ids.shape[-1]
21
- input_ids = input_ids.to(device)
22
- output_ids = input_ids.clone()
23
- next_token = input_ids
24
  with torch.no_grad():
25
  for _ in range(max_new_tokens):
26
  out = model(
@@ -28,19 +40,28 @@ def generate(model, input_ids, past_key_values, max_new_tokens=50):
28
  past_key_values=past_key_values,
29
  use_cache=True
30
  )
31
- logits = out.logits[:, -1, :]
32
- token = torch.argmax(logits, dim=-1, keepdim=True)
33
- output_ids = torch.cat([output_ids, token], dim=-1)
34
  past_key_values = out.past_key_values
35
  next_token = token.to(device)
36
  if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
37
  break
38
- return output_ids[:, origin_len:]
39
-
40
  def get_kv_cache(model, tokenizer, prompt):
 
 
 
 
 
 
 
41
  device = model.model.embed_tokens.weight.device
42
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
43
- cache = DynamicCache()
 
 
44
  with torch.no_grad():
45
  _ = model(
46
  input_ids=input_ids,
@@ -50,74 +71,110 @@ def get_kv_cache(model, tokenizer, prompt):
50
  return cache, input_ids.shape[-1]
51
 
52
  def clean_up(cache, origin_len):
 
53
  new_cache = DynamicCache()
54
  for i in range(len(cache.key_cache)):
55
  new_cache.key_cache.append(cache.key_cache[i].clone())
56
  new_cache.value_cache.append(cache.value_cache[i].clone())
 
 
57
  for i in range(len(new_cache.key_cache)):
58
  new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
59
  new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
60
  return new_cache
61
-
62
  os.environ["TRANSFORMERS_OFFLINE"] = "1"
63
  os.environ["HF_HUB_OFFLINE"] = "1"
64
 
 
 
 
65
  def load_model_and_tokenizer():
66
- model_path = os.environ.get("MODEL_PATH", "./model") # allow override via Docker env
 
 
67
  tokenizer = AutoTokenizer.from_pretrained(model_path)
68
  if torch.cuda.is_available():
 
69
  model = AutoModelForCausalLM.from_pretrained(
70
  model_path,
71
  torch_dtype=torch.float16,
72
- device_map="auto"
73
  )
74
  else:
 
75
  model = AutoModelForCausalLM.from_pretrained(
76
  model_path,
77
- torch_dtype=torch.float32,
78
- low_cpu_mem_usage=True
79
  )
80
  return model, tokenizer
81
 
 
82
  app = FastAPI(title="DeepSeek QA with KV Cache API")
 
 
83
  cache_store = {}
 
 
84
  model, tokenizer = load_model_and_tokenizer()
85
 
86
  class QueryRequest(BaseModel):
87
  query: str
88
  max_new_tokens: Optional[int] = 150
89
-
90
  def clean_response(response_text):
 
 
 
 
91
  import re
 
 
92
  assistant_pattern = re.compile(r'<\|assistant\|>\s*(.*?)(?:<\/\|assistant\|>|<\|user\|>|<\|system\|>)', re.DOTALL)
93
  matches = assistant_pattern.findall(response_text)
 
94
  if matches:
 
95
  for match in matches:
96
  cleaned = match.strip()
97
  if cleaned and not cleaned.startswith("<|") and len(cleaned) > 5:
98
  return cleaned
 
 
 
99
  cleaned = re.sub(r'<\|.*?\|>', '', response_text)
100
  cleaned = re.sub(r'<\/\|.*?\|>', '', cleaned)
 
 
101
  lines = cleaned.strip().split('\n')
102
  unique_lines = []
103
  for line in lines:
104
  line = line.strip()
105
  if line and line not in unique_lines:
106
  unique_lines.append(line)
 
107
  result = '\n'.join(unique_lines)
 
 
108
  result = re.sub(r'<\/?\|.*?\|>\s*$', '', result)
 
109
  return result.strip()
110
-
111
  @app.post("/upload-document_to_create_KV_cache")
112
  async def upload_document(file: UploadFile = File(...)):
 
113
  t1 = time()
 
 
114
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
115
  temp_file_path = temp_file.name
116
  content = await file.read()
117
  temp_file.write(content)
 
118
  try:
 
119
  with open(temp_file_path, "r", encoding="utf-8") as f:
120
  doc_text = f.read()
 
 
121
  system_prompt = f"""
122
  <|system|>
123
  Answer concisely and precisely, You are an assistant who provides concise factual answers.
@@ -126,109 +183,154 @@ async def upload_document(file: UploadFile = File(...)):
126
  {doc_text}
127
  Question:
128
  """.strip()
 
 
129
  cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
 
 
130
  cache_id = f"cache_{int(time())}"
 
 
131
  cache_store[cache_id] = {
132
  "cache": cache,
133
  "origin_len": origin_len,
134
  "doc_preview": doc_text[:500] + "..." if len(doc_text) > 500 else doc_text
135
  }
 
 
136
  os.unlink(temp_file_path)
 
137
  t2 = time()
 
138
  return {
139
  "cache_id": cache_id,
140
  "message": "Document uploaded and cache created successfully",
141
  "doc_preview": cache_store[cache_id]["doc_preview"],
142
  "time_taken": f"{t2 - t1:.4f} seconds"
143
  }
 
144
  except Exception as e:
 
145
  if os.path.exists(temp_file_path):
146
  os.unlink(temp_file_path)
147
  raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
148
 
149
  @app.post("/generate_answer_from_cache/{cache_id}")
150
  async def generate_answer(cache_id: str, request: QueryRequest):
 
151
  t1 = time()
 
 
152
  if cache_id not in cache_store:
153
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
 
154
  try:
 
155
  current_cache = clean_up(
156
- cache_store[cache_id]["cache"],
157
  cache_store[cache_id]["origin_len"]
158
  )
 
 
159
  full_prompt = f"""
160
  <|user|>
161
  Question: {request.query}
162
  <|assistant|>
163
  """.strip()
 
164
  input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
 
 
165
  output_ids = generate(model, input_ids, current_cache, max_new_tokens=request.max_new_tokens)
166
  response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
167
- rep = clean_response(response)
168
  t2 = time()
 
169
  return {
170
  "query": request.query,
171
  "answer": rep,
172
  "time_taken": f"{t2 - t1:.4f} seconds"
173
  }
 
174
  except Exception as e:
175
  raise HTTPException(status_code=500, detail=f"Error generating answer: {str(e)}")
176
 
177
  @app.post("/save_cache/{cache_id}")
178
  async def save_cache(cache_id: str):
 
179
  if cache_id not in cache_store:
180
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
 
181
  try:
 
182
  cleaned_cache = clean_up(
183
- cache_store[cache_id]["cache"],
184
  cache_store[cache_id]["origin_len"]
185
  )
 
186
  cache_path = f"{cache_id}_cache.pth"
187
  torch.save(cleaned_cache, cache_path)
 
188
  return {
189
  "message": f"Cache saved successfully as {cache_path}",
190
  "cache_path": cache_path
191
  }
 
192
  except Exception as e:
193
  raise HTTPException(status_code=500, detail=f"Error saving cache: {str(e)}")
194
 
195
  @app.post("/load_cache")
196
  async def load_cache(file: UploadFile = File(...)):
 
197
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as temp_file:
198
  temp_file_path = temp_file.name
199
  content = await file.read()
200
  temp_file.write(content)
 
201
  try:
 
202
  loaded_cache = torch.load(temp_file_path)
 
 
203
  cache_id = f"loaded_cache_{int(time())}"
 
 
204
  cache_store[cache_id] = {
205
  "cache": loaded_cache,
206
  "origin_len": loaded_cache.key_cache[0].shape[-2],
207
  "doc_preview": "Loaded from cache file"
208
  }
 
 
209
  os.unlink(temp_file_path)
 
210
  return {
211
  "cache_id": cache_id,
212
  "message": "Cache loaded successfully"
213
  }
 
214
  except Exception as e:
 
215
  if os.path.exists(temp_file_path):
216
  os.unlink(temp_file_path)
217
  raise HTTPException(status_code=500, detail=f"Error loading cache: {str(e)}")
218
 
219
  @app.get("/list_of_caches")
220
  async def list_documents():
 
221
  documents = {}
222
  for cache_id in cache_store:
223
  documents[cache_id] = {
224
  "doc_preview": cache_store[cache_id]["doc_preview"],
225
  "origin_len": cache_store[cache_id]["origin_len"]
226
  }
 
227
  return {"documents": documents}
228
 
229
- @app.get("/", include_in_schema=False)
230
  async def root():
231
- return RedirectResponse(url="/docs")
232
 
233
  if __name__ == "__main__":
234
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
1
  import os
2
  import torch
 
3
  from fastapi import FastAPI, File, UploadFile, HTTPException, Body
4
  from fastapi.responses import JSONResponse
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  from transformers.cache_utils import DynamicCache , StaticCache
7
  from pydantic import BaseModel
8
  from typing import Optional
9
+ import uvicorn
10
  import tempfile
11
  from time import time
12
+
13
 
14
  # Add necessary serialization safety
15
  torch.serialization.add_safe_globals([DynamicCache])
16
  torch.serialization.add_safe_globals([set])
17
+ # These lines allow PyTorch to serialize and deserialize these objects without raising errors,
18
+ # ensuring compatibility and functionality during cache saving/loading.
19
 
20
+ # Minimal generate function for token-by-token generation
21
+ def generate(model,
22
+ input_ids,
23
+ past_key_values,
24
+ max_new_tokens=50):
25
+ """
26
+ This function performs token-by-token text generation using a pre-trained language model.
27
+ Purpose: To generate new text based on input tokens, without loading the full context repeatedly
28
+ Process: It takes a model, input IDs, and cached key-values, then generates new tokens one by one up to the specified maximum
29
+ Performance: Uses the cached key-values for efficiency and returns only the newly generated tokens
30
+ """
31
  device = model.model.embed_tokens.weight.device
32
+ origin_len = input_ids.shape[-1]#Stores the length of the input sequence (number of tokens) before text generation begins./return only the newly
33
+ input_ids = input_ids.to(device)#same device as the model.
34
+ output_ids = input_ids.clone()#will be updated during the generation process to include newly generated tokens.
35
+ next_token = input_ids#the token that will process in the next iteration.
36
  with torch.no_grad():
37
  for _ in range(max_new_tokens):
38
  out = model(
 
40
  past_key_values=past_key_values,
41
  use_cache=True
42
  )
43
+ logits = out.logits[:, -1, :]#Extracts the logits for the last token
44
+ token = torch.argmax(logits, dim=-1, keepdim=True)#highest predicted probability as the next token.
45
+ output_ids = torch.cat([output_ids, token], dim=-1)#add the newly generated token
46
  past_key_values = out.past_key_values
47
  next_token = token.to(device)
48
  if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
49
  break
50
+ return output_ids[:, origin_len:] # Return just the newly generated part
51
+
52
  def get_kv_cache(model, tokenizer, prompt):
53
+ """
54
+ This function creates a key-value cache for a given prompt.
55
+ Purpose: To pre-compute and store the model's internal representations (key-value states) for a prompt
56
+ Process: Encodes the prompt, runs it through the model, and captures the resulting cache
57
+ Returns: The cache object and the original prompt length for future reference
58
+ """
59
+ # Encode prompt
60
  device = model.model.embed_tokens.weight.device
61
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
62
+ cache = DynamicCache() # it grows as text is generated
63
+
64
+ # Run the model to populate the KV cache:
65
  with torch.no_grad():
66
  _ = model(
67
  input_ids=input_ids,
 
71
  return cache, input_ids.shape[-1]
72
 
73
  def clean_up(cache, origin_len):
74
+ # Make a deep copy of the cache first
75
  new_cache = DynamicCache()
76
  for i in range(len(cache.key_cache)):
77
  new_cache.key_cache.append(cache.key_cache[i].clone())
78
  new_cache.value_cache.append(cache.value_cache[i].clone())
79
+
80
+ # Remove any tokens appended to the original knowledge
81
  for i in range(len(new_cache.key_cache)):
82
  new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
83
  new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
84
  return new_cache
 
85
  os.environ["TRANSFORMERS_OFFLINE"] = "1"
86
  os.environ["HF_HUB_OFFLINE"] = "1"
87
 
88
+ # Path to your local model
89
+
90
+ # Initialize model and tokenizer
91
  def load_model_and_tokenizer():
92
+ model_path = "./deepseek"
93
+
94
+ # Load tokenizer and model from disk (without trust_remote_code)
95
  tokenizer = AutoTokenizer.from_pretrained(model_path)
96
  if torch.cuda.is_available():
97
+ # Load model on GPU if CUDA is available
98
  model = AutoModelForCausalLM.from_pretrained(
99
  model_path,
100
  torch_dtype=torch.float16,
101
+ device_map="auto" # Automatically map model layers to GPU
102
  )
103
  else:
104
+ # Load model on CPU if no GPU is available
105
  model = AutoModelForCausalLM.from_pretrained(
106
  model_path,
107
+ torch_dtype=torch.float32, # Use float32 for compatibility with CPU
108
+ low_cpu_mem_usage=True # Reduce memory usage on CPU
109
  )
110
  return model, tokenizer
111
 
112
+ # Create FastAPI app
113
  app = FastAPI(title="DeepSeek QA with KV Cache API")
114
+
115
+ # Global variables to store the cache, origin length, and model/tokenizer
116
  cache_store = {}
117
+
118
+ # Initialize model and tokenizer at startup
119
  model, tokenizer = load_model_and_tokenizer()
120
 
121
  class QueryRequest(BaseModel):
122
  query: str
123
  max_new_tokens: Optional[int] = 150
 
124
def clean_response(response_text):
    """
    Strip chat-template markers and duplicated lines from a raw model reply.

    First tries to pull the text delimited by an <|assistant|> tag and the
    next role tag; failing that, removes every <|...|> marker outright and
    de-duplicates the remaining lines while preserving their order.
    """
    import re

    # Preferred path: content between the assistant tag and the next role tag.
    tag_re = re.compile(
        r'<\|assistant\|>\s*(.*?)(?:<\/\|assistant\|>|<\|user\|>|<\|system\|>)',
        re.DOTALL,
    )
    for segment in tag_re.findall(response_text):
        candidate = segment.strip()
        # Skip empty or degenerate captures; require a minimally long answer.
        if candidate and not candidate.startswith("<|") and len(candidate) > 5:
            return candidate

    # Fallback: aggressively drop every tag marker.
    stripped = re.sub(r'<\|.*?\|>', '', response_text)
    stripped = re.sub(r'<\/\|.*?\|>', '', stripped)

    # Keep the first occurrence of each non-empty line, in order.
    seen = []
    for raw_line in stripped.strip().split('\n'):
        raw_line = raw_line.strip()
        if raw_line and raw_line not in seen:
            seen.append(raw_line)

    joined = '\n'.join(seen)
    # Remove any role marker left dangling at the very end.
    joined = re.sub(r'<\/?\|.*?\|>\s*$', '', joined)
    return joined.strip()
 
161
  @app.post("/upload-document_to_create_KV_cache")
162
  async def upload_document(file: UploadFile = File(...)):
163
+ """Upload a document and create KV cache for it"""
164
  t1 = time()
165
+
166
+ # Save the uploaded file temporarily
167
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
168
  temp_file_path = temp_file.name
169
  content = await file.read()
170
  temp_file.write(content)
171
+
172
  try:
173
+ # Read the document
174
  with open(temp_file_path, "r", encoding="utf-8") as f:
175
  doc_text = f.read()
176
+
177
+ # Create system prompt with document context
178
  system_prompt = f"""
179
  <|system|>
180
  Answer concisely and precisely, You are an assistant who provides concise factual answers.
 
183
  {doc_text}
184
  Question:
185
  """.strip()
186
+
187
+ # Create KV cache
188
  cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
189
+
190
+ # Generate a unique ID for this document/cache
191
  cache_id = f"cache_{int(time())}"
192
+
193
+ # Store the cache and origin_len
194
  cache_store[cache_id] = {
195
  "cache": cache,
196
  "origin_len": origin_len,
197
  "doc_preview": doc_text[:500] + "..." if len(doc_text) > 500 else doc_text
198
  }
199
+
200
+ # Clean up the temporary file
201
  os.unlink(temp_file_path)
202
+
203
  t2 = time()
204
+
205
  return {
206
  "cache_id": cache_id,
207
  "message": "Document uploaded and cache created successfully",
208
  "doc_preview": cache_store[cache_id]["doc_preview"],
209
  "time_taken": f"{t2 - t1:.4f} seconds"
210
  }
211
+
212
  except Exception as e:
213
+ # Clean up the temporary file in case of error
214
  if os.path.exists(temp_file_path):
215
  os.unlink(temp_file_path)
216
  raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
217
 
218
  @app.post("/generate_answer_from_cache/{cache_id}")
219
  async def generate_answer(cache_id: str, request: QueryRequest):
220
+ """Generate an answer to a question based on the uploaded document"""
221
  t1 = time()
222
+
223
+ # Check if the document/cache exists
224
  if cache_id not in cache_store:
225
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
226
+
227
  try:
228
+ # Get a clean copy of the cache
229
  current_cache = clean_up(
230
+ cache_store[cache_id]["cache"],
231
  cache_store[cache_id]["origin_len"]
232
  )
233
+
234
+ # Prepare input with just the query
235
  full_prompt = f"""
236
  <|user|>
237
  Question: {request.query}
238
  <|assistant|>
239
  """.strip()
240
+
241
  input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
242
+
243
+ # Generate response
244
  output_ids = generate(model, input_ids, current_cache, max_new_tokens=request.max_new_tokens)
245
  response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
246
+ rep = clean_response(response)
247
  t2 = time()
248
+
249
  return {
250
  "query": request.query,
251
  "answer": rep,
252
  "time_taken": f"{t2 - t1:.4f} seconds"
253
  }
254
+
255
  except Exception as e:
256
  raise HTTPException(status_code=500, detail=f"Error generating answer: {str(e)}")
257
 
258
  @app.post("/save_cache/{cache_id}")
259
  async def save_cache(cache_id: str):
260
+ """Save the cache for a document"""
261
  if cache_id not in cache_store:
262
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
263
+
264
  try:
265
+ # Clean up the cache and save it
266
  cleaned_cache = clean_up(
267
+ cache_store[cache_id]["cache"],
268
  cache_store[cache_id]["origin_len"]
269
  )
270
+
271
  cache_path = f"{cache_id}_cache.pth"
272
  torch.save(cleaned_cache, cache_path)
273
+
274
  return {
275
  "message": f"Cache saved successfully as {cache_path}",
276
  "cache_path": cache_path
277
  }
278
+
279
  except Exception as e:
280
  raise HTTPException(status_code=500, detail=f"Error saving cache: {str(e)}")
281
 
282
  @app.post("/load_cache")
283
  async def load_cache(file: UploadFile = File(...)):
284
+ """Load a previously saved cache"""
285
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as temp_file:
286
  temp_file_path = temp_file.name
287
  content = await file.read()
288
  temp_file.write(content)
289
+
290
  try:
291
+ # Load the cache
292
  loaded_cache = torch.load(temp_file_path)
293
+
294
+ # Generate a unique ID for this cache
295
  cache_id = f"loaded_cache_{int(time())}"
296
+
297
+ # Store the cache (we don't have the original document text)
298
  cache_store[cache_id] = {
299
  "cache": loaded_cache,
300
  "origin_len": loaded_cache.key_cache[0].shape[-2],
301
  "doc_preview": "Loaded from cache file"
302
  }
303
+
304
+ # Clean up the temporary file
305
  os.unlink(temp_file_path)
306
+
307
  return {
308
  "cache_id": cache_id,
309
  "message": "Cache loaded successfully"
310
  }
311
+
312
  except Exception as e:
313
+ # Clean up the temporary file in case of error
314
  if os.path.exists(temp_file_path):
315
  os.unlink(temp_file_path)
316
  raise HTTPException(status_code=500, detail=f"Error loading cache: {str(e)}")
317
 
318
@app.get("/list_of_caches")
async def list_documents():
    """Return a per-cache summary: document preview and original prompt length."""
    documents = {
        cache_id: {
            "doc_preview": entry["doc_preview"],
            "origin_len": entry["origin_len"],
        }
        for cache_id, entry in cache_store.items()
    }
    return {"documents": documents}
329
 
330
@app.get("/")
async def root():
    """Liveness endpoint: confirms the API is up and serving."""
    return {"message": "DeepSeek QA with KV Cache API is running"}
333
 
334
  if __name__ == "__main__":
335
+ # Run the FastAPI app
336
+ uvicorn.run(app, host="0.0.0.0", port=7860)
model/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "facebook/opt-125m",
3
- "activation_dropout": 0.0,
4
- "activation_function": "relu",
5
- "architectures": [
6
- "OPTForCausalLM"
7
- ],
8
- "attention_dropout": 0.0,
9
- "bos_token_id": 2,
10
- "do_layer_norm_before": true,
11
- "dropout": 0.1,
12
- "eos_token_id": 2,
13
- "ffn_dim": 3072,
14
- "hidden_size": 768,
15
- "init_std": 0.02,
16
- "layerdrop": 0.0,
17
- "max_position_embeddings": 2048,
18
- "model_type": "opt",
19
- "num_attention_heads": 12,
20
- "num_hidden_layers": 12,
21
- "pad_token_id": 1,
22
- "prefix": "</s>",
23
- "torch_dtype": "float16",
24
- "transformers_version": "4.21.0.dev0",
25
- "use_cache": true,
26
- "vocab_size": 50272,
27
- "word_embed_proj_dim": 768
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 2,
4
- "eos_token_id": 2,
5
- "pad_token_id": 1,
6
- "transformers_version": "4.27.0.dev0"
7
- }
 
 
 
 
 
 
 
 
model/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
model/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d74da6615135c58cf3cf9ad4cb11e7c613ff9e55fe658a47ab83b6c8d1174a9
3
- size 250540281
 
 
 
 
model/special_tokens_map.json DELETED
@@ -1 +0,0 @@
1
- {"bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
 
 
model/tokenizer_config.json DELETED
@@ -1 +0,0 @@
1
- {"errors": "replace", "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "add_bos_token": true, "special_tokens_map_file": null, "name_or_path": "patrickvonplaten/opt-30b"}
 
 
model/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,8 +1,6 @@
1
  fastapi
2
  uvicorn
3
- torch
4
  transformers
5
- pyngrok
6
  accelerate
7
- python-multipart
8
- pydantic
 
1
fastapi
uvicorn
transformers
torch
accelerate
bitsandbytes
# Required by FastAPI for File(...)/UploadFile form parsing used in app.py
python-multipart
# Imported directly by app.py (QueryRequest model); keep explicit rather
# than relying on fastapi's transitive dependency
pydantic