Dockerfile CHANGED
@@ -1,33 +1,22 @@
1
- # Use a Python base image
2
- FROM python:3.9-slim
3
 
4
- # Set environment variables to prevent bytecode generation and buffer output for logging
5
- ENV PYTHONDONTWRITEBYTECODE=1
6
- ENV PYTHONUNBUFFERED=1
 
7
 
8
- # Install system dependencies
9
- RUN apt-get update && apt-get install -y \
10
- gcc \
11
- libgl1-mesa-glx \
12
- libglib2.0-0 \
13
- && rm -rf /var/lib/apt/lists/*
14
 
15
- # Set the working directory in the container
16
  WORKDIR /app
 
17
 
18
- # Copy requirements.txt and install Python dependencies
19
- COPY requirements.txt .
20
- RUN pip install --no-cache-dir -r requirements.txt
21
- RUN pip install accelerate
22
 
23
- COPY model /app/model
24
-
25
-
26
- # Copy the application code into the container
27
- COPY . .
28
-
29
- # Expose the port your application will run on
30
  EXPOSE 7860
31
 
32
- # Define the command to run your FastAPI application
33
- CMD ["python", "app.py"]
 
1
+ # Base image
2
+ FROM python:3.10-slim
3
 
4
+ # Install system dependencies\ nRUN apt-get update && \
5
+ RUN apt-get update && \
6
+ apt-get install -y git && \
7
+ rm -rf /var/lib/apt/lists/*
8
 
9
+ # Copy and install Python dependencies
10
+ COPY requirements.txt /app/requirements.txt
11
+ RUN pip install --no-cache-dir -r /app/requirements.txt
 
 
 
12
 
13
+ # Set working directory
14
  WORKDIR /app
15
+ COPY app.py /app/app.py
16
 
 
 
 
 
17
 
18
+ # Expose port (Gradio/Streamlit default)
 
 
 
 
 
 
19
  EXPOSE 7860
20
 
21
+ # Launch
22
+ CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
app.py CHANGED
@@ -1,26 +1,38 @@
1
  import os
2
  import torch
3
- import uvicorn
4
  from fastapi import FastAPI, File, UploadFile, HTTPException, Body
5
  from fastapi.responses import JSONResponse
6
  from transformers import AutoTokenizer, AutoModelForCausalLM
7
  from transformers.cache_utils import DynamicCache , StaticCache
8
  from pydantic import BaseModel
9
  from typing import Optional
 
10
  import tempfile
11
  from time import time
12
- from fastapi.responses import RedirectResponse
13
 
14
  # Add necessary serialization safety
15
  torch.serialization.add_safe_globals([DynamicCache])
16
  torch.serialization.add_safe_globals([set])
 
 
17
 
18
- def generate(model, input_ids, past_key_values, max_new_tokens=50):
 
 
 
 
 
 
 
 
 
 
19
  device = model.model.embed_tokens.weight.device
20
- origin_len = input_ids.shape[-1]
21
- input_ids = input_ids.to(device)
22
- output_ids = input_ids.clone()
23
- next_token = input_ids
24
  with torch.no_grad():
25
  for _ in range(max_new_tokens):
26
  out = model(
@@ -28,19 +40,28 @@ def generate(model, input_ids, past_key_values, max_new_tokens=50):
28
  past_key_values=past_key_values,
29
  use_cache=True
30
  )
31
- logits = out.logits[:, -1, :]
32
- token = torch.argmax(logits, dim=-1, keepdim=True)
33
- output_ids = torch.cat([output_ids, token], dim=-1)
34
  past_key_values = out.past_key_values
35
  next_token = token.to(device)
36
  if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
37
  break
38
- return output_ids[:, origin_len:]
39
-
40
  def get_kv_cache(model, tokenizer, prompt):
 
 
 
 
 
 
 
41
  device = model.model.embed_tokens.weight.device
42
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
43
- cache = DynamicCache()
 
 
44
  with torch.no_grad():
45
  _ = model(
46
  input_ids=input_ids,
@@ -50,74 +71,110 @@ def get_kv_cache(model, tokenizer, prompt):
50
  return cache, input_ids.shape[-1]
51
 
52
  def clean_up(cache, origin_len):
 
53
  new_cache = DynamicCache()
54
  for i in range(len(cache.key_cache)):
55
  new_cache.key_cache.append(cache.key_cache[i].clone())
56
  new_cache.value_cache.append(cache.value_cache[i].clone())
 
 
57
  for i in range(len(new_cache.key_cache)):
58
  new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
59
  new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
60
  return new_cache
61
-
62
  os.environ["TRANSFORMERS_OFFLINE"] = "1"
63
  os.environ["HF_HUB_OFFLINE"] = "1"
64
 
 
 
 
65
  def load_model_and_tokenizer():
66
- model_path = os.environ.get("MODEL_PATH", "./model") # allow override via Docker env
 
 
67
  tokenizer = AutoTokenizer.from_pretrained(model_path)
68
  if torch.cuda.is_available():
 
69
  model = AutoModelForCausalLM.from_pretrained(
70
  model_path,
71
  torch_dtype=torch.float16,
72
- device_map="auto"
73
  )
74
  else:
 
75
  model = AutoModelForCausalLM.from_pretrained(
76
  model_path,
77
- torch_dtype=torch.float32,
78
- low_cpu_mem_usage=True
79
  )
80
  return model, tokenizer
81
 
 
82
  app = FastAPI(title="DeepSeek QA with KV Cache API")
 
 
83
  cache_store = {}
 
 
84
  model, tokenizer = load_model_and_tokenizer()
85
 
86
  class QueryRequest(BaseModel):
87
  query: str
88
  max_new_tokens: Optional[int] = 150
89
-
90
  def clean_response(response_text):
 
 
 
 
91
  import re
 
 
92
  assistant_pattern = re.compile(r'<\|assistant\|>\s*(.*?)(?:<\/\|assistant\|>|<\|user\|>|<\|system\|>)', re.DOTALL)
93
  matches = assistant_pattern.findall(response_text)
 
94
  if matches:
 
95
  for match in matches:
96
  cleaned = match.strip()
97
  if cleaned and not cleaned.startswith("<|") and len(cleaned) > 5:
98
  return cleaned
 
 
 
99
  cleaned = re.sub(r'<\|.*?\|>', '', response_text)
100
  cleaned = re.sub(r'<\/\|.*?\|>', '', cleaned)
 
 
101
  lines = cleaned.strip().split('\n')
102
  unique_lines = []
103
  for line in lines:
104
  line = line.strip()
105
  if line and line not in unique_lines:
106
  unique_lines.append(line)
 
107
  result = '\n'.join(unique_lines)
 
 
108
  result = re.sub(r'<\/?\|.*?\|>\s*$', '', result)
 
109
  return result.strip()
110
-
111
  @app.post("/upload-document_to_create_KV_cache")
112
  async def upload_document(file: UploadFile = File(...)):
 
113
  t1 = time()
 
 
114
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
115
  temp_file_path = temp_file.name
116
  content = await file.read()
117
  temp_file.write(content)
 
118
  try:
 
119
  with open(temp_file_path, "r", encoding="utf-8") as f:
120
  doc_text = f.read()
 
 
121
  system_prompt = f"""
122
  <|system|>
123
  Answer concisely and precisely, You are an assistant who provides concise factual answers.
@@ -126,109 +183,154 @@ async def upload_document(file: UploadFile = File(...)):
126
  {doc_text}
127
  Question:
128
  """.strip()
 
 
129
  cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
 
 
130
  cache_id = f"cache_{int(time())}"
 
 
131
  cache_store[cache_id] = {
132
  "cache": cache,
133
  "origin_len": origin_len,
134
  "doc_preview": doc_text[:500] + "..." if len(doc_text) > 500 else doc_text
135
  }
 
 
136
  os.unlink(temp_file_path)
 
137
  t2 = time()
 
138
  return {
139
  "cache_id": cache_id,
140
  "message": "Document uploaded and cache created successfully",
141
  "doc_preview": cache_store[cache_id]["doc_preview"],
142
  "time_taken": f"{t2 - t1:.4f} seconds"
143
  }
 
144
  except Exception as e:
 
145
  if os.path.exists(temp_file_path):
146
  os.unlink(temp_file_path)
147
  raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
148
 
149
  @app.post("/generate_answer_from_cache/{cache_id}")
150
  async def generate_answer(cache_id: str, request: QueryRequest):
 
151
  t1 = time()
 
 
152
  if cache_id not in cache_store:
153
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
 
154
  try:
 
155
  current_cache = clean_up(
156
- cache_store[cache_id]["cache"],
157
  cache_store[cache_id]["origin_len"]
158
  )
 
 
159
  full_prompt = f"""
160
  <|user|>
161
  Question: {request.query}
162
  <|assistant|>
163
  """.strip()
 
164
  input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
 
 
165
  output_ids = generate(model, input_ids, current_cache, max_new_tokens=request.max_new_tokens)
166
  response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
167
- rep = clean_response(response)
168
  t2 = time()
 
169
  return {
170
  "query": request.query,
171
  "answer": rep,
172
  "time_taken": f"{t2 - t1:.4f} seconds"
173
  }
 
174
  except Exception as e:
175
  raise HTTPException(status_code=500, detail=f"Error generating answer: {str(e)}")
176
 
177
  @app.post("/save_cache/{cache_id}")
178
  async def save_cache(cache_id: str):
 
179
  if cache_id not in cache_store:
180
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
 
181
  try:
 
182
  cleaned_cache = clean_up(
183
- cache_store[cache_id]["cache"],
184
  cache_store[cache_id]["origin_len"]
185
  )
 
186
  cache_path = f"{cache_id}_cache.pth"
187
  torch.save(cleaned_cache, cache_path)
 
188
  return {
189
  "message": f"Cache saved successfully as {cache_path}",
190
  "cache_path": cache_path
191
  }
 
192
  except Exception as e:
193
  raise HTTPException(status_code=500, detail=f"Error saving cache: {str(e)}")
194
 
195
  @app.post("/load_cache")
196
  async def load_cache(file: UploadFile = File(...)):
 
197
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as temp_file:
198
  temp_file_path = temp_file.name
199
  content = await file.read()
200
  temp_file.write(content)
 
201
  try:
 
202
  loaded_cache = torch.load(temp_file_path)
 
 
203
  cache_id = f"loaded_cache_{int(time())}"
 
 
204
  cache_store[cache_id] = {
205
  "cache": loaded_cache,
206
  "origin_len": loaded_cache.key_cache[0].shape[-2],
207
  "doc_preview": "Loaded from cache file"
208
  }
 
 
209
  os.unlink(temp_file_path)
 
210
  return {
211
  "cache_id": cache_id,
212
  "message": "Cache loaded successfully"
213
  }
 
214
  except Exception as e:
 
215
  if os.path.exists(temp_file_path):
216
  os.unlink(temp_file_path)
217
  raise HTTPException(status_code=500, detail=f"Error loading cache: {str(e)}")
218
 
219
  @app.get("/list_of_caches")
220
  async def list_documents():
 
221
  documents = {}
222
  for cache_id in cache_store:
223
  documents[cache_id] = {
224
  "doc_preview": cache_store[cache_id]["doc_preview"],
225
  "origin_len": cache_store[cache_id]["origin_len"]
226
  }
 
227
  return {"documents": documents}
228
 
229
- @app.get("/", include_in_schema=False)
230
  async def root():
231
- return RedirectResponse(url="/docs")
232
 
233
  if __name__ == "__main__":
234
- uvicorn.run(app, host="0.0.0.0", port=7860)
 
 
1
  import os
2
  import torch
 
3
  from fastapi import FastAPI, File, UploadFile, HTTPException, Body
4
  from fastapi.responses import JSONResponse
5
  from transformers import AutoTokenizer, AutoModelForCausalLM
6
  from transformers.cache_utils import DynamicCache , StaticCache
7
  from pydantic import BaseModel
8
  from typing import Optional
9
+ import uvicorn
10
  import tempfile
11
  from time import time
12
+
13
 
14
  # Add necessary serialization safety
15
  torch.serialization.add_safe_globals([DynamicCache])
16
  torch.serialization.add_safe_globals([set])
17
+ # These lines allow PyTorch to serialize and deserialize these objects without raising errors,
18
+ # ensuring compatibility and functionality during cache saving/loading.
19
 
20
+ # Minimal generate function for token-by-token generation
21
+ def generate(model,
22
+ input_ids,
23
+ past_key_values,
24
+ max_new_tokens=50):
25
+ """
26
+ This function performs token-by-token text generation using a pre-trained language model.
27
+ Purpose: To generate new text based on input tokens, without loading the full context repeatedly
28
+ Process: It takes a model, input IDs, and cached key-values, then generates new tokens one by one up to the specified maximum
29
+ Performance: Uses the cached key-values for efficiency and returns only the newly generated tokens
30
+ """
31
  device = model.model.embed_tokens.weight.device
32
+ origin_len = input_ids.shape[-1]#Stores the length of the input sequence (number of tokens) before text generation begins./return only the newly
33
+ input_ids = input_ids.to(device)#same device as the model.
34
+ output_ids = input_ids.clone()#will be updated during the generation process to include newly generated tokens.
35
+ next_token = input_ids#the token that will process in the next iteration.
36
  with torch.no_grad():
37
  for _ in range(max_new_tokens):
38
  out = model(
 
40
  past_key_values=past_key_values,
41
  use_cache=True
42
  )
43
+ logits = out.logits[:, -1, :]#Extracts the logits for the last token
44
+ token = torch.argmax(logits, dim=-1, keepdim=True)#highest predicted probability as the next token.
45
+ output_ids = torch.cat([output_ids, token], dim=-1)#add the newly generated token
46
  past_key_values = out.past_key_values
47
  next_token = token.to(device)
48
  if model.config.eos_token_id is not None and token.item() == model.config.eos_token_id:
49
  break
50
+ return output_ids[:, origin_len:] # Return just the newly generated part
51
+
52
  def get_kv_cache(model, tokenizer, prompt):
53
+ """
54
+ This function creates a key-value cache for a given prompt.
55
+ Purpose: To pre-compute and store the model's internal representations (key-value states) for a prompt
56
+ Process: Encodes the prompt, runs it through the model, and captures the resulting cache
57
+ Returns: The cache object and the original prompt length for future reference
58
+ """
59
+ # Encode prompt
60
  device = model.model.embed_tokens.weight.device
61
  input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)
62
+ cache = DynamicCache() # it grows as text is generated
63
+
64
+ # Run the model to populate the KV cache:
65
  with torch.no_grad():
66
  _ = model(
67
  input_ids=input_ids,
 
71
  return cache, input_ids.shape[-1]
72
 
73
  def clean_up(cache, origin_len):
74
+ # Make a deep copy of the cache first
75
  new_cache = DynamicCache()
76
  for i in range(len(cache.key_cache)):
77
  new_cache.key_cache.append(cache.key_cache[i].clone())
78
  new_cache.value_cache.append(cache.value_cache[i].clone())
79
+
80
+ # Remove any tokens appended to the original knowledge
81
  for i in range(len(new_cache.key_cache)):
82
  new_cache.key_cache[i] = new_cache.key_cache[i][:, :, :origin_len, :]
83
  new_cache.value_cache[i] = new_cache.value_cache[i][:, :, :origin_len, :]
84
  return new_cache
 
85
  os.environ["TRANSFORMERS_OFFLINE"] = "1"
86
  os.environ["HF_HUB_OFFLINE"] = "1"
87
 
88
+ # Path to your local model
89
+
90
+ # Initialize model and tokenizer
91
  def load_model_and_tokenizer():
92
+ model_path = "./deepseek"
93
+
94
+ # Load tokenizer and model from disk (without trust_remote_code)
95
  tokenizer = AutoTokenizer.from_pretrained(model_path)
96
  if torch.cuda.is_available():
97
+ # Load model on GPU if CUDA is available
98
  model = AutoModelForCausalLM.from_pretrained(
99
  model_path,
100
  torch_dtype=torch.float16,
101
+ device_map="auto" # Automatically map model layers to GPU
102
  )
103
  else:
104
+ # Load model on CPU if no GPU is available
105
  model = AutoModelForCausalLM.from_pretrained(
106
  model_path,
107
+ torch_dtype=torch.float32, # Use float32 for compatibility with CPU
108
+ low_cpu_mem_usage=True # Reduce memory usage on CPU
109
  )
110
  return model, tokenizer
111
 
112
+ # Create FastAPI app
113
  app = FastAPI(title="DeepSeek QA with KV Cache API")
114
+
115
+ # Global variables to store the cache, origin length, and model/tokenizer
116
  cache_store = {}
117
+
118
+ # Initialize model and tokenizer at startup
119
  model, tokenizer = load_model_and_tokenizer()
120
 
121
  class QueryRequest(BaseModel):
122
  query: str
123
  max_new_tokens: Optional[int] = 150
 
124
def clean_response(response_text):
    """
    Strip chat-template markers and duplicated lines from a raw model reply.

    First tries to pull the text delimited by an <|assistant|> tag and the
    next role tag; failing that, removes every <|...|> marker outright and
    de-duplicates the remaining lines while preserving their order.
    """
    import re

    # Preferred path: content between the assistant tag and the next role tag.
    tag_re = re.compile(
        r'<\|assistant\|>\s*(.*?)(?:<\/\|assistant\|>|<\|user\|>|<\|system\|>)',
        re.DOTALL,
    )
    for segment in tag_re.findall(response_text):
        candidate = segment.strip()
        # Skip empty or degenerate captures; require a minimally long answer.
        if candidate and not candidate.startswith("<|") and len(candidate) > 5:
            return candidate

    # Fallback: aggressively drop every tag marker.
    stripped = re.sub(r'<\|.*?\|>', '', response_text)
    stripped = re.sub(r'<\/\|.*?\|>', '', stripped)

    # Keep the first occurrence of each non-empty line, in order.
    seen = []
    for raw_line in stripped.strip().split('\n'):
        raw_line = raw_line.strip()
        if raw_line and raw_line not in seen:
            seen.append(raw_line)

    joined = '\n'.join(seen)
    # Remove any role marker left dangling at the very end.
    joined = re.sub(r'<\/?\|.*?\|>\s*$', '', joined)
    return joined.strip()
 
161
  @app.post("/upload-document_to_create_KV_cache")
162
  async def upload_document(file: UploadFile = File(...)):
163
+ """Upload a document and create KV cache for it"""
164
  t1 = time()
165
+
166
+ # Save the uploaded file temporarily
167
  with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as temp_file:
168
  temp_file_path = temp_file.name
169
  content = await file.read()
170
  temp_file.write(content)
171
+
172
  try:
173
+ # Read the document
174
  with open(temp_file_path, "r", encoding="utf-8") as f:
175
  doc_text = f.read()
176
+
177
+ # Create system prompt with document context
178
  system_prompt = f"""
179
  <|system|>
180
  Answer concisely and precisely, You are an assistant who provides concise factual answers.
 
183
  {doc_text}
184
  Question:
185
  """.strip()
186
+
187
+ # Create KV cache
188
  cache, origin_len = get_kv_cache(model, tokenizer, system_prompt)
189
+
190
+ # Generate a unique ID for this document/cache
191
  cache_id = f"cache_{int(time())}"
192
+
193
+ # Store the cache and origin_len
194
  cache_store[cache_id] = {
195
  "cache": cache,
196
  "origin_len": origin_len,
197
  "doc_preview": doc_text[:500] + "..." if len(doc_text) > 500 else doc_text
198
  }
199
+
200
+ # Clean up the temporary file
201
  os.unlink(temp_file_path)
202
+
203
  t2 = time()
204
+
205
  return {
206
  "cache_id": cache_id,
207
  "message": "Document uploaded and cache created successfully",
208
  "doc_preview": cache_store[cache_id]["doc_preview"],
209
  "time_taken": f"{t2 - t1:.4f} seconds"
210
  }
211
+
212
  except Exception as e:
213
+ # Clean up the temporary file in case of error
214
  if os.path.exists(temp_file_path):
215
  os.unlink(temp_file_path)
216
  raise HTTPException(status_code=500, detail=f"Error processing document: {str(e)}")
217
 
218
  @app.post("/generate_answer_from_cache/{cache_id}")
219
  async def generate_answer(cache_id: str, request: QueryRequest):
220
+ """Generate an answer to a question based on the uploaded document"""
221
  t1 = time()
222
+
223
+ # Check if the document/cache exists
224
  if cache_id not in cache_store:
225
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
226
+
227
  try:
228
+ # Get a clean copy of the cache
229
  current_cache = clean_up(
230
+ cache_store[cache_id]["cache"],
231
  cache_store[cache_id]["origin_len"]
232
  )
233
+
234
+ # Prepare input with just the query
235
  full_prompt = f"""
236
  <|user|>
237
  Question: {request.query}
238
  <|assistant|>
239
  """.strip()
240
+
241
  input_ids = tokenizer(full_prompt, return_tensors="pt").input_ids
242
+
243
+ # Generate response
244
  output_ids = generate(model, input_ids, current_cache, max_new_tokens=request.max_new_tokens)
245
  response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
246
+ rep = clean_response(response)
247
  t2 = time()
248
+
249
  return {
250
  "query": request.query,
251
  "answer": rep,
252
  "time_taken": f"{t2 - t1:.4f} seconds"
253
  }
254
+
255
  except Exception as e:
256
  raise HTTPException(status_code=500, detail=f"Error generating answer: {str(e)}")
257
 
258
  @app.post("/save_cache/{cache_id}")
259
  async def save_cache(cache_id: str):
260
+ """Save the cache for a document"""
261
  if cache_id not in cache_store:
262
  raise HTTPException(status_code=404, detail="Document not found. Please upload it first.")
263
+
264
  try:
265
+ # Clean up the cache and save it
266
  cleaned_cache = clean_up(
267
+ cache_store[cache_id]["cache"],
268
  cache_store[cache_id]["origin_len"]
269
  )
270
+
271
  cache_path = f"{cache_id}_cache.pth"
272
  torch.save(cleaned_cache, cache_path)
273
+
274
  return {
275
  "message": f"Cache saved successfully as {cache_path}",
276
  "cache_path": cache_path
277
  }
278
+
279
  except Exception as e:
280
  raise HTTPException(status_code=500, detail=f"Error saving cache: {str(e)}")
281
 
282
  @app.post("/load_cache")
283
  async def load_cache(file: UploadFile = File(...)):
284
+ """Load a previously saved cache"""
285
  with tempfile.NamedTemporaryFile(delete=False, suffix=".pth") as temp_file:
286
  temp_file_path = temp_file.name
287
  content = await file.read()
288
  temp_file.write(content)
289
+
290
  try:
291
+ # Load the cache
292
  loaded_cache = torch.load(temp_file_path)
293
+
294
+ # Generate a unique ID for this cache
295
  cache_id = f"loaded_cache_{int(time())}"
296
+
297
+ # Store the cache (we don't have the original document text)
298
  cache_store[cache_id] = {
299
  "cache": loaded_cache,
300
  "origin_len": loaded_cache.key_cache[0].shape[-2],
301
  "doc_preview": "Loaded from cache file"
302
  }
303
+
304
+ # Clean up the temporary file
305
  os.unlink(temp_file_path)
306
+
307
  return {
308
  "cache_id": cache_id,
309
  "message": "Cache loaded successfully"
310
  }
311
+
312
  except Exception as e:
313
+ # Clean up the temporary file in case of error
314
  if os.path.exists(temp_file_path):
315
  os.unlink(temp_file_path)
316
  raise HTTPException(status_code=500, detail=f"Error loading cache: {str(e)}")
317
 
318
@app.get("/list_of_caches")
async def list_documents():
    """Return a per-cache summary: document preview and original prompt length."""
    documents = {
        cache_id: {
            "doc_preview": entry["doc_preview"],
            "origin_len": entry["origin_len"],
        }
        for cache_id, entry in cache_store.items()
    }
    return {"documents": documents}
329
 
330
@app.get("/")
async def root():
    """Liveness endpoint: confirms the API is up and serving."""
    return {"message": "DeepSeek QA with KV Cache API is running"}
333
 
334
  if __name__ == "__main__":
335
+ # Run the FastAPI app
336
+ uvicorn.run(app, host="0.0.0.0", port=7860)
model/config.json DELETED
@@ -1,28 +0,0 @@
1
- {
2
- "_name_or_path": "facebook/opt-125m",
3
- "activation_dropout": 0.0,
4
- "activation_function": "relu",
5
- "architectures": [
6
- "OPTForCausalLM"
7
- ],
8
- "attention_dropout": 0.0,
9
- "bos_token_id": 2,
10
- "do_layer_norm_before": true,
11
- "dropout": 0.1,
12
- "eos_token_id": 2,
13
- "ffn_dim": 3072,
14
- "hidden_size": 768,
15
- "init_std": 0.02,
16
- "layerdrop": 0.0,
17
- "max_position_embeddings": 2048,
18
- "model_type": "opt",
19
- "num_attention_heads": 12,
20
- "num_hidden_layers": 12,
21
- "pad_token_id": 1,
22
- "prefix": "</s>",
23
- "torch_dtype": "float16",
24
- "transformers_version": "4.21.0.dev0",
25
- "use_cache": true,
26
- "vocab_size": 50272,
27
- "word_embed_proj_dim": 768
28
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
model/generation_config.json DELETED
@@ -1,7 +0,0 @@
1
- {
2
- "_from_model_config": true,
3
- "bos_token_id": 2,
4
- "eos_token_id": 2,
5
- "pad_token_id": 1,
6
- "transformers_version": "4.27.0.dev0"
7
- }
 
 
 
 
 
 
 
 
model/merges.txt DELETED
The diff for this file is too large to render. See raw diff
 
model/pytorch_model.bin DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:2d74da6615135c58cf3cf9ad4cb11e7c613ff9e55fe658a47ab83b6c8d1174a9
3
- size 250540281
 
 
 
 
model/special_tokens_map.json DELETED
@@ -1 +0,0 @@
1
- {"bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
 
 
model/tokenizer_config.json DELETED
@@ -1 +0,0 @@
1
- {"errors": "replace", "unk_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "</s>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "pad_token": {"content": "<pad>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "add_bos_token": true, "special_tokens_map_file": null, "name_or_path": "patrickvonplaten/opt-30b"}
 
 
model/vocab.json DELETED
The diff for this file is too large to render. See raw diff
 
requirements.txt CHANGED
@@ -1,8 +1,6 @@
1
  fastapi
2
  uvicorn
3
- torch
4
  transformers
5
- pyngrok
6
  accelerate
7
- python-multipart
8
- pydantic
 
1
fastapi
uvicorn
transformers
torch
accelerate
bitsandbytes
# Required by FastAPI for File(...)/UploadFile form parsing used in app.py
python-multipart
# Imported directly by app.py (QueryRequest model); keep explicit rather
# than relying on fastapi's transitive dependency
pydantic