Dmitry Beresnev committed
Commit 9d0ed97 · 1 Parent(s): ba2be63

fix dockerfile, pyproject.toml, app

Files changed (4)
  1. Dockerfile +2 -1
  2. app.py +59 -0
  3. fast_api_service.py +0 -40
  4. pyproject.toml +4 -6
Dockerfile CHANGED
@@ -16,7 +16,8 @@ COPY . /app
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
+    PATH=/home/user/.local/bin:$PATH \
+    HF_HOME=/home/user/.cache/huggingface

EXPOSE 7860
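Note on the new HF_HOME line: app.py (added below) reads this variable to choose its download cache, so setting it in the image pins GGUF downloads to a path owned by the non-root `user`. A minimal sketch of how the variable is consumed — repo and filename are copied from app.py; the snippet itself is illustrative, not part of the commit:

import os
from huggingface_hub import hf_hub_download

# With HF_HOME=/home/user/.cache/huggingface baked into the image,
# the "./models" fallback is never taken inside the container.
cache_dir = os.getenv("HF_HOME", "./models")
model_path = hf_hub_download(
    repo_id="TheBloke/deepseek-coder-6.7B-instruct-GGUF",
    filename="deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    cache_dir=cache_dir,
)
print(f"cached at: {model_path}")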
 
app.py ADDED
@@ -0,0 +1,59 @@
+ from fastapi import FastAPI
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+
+ # GGUF model configuration
+ REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
+ FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+
+ app = FastAPI()
+
+ # Download and cache the GGUF model
+ print(f"Downloading {FILENAME} from {REPO_ID}...")
+ model_path = hf_hub_download(
+     repo_id=REPO_ID,
+     filename=FILENAME,
+     cache_dir=os.getenv("HF_HOME", "./models")
+ )
+ print(f"Model downloaded to: {model_path}")
+
+ # Load the model with llama-cpp-python
+ print("Loading model into memory...")
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=2048,       # Context window
+     n_threads=4,      # CPU threads
+     n_gpu_layers=0,   # Use CPU only (set >0 if GPU available)
+     verbose=False
+ )
+ print("Model loaded successfully!")
+
+
+ @app.post("/v1/chat/completions")
+ def chat(req: dict):
+     messages = req.get("messages", [])
+     max_tokens = req.get("max_tokens", 256)
+     temperature = req.get("temperature", 0.7)
+
+     # Use llama-cpp-python's built-in chat completion
+     response = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         stop=["</s>", "User:", "###"]
+     )
+
+     return {
+         "choices": [{
+             "message": {
+                 "role": "assistant",
+                 "content": response["choices"][0]["message"]["content"]
+             }
+         }]
+     }
+
+
+ @app.get("/")
+ def root():
+     return {"status": "DeepSeek API is online (GGUF)"}
fast_api_service.py DELETED
@@ -1,40 +0,0 @@
1
- from fastapi import FastAPI
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
- import torch
4
-
5
- MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"
6
-
7
- app = FastAPI()
8
-
9
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
- model = AutoModelForCausalLM.from_pretrained(
11
- MODEL_NAME,
12
- torch_dtype=torch.float32,
13
- device_map="cpu"
14
- )
15
-
16
-
17
- @app.post("/v1/chat/completions")
18
- def chat(req: dict):
19
- messages = req.get("messages", [])
20
- content = messages[-1]["content"]
21
-
22
- inputs = tokenizer(content, return_tensors="pt")
23
- outputs = model.generate(
24
- **inputs,
25
- max_new_tokens=256,
26
- temperature=0.7
27
- )
28
-
29
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
30
-
31
- return {
32
- "choices": [{
33
- "message": {"role": "assistant", "content": response}
34
- }]
35
- }
36
-
37
-
38
- @app.get("/")
39
- def root():
40
- return {"status": "DeepSeek API is online"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml CHANGED
@@ -6,11 +6,9 @@ authors = [
  { name = "AI Developer", email = "you@example.com" }
]
requires-python = ">=3.12"
-
dependencies = [
-   "fastapi>=0.100.0",
-   "uvicorn>=0.23.2",
-   "transformers>=4.36.0",
-   "torch>=2.3.0",
-   "accelerate<=1.12.0"
+   "fastapi",
+   "uvicorn",
+   "llama-cpp-python",
+   "huggingface-hub"
]
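With the dependency list trimmed to the llama-cpp stack, the service would typically be started with uvicorn. The actual CMD lives in the Dockerfile and is outside this hunk, so the entry point below is only a plausible sketch (port taken from EXPOSE 7860):

import uvicorn  # listed in the new dependencies

if __name__ == "__main__":
    # Bind to all interfaces on the port the Dockerfile exposes.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)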