Dmitry Beresnev committed
Commit 9d0ed97 · 1 Parent(s): ba2be63

fix dockerfile, pyproject.toml, app

Files changed (4)
  1. Dockerfile +2 -1
  2. app.py +59 -0
  3. fast_api_service.py +0 -40
  4. pyproject.toml +4 -6
Dockerfile CHANGED
@@ -16,7 +16,8 @@ COPY . /app
RUN useradd -m -u 1000 user
USER user
ENV HOME=/home/user \
-    PATH=/home/user/.local/bin:$PATH
+    PATH=/home/user/.local/bin:$PATH \
+    HF_HOME=/home/user/.cache/huggingface

EXPOSE 7860
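Note on the new HF_HOME line: app.py (added below) reads this variable to choose its download cache, so setting it in the image pins GGUF downloads to a path owned by the non-root `user`. A minimal sketch of how the variable is consumed — repo and filename are copied from app.py; the snippet itself is illustrative, not part of the commit:

import os
from huggingface_hub import hf_hub_download

# With HF_HOME=/home/user/.cache/huggingface baked into the image,
# the "./models" fallback is never taken inside the container.
cache_dir = os.getenv("HF_HOME", "./models")
model_path = hf_hub_download(
    repo_id="TheBloke/deepseek-coder-6.7B-instruct-GGUF",
    filename="deepseek-coder-6.7b-instruct.Q4_K_M.gguf",
    cache_dir=cache_dir,
)
print(f"cached at: {model_path}")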
 
app.py ADDED
@@ -0,0 +1,59 @@
+ from fastapi import FastAPI
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+ import os
+
+ # GGUF model configuration
+ REPO_ID = "TheBloke/deepseek-coder-6.7B-instruct-GGUF"
+ FILENAME = "deepseek-coder-6.7b-instruct.Q4_K_M.gguf"
+
+ app = FastAPI()
+
+ # Download and cache the GGUF model
+ print(f"Downloading {FILENAME} from {REPO_ID}...")
+ model_path = hf_hub_download(
+     repo_id=REPO_ID,
+     filename=FILENAME,
+     cache_dir=os.getenv("HF_HOME", "./models")
+ )
+ print(f"Model downloaded to: {model_path}")
+
+ # Load the model with llama-cpp-python
+ print("Loading model into memory...")
+ llm = Llama(
+     model_path=model_path,
+     n_ctx=2048,       # Context window
+     n_threads=4,      # CPU threads
+     n_gpu_layers=0,   # Use CPU only (set >0 if GPU available)
+     verbose=False
+ )
+ print("Model loaded successfully!")
+
+
+ @app.post("/v1/chat/completions")
+ def chat(req: dict):
+     messages = req.get("messages", [])
+     max_tokens = req.get("max_tokens", 256)
+     temperature = req.get("temperature", 0.7)
+
+     # Use llama-cpp-python's built-in chat completion
+     response = llm.create_chat_completion(
+         messages=messages,
+         max_tokens=max_tokens,
+         temperature=temperature,
+         stop=["</s>", "User:", "###"]
+     )
+
+     return {
+         "choices": [{
+             "message": {
+                 "role": "assistant",
+                 "content": response["choices"][0]["message"]["content"]
+             }
+         }]
+     }
+
+
+ @app.get("/")
+ def root():
+     return {"status": "DeepSeek API is online (GGUF)"}
fast_api_service.py DELETED
@@ -1,40 +0,0 @@
1
- from fastapi import FastAPI
2
- from transformers import AutoModelForCausalLM, AutoTokenizer
3
- import torch
4
-
5
- MODEL_NAME = "deepseek-ai/deepseek-coder-6.7b-instruct"
6
-
7
- app = FastAPI()
8
-
9
- tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
10
- model = AutoModelForCausalLM.from_pretrained(
11
- MODEL_NAME,
12
- torch_dtype=torch.float32,
13
- device_map="cpu"
14
- )
15
-
16
-
17
- @app.post("/v1/chat/completions")
18
- def chat(req: dict):
19
- messages = req.get("messages", [])
20
- content = messages[-1]["content"]
21
-
22
- inputs = tokenizer(content, return_tensors="pt")
23
- outputs = model.generate(
24
- **inputs,
25
- max_new_tokens=256,
26
- temperature=0.7
27
- )
28
-
29
- response = tokenizer.decode(outputs[0], skip_special_tokens=True)
30
-
31
- return {
32
- "choices": [{
33
- "message": {"role": "assistant", "content": response}
34
- }]
35
- }
36
-
37
-
38
- @app.get("/")
39
- def root():
40
- return {"status": "DeepSeek API is online"}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
pyproject.toml CHANGED
@@ -6,11 +6,9 @@ authors = [
  { name = "AI Developer", email = "you@example.com" }
]
requires-python = ">=3.12"
-
dependencies = [
-   "fastapi>=0.100.0",
-   "uvicorn>=0.23.2",
-   "transformers>=4.36.0",
-   "torch>=2.3.0",
-   "accelerate<=1.12.0"
+   "fastapi",
+   "uvicorn",
+   "llama-cpp-python",
+   "huggingface-hub"
]
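With the dependency list trimmed to the llama-cpp stack, the service would typically be started with uvicorn. The actual CMD lives in the Dockerfile and is outside this hunk, so the entry point below is only a plausible sketch (port taken from EXPOSE 7860):

import uvicorn  # listed in the new dependencies

if __name__ == "__main__":
    # Bind to all interfaces on the port the Dockerfile exposes.
    uvicorn.run("app:app", host="0.0.0.0", port=7860)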