CooLLaMACEO committed on
Commit
0c80cd5
·
verified ·
1 Parent(s): db5a668

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +43 -17
main.py CHANGED
@@ -1,25 +1,51 @@
1
- import uvicorn
2
- from fastapi import FastAPI
3
- import ollama
 
 
4
 
5
- app = FastAPI()
6
 
7
- MODEL_NAME = "gemma3" # Ollama will pull this automatically if told to
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  @app.post("/completion")
10
- async def completion(prompt: str):
11
- # This calls the Ollama service running on your machine/server
12
- response = ollama.chat(model=MODEL_NAME, messages=[
13
- {'role': 'user', 'content': prompt},
14
- ])
15
- return {"content": response['message']['content']}
 
 
 
 
 
 
 
16
 
17
- @app.get("/health")
18
- def health():
19
- return {"status": "ready"}
20
 
21
  if __name__ == "__main__":
22
- # Ensure the model is downloaded before starting
23
- print(f"📦 Checking for model {MODEL_NAME}...")
24
- ollama.pull(MODEL_NAME)
25
  uvicorn.run(app, host="0.0.0.0", port=7860)
 
1
+ import os
2
+ from fastapi import FastAPI, Request
3
+ from llama_cpp import Llama
4
+ from huggingface_hub import hf_hub_download
5
+ from contextlib import asynccontextmanager
6
 
7
# Global handle to the loaded model; assigned once during startup.
llm = None

@asynccontextmanager
async def lifespan(app: FastAPI):
    """Fetch and load the GGUF model before the app begins serving requests."""
    global llm

    print("📥 Downloading Gemma-3 from Hub...")
    # hf_hub_download stores the file in the local HF cache and returns its path.
    gguf_path = hf_hub_download(
        repo_id="mradermacher/gemma-3-4b-it-GGUF",
        filename="gemma-3-4b-it.Q4_K_M.gguf",
    )

    print("🚀 Loading Model...")
    llm = Llama(
        model_path=gguf_path,
        n_ctx=2048,
        n_threads=2,  # Hugging Face free tier usually has 2 vCPUs
    )
    print("✅ Ready!")

    # Serve requests from here on; no shutdown cleanup is required.
    yield

app = FastAPI(lifespan=lifespan)
29
 
30
@app.post("/completion")
async def completion(request: Request):
    """Generate a completion for the request body's "prompt" field.

    Expects a JSON body like {"prompt": "..."} and returns
    {"content": "<generated text>"}.
    """
    data = await request.json()
    prompt = data.get("prompt", "")

    # Guard against requests that arrive before the lifespan hook has
    # finished loading the model (previously this raised and returned a 500).
    if llm is None:
        return {"content": "", "error": "Model is still loading; try again shortly."}

    # Gemma chat template. The previous tokens (<|begin_of_text|> /
    # <|end_of_text|>) belong to Llama-3, not Gemma; Gemma models expect
    # <start_of_turn>/<end_of_turn> turn markers. Also drop "user" from the
    # stop list — it truncated any output containing that word.
    formatted_prompt = (
        f"<start_of_turn>user\n{prompt}<end_of_turn>\n<start_of_turn>model\n"
    )

    output = llm(
        formatted_prompt,
        max_tokens=512,
        stop=["<end_of_turn>"],
    )
    return {"content": output["choices"][0]["text"]}
44
 
45
@app.get("/")
def home():
    """Landing route used as a simple liveness check for the service."""
    greeting = {"message": "Gemma-3 API is running on Hugging Face"}
    return greeting
48
 
49
if __name__ == "__main__":
    # Direct-execution entry point. The deferred import keeps uvicorn out of
    # module scope when the app is launched by an external ASGI runner.
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=7860)