aiqknow committed on
Commit
6993d08
·
verified ·
1 Parent(s): ff1b35a

Upload 4 files

Browse files
Files changed (1) hide show
  1. app.py +16 -11
app.py CHANGED
@@ -6,18 +6,18 @@ from huggingface_hub import hf_hub_download
6
 
7
  app = FastAPI()
8
 
9
- # Model configuration
10
- MODEL_REPO = "bartowski/Phi-3-mini-4k-instruct-GGUF"
11
- MODEL_FILE = "Phi-3-mini-4k-instruct-Q4_K_M.gguf"
12
 
13
- print("Downloading model...")
14
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
15
 
16
- print("Loading model (this might take a minute on CPU)...")
17
  llm = Llama(
18
  model_path=model_path,
19
  n_ctx=2048,
20
- n_threads=2, # HF Free tier has 2 vCPUs
21
  verbose=False
22
  )
23
 
@@ -26,18 +26,22 @@ class PromptRequest(BaseModel):
26
 
27
  @app.get("/")
28
  def read_root():
29
- return {"message": "Phi-3 Mini API is running. Use POST /api for inference."}
 
 
 
 
30
 
31
  @app.post("/api")
32
  async def generate_response(request: PromptRequest):
33
  try:
34
- # Format the prompt for Phi-3 Instruct
35
- formatted_prompt = f"<|user|>\n{request.prompt}<|end|>\n<|assistant|>"
36
 
37
  output = llm(
38
  formatted_prompt,
39
- max_tokens=512,
40
- stop=["<|end|>"],
41
  echo=False
42
  )
43
 
@@ -48,6 +52,7 @@ async def generate_response(request: PromptRequest):
48
  "text": response_text
49
  }
50
  except Exception as e:
 
51
  raise HTTPException(status_code=500, detail=str(e))
52
 
53
  if __name__ == "__main__":
 
6
 
7
  app = FastAPI()
8
 
9
+ # Switching to Gemma-2-2B-Instruct (High Quality & Good Speed)
10
+ MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
11
+ MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"
12
 
13
+ print("Downloading Gemma-2-2B model...")
14
  model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
15
 
16
+ print("Loading Gemma for high-quality CPU inference...")
17
  llm = Llama(
18
  model_path=model_path,
19
  n_ctx=2048,
20
+ n_threads=2,
21
  verbose=False
22
  )
23
 
 
26
 
27
  @app.get("/")
28
  def read_root():
29
+ return {"message": "Gemma-2-2B-IT API is running."}
30
+
31
+ @app.get("/health")
32
+ def health_check():
33
+ return {"status": "alive"}
34
 
35
  @app.post("/api")
36
  async def generate_response(request: PromptRequest):
37
  try:
38
+ # Gemma 2 Prompt Format
39
+ formatted_prompt = f"<start_of_turn>user\n{request.prompt}<end_of_turn>\n<start_of_turn>model\n"
40
 
41
  output = llm(
42
  formatted_prompt,
43
+ max_tokens=1024,
44
+ stop=["<end_of_turn>"],
45
  echo=False
46
  )
47
 
 
52
  "text": response_text
53
  }
54
  except Exception as e:
55
+ print(f"Error: {e}")
56
  raise HTTPException(status_code=500, detail=str(e))
57
 
58
  if __name__ == "__main__":