Spaces:

aiqknow
/

phi3-api

Sleeping

aiqknow committed 18 days ago

Commit

6993d08

verified ·

1 Parent(s): ff1b35a

Upload 4 files

Files changed (1) hide show

app.py CHANGED Viewed

@@ -6,18 +6,18 @@ from huggingface_hub import hf_hub_download
 app = FastAPI()
-# Model configuration
-MODEL_REPO = "bartowski/Phi-3-mini-4k-instruct-GGUF"
-MODEL_FILE = "Phi-3-mini-4k-instruct-Q4_K_M.gguf"
-print("Downloading model...")
 model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
-print("Loading model (this might take a minute on CPU)...")
 llm = Llama(
     model_path=model_path,
     n_ctx=2048,
-    n_threads=2,  # HF Free tier has 2 vCPUs
     verbose=False
 )
@@ -26,18 +26,22 @@ class PromptRequest(BaseModel):
 @app.get("/")
 def read_root():
-    return {"message": "Phi-3 Mini API is running. Use POST /api for inference."}
 @app.post("/api")
 async def generate_response(request: PromptRequest):
     try:
-        # Format the prompt for Phi-3 Instruct
-        formatted_prompt = f"<|user|>\n{request.prompt}<|end|>\n<|assistant|>"
         output = llm(
             formatted_prompt,
-            max_tokens=512,
-            stop=["<|end|>"],
             echo=False
         )
@@ -48,6 +52,7 @@ async def generate_response(request: PromptRequest):
             "text": response_text
         }
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":

 app = FastAPI()
+# Switching to Gemma-2-2B-Instruct (High Quality & Good Speed)
+MODEL_REPO = "bartowski/gemma-2-2b-it-GGUF"
+MODEL_FILE = "gemma-2-2b-it-Q4_K_M.gguf"
+print("Downloading Gemma-2-2B model...")
 model_path = hf_hub_download(repo_id=MODEL_REPO, filename=MODEL_FILE)
+print("Loading Gemma for high-quality CPU inference...")
 llm = Llama(
     model_path=model_path,
     n_ctx=2048,
+    n_threads=2,
     verbose=False
 )
 @app.get("/")
 def read_root():
+    return {"message": "Gemma-2-2B-IT API is running."}
+@app.get("/health")
+def health_check():
+    return {"status": "alive"}
 @app.post("/api")
 async def generate_response(request: PromptRequest):
     try:
+        # Gemma 2 Prompt Format
+        formatted_prompt = f"<start_of_turn>user\n{request.prompt}<end_of_turn>\n<start_of_turn>model\n"
         output = llm(
             formatted_prompt,
+            max_tokens=1024,
+            stop=["<end_of_turn>"],
             echo=False
         )
             "text": response_text
         }
     except Exception as e:
+        print(f"Error: {e}")
         raise HTTPException(status_code=500, detail=str(e))
 if __name__ == "__main__":