SarmaHighOnAI committed on
Commit
56e9328
·
verified ·
1 Parent(s): f158928

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +23 -12
app.py CHANGED
@@ -1,24 +1,35 @@
1
- import os
2
  from fastapi import FastAPI
3
  from pydantic import BaseModel
4
- from huggingface_hub import InferenceClient
 
5
 
6
  app = FastAPI()
7
- client = InferenceClient(api_key=os.environ.get("HF_TOKEN"))
 
 
 
 
 
 
 
 
 
 
 
8
 
9
  class Request(BaseModel):
10
  prompt: str
11
 
12
  @app.get("/")
13
  def home():
14
- return {"status": "Running"}
15
 
16
  @app.post("/generate")
17
- def generate_text(request: Request):
18
- messages = [{"role": "user", "content": request.prompt}]
19
- response = client.chat_completion(
20
- messages=messages,
21
- model="HuggingFaceTB/SmolLM2-1.7B-Instruct",
22
- max_tokens=500
23
- )
24
- return {"response": response.choices[0].message.content}
 
 
1
import os

from fastapi import FastAPI
from pydantic import BaseModel
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

app = FastAPI()

# Model artifact to serve: a GGUF quantization of the fine-tuned physics tutor.
REPO_ID = "SarmaHighOnAI/physics-tutor-gguf"
FILENAME = "llama-3.2-3b-instruct.Q4_K_M.gguf"

print("Downloading your fine-tuned model...")
# hf_hub_download fetches the weights into the local HF cache inside the
# container (a no-op on warm restarts when the file is already cached).
model_path = hf_hub_download(repo_id=REPO_ID, filename=FILENAME)

print("Loading model into memory...")
# n_ctx: combined prompt + completion token window.
# N_THREADS is overridable via the environment for larger hardware;
# the default of 2 is safe for the HF Spaces free tier.
N_THREADS = int(os.environ.get("N_THREADS", "2"))
llm = Llama(model_path=model_path, n_ctx=2048, n_threads=N_THREADS)
19
 
20
class Request(BaseModel):
    """Request body for POST /generate: the user's raw prompt text."""

    prompt: str
22
 
23
@app.get("/")
def home():
    """Health-check endpoint confirming the API is up and serving locally."""
    status_payload = {
        "status": "Running",
        "message": "Physics Tutor API is Live (Local Inference)",
    }
    return status_payload
26
 
27
@app.post("/generate")
def generate(request: Request):
    """Run the prompt through the local GGUF model and return its completion."""
    # Llama-3 chat template: wrap the user turn, then open the assistant turn
    # so the model continues as the assistant.
    formatted_prompt = (
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{request.prompt}"
        "<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Inference runs entirely in-process — no API key or network call needed.
    output = llm(formatted_prompt, max_tokens=256, stop=["<|eot_id|>"], echo=False)

    return {"response": output["choices"][0]["text"]}