bsny committed
Commit eccc79e · verified · 1 Parent(s): 9364783

Update app.py

Files changed (1)
  1. app.py +11 -18
app.py CHANGED
@@ -1,25 +1,18 @@
-from fastapi import FastAPI, Request
-from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from transformers import AutoTokenizer, AutoModelForCausalLM
 import torch
 
-app = FastAPI()
-
-# Load model and tokenizer
-model_id = "meta-llama/Llama-3.1-8B"
+model_id = "hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4"
 
-tokenizer = AutoTokenizer.from_pretrained(model_id)
-
+tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=True)
 model = AutoModelForCausalLM.from_pretrained(
     model_id,
-    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
-    device_map="auto"  # Will auto-detect if CUDA or CPU
+    device_map="auto",
+    torch_dtype=torch.float16,
+    low_cpu_mem_usage=True,
+    use_auth_token=True  # ⬅️ ensures gated model access
 )
 
-pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=256)
-
-@app.post("/chat")
-async def chat(request: Request):
-    data = await request.json()
-    prompt = data.get("prompt", "")
-    output = pipe(prompt)[0]['generated_text']
-    return {"response": output}
+def generate(prompt):
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    outputs = model.generate(**inputs, max_new_tokens=128)
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
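The commit swaps the FastAPI endpoint and full-precision meta-llama/Llama-3.1-8B for a pre-quantized GPTQ INT4 Instruct checkpoint exposed through a plain generate() helper. A minimal sketch of how the new helper might be exercised after this change (the prompt string is illustrative; loading a GPTQ checkpoint assumes optimum and an AutoGPTQ-compatible backend are installed, and that access to the gated repo has been granted, e.g. via huggingface-cli login):

# smoke_test.py — hypothetical caller for the new app.py helper
# Importing app runs the module-level tokenizer/model loading above.
from app import generate

if __name__ == "__main__":
    # Greedy decoding of up to 128 new tokens, per generate()'s defaults.
    print(generate("Explain GPTQ INT4 quantization in one sentence."))

Note that recent transformers releases deprecate the use_auth_token argument in favor of token=True; the calls above still work, but may emit a deprecation warning on newer versions.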