FabioSantos committed
Commit c2beca9 · verified · 1 Parent(s): e9e29f0

Update app.py

Files changed (1)
  1. app.py +22 -15
app.py CHANGED
@@ -1,7 +1,7 @@
 from fastapi import FastAPI
 from pydantic import BaseModel
 from huggingface_hub import hf_hub_download
-from transformers import AutoTokenizer, AutoModelForCausalLM
+from llama_cpp import Llama
 
 # Input data model definition
 class Question(BaseModel):
@@ -16,12 +16,14 @@ model_basename = "unsloth.Q8_0.gguf"
 model_path = hf_hub_download(repo_id=model_name_or_path, filename=model_basename)
 print(f"Model path: {model_path}")
 
-# Load the tokenizer and the model
-tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)
-model = AutoModelForCausalLM.from_pretrained(model_name_or_path).to("cuda")
-
-# Enable optimized inference
-FastLanguageModel.for_inference(model)  # Enable optimized inference
+# Model setup with llama_cpp
+lcpp_llm = Llama(
+    model_path=model_path,
+    n_threads=2,      # CPU threads for generation
+    n_batch=512,      # batch size for prompt processing
+    n_gpu_layers=-1,  # offload all layers to the GPU
+    n_ctx=4096,       # context window size
+)
 
 # Prompt format used during fine-tuning
 alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.
@@ -42,14 +44,18 @@ def get_response(text: str) -> str:
         text,
         ""
     )
-    inputs = tokenizer([formatted_prompt], return_tensors="pt").to("cuda")
-
-    # Generate the response
-    outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
-    response = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
-
-    print(f"Generated Response: {response}")
-    return response
+    response = lcpp_llm(
+        prompt=formatted_prompt,
+        max_tokens=256,
+        temperature=0.5,
+        top_p=0.95,
+        top_k=50,
+        stop=['\n'],  # Try using a specific stop token
+        echo=True
+    )
+    response_text = response['choices'][0]['text']
+    print(f"Generated Response: {response_text}")
+    return response_text
 
 # Endpoint that receives a question and returns the answer
 @app.post("/ask")
@@ -62,3 +68,4 @@ if __name__ == "__main__":
     import uvicorn
     uvicorn.run(app, host="0.0.0.0", port=8000)
 
+
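
One detail worth noting: with echo=True, llama-cpp-python includes the prompt itself in response['choices'][0]['text'], so /ask will send the whole Alpaca prompt back to the client along with the completion. A minimal sketch of stripping it, reusing the variable names from the diff (alternatively, set echo=False):

# Sketch: remove the echoed prompt so only the model's completion is returned.
# `response` and `formatted_prompt` are the variables from get_response above.
response_text = response['choices'][0]['text']
if response_text.startswith(formatted_prompt):
    response_text = response_text[len(formatted_prompt):].lstrip()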
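
For quick testing, a hypothetical client for the updated endpoint could look like the sketch below. The body of the Question model is outside this diff, so the "text" field name is an assumption; the host and port come from the uvicorn.run call.

# Hypothetical client sketch: the "text" field name is an assumption,
# since the Question model's fields are not shown in this diff.
import requests

resp = requests.post(
    "http://localhost:8000/ask",  # host/port from uvicorn.run above
    json={"text": "What is fine-tuning?"},
    timeout=120,  # GGUF inference can be slow without a GPU
)
resp.raise_for_status()
print(resp.json())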