abenkbp committed on
Commit
da75526
·
1 Parent(s): 9f99610
Files changed (1) hide show
  1. data/models/llama3-1-70b.py +14 -11
data/models/llama3-1-70b.py CHANGED
@@ -1,25 +1,19 @@
1
  from flask import Flask, request, jsonify
2
- from huggingface_hub import login
3
  import spaces
4
- import transformers
5
- import torch
6
  import os
7
 
 
 
8
  # Initialize Flask app
9
  app = Flask(__name__)
10
 
11
  api_key = os.getenv("UCODE_SECRET")
12
  login(api_key,add_to_git_credential=True)
13
 
14
- model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
15
 
16
- pipeline = transformers.pipeline(
17
- "text-generation",
18
- model=model_id,
19
- model_kwargs={"torch_dtype": torch.bfloat16,"quantization_config": {"load_in_4bit": True}},
20
- device="cuda",
21
- token=True
22
- )
23
 
24
  @app.route('/chat', methods=['POST'])
25
  @spaces.GPU(enable_queue=True)
@@ -31,6 +25,15 @@ def chat_completion():
31
  temperature = data[0].get('temperature', 0.7)
32
  top_p = data[0].get('top_p', 0.95)
33
 
 
 
 
 
 
 
 
 
 
34
  try:
35
  outputs = pipeline(
36
  user_input,
 
1
  from flask import Flask, request, jsonify
2
+ from huggingface_hub import login, InferenceClient
3
  import spaces
 
 
4
  import os
5
 
6
+
7
+
8
  # Initialize Flask app
9
  app = Flask(__name__)
10
 
11
  api_key = os.getenv("UCODE_SECRET")
12
  login(api_key,add_to_git_credential=True)
13
 
14
+ client = InferenceClient()
15
 
16
+ model_id = "meta-llama/Meta-Llama-3.1-70B-Instruct"
 
 
 
 
 
 
17
 
18
  @app.route('/chat', methods=['POST'])
19
  @spaces.GPU(enable_queue=True)
 
25
  temperature = data[0].get('temperature', 0.7)
26
  top_p = data[0].get('top_p', 0.95)
27
 
28
+ chat = client.chat.completions.create(
29
+ model=model_id,
30
+ messages=user_input,
31
+ stream=False,
32
+ max_tokens=max_tokens,
33
+ temperature=temperature,
34
+ top_p=top_p
35
+ )
36
+
37
  try:
38
  outputs = pipeline(
39
  user_input,