Adi362 committed on
Commit
f5c37d3
·
verified ·
1 Parent(s): 53f226c

removed token limits from our end

Browse files
Files changed (1) hide show
  1. app.py +21 -19
app.py CHANGED
@@ -1,27 +1,19 @@
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
- from typing import List
5
 
6
  app = FastAPI()
7
 
8
 
9
# Default system prompt prepended to every /v1/chat request.
# Enforces terse, question-free, API-style answers (1–2 sentences).
SYSTEM_PROMPT = """You are edyx-convo.
You are a neutral, concise, API-grade assistant.

STRICT RULES:
- Do NOT ask questions unless explicitly requested
- Do NOT use greetings
- Do NOT introduce yourself conversationally
- Do NOT use emojis or marketing language
- Answer in 1–2 sentences by default
- Be factual and minimal
"""
 
21
-
22
# Module-level model load: the GGUF model is read once at import time.
# NOTE(review): closing paren of this call falls outside the visible diff
# context; reconstructed here. n_ctx=2048 caps prompt + completion tokens.
llm = Llama(
    model_path="/models/model.gguf",
    n_ctx=2048,
    n_threads=2,   # small CPU budget — presumably a constrained container; confirm
    n_batch=128,
    verbose=False
)
@@ -34,23 +26,33 @@ class Message(BaseModel):
34
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat: the full conversation to complete."""
    # Ordered message history; each Message carries a role and content
    # (Message is declared earlier in this file, outside this view).
    messages: List[Message]
 
 
 
 
 
37
@app.post("/v1/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for the supplied conversation.

    Builds a flat "role: content" transcript headed by SYSTEM_PROMPT,
    runs the local llama.cpp model with fixed sampling settings, and
    returns the model name, generated text, and total token usage.
    """
    # Flatten the history into one newline-terminated turn per message.
    transcript = "".join(
        f"{msg.role.lower()}: {msg.content}\n" for msg in req.messages
    )
    prompt = f"{SYSTEM_PROMPT}\n\n{transcript}assistant:"

    completion = llm(
        prompt,
        max_tokens=128,
        temperature=0.4,
        top_p=0.9,
        repeat_penalty=1.15,
        stop=["user:", "assistant:"],  # cut off if the model starts a new turn
    )

    reply = completion["choices"][0]["text"].strip()
    return {
        "model": "edyx-convo",
        "text": reply,
        "tokens": completion["usage"]["total_tokens"],
    }
 
1
  from fastapi import FastAPI
2
  from pydantic import BaseModel
3
  from llama_cpp import Llama
4
+ from typing import List, Optional
5
 
6
  app = FastAPI()
7
 
8
 
9
# Default system prompt for /v1/chat; a request message with role "system"
# overrides it (see the chat handler below).
SYSTEM_PROMPT = """You are Edyx.
You are a helpful, harmless, and honest AI assistant.
"""
12
 
 
13
# Module-level model load: the GGUF model is read once at import time.
# NOTE(review): closing paren of this call falls outside the visible diff
# context; reconstructed here. n_ctx raised to 4096 in this commit so the
# model, not this server, bounds generation length.
llm = Llama(
    model_path="/models/model.gguf",
    n_ctx=4096,
    n_threads=2,   # small CPU budget — presumably a constrained container; confirm
    n_batch=128,
    verbose=False
)
 
26
class ChatRequest(BaseModel):
    """Request body for POST /v1/chat.

    Optional sampling fields override the server defaults; clients may
    simply omit them.
    """
    # Ordered message history; Message is declared earlier in this file.
    messages: List[Message]
    # NOTE(review): Optional[...] also admits an explicit JSON null, which
    # is forwarded to llm() as None — confirm llama_cpp accepts None here.
    max_tokens: Optional[int] = 1024
    temperature: Optional[float] = 0.7
    repetition_penalty: Optional[float] = 1.1
+
33
@app.post("/v1/chat")
def chat(req: ChatRequest):
    """Generate one assistant reply for the supplied conversation.

    A message with role "system" replaces the default SYSTEM_PROMPT.
    Sampling parameters come from the request, falling back to server
    defaults. Returns the model name, generated text, and token usage.
    """
    # Bug fix: the previous code did `prompt = f"{m.content}\n\n"` on a
    # system message, reassigning the whole prompt and silently discarding
    # every conversation turn accumulated before it. Track the system
    # prompt and the transcript separately instead.
    system_prompt = SYSTEM_PROMPT + "\n\n"
    transcript = ""
    for m in req.messages:
        role = m.role.lower()
        if role == "system":
            # Last system message wins, matching the old override intent.
            system_prompt = f"{m.content}\n\n"
        else:
            transcript += f"{role}: {m.content}\n"

    prompt = system_prompt + transcript + "assistant:"

    # The request fields are Optional, so a client may send an explicit
    # JSON null; fall back to the declared defaults rather than passing
    # None into the sampler.
    output = llm(
        prompt,
        max_tokens=req.max_tokens if req.max_tokens is not None else 1024,
        temperature=req.temperature if req.temperature is not None else 0.7,
        top_p=0.9,
        repeat_penalty=(
            req.repetition_penalty
            if req.repetition_penalty is not None
            else 1.1
        ),
        stop=["user:", "assistant:", "<|end|>", "User:"],
    )

    text = output["choices"][0]["text"].strip()
    return {
        "model": "edyx-convo",
        "text": text,
        "tokens": output["usage"]["total_tokens"],
    }