premalt committed
Commit d4f4413 · 1 Parent(s): e7a9684

fix max tokens

Files changed (1)
  1. main.py +15 -20
main.py CHANGED

@@ -6,17 +6,18 @@ from pydantic import BaseModel
 from huggingface_hub import InferenceClient
 from typing import List
 
-
 app = FastAPI()
 client = InferenceClient("openai-community/gpt2-medium")
 
-SYSTEM_PROMPT = "You are a very powerful AI to generate interesting stories for short-form content consumption. Make sure to hook the readers attention in the first few seconds. Make sure to be engaging and creative in your responses."
-
+SYSTEM_PROMPT = (
+    "You are a very powerful AI to generate interesting stories for short-form content "
+    "consumption. Make sure to hook the reader's attention in the first few seconds. "
+    "Make sure to be engaging and creative in your responses."
+)
 
 class Item(BaseModel):
     prompt: str
     history: List[str] = []
-    # system_prompt: str = "You are a very powerful AI assistant."
     temperature: float = 0.0
     max_new_tokens: int = 1048
     top_p: float = 0.15
@@ -26,28 +27,24 @@ class Item(BaseModel):
 def format_prompt(message, history):
     prompt = "<s>"
     for user_prompt, bot_response in history:
-        prompt += f"[INST] {user_prompt} [/INST]"
-        prompt += f" {bot_response}</s> "
+        prompt += f"[INST] {user_prompt} [/INST] {bot_response}</s> "
     prompt += f"[INST] {message} [/INST]"
     return prompt
 
 
 def generate(item: Item):
     temperature = max(float(item.temperature), 1e-2)
-    # generate_kwargs = dict(
-    #     temperature=temperature,
-    #     max_new_tokens=item.max_new_tokens,
-    #     top_p=float(item.top_p),
-    #     repetition_penalty=item.repetition_penalty,
-    #     do_sample=True,
-    #     seed=42,
-    # )
 
     formatted_prompt = format_prompt(f"{SYSTEM_PROMPT}, {item.prompt}", item.history)
+
+    input_token_length = len(formatted_prompt.split())
+    max_allowed_tokens = 1024 - input_token_length
+    max_new_tokens = min(item.max_new_tokens, max_allowed_tokens)
+
     stream = client.text_generation(
         formatted_prompt,
         temperature=temperature,
-        max_new_tokens=item.max_new_tokens,
+        max_new_tokens=max_new_tokens,
         top_p=float(item.top_p),
         repetition_penalty=item.repetition_penalty,
         do_sample=True,
@@ -56,10 +53,10 @@ def generate(item: Item):
         details=True,
         return_full_text=False,
     )
+
     output = "".join(response.token.text for response in stream)
-    # Remove unwanted sequences or patterns (e.g., <s>, [/INST], etc.)
-    output = re.sub(r"<[^>]+>", "", output)  # Remove any HTML-like tags
-    output = re.sub(r"\s+", " ", output).strip()  # Clean up extra whitespace
+    output = re.sub(r"<[^>]+>", "", output)
+    output = re.sub(r"\s+", " ", output).strip()
 
     return output
 
@@ -68,7 +65,6 @@ def generate(item: Item):
 async def generate_text(
     prompt: str,
     history: List[str] = [],
-    # system_prompt: str = "You are a very powerful AI assistant.",
     temperature: float = 0.0,
     max_new_tokens: int = 1048,
     top_p: float = 0.15,
@@ -77,7 +73,6 @@ async def generate_text(
     item = Item(
         prompt=prompt,
         history=history,
-        # system_prompt=system_prompt,
         temperature=temperature,
         max_new_tokens=max_new_tokens,
         top_p=top_p,
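A note on the fix: the new code budgets output tokens as 1024 minus len(formatted_prompt.split()), but GPT-2's 1024-slot context window is measured in BPE tokens, which usually outnumber whitespace-separated words, and max_allowed_tokens can go negative for a long prompt. Below is a minimal sketch of a stricter budget, assuming the transformers package is available; the helper name clamp_max_new_tokens is illustrative, not part of the commit.

# Sketch only: count prompt tokens with the model's own tokenizer instead of
# whitespace splitting. GPT-2's BPE typically yields more tokens than words,
# so the commit's estimate can under-count and overflow the context window.
# Assumes `transformers` is installed; the helper name is hypothetical.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2-medium")

def clamp_max_new_tokens(formatted_prompt: str, requested: int,
                         context_window: int = 1024) -> int:
    input_token_length = len(tokenizer.encode(formatted_prompt))
    max_allowed_tokens = context_window - input_token_length
    # Floor at 1 so an oversized prompt never produces a negative budget,
    # which the whitespace-based version in the commit can do.
    return max(1, min(requested, max_allowed_tokens))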
 
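For reference, a hedged usage sketch that exercises the patched path directly, assuming the file is importable as main and the Hugging Face Inference API is reachable; Item and generate are the names shown in the diff above.

# Hypothetical usage: build an Item from the fields in the diff and call the
# patched generate(). An empty history skips format_prompt's pair loop; the
# text_generation call requires network access to the hosted model.
from main import Item, generate

item = Item(prompt="Tell me a two-sentence mystery story.", history=[])
print(generate(item))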