gen

Sleeping

App Files Community

pr0methium committed on Sep 29, 2024

Commit

3d4ff6b

verified ·

1 Parent(s): 72c4724

Update main.py

Browse files

Files changed (1) hide show

main.py +39 -28

main.py CHANGED Viewed

@@ -1,72 +1,83 @@
-import os
 import re
 import asyncio
 from fastapi import FastAPI
-from pydantic import BaseModel, Field
 from huggingface_hub import InferenceClient
 from typing import List
-# Set the cache directory to a writable location
-os.environ['TRANSFORMERS_CACHE'] = '/tmp/huggingface'
 app = FastAPI()
-client = InferenceClient("EleutherAI/gpt-neo-125M")
 SYSTEM_PROMPT = "You are a very powerful AI to generate interesting stories for short-form content consumption. Make sure to hook the readers attention in the first few seconds. Make sure to be engaging and creative in your responses."
-MAX_TOTAL_TOKENS = 2048
 class Item(BaseModel):
     prompt: str
     history: List[str] = []
-    temperature: float = Field(default=0.8, ge=0.0, le=1.0)
-    max_new_tokens: int = Field(default=1024, ge=1, le=MAX_TOTAL_TOKENS)
-    top_p: float = Field(default=0.9, ge=0.0, le=1.0)
-    repetition_penalty: float = Field(default=1.1, ge=0.0)
 def format_prompt(message, history):
-    prompt = ""
     for user_prompt, bot_response in history:
-        prompt += f"Human: {user_prompt}\nAI: {bot_response}\n"
-    prompt += f"Human: {message}\nAI:"
     return prompt
 def generate(item: Item):
     temperature = max(float(item.temperature), 1e-2)
-    formatted_prompt = format_prompt(f"{SYSTEM_PROMPT}\n{item.prompt}", item.history)
-    # A simple approximation for token count
-    estimated_input_tokens = len(formatted_prompt.split())
-    max_new_tokens = min(item.max_new_tokens, MAX_TOTAL_TOKENS - estimated_input_tokens)
-    response = client.text_generation(
         formatted_prompt,
-        max_new_tokens=max_new_tokens,
         temperature=temperature,
         top_p=float(item.top_p),
         repetition_penalty=item.repetition_penalty,
         do_sample=True,
         seed=42,
     )
-    output = response.strip()
-    output = re.sub(r"\s+", " ", output)
     return output
 @app.get("/generate/")
 async def generate_text(
     prompt: str,
     history: List[str] = [],
-    temperature: float = 0.8,
-    max_new_tokens: int = 1024,
-    top_p: float = 0.9,
-    repetition_penalty: float = 1.1,
 ):
     item = Item(
         prompt=prompt,
         history=history,
         temperature=temperature,
         max_new_tokens=max_new_tokens,
         top_p=top_p,

+import uvicorn
 import re
 import asyncio
 from fastapi import FastAPI
+from pydantic import BaseModel
 from huggingface_hub import InferenceClient
 from typing import List
 app = FastAPI()
+client = InferenceClient("mistralai/Mixtral-8x7B-Instruct-v0.1")
 SYSTEM_PROMPT = "You are a very powerful AI to generate interesting stories for short-form content consumption. Make sure to hook the readers attention in the first few seconds. Make sure to be engaging and creative in your responses."
 class Item(BaseModel):
     prompt: str
     history: List[str] = []
+    # system_prompt: str = "You are a very powerful AI assistant."
+    temperature: float = 0.0
+    max_new_tokens: int = 1048
+    top_p: float = 0.15
+    repetition_penalty: float = 1.0
 def format_prompt(message, history):
+    prompt = "<s>"
     for user_prompt, bot_response in history:
+        prompt += f"[INST] {user_prompt} [/INST]"
+        prompt += f" {bot_response}</s> "
+    prompt += f"[INST] {message} [/INST]"
     return prompt
 def generate(item: Item):
     temperature = max(float(item.temperature), 1e-2)
+    # generate_kwargs = dict(
+    #     temperature=temperature,
+    #     max_new_tokens=item.max_new_tokens,
+    #     top_p=float(item.top_p),
+    #     repetition_penalty=item.repetition_penalty,
+    #     do_sample=True,
+    #     seed=42,
+    # )
+    formatted_prompt = format_prompt(f"{SYSTEM_PROMPT}, {item.prompt}", item.history)
+    stream = client.text_generation(
         formatted_prompt,
         temperature=temperature,
+        max_new_tokens=item.max_new_tokens,
         top_p=float(item.top_p),
         repetition_penalty=item.repetition_penalty,
         do_sample=True,
         seed=42,
+        stream=True,
+        details=True,
+        return_full_text=False,
     )
+    output = "".join(response.token.text for response in stream)
+    # Remove unwanted sequences or patterns (e.g., <s>, [/INST], etc.)
+    output = re.sub(r"<[^>]+>", "", output)  # Remove any HTML-like tags
+    output = re.sub(r"\s+", " ", output).strip()  # Clean up extra whitespace
     return output
 @app.get("/generate/")
 async def generate_text(
     prompt: str,
     history: List[str] = [],
+    # system_prompt: str = "You are a very powerful AI assistant.",
+    temperature: float = 0.0,
+    max_new_tokens: int = 1048,
+    top_p: float = 0.15,
+    repetition_penalty: float = 1.0,
 ):
     item = Item(
         prompt=prompt,
         history=history,
+        # system_prompt=system_prompt,
         temperature=temperature,
         max_new_tokens=max_new_tokens,
         top_p=top_p,