Update app.py
app.py CHANGED
@@ -1,97 +1,151 @@
-import gradio as gr
 import os
-import torch
-from transformers import (
-    AutoTokenizer,
-    AutoModelForCausalLM,
-    TextIteratorStreamer,
-    pipeline,
-)
 from threading import Thread
+from typing import Iterator
 
-# Download and load model and tokenizer
-tokenizer = AutoTokenizer.from_pretrained(checkpoint, trust_remote_code=True, token=access_token)
-model = AutoModelForCausalLM.from_pretrained(
-    checkpoint, torch_dtype=torch.bfloat16, device_map="auto", trust_remote_code=True, token=access_token
-)
-
-# Text generation pipeline
-phi2 = pipeline(
-    "text-generation",
-    tokenizer=tokenizer,
-    model=model,
-    pad_token_id=tokenizer.eos_token_id,
-    eos_token_id=tokenizer.eos_token_id,
-    device_map="auto",
-)
-
-
-# Function that accepts a prompt and generates text using the phi2 pipeline
-def generate(message, chat_history, max_new_tokens):
-    instruction = "You are Abhishek Rai, Lead sales At PerfectSprout. Here is your Personality: He is energetic smart and has 17 years of sales experience. You are to strictly follow your personality and respond like Abhishek Rai on a cold Call. Be quick to judge the relevance of the pitch and proceed only if the product/pitch promises to add some value to your work. You possess comprehensive real-world knowledge, enabling you to scrutinize and challenge any implausible claims, inaccuracies, or conventional fallacies presented by a sales rep during a cold call."
-    final_prompt = f"Instruction: {instruction}\n"
-
-    for sent, received in chat_history:
-        final_prompt += "User: " + sent + "\n"
-        final_prompt += "Assistant: " + received + "\n"
-
-    final_prompt += "User: " + message + "\n"
-    final_prompt += "Output:"
-
-    # Streamer
-    streamer = TextIteratorStreamer(
-        tokenizer=tokenizer, skip_prompt=True, skip_special_tokens=True, timeout=300.0
-    )
-    thread = Thread(
-        target=phi2,
-        kwargs={
-            "text_inputs": final_prompt,
-            "max_new_tokens": max_new_tokens,
-            "streamer": streamer,
-        },
-    )
-    thread.start()
-
-    if "Assistant:" in response:
-        response = response.split("Assistant:")[1].strip()
-
-#
-    examples=[["Who is Leonhard Euler?"]],
-)
+import gradio as gr
+import spaces
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
+
+
+DESCRIPTION = """\
+# Llama 3.2 3B Instruct
+Llama 3.2 3B is Meta's latest iteration of open LLMs.
+This is a demo of [`meta-llama/Llama-3.2-3B-Instruct`](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct), fine-tuned for instruction following.
+For more details, please check [our post](https://huggingface.co/blog/llama32).
+"""
+
+# Access token for the model (if required)
+access_token = os.getenv('HF_TOKEN')
+
+# Download the base model
+# model_id = "./models/Llama-32-3B-Instruct"
+model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct"
+MAX_MAX_NEW_TOKENS = 2048
+DEFAULT_MAX_NEW_TOKENS = 1024
+MAX_INPUT_TOKEN_LENGTH = int(os.getenv("MAX_INPUT_TOKEN_LENGTH", "4096"))
+
+device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
+
+# model_id = "nltpt/Llama-3.2-3B-Instruct"
+tokenizer = AutoTokenizer.from_pretrained(model_id, token=access_token)
+model = AutoModelForCausalLM.from_pretrained(
+    model_id,
+    device_map="auto",
+    torch_dtype=torch.bfloat16,
+    token=access_token,
+)
+model.eval()
+
+
+@spaces.GPU(duration=90)
+def generate(
+    message: str,
+    chat_history: list[tuple[str, str]],
+    system_prompt: str,
+    max_new_tokens: int = 1024,
+    temperature: float = 0.6,
+    top_p: float = 0.9,
+    top_k: int = 50,
+    repetition_penalty: float = 1.2,
+) -> Iterator[str]:
+    conversation = [{"role": "system", "content": system_prompt}]
+    for user, assistant in chat_history:
+        conversation.extend(
+            [
+                {"role": "user", "content": user},
+                {"role": "assistant", "content": assistant},
+            ]
+        )
+    conversation.append({"role": "user", "content": message})
+
+    input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt")
+    if input_ids.shape[1] > MAX_INPUT_TOKEN_LENGTH:
+        input_ids = input_ids[:, -MAX_INPUT_TOKEN_LENGTH:]
+        gr.Warning(f"Trimmed input from conversation as it was longer than {MAX_INPUT_TOKEN_LENGTH} tokens.")
+    input_ids = input_ids.to(model.device)
+
+    streamer = TextIteratorStreamer(tokenizer, timeout=20.0, skip_prompt=True, skip_special_tokens=True)
+    generate_kwargs = dict(
+        {"input_ids": input_ids},
+        streamer=streamer,
+        max_new_tokens=max_new_tokens,
+        do_sample=True,
+        top_p=top_p,
+        top_k=top_k,
+        temperature=temperature,
+        num_beams=1,
+        repetition_penalty=repetition_penalty,
+    )
+    t = Thread(target=model.generate, kwargs=generate_kwargs)
+    t.start()
+
+    outputs = []
+    for text in streamer:
+        outputs.append(text)
+        yield "".join(outputs)
+
+
+chat_interface = gr.ChatInterface(
+    fn=generate,
+    additional_inputs=[
+        gr.Textbox(
+            label="System Prompt",
+            placeholder="Enter system prompt here...",
+            lines=2,
+        ),
+        gr.Slider(
+            label="Max new tokens",
+            minimum=1,
+            maximum=MAX_MAX_NEW_TOKENS,
+            step=1,
+            value=DEFAULT_MAX_NEW_TOKENS,
+        ),
+        gr.Slider(
+            label="Temperature",
+            minimum=0.1,
+            maximum=4.0,
+            step=0.1,
+            value=0.6,
+        ),
+        gr.Slider(
+            label="Top-p (nucleus sampling)",
+            minimum=0.05,
+            maximum=1.0,
+            step=0.05,
+            value=0.9,
+        ),
+        gr.Slider(
+            label="Top-k",
+            minimum=1,
+            maximum=1000,
+            step=1,
+            value=50,
+        ),
+        gr.Slider(
+            label="Repetition penalty",
+            minimum=1.0,
+            maximum=2.0,
+            step=0.05,
+            value=1.2,
+        ),
+    ],
+    stop_btn=None,
+    examples=[
+        ["Hello there! How are you doing?"],
+        ["Can you explain briefly to me what is the Python programming language?"],
+        ["Explain the plot of Cinderella in a sentence."],
+        ["How many hours does it take a man to eat a Helicopter?"],
+        ["Write a 100-word article on 'Benefits of Open-Source in AI research'"],
+    ],
+    cache_examples=False,
+)
+
+with gr.Blocks(css="style.css", fill_height=True) as demo:
+    gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
+    chat_interface.render()
+
+if __name__ == "__main__":
+    demo.queue(max_size=20).launch()
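For reference: the core pattern the new generate() relies on, tokenizer.apply_chat_template() plus a TextIteratorStreamer fed by model.generate() on a background thread, also works outside Gradio. The sketch below is a minimal, hypothetical illustration of that pattern, not part of the commit. It assumes the same model_id as app.py and an HF_TOKEN environment variable with access to the repo; the conversation contents are placeholders.

import os
from threading import Thread

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Mikhil-jivus/Llama-32-3B-FineTuned-Instruct"  # same checkpoint as app.py
token = os.getenv("HF_TOKEN")  # assumed to be set, as in the Space

tokenizer = AutoTokenizer.from_pretrained(model_id, token=token)
model = AutoModelForCausalLM.from_pretrained(
    model_id, device_map="auto", torch_dtype=torch.bfloat16, token=token
)

# apply_chat_template() renders the messages into the model's chat format;
# add_generation_prompt=True appends the assistant turn header so the model
# starts generating a reply instead of continuing the user message.
conversation = [
    {"role": "system", "content": "You are a helpful assistant."},  # placeholder
    {"role": "user", "content": "Hello there! How are you doing?"},  # placeholder
]
input_ids = tokenizer.apply_chat_template(
    conversation, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# model.generate() blocks until generation finishes, so it runs on a worker
# thread while the streamer yields decoded text chunks on the main thread.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
Thread(
    target=model.generate,
    kwargs=dict(input_ids=input_ids, streamer=streamer, max_new_tokens=256, do_sample=True),
).start()

for chunk in streamer:
    print(chunk, end="", flush=True)

In app.py the same loop accumulates the chunks and yields the running string, which is what lets gr.ChatInterface redraw the reply incrementally; the timeout=20.0 passed to the streamer there bounds how long iteration waits for the next chunk before raising. The @spaces.GPU(duration=90) decorator is specific to Hugging Face ZeroGPU Spaces, where it requests a GPU for each call (here up to 90 seconds); outside that environment the decorator has no effect.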