Azzindani committed (verified)
Commit 40a19a5 · 1 Parent(s): 25ff041

Update app.py

Files changed (1):
  1. app.py (+24, -28)
app.py CHANGED
@@ -1,30 +1,31 @@
+# Ref: https://huggingface.co/spaces/ysharma/Chat_with_Meta_llama3_8b
+
 import gradio as gr
 import os
 import spaces
+import torch
 from transformers import GemmaTokenizer, AutoModelForCausalLM
 from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
 from threading import Thread
 
-# Set an environment variable
-HF_TOKEN = os.environ.get("HF_TOKEN", None)
-
 
 DESCRIPTION = '''
 <div>
-<h1 style="text-align: center;">deepseek-ai/DeepSeek-R1-Distill-Llama-8B</h1>
+<h1 style="text-align: center;">非公式Llama-3.1-Swallow-8B-Instruct-v0.1</h1>
+<p>tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.1の非公式デモだよ。 <a href="https://huggingface.co/tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.1"><b>tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.1</b></a>.</p>
 </div>
 '''
 
 LICENSE = """
 <p/>
-
 ---
+Built with Meta Llama 3.1
 """
 
 PLACEHOLDER = """
 <div style="padding: 30px; text-align: center; display: flex; flex-direction: column; align-items: center;">
-<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">DeepSeek-R1-Distill-Llama-8B</h1>
-<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">Ask me anything...</p>
+<h1 style="font-size: 28px; margin-bottom: 2px; opacity: 0.55;">Meta llama3.1</h1>
+<p style="font-size: 18px; margin-bottom: 2px; opacity: 0.65;">なんでもきいてね</p>
 </div>
 """
 
@@ -34,7 +35,6 @@ h1 {
   text-align: center;
   display: block;
 }
-
 #duplicate-button {
   margin: auto;
   color: white;
@@ -43,17 +43,11 @@ h1 {
 }
 """
 
-model_path = "Azzindani/Qwen2.5_1.5B_Instruct_ID_Legal"
-
 # Load the tokenizer and model
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-model = AutoModelForCausalLM.from_pretrained(model_path, device_map="auto") # to("cuda:0")
-terminators = [
-    tokenizer.eos_token_id,
-    tokenizer.convert_tokens_to_ids("<|eot_id|>")
-]
+tokenizer = AutoTokenizer.from_pretrained("Azzindani/Qwen2.5_1.5B_Instruct_ID_Legal")
+model = AutoModelForCausalLM.from_pretrained("Azzindani/Qwen2.5_1.5B_Instruct_ID_Legal", torch_dtype=torch.bfloat16, device_map="auto")
 
-@spaces.GPU(duration=120)
+@spaces.GPU()
 def chat_llama3_8b(message: str,
                    history: list,
                    temperature: float,
@@ -70,11 +64,12 @@ def chat_llama3_8b(message: str,
         str: The generated response.
     """
     conversation = []
+    conversation.append({"role": "system", "content": "あなたは誠実で優秀な日本人のアシスタントです。日本語で聞かれた場合、必ず日本語で返答してください。"})
     for user, assistant in history:
         conversation.extend([{"role": "user", "content": user}, {"role": "assistant", "content": assistant}])
     conversation.append({"role": "user", "content": message})
 
-    input_ids = tokenizer.apply_chat_template(conversation, return_tensors="pt").to(model.device)
+    input_ids = tokenizer.apply_chat_template(conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt").to(model.device)
 
     streamer = TextIteratorStreamer(tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True)
 
@@ -84,7 +79,8 @@
         max_new_tokens=max_new_tokens,
         do_sample=True,
         temperature=temperature,
-        eos_token_id=terminators,
+        top_p=0.9,
+        repetition_penalty=1.1,
     )
     # This will enforce greedy generation (do_sample=False) when the temperature is passed 0, avoiding the crash.
     if temperature == 0:
@@ -96,7 +92,7 @@
     outputs = []
     for text in streamer:
         outputs.append(text)
-        #print(outputs)
+        print(outputs)
         yield "".join(outputs)
 
 
@@ -106,6 +102,7 @@ chatbot=gr.Chatbot(height=450, placeholder=PLACEHOLDER, label='Gradio ChatInterf
 with gr.Blocks(fill_height=True, css=css) as demo:
 
     gr.Markdown(DESCRIPTION)
+    gr.DuplicateButton(value="Duplicate Space for private use", elem_id="duplicate-button")
     gr.ChatInterface(
         fn=chat_llama3_8b,
         chatbot=chatbot,
@@ -115,7 +112,7 @@ with gr.Blocks(fill_height=True, css=css) as demo:
            gr.Slider(minimum=0,
                      maximum=1,
                      step=0.1,
-                     value=0.5,
+                     value=0.6,
                      label="Temperature",
                      render=False),
            gr.Slider(minimum=128,
@@ -126,11 +123,11 @@ with gr.Blocks(fill_height=True, css=css) as demo:
                      render=False ),
        ],
        examples=[
-           ['How to setup a human base on Mars? Give short answer.'],
-           ['Explain theory of relativity to me like I’m 8 years old.'],
-           ['What is 9,000 * 9,000?'],
-           ['Write a pun-filled happy birthday message to my friend Alex.'],
-           ['Justify why a penguin might make a good king of the jungle.']
+           ['小学生にもわかるように相対性理論を教えてください。'],
+           ['宇宙の起源を知るための方法をステップ・バイ・ステップで教えてください。'],
+           ['1から100までの素数を求めるスクリプトをPythonで書いてください。'],
+           ['友達の陽葵にあげる誕生日プレゼントを考えてください。ただし、陽葵は中学生で、私は同じクラスの男性であることを考慮してください。'],
+           ['ペンギンがジャングルの王様であることを正当化するように説明してください。']
        ],
        cache_examples=False,
    )
@@ -138,5 +135,4 @@ with gr.Blocks(fill_height=True, css=css) as demo:
    gr.Markdown(LICENSE)
 
 if __name__ == "__main__":
-    demo.launch()
-
+    demo.launch()
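
For readers who don't parse Japanese: the new DESCRIPTION announces an unofficial demo of tokyotech-llm/Llama-3.1-Swallow-8B-Instruct-v0.1, the placeholder says roughly "Ask me anything", the added system prompt instructs the model "You are a sincere and excellent Japanese assistant; when asked in Japanese, always answer in Japanese", and the new example prompts ask (among other things) for relativity explained to a grade-schooler, a Python script listing the primes from 1 to 100, and a justification of the penguin as king of the jungle. Note that the code itself still loads Azzindani/Qwen2.5_1.5B_Instruct_ID_Legal.

The heart of the updated app.py is the standard Transformers threaded-streaming pattern: model.generate() blocks until completion, so it runs on a worker thread while TextIteratorStreamer hands decoded text back to the UI loop. Below is a minimal, self-contained sketch of that pattern using the model id this commit loads; the prompt, max_new_tokens value, and the printing loop are illustrative, not part of the commit.

import torch
from threading import Thread
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

model_id = "Azzindani/Qwen2.5_1.5B_Instruct_ID_Legal"  # model loaded by this commit
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)

# Render the conversation with the model's chat template and tokenize it.
conversation = [{"role": "user", "content": "Hello!"}]  # illustrative prompt
input_ids = tokenizer.apply_chat_template(
    conversation, tokenize=True, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# skip_prompt=True keeps the echoed prompt out of the streamed output;
# timeout bounds how long the consuming loop waits for the next chunk.
streamer = TextIteratorStreamer(
    tokenizer, timeout=10.0, skip_prompt=True, skip_special_tokens=True
)

# generate() blocks until done, so run it on a worker thread and drain
# decoded chunks from the streamer on the main thread as they arrive.
generate_kwargs = dict(
    input_ids=input_ids,
    streamer=streamer,
    max_new_tokens=256,
    do_sample=True,
    temperature=0.6,
    top_p=0.9,
    repetition_penalty=1.1,
)
Thread(target=model.generate, kwargs=generate_kwargs).start()
for chunk in streamer:
    print(chunk, end="", flush=True)

As the comment in app.py notes, temperature == 0 combined with do_sample=True can crash sampling, which is why the handler flips to do_sample=False in that case before starting the thread.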