Chris STC committed
Commit · 44d3d18
1 Parent(s): 8532aae
Update app.py
app.py CHANGED
@@ -1,37 +1,46 @@
+import gradio as gr
+import os
+import wget
+from llama_cpp import Llama
+import random

+os.system('CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python')
+
+url = 'https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML/resolve/main/WizardLM-7B-uncensored.ggmlv3.q2_K.bin'
+filename = wget.download(url)
+llm2 = Llama(model_path=filename, seed=random.randint(1, 2**31))

+title = """<h1 align="center">Chat with awesome WizardLM 7b model!</h1><br>"""
+description = "This model is awesome for its size! It is only a twentieth the size of ChatGPT but is around 90% as good. However, please don't rely on WizardLM to provide 100% true information, as it might sometimes be wrong."

+def bot(user_message, temperature, top_p, repeat_penalty):
+    tokens3 = llm2.tokenize(user_message.encode())
+    token4 = llm2.tokenize(b"\n\n### Response:")
+    tokens = tokens3 + token4
+
+    output = ""
+    for token in llm2.generate(tokens, top_k=50, top_p=top_p, temp=temperature, repeat_penalty=repeat_penalty):
+        text = llm2.detokenize([token])
+        output += text.decode()
+        if token == llm2.token_eos():
+            break

+    # Keep only the text after the "### Response:" marker; we just want the bot's reply
+    response_start_idx = output.find("### Response:") + len("### Response:")
+    return output[response_start_idx:].strip()

+interface = gr.Interface(
+    fn=bot,
+    inputs=[
+        gr.Textbox(label="Your Message", placeholder="Type your message here..."),
+        gr.Slider(minimum=0, maximum=2, default=1, label="Temperature"),
+        gr.Slider(minimum=0, maximum=1, default=0.73, label="Top P"),
+        gr.Slider(minimum=0, maximum=2, default=1.1, label="Repeat Penalty")
+    ],
+    outputs="text",
+    live=True,
+    description=description,
+    title=title
)

+interface.launch(debug=True)
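
The committed code fetches the GGML file with wget on every startup. As a minimal sketch, not part of the commit, the download could be reused across restarts; the os.path.exists check and the assumption that wget.download saves under the URL's basename are illustrative additions:

import os
import wget

url = 'https://huggingface.co/TheBloke/WizardLM-7B-uncensored-GGML/resolve/main/WizardLM-7B-uncensored.ggmlv3.q2_K.bin'
local_name = os.path.basename(url)  # assumed default output name used by wget.download
# Reuse a previously downloaded copy instead of fetching the model again on each restart.
filename = local_name if os.path.exists(local_name) else wget.download(url)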
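For comparison, the manual tokenize/generate/detokenize loop in bot() can also be expressed with llama-cpp-python's high-level completion call. This is only a sketch, assuming llm2 is the Llama instance created above; bot_highlevel and the max_tokens cap are illustrative, not part of the commit:

def bot_highlevel(user_message, temperature, top_p, repeat_penalty):
    # Same prompt shape as the committed bot(): the user text followed by the response marker.
    prompt = user_message + "\n\n### Response:"
    result = llm2(
        prompt,
        max_tokens=512,          # illustrative cap; the committed loop stops only at EOS
        temperature=temperature,
        top_p=top_p,
        top_k=50,                # matches the committed generate() call
        repeat_penalty=repeat_penalty,
        echo=False,              # return only the completion, not the prompt
    )
    return result["choices"][0]["text"].strip()

A quick manual check with illustrative values: print(bot_highlevel("What is the capital of France?", 1.0, 0.73, 1.1))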