# import gradio as gr
# from huggingface_hub import InferenceClient
#
# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
#
# ## None type
# def respond(
#     message: str,
#     history: list[tuple[str, str]],  # This will not be used
#     system_message: str,
#     max_tokens: int,
#     temperature: float,
#     top_p: float,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     # Append only the latest user message
#     messages.append({"role": "user", "content": message})
#     response = ""
#     try:
#         # Generate response from the model
#         for message in client.chat_completion(
#             messages,
#             max_tokens=max_tokens,
#             stream=True,
#             temperature=temperature,
#             top_p=top_p,
#         ):
#             if message.choices[0].delta.content is not None:
#                 token = message.choices[0].delta.content
#                 response += token
#                 yield response
#     except Exception as e:
#         yield f"An error occurred: {e}"
#
# if __name__ == "__main__":
#     demo.launch()
## Running smoothly CHATBOT
# import gradio as gr
# from huggingface_hub import InferenceClient
#
# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("meta-llama/Meta-Llama-3-8B-Instruct")
#
# def respond(
#     message: str,
#     history: list[tuple[str, str]],  # This will not be used
#     system_message: str,
#     max_tokens: int,
#     temperature: float,
#     top_p: float,
# ):
#     # Build the messages list
#     messages = [{"role": "system", "content": system_message}]
#     messages.append({"role": "user", "content": message})
#     response = ""
#     try:
#         # Generate response from the model
#         for msg in client.chat_completion(
#             messages=messages,
#             max_tokens=max_tokens,
#             stream=True,
#             temperature=temperature,
#             top_p=top_p,
#         ):
#             if msg.choices[0].delta.content is not None:
#                 token = msg.choices[0].delta.content
#                 response += token
#                 yield response
#     except Exception as e:
#         yield f"An error occurred: {e}"
#
# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )
#
# if __name__ == "__main__":
#     demo.launch()
### 26 Aug: Use a pipeline as high-level logic
# import spaces
# import os
# import subprocess
# from llama_cpp import Llama
# from llama_cpp_agent import LlamaCppAgent, MessagesFormatterType
# from llama_cpp_agent.providers import LlamaCppPythonProvider
# from llama_cpp_agent.chat_history import BasicChatHistory
# from llama_cpp_agent.chat_history.messages import Roles
# import gradio as gr
# from huggingface_hub import hf_hub_download
#
# huggingface_token = os.getenv("HF_TOKEN")
#
# # Download the Meta-Llama-3.1-8B-Instruct model
# hf_hub_download(
#     repo_id="bartowski/Meta-Llama-3.1-8B-Instruct-GGUF",
#     filename="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
#     local_dir="./models",
#     token=huggingface_token
# )
#
# llm = None
# llm_model = None
#
# @spaces.GPU(duration=120)
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     model,
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
#     top_k,
#     repeat_penalty,
# ):
#     chat_template = MessagesFormatterType.GEMMA_2
#     global llm
#     global llm_model
#     # Load model only if it's not already loaded or if a new model is selected
#     if llm is None or llm_model != model:
#         try:
#             llm = Llama(
#                 model_path=f"models/{model}",
#                 flash_attn=True,
#                 n_gpu_layers=81,  # Adjust based on available GPU resources
#                 n_batch=1024,
#                 n_ctx=8192,
#             )
#             llm_model = model
#         except Exception as e:
#             return f"Error loading model: {str(e)}"
#     provider = LlamaCppPythonProvider(llm)
#     agent = LlamaCppAgent(
#         provider,
#         system_prompt=f"{system_message}",
#         predefined_messages_formatter_type=chat_template,
#         debug_output=True
#     )
#     settings = provider.get_provider_default_settings()
#     settings.temperature = temperature
#     settings.top_k = top_k
#     settings.top_p = top_p
#     settings.max_tokens = max_tokens
#     settings.repeat_penalty = repeat_penalty
#     settings.stream = True
#     messages = BasicChatHistory()
#     # Add user and assistant messages to the history
#     for msn in history:
#         user = {'role': Roles.user, 'content': msn[0]}
#         assistant = {'role': Roles.assistant, 'content': msn[1]}
#         messages.add_message(user)
#         messages.add_message(assistant)
#     # Stream the response
#     try:
#         stream = agent.get_chat_response(
#             message,
#             llm_sampling_settings=settings,
#             chat_history=messages,
#             returns_streaming_generator=True,
#             print_output=False
#         )
#         outputs = ""
#         for output in stream:
#             outputs += output
#             yield outputs
#     except Exception as e:
#         yield f"Error during response generation: {str(e)}"
#
# description = """<p align="center">Using the Meta-Llama-3.1-8B-Instruct Model</p>"""
#
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Dropdown(
#             ['Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf'],
#             value="Meta-Llama-3.1-8B-Instruct-Q5_K_M.gguf",
#             label="Model"
#         ),
#         gr.Textbox(value="You are a helpful assistant.", label="System message"),
#         gr.Slider(minimum=1, maximum=4096, value=2048, step=1, label="Max tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p",
#         ),
#         gr.Slider(
#             minimum=0,
#             maximum=100,
#             value=40,
#             step=1,
#             label="Top-k",
#         ),
#         gr.Slider(
#             minimum=0.0,
#             maximum=2.0,
#             value=1.1,
#             step=0.1,
#             label="Repetition penalty",
#         ),
#     ],
#     retry_btn="Retry",
#     undo_btn="Undo",
#     clear_btn="Clear",
#     submit_btn="Send",
#     title="Chat with Meta-Llama-3.1-8B-Instruct using llama.cpp",
#     description=description,
#     chatbot=gr.Chatbot(
#         scale=1,
#         likeable=False,
#         show_copy_button=True
#     )
# )
#
# if __name__ == "__main__":
#     demo.launch()
#### 03: Llama 3.1 8B
# import os
# import time
# import spaces
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer, BitsAndBytesConfig
# import gradio as gr
# from threading import Thread
#
# MODEL_LIST = ["meta-llama/Meta-Llama-3.1-8B-Instruct"]
# HF_TOKEN = os.environ.get("HF_API_TOKEN", None)
# print(HF_TOKEN, "######$$$$$$$$$$$$$$$")
# MODEL = os.environ.get("MODEL_ID", "meta-llama/Meta-Llama-3.1-8B-Instruct")
#
# TITLE = "<h1><center>Meta-Llama3.1-8B</center></h1>"
# PLACEHOLDER = """
# <center>
# <p>Hi! How can I help you today?</p>
# </center>
# """
# CSS = """
# .duplicate-button {
#     margin: auto !important;
#     color: white !important;
#     background: black !important;
#     border-radius: 100vh !important;
# }
# h3 {
#     text-align: center;
# }
# """
#
# device = "cuda"  # for GPU usage or "cpu" for CPU usage
#
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_compute_dtype=torch.bfloat16,
#     bnb_4bit_use_double_quant=True,
#     bnb_4bit_quant_type="nf4")
# tokenizer = AutoTokenizer.from_pretrained(MODEL)
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL,
#     torch_dtype=torch.bfloat16,
#     device_map="auto",
#     quantization_config=quantization_config)
#
# @spaces.GPU()
# def stream_chat(
#     message: str,
#     history: list,
#     system_prompt: str,
#     temperature: float = 0.8,
#     max_new_tokens: int = 1024,
#     top_p: float = 1.0,
#     top_k: int = 20,
#     penalty: float = 1.2,
# ):
#     print(f'message: {message}')
#     print(f'history: {history}')
#     conversation = [
#         {"role": "system", "content": system_prompt}
#     ]
#     for prompt, answer in history:
#         conversation.extend([
#             {"role": "user", "content": prompt},
#             {"role": "assistant", "content": answer},
#         ])
#     conversation.append({"role": "user", "content": message})
#     input_ids = tokenizer.apply_chat_template(conversation, add_generation_prompt=True, return_tensors="pt").to(model.device)
#     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
#     generate_kwargs = dict(
#         input_ids=input_ids,
#         max_new_tokens=max_new_tokens,
#         do_sample=False if temperature == 0 else True,
#         top_p=top_p,
#         top_k=top_k,
#         temperature=temperature,
#         repetition_penalty=penalty,
#         eos_token_id=[128001, 128008, 128009],
#         streamer=streamer,
#     )
#     with torch.no_grad():
#         thread = Thread(target=model.generate, kwargs=generate_kwargs)
#         thread.start()
#     buffer = ""
#     for new_text in streamer:
#         buffer += new_text
#         yield buffer
#
# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
#
# with gr.Blocks(css=CSS, theme="soft") as demo:
#     gr.HTML(TITLE)
#     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
#     gr.ChatInterface(
#         fn=stream_chat,
#         chatbot=chatbot,
#         fill_height=True,
#         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
#         additional_inputs=[
#             gr.Textbox(
#                 value="You are a helpful assistant",
#                 label="System Prompt",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=0,
#                 maximum=1,
#                 step=0.1,
#                 value=0.8,
#                 label="Temperature",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=128,
#                 maximum=8192,
#                 step=1,
#                 value=1024,
#                 label="Max new tokens",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=0.0,
#                 maximum=1.0,
#                 step=0.1,
#                 value=1.0,
#                 label="top_p",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=1,
#                 maximum=20,
#                 step=1,
#                 value=20,
#                 label="top_k",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=0.0,
#                 maximum=2.0,
#                 step=0.1,
#                 value=1.2,
#                 label="Repetition penalty",
#                 render=False,
#             ),
#         ],
#         examples=[
#             ["Help me study vocabulary: write a sentence for me to fill in the blank, and I'll try to pick the correct option."],
#             ["What are 5 creative things I could do with my kids' art? I don't want to throw them away, but it's also so much clutter."],
#             ["Tell me a random fun fact about the Roman Empire."],
#             ["Show me a code snippet of a website's sticky header in CSS and JavaScript."],
#         ],
#         cache_examples=False,
#     )
#
# if __name__ == "__main__":
#     demo.launch()
########### New client key
# import os
# import time
# import spaces
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
# import gradio as gr
# from threading import Thread
#
# MODEL = "THUDM/LongWriter-llama3.1-8b"
# TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"
# PLACEHOLDER = """
# <center>
# <p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
# </center>
# """
# CSS = """
# .duplicate-button {
#     margin: auto !important;
#     color: white !important;
#     background: black !important;
#     border-radius: 100vh !important;
# }
# h3 {
#     text-align: center;
# }
# """
#
# device = "cuda" if torch.cuda.is_available() else "cpu"
#
# tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
# model = AutoModelForCausalLM.from_pretrained(MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto")
# model = model.eval()
#
# @spaces.GPU()
# def stream_chat(
#     message: str,
#     history: list,
#     system_prompt: str,
#     temperature: float = 0.5,
#     max_new_tokens: int = 32768,
#     top_p: float = 1.0,
#     top_k: int = 50,
# ):
#     print(f'message: {message}')
#     print(f'history: {history}')
#     full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
#     for prompt, answer in history:
#         full_prompt += f"[INST]{prompt}[/INST]{answer}"
#     full_prompt += f"[INST]{message}[/INST]"
#     inputs = tokenizer(full_prompt, truncation=False, return_tensors="pt").to(device)
#     context_length = inputs.input_ids.shape[-1]
#     streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)
#     generate_kwargs = dict(
#         inputs=inputs.input_ids,
#         max_new_tokens=max_new_tokens,
#         do_sample=True,
#         top_p=top_p,
#         top_k=top_k,
#         temperature=temperature,
#         num_beams=1,
#         streamer=streamer,
#     )
#     thread = Thread(target=model.generate, kwargs=generate_kwargs)
#     thread.start()
#     buffer = ""
#     for new_text in streamer:
#         buffer += new_text
#         yield buffer
#
# chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)
#
# with gr.Blocks(css=CSS, theme="soft") as demo:
#     gr.HTML(TITLE)
#     gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
#     gr.ChatInterface(
#         fn=stream_chat,
#         chatbot=chatbot,
#         fill_height=True,
#         additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
#         additional_inputs=[
#             gr.Textbox(
#                 value="You are a helpful assistant capable of generating long-form content.",
#                 label="System Prompt",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=0,
#                 maximum=1,
#                 step=0.1,
#                 value=0.5,
#                 label="Temperature",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=1024,
#                 maximum=32768,
#                 step=1024,
#                 value=32768,
#                 label="Max new tokens",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=0.0,
#                 maximum=1.0,
#                 step=0.1,
#                 value=1.0,
#                 label="Top p",
#                 render=False,
#             ),
#             gr.Slider(
#                 minimum=1,
#                 maximum=100,
#                 step=1,
#                 value=50,
#                 label="Top k",
#                 render=False,
#             ),
#         ],
#         examples=[
#             ["Write a 5000-word comprehensive guide on machine learning for beginners."],
#             ["Create a detailed 3000-word business plan for a sustainable energy startup."],
#             ["Compose a 2000-word short story set in a futuristic underwater city."],
#             ["Develop a 4000-word research proposal on the potential effects of climate change on global food security."],
#         ],
#         cache_examples=False,
#     )
#
# if __name__ == "__main__":
#     demo.launch()
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
from threading import Thread

# Model and constants
MODEL = "THUDM/LongWriter-llama3.1-8b"
TITLE = "<h1><center>AreaX LLC-llama3.1-8b</center></h1>"

PLACEHOLDER = """
<center>
<p>Hi! I'm AreaX AI Agent, capable of generating 10,000+ words. How can I assist you today?</p>
</center>
"""

CSS = """
.duplicate-button {
    margin: auto !important;
    color: white !important;
    background: black !important;
    border-radius: 100vh !important;
}
h3 {
    text-align: center;
}
"""

# Check device
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL, torch_dtype=torch.bfloat16, trust_remote_code=True, device_map="auto"
).eval()
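# Note: torch.bfloat16 with device_map="auto" assumes a CUDA GPU and the `accelerate`
# package; on a CPU-only host a float32 fallback may be needed. A hedged sketch of that
# fallback (not part of the original app) would be:
#
# dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL, torch_dtype=dtype, trust_remote_code=True, device_map="auto"
# ).eval()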
def stream_chat(
    message: str,
    history: list,
    system_prompt: str,
    temperature: float = 0.5,
    max_new_tokens: int = 4096,  # Lowered max tokens for efficiency
    top_p: float = 1.0,
    top_k: int = 50,
):
    try:
        # Build the prompt: system block followed by alternating [INST] turns
        full_prompt = f"<<SYS>>\n{system_prompt}\n<</SYS>>\n\n"
        for prompt, answer in history:
            full_prompt += f"[INST]{prompt}[/INST]{answer}"
        full_prompt += f"[INST]{message}[/INST]"

        # Tokenize input (truncated to keep the prompt within a manageable context)
        inputs = tokenizer(full_prompt, truncation=True, max_length=2048, return_tensors="pt").to(device)
        context_length = inputs.input_ids.shape[-1]

        # Set up TextIteratorStreamer for streaming the response
        streamer = TextIteratorStreamer(tokenizer, timeout=60.0, skip_prompt=True, skip_special_tokens=True)

        # Generation parameters; fall back to greedy decoding when temperature is 0
        generate_kwargs = dict(
            inputs=inputs.input_ids,
            max_new_tokens=max_new_tokens,
            do_sample=temperature > 0,
            top_p=top_p,
            top_k=top_k,
            temperature=temperature,
            num_beams=1,
            streamer=streamer,
        )

        # Generate text in a separate thread to avoid blocking
        thread = Thread(target=model.generate, kwargs=generate_kwargs)
        thread.start()

        # Stream the partial response back to the UI
        buffer = ""
        for new_text in streamer:
            buffer += new_text
            yield buffer
    except Exception as e:
        yield f"An error occurred: {str(e)}"
# Gradio setup
chatbot = gr.Chatbot(height=600, placeholder=PLACEHOLDER)

with gr.Blocks(css=CSS, theme="soft") as demo:
    gr.HTML(TITLE)
    gr.DuplicateButton(value="Duplicate Space for private use", elem_classes="duplicate-button")
    gr.ChatInterface(
        fn=stream_chat,
        chatbot=chatbot,
        fill_height=True,
        additional_inputs_accordion=gr.Accordion(label="⚙️ Parameters", open=False, render=False),
        additional_inputs=[
            gr.Textbox(
                value="You are a helpful assistant capable of generating long-form content.",
                label="System Prompt",
                render=False,
            ),
            gr.Slider(
                minimum=0,
                maximum=1,
                step=0.1,
                value=0.5,
                label="Temperature",
                render=False,
            ),
            gr.Slider(
                minimum=1024,
                maximum=4096,  # Reduced to a more manageable value
                step=1024,
                value=4096,
                label="Max new tokens",
                render=False,
            ),
            gr.Slider(
                minimum=0.0,
                maximum=1.0,
                step=0.1,
                value=1.0,
                label="Top p",
                render=False,
            ),
            gr.Slider(
                minimum=1,
                maximum=100,
                step=1,
                value=50,
                label="Top k",
                render=False,
            ),
        ],
    )

if __name__ == "__main__":
    demo.launch()