Update app.py
app.py CHANGED
@@ -2,148 +2,74 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download

-
-
-
-
-
-
-
-
+# Download the model
+model_name = "Mykes/med_tinyllama_gguf"
+filename = "unsloth.Q4_K_M.gguf"
+model_path = hf_hub_download(repo_id=model_name, filename=filename)
+
+# Initialize the model
+# model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
+model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
+# def preload_model(model, preload_tokens=1024):
+#     # Dummy call to load model into RAM by accessing parts of it
+#     try:
+#         dummy_input = " " * preload_tokens
+#         _ = model(dummy_input, max_tokens=1)
+#         print("Model preloaded into RAM.")
+#     except Exception as e:
+#         print(f"Error preloading model: {e}")
+
+# # Preload the model into RAM
+# preload_model(model)
+def respond(
+    message,
+    history: list[tuple[str, str]],
+    system_message,
+    max_tokens,
+    temperature,
+    top_p,
+):
+    history = history[-3:]
+    # Construct the prompt
+    prompt = f"<s>{system_message}\n\n"
+    for user_msg, assistant_msg in history:
+        prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
+    prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
+
+    # Generate response
+    response = ""
+    for token in model(
+        prompt,
+        max_tokens=max_tokens,
+        temperature=temperature,
+        top_p=top_p,
+        stream=True,
+        stop=["<|end|>", "</s>"]
     ):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        response += token['choices'][0]['text']
+        yield response.strip()
+
+# Create the Gradio interface
+demo = gr.ChatInterface(
+    respond,
+    undo_btn="Отменить",
+    clear_btn="Очистить",
+    additional_inputs=[
+        # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
+        gr.Textbox(value="", label="System message"),
+        gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
+        gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
+        gr.Slider(
+            minimum=0.1,
+            maximum=1.0,
+            value=0.9,
+            step=0.05,
+            label="Top-p (nucleus sampling)",
+        ),
+    ],
+    title="Med TinyLlama Chat",
+    description="Chat with the Med TinyLlama model for medical information.",
+)

 if __name__ == "__main__":
-
-    model_name = "Mykes/med_tinyllama_gguf"
-    filename = "unsloth.Q4_K_M.gguf"
-    model_path = hf_hub_download(repo_id=model_name, filename=filename)
-
-    # Initialize the model
-    model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
-
-    # Create a responder function with the model
-    respond = create_responder(model)
-
-    # Create the Gradio interface
-    demo = gr.ChatInterface(
-        respond,
-        undo_btn="Отменить",
-        clear_btn="Очистить",
-        additional_inputs=[
-            gr.Textbox(value="", label="System message"),
-            gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
-            gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-            gr.Slider(
-                minimum=0.1,
-                maximum=1.0,
-                value=0.9,
-                step=0.05,
-                label="Top-p (nucleus sampling)"
-            ),
-        ],
-        title="Med TinyLlama Chat",
-        description="Chat with the Med TinyLlama model for medical information.",
-    )
-
-    demo.launch()
-
-
-
-
-
-# import gradio as gr
-# from llama_cpp import Llama
-# from huggingface_hub import hf_hub_download
-
-# # Download the model
-# model_name = "Mykes/med_tinyllama_gguf"
-# filename = "unsloth.Q4_K_M.gguf"
-# model_path = hf_hub_download(repo_id=model_name, filename=filename)
-
-# # Initialize the model
-# # model = Llama(model_path=model_path, n_ctx=2048, n_threads=4, n_batch=32, use_mmap=True, use_mlock=True, rope_freq_base=10000, rope_freq_scale=1.0)
-# model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)
-# # def preload_model(model, preload_tokens=1024):
-# #     # Dummy call to load model into RAM by accessing parts of it
-# #     try:
-# #         dummy_input = " " * preload_tokens
-# #         _ = model(dummy_input, max_tokens=1)
-# #         print("Model preloaded into RAM.")
-# #     except Exception as e:
-# #         print(f"Error preloading model: {e}")
-
-# # # Preload the model into RAM
-# # preload_model(model)
-# def respond(
-#     message,
-#     history: list[tuple[str, str]],
-#     system_message,
-#     max_tokens,
-#     temperature,
-#     top_p,
-# ):
-#     history = history[-3:]
-#     # Construct the prompt
-#     prompt = f"<s>{system_message}\n\n"
-#     for user_msg, assistant_msg in history:
-#         prompt += f"<|user|>{user_msg}<|end|></s> <|assistant|>{assistant_msg}<|end|></s>"
-#     prompt += f"<|user|>{message}<|end|></s> <|assistant|>"
-
-#     # Generate response
-#     response = ""
-#     for token in model(
-#         prompt,
-#         max_tokens=max_tokens,
-#         temperature=temperature,
-#         top_p=top_p,
-#         stream=True,
-#         stop=["<|end|>", "</s>"]
-#     ):
-#         response += token['choices'][0]['text']
-#         yield response.strip()
-
-# # Create the Gradio interface
-# demo = gr.ChatInterface(
-#     respond,
-#     undo_btn="Отменить",
-#     clear_btn="Очистить",
-#     additional_inputs=[
-#         # gr.Textbox(value="You are a friendly medical assistant.", label="System message"),
-#         gr.Textbox(value="", label="System message"),
-#         gr.Slider(minimum=128, maximum=4096, value=2048, step=1, label="Max new tokens"),
-#         gr.Slider(minimum=0.1, maximum=2.0, value=0.7, step=0.1, label="Temperature"),
-#         gr.Slider(
-#             minimum=0.1,
-#             maximum=1.0,
-#             value=0.9,
-#             step=0.05,
-#             label="Top-p (nucleus sampling)",
-#         ),
-#     ],
-#     title="Med TinyLlama Chat",
-#     description="Chat with the Med TinyLlama model for medical information.",
-# )
-
-# if __name__ == "__main__":
-#     demo.launch()
+    demo.launch()
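The new top-level code follows llama-cpp-python's streaming pattern: each chunk yielded by model(..., stream=True) is a completion dict whose new text sits in chunk["choices"][0]["text"], and respond() re-yields the growing string so gr.ChatInterface can render it incrementally. A minimal standalone sketch of the same loop outside Gradio; the question string and generation limits below are illustrative assumptions, not part of the commit:

from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Same checkpoint and Llama settings as app.py above
model_path = hf_hub_download(repo_id="Mykes/med_tinyllama_gguf",
                             filename="unsloth.Q4_K_M.gguf")
model = Llama(model_path=model_path, n_ctx=256, n_threads=2, n_batch=8, use_mlock=True)

# Prompt built the way respond() builds it: empty system message, no history,
# and a made-up example question
prompt = "<s>\n\n<|user|>What are common symptoms of anemia?<|end|></s> <|assistant|>"

response = ""
for chunk in model(prompt, max_tokens=128, temperature=0.7, top_p=0.9,
                   stream=True, stop=["<|end|>", "</s>"]):
    # Each streamed chunk is a completion dict; append its new text
    response += chunk["choices"][0]["text"]
print(response.strip())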