dynamodenis254 committed
Commit a4aae63 · verified · 1 Parent(s): ddc82e3

Update app.py

Files changed (1)
app.py +81 -58
app.py CHANGED
@@ -1,70 +1,93 @@
 import gradio as gr
-from huggingface_hub import InferenceClient
-
-
-def respond(
-    message,
-    history: list[dict[str, str]],
-    system_message,
-    max_tokens,
-    temperature,
-    top_p,
-    hf_token: gr.OAuthToken,
-):
-    """
-    For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
-    """
-    client = InferenceClient(token=hf_token.token, model="openai/gpt-oss-20b")
-
-    messages = [{"role": "system", "content": system_message}]
-
-    messages.extend(history)
-
-    messages.append({"role": "user", "content": message})
-
-    response = ""
-
-    for message in client.chat_completion(
-        messages,
-        max_tokens=max_tokens,
-        stream=True,
-        temperature=temperature,
-        top_p=top_p,
-    ):
-        choices = message.choices
-        token = ""
-        if len(choices) and choices[0].delta.content:
-            token = choices[0].delta.content
-
-        response += token
-        yield response
-
-
-"""
-For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
-"""
-chatbot = gr.ChatInterface(
-    respond,
-    type="messages",
-    additional_inputs=[
-        gr.Textbox(value="You are a friendly Chatbot.", label="System message"),
-        gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
-        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
-        gr.Slider(
-            minimum=0.1,
-            maximum=1.0,
-            value=0.95,
-            step=0.05,
-            label="Top-p (nucleus sampling)",
-        ),
-    ],
-)
-
-with gr.Blocks() as demo:
-    with gr.Sidebar():
-        gr.LoginButton()
-    chatbot.render()
-
-
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
+from peft import PeftModel  # Applies the LoRA adapter weights to the base model
+
+# --- Configuration ---
+# 1. Base Llama 2 model used for fine-tuning
+BASE_MODEL = "aboonaji/llama2finetune-v2"
+# 2. Your newly published adapter model on the Hub
+ADAPTER_MODEL = "dynamodenis254/dynamo-denis-llama2finetune-medical"
+
+# --- Model Loading ---
+# This function loads the model and runs only once when the app starts
+def load_model():
+    """Loads the base model and applies the fine-tuned adapter weights."""
+    print(f"Loading base model: {BASE_MODEL}")
+
+    # Check for GPU availability
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    print(f"Using device: {device}")
+
+    # Load the base model (trust_remote_code=True allows custom Llama model code)
+    base_model = AutoModelForCausalLM.from_pretrained(
+        BASE_MODEL,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,  # fp16 on GPU; many fp16 ops are unsupported on CPU
+        device_map="auto",
+        trust_remote_code=True
+    )
+
+    # Load the PEFT (LoRA) adapter weights on top of the base model
+    model = PeftModel.from_pretrained(base_model, ADAPTER_MODEL)
+
+    # Get the tokenizer
+    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL, trust_remote_code=True)
+    tokenizer.pad_token = tokenizer.eos_token
+    tokenizer.padding_side = "right"
+
+    # Create the Hugging Face pipeline for text generation
+    generator = pipeline(
+        "text-generation",
+        model=model,
+        tokenizer=tokenizer,
+        # No device argument: device_map="auto" has already placed the model, and transformers rejects an explicit device for accelerate-loaded models
+    )
+
+    print("Model and tokenizer loaded successfully.")
+    return generator
+
+# Load the model outside the prediction function so it runs only once
+generator = load_model()
+
+# --- Prediction Function ---
+def generate_response(prompt, max_new_tokens=256, temperature=0.7):
+    """Generates text using the fine-tuned model."""
+
+    # Llama models often work best with a system prompt structure
+    system_prompt = "You are a specialized medical assistant. Provide concise and accurate information."
+    formatted_prompt = f"### System:\n{system_prompt}\n\n### User:\n{prompt}\n\n### Assistant:\n"
+
+    try:
+        # Run the generation pipeline
+        result = generator(
+            formatted_prompt,
+            max_new_tokens=max_new_tokens,
+            temperature=temperature,
+            do_sample=True,
+            return_full_text=False  # Only return the newly generated part of the response
+        )
+
+        # Extract the text and strip any leading/trailing whitespace
+        generated_text = result[0]['generated_text'].strip()
+        return generated_text
+
+    except Exception as e:
+        return f"An error occurred during generation: {e}"
+
+# --- Gradio Interface Setup ---
+iface = gr.Interface(
+    fn=generate_response,
+    inputs=[
+        gr.Textbox(lines=4, label="Medical Query (e.g., 'What are the symptoms of type 2 diabetes?')", placeholder="Enter your medical question..."),
+        gr.Slider(minimum=32, maximum=1024, step=32, value=256, label="Max Response Length", info="Controls the length of the generated answer."),
+        gr.Slider(minimum=0.1, maximum=1.0, step=0.1, value=0.7, label="Creativity (Temperature)", info="Higher values give more creative but less predictable answers.")
+    ],
+    outputs=gr.Textbox(lines=10, label="Fine-Tuned Medical Assistant Response"),
+    title="⚕️ Medical Llama 2 Fine-Tune Demo (dynamodenis254)",
+    description="This demo uses a Llama 2 model fine-tuned on medical data. Enter a query and observe the specialized response.",
+    theme="soft"
+)
+
+# Hugging Face Spaces executes app.py directly, so this guard triggers the launch
 if __name__ == "__main__":
-    demo.launch()
+    iface.launch()
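
To exercise the updated endpoint from outside the Space, a minimal sketch using gradio_client is below. The Space id "dynamodenis254/medical-llama2-demo" is a placeholder (the real Space name is not shown in this commit), and the three positional inputs mirror the gr.Interface components above: prompt, max response length, and temperature.

    # Minimal smoke test against the deployed Space via gradio_client.
    # NOTE: the Space id below is hypothetical -- substitute the real one.
    from gradio_client import Client

    client = Client("dynamodenis254/medical-llama2-demo")  # hypothetical Space id
    answer = client.predict(
        "What are the symptoms of type 2 diabetes?",  # Medical Query textbox
        256,   # Max Response Length slider
        0.7,   # Creativity (Temperature) slider
        api_name="/predict",  # default endpoint name for a gr.Interface
    )
    print(answer)

If adapter overhead at inference time matters, the loaded PeftModel can also be folded into the base weights once with model.merge_and_unload(), which removes the LoRA indirection from every forward pass.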