Spaces:
Runtime error
Runtime error
Create app.py
Browse files
app.py
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Install necessary libraries
|
| 2 |
+
#!pip install transformers accelerate datasets gradio sympy
|
| 3 |
+
|
| 4 |
+
# Import libraries
|
| 5 |
+
import torch
|
| 6 |
+
from transformers import AutoModelForCausalLM, AutoTokenizer
|
| 7 |
+
import gradio as gr
|
| 8 |
+
import sympy
|
| 9 |
+
|
| 10 |
+
# Load Model and Tokenizer
# Hugging Face Hub repository ids. The generator lives under the
# "meta-llama" organisation; the previous value "meta/llama-3.2-1b-instruct"
# is not a Hub repository id, so from_pretrained() could not resolve it.
# NOTE(review): the Llama repositories are gated -- the runtime presumably
# needs an access token with the licence accepted; confirm for this Space.
MODEL_NAME = "meta-llama/Llama-3.2-1B-Instruct"
# NOTE(review): RLHFlow publishes its Llama-3.1 8B PRM checkpoints with a
# training-data suffix; "-Deepseek-Data" was chosen here -- confirm that this
# is the intended variant.
PRM_NAME = "RLHFlow/Llama3.1-8B-PRM-Deepseek-Data"

# Single target device used for both models and for every input tensor.
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 15 |
+
|
| 16 |
+
# Load LLaMA model
|
| 17 |
+
def load_model(model_name):
    """Load a causal language model and its tokenizer onto ``device``.

    Args:
        model_name: Hugging Face Hub repository id (or local path) passed to
            ``from_pretrained`` for both the tokenizer and the model.

    Returns:
        A ``(model, tokenizer)`` tuple; the model has been moved to the
        module-level ``device``.
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    # Placement is done with a single explicit .to(device). The previous code
    # also passed device_map="auto", which hands placement to accelerate;
    # moving a model dispatched that way with .to() is warned about or
    # rejected, and the rest of this file sends inputs to `device` anyway.
    model = AutoModelForCausalLM.from_pretrained(model_name)
    return model.to(device), tokenizer
|
| 21 |
+
|
| 22 |
+
# Generator model used by every inference strategy below.
llama_model, llama_tokenizer = load_model(MODEL_NAME)

# Load Process Reward Model (PRM)
prm_model, prm_tokenizer = load_model(PRM_NAME)
|
| 26 |
+
|
| 27 |
+
# Strategies
|
| 28 |
+
def majority_voting(prompt, num_samples=5):
    """Sample several completions and return the most frequent one.

    Args:
        prompt: Text to complete.
        num_samples: Number of completions to draw (coerced to ``int``).

    Returns:
        The decoded text (as produced by decoding the full generated
        sequence) that occurs most often among the samples. Ties go to the
        candidate that was generated first.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    from collections import Counter

    num_samples = int(num_samples)
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # The prompt never changes, so tokenize it once. Passing the whole
    # encoding also forwards the attention mask to generate().
    inputs = llama_tokenizer(prompt, return_tensors="pt").to(device)

    outputs = []
    for _ in range(num_samples):
        # Sampling is requested explicitly: with deterministic decoding all
        # candidates would be identical and the vote would be meaningless.
        output = llama_model.generate(**inputs, max_new_tokens=50, do_sample=True)
        outputs.append(llama_tokenizer.decode(output[0], skip_special_tokens=True))

    # Counter gives a single-pass count and a stable, first-seen tie-break
    # (the earlier max(set(...), key=list.count) depended on set ordering).
    return Counter(outputs).most_common(1)[0][0]
|
| 36 |
+
|
| 37 |
+
def best_of_n(prompt, num_samples=5):
    """Sample several completions and return the one the PRM scores highest.

    Args:
        prompt: Text to complete.
        num_samples: Number of completions to draw (coerced to ``int``).

    Returns:
        The decoded completion with the highest score.

    Raises:
        ValueError: If ``num_samples`` is smaller than 1.
    """
    num_samples = int(num_samples)
    if num_samples < 1:
        raise ValueError("num_samples must be at least 1")

    # Tokenize once; the encoding carries the attention mask for generate().
    inputs = llama_tokenizer(prompt, return_tensors="pt").to(device)

    scored_outputs = []
    for _ in range(num_samples):
        # Explicit sampling so that the N candidates can actually differ.
        output = llama_model.generate(**inputs, max_new_tokens=50, do_sample=True)
        response = llama_tokenizer.decode(output[0], skip_special_tokens=True)

        prm_inputs = prm_tokenizer(response, return_tensors="pt").to(device)
        # Scoring is inference only; no_grad avoids building an autograd
        # graph for the reward model's forward pass.
        with torch.no_grad():
            # NOTE(review): the score is the mean over all output logits, as
            # in the original code. Confirm against the PRM's model card
            # whether a step-level score should be read differently.
            score = prm_model(**prm_inputs).logits.mean().item()
        scored_outputs.append((response, score))

    # Return the highest scored response
    return max(scored_outputs, key=lambda item: item[1])[0]
|
| 47 |
+
|
| 48 |
+
def beam_search(prompt, num_beams=5):
    """Generate completions with beam search and return every beam.

    Args:
        prompt: Text to complete.
        num_beams: Number of beams, which is also the number of sequences
            returned (coerced to ``int``).

    Returns:
        A list of ``num_beams`` decoded strings, in the order returned by
        ``generate``.

    Raises:
        ValueError: If ``num_beams`` is smaller than 1.
    """
    num_beams = int(num_beams)
    if num_beams < 1:
        raise ValueError("num_beams must be at least 1")

    # Passing the whole encoding forwards the attention mask to generate().
    inputs = llama_tokenizer(prompt, return_tensors="pt").to(device)
    outputs = llama_model.generate(
        **inputs,
        max_new_tokens=50,
        num_beams=num_beams,
        num_return_sequences=num_beams,
    )
    return [llama_tokenizer.decode(output, skip_special_tokens=True) for output in outputs]
|
| 52 |
+
|
| 53 |
+
def _dvts_sample_and_score(text):
    """Continue ``text`` with the generator and score the result with the PRM.

    Returns:
        A ``(decoded_text, score)`` tuple.
    """
    inputs = llama_tokenizer(text, return_tensors="pt").to(device)
    # Explicit sampling so that sibling branches can differ from each other.
    output = llama_model.generate(**inputs, max_new_tokens=50, do_sample=True)
    response = llama_tokenizer.decode(output[0], skip_special_tokens=True)

    prm_inputs = prm_tokenizer(response, return_tensors="pt").to(device)
    # Scoring is inference only; skip autograd bookkeeping.
    with torch.no_grad():
        # NOTE(review): mean over all logits is kept from the original code;
        # confirm against the PRM's model card that this is a valid score.
        score = prm_model(**prm_inputs).logits.mean().item()
    return response, score


def dvts(prompt, depth=3, breadth=2):
    """
    Simplified implementation of DVTS: generates a tree of solutions and evaluates branches using PRM.

    ``breadth`` candidates are sampled from the prompt. Then, ``depth - 1``
    times, the ``breadth`` best-scoring candidates collected so far are each
    extended once and the extensions are added to the candidate pool.

    Args:
        prompt: Text to complete.
        depth: Total number of levels, including the initial one
            (coerced to ``int``).
        breadth: Candidates sampled initially and expanded per level
            (coerced to ``int``).

    Returns:
        The decoded text with the highest score among all candidates.

    Raises:
        ValueError: If ``breadth`` is smaller than 1.
    """
    depth = int(depth)
    breadth = int(breadth)
    if breadth < 1:
        raise ValueError("breadth must be at least 1")

    results = [_dvts_sample_and_score(prompt) for _ in range(breadth)]

    # Select the top responses and expand them
    for _ in range(depth - 1):
        # sorted() returns a new list, so extending `results` below does not
        # disturb the candidates being iterated.
        best_responses = sorted(results, key=lambda item: item[1], reverse=True)[:breadth]
        for response, _score in best_responses:
            results.append(_dvts_sample_and_score(response))

    # Return the best overall response
    return max(results, key=lambda item: item[1])[0]
|
| 75 |
+
|
| 76 |
+
# Gradio Interface
|
| 77 |
+
def inference(prompt, strategy, num_samples, depth, breadth):
    """Run the selected inference strategy and return text for the UI.

    Args:
        prompt: Problem statement entered by the user.
        strategy: One of "Majority Voting", "Best-of-N", "Beam Search" or
            "DVTS".
        num_samples: Sample count for Majority Voting / Best-of-N, and the
            beam count for Beam Search.
        depth: Tree depth (DVTS only).
        breadth: Tree breadth (DVTS only).

    Returns:
        The strategy's result as a single string, or "Invalid Strategy" when
        ``strategy`` is not recognised.
    """
    # UI sliders deliver numbers; make sure the strategies receive integers.
    num_samples = int(num_samples)
    depth = int(depth)
    breadth = int(breadth)

    if strategy == "Majority Voting":
        return majority_voting(prompt, num_samples)
    elif strategy == "Best-of-N":
        return best_of_n(prompt, num_samples)
    elif strategy == "Beam Search":
        beams = beam_search(prompt, num_samples)
        # beam_search returns one string per beam, but the interface has a
        # single text output, so present the beams as one numbered block.
        return "\n\n".join(
            f"[Beam {index}] {text}" for index, text in enumerate(beams, start=1)
        )
    elif strategy == "DVTS":
        return dvts(prompt, depth, breadth)
    else:
        return "Invalid Strategy"
|
| 88 |
+
|
| 89 |
+
# Build the UI. The interface is bound to a name so it can be referenced
# after construction instead of only existing as a temporary.
demo = gr.Interface(
    fn=inference,
    inputs=[
        gr.Textbox(label="Problem Statement", placeholder="Enter your problem here"),
        gr.Radio(
            ["Majority Voting", "Best-of-N", "Beam Search", "DVTS"],
            # Pre-select a strategy; with no selection `inference` receives
            # None and answers "Invalid Strategy".
            value="Majority Voting",
            label="Inference Strategy",
        ),
        gr.Slider(1, 10, step=1, value=5, label="Number of Samples"),
        gr.Slider(1, 5, step=1, value=3, label="Depth (DVTS Only)"),
        gr.Slider(1, 5, step=1, value=2, label="Breadth (DVTS Only)"),
    ],
    outputs="text",
    title="Dynamic Inference Toolkit",
    description="Explore test-time compute scaling strategies with Meta's LLaMA model.",
)
demo.launch()
|