"""Gradio web demo for the Phase-Technologies generalized Qwen2.5-Math 1.5B model.

Loads the merged checkpoint on CPU, wraps user prompts in the
instruction/response template, and serves a simple two-column web UI.
"""

import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# 1. Define your repository
repo_name = "Phase-Technologies/qwen2.5-math-1.5b-generalized-merged"

# Training-time prompt layout: the first slot receives the user instruction,
# the second slot is left empty so the model continues with its response.
PROMPT_TEMPLATE = "### Instruction:\n{}\n\n### Response:\n{}"

print("Loading model into memory... This takes a minute on a CPU.")

# 2. Load the Tokenizer and Model
# We load in standard precision because the free tier does not have a GPU
# for 4-bit.
tokenizer = AutoTokenizer.from_pretrained(repo_name)
model = AutoModelForCausalLM.from_pretrained(
    repo_name,
    device_map="cpu",
    torch_dtype=torch.float32,
)


# 3. Define the inference function
def generate_response(prompt: str) -> str:
    """Generate the model's answer for a single user prompt.

    Args:
        prompt: Free-form text entered by the user.

    Returns:
        The decoded text the model produced after the prompt, with special
        tokens removed and surrounding whitespace stripped. If ``prompt`` is
        empty or only whitespace, a short hint is returned and the model is
        not run.
    """
    # Skip a long CPU generation when there is nothing to answer.
    if not prompt or not prompt.strip():
        return "Please enter a prompt."

    # Apply the training template.
    formatted_prompt = PROMPT_TEMPLATE.format(prompt, "")

    # Tokenize input.
    inputs = tokenizer(formatted_prompt, return_tensors="pt").to(model.device)
    prompt_length = inputs["input_ids"].shape[-1]

    # Generate output.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=1024,
            max_length=None,
            use_cache=True,
            repetition_penalty=1.15,
            pad_token_id=tokenizer.eos_token_id,
        )

    # For a causal LM, the generated sequence starts with the prompt tokens.
    # Decode only what follows them rather than splitting the decoded text on
    # the "### Response:" marker, which breaks when the user's prompt or the
    # model's own output contains that marker.
    generated_tokens = outputs[0][prompt_length:]
    final_answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    return final_answer.strip()


# 4. Build the Gradio Web UI
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🧠 Phase-Technologies: Generalized Qwen-Math (1.5B)")
    gr.Markdown("An ultra-lightweight reasoning model fine-tuned for graduate-level proofs and conversational instruction-following.")

    with gr.Row():
        with gr.Column():
            user_input = gr.Textbox(
                lines=5,
                label="Your Prompt",
                placeholder="E.g., What is 2+2? OR Provide a step-by-step proof for the eigenvalues of [[2,1],[1,2]]...",
            )
            submit_btn = gr.Button("Generate Response", variant="primary")
        with gr.Column():
            output_box = gr.Textbox(lines=15, label="Model Output")

    # Wire the button to the inference function.
    submit_btn.click(fn=generate_response, inputs=user_input, outputs=output_box)

# 5. Launch the app
demo.launch()