# app.py
"""Gradio chat demo for Microsoft's Phi-3-mini-128k-instruct model."""

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    StoppingCriteria,
    StoppingCriteriaList,
)
import torch
import gradio as gr

# ======================
# Configuration
# ======================
MODEL_ID = "microsoft/Phi-3-mini-128k-instruct"
MAX_INPUT_TOKENS = 128_000  # prompt truncation budget (model supports a 128K context)
MAX_NEW_TOKENS = 256        # reply length cap per turn

# ======================
# Load Model & Tokenizer
# ======================
print(f"🚀 Loading model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=False,
    attn_implementation="eager",  # Use "flash_attention_2" if installed
)
model.eval()  # inference only — disables dropout and other train-time behavior
print("✅ Model loaded successfully!")

# ======================
# Stopping Criteria
# ======================
class StopOnTokens(StoppingCriteria):
    """Stop generation as soon as the last emitted token is a stop token."""

    def __init__(self, stop_token_ids):
        # Set gives O(1) membership and deduplicates the id list.
        self.stop_token_ids = set(stop_token_ids)

    def __call__(self, input_ids, scores, **kwargs):
        # .item() converts the 0-dim tensor to a plain int so the membership
        # test returns a real bool (the original tensor==int comparison
        # produced a tensor, not a bool).
        return int(input_ids[0, -1].item()) in self.stop_token_ids


# Stop on the standard EOS token...
stop_token_ids = [tokenizer.eos_token_id]

# ...and on Phi-3's <|end|> turn terminator, when it resolves to a real
# vocabulary id (convert_tokens_to_ids returns the unk/None id otherwise).
end_token_id = tokenizer.convert_tokens_to_ids("<|end|>")
if isinstance(end_token_id, int) and end_token_id >= 0:
    stop_token_ids.append(end_token_id)

stopping_criteria = StoppingCriteriaList([StopOnTokens(stop_token_ids)])

# ======================
# Response Function
# ======================
def respond(message: str, history):
    """Generate a reply from Phi-3 for a Gradio ChatInterface.

    Args:
        message (str): New user input.
        history (list[dict]): Prior turns in Gradio "messages" format,
            i.e. ``[{"role": ..., "content": ...}, ...]``.

    Returns:
        str: The model's reply text; Gradio appends it to the chat history.
    """
    if not message.strip():
        return ""

    # Build the full conversation: prior turns plus the new user message.
    messages = history + [{"role": "user", "content": message}]

    # Render with the model's chat template, leaving the assistant turn open.
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Tokenize, truncating to the model's context budget.
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        truncation=True,
        max_length=MAX_INPUT_TOKENS,
    ).to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=0.1,
            top_p=0.9,
            # Fix: pass ALL stop ids (EOS and <|end|>) so generate()'s
            # native EOS handling matches the custom stopping criteria.
            eos_token_id=stop_token_ids,
            pad_token_id=tokenizer.eos_token_id,
            stopping_criteria=stopping_criteria,
        )

    # Decode only the newly generated tokens (everything after the prompt).
    new_tokens = outputs[0][inputs.input_ids.shape[1]:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True)
    return response  # Gradio will auto-append to chat history


# ======================
# Gradio Interface
# ======================
demo = gr.ChatInterface(
    fn=respond,
    chatbot=gr.Chatbot(
        height=600,
        type="messages",  # Required for Gradio v5
    ),
    textbox=gr.Textbox(
        placeholder="Ask me anything about AI, science, coding, and more...",
        container=False,
        scale=7,
    ),
    title="🧠 Phi-3 Mini (128K Context) Chat",
    description="""
A demo of Microsoft's **Phi-3-mini-128k-instruct** model — a powerful small LLM with support for ultra-long context.
Try asking it to summarize long texts, explain complex topics, or write code.
""",
    examples=[
        "Who are you?",
        "Explain quantum entanglement simply.",
        "Write a Python function to detect cycles in a linked list.",
    ],
    # Note: retry_btn, undo_btn, clear_btn removed — not supported in v5
    # Toolbar appears automatically
)

# ======================
# Launch
# ======================
if __name__ == "__main__":
    demo.launch()