"""
iNosh AI - Hugging Face Inference Handler

Defines how to load and run the model on HF Inference Endpoints: a base
Llama-3.2-1B-Instruct model is loaded in 4-bit, the fine-tuned LoRA adapter
(shipped with the endpoint) is applied and merged, and `__call__` serves
chat-style requests of the form ``{"inputs": ..., "parameters": {...}}``.
"""

from typing import Dict, List, Any

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)
from peft import PeftModel

# System prompt sent with every request. NOTE(review): this text must stay
# byte-identical to the prompt used during fine-tuning — do not edit casually.
SYSTEM_PROMPT = """You are iNosh AI, a smart kitchen assistant that helps with pantry management and meal planning.

Target Market: Australia & New Zealand (Multicultural)

Your capabilities:
- Manage pantry items (expiry tracking, low stock alerts, barcode scanning)
- Create shopping lists (store-specific: Woolworths, Coles, Countdown, etc.)
- Suggest recipes (15 cuisines, nutrition-focused, dietary restrictions)
- Plan meals (weekly, budget-aware, nutrition-optimized)
- Track fitness (Apple Health, Google Fit integration)
- Log restaurant meals (AU/NZ chains)
- Scan barcodes (instant nutrition lookup)
- Plan kids meals (school lunch requirements)

CRITICAL RESPONSE RULES:
1. For action requests, respond with valid JSON
2. For general conversation, respond naturally without JSON
3. Always respect dietary restrictions (no pork in halal, no meat in vegan, etc.)
4. Use metric units (g, kg, ml, L) - AU/NZ standard
5. Price estimates in AUD/NZD
6. Include nutrition data when relevant (calories, protein, carbs, fat)
7. Suggest recipes from available pantry items when possible

JSON Action Formats:
- Pantry: {"action": "add_pantry", "item": {...}}
- Shopping: {"action": "create_list", "list": {...}}
- Recipes: {"action": "suggest_recipes", "recipes": [...]}
- Meal Plan: {"action": "create_meal_plan", "plan": {...}}
- Fitness: {"action": "log_workout", "workout": {...}}
- Restaurant: {"action": "log_restaurant", "meal": {...}}
- Barcode: {"action": "lookup_barcode", "product": {...}}

Tone: Professional and helpful. Provide clear, concise responses."""


class EndpointHandler:
    """Custom handler loaded by HF Inference Endpoints (handler.py contract)."""

    def __init__(self, path: str = ""):
        """
        Initialize the model and tokenizer.

        Args:
            path: Directory containing the uploaded repository files
                  (HF provides this); the LoRA adapter is loaded from here.
        """
        base_model_name = "unsloth/Llama-3.2-1B-Instruct"

        print(f"Loading tokenizer from {base_model_name}...")
        self.tokenizer = AutoTokenizer.from_pretrained(base_model_name)

        print(f"Loading base model from {base_model_name}...")
        # 4-bit quantization for faster loading / lower memory. Passing
        # `load_in_4bit=True` directly to from_pretrained is deprecated;
        # the supported API is a BitsAndBytesConfig.
        self.model = AutoModelForCausalLM.from_pretrained(
            base_model_name,
            torch_dtype=torch.float16,
            device_map="auto",
            quantization_config=BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_compute_dtype=torch.float16,
            ),
        )

        # Apply the fine-tuned LoRA adapter shipped alongside this handler.
        print(f"Loading LoRA adapter from {path}...")
        self.model = PeftModel.from_pretrained(self.model, path)

        # Merge the adapter into the base weights for faster inference.
        # NOTE(review): merging into a 4-bit-quantized base dequantizes the
        # touched weights and can cost some fidelity — confirm this matches
        # how the adapter was evaluated.
        print("Merging adapter with base model...")
        self.model = self.model.merge_and_unload()
        self.model.eval()

        print("iNosh AI loaded successfully!")

    def __call__(self, data: Dict[str, Any]) -> List[Dict[str, str]]:
        """
        Handle an inference request.

        Args:
            data: ``{"inputs": "User message here"}`` or
                  ``{"inputs": {"message": ...}, "parameters": {...}}``.
                  Optional ``parameters`` may override ``max_new_tokens``
                  and ``temperature`` (defaults: 500 and 0.7).

        Returns:
            ``[{"generated_text": "Response here"}]``
        """
        # Extract the user message; accept either a bare string or a dict
        # with a "message" key.
        inputs = data.pop("inputs", data)
        user_message = inputs if isinstance(inputs, str) else inputs.get("message", "")

        # Optional generation overrides, per the HF Inference Endpoints
        # request schema; defaults preserve the original behavior.
        params = data.get("parameters") or {}
        max_new_tokens = int(params.get("max_new_tokens", 500))
        temperature = float(params.get("temperature", 0.7))

        # Format using the Llama chat template (matches training).
        messages = [
            {"role": "system", "content": SYSTEM_PROMPT},
            {"role": "user", "content": user_message},
        ]
        prompt = self.tokenizer.apply_chat_template(
            messages,
            tokenize=False,
            add_generation_prompt=True,
        )

        # Tokenize on the model's device.
        model_inputs = self.tokenizer(prompt, return_tensors="pt").to(self.model.device)

        # Generate.
        with torch.no_grad():
            outputs = self.model.generate(
                **model_inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
            )

        # Decode ONLY the newly generated tokens. Slicing the decoded string
        # by len(prompt) is wrong: the templated prompt contains special
        # tokens that skip_special_tokens removes, so character offsets do
        # not line up. Token-level slicing is exact.
        prompt_length = model_inputs["input_ids"].shape[-1]
        assistant_response = self.tokenizer.decode(
            outputs[0][prompt_length:],
            skip_special_tokens=True,
        ).strip()

        return [{"generated_text": assistant_response}]