# Hugging Face Spaces page header (scrape artifact) — Space status at capture time: Runtime error
# app.py
# Minimal & stable version for free CPU Hugging Face Space — Phi-3-mini + LoRA
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftModel

# ----------------------------------------------------------------
# Config
# ----------------------------------------------------------------
# Base checkpoint (a pre-quantized bnb-4bit Phi-3-mini) and the LoRA adapter
# that is loaded and merged on top of it.
BASE_MODEL = "unsloth/Phi-3-mini-4k-instruct-bnb-4bit"
LORA_PATH = "saadkhi/SQL_Chat_finetuned_model"
# Generation settings: short answers, greedy (deterministic) decoding.
MAX_NEW_TOKENS = 180
TEMPERATURE = 0.0
DO_SAMPLE = False
# ----------------------------------------------------------------
# Load model & tokenizer
# ----------------------------------------------------------------
# Bug fix: the original unconditionally passed a 4-bit BitsAndBytesConfig
# together with device_map="cpu". bitsandbytes NF4 quantization requires a
# CUDA GPU, so that combination fails at load time on free CPU hardware —
# the likely cause of this Space's "Runtime error". We now request
# bitsandbytes quantization only when CUDA is available and fall back to a
# plain float32 CPU load otherwise.
print("Loading base model...")
try:
    if torch.cuda.is_available():
        # GPU path: load the checkpoint 4-bit (NF4) to save VRAM.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16,
        )
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            quantization_config=bnb_config,
            device_map="auto",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
    else:
        # CPU path: no bitsandbytes involved.
        # NOTE(review): BASE_MODEL is a pre-quantized bnb-4bit checkpoint;
        # if it cannot be loaded unquantized, point BASE_MODEL at the
        # full-precision "microsoft/Phi-3-mini-4k-instruct" instead —
        # confirm on the Space.
        model = AutoModelForCausalLM.from_pretrained(
            BASE_MODEL,
            torch_dtype=torch.float32,
            device_map="cpu",
            trust_remote_code=True,
            low_cpu_mem_usage=True,
        )
    print("Loading LoRA...")
    model = PeftModel.from_pretrained(model, LORA_PATH)
    print("Merging LoRA weights...")
    # Merge adapter weights into the base model so inference carries no
    # PEFT indirection overhead.
    model = model.merge_and_unload()
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    model.eval()
    print("Model & tokenizer loaded successfully")
except Exception as e:
    # Log the failure for the Space logs, then re-raise so the Space shows
    # the error instead of serving a broken app.
    print(f"Model loading failed: {str(e)}")
    raise
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| # Inference function | |
| # ββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ | |
| def generate_sql(question: str): | |
| try: | |
| messages = [{"role": "user", "content": question.strip()}] | |
| inputs = tokenizer.apply_chat_template( | |
| messages, | |
| tokenize=True, | |
| add_generation_prompt=True, | |
| return_tensors="pt" | |
| ) | |
| with torch.inference_mode(): | |
| outputs = model.generate( | |
| input_ids = inputs, | |
| max_new_tokens = MAX_NEW_TOKENS, | |
| temperature = TEMPERATURE, | |
| do_sample = DO_SAMPLE, | |
| use_cache = True, | |
| pad_token_id = tokenizer.eos_token_id, | |
| ) | |
| response = tokenizer.decode(outputs[0], skip_special_tokens=True) | |
| # Clean typical Phi-3 output markers | |
| for marker in ["<|assistant|>", "<|end|>", "<|user|>"]: | |
| if marker in response: | |
| response = response.split(marker, 1)[-1].strip() | |
| return response.strip() or "(empty response)" | |
| except Exception as e: | |
| return f"Generation error: {str(e)}" | |
# ----------------------------------------------------------------
# Gradio UI
# ----------------------------------------------------------------
# Single-textbox interface. `demo` at module level is also what the Hugging
# Face Spaces gradio runtime picks up when it imports this file.
demo = gr.Interface(
    fn=generate_sql,
    inputs=gr.Textbox(
        label="SQL question",
        placeholder="Find duplicate emails in users table",
        lines=3,
        max_lines=6,
    ),
    outputs=gr.Textbox(
        label="Generated SQL",
        lines=8,
    ),
    # Mojibake repair: the scraped source showed "β" where dashes belong in
    # the user-facing title/description strings.
    title="SQL Chat — Phi-3-mini fine-tuned (CPU)",
    description=(
        "Free CPU version — first answer usually takes 60–180+ seconds.\n"
        "Later answers are faster (model stays in memory)."
    ),
    examples=[
        ["Find duplicate emails in users table"],
        ["Top 5 highest paid employees"],
        ["Count orders per customer last month"],
        ["Delete duplicate rows based on email"],
    ],
    # Don't pre-run the examples at startup — each run is a slow CPU generation.
    cache_examples=False,
)
if __name__ == "__main__":
    print("Launching interface...")
    # Bug fix: the original passed prevent_thread_lock=True, which makes
    # launch() return immediately; the script then falls off the end and
    # the process exits, killing the server. Let launch() block (default).
    demo.launch(
        server_name="0.0.0.0",
        # No fixed server_port — let Gradio pick a free port automatically.
        debug=False,
        quiet=False,
        show_error=True,
    )