# --- Legacy FastAPI + Unsloth version (kept for reference) ---
# from fastapi import FastAPI, Request
# from pydantic import BaseModel
# from unsloth import FastLanguageModel
# import torch
# import re
#
# app = FastAPI()
#
# # Load the model once on startup
# model, tokenizer = FastLanguageModel.from_pretrained(
#     model_name = "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
#     max_seq_length = 2048,
#     dtype = None,
#     load_in_4bit = True,
# )
# FastLanguageModel.for_inference(model)
#
# class SAPNoteRequest(BaseModel):
#     text: str
#
# @app.post("/generate_qa")
# def generate_qa(req: SAPNoteRequest):
#     text = req.text
#     match = re.search(r"SAP Note\s*(\d+)", text)
#     sap_note_number = match.group(1) if match else "UNKNOWN"
#     prompt = f"""
# Generate 20 question-answer pairs based on the following SAP Note.
# Each question should include the SAP Note number {sap_note_number} to clarify context.
# \"\"\"{text}\"\"\"
# Q1: question
# A1: answer
# ### Response:
# """
#     inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
#     outputs = model.generate(
#         inputs.input_ids,
#         max_new_tokens=2048,
#         do_sample=True,
#         temperature=0.7,
#         top_p=0.95,
#         repetition_penalty=1.2,
#     )
#     output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
#     qa_pairs = output_text.split("### Response:")[-1].strip()
#     return {"qa_pairs": qa_pairs}
### Hugging Face code (earlier quantization settings, kept for reference)
# import torch
# from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
#
# # Quantization settings
# quantization_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, BitsAndBytesConfig
import gradio as gr
# Use 4-bit quantization for low-memory GPU inference
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
)
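# With NF4 double quantization the 7B weights take roughly 4 GB of VRAM
# (~0.5 bytes per parameter plus quantization constants), leaving headroom
# for activations and the KV cache during long generations.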
model_name = "mistralai/Mistral-7B-Instruct-v0.3"

# Load model and tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)
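# Optional: when iterating on prompts from a terminal, it helps to watch tokens
# as they are generated. A minimal sketch using transformers' TextStreamer; the
# helper name is illustrative and it is not wired into the Gradio app below.
def generate_streaming(prompt):
    streamer = TextStreamer(tokenizer, skip_prompt=True)
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=512,
        streamer=streamer,  # prints decoded text to stdout as tokens arrive
    )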
# Define generation function
def generate_qa(text):
    prompt = f"""### Instruction:
Based on the following SAP Note, generate exactly 20 unique and informative question-answer pairs.
Each question must reference the SAP Note number from the text so the pair is understandable on its own.
Only output the pairs in the format:
Q1: ...
A1: ...
...
Q20: ...
A20: ...

### Input:
{text}

### Response:
"""
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    outputs = model.generate(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
        max_new_tokens=2500,
        do_sample=True,
        temperature=0.9,
        top_p=0.95,
        repetition_penalty=1.1,
        pad_token_id=tokenizer.eos_token_id,
    )
    output_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    # Keep only the text after the "### Response:" marker, i.e. the generated pairs
    qa_pairs = output_text.split("### Response:")[-1].strip()
    return qa_pairs
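# Optional post-processing: parse the raw output into (question, answer) tuples
# so callers can verify that all 20 pairs came back. A hedged sketch; the regex
# assumes the model followed the "Qn: ... / An: ..." format requested above.
import re

def parse_qa_pairs(raw):
    pattern = re.compile(r"Q\d+:\s*(.+?)\s*A\d+:\s*(.+?)(?=\s*Q\d+:|\Z)", re.DOTALL)
    return pattern.findall(raw)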
# Define Gradio UI
demo = gr.Interface(
    fn=generate_qa,
    inputs=gr.Textbox(lines=20, label="SAP Note Text"),
    outputs=gr.Textbox(lines=25, label="Generated Q&A Pairs"),
    title="Mistral Q&A Generator for SAP Notes",
    description="Paste SAP Note content to generate 20 question-answer pairs.",
)

demo.launch()
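# To reach the app from other machines, bind to all interfaces or request a
# temporary public link, e.g. demo.launch(server_name="0.0.0.0") or
# demo.launch(share=True).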