from threading import Thread

import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, TextIteratorStreamer
# Load model and tokenizer once at startup so every request reuses them.
model_name = "jsbeaudry/makandal-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,  # half precision to cut memory; assumes an fp16-capable backend
    device_map="auto",          # let accelerate place weights on whatever devices are available
)
# Token id marking the end of a "thinking" section. Unused by the active
# streaming path; kept for parity with the non-streaming variant below.
think_token_id = tokenizer.convert_tokens_to_ids("</think>")
def generate_response_stream(prompt):
    """Stream a generated response for *prompt*.

    Yields the accumulated response text after each decoded chunk so a
    Gradio Textbox can render incremental output.

    Args:
        prompt: The user's input text.

    Yields:
        str: The response generated so far (grows with each chunk).
    """
    # Format the input with the model's chat template (no "thinking" section).
    messages = [{"role": "user", "content": prompt}]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize, then move every tensor to the model's device.
    model_inputs = tokenizer([text], return_tensors="pt")
    model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()}

    # The streamer hands decoded text chunks back to this thread as they arrive.
    text_streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True, skip_special_tokens=True
    )

    generation_kwargs = dict(
        **model_inputs,
        streamer=text_streamer,
        max_new_tokens=100,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        use_cache=True,
    )

    # generate() blocks until done, so run it in a worker thread and
    # consume the streamer on this one.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    try:
        partial_response = ""
        for new_text in text_streamer:
            partial_response += new_text
            yield partial_response
    finally:
        # Join even if the consumer closes the generator early, so the
        # worker thread is never leaked.
        thread.join()
# Gradio Interface with streaming: because fn is a generator, the output
# Textbox updates incrementally as chunks are yielded.
demo = gr.Interface(
    fn=generate_response_stream,
    inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."),
    outputs=gr.Textbox(label="Respons"),
    title="Makandal Text Generator (Streaming)",
    description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti.",
    live=False,  # don't re-trigger generation on every keystroke
)

if __name__ == "__main__":
    demo.launch()
| # import torch | |
| # import gradio as gr | |
| # from transformers import AutoTokenizer, AutoModelForCausalLM | |
| # # Load model and tokenizer once at startup | |
| # model_name = "jsbeaudry/makandal-v2" | |
| # tokenizer = AutoTokenizer.from_pretrained(model_name) | |
| # model = AutoModelForCausalLM.from_pretrained( | |
| # model_name, | |
| # torch_dtype=torch.float16, | |
| # device_map="auto" | |
| # ) | |
| # think_token_id = tokenizer.convert_tokens_to_ids("</think>") | |
| # def generate_response(prompt): | |
| # # Format input for chat template | |
| # messages = [{"role": "user", "content": prompt}] | |
| # text = tokenizer.apply_chat_template( | |
| # messages, | |
| # tokenize=False, | |
| # add_generation_prompt=True, | |
| # enable_thinking=False | |
| # ) | |
| # # Tokenize | |
| # model_inputs = tokenizer([text], return_tensors="pt") | |
| # model_inputs = {k: v.to(model.device) for k, v in model_inputs.items()} | |
| # # Generate | |
| # generated_ids = model.generate( | |
| # **model_inputs, | |
| # max_new_tokens=100, | |
| # do_sample=True, | |
| # temperature=0.7, | |
| # top_p=0.9 | |
| # ) | |
| # output_ids = generated_ids[0][len(model_inputs["input_ids"][0]):].tolist() | |
| # try: | |
| # index = len(output_ids) - output_ids[::-1].index(think_token_id) | |
| # except ValueError: | |
| # index = 0 | |
| # thinking_content = tokenizer.decode(output_ids[:index], skip_special_tokens=True).strip("\n") | |
| # content = tokenizer.decode(output_ids[index:], skip_special_tokens=True).strip("\n") | |
| # return thinking_content, content | |
| # # Gradio Interface | |
| # demo = gr.Interface( | |
| # fn=generate_response, | |
| # inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."), | |
| # outputs=[ | |
| # # gr.Textbox(label="Thinking Content"), | |
| # gr.Textbox(label="Respons") | |
| # ], | |
| # title="Makandal Text Generator", | |
| # description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti." | |
| # ) | |
| # if __name__ == "__main__": | |
| # demo.launch() | |
| # import gradio as gr | |
| # from transformers import AutoTokenizer, AutoModelForCausalLM | |
| # import torch | |
| # # Load model and tokenizer | |
| # tokenizer = AutoTokenizer.from_pretrained("jsbeaudry/makandal-v2") | |
| # model = AutoModelForCausalLM.from_pretrained("jsbeaudry/makandal-v2") | |
| # # Set device | |
| # device = "cuda" if torch.cuda.is_available() else "cpu" | |
| # model.to(device) | |
| # # Generation function | |
| # def generate_text(prompt): | |
| # inputs = tokenizer(prompt, return_tensors="pt", padding=True).to(device) | |
| # output = model.generate( | |
| # **inputs, | |
| # max_new_tokens=30, | |
| # do_sample=True, | |
| # repetition_penalty=1.2, | |
| # no_repeat_ngram_size=3, | |
| # temperature=0.9, | |
| # top_k=40, | |
| # top_p=0.85, | |
| # pad_token_id=tokenizer.pad_token_id, | |
| # eos_token_id=tokenizer.eos_token_id | |
| # ) | |
| # return tokenizer.decode(output[0], skip_special_tokens=True) | |
| # # Gradio interface | |
| # iface = gr.Interface( | |
| # fn=generate_text, | |
| # inputs=gr.Textbox(lines=2, placeholder="Ekri yon sijè oswa yon fraz..."), | |
| # outputs="text", | |
| # title="Makandal Text Generator", | |
| # description="Ekri yon fraz oswa mo kle pou jenere tèks ak modèl Makandal la. Modèl sa fèt espesyalman pou kontèks Ayiti." | |
| # ) | |
| # if __name__ == "__main__": | |
| # iface.launch() | |