"""Gradio demo: upload an image and a text prompt, get a Qwen2-VL answer.

Loads the EraX Qwen2-VL checkpoint from ``model_directory`` (a Hugging Face
repo id or a local directory containing the .safetensors shards plus the
tokenizer/processor/generation config files) and serves a two-input
Gradio interface.
"""

import os
import base64

import cv2
import gradio as gr
import torch
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    Qwen2VLForConditionalGeneration,
)
from qwen_vl_utils import process_vision_info

# Hugging Face repo id (or local path) holding the model shards and configs.
model_directory = "THP2903/erax_llm"

# Load the model from the .safetensors shards; device_map="auto" places it
# on available accelerators. Switch to torch.float16 if memory is tight.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_directory,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# BUG FIX: AutoTokenizer is a factory class — it cannot be instantiated
# directly with vocab/merges/tokenizer files (that call raises a TypeError).
# from_pretrained resolves the right tokenizer class from the directory.
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# BUG FIX: preprocessor_config.json / generation_config.json are plain JSON,
# not torch pickles — torch.load() on them raises. Use the HF loaders, which
# read those very files from the model directory.
processor = AutoProcessor.from_pretrained(model_directory)

generation_config = GenerationConfig.from_pretrained(model_directory)
generation_config.do_sample = True
generation_config.temperature = 0.2
generation_config.top_k = 1
generation_config.top_p = 0.001
generation_config.max_new_tokens = 2048
generation_config.repetition_penalty = 1.1


def generate_description(image, prompt):
    """Generate a text answer for *image* conditioned on *prompt*.

    Parameters
    ----------
    image : numpy.ndarray
        RGB image as delivered by ``gr.Image(type="numpy")``.
    prompt : str
        The user's question/instruction about the image.

    Returns
    -------
    str
        The model's decoded answer with the prompt tokens stripped.

    Raises
    ------
    ValueError
        If the image cannot be JPEG-encoded.
    """
    # BUG FIX: Gradio supplies RGB arrays but cv2.imencode interprets the
    # buffer as BGR — without this conversion the model sees channel-swapped
    # colors (blue faces, orange skies).
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    ok, encoded_image = cv2.imencode(".jpg", bgr_image)
    if not ok:
        raise ValueError("Failed to encode the uploaded image as JPEG.")
    encoded_image = base64.b64encode(encoded_image).decode("utf-8")
    # Proper data URI with an explicit JPEG MIME subtype.
    base64_data = f"data:image/jpeg;base64,{encoded_image}"

    # Chat-format message: one image plus the user's free-text prompt.
    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": base64_data},
                {"type": "text", "text": prompt},
            ],
        }
    ]

    # Render the chat template (text only) and extract the vision inputs
    # that the processor needs alongside it.
    tokenized_text = processor.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    image_inputs, video_inputs = process_vision_info(messages)

    inputs = processor(
        text=[tokenized_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)  # follow the model's device instead of hard-coding "cuda"

    # Generate, then strip the prompt tokens from each output sequence so
    # only the newly generated answer is decoded.
    generated_ids = model.generate(**inputs, generation_config=generation_config)
    generated_ids_trimmed = [
        out_ids[len(in_ids):]
        for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
    ]
    output_text = processor.batch_decode(
        generated_ids_trimmed,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )
    return output_text[0]


# Gradio UI: image upload + free-text prompt -> model answer.
iface = gr.Interface(
    fn=generate_description,
    inputs=[
        gr.Image(type="numpy", label="Upload Image"),
        gr.Textbox(
            lines=2,
            placeholder="Enter your prompt/question here",
            label="Prompt",
        ),
    ],
    outputs="text",
    title="Image Description Generator",
    description=(
        "Upload an image and enter a prompt/question to get a detailed "
        "description or answer based on the image."
    ),
)

# Guard launch so importing this module (e.g. for testing) has no side effect.
if __name__ == "__main__":
    iface.launch()