# erax_llm / app.py
# Author: THP2903 — "Update app.py" (commit 3d89d29, verified)
import base64
import os

import cv2
import gradio as gr
import torch
from qwen_vl_utils import process_vision_info
from transformers import (
    AutoProcessor,
    AutoTokenizer,
    GenerationConfig,
    Qwen2VLForConditionalGeneration,
)
# HF Hub repo id (or local directory) containing the .safetensors shards
# and the model/tokenizer/processor config files.
model_directory = "THP2903/erax_llm"

# Load the model from the sharded .safetensors checkpoint.
model = Qwen2VLForConditionalGeneration.from_pretrained(
    model_directory,
    torch_dtype=torch.bfloat16,  # use torch.float16 if memory is tight
    device_map="auto",
)

# AutoTokenizer is a factory class and cannot be instantiated directly with
# vocab/merges paths; from_pretrained resolves the right tokenizer files itself.
tokenizer = AutoTokenizer.from_pretrained(model_directory)

# preprocessor_config.json is plain JSON, not a torch checkpoint, and
# AutoProcessor has no `from_config` — from_pretrained reads the config.
processor = AutoProcessor.from_pretrained(model_directory)

# generation_config.json is JSON as well; load it through GenerationConfig
# and then override the sampling parameters for this app.
generation_config = GenerationConfig.from_pretrained(model_directory)
generation_config.do_sample = True
generation_config.temperature = 0.2
generation_config.top_k = 1
generation_config.top_p = 0.001
generation_config.max_new_tokens = 2048
generation_config.repetition_penalty = 1.1
# Generate a description/answer for an uploaded image given a user prompt.
def generate_description(image, prompt):
    """Run the VLM on an image + prompt and return the decoded answer.

    Args:
        image: numpy array in RGB order, as delivered by ``gr.Image(type="numpy")``.
        prompt: free-form question/instruction about the image.

    Returns:
        The model's generated answer as a string.

    Raises:
        ValueError: if the image cannot be JPEG-encoded.
    """
    # Gradio supplies RGB, but OpenCV's JPEG encoder assumes BGR — convert
    # first, otherwise the model sees red/blue channels swapped.
    bgr_image = cv2.cvtColor(image, cv2.COLOR_RGB2BGR)
    ok, encoded_image = cv2.imencode('.jpg', bgr_image)
    if not ok:
        raise ValueError("Failed to encode the input image as JPEG.")
    encoded_image = base64.b64encode(encoded_image).decode('utf-8')
    base64_data = f"data:image;base64,{encoded_image}"
    # Chat message in the Qwen2-VL multimodal format (image + text parts).
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "image",
                    "image": base64_data,
                },
                {
                    "type": "text",
                    "text": prompt  # the user's custom prompt
                },
            ],
        }
    ]
    # Render the chat template (text only) and extract the vision inputs.
    tokenized_text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    image_inputs, video_inputs = process_vision_info(messages)
    # Tokenize/pack everything into model-ready tensors; follow the model's
    # actual device instead of hard-coding "cuda" (device_map="auto" may
    # place it elsewhere, and this also works on CPU-only hosts).
    inputs = processor(
        text=[tokenized_text],
        images=image_inputs,
        videos=video_inputs,
        padding=True,
        return_tensors="pt",
    ).to(model.device)
    # Generate, then strip the echoed prompt tokens from each sequence
    # before decoding so only the new answer remains.
    generated_ids = model.generate(**inputs, generation_config=generation_config)
    generated_ids_trimmed = [out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)]
    output_text = processor.batch_decode(generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    return output_text[0]
# Gradio UI: an image upload plus a free-form prompt, wired to the generator.
image_input = gr.Image(type="numpy", label="Upload Image")
prompt_input = gr.Textbox(lines=2, placeholder="Enter your prompt/question here", label="Prompt")

iface = gr.Interface(
    fn=generate_description,
    inputs=[image_input, prompt_input],
    outputs="text",
    title="Image Description Generator",
    description="Upload an image and enter a prompt/question to get a detailed description or answer based on the image.",
)
iface.launch()