---
license: apache-2.0
---

### Generation

The following is sample code for inference.

```python
from llava.model.builder import load_pretrained_model
from llava.mm_utils import process_images, tokenizer_image_token
from llava.constants import DEFAULT_IMAGE_TOKEN

from PIL import Image
import torch
import warnings

# The LLaVA-NeXT repository must be importable, e.g.:
# export PYTHONPATH="/thestack/LLM4CodeBeta/LLaVA-NeXT-FLAME:$PYTHONPATH"

warnings.filterwarnings("ignore")

# Path to the fine-tuned checkpoint.
pretrained = "/root/nfs3/flame_ft/res/checkpoints/flame-google_siglip-so400m-patch14-384-deepseek-ai_deepseek-coder-6.7b-instruct-mlp2x_gelu-selectlayer-2-onevision-1-pretrain_mmcoder-3NODE-Date1212-STAGE2v9-2-data_1220_no_code_v1-inst_data-STAGE2v9-eos-16k-1220-FINETUNE-2-data_1220/no_code_v1-inst_data-v5_v6-eos-16k-1223"

model_name = "flame"
device = "cuda"
device_map = "auto"
llava_model_args = {
    "multimodal": True,
    "attn_implementation": None,
}

tokenizer, model, image_processor, max_length = load_pretrained_model(
    pretrained, None, model_name, device_map=device_map, **llava_model_args
)
model.config.tokenizer_padding_side = "left"  # use left padding for batch processing
# model.config.image_aspect_ratio = "resize"
model.eval()

# Load and preprocess the screenshot to replicate.
url = "/root/nfs2/flame_ft/datasets/data_1220/TESTING_DATA/TEST80/imgs/000000034/000000034.png"
image = Image.open(url)
image_tensor = process_images([image], image_processor, model.config)
image_tensor = [_image.to(dtype=torch.float16, device=device) for _image in image_tensor]

# DEFAULT_IMAGE_TOKEN marks where the image features are spliced into the prompt.
prompt = (
    "Below is an image of the page to create. Generate React code and styles to "
    "replicate the design, including layout, typography, and styling. Format your "
    "response as follows:'// CSS\n[CSS/SCSS code]\n\n"
    "// [React Implementation (JS/TS/JSX/TSX)]\n[Component code]'."
    f"\n\n ### Input Image:\n{DEFAULT_IMAGE_TOKEN}\n\n### Response:\n"
)

input_ids = tokenizer_image_token(prompt, tokenizer, return_tensors="pt")
input_ids = input_ids.unsqueeze(0).to(device)  # add the batch dimension
image_sizes = [image.size]
modalities = ["image"]

cont = model.generate(
    input_ids,
    images=image_tensor,
    image_sizes=image_sizes,
    modalities=modalities,
    do_sample=True,       # beam search combined with sampling
    num_beams=5,
    temperature=0.1,      # low temperature keeps output close to greedy
    max_new_tokens=4096,
    top_p=0.95,
    repetition_penalty=1.05,
)

text_outputs = tokenizer.batch_decode(cont, skip_special_tokens=True)
print(text_outputs[0])
```
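
The prompt asks the model to emit the stylesheet and the component in a single response, separated by `// CSS` and `// [React Implementation ...]` markers. Below is a minimal sketch for splitting the decoded text into two files; the marker regex and the output file names are assumptions based on the prompt format above, not part of the FLAME/LLaVA API.

```python
import re

def split_generation(text: str) -> tuple[str, str]:
    r"""Split a decoded response into (css, component) strings.

    Assumes the response follows the format requested in the prompt:
    '// CSS\n[CSS/SCSS code]\n\n// [React Implementation ...]\n[Component code]'.
    """
    match = re.search(r"^// \[?React Implementation", text, flags=re.MULTILINE)
    if match is None:
        # No marker found: treat the whole output as the component.
        return "", text.strip()
    css = text[:match.start()].replace("// CSS", "", 1).strip()
    # Drop the marker line itself; the component code follows it.
    component = text[match.start():].split("\n", 1)[-1].strip()
    return css, component

css, component = split_generation(text_outputs[0])
# Hypothetical output names; adjust the extensions for SCSS or TSX responses.
with open("page.css", "w") as f:
    f.write(css)
with open("Page.jsx", "w") as f:
    f.write(component)
```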