Jeblest commited on
Commit
41e7b9b
·
1 Parent(s): ec13977

First app test

Browse files
Files changed (4) hide show
  1. README.md +19 -9
  2. app.py +15 -0
  3. inference.py +101 -0
  4. requirements.txt +8 -0
README.md CHANGED
@@ -1,12 +1,22 @@
1
  ---
2
- title: Qwen 2.5 VL 7B Image Captioning
3
- emoji: 🏢
4
- colorFrom: red
5
- colorTo: pink
6
- sdk: gradio
7
- sdk_version: 5.35.0
8
- app_file: app.py
9
- pinned: false
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ base_model: Qwen/Qwen2.5-VL-7B-Instruct
3
+ tags:
4
+ - image_captioning
5
+ - lora
6
+ - peft
7
+ library_name: peft
 
 
8
  ---
9
 
10
+ This is a LoRA adapter for the `Qwen/Qwen2.5-VL-7B-Instruct` model.
11
+
12
+ ## How to use
13
+ You can load this adapter on top of the base model like this:
14
+ ```python
15
+ from peft import PeftModel
16
+ from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
17
+
18
+ base_model_id = "Qwen/Qwen2.5-VL-7B-Instruct"
19
+ adapter_id = "Jeblest/Qwen-2.5-7B-Instruct-fine-tune-image-caption"
20
+
21
+ base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(base_model_id)
22
+ model = PeftModel.from_pretrained(base_model, adapter_id)
app.py ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import gradio as gr

from inference import infer_single_image, model, processor


def generate_caption(image, prompt):
    """Gradio callback: caption the uploaded image, using a default prompt when none given."""
    return infer_single_image(model, processor, image, prompt or "Describe this image.")


# Minimal UI: one image input, one optional text prompt, one text output.
demo = gr.Interface(
    fn=generate_caption,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Textbox(label="Prompt (optional)"),
    ],
    outputs=gr.Textbox(label="Generated Caption"),
    title="Qwen2.5-VL-7B Fine-tuned Image Captioning",
)
demo.launch()
inference.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import torch
from transformers import (
    AutoProcessor,
    BitsAndBytesConfig,
    Qwen2_5_VLForConditionalGeneration,
)
from peft import PeftModel
from PIL import Image

# Hugging Face repos: the LoRA adapter and the base model it was trained on.
MODEL_REPO = "Jeblest/Qwen-2.5-7B-Instruct-fine-tune-image-caption"
BASE_MODEL = "Qwen/Qwen2.5-VL-7B-Instruct"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# 4-bit NF4 quantization so the 7B model fits in modest GPU memory.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# Load the quantized base model.
# BUG FIX: the original imported Qwen2_5_VLForConditionalGeneration from
# `modelscope`, which is not listed in requirements.txt and would crash the
# Space at import time; the class is provided by `transformers` (>=4.49).
base_model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    BASE_MODEL,
    device_map="auto",
    quantization_config=quantization_config,
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,
)

# Attach the LoRA adapter weights (config + weights downloaded from the Hub).
model = PeftModel.from_pretrained(
    base_model,
    MODEL_REPO,
    torch_dtype=torch.bfloat16,
)
model.eval()

# Processor handles both image preprocessing and tokenization.
processor = AutoProcessor.from_pretrained(BASE_MODEL)
if processor.tokenizer.pad_token is None:
    # Some checkpoints ship without a pad token; fall back to EOS so padding works.
    processor.tokenizer.pad_token = processor.tokenizer.eos_token
44
class SingleImageCollator:
    """Turn one (image, prompt) pair into model-ready tensors.

    Wraps the processor's chat template so a caller (e.g. Gradio) can hand
    over a single PIL image and receive a batch of size one.
    """

    def __init__(self, processor, user_query: str = "Generate a detailed caption based on this image."):
        self.processor = processor
        self.user_query = user_query

    def __call__(self, image: "Image.Image"):
        # Normalize to RGB and a fixed 448x448 resolution before processing.
        rgb = image.convert("RGB").resize((448, 448))
        chat = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": self.user_query},
                    {"type": "image", "image": rgb},
                ],
            }
        ]
        rendered = self.processor.apply_chat_template(
            chat, add_generation_prompt=True, tokenize=False
        )
        return self.processor(
            text=rendered.strip(),
            images=[rgb],
            return_tensors="pt",
            padding=True,
            padding_side="left",
        )
61
def infer_single_image(
    model,
    processor,
    image: "Image.Image",
    prompt: str = "Generate a detailed caption based on this image.",
    max_new_tokens: int = 100,
    temperature: float = 0.3,
    top_k: int = 30,
    top_p: float = 0.8,
    repetition_penalty: float = 1.1,
    length_penalty: float = 1.0,
    device: str = None,
) -> str:
    """Generate a caption for a single image.

    Args:
        model: the (PEFT-wrapped) vision-language model.
        processor: the matching processor (tokenizer + image preprocessing).
        image: input PIL image.
        prompt: user instruction inserted into the chat template.
        max_new_tokens, temperature, top_k, top_p, repetition_penalty,
            length_penalty: sampling knobs forwarded to ``model.generate``.
        device: device for the input tensors; auto-detected when None.

    Returns:
        The generated caption with surrounding whitespace stripped.
    """
    if device is None:
        device = "cuda" if torch.cuda.is_available() else "cpu"

    # BUG FIX: this module loads the model 4-bit quantized with
    # device_map="auto"; calling .to() on such a model raises a ValueError in
    # recent transformers. Only move a plain, un-dispatched model; otherwise
    # leave placement to accelerate and just move the inputs below.
    if not getattr(model, "hf_device_map", None) and not getattr(
        model, "is_loaded_in_4bit", False
    ):
        model.to(device)
    model.eval()

    collator = SingleImageCollator(processor, user_query=prompt)
    inputs = collator(image)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        generated_ids = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            do_sample=True,
            temperature=temperature,
            top_k=top_k,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
            length_penalty=length_penalty,
            pad_token_id=processor.tokenizer.pad_token_id,
        )

    # Decode only the newly generated tokens (drop the echoed prompt tokens).
    generated_text = processor.batch_decode(
        generated_ids[:, inputs["input_ids"].shape[1]:],
        skip_special_tokens=True,
    )[0]

    return generated_text.strip()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ torch
2
+ transformers>=4.49.0
3
+ peft
4
+ bitsandbytes
5
+ accelerate
6
+ gradio
7
+ pillow
8
+ datasets