Monimoy committed on
Commit 4e42ff8 · verified · 1 Parent(s): f3720b2

Update app.py

Files changed (1)
  1. app.py +144 -145
app.py CHANGED
@@ -1,145 +1,144 @@
- # app.py
- import os
- import gradio as gr
- import torch
- from PIL import Image
- from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
- import timm
- from torchvision import transforms
- #from llama_cpp import Llama
- from peft import PeftModel
-
- # 1. Model Definitions (Same as in training script)
- class SigLIPImageEncoder(torch.nn.Module):
-     def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
-         super().__init__()
-         self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')  # pretrained=False
-         self.embed_dim = embed_dim
-         self.projection = torch.nn.Linear(self.model.num_features, embed_dim)
-
-         if pretrained_path:
-             self.load_state_dict(torch.load(pretrained_path, map_location=torch.device('cpu')))  # Load to CPU first
-             print(f"Loaded SigLIP image encoder from {pretrained_path}")
-         else:
-             print("Initialized SigLIP image encoder without pretrained weights.")
-
-     def forward(self, image):
-         features = self.model(image)
-         embedding = self.projection(features)
-         return embedding
-
- # 2. Load Models and Tokenizer
- phi3_model_path = "QuantFactory/Phi-3-mini-4k-instruct-GGUF"  # Path to your quantized Phi-3 GGUF model
- peft_model_path = "./qlora-phi3-model"
- image_model_name = 'resnet50'
- image_embed_dim = 512
- siglip_pretrained_path = "image_encoder.pth"  # Path to your pretrained SigLIP model
-
- device = torch.device("cpu")  # Force CPU
- print(f"Using device: {device}")
-
- # Load Tokenizer (using a compatible tokenizer)
- text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)  # Or a compatible tokenizer
- text_tokenizer.pad_token = text_tokenizer.eos_token  # Important for training
-
- # Image Transformations
- image_transform = transforms.Compose([
-     transforms.Resize((224, 224)),
-     transforms.ToTensor(),
-     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
- ])
-
- # Load SigLIP Image Encoder
- image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim, pretrained_path=siglip_pretrained_path).to(device)
- image_encoder.eval()  # Set to evaluation mode
-
- # Load Phi-3 model using llama.cpp
- #base_model = Llama(
- #    model_path=phi3_model_path,
- #    n_gpu_layers=0,  # Ensure no GPU usage
- #    n_ctx=2048,  # Adjust context length as needed
- #    verbose=True,
- #)
-
-
- #base_model = Llama.from_pretrained(
- #    repo_id="QuantFactory/Phi-3-mini-4k-instruct-GGUF",
- #    filename="Phi-3-mini-4k-instruct.Q2_K.gguf",
- #    n_gpu_layers=0,
- #    n_ctx=2048,
- #    verbose=True
- #)
-
- base_model_name = "microsoft/Phi-3-mini-4k-instruct"
- device = "cpu"
- bnb_config = BitsAndBytesConfig(
-     load_in_4bit=True,
-     bnb_4bit_use_double_quant=True,
-     bnb_4bit_quant_type="nf4",
-     bnb_4bit_compute_dtype=torch.bfloat16
- )
- #base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map={"": device})
- base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.bfloat16, trust_remote_code=True,  # Important for some Phi-3 variants
-                                                   quantization_config=bnb_config, device_map={"": device})
-
- # Load and merge
- model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
- model = model.merge_and_unload()
- print("phi-3 model loaded sucessfully")
- # 3. Inference Function
- def predict(image, question):
-     """
-     Takes an image and a question as input and returns an answer.
-     """
-     if image is None or question is None or question == "":
-         return "Please provide both an image and a question."
-
-     try:
-         image = Image.fromarray(image).convert("RGB")
-         image = image_transform(image).unsqueeze(0).to(device)
-
-         # Get image embeddings
-         with torch.no_grad():
-             image_embeddings = image_encoder(image)
-             # Flatten the image embeddings for simplicity
-             image_embeddings = image_embeddings.flatten().tolist()
-
-         # Create the prompt with image embeddings
-         prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"
-
-         # Generate answer using llama.cpp
-         output = model(
-             prompt,
-             max_tokens=128,
-             stop=["Q:", "\n"],
-             echo=False,
-         )
-
-         answer = output["choices"][0]["text"].strip()
-
-         return answer
-
-     except Exception as e:
-         return f"An error occurred: {str(e)}"
-
- # 4. Gradio Interface
- iface = gr.Interface(
-     fn=predict,
-     inputs=[
-         gr.Image(label="Upload an Image"),
-         gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")
-     ],
-     outputs=gr.Textbox(label="Answer"),
-     title="Image Question Answering with Phi-3 and SigLIP (CPU)",
-     description="Ask questions about an image and get answers powered by Phi-3 (llama.cpp) and SigLIP.",
-     examples=[
-         ["cat_0006.png", "Create a interesting story about this image?"],
-         ["bird_0004.png", "Can you describe this image?"],
-         ["truck_0003.png", "Elaborate the setting of the image"],
-         ["ship_0007.png", "Explain the purpose of image"]
-     ]
- )
-
- # 5. Launch the App
- if __name__ == "__main__":
-     iface.launch()
 
+ # app.py
+ import spaces
+ import os
+ import gradio as gr
+ import torch
+ from PIL import Image
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import timm
+ from torchvision import transforms
+ #from llama_cpp import Llama
+ from peft import PeftModel
+
+ # 1. Model Definitions (Same as in training script)
+ class SigLIPImageEncoder(torch.nn.Module):
+     def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
+         super().__init__()
+         self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')  # pretrained=False
+         self.embed_dim = embed_dim
+         self.projection = torch.nn.Linear(self.model.num_features, embed_dim)
+
+         if pretrained_path:
+             self.load_state_dict(torch.load(pretrained_path, map_location=torch.device('cpu')))  # Load to CPU first
+             print(f"Loaded SigLIP image encoder from {pretrained_path}")
+         else:
+             print("Initialized SigLIP image encoder without pretrained weights.")
+
+     def forward(self, image):
+         features = self.model(image)
+         embedding = self.projection(features)
+         return embedding
+
+ # 2. Load Models and Tokenizer
+ #phi3_model_path = "QuantFactory/Phi-3-mini-4k-instruct-GGUF"  # Path to your quantized Phi-3 GGUF model
+ peft_model_path = "./qlora-phi3-model"
+ image_model_name = 'resnet50'
+ image_embed_dim = 512
+ siglip_pretrained_path = "image_encoder.pth"  # Path to your pretrained SigLIP model
+
+ #device = torch.device("cpu")  # Force CPU
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Using device: {device}")
+
+ # Load Tokenizer (using a compatible tokenizer)
+ text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True)  # Or a compatible tokenizer
+ text_tokenizer.pad_token = text_tokenizer.eos_token  # Important for training
+
+ # Image Transformations
+ image_transform = transforms.Compose([
+     transforms.Resize((224, 224)),
+     transforms.ToTensor(),
+     transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+ ])
+
+ # Load SigLIP Image Encoder
+ image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim, pretrained_path=siglip_pretrained_path).to(device)
+ image_encoder.eval()  # Set to evaluation mode
+
+ # Load Phi-3 model using llama.cpp
+ #base_model = Llama(
+ #    model_path=phi3_model_path,
+ #    n_gpu_layers=0,  # Ensure no GPU usage
+ #    n_ctx=2048,  # Adjust context length as needed
+ #    verbose=True,
+ #)
+
+
+ #base_model = Llama.from_pretrained(
+ #    repo_id="QuantFactory/Phi-3-mini-4k-instruct-GGUF",
+ #    filename="Phi-3-mini-4k-instruct.Q2_K.gguf",
+ #    n_gpu_layers=0,
+ #    n_ctx=2048,
+ #    verbose=True
+ #)
+
+ base_model_name = "microsoft/Phi-3-mini-4k-instruct"
+ #device = "cuda"
+
+ #base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map={"": device})
+ base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float32, device_map="auto")
+
+
+ # Load and merge
+ model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
+ model = model.merge_and_unload()
+ print("phi-3 model loaded sucessfully")
+ # 3. Inference Function
+
+ @spaces.GPU
+ def predict(image, question):
+     """
+     Takes an image and a question as input and returns an answer.
+     """
+     if image is None or question is None or question == "":
+         return "Please provide both an image and a question."
+
+     try:
+         image = Image.fromarray(image).convert("RGB")
+         image = image_transform(image).unsqueeze(0).to(device)
+
+         # Get image embeddings
+         with torch.no_grad():
+             image_embeddings = image_encoder(image)
+             # Flatten the image embeddings for simplicity
+             image_embeddings = image_embeddings.flatten().tolist()
+
+         # Create the prompt with image embeddings
+         prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"
+
+         # Generate answer using llama.cpp
+         output = model(
+             prompt,
+             max_tokens=128,
+             stop=["Q:", "\n"],
+             echo=False,
+         )
+
+         answer = output["choices"][0]["text"].strip()
+
+         return answer
+
+     except Exception as e:
+         return f"An error occurred: {str(e)}"
+
+ # 4. Gradio Interface
+ iface = gr.Interface(
+     fn=predict,
+     inputs=[
+         gr.Image(label="Upload an Image"),
+         gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?")
+     ],
+     outputs=gr.Textbox(label="Answer"),
+     title="Image Question Answering with Phi-3 and SigLIP (CPU)",
+     description="Ask questions about an image and get answers powered by Phi-3 (llama.cpp) and SigLIP.",
+     examples=[
+         ["cat_0006.png", "Create a interesting story about this image?"],
+         ["bird_0004.png", "Can you describe this image?"],
+         ["truck_0003.png", "Elaborate the setting of the image"],
+         ["ship_0007.png", "Explain the purpose of image"]
+     ]
+ )
+
+ # 5. Launch the App
+ if __name__ == "__main__":
+     iface.launch()
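
Note on the inference path: after `merge_and_unload()`, `model` is a plain transformers causal LM, yet `predict` still calls it with llama.cpp's `Llama.__call__` signature (`max_tokens`, `stop`, `echo`) and reads `output["choices"][0]["text"]`, which is the llama_cpp response shape; on this model that call raises a TypeError, which the `except` block then surfaces as the "answer". A minimal sketch of the equivalent transformers generation path follows, assuming the `model`, `text_tokenizer`, and `prompt` defined in app.py above; the generation parameters are illustrative, not taken from this commit.

# Sketch only: transformers-style generation for the merged Phi-3 model.
inputs = text_tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    output_ids = model.generate(
        **inputs,
        max_new_tokens=128,  # plays the role of llama.cpp's max_tokens
        do_sample=False,
        pad_token_id=text_tokenizer.eos_token_id,
    )
# Decode only the newly generated tokens (the counterpart of echo=False).
new_tokens = output_ids[0][inputs["input_ids"].shape[-1]:]
answer = text_tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
# llama.cpp's stop=["Q:", "\n"] has no one-argument twin here; truncating the
# decoded text at the first stop marker is a simple substitute.
answer = answer.split("\n")[0].split("Q:")[0].strip()

Because `predict` is wrapped in `@spaces.GPU`, this generation would run while a GPU is attached on ZeroGPU hardware, which is why the sketch moves inputs to `model.device` rather than the module-level `device` chosen at import time.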