Monimoy committed on
Commit
89c355b
·
verified ·
1 Parent(s): b2cc644

Upload 3 files

Browse files
Files changed (3) hide show
  1. app.py +134 -0
  2. image_encoder.pth +3 -0
  3. requirements.txt +7 -0
app.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import os
3
+ import gradio as gr
4
+ import torch
5
+ from PIL import Image
6
+ from transformers import AutoTokenizer
7
+ import timm
8
+ from torchvision import transforms
9
+ from llama_cpp import Llama
10
+ from peft import PeftModel
11
+
12
# 1. Model Definitions (Same as in training script)
class SigLIPImageEncoder(torch.nn.Module):
    """Image encoder: a timm backbone pooled to a vector, then linearly
    projected to a fixed-size embedding.

    Args:
        model_name: timm model identifier for the backbone.
        embed_dim: dimensionality of the output embedding.
        pretrained_path: optional path to a saved state_dict for this module;
            when given, weights are loaded (onto CPU) at construction time.
    """

    def __init__(self, model_name='resnet50', embed_dim=512, pretrained_path=None):
        super().__init__()
        # Backbone is built without ImageNet weights (pretrained=False);
        # trained weights come from `pretrained_path` instead.
        self.model = timm.create_model(model_name, pretrained=False, num_classes=0, global_pool='avg')
        self.embed_dim = embed_dim
        self.projection = torch.nn.Linear(self.model.num_features, embed_dim)

        if pretrained_path:
            # Load onto CPU first so checkpoints trained on GPU work anywhere.
            state_dict = torch.load(pretrained_path, map_location=torch.device('cpu'))
            self.load_state_dict(state_dict)
            print(f"Loaded SigLIP image encoder from {pretrained_path}")
        else:
            print("Initialized SigLIP image encoder without pretrained weights.")

    def forward(self, image):
        """Map a batch of images to (batch, embed_dim) embeddings."""
        pooled = self.model(image)
        return self.projection(pooled)
30
+
31
# 2. Load Models and Tokenizer
phi3_model_path = "QuantFactory/Phi-3-mini-4k-instruct-GGUF" # Path to your quantized Phi-3 GGUF model
# NOTE(review): the value above looks like a Hugging Face repo id, but
# llama_cpp.Llama(model_path=...) expects a local .gguf file path — confirm
# this resolves to a file on disk (otherwise Llama.from_pretrained(repo_id=...,
# filename=...) is the intended API).
peft_model_path = "./qlora_phi3_model"
image_model_name = 'resnet50'
image_embed_dim = 512
siglip_pretrained_path = "image_encoder.pth" # Path to your pretrained SigLIP model

device = torch.device("cpu") # Force CPU
print(f"Using device: {device}")

# Load Tokenizer (using a compatible tokenizer)
text_tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", trust_remote_code=True) # Or a compatible tokenizer
text_tokenizer.pad_token = text_tokenizer.eos_token # Important for training

# Image Transformations
# Standard ImageNet-style preprocessing: 224x224 resize, tensor conversion,
# per-channel normalization (matches what the encoder was presumably trained
# with — confirm against the training script).
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

# Load SigLIP Image Encoder
image_encoder = SigLIPImageEncoder(model_name=image_model_name, embed_dim=image_embed_dim, pretrained_path=siglip_pretrained_path).to(device)
image_encoder.eval() # Set to evaluation mode

# Load Phi-3 model using llama.cpp
base_model = Llama(
    model_path=phi3_model_path,
    n_gpu_layers=0, # Ensure no GPU usage
    n_ctx=2048, # Adjust context length as needed
    verbose=True,
)


# NOTE(review): PeftModel.from_pretrained expects a transformers PreTrrainedModel-
# style object; wrapping a llama_cpp.Llama instance will almost certainly raise,
# and merge_and_unload() is likewise a transformers/PEFT API. Verify whether the
# QLoRA adapter should instead be merged into the HF model and re-exported to
# GGUF before loading through llama.cpp.
model = PeftModel.from_pretrained(base_model, peft_model_path, offload_dir='./offload')
model = model.merge_and_unload()
print("phi-3 model loaded sucessfully")
68
# 3. Inference Function
def predict(image, question):
    """Answer a free-text question about an uploaded image.

    Args:
        image: numpy array from the Gradio image widget (H x W x C), or None.
        question: the user's question; None/empty is rejected.

    Returns:
        The model's answer as a string, or a human-readable error message
        (errors are returned rather than raised so the Gradio UI shows them).
    """
    # Guard clause: Gradio passes None for missing inputs; `not question`
    # also rejects the empty string.
    if image is None or not question:
        return "Please provide both an image and a question."

    try:
        # Keep the original numpy array untouched; work on a PIL copy.
        pil_image = Image.fromarray(image).convert("RGB")
        image_tensor = image_transform(pil_image).unsqueeze(0).to(device)

        # Encode the image; no gradients needed at inference time.
        with torch.no_grad():
            image_embeddings = image_encoder(image_tensor)
        # Flatten to a plain Python list so it can be interpolated into the
        # text prompt below.
        # NOTE(review): serializing raw embedding floats into the prompt only
        # works if the LLM was tuned on exactly this format — confirm against
        # the training script.
        image_embeddings = image_embeddings.flatten().tolist()

        prompt = f"Question: {question}\nImage Embeddings: {image_embeddings}\nAnswer:"

        # Generate the answer via the llama.cpp-style completion call; the
        # result is a dict with a "choices" list.
        output = model(
            prompt,
            max_tokens=128,
            stop=["Q:", "\n"],  # stop at a follow-up question or end of line
            echo=False,         # do not echo the prompt back in the output
        )

        return output["choices"][0]["text"].strip()

    except Exception as e:
        # Surface the failure in the UI instead of crashing the worker.
        return f"An error occurred: {str(e)}"
103
+
104
# 4. Gradio Interface
# Example rows shown under the interface; each row is [image path, question].
example_rows = [
    ["example_image_1.jpg", "What color is the car?"],
    ["example_image_2.jpg", "How many people are in the image?"],
    ["example_image_3.jpg", "What is the person doing?"],
]

iface = gr.Interface(
    fn=predict,
    inputs=[
        gr.Image(label="Upload an Image"),
        gr.Textbox(label="Ask a Question about the Image", placeholder="What is in the image?"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="Image Question Answering with Phi-3 and SigLIP (CPU)",
    description="Ask questions about an image and get answers powered by Phi-3 (llama.cpp) and SigLIP.",
    examples=example_rows,
)
120
+
121
# 5. Launch the App
if __name__ == "__main__":
    # Create solid-color placeholder images so the Gradio example rows do not
    # point at missing files on a fresh checkout. Existing files are never
    # overwritten. (The original repeated this create-if-missing logic three
    # times; a data-driven loop keeps one copy of the logic.)
    example_specs = [
        ("example_image_1.jpg", "red"),
        ("example_image_2.jpg", "green"),
        ("example_image_3.jpg", "blue"),
    ]
    for filename, color in example_specs:
        if not os.path.exists(filename):
            Image.new("RGB", (224, 224), color=color).save(filename)

    iface.launch()
image_encoder.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f070bb389421acfedc5a21e4c3fc2a0f5f9298da7e71611dd352333e037f0cd
3
+ size 98553802
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ gradio
2
+ torch
3
+ torchvision
4
+ timm
5
+ Pillow
6
+ transformers
7
+ llama-cpp-python