PatrickRedStar committed on
Commit
e4b559c
·
1 Parent(s): e0b9d86
Files changed (2) hide show
  1. app.py +19 -3
  2. requirements.txt +1 -0
app.py CHANGED
@@ -145,7 +145,13 @@ def image_captioning(image):
145
 
146
  def visual_question_answering(image, question):
147
  vqa_pipeline = load_image_model("vqa")
148
- result = vqa_pipeline(image, question)
 
 
 
 
 
 
149
  return f"{result[0]['answer']} (confidence: {result[0]['score']:.3f})"
150
 
151
  def zero_shot_classification(image, classes):
@@ -175,7 +181,17 @@ def image_retrieval(images, query):
175
  processor = models["clip_processor"]
176
 
177
  # Обрабатываем все изображения
178
- image_inputs = processor(images=images, return_tensors="pt", padding=True)
 
 
 
 
 
 
 
 
 
 
179
  with torch.no_grad():
180
  image_embeddings = model.get_image_features(**image_inputs)
181
  image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
@@ -193,7 +209,7 @@ def image_retrieval(images, query):
193
  best_idx = similarities.argmax().item()
194
  best_score = similarities[best_idx].item()
195
 
196
- return f"Лучшее изображение: #{best_idx + 1} (схожесть: {best_score:.4f})", images[best_idx]
197
 
198
  # Создаем интерфейс Gradio
199
  with gr.Blocks(title="Multimodal AI Demo", theme=gr.themes.Soft()) as demo:
 
145
 
146
  def visual_question_answering(image, question):
147
  vqa_pipeline = load_image_model("vqa")
148
+ cleaned_question = (question or "").strip()
149
+ result = vqa_pipeline(
150
+ image=image,
151
+ question=cleaned_question,
152
+ truncation=True, # keep text within ViLT max sequence length (40)
153
+ max_length=40,
154
+ )
155
  return f"{result[0]['answer']} (confidence: {result[0]['score']:.3f})"
156
 
157
  def zero_shot_classification(image, classes):
 
181
  processor = models["clip_processor"]
182
 
183
  # Обрабатываем все изображения
184
+ if isinstance(images, tuple):
185
+ images = list(images)
186
+ normalized_images = []
187
+ for item in images:
188
+ # Gallery может вернуть (image, caption); берем только картинку
189
+ if isinstance(item, (list, tuple)) and item:
190
+ normalized_images.append(item[0])
191
+ else:
192
+ normalized_images.append(item)
193
+
194
+ image_inputs = processor(images=normalized_images, return_tensors="pt", padding=True)
195
  with torch.no_grad():
196
  image_embeddings = model.get_image_features(**image_inputs)
197
  image_embeddings = image_embeddings / image_embeddings.norm(dim=-1, keepdim=True)
 
209
  best_idx = similarities.argmax().item()
210
  best_score = similarities[best_idx].item()
211
 
212
+ return f"Лучшее изображение: #{best_idx + 1} (схожесть: {best_score:.4f})", normalized_images[best_idx]
213
 
214
  # Создаем интерфейс Gradio
215
  with gr.Blocks(title="Multimodal AI Demo", theme=gr.themes.Soft()) as demo:
requirements.txt CHANGED
@@ -10,3 +10,4 @@ sentence-transformers>=2.2.0
10
  librosa>=0.10.0
11
  requests>=2.28.0
12
  accelerate>=0.20.0
 
 
10
  librosa>=0.10.0
11
  requests>=2.28.0
12
  accelerate>=0.20.0
13
+ timm>=0.6.12