Afsha001 committed on
Commit
a0d9361
·
1 Parent(s): 25245f2

add app.py and requirements.txt

Browse files
Files changed (1) hide show
  1. app.py +48 -0
app.py CHANGED
@@ -72,6 +72,54 @@ def load_local_models():
72
 
73
  return blip_processor, itm_model, dino_processor, dino_model
74
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  # ── Step 2: BLIP ITM Scoring (local CPU) ──
76
  def compute_itm_scores(image, captions, blip_processor, itm_model):
77
  scores = []
 
72
 
73
  return blip_processor, itm_model, dino_processor, dino_model
74
 
75
+ # ── Step 1: Generate 5 captions via Qwen2-VL API ──
76
def generate_captions_api(image: Image.Image) -> list:
    """Generate five captions for ``image`` by querying the Qwen2-VL endpoint.

    The image is JPEG-encoded once and posted to ``QWEN_VL_URL`` together
    with each of five prompts. Every prompt contributes exactly one caption:
    a fixed fallback string is used when the request raises, returns a
    non-200 status, or yields empty text, so this function never propagates
    a request failure to the caller.

    Args:
        image: PIL image to caption. Modes that JPEG cannot store
            (e.g. RGBA, P) are converted to RGB before encoding.

    Returns:
        A list of exactly five lower-cased caption strings. Duplicates are
        removed first (keeping first-seen order); if fewer than five
        distinct captions remain, the list is padded by repeating the
        first caption.
    """
    import logging

    logger = logging.getLogger(__name__)
    num_captions = 5

    # JPEG has no alpha/palette support; saving such an image raises, so
    # normalise to RGB first instead of crashing before any request is made.
    if image.mode not in ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr"):
        image = image.convert("RGB")

    buffered = BytesIO()
    image.save(buffered, format="JPEG")
    img_bytes = buffered.getvalue()

    prompts = (
        "Describe this image in one detailed sentence.",
        "What is happening in this image? Write one descriptive sentence.",
        "Describe the main subjects, actions and setting in one sentence.",
        "Write a detailed caption focusing on people, animals and objects visible.",
        "Describe this scene including background details and activities shown.",
    )

    captions = []
    for prompt in prompts:
        try:
            response = requests.post(
                QWEN_VL_URL,
                headers=HF_HEADERS,
                # requests ignores ``json=`` whenever ``files=`` is passed, so
                # the prompt has to travel as a form field of the same
                # multipart body or it is never sent at all.
                # NOTE(review): confirm the endpoint accepts multipart input.
                data={"inputs": prompt},
                files={"image": img_bytes},
                timeout=30,
            )
            if response.status_code == 200:
                result = response.json()
                if isinstance(result, list):
                    cap = result[0].get("generated_text", "").strip().lower()
                else:
                    cap = str(result).strip().lower()
                captions.append(cap if cap else "a scene with various objects and people")
            else:
                logger.warning(
                    "Caption request returned HTTP %s", response.status_code
                )
                captions.append("a detailed scene with people and objects")
        except Exception:
            # Deliberate best-effort: a failed prompt must not abort the
            # whole pipeline, but the failure is recorded rather than hidden.
            logger.exception("Caption request failed for prompt %r", prompt)
            captions.append("a scene captured in the image")

    # Deduplicate while preserving first-seen order.
    unique = list(dict.fromkeys(captions))
    # ``captions`` holds one entry per prompt, so ``unique`` is never empty
    # and indexing its first element for padding is safe.
    unique.extend([unique[0]] * (num_captions - len(unique)))

    return unique[:num_captions]
122
+
123
  # ── Step 2: BLIP ITM Scoring (local CPU) ──
124
  def compute_itm_scores(image, captions, blip_processor, itm_model):
125
  scores = []