Spaces:

vidhi0405
/

ImageToText

Sleeping

App Files Files Community

vidhi0405 committed 10 days ago

Commit

3d467cc

1 Parent(s): 4864dad

separate endpoints

Browse files

Files changed (1) hide show

app.py +111 -15

app.py CHANGED Viewed

@@ -261,8 +261,8 @@ def summarize_captions(captions: list[str]) -> str:
         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
-                max_length=300,
-                min_length=50,
                 length_penalty=2.0,
                 num_beams=4,
                 early_stopping=True,
@@ -274,7 +274,7 @@ def summarize_captions(captions: list[str]) -> str:
     return _finalize_caption(summary, max_sentences=10)
-def generate_caption_text(image: Image.Image) -> str:
     runtime_model, runtime_processor = _get_caption_runtime()
     model_device = str(next(runtime_model.parameters()).device)
@@ -300,7 +300,7 @@ def generate_caption_text(image: Image.Image) -> str:
         )
     try:
-        inputs = _build_inputs(CAPTION_PROMPT)
     except Exception as exc:
         if "Mismatch in `image` token count" not in str(exc):
             raise AppError("Failed to preprocess image for captioning.", 422) from exc
@@ -324,10 +324,10 @@ def generate_caption_text(image: Image.Image) -> str:
     return _finalize_caption(caption)
-def generate_caption_text_safe(image: Image.Image) -> str:
     global _caption_model, _caption_processor, _caption_force_cpu
     try:
-        return generate_caption_text(image)
     except Exception as exc:
         msg = str(exc)
         if "CUDA error" not in msg and "device-side assert" not in msg:
@@ -344,7 +344,7 @@ def generate_caption_text_safe(image: Image.Image) -> str:
             except Exception:
                 pass
-        return generate_caption_text(image)
 def insert_record(collection, payload: dict) -> str:
@@ -355,10 +355,7 @@ def insert_record(collection, payload: dict) -> str:
         raise AppError("MongoDB insert failed.", 503) from exc
-@app.post("/generate-caption")
-async def generate_caption(request: Request):
-    _ensure_db_ready()
     try:
         form = await request.form()
     except Exception as exc:
@@ -381,8 +378,8 @@ async def generate_caption(request: Request):
     if len(uploads) > MAX_IMAGES:
         raise AppError("You can upload a maximum of 5 images.", 400)
-    image_captions = []
-    for upload in uploads:
         if upload.content_type and not upload.content_type.startswith("image/"):
             raise AppError("All uploaded files must be images.", 400)
@@ -397,11 +394,23 @@ async def generate_caption(request: Request):
         except OSError as exc:
             raise AppError("Unable to read one of the uploaded images.", 400) from exc
         caption = generate_caption_text_safe(image)
         if not caption:
             raise AppError("Caption generation produced empty text.", 500)
-        image_captions.append({"filename": upload.filename, "caption": caption})
     caption_texts = [x["caption"] for x in image_captions]
     caption = summarize_captions(caption_texts)
@@ -424,3 +433,90 @@ async def generate_caption(request: Request):
     response_data["created_at"] = response_data["created_at"].isoformat()
     return ok("Caption generated successfully.", response_data)

         with torch.no_grad():
             output_ids = model.generate(
                 **inputs,
+                max_length=512,
+                min_length=100,
                 length_penalty=2.0,
                 num_beams=4,
                 early_stopping=True,
     return _finalize_caption(summary, max_sentences=10)
+def generate_caption_text(image: Image.Image, prompt: str = CAPTION_PROMPT) -> str:
     runtime_model, runtime_processor = _get_caption_runtime()
     model_device = str(next(runtime_model.parameters()).device)
         )
     try:
+        inputs = _build_inputs(prompt)
     except Exception as exc:
         if "Mismatch in `image` token count" not in str(exc):
             raise AppError("Failed to preprocess image for captioning.", 422) from exc
     return _finalize_caption(caption)
+def generate_caption_text_safe(image: Image.Image, prompt: str = CAPTION_PROMPT) -> str:
     global _caption_model, _caption_processor, _caption_force_cpu
     try:
+        return generate_caption_text(image, prompt)
     except Exception as exc:
         msg = str(exc)
         if "CUDA error" not in msg and "device-side assert" not in msg:
             except Exception:
                 pass
+        return generate_caption_text(image, prompt)
 def insert_record(collection, payload: dict) -> str:
         raise AppError("MongoDB insert failed.", 503) from exc
+async def _parse_images(request: Request) -> list[tuple[str, Image.Image]]:
     try:
         form = await request.form()
     except Exception as exc:
     if len(uploads) > MAX_IMAGES:
         raise AppError("You can upload a maximum of 5 images.", 400)
+    parsed_images = []
+    for i, upload in enumerate(uploads):
         if upload.content_type and not upload.content_type.startswith("image/"):
             raise AppError("All uploaded files must be images.", 400)
         except OSError as exc:
             raise AppError("Unable to read one of the uploaded images.", 400) from exc
+        filename = upload.filename or f"image_{i+1}"
+        parsed_images.append((filename, image))
+    return parsed_images
+@app.post("/generate-caption-summary")
+async def generate_caption_summary(request: Request):
+    _ensure_db_ready()
+    images = await _parse_images(request)
+    image_captions = []
+    for filename, image in images:
         caption = generate_caption_text_safe(image)
         if not caption:
             raise AppError("Caption generation produced empty text.", 500)
+        image_captions.append({"filename": filename, "caption": caption})
     caption_texts = [x["caption"] for x in image_captions]
     caption = summarize_captions(caption_texts)
     response_data["created_at"] = response_data["created_at"].isoformat()
     return ok("Caption generated successfully.", response_data)
+@app.post("/generate-caption-collage")
+async def generate_caption_collage(request: Request):
+    _ensure_db_ready()
+    images = await _parse_images(request)
+    # Create collage (horizontal strip, resized to height 512 for consistency)
+    resized_images = []
+    target_height = 512
+    for _, img in images:
+        aspect_ratio = img.width / img.height
+        new_width = int(target_height * aspect_ratio)
+        resized_images.append(img.resize((new_width, target_height), Image.Resampling.LANCZOS))
+    total_width = sum(img.width for img in resized_images)
+    collage = Image.new("RGB", (total_width, target_height))
+    x_offset = 0
+    for img in resized_images:
+        collage.paste(img, (x_offset, 0))
+        x_offset += img.width
+    caption = generate_caption_text_safe(collage)
+    if not caption:
+        raise AppError("Collage caption generation produced empty text.", 500)
+    # For database storage, we list source filenames but the 'image_captions'
+    # will just contain the single collage caption to avoid confusion.
+    source_filenames = [fname for fname, _ in images]
+    mongo_payload = {
+        "caption": caption,
+        "source_filenames": source_filenames,
+        "image_captions": [{"filename": "collage", "caption": caption}],
+        "images_count": len(images),
+        "is_summarized": False,  # It's a direct caption of a collage
+        "created_at": datetime.now(timezone.utc),
+    }
+    audio_file_id = insert_record(caption_collection, mongo_payload)
+    response_data = {**mongo_payload, "audio_file_id": audio_file_id}
+    response_data.pop("_id", None)
+    response_data["created_at"] = response_data["created_at"].isoformat()
+    return ok("Collage caption generated successfully.", response_data)
+@app.post("/generate-caption-context")
+async def generate_caption_context(request: Request):
+    _ensure_db_ready()
+    images = await _parse_images(request)
+    image_captions = []
+    previous_context = ""
+    for i, (filename, image) in enumerate(images):
+        prompt = CAPTION_PROMPT
+        if i > 0 and previous_context:
+            prompt = f"Context from previous image: {previous_context}. {CAPTION_PROMPT}"
+        caption = generate_caption_text_safe(image, prompt=prompt)
+        if not caption:
+            caption = "No caption generated."
+        image_captions.append({"filename": filename, "caption": caption})
+        previous_context = caption
+    # Combine captions for the main 'caption' field
+    full_text = " ".join([ic["caption"] for ic in image_captions])
+    mongo_payload = {
+        "caption": full_text,
+        "source_filenames": [fname for fname, _ in images],
+        "image_captions": image_captions,
+        "images_count": len(images),
+        "is_summarized": False,
+        "created_at": datetime.now(timezone.utc),
+    }
+    audio_file_id = insert_record(caption_collection, mongo_payload)
+    response_data = {**mongo_payload, "audio_file_id": audio_file_id}
+    response_data.pop("_id", None)
+    response_data["created_at"] = response_data["created_at"].isoformat()
+    return ok("Contextual captions generated successfully.", response_data)