ImageToText

Sleeping

App Files Files Community

vidhi0405 committed 10 days ago

Commit

7014644

1 Parent(s): 0ffe62a

only for Image to Text

Browse files

Files changed (2) hide show

app.py +158 -6
requirements.txt +1 -0

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ import logging
 import os
 import re
 import threading
 # Avoid invalid OMP setting from runtime environment (e.g. empty/non-numeric).
 _omp_threads = os.getenv("OMP_NUM_THREADS", "").strip()
@@ -15,16 +16,26 @@ from fastapi import FastAPI, File, UploadFile
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from PIL import Image, UnidentifiedImageError
-from transformers import AutoModelForImageTextToText, AutoProcessor
 load_dotenv()
 CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "vidhi0405/Qwen_I2T")
 DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 MAX_NEW_TOKENS = 120
 MAX_IMAGES = 5
 CAPTION_PROMPT = (
     "Act as a professional news reporter delivering a live on-scene report in real time. "
@@ -70,9 +81,30 @@ _caption_model = None
 _caption_processor = None
 _caption_lock = threading.Lock()
 _caption_force_cpu = False
 app = FastAPI(title="Image to Text API")
 @app.get("/")
 def root():
@@ -85,7 +117,40 @@ def root():
 @app.get("/health")
 def health():
-    return {"success": True, "message": "ok", "data": {"caption_model_id": CAPTION_MODEL_ID}}
 @app.exception_handler(AppError)
@@ -104,6 +169,11 @@ async def unhandled_error_handler(_, exc: Exception):
     return fail("Internal server error.", 500)
 def _finalize_caption(raw_text: str) -> str:
     text = " ".join(raw_text.split()).strip()
     if not text:
@@ -149,6 +219,59 @@ def _get_caption_runtime():
     return _caption_model, _caption_processor
 def generate_caption_text(image: Image.Image) -> str:
     runtime_model, runtime_processor = _get_caption_runtime()
     model_device = str(next(runtime_model.parameters()).device)
@@ -222,11 +345,21 @@ def generate_caption_text_safe(image: Image.Image) -> str:
         return generate_caption_text(image)
 @app.post("/generate-caption")
 async def generate_caption(
     file: UploadFile | None = File(default=None),
     files: list[UploadFile] | None = File(default=None),
 ):
     uploads = []
     if files:
         uploads.extend(files)
@@ -259,11 +392,30 @@ async def generate_caption(
         image_captions.append({"filename": upload.filename, "caption": caption})
-    return ok(
-        "Caption generated successfully.",
         {
-            "caption": image_captions[0]["caption"] if len(image_captions) == 1 else None,
-            "individual_captions": image_captions,
             "images_count": len(image_captions),
         },
     )

 import os
 import re
 import threading
+from datetime import datetime, timezone
 # Avoid invalid OMP setting from runtime environment (e.g. empty/non-numeric).
 _omp_threads = os.getenv("OMP_NUM_THREADS", "").strip()
 from fastapi.exceptions import RequestValidationError
 from fastapi.responses import JSONResponse
 from PIL import Image, UnidentifiedImageError
+from pymongo import MongoClient
+from pymongo.errors import PyMongoError, ServerSelectionTimeoutError
+from transformers import (
+    AutoModelForImageTextToText,
+    AutoModelForSeq2SeqLM,
+    AutoProcessor,
+    AutoTokenizer,
+)
 load_dotenv()
 CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "vidhi0405/Qwen_I2T")
+SUMMARIZER_MODEL_ID = os.getenv("SUMMARIZER_MODEL_ID", "facebook/bart-large-cnn")
 DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
 DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
 MAX_NEW_TOKENS = 120
 MAX_IMAGES = 5
+MONGO_URI = (os.getenv("MONGO_URI") or os.getenv("MONGODB_URI") or "").strip().strip('"').strip("'")
+MONGO_DB_NAME = os.getenv("MONGO_DB_NAME", "image_to_speech")
 CAPTION_PROMPT = (
     "Act as a professional news reporter delivering a live on-scene report in real time. "
 _caption_processor = None
 _caption_lock = threading.Lock()
 _caption_force_cpu = False
+_summarizer_model = None
+_summarizer_tokenizer = None
+_summarizer_lock = threading.Lock()
 app = FastAPI(title="Image to Text API")
+mongo_client = None
+mongo_db = None
+caption_collection = None
+db_init_error = None
+if not MONGO_URI:
+    db_init_error = "MONGO_URI (or MONGODB_URI) is not set."
+else:
+    try:
+        mongo_client = MongoClient(MONGO_URI, serverSelectionTimeoutMS=5000)
+        mongo_client.admin.command("ping")
+        mongo_db = mongo_client[MONGO_DB_NAME]
+        caption_collection = mongo_db["captions"]
+    except ServerSelectionTimeoutError:
+        db_init_error = "Unable to connect to MongoDB (timeout)."
+    except PyMongoError as exc:
+        db_init_error = "Unable to initialize MongoDB: {}".format(exc)
 @app.get("/")
 def root():
 @app.get("/health")
 def health():
+    if db_init_error:
+        return {
+            "success": False,
+            "message": db_init_error,
+            "data": {
+                "caption_model_id": CAPTION_MODEL_ID,
+                "summarizer_model_id": SUMMARIZER_MODEL_ID,
+            },
+        }
+    return {
+        "success": True,
+        "message": "ok",
+        "data": {
+            "caption_model_id": CAPTION_MODEL_ID,
+            "summarizer_model_id": SUMMARIZER_MODEL_ID,
+        },
+    }
# NOTE: FastAPI's documentation recommends lifespan handlers over on_event;
# the decorator is kept because the FastAPI() construction is outside this block.
@app.on_event("startup")
async def preload_runtime_models():
    """Warm up the caption and summarizer models when the app starts.

    Preloading is skipped when the ``PRELOAD_MODELS`` environment variable is
    set to ``0``, ``false`` or ``no`` (case-insensitive). A failure to load
    either model is logged and swallowed so the application can still start;
    the models are then loaded lazily on first use.
    """
    if os.getenv("PRELOAD_MODELS", "1").strip().lower() in {"0", "false", "no"}:
        logger.info("Model preloading disabled via PRELOAD_MODELS.")
        return

    loaders = (
        ("Caption", _get_caption_runtime),
        ("Summarizer", _get_summarizer_runtime),
    )
    for label, loader in loaders:
        try:
            loader()
        except Exception as exc:
            # exc_info keeps the traceback and the chained cause: the loaders
            # may wrap the real error in a generic AppError, so the message
            # alone does not say why the load failed.
            logger.warning(
                "%s model preload failed: %s", label, exc, exc_info=True
            )
        else:
            logger.info("%s model preloaded successfully.", label)
 @app.exception_handler(AppError)
     return fail("Internal server error.", 500)
def _ensure_db_ready():
    """Raise ``AppError`` (503) if MongoDB initialisation recorded an error.

    Returns None when ``db_init_error`` is empty or unset.
    """
    if not db_init_error:
        return
    raise AppError(db_init_error, 503)
 def _finalize_caption(raw_text: str) -> str:
     text = " ".join(raw_text.split()).strip()
     if not text:
     return _caption_model, _caption_processor
def _get_summarizer_runtime():
    """Return the cached ``(model, tokenizer)`` pair for summarization.

    The pair is loaded from ``SUMMARIZER_MODEL_ID`` on first use and stored in
    module globals. Loading happens under ``_summarizer_lock`` and the cache is
    re-checked after the lock is acquired.

    Raises:
        AppError: with status 503 when the tokenizer or model cannot be loaded.
    """
    global _summarizer_model, _summarizer_tokenizer

    def _is_loaded():
        return _summarizer_model is not None and _summarizer_tokenizer is not None

    if _is_loaded():
        return _summarizer_model, _summarizer_tokenizer

    with _summarizer_lock:
        if not _is_loaded():
            try:
                loaded_tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_ID)
                loaded_model = AutoModelForSeq2SeqLM.from_pretrained(
                    SUMMARIZER_MODEL_ID
                )
            except Exception as exc:
                raise AppError("Failed to load summarization model.", 503) from exc
            # Switch to inference mode before publishing the model.
            loaded_model.eval()
            _summarizer_tokenizer = loaded_tokenizer
            _summarizer_model = loaded_model

    return _summarizer_model, _summarizer_tokenizer
def summarize_captions(captions: list[str]) -> str:
    """Collapse several per-image captions into one summary string.

    Blank and whitespace-only captions are discarded before anything else, so
    the summarizer model is only loaded and run when at least two captions
    with real text remain.

    Args:
        captions: Caption texts, one per image. May be empty.

    Returns:
        ``""`` when no caption has text, the single remaining caption
        (stripped) when only one has text, otherwise the generated summary
        passed through ``_finalize_caption``.

    Raises:
        AppError: with status 500 when tokenization, generation or decoding
            fails; loading the summarizer may raise ``AppError`` with 503.
    """
    cleaned = [c.strip() for c in (captions or []) if c and c.strip()]
    if not cleaned:
        return ""
    if len(cleaned) == 1:
        # Nothing to summarize; also avoids forcing min_length tokens out of
        # a single short caption.
        return cleaned[0]

    model, tokenizer = _get_summarizer_runtime()
    combined = " ".join(cleaned)
    try:
        inputs = tokenizer(
            combined,
            max_length=1024,
            truncation=True,
            return_tensors="pt",
        )
        with torch.no_grad():
            output_ids = model.generate(
                **inputs,
                max_length=150,
                min_length=40,
                length_penalty=2.0,
                num_beams=4,
                early_stopping=True,
            )
        summary = tokenizer.decode(output_ids[0], skip_special_tokens=True).strip()
    except Exception as exc:
        raise AppError("Failed to summarize captions.", 500) from exc
    return _finalize_caption(summary)
 def generate_caption_text(image: Image.Image) -> str:
     runtime_model, runtime_processor = _get_caption_runtime()
     model_device = str(next(runtime_model.parameters()).device)
         return generate_caption_text(image)
def insert_record(collection, payload: dict) -> str:
    """Insert ``payload`` into ``collection`` and return the new id as a string.

    Raises:
        AppError: with status 503 when the insert raises ``PyMongoError``.
    """
    try:
        outcome = collection.insert_one(payload)
    except PyMongoError as exc:
        raise AppError("MongoDB insert failed.", 503) from exc
    return str(outcome.inserted_id)
 @app.post("/generate-caption")
 async def generate_caption(
     file: UploadFile | None = File(default=None),
     files: list[UploadFile] | None = File(default=None),
 ):
+    _ensure_db_ready()
     uploads = []
     if files:
         uploads.extend(files)
         image_captions.append({"filename": upload.filename, "caption": caption})
+    caption_texts = [x["caption"] for x in image_captions]
+    caption = summarize_captions(caption_texts)
+    if not caption:
+        raise AppError("Caption summarization produced empty text.", 500)
+    audio_file_id = insert_record(
+        caption_collection,
         {
+            "caption": caption,
+            "source_filenames": [item["filename"] for item in image_captions],
+            "image_captions": image_captions,
             "images_count": len(image_captions),
+            "is_summarized": len(image_captions) > 1,
+            "created_at": datetime.now(timezone.utc),
         },
     )
+    response_data = {
+        "audio_file_id": audio_file_id,
+        "caption": caption,
+        "images_count": len(image_captions),
+    }
+    if len(image_captions) > 1:
+        response_data["individual_captions"] = image_captions
+        response_data["summarized_caption"] = caption
+    return ok("Caption generated successfully.", response_data)

requirements.txt CHANGED Viewed

@@ -20,3 +20,4 @@ opencv-python==4.9.0.80
 tqdm==4.66.0
 requests==2.31.0
 python-dotenv==1.0.1

 tqdm==4.66.0
 requests==2.31.0
 python-dotenv==1.0.1
+pymongo[srv]==4.8.0