ImageToText

Sleeping

App Files Files Community

vidhi0405 committed 10 days ago

Commit

0ffe62a

0 Parent(s):

Reset history to image-to-text only

Browse files

Files changed (6) hide show

.gitattributes +35 -0
.gitignore +3 -0
Dockerfile +31 -0
README.md +10 -0
app.py +269 -0
requirements.txt +22 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,3 @@

+.env
+outputs/
+__pycache__/

Dockerfile ADDED Viewed

	@@ -0,0 +1,31 @@

+# Base image: slim Debian variant of CPython 3.10.
+FROM python:3.10-slim
+# Runtime defaults. HF_HOME points the Hugging Face cache at /data/.huggingface
+# (created below); CAPTION_MODEL_ID and OMP_NUM_THREADS are read by app.py.
+# NOTE(review): PRELOAD_MODELS and PORT are not read by app.py in this commit,
+# and the CMD below hard-codes port 7860 rather than using $PORT.
+ENV PYTHONDONTWRITEBYTECODE=1 \
+    PYTHONUNBUFFERED=1 \
+    PIP_NO_CACHE_DIR=1 \
+    PIP_DISABLE_PIP_VERSION_CHECK=1 \
+    OMP_NUM_THREADS=8 \
+    HF_HOME=/data/.huggingface \
+    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
+    CAPTION_MODEL_ID=vidhi0405/Qwen_I2T \
+    PRELOAD_MODELS=1 \
+    PORT=7860
+WORKDIR /app
+# System shared libraries; the apt package index is removed in the same layer.
+# Presumably needed by opencv-python from requirements.txt - verify, since
+# app.py itself does not import cv2.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Create the cache directory named by HF_HOME and make /data world-writable.
+RUN mkdir -p /data/.huggingface && chmod -R 777 /data
+# Install dependencies before copying app.py, so that editing app.py alone
+# does not invalidate the dependency layer.
+COPY requirements.txt ./
+RUN python -m pip install --upgrade pip && \
+    python -m pip install -r requirements.txt
+COPY app.py ./app.py
+EXPOSE 7860
+VOLUME ["/data"]
+# Serve the FastAPI instance `app` defined in app.py on all interfaces.
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md ADDED Viewed

	@@ -0,0 +1,10 @@

+---
+title: ImageToText
+emoji: 🦀
+colorFrom: indigo
+colorTo: blue
+sdk: docker
+pinned: false
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,269 @@

+import io
+import logging
+import os
+import re
+import threading
+# Avoid invalid OMP setting from runtime environment (e.g. empty/non-numeric).
+# This check runs before `import torch` below. If the stripped value is not
+# made up solely of digits, or is less than 1, OMP_NUM_THREADS is overwritten
+# with "8"; a valid value is left in the environment untouched.
+_omp_threads = os.getenv("OMP_NUM_THREADS", "").strip()
+if not _omp_threads.isdigit() or int(_omp_threads) < 1:
+    os.environ["OMP_NUM_THREADS"] = "8"
+import torch
+from dotenv import load_dotenv
+from fastapi import FastAPI, File, UploadFile
+from fastapi.exceptions import RequestValidationError
+from fastapi.responses import JSONResponse
+from PIL import Image, UnidentifiedImageError
+from transformers import AutoModelForImageTextToText, AutoProcessor
+# Pull settings from a .env file (if one exists) before the os.getenv reads below.
+load_dotenv()
+# Model repository to load; overridable through the CAPTION_MODEL_ID env var.
+CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "vidhi0405/Qwen_I2T")
+# Inference device: the DEVICE env var wins, otherwise CUDA when available, else CPU.
+DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
+# float16 only when DEVICE is exactly "cuda"; any other value (including
+# strings such as "cuda:0") selects float32.
+DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
+# Upper bound on the number of tokens generated per caption.
+MAX_NEW_TOKENS = 120
+# Maximum number of uploads accepted by a single /generate-caption request.
+MAX_IMAGES = 5
+# Primary instruction sent to the model together with the image.
+CAPTION_PROMPT = (
+    "Act as a professional news reporter delivering a live on-scene report in real time. "
+    "Speak naturally, as if you are addressing viewers who are watching this unfold right now. "
+    "Describe the scene in 3 to 4 complete, vivid sentences. "
+    "Mention what is happening, the surrounding environment, and the overall mood, "
+    "and convey the urgency or emotion of the moment when appropriate."
+)
+# Shorter prompt used when preprocessing with CAPTION_PROMPT fails with an
+# image-token-count mismatch (see generate_caption_text).
+CAPTION_RETRY_PROMPT = (
+    "Describe this image in 2 to 3 complete sentences. "
+    "Mention the main subject, action, environment, and mood."
+)
+# Sentence-count bounds applied by _finalize_caption when trimming model output.
+CAPTION_MIN_SENTENCES = 3
+CAPTION_MAX_SENTENCES = 4
+# Passed to the processor as max_length; note that the same call also sets
+# truncation=False.
+PROCESSOR_MAX_LENGTH = 8192
+logger = logging.getLogger(__name__)
+def ok(message: str, data):
+    return JSONResponse(
+        status_code=200,
+        content={"success": True, "message": message, "data": data},
+    )
+def fail(message: str, status_code: int = 400):
+    return JSONResponse(
+        status_code=status_code,
+        content={"success": False, "message": message, "data": None},
+    )
+class AppError(Exception):
+    def __init__(self, message: str, status_code: int = 400):
+        super().__init__(message)
+        self.message = message
+        self.status_code = status_code
+torch.set_num_threads(8)
+_caption_model = None
+_caption_processor = None
+_caption_lock = threading.Lock()
+_caption_force_cpu = False
+app = FastAPI(title="Image to Text API")
+@app.get("/")
+def root():
+    return {
+        "success": True,
+        "message": "Use POST /generate-caption with form-data key 'file' or 'files' (up to 5 images).",
+        "data": None,
+    }
+@app.get("/health")
+def health():
+    return {"success": True, "message": "ok", "data": {"caption_model_id": CAPTION_MODEL_ID}}
+@app.exception_handler(AppError)
+async def app_error_handler(_, exc: AppError):
+    return fail(exc.message, exc.status_code)
+@app.exception_handler(RequestValidationError)
+async def validation_error_handler(_, exc: RequestValidationError):
+    return fail("Invalid request payload.", 422)
+@app.exception_handler(Exception)
+async def unhandled_error_handler(_, exc: Exception):
+    logger.exception("Unhandled server error: %s", exc)
+    return fail("Internal server error.", 500)
+def _finalize_caption(raw_text: str) -> str:
+    text = " ".join(raw_text.split()).strip()
+    if not text:
+        return ""
+    sentences = re.findall(r"[^.!?]+[.!?]", text)
+    sentences = [s.strip() for s in sentences if s.strip()]
+    if len(sentences) >= CAPTION_MIN_SENTENCES:
+        return " ".join(sentences[:CAPTION_MAX_SENTENCES]).strip()
+    if text and text[-1] not in ".!?":
+        text = re.sub(r"[,:;\-]\s*[^,:;\-]*$", "", text).strip()
+    return text
+def _get_caption_runtime():
+    """Return the shared (model, processor) pair, loading it on first use.
+    The pair is cached in module globals. Loading happens while holding
+    _caption_lock, and the cache is re-checked after the lock is acquired.
+    Raises AppError (503) when either from_pretrained call or the device
+    move fails.
+    """
+    global _caption_model, _caption_processor, _caption_force_cpu
+    # Fast path: both objects already cached, no lock taken.
+    if _caption_model is not None and _caption_processor is not None:
+        return _caption_model, _caption_processor
+    with _caption_lock:
+        # Re-check under the lock before loading.
+        if _caption_model is None or _caption_processor is None:
+            # _caption_force_cpu overrides the configured DEVICE.
+            device = "cpu" if _caption_force_cpu else DEVICE
+            # CPU always uses float32, even if DTYPE was chosen for CUDA.
+            dtype = torch.float32 if device == "cpu" else DTYPE
+            try:
+                # NOTE(review): trust_remote_code=True allows code shipped in the
+                # model repository to run; confirm CAPTION_MODEL_ID is trusted.
+                loaded_model = AutoModelForImageTextToText.from_pretrained(
+                    CAPTION_MODEL_ID,
+                    trust_remote_code=True,
+                    torch_dtype=dtype,
+                    low_cpu_mem_usage=True,
+                ).to(device)
+                loaded_processor = AutoProcessor.from_pretrained(
+                    CAPTION_MODEL_ID,
+                    trust_remote_code=True,
+                )
+            except Exception as exc:
+                raise AppError("Failed to load caption model.", 503) from exc
+            loaded_model.eval()
+            # Publish to the globals only after both objects loaded successfully.
+            _caption_model = loaded_model
+            _caption_processor = loaded_processor
+    return _caption_model, _caption_processor
+def generate_caption_text(image: Image.Image) -> str:
+    runtime_model, runtime_processor = _get_caption_runtime()
+    model_device = str(next(runtime_model.parameters()).device)
+    def _build_inputs(prompt: str):
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "image"},
+                    {"type": "text", "text": prompt},
+                ],
+            }
+        ]
+        text = runtime_processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        return runtime_processor(
+            text=text,
+            images=image,
+            return_tensors="pt",
+            truncation=False,
+            max_length=PROCESSOR_MAX_LENGTH,
+        )
+    try:
+        inputs = _build_inputs(CAPTION_PROMPT)
+    except Exception as exc:
+        if "Mismatch in `image` token count" not in str(exc):
+            raise AppError("Failed to preprocess image for captioning.", 422) from exc
+        inputs = _build_inputs(CAPTION_RETRY_PROMPT)
+    inputs = {k: v.to(model_device) for k, v in inputs.items()}
+    try:
+        with torch.no_grad():
+            outputs = runtime_model.generate(
+                **inputs,
+                max_new_tokens=MAX_NEW_TOKENS,
+                do_sample=False,
+                num_beams=1,
+            )
+    except Exception as exc:
+        raise AppError("Caption generation failed.", 500) from exc
+    decoded = runtime_processor.decode(outputs[0], skip_special_tokens=True).strip()
+    caption = decoded.split("assistant")[-1].lstrip(":\n ").strip()
+    return _finalize_caption(caption)
+def generate_caption_text_safe(image: Image.Image) -> str:
+    global _caption_model, _caption_processor, _caption_force_cpu
+    try:
+        return generate_caption_text(image)
+    except Exception as exc:
+        msg = str(exc)
+        if "CUDA error" not in msg and "device-side assert" not in msg:
+            raise
+        with _caption_lock:
+            _caption_force_cpu = True
+            _caption_model = None
+            _caption_processor = None
+        if torch.cuda.is_available():
+            try:
+                torch.cuda.empty_cache()
+            except Exception:
+                pass
+        return generate_caption_text(image)
+@app.post("/generate-caption")
+async def generate_caption(
+    file: UploadFile | None = File(default=None),
+    files: list[UploadFile] | None = File(default=None),
+):
+    uploads = []
+    if files:
+        uploads.extend(files)
+    if file is not None:
+        uploads.append(file)
+    if not uploads:
+        raise AppError("At least one image is required.", 400)
+    if len(uploads) > MAX_IMAGES:
+        raise AppError("You can upload a maximum of 5 images.", 400)
+    image_captions = []
+    for upload in uploads:
+        if upload.content_type and not upload.content_type.startswith("image/"):
+            raise AppError("All uploaded files must be images.", 400)
+        file_bytes = await upload.read()
+        if not file_bytes:
+            raise AppError("One of the uploaded images is empty.", 400)
+        try:
+            image = Image.open(io.BytesIO(file_bytes)).convert("RGB")
+        except UnidentifiedImageError as exc:
+            raise AppError("One of the uploaded files is not a valid image.", 400) from exc
+        except OSError as exc:
+            raise AppError("Unable to read one of the uploaded images.", 400) from exc
+        caption = generate_caption_text_safe(image)
+        if not caption:
+            raise AppError("Caption generation produced empty text.", 500)
+        image_captions.append({"filename": upload.filename, "caption": caption})
+    return ok(
+        "Caption generated successfully.",
+        {
+            "caption": image_captions[0]["caption"] if len(image_captions) == 1 else None,
+            "individual_captions": image_captions,
+            "images_count": len(image_captions),
+        },
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,22 @@

+fastapi==0.110.0
+uvicorn[standard]==0.29.0
+python-multipart==0.0.9
+torch==2.5.1
+torchvision==0.20.1
+transformers==4.55.2
+accelerate==0.30.1
+timm==0.9.16
+einops==0.7.0
+qwen-vl-utils==0.0.8
+huggingface-hub==0.34.1
+sentencepiece==0.1.99
+tiktoken==0.7.0
+protobuf==4.25.3
+pillow==10.3.0
+numpy==1.26.4
+safetensors==0.4.3
+opencv-python==4.9.0.80
+tqdm==4.66.0
+requests==2.31.0
+python-dotenv==1.0.1