vidhi0405 committed on
Commit
0ffe62a
·
0 Parent(s):

Reset history to image-to-text only

Browse files
Files changed (6) hide show
  1. .gitattributes +35 -0
  2. .gitignore +3 -0
  3. Dockerfile +31 -0
  4. README.md +10 -0
  5. app.py +269 -0
  6. requirements.txt +22 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ .env
2
+ outputs/
3
+ __pycache__/
Dockerfile ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Slim CPU-only Python base; torch CPU wheels come from requirements.txt.
FROM python:3.10-slim

# PYTHONDONTWRITEBYTECODE/PYTHONUNBUFFERED: container-friendly Python defaults.
# HF_HOME points the Hugging Face cache at the persistent /data volume.
# CAPTION_MODEL_ID / PORT are read by app.py and the CMD below.
ENV PYTHONDONTWRITEBYTECODE=1 \
    PYTHONUNBUFFERED=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    OMP_NUM_THREADS=8 \
    HF_HOME=/data/.huggingface \
    HF_HUB_DISABLE_SYMLINKS_WARNING=1 \
    CAPTION_MODEL_ID=vidhi0405/Qwen_I2T \
    PRELOAD_MODELS=1 \
    PORT=7860

WORKDIR /app

# libgl1/libglib2.0-0 are runtime deps of opencv-python; clean apt lists to
# keep the layer small.
RUN apt-get update && apt-get install -y --no-install-recommends \
    libgl1 \
    libglib2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# World-writable cache dir so the Space's non-root runtime user can download
# models into the mounted volume.
RUN mkdir -p /data/.huggingface && chmod -R 777 /data

# Install dependencies before copying app code so the pip layer caches.
COPY requirements.txt ./
RUN python -m pip install --upgrade pip && \
    python -m pip install -r requirements.txt

COPY app.py ./app.py

# 7860 is the standard Hugging Face Spaces port (matches PORT above).
EXPOSE 7860
VOLUME ["/data"]
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
README.md ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: ImageToText
3
+ emoji: 🦀
4
+ colorFrom: indigo
5
+ colorTo: blue
6
+ sdk: docker
7
+ pinned: false
8
+ ---
9
+
10
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,269 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import io
2
+ import logging
3
+ import os
4
+ import re
5
+ import threading
6
+
7
# Avoid invalid OMP setting from runtime environment (e.g. empty/non-numeric).
# OpenMP rejects non-positive values, so normalize anything unusable to a sane
# default. Must run before `import torch` below, which reads this variable.
_omp_threads = os.getenv("OMP_NUM_THREADS", "").strip()
if not _omp_threads.isdigit() or int(_omp_threads) < 1:
    # isdigit() is checked first, so int() can never raise here.
    os.environ["OMP_NUM_THREADS"] = "8"
11
+
12
+ import torch
13
+ from dotenv import load_dotenv
14
+ from fastapi import FastAPI, File, UploadFile
15
+ from fastapi.exceptions import RequestValidationError
16
+ from fastapi.responses import JSONResponse
17
+ from PIL import Image, UnidentifiedImageError
18
+ from transformers import AutoModelForImageTextToText, AutoProcessor
19
+
20
+
21
# Pull optional overrides from a local .env file (no-op when absent).
load_dotenv()

# Caption model repo; overridable via env for local experimentation.
CAPTION_MODEL_ID = os.getenv("CAPTION_MODEL_ID", "vidhi0405/Qwen_I2T")
# Prefer CUDA when available unless DEVICE is forced via env.
DEVICE = os.getenv("DEVICE", "cuda" if torch.cuda.is_available() else "cpu")
# fp16 only on GPU; CPU inference stays in fp32.
DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
MAX_NEW_TOKENS = 120  # generation budget per caption
MAX_IMAGES = 5  # per-request upload cap enforced by /generate-caption

# Primary prompt: asks for a 3-4 sentence "live news report" style caption.
CAPTION_PROMPT = (
    "Act as a professional news reporter delivering a live on-scene report in real time. "
    "Speak naturally, as if you are addressing viewers who are watching this unfold right now. "
    "Describe the scene in 3 to 4 complete, vivid sentences. "
    "Mention what is happening, the surrounding environment, and the overall mood, "
    "and convey the urgency or emotion of the moment when appropriate."
)
# Shorter fallback prompt used when the primary prompt fails preprocessing
# (see generate_caption_text).
CAPTION_RETRY_PROMPT = (
    "Describe this image in 2 to 3 complete sentences. "
    "Mention the main subject, action, environment, and mood."
)
# Sentence-count window applied by _finalize_caption.
CAPTION_MIN_SENTENCES = 3
CAPTION_MAX_SENTENCES = 4
PROCESSOR_MAX_LENGTH = 8192  # forwarded to the processor as max_length

logger = logging.getLogger(__name__)
45
+
46
+
47
def ok(message: str, data):
    """Wrap *data* in the standard success envelope as a 200 JSON response."""
    payload = {"success": True, "message": message, "data": data}
    return JSONResponse(status_code=200, content=payload)
52
+
53
+
54
def fail(message: str, status_code: int = 400):
    """Build the standard failure envelope (data is always None) as JSON."""
    payload = {"success": False, "message": message, "data": None}
    return JSONResponse(status_code=status_code, content=payload)
59
+
60
+
61
class AppError(Exception):
    """Domain error that carries an HTTP status code for the response envelope."""

    def __init__(self, message: str, status_code: int = 400):
        self.message = message
        self.status_code = status_code
        super().__init__(message)
66
+
67
+
68
# Cap intra-op CPU threads; mirrors the OMP_NUM_THREADS default of 8 above.
torch.set_num_threads(8)
# Lazily-initialized caption runtime, guarded by _caption_lock
# (see _get_caption_runtime).
_caption_model = None
_caption_processor = None
_caption_lock = threading.Lock()
# Flipped to True after a fatal CUDA error so the model reloads on CPU
# (see generate_caption_text_safe).
_caption_force_cpu = False

app = FastAPI(title="Image to Text API")
75
+
76
+
77
@app.get("/")
def root():
    """Landing route: point callers at the captioning endpoint."""
    usage = "Use POST /generate-caption with form-data key 'file' or 'files' (up to 5 images)."
    return {"success": True, "message": usage, "data": None}
84
+
85
+
86
@app.get("/health")
def health():
    """Liveness probe; also reports which caption model is configured."""
    info = {"caption_model_id": CAPTION_MODEL_ID}
    return {"success": True, "message": "ok", "data": info}
89
+
90
+
91
@app.exception_handler(AppError)
async def app_error_handler(_, exc: AppError):
    """Translate a raised AppError into the shared failure envelope."""
    message, status = exc.message, exc.status_code
    return fail(message, status)
94
+
95
+
96
@app.exception_handler(RequestValidationError)
async def validation_error_handler(_, exc: RequestValidationError):
    """Map FastAPI request-validation failures onto the shared error envelope.

    The validation details in *exc* are intentionally not exposed to clients.
    """
    return fail("Invalid request payload.", 422)
99
+
100
+
101
@app.exception_handler(Exception)
async def unhandled_error_handler(_, exc: Exception):
    """Last-resort handler: log the full traceback, hide details from clients."""
    logger.exception("Unhandled server error: %s", exc)
    response = fail("Internal server error.", 500)
    return response
105
+
106
+
107
def _finalize_caption(raw_text: str) -> str:
    """Normalize whitespace and trim model output to a clean caption.

    Keeps up to CAPTION_MAX_SENTENCES complete sentences when at least
    CAPTION_MIN_SENTENCES are present; otherwise returns the text as-is,
    dropping any dangling clause after a trailing delimiter.
    """
    collapsed = " ".join(raw_text.split()).strip()
    if not collapsed:
        return ""

    # Complete sentences are runs ending in terminal punctuation.
    complete = [
        chunk.strip()
        for chunk in re.findall(r"[^.!?]+[.!?]", collapsed)
        if chunk.strip()
    ]
    if len(complete) >= CAPTION_MIN_SENTENCES:
        return " ".join(complete[:CAPTION_MAX_SENTENCES]).strip()

    # Too few full sentences: if the text trails off mid-clause, cut the
    # fragment after the last clause delimiter so the caption ends cleanly.
    if collapsed[-1] not in ".!?":
        collapsed = re.sub(r"[,:;\-]\s*[^,:;\-]*$", "", collapsed).strip()
    return collapsed
121
+
122
+
123
def _get_caption_runtime():
    """Return the (model, processor) pair, loading them lazily on first use.

    Uses double-checked locking: an unlocked fast path when both are already
    loaded, then a locked re-check before loading so concurrent requests only
    load once. Honors _caption_force_cpu set by generate_caption_text_safe.

    Raises:
        AppError: 503 when the model or processor fails to load.
    """
    global _caption_model, _caption_processor, _caption_force_cpu
    # Fast path: already initialized, no lock needed.
    if _caption_model is not None and _caption_processor is not None:
        return _caption_model, _caption_processor

    with _caption_lock:
        # Re-check under the lock: another thread may have loaded meanwhile.
        if _caption_model is None or _caption_processor is None:
            device = "cpu" if _caption_force_cpu else DEVICE
            # CPU inference always runs in fp32 regardless of DTYPE.
            dtype = torch.float32 if device == "cpu" else DTYPE
            try:
                loaded_model = AutoModelForImageTextToText.from_pretrained(
                    CAPTION_MODEL_ID,
                    trust_remote_code=True,
                    torch_dtype=dtype,
                    low_cpu_mem_usage=True,
                ).to(device)
                loaded_processor = AutoProcessor.from_pretrained(
                    CAPTION_MODEL_ID,
                    trust_remote_code=True,
                )
            except Exception as exc:
                raise AppError("Failed to load caption model.", 503) from exc
            loaded_model.eval()  # inference mode (disables dropout etc.)
            # Publish only after both loaded successfully, so a half-loaded
            # state is never observed by the fast path.
            _caption_model = loaded_model
            _caption_processor = loaded_processor

    return _caption_model, _caption_processor
150
+
151
+
152
def generate_caption_text(image: Image.Image) -> str:
    """Generate a finalized caption for *image* with the lazily-loaded model.

    Builds chat-template inputs with the primary prompt; if preprocessing
    fails specifically with an image-token-count mismatch, retries once with
    the shorter fallback prompt. Decoding is greedy (no sampling, 1 beam).

    Raises:
        AppError: 422 when image preprocessing fails, 500 when generation
            fails, 503 (propagated) when the model cannot be loaded.
    """
    runtime_model, runtime_processor = _get_caption_runtime()
    # The model may live on CPU or CUDA; move inputs to wherever it is.
    model_device = str(next(runtime_model.parameters()).device)

    def _build_inputs(prompt: str):
        # Single-turn chat message: one image placeholder plus the text prompt.
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "image"},
                    {"type": "text", "text": prompt},
                ],
            }
        ]
        text = runtime_processor.apply_chat_template(
            messages, tokenize=False, add_generation_prompt=True
        )
        return runtime_processor(
            text=text,
            images=image,
            return_tensors="pt",
            truncation=False,
            max_length=PROCESSOR_MAX_LENGTH,
        )

    try:
        inputs = _build_inputs(CAPTION_PROMPT)
    except Exception as exc:
        # Only an image-token-count mismatch gets the retry prompt; anything
        # else is surfaced to the caller as a preprocessing failure.
        if "Mismatch in `image` token count" not in str(exc):
            raise AppError("Failed to preprocess image for captioning.", 422) from exc
        inputs = _build_inputs(CAPTION_RETRY_PROMPT)

    inputs = {k: v.to(model_device) for k, v in inputs.items()}

    try:
        with torch.no_grad():
            # Deterministic greedy decoding.
            outputs = runtime_model.generate(
                **inputs,
                max_new_tokens=MAX_NEW_TOKENS,
                do_sample=False,
                num_beams=1,
            )
        # NOTE(review): outputs[0] appears to include the prompt tokens; the
        # "assistant" split below strips the prompt text — confirm against the
        # model's chat template.
    except Exception as exc:
        raise AppError("Caption generation failed.", 500) from exc

    decoded = runtime_processor.decode(outputs[0], skip_special_tokens=True).strip()
    # Keep only the text after the final "assistant" marker, dropping any
    # leading colon/newline/space left by the template.
    caption = decoded.split("assistant")[-1].lstrip(":\n ").strip()
    return _finalize_caption(caption)
200
+
201
+
202
def generate_caption_text_safe(image: Image.Image) -> str:
    """Caption *image*, falling back to a CPU reload after a fatal CUDA error.

    Non-CUDA exceptions propagate unchanged. On a CUDA failure (detected by
    substring-matching the exception text), the cached model/processor are
    dropped under the lock, _caption_force_cpu is set so the next load uses
    CPU, and the caption is attempted once more.
    """
    global _caption_model, _caption_processor, _caption_force_cpu
    try:
        return generate_caption_text(image)
    except Exception as exc:
        msg = str(exc)
        # Heuristic: only retry on messages that look like CUDA runtime
        # failures; everything else is re-raised to the endpoint handlers.
        if "CUDA error" not in msg and "device-side assert" not in msg:
            raise

    # Invalidate the cached runtime so _get_caption_runtime reloads on CPU.
    with _caption_lock:
        _caption_force_cpu = True
        _caption_model = None
        _caption_processor = None

    if torch.cuda.is_available():
        try:
            # Best-effort: free GPU memory held by the discarded model.
            torch.cuda.empty_cache()
        except Exception:
            pass

    # Second (and final) attempt, now on CPU; failures here propagate.
    return generate_caption_text(image)
223
+
224
+
225
@app.post("/generate-caption")
async def generate_caption(
    file: UploadFile | None = File(default=None),
    files: list[UploadFile] | None = File(default=None),
):
    """Caption up to MAX_IMAGES uploaded images.

    Accepts form-data key 'file' (single image), 'files' (multiple), or both
    combined. Each upload is validated, decoded to RGB, and captioned; for a
    single image the caption is also surfaced at data.caption.

    Raises:
        AppError: 400 for missing/empty/non-image/over-limit uploads; 422/500/503
            propagated from the captioning pipeline.
    """
    uploads: list[UploadFile] = []
    if files:
        uploads.extend(files)
    if file is not None:
        uploads.append(file)
    if not uploads:
        raise AppError("At least one image is required.", 400)
    if len(uploads) > MAX_IMAGES:
        # Derive the message from MAX_IMAGES so it cannot drift from the
        # enforced limit (the original hard-coded "5").
        raise AppError(f"You can upload a maximum of {MAX_IMAGES} images.", 400)

    image_captions = []
    for upload in uploads:
        # Some clients omit content_type entirely; only reject a type that is
        # present and clearly not an image.
        if upload.content_type and not upload.content_type.startswith("image/"):
            raise AppError("All uploaded files must be images.", 400)

        file_bytes = await upload.read()
        if not file_bytes:
            raise AppError("One of the uploaded images is empty.", 400)

        try:
            # convert("RGB") normalizes palettes/alpha for the processor.
            image = Image.open(io.BytesIO(file_bytes)).convert("RGB")
        except UnidentifiedImageError as exc:
            raise AppError("One of the uploaded files is not a valid image.", 400) from exc
        except OSError as exc:
            # Truncated/corrupt image data surfaces as OSError from Pillow.
            raise AppError("Unable to read one of the uploaded images.", 400) from exc

        caption = generate_caption_text_safe(image)
        if not caption:
            raise AppError("Caption generation produced empty text.", 500)

        image_captions.append({"filename": upload.filename, "caption": caption})

    return ok(
        "Caption generated successfully.",
        {
            # Convenience field for the common single-image case.
            "caption": image_captions[0]["caption"] if len(image_captions) == 1 else None,
            "individual_captions": image_captions,
            "images_count": len(image_captions),
        },
    )
requirements.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ fastapi==0.110.0
2
+ uvicorn[standard]==0.29.0
3
+ python-multipart==0.0.9
4
+
5
+ torch==2.5.1
6
+ torchvision==0.20.1
7
+ transformers==4.55.2
8
+ accelerate==0.30.1
9
+ timm==0.9.16
10
+ einops==0.7.0
11
+ qwen-vl-utils==0.0.8
12
+ huggingface-hub==0.34.1
13
+ sentencepiece==0.1.99
14
+ tiktoken==0.7.0
15
+ protobuf==4.25.3
16
+ pillow==10.3.0
17
+ numpy==1.26.4
18
+ safetensors==0.4.3
19
+ opencv-python==4.9.0.80
20
+ tqdm==4.66.0
21
+ requests==2.31.0
22
+ python-dotenv==1.0.1