Spaces:
Sleeping
Sleeping
only for Image to Text
Browse files
app.py
CHANGED
|
@@ -229,7 +229,7 @@ def _get_summarizer_runtime():
|
|
| 229 |
if _summarizer_model is None or _summarizer_tokenizer is None:
|
| 230 |
try:
|
| 231 |
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_ID)
|
| 232 |
-
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_ID)
|
| 233 |
except Exception as exc:
|
| 234 |
raise AppError("Failed to load summarization model.", 503) from exc
|
| 235 |
model.eval()
|
|
@@ -257,11 +257,12 @@ def summarize_captions(captions: list[str]) -> str:
|
|
| 257 |
truncation=True,
|
| 258 |
return_tensors="pt",
|
| 259 |
)
|
|
|
|
| 260 |
with torch.no_grad():
|
| 261 |
output_ids = model.generate(
|
| 262 |
**inputs,
|
| 263 |
max_length=150,
|
| 264 |
-
min_length=
|
| 265 |
length_penalty=2.0,
|
| 266 |
num_beams=4,
|
| 267 |
early_stopping=True,
|
|
@@ -407,25 +408,19 @@ async def generate_caption(request: Request):
|
|
| 407 |
if not caption:
|
| 408 |
raise AppError("Caption summarization produced empty text.", 500)
|
| 409 |
|
| 410 |
-
|
| 411 |
-
caption_collection,
|
| 412 |
-
{
|
| 413 |
-
"caption": caption,
|
| 414 |
-
"source_filenames": [item["filename"] for item in image_captions],
|
| 415 |
-
"image_captions": image_captions,
|
| 416 |
-
"images_count": len(image_captions),
|
| 417 |
-
"is_summarized": len(image_captions) > 1,
|
| 418 |
-
"created_at": datetime.now(timezone.utc),
|
| 419 |
-
},
|
| 420 |
-
)
|
| 421 |
-
|
| 422 |
-
response_data = {
|
| 423 |
-
"audio_file_id": audio_file_id,
|
| 424 |
"caption": caption,
|
|
|
|
|
|
|
| 425 |
"images_count": len(image_captions),
|
|
|
|
|
|
|
| 426 |
}
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
|
|
|
| 430 |
|
| 431 |
return ok("Caption generated successfully.", response_data)
|
|
|
|
| 229 |
if _summarizer_model is None or _summarizer_tokenizer is None:
|
| 230 |
try:
|
| 231 |
tokenizer = AutoTokenizer.from_pretrained(SUMMARIZER_MODEL_ID)
|
| 232 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(SUMMARIZER_MODEL_ID, torch_dtype=DTYPE).to(DEVICE)
|
| 233 |
except Exception as exc:
|
| 234 |
raise AppError("Failed to load summarization model.", 503) from exc
|
| 235 |
model.eval()
|
|
|
|
| 257 |
truncation=True,
|
| 258 |
return_tensors="pt",
|
| 259 |
)
|
| 260 |
+
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
|
| 261 |
with torch.no_grad():
|
| 262 |
output_ids = model.generate(
|
| 263 |
**inputs,
|
| 264 |
max_length=150,
|
| 265 |
+
min_length=20,
|
| 266 |
length_penalty=2.0,
|
| 267 |
num_beams=4,
|
| 268 |
early_stopping=True,
|
|
|
|
| 408 |
if not caption:
|
| 409 |
raise AppError("Caption summarization produced empty text.", 500)
|
| 410 |
|
| 411 |
+
mongo_payload = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
"caption": caption,
|
| 413 |
+
"source_filenames": [item["filename"] for item in image_captions],
|
| 414 |
+
"image_captions": image_captions,
|
| 415 |
"images_count": len(image_captions),
|
| 416 |
+
"is_summarized": len(image_captions) > 1,
|
| 417 |
+
"created_at": datetime.now(timezone.utc),
|
| 418 |
}
|
| 419 |
+
|
| 420 |
+
audio_file_id = insert_record(caption_collection, mongo_payload)
|
| 421 |
+
|
| 422 |
+
response_data = {**mongo_payload, "audio_file_id": audio_file_id}
|
| 423 |
+
response_data.pop("_id", None) # Remove ObjectId as it is not JSON serializable
|
| 424 |
+
response_data["created_at"] = response_data["created_at"].isoformat()
|
| 425 |
|
| 426 |
return ok("Caption generated successfully.", response_data)
|