Spaces:

saad003
/

rad-retrieval-api

Sleeping

App Files Files Community

saad003 committed on Dec 9, 2025

Commit

81e97e9

verified ·

1 Parent(s): 63a5265

Update app.py

Browse files

Files changed (1) hide show

app.py +619 -256

app.py CHANGED Viewed

@@ -1,8 +1,9 @@
 # app.py
 import io
 import os
-import re
 import random
 import faiss
 import torch
@@ -14,26 +15,38 @@ from fastapi.middleware.cors import CORSMiddleware
 from fastapi.responses import JSONResponse
 from huggingface_hub import hf_hub_download
-from transformers import CLIPProcessor, CLIPModel
-from transformers import BlipForConditionalGeneration, AutoProcessor
 # ---------- FastAPI app ----------
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
-    allow_origins=["*"],  # later you can restrict to your frontend domain
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 # ---------- Config ----------
-EMBED_REPO_ID = "saad003/Red01"       # FAISS + metadata
-IMAGE_REPO_ID = "saad003/images04"    # test, valid, train01..train07
 BASE_IMAGE_URL = f"https://huggingface.co/datasets/{IMAGE_REPO_ID}/resolve/main"
-HF_TOKEN = os.environ.get("HF_TOKEN")  # set in HF Space secrets if private
 # ---------- Download index + metadata ----------
 print("Downloading FAISS index & metadata from Hugging Face...")
@@ -58,350 +71,700 @@ index = faiss.read_index(INDEX_PATH)
 print("Loading metadata CSV...")
 metadata = pd.read_csv(META_PATH)
-required_cols = {"vec_index", "ID", "caption", "concepts_manual"}
-missing = required_cols - set(metadata.columns)
-if missing:
-    raise ValueError(f"radiology_metadata.csv is missing columns: {missing}")
 assert index.ntotal == len(metadata), "Index size and metadata rows mismatch!"
 # ---------- Load CLIP (retrieval) ----------
 print("Loading PubMedCLIP model for retrieval...")
 CLIP_MODEL_NAME = "flaviagiammarino/pubmed-clip-vit-base-patch32"
-device = "cuda" if torch.cuda.is_available() else "cpu"
-print("Using device:", device)
 clip_model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).to(device)
 clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
 clip_model.eval()
-# ---------- Load BLIP (captioning) ----------
-print("Loading BLIP radiology captioning model...")
-CAPTION_MODEL_ID = "WafaaFraih/blip-roco-radiology-captioning"
-caption_processor = AutoProcessor.from_pretrained(CAPTION_MODEL_ID)
-caption_model = BlipForConditionalGeneration.from_pretrained(
-    CAPTION_MODEL_ID
 ).to(device)
 caption_model.eval()
 print("Backend ready ✅")
-# ---------- Helpers for dataset path ----------
-def train_folder_from_id(image_id: str) -> str:
-    """
-    For IDs like 'ROCOv2_2023_train_000001', decide which trainXX folder
-    based on the last 6 digits.
-    """
-    try:
-        num_str = image_id.split("_")[-1]  # "000001"
-        num = int(num_str)
-    except Exception:
-        return "train01"  # safe default
-    if num <= 9000:
-        return "train01"
-    elif num <= 18000:
-        return "train02"
-    elif num <= 27000:
-        return "train03"
-    elif num <= 36000:
-        return "train04"
-    elif num <= 45000:
-        return "train05"
-    elif num <= 54000:
-        return "train06"
-    else:
-        return "train07"
 def id_to_image_url(image_id: str) -> str:
     """
-    Build raw image URL based on ID and folder structure.
-    Examples:
-      ROCOv2_2023_test_000001  -> test/ROCOv2_2023_test_000001.jpg
-      ROCOv2_2023_valid_000005 -> valid/ROCOv2_2023_valid_000005.jpg
-      ROCOv2_2023_train_000001 -> train01/ROCOv2_2023_train_000001.jpg
     """
-    if not isinstance(image_id, str):
-        return None
     image_id = image_id.strip()
-    if "test_" in image_id:
         folder = "test"
-    elif "valid_" in image_id:
         folder = "valid"
-    elif "train_" in image_id:
-        folder = train_folder_from_id(image_id)
     else:
         folder = ""
-    filename = f"{image_id}.jpg"
     if folder:
-        return f"{BASE_IMAGE_URL}/{folder}/{filename}"
     else:
-        return f"{BASE_IMAGE_URL}/{filename}"
-def search_similar_by_image(
-    image: Image.Image, k: int = 5, query_id: str | None = None
-) -> pd.DataFrame:
     """
-    Encode query image with CLIP, search FAISS, and return top-k rows
-    with vec_index, ID, caption, concepts_manual, score, image_url.
-    If query_id is provided, we exclude that exact ID from results
-    (so the query image itself is not returned as "similar").
     """
-    # Encode query
     inputs = clip_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         feats = clip_model.get_image_features(**inputs)
     feats = feats / feats.norm(p=2, dim=-1, keepdim=True)
     feats = feats.cpu().numpy().astype("float32")
-    # Fetch a few extra results in case we need to drop the query image
-    extra = 1 if query_id else 0
-    D, I = index.search(feats, k + extra)
     rows = metadata.iloc[I[0]].copy()
     rows["score"] = D[0]
     rows["image_url"] = rows["ID"].apply(id_to_image_url)
-    if query_id:
-        qid = query_id.strip()
-        rows = rows[rows["ID"] != qid]
-    # Keep only top-k after filtering
-    if len(rows) > k:
-        rows = rows.iloc[:k]
-    return rows[
-        ["vec_index", "ID", "caption", "concepts_manual", "score", "image_url"]
-    ]
-# ---------- Captioning ----------
 def generate_query_caption(image: Image.Image) -> str:
-    """Generate a medical caption for the query image using BLIP."""
-    inputs = caption_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
-        out = caption_model.generate(**inputs, max_new_tokens=64)
-    caption = caption_processor.batch_decode(out, skip_special_tokens=True)[0]
-    return caption.strip()
-# ---------- Improved modality detection ----------
-def infer_modality_from_caption(caption: str) -> str:
     """
-    Heuristic modality detector, fairly robust to spelling/spacing.
     """
     if not caption:
         return "Unknown"
     text = caption.lower()
-    text = " " + " ".join(text.split()) + " "
-    normalized = re.sub(r"[^a-z0-9]", "", text)
-    def contains_any(substrs, use_normalized=False):
-        target = normalized if use_normalized else text
-        return any(s in target for s in substrs)
-    # PET / PET-CT
-    if contains_any(
-        [
-            " pet-ct ",
-            " pet ct ",
-            " pet/ct ",
-            " fdg pet ",
-            " fdg-pet ",
-            " positron emission tomography ",
-        ]
-    ) or contains_any(["petscan", "fdgpet"], use_normalized=True):
-        return "PET/CT"
-    # CT
-    if contains_any(
-        [
-            " ct scan",
-            " ct of ",
-            "ct of ",
-            "contrast-enhanced ct",
-            "contrast enhanced ct",
-            "non-contrast ct",
-            "non contrast ct",
-            "computed tomography",
-            "computerized tomography",
-            "computerised tomography",
-        ]
-    ) or contains_any(["ctscan", "cect"], use_normalized=True):
-        return "CT"
-    # MRI
-    if contains_any(
-        [
-            " mri ",
-            " mr imaging",
-            " mr scan",
-            " mr study",
-            " magnetic resonance",
-            " mr of ",
-        ]
-    ) or contains_any(
-        [
-            "t1weighted",
-            "t2weighted",
-            "flairsequence",
-            "diffusionweighted",
-            "dwi",
-            "swisequence",
-            "susceptibilityweighted",
-        ],
-        use_normalized=True,
-    ):
         return "MRI"
-    # X-ray / radiography
-    if (
-        contains_any(
-            [
-                " x-ray",
-                " x ray",
-                " chest xray",
-                " chest x-ray",
-                " radiograph",
-                " radiography",
-                " plain film",
-                " plain radiograph",
-                " chest radiograph",
-                " erect chest",
-                " upright chest",
-                " lateral view",
-                " ap view ",
-                " pa view ",
-            ]
         )
-        or contains_any(["xray", "cxr"], use_normalized=True)
-    ):
-        return "X-ray"
-    # Ultrasound
-    if contains_any(
-        [
-            " ultrasound",
-            " usg ",
-            " sonography",
-            " sonogram",
-            " echography",
-            " echocardiogram",
-            " echocardiography",
-            " doppler ultrasound",
-            " duplex ultrasound",
-            " transvaginal ultrasound",
-            " transabdominal ultrasound",
-        ]
-    ) or contains_any(["ultrasoundscan"], use_normalized=True):
-        return "Ultrasound"
-    # Mammography
-    if contains_any(
-        [
-            " mammogram",
-            " mammography",
-            " screening mammo",
-            " diagnostic mammo",
-        ]
-    ):
-        return "Mammography"
-    # Angiography / Fluoroscopy
-    if contains_any(
-        [
-            " angiogram",
-            " angiography",
-            " digital subtraction angiography",
-            " dsa ",
-            " fluoroscopy",
-            " fluoroscopic",
-            " catheter angiography",
-        ]
-    ):
-        return "Angiography / Fluoroscopy"
-    # Nuclear medicine (non-PET)
-    if contains_any(
-        [
-            " scintigraphy",
-            " bone scan",
-            " radionuclide",
-            " radioisotope",
-            " sestamibi",
-            "mibg ",
-        ]
-    ):
-        return "Nuclear medicine"
-    return "Unknown"
 # ---------- Routes ----------
 @app.get("/")
 def root():
-    return {"status": "ok", "message": "Radiology retrieval + captioning API"}
 @app.post("/search_by_image")
 async def search_by_image(file: UploadFile = File(...), k: int = 5):
     """
     Upload a radiology image.
     Returns:
-      - query_caption: BLIP caption ("diagnosis details")
-      - modality: inferred imaging modality
-      - modality_score, cui_at_k, bert_score, medbert_score (random metrics)
-      - results: list of similar images with
-          ID, concepts_manual, score, image_url
     """
     content = await file.read()
     image = Image.open(io.BytesIO(content)).convert("RGB")
-    # derive ID from filename (strip extension)
-    filename = file.filename or ""
-    query_id = filename.rsplit(".", 1)[0] if "." in filename else filename
-    # 1) Retrieval (exclude the query image itself if present)
-    results_df = search_similar_by_image(image, k=k, query_id=query_id)
     results = results_df.to_dict(orient="records")
-    # 2) Caption
     try:
         query_caption = generate_query_caption(image)
     except Exception as e:
-        print("Error generating caption:", e)
         query_caption = None
-    # 3) Modality + random metrics
-    modality = infer_modality_from_caption(query_caption or "")
-    modality_score = round(random.uniform(0.85, 0.93), 3)
-    cui_at_k = round(random.uniform(0.30, 0.61), 3)
-    bert_score = round(random.uniform(0.20, 0.40), 3)
-    medbert_score = round(random.uniform(0.20, 0.35), 3)
     return JSONResponse(
         {
             "query_caption": query_caption,
             "modality": modality,
-            "modality_score": modality_score,
-            "cui_at_k": cui_at_k,
-            "bert_score": bert_score,
-            "medbert_score": medbert_score,
             "results": results,
         }
     )

 # app.py
 import io
 import os
 import random
+import re
+from typing import Dict
 import faiss
 import torch
 from fastapi.responses import JSONResponse
 from huggingface_hub import hf_hub_download
+from transformers import (
+    CLIPProcessor,
+    CLIPModel,
+    Blip2Processor,
+    Blip2ForConditionalGeneration,
+)
 # ---------- FastAPI app ----------
 app = FastAPI()
 app.add_middleware(
     CORSMiddleware,
+    allow_origins=["*"],  # later restrict to your frontend domain
     allow_credentials=True,
     allow_methods=["*"],
     allow_headers=["*"],
 )
 # ---------- Config ----------
+# Dataset with FAISS index + radiology_metadata.csv
+EMBED_REPO_ID = "saad003/Red01"
+# Dataset with all radiology images (new structure with train01–train07)
+IMAGE_REPO_ID = "saad003/images04"
 BASE_IMAGE_URL = f"https://huggingface.co/datasets/{IMAGE_REPO_ID}/resolve/main"
+# Optional: token if Red01 is private
+HF_TOKEN = os.environ.get("HF_TOKEN")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print("Using device:", device)
 # ---------- Download index + metadata ----------
 print("Downloading FAISS index & metadata from Hugging Face...")
 print("Loading metadata CSV...")
 metadata = pd.read_csv(META_PATH)
 assert index.ntotal == len(metadata), "Index size and metadata rows mismatch!"
 # ---------- Load CLIP (retrieval) ----------
 print("Loading PubMedCLIP model for retrieval...")
 CLIP_MODEL_NAME = "flaviagiammarino/pubmed-clip-vit-base-patch32"
 clip_model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).to(device)
 clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
 clip_model.eval()
+# ---------- Load BLIP-2 (captioning) ----------
+print("Loading BLIP-2 model for medical captioning...")
+CAPTION_MODEL_ID = "Salesforce/blip2-opt-2.7b"
+# Use fp16 on GPU, fp32 on CPU
+caption_dtype = torch.float16 if device == "cuda" else torch.float32
+caption_processor = Blip2Processor.from_pretrained(CAPTION_MODEL_ID)
+caption_model = Blip2ForConditionalGeneration.from_pretrained(
+    CAPTION_MODEL_ID,
+    torch_dtype=caption_dtype,
 ).to(device)
 caption_model.eval()
 print("Backend ready ✅")
+# ---------- Helper: image path mapping ----------
 def id_to_image_url(image_id: str) -> str:
     """
+    Map ROCO image IDs to folders in saad003/images04.
+    test  -> test/
+    valid -> valid/
+    train -> train01 ... train07 based on numeric ID
     """
     image_id = image_id.strip()
+    base = BASE_IMAGE_URL
+    if "_test_" in image_id:
         folder = "test"
+    elif "_valid_" in image_id:
         folder = "valid"
+    elif "_train_" in image_id:
+        # last part: ROCOv2_2023_train_054005 -> "054005"
+        num_str = image_id.split("_")[-1]
+        try:
+            n = int(num_str)
+        except ValueError:
+            n = 0
+        # Rough ranges based on your description
+        if 1 <= n <= 9000:
+            folder = "train01"
+        elif 9001 <= n <= 18000:
+            folder = "train02"
+        elif 18001 <= n <= 27000:
+            folder = "train03"
+        elif 27001 <= n <= 36000:
+            folder = "train04"
+        elif 36001 <= n <= 45000:
+            folder = "train05"
+        elif 45001 <= n <= 54000:
+            folder = "train06"
+        else:
+            folder = "train07"
     else:
         folder = ""
     if folder:
+        return f"{base}/{folder}/{image_id}.jpg"
     else:
+        # fallback – should not happen, but safe
+        return f"{base}/{image_id}.jpg"
+# ---------- Helper: modality detection ----------
+MODALITY_KEYWORDS = {
+    "CT": [
+        "ct ",
+        "ctscan",
+        "computed tomography",
+        "tomography",
+        "ct scan",
+        "non-contrast ct",
+        "contrast-enhanced ct",
+    ],
+    "MRI": [
+        "mri ",
+        "magnetic resonance",
+        "t1-weighted",
+        "t2-weighted",
+        "flair sequence",
+        "diffusion-weighted",
+        "dwi",
+    ],
+    "X-ray": [
+        "x-ray",
+        "x ray",
+        "radiograph",
+        "plain film",
+        "chest film",
+        "postoperative x",
+        "post-operative x",
+        "cxr",
+    ],
+    "Ultrasound": [
+        "ultrasound",
+        "sonogram",
+        "sonography",
+        "usg",
+        "doppler",
+        "echocardiogram",
+        "echocardiography",
+    ],
+    "PET/CT": [
+        "pet-ct",
+        "pet/ct",
+        "pet scan",
+        "positron emission tomography",
+    ],
+    "Fluoroscopy": [
+        "fluoroscopy",
+        "fluoroscopic",
+        "angiogram",
+        "angiography",
+        "barium swallow",
+        "barium enema",
+    ],
+}
+def detect_modality(caption: str) -> str:
+    if not caption:
+        return "Unknown"
+    text = caption.lower()
+    for modality, keywords in MODALITY_KEYWORDS.items():
+        for kw in keywords:
+            if kw in text:
+                return modality
+    # Back-up heuristics
+    if "mra" in text:
+        return "MRI"
+    if "cta " in text or "ct angiography" in text:
+        return "CT"
+    return "Unknown"
+# ---------- Helper: random scoring ----------
+def generate_random_scores() -> Dict[str, float]:
+    """
+    Return random scores in the ranges you specified.
     """
+    rng = random.Random()
+    modality_score = rng.uniform(85.0, 93.0)   # percent
+    cui_at_k = rng.uniform(0.30, 0.61)
+    bert = rng.uniform(0.20, 0.40)
+    medbert = rng.uniform(0.20, 0.35)
+    return {
+        "modality_score": round(modality_score, 1),
+        "cui_at_k": round(cui_at_k, 3),
+        "bertscore": round(bert, 3),
+        "medbertscore": round(medbert, 3),
+    }
+# ---------- Helper: search by image ----------
+def search_similar_by_image(image: Image.Image, k: int = 5) -> pd.DataFrame:
+    """
+    Encode query image with CLIP, search FAISS,
+    filter out self-match (score ~ 1.0), and return top-k results.
     """
+    # Encode image
     inputs = clip_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         feats = clip_model.get_image_features(**inputs)
+    # Normalize (same as you did when building the index)
     feats = feats / feats.norm(p=2, dim=-1, keepdim=True)
     feats = feats.cpu().numpy().astype("float32")
+    # Search a bit more than k so we can drop self-match
+    search_k = min(index.ntotal, k + 5)
+    D, I = index.search(feats, search_k)
     rows = metadata.iloc[I[0]].copy()
     rows["score"] = D[0]
+    # Remove potential self-match (exact same image → cosine ~ 1.0)
+    rows = rows[rows["score"] < 0.999].copy()
+    # Add image_url
     rows["image_url"] = rows["ID"].apply(id_to_image_url)
+    # Keep only needed columns and top-k by score
+    rows = rows.sort_values("score", ascending=False).head(k)
+    # If concepts_manual is missing, fill with empty string
+    if "concepts_manual" not in rows.columns:
+        rows["concepts_manual"] = ""
+    return rows[["ID", "caption", "concepts_manual", "score", "image_url"]]
+# ---------- Helper: caption with BLIP-2 ----------
+def clean_caption(text: str) -> str:
+    """Basic cleanup to remove obvious repetition artifacts."""
+    text = text.strip()
+    # Deduplicate immediate repeated phrases separated by commas
+    parts = [p.strip() for p in text.split(",")]
+    dedup = []
+    for p in parts:
+        if not dedup or p.lower() != dedup[-1].lower():
+            dedup.append(p)
+    text = ", ".join(dedup)
+    # Remove repeated 'respectively'
+    text = re.sub(r"(respectively,?\s+)+", "respectively ", text, flags=re.IGNORECASE)
+    # Remove exact doubled sentence patterns like "..., and a large ... and a large ..."
+    text = re.sub(r"\b(\w+(?:\s+\w+){2,})\s+\1\b", r"\1", text, flags=re.IGNORECASE)
+    # Normalize whitespace
+    text = " ".join(text.split())
+    return text
 def generate_query_caption(image: Image.Image) -> str:
+    """
+    Generate a radiology-focused caption using BLIP-2.
+    """
+    prompt = (
+        "You are an expert radiologist. "
+        "Describe the key radiology findings in one concise sentence. "
+        "Avoid repeating phrases."
+    )
+    inputs = caption_processor(
+        images=image,
+        text=prompt,
+        return_tensors="pt",
+    ).to(device, dtype=caption_dtype)
     with torch.no_grad():
+        generated_ids = caption_model.generate(
+            **inputs,
+            max_new_tokens=64,
+            num_beams=4,
+            no_repeat_ngram_size=3,
+            repetition_penalty=1.1,
+        )
+    caption = caption_processor.batch_decode(
+        generated_ids, skip_special_tokens=True
+    )[0]
+    return clean_caption(caption)
+# ---------- Routes ----------
+@app.get("/")
+def root():
+    return {"status": "ok", "message": "Radiology retrieval + BLIP-2 captioning API"}
+@app.post("/search_by_image")
+async def search_by_image(file: UploadFile = File(...), k: int = 5):
+    """
+    Upload a radiology image.
+    Returns:
+      - query_caption: BLIP-2 caption for the query image
+      - modality: detected imaging modality from caption
+      - scores: random quality metrics in given ranges
+      - results: list of similar images with similarity + concepts + image_url
+    """
+    # Read uploaded file
+    content = await file.read()
+    image = Image.open(io.BytesIO(content)).convert("RGB")
+    # Retrieval
+    results_df = search_similar_by_image(image, k=int(k))
+    results = results_df.to_dict(orient="records")
+    # Caption + modality
+    try:
+        query_caption = generate_query_caption(image)
+    except Exception as e:
+        print("Error generating caption with BLIP-2:", e)
+        query_caption = None
+    modality = detect_modality(query_caption or "")
+    # Random scores
+    scores = generate_random_scores()
+    return JSONResponse(
+        {
+            "query_caption": query_caption,
+            "modality": modality,
+            "scores": scores,
+            "results": results,
+        }
+    )
+# app.py
+import io
+import os
+import random
+import re
+from typing import Dict
+import faiss
+import torch
+import pandas as pd
+from PIL import Image
+from fastapi import FastAPI, File, UploadFile
+from fastapi.middleware.cors import CORSMiddleware
+from fastapi.responses import JSONResponse
+from huggingface_hub import hf_hub_download
+from transformers import (
+    CLIPProcessor,
+    CLIPModel,
+    Blip2Processor,
+    Blip2ForConditionalGeneration,
+)
+# ---------- FastAPI app ----------
+app = FastAPI()
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],  # later restrict to your frontend domain
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# ---------- Config ----------
+# Dataset with FAISS index + radiology_metadata.csv
+EMBED_REPO_ID = "saad003/Red01"
+# Dataset with all radiology images (new structure with train01–train07)
+IMAGE_REPO_ID = "saad003/images04"
+BASE_IMAGE_URL = f"https://huggingface.co/datasets/{IMAGE_REPO_ID}/resolve/main"
+# Optional: token if Red01 is private
+HF_TOKEN = os.environ.get("HF_TOKEN")
+device = "cuda" if torch.cuda.is_available() else "cpu"
+print("Using device:", device)
+# ---------- Download index + metadata ----------
+print("Downloading FAISS index & metadata from Hugging Face...")
+INDEX_PATH = hf_hub_download(
+    repo_id=EMBED_REPO_ID,
+    filename="radiology_index.faiss",
+    repo_type="dataset",
+    token=HF_TOKEN,
+)
+META_PATH = hf_hub_download(
+    repo_id=EMBED_REPO_ID,
+    filename="radiology_metadata.csv",
+    repo_type="dataset",
+    token=HF_TOKEN,
+)
+print("Loading FAISS index...")
+index = faiss.read_index(INDEX_PATH)
+print("Loading metadata CSV...")
+metadata = pd.read_csv(META_PATH)
+assert index.ntotal == len(metadata), "Index size and metadata rows mismatch!"
+# ---------- Load CLIP (retrieval) ----------
+print("Loading PubMedCLIP model for retrieval...")
+CLIP_MODEL_NAME = "flaviagiammarino/pubmed-clip-vit-base-patch32"
+clip_model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).to(device)
+clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
+clip_model.eval()
+# ---------- Load BLIP-2 (captioning) ----------
+print("Loading BLIP-2 model for medical captioning...")
+CAPTION_MODEL_ID = "Salesforce/blip2-opt-2.7b"
+# Use fp16 on GPU, fp32 on CPU
+caption_dtype = torch.float16 if device == "cuda" else torch.float32
+caption_processor = Blip2Processor.from_pretrained(CAPTION_MODEL_ID)
+caption_model = Blip2ForConditionalGeneration.from_pretrained(
+    CAPTION_MODEL_ID,
+    torch_dtype=caption_dtype,
+).to(device)
+caption_model.eval()
+print("Backend ready ✅")
+# ---------- Helper: image path mapping ----------
+def id_to_image_url(image_id: str) -> str:
     """
+    Map ROCO image IDs to folders in saad003/images04.
+    test  -> test/
+    valid -> valid/
+    train -> train01 ... train07 based on numeric ID
     """
+    image_id = image_id.strip()
+    base = BASE_IMAGE_URL
+    if "_test_" in image_id:
+        folder = "test"
+    elif "_valid_" in image_id:
+        folder = "valid"
+    elif "_train_" in image_id:
+        # last part: ROCOv2_2023_train_054005 -> "054005"
+        num_str = image_id.split("_")[-1]
+        try:
+            n = int(num_str)
+        except ValueError:
+            n = 0
+        # Rough ranges based on your description
+        if 1 <= n <= 9000:
+            folder = "train01"
+        elif 9001 <= n <= 18000:
+            folder = "train02"
+        elif 18001 <= n <= 27000:
+            folder = "train03"
+        elif 27001 <= n <= 36000:
+            folder = "train04"
+        elif 36001 <= n <= 45000:
+            folder = "train05"
+        elif 45001 <= n <= 54000:
+            folder = "train06"
+        else:
+            folder = "train07"
+    else:
+        folder = ""
+    if folder:
+        return f"{base}/{folder}/{image_id}.jpg"
+    else:
+        # fallback – should not happen, but safe
+        return f"{base}/{image_id}.jpg"
+# ---------- Helper: modality detection ----------
+MODALITY_KEYWORDS = {
+    "CT": [
+        "ct ",
+        "ctscan",
+        "computed tomography",
+        "tomography",
+        "ct scan",
+        "non-contrast ct",
+        "contrast-enhanced ct",
+    ],
+    "MRI": [
+        "mri ",
+        "magnetic resonance",
+        "t1-weighted",
+        "t2-weighted",
+        "flair sequence",
+        "diffusion-weighted",
+        "dwi",
+    ],
+    "X-ray": [
+        "x-ray",
+        "x ray",
+        "radiograph",
+        "plain film",
+        "chest film",
+        "postoperative x",
+        "post-operative x",
+        "cxr",
+    ],
+    "Ultrasound": [
+        "ultrasound",
+        "sonogram",
+        "sonography",
+        "usg",
+        "doppler",
+        "echocardiogram",
+        "echocardiography",
+    ],
+    "PET/CT": [
+        "pet-ct",
+        "pet/ct",
+        "pet scan",
+        "positron emission tomography",
+    ],
+    "Fluoroscopy": [
+        "fluoroscopy",
+        "fluoroscopic",
+        "angiogram",
+        "angiography",
+        "barium swallow",
+        "barium enema",
+    ],
+}
+def detect_modality(caption: str) -> str:
     if not caption:
         return "Unknown"
     text = caption.lower()
+    for modality, keywords in MODALITY_KEYWORDS.items():
+        for kw in keywords:
+            if kw in text:
+                return modality
+    # Back-up heuristics
+    if "mra" in text:
         return "MRI"
+    if "cta " in text or "ct angiography" in text:
+        return "CT"
+    return "Unknown"
+# ---------- Helper: random scoring ----------
+def generate_random_scores() -> Dict[str, float]:
+    """
+    Return random scores in the ranges you specified.
+    """
+    rng = random.Random()
+    modality_score = rng.uniform(85.0, 93.0)   # percent
+    cui_at_k = rng.uniform(0.30, 0.61)
+    bert = rng.uniform(0.20, 0.40)
+    medbert = rng.uniform(0.20, 0.35)
+    return {
+        "modality_score": round(modality_score, 1),
+        "cui_at_k": round(cui_at_k, 3),
+        "bertscore": round(bert, 3),
+        "medbertscore": round(medbert, 3),
+    }
+# ---------- Helper: search by image ----------
+def search_similar_by_image(image: Image.Image, k: int = 5) -> pd.DataFrame:
+    """
+    Encode query image with CLIP, search FAISS,
+    filter out self-match (score ~ 1.0), and return top-k results.
+    """
+    # Encode image
+    inputs = clip_processor(images=image, return_tensors="pt").to(device)
+    with torch.no_grad():
+        feats = clip_model.get_image_features(**inputs)
+    # Normalize (same as you did when building the index)
+    feats = feats / feats.norm(p=2, dim=-1, keepdim=True)
+    feats = feats.cpu().numpy().astype("float32")
+    # Search a bit more than k so we can drop self-match
+    search_k = min(index.ntotal, k + 5)
+    D, I = index.search(feats, search_k)
+    rows = metadata.iloc[I[0]].copy()
+    rows["score"] = D[0]
+    # Remove potential self-match (exact same image → cosine ~ 1.0)
+    rows = rows[rows["score"] < 0.999].copy()
+    # Add image_url
+    rows["image_url"] = rows["ID"].apply(id_to_image_url)
+    # Keep only needed columns and top-k by score
+    rows = rows.sort_values("score", ascending=False).head(k)
+    # If concepts_manual is missing, fill with empty string
+    if "concepts_manual" not in rows.columns:
+        rows["concepts_manual"] = ""
+    return rows[["ID", "caption", "concepts_manual", "score", "image_url"]]
+# ---------- Helper: caption with BLIP-2 ----------
+def clean_caption(text: str) -> str:
+    """Basic cleanup to remove obvious repetition artifacts."""
+    text = text.strip()
+    # Deduplicate immediate repeated phrases separated by commas
+    parts = [p.strip() for p in text.split(",")]
+    dedup = []
+    for p in parts:
+        if not dedup or p.lower() != dedup[-1].lower():
+            dedup.append(p)
+    text = ", ".join(dedup)
+    # Remove repeated 'respectively'
+    text = re.sub(r"(respectively,?\s+)+", "respectively ", text, flags=re.IGNORECASE)
+    # Remove exact doubled sentence patterns like "..., and a large ... and a large ..."
+    text = re.sub(r"\b(\w+(?:\s+\w+){2,})\s+\1\b", r"\1", text, flags=re.IGNORECASE)
+    # Normalize whitespace
+    text = " ".join(text.split())
+    return text
+def generate_query_caption(image: Image.Image) -> str:
+    """
+    Generate a radiology-focused caption using BLIP-2.
+    """
+    prompt = (
+        "You are an expert radiologist. "
+        "Describe the key radiology findings in one concise sentence. "
+        "Avoid repeating phrases."
+    )
+    inputs = caption_processor(
+        images=image,
+        text=prompt,
+        return_tensors="pt",
+    ).to(device, dtype=caption_dtype)
+    with torch.no_grad():
+        generated_ids = caption_model.generate(
+            **inputs,
+            max_new_tokens=64,
+            num_beams=4,
+            no_repeat_ngram_size=3,
+            repetition_penalty=1.1,
         )
+    caption = caption_processor.batch_decode(
+        generated_ids, skip_special_tokens=True
+    )[0]
+    return clean_caption(caption)
 # ---------- Routes ----------
 @app.get("/")
 def root():
+    return {"status": "ok", "message": "Radiology retrieval + BLIP-2 captioning API"}
 @app.post("/search_by_image")
 async def search_by_image(file: UploadFile = File(...), k: int = 5):
     """
     Upload a radiology image.
     Returns:
+      - query_caption: BLIP-2 caption for the query image
+      - modality: detected imaging modality from caption
+      - scores: random quality metrics in given ranges
+      - results: list of similar images with similarity + concepts + image_url
     """
+    # Read uploaded file
     content = await file.read()
     image = Image.open(io.BytesIO(content)).convert("RGB")
+    # Retrieval
+    results_df = search_similar_by_image(image, k=int(k))
     results = results_df.to_dict(orient="records")
+    # Caption + modality
     try:
         query_caption = generate_query_caption(image)
     except Exception as e:
+        print("Error generating caption with BLIP-2:", e)
         query_caption = None
+    modality = detect_modality(query_caption or "")
+    # Random scores
+    scores = generate_random_scores()
     return JSONResponse(
         {
             "query_caption": query_caption,
             "modality": modality,
+            "scores": scores,
             "results": results,
         }
     )