Update app.py

app.py CHANGED
@@ -18,8 +18,8 @@ from huggingface_hub import hf_hub_download
 from transformers import (
     CLIPProcessor,
     CLIPModel,
-    Blip2Processor,
-    Blip2ForConditionalGeneration,
+    BlipForConditionalGeneration,
+    AutoProcessor,
 )

 # ---------- FastAPI app ----------
@@ -38,7 +38,7 @@ app.add_middleware(
 # Dataset with FAISS index + radiology_metadata.csv
 EMBED_REPO_ID = "saad003/Red01"

-# Dataset with all radiology images (new structure with train01–train07)
+# Dataset with all radiology images (test, valid, train01–train07)
 IMAGE_REPO_ID = "saad003/images04"
 BASE_IMAGE_URL = f"https://huggingface.co/datasets/{IMAGE_REPO_ID}/resolve/main"
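The `resolve/main` URL pattern serves raw files straight out of a dataset repo, so the backend can link to images without downloading them. A quick sketch of the URL that the `id_to_image_url` helper further down will build from this base, using the ROCOv2-style ID quoted in the code's own comments:

```python
IMAGE_REPO_ID = "saad003/images04"
BASE_IMAGE_URL = f"https://huggingface.co/datasets/{IMAGE_REPO_ID}/resolve/main"

# ID taken from the code comment below; n = 54005 is past train06's
# 45001-54000 bucket, so it falls into the train07 else-branch.
example_id = "ROCOv2_2023_train_054005"
print(f"{BASE_IMAGE_URL}/train07/{example_id}.jpg")
```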
@@ -48,6 +48,9 @@ HF_TOKEN = os.environ.get("HF_TOKEN")
 device = "cuda" if torch.cuda.is_available() else "cpu"
 print("Using device:", device)

+# use fp16 on GPU to speed up BLIP, fp32 on CPU
+caption_dtype = torch.float16 if device == "cuda" else torch.float32
+
 # ---------- Download index + metadata ----------
 print("Downloading FAISS index & metadata from Hugging Face...")
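Hoisting `caption_dtype` above the model sections matters beyond speed: a model loaded with `torch_dtype=torch.float16` cannot take float32 inputs, so the same constant has to be applied to the pixel tensors at inference time, as `generate_query_caption` does below. A minimal sketch of that invariant:

```python
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"
caption_dtype = torch.float16 if device == "cuda" else torch.float32

# Processors emit float32 pixel values; cast them to the model's dtype,
# otherwise fp16 weights typically raise a Half-vs-Float dtype mismatch.
pixel_values = torch.randn(1, 3, 384, 384).to(device, dtype=caption_dtype)
print(pixel_values.dtype)  # torch.float16 on GPU, torch.float32 on CPU
```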
@@ -81,15 +84,12 @@ clip_model = CLIPModel.from_pretrained(CLIP_MODEL_NAME).to(device)
 clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL_NAME)
 clip_model.eval()

-# ---------- Load BLIP-2 (captioning) ----------
-print("Loading BLIP-2 model for medical captioning...")
-CAPTION_MODEL_ID = "Salesforce/blip2-opt-2.7b"
-
-# Use fp16 on GPU, fp32 on CPU
-caption_dtype = torch.float16 if device == "cuda" else torch.float32
+# ---------- Load BLIP (radiology captioning) ----------
+print("Loading BLIP ROCO radiology captioning model...")
+CAPTION_MODEL_ID = "WafaaFraih/blip-roco-radiology-captioning"

-caption_processor = Blip2Processor.from_pretrained(CAPTION_MODEL_ID)
-caption_model = Blip2ForConditionalGeneration.from_pretrained(
+caption_processor = AutoProcessor.from_pretrained(CAPTION_MODEL_ID)
+caption_model = BlipForConditionalGeneration.from_pretrained(
     CAPTION_MODEL_ID,
     torch_dtype=caption_dtype,
 ).to(device)
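A standalone smoke test for the new captioner, assuming only that the `WafaaFraih/blip-roco-radiology-captioning` checkpoint named above is public and BLIP-compatible; `scan.jpg` is a placeholder path:

```python
import torch
from PIL import Image
from transformers import AutoProcessor, BlipForConditionalGeneration

model_id = "WafaaFraih/blip-roco-radiology-captioning"  # checkpoint from the diff
device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

processor = AutoProcessor.from_pretrained(model_id)
model = BlipForConditionalGeneration.from_pretrained(model_id, torch_dtype=dtype).to(device)
model.eval()

image = Image.open("scan.jpg").convert("RGB")  # placeholder path
inputs = processor(images=image, return_tensors="pt").to(device, dtype=dtype)
with torch.no_grad():
    ids = model.generate(**inputs, max_new_tokens=40)
print(processor.batch_decode(ids, skip_special_tokens=True)[0])
```

Unlike BLIP-2, BLIP can caption unconditionally from image-only inputs, which is why the later hunks drop the "expert radiologist" text prompt entirely.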
@@ -116,14 +116,12 @@ def id_to_image_url(image_id: str) -> str:
     elif "_valid_" in image_id:
         folder = "valid"
     elif "_train_" in image_id:
-        # last part: ROCOv2_2023_train_054005 -> "054005"
         num_str = image_id.split("_")[-1]
         try:
             n = int(num_str)
         except ValueError:
             n = 0

-        # Rough ranges based on your description
         if 1 <= n <= 9000:
             folder = "train01"
         elif 9001 <= n <= 18000:
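Given the bucket ranges above, the mapping is easy to sanity-check. The IDs here are hypothetical but follow the `ROCOv2_2023_<split>_<number>` scheme the code comments describe:

```python
# Assumes the helpers from app.py are in scope (importing app.py will also
# load the models, so in practice run this inside the same process).
for image_id in [
    "ROCOv2_2023_train_008500",  # n = 8500  -> train01
    "ROCOv2_2023_train_017999",  # n = 17999 -> train02
    "ROCOv2_2023_test_000123",   # split marker wins -> test/
]:
    print(id_to_image_url(image_id))
```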
@@ -214,7 +212,6 @@ def detect_modality(caption: str) -> str:
             if kw in text:
                 return modality

-    # Back-up heuristics
     if "mra" in text:
         return "MRI"
     if "cta " in text or "ct angiography" in text:
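`detect_modality` is first-match-wins over the keyword table, with the MRA/CTA checks as fallbacks; note the trailing space in `"ct "` and `"cta "` that keeps them from matching inside longer words. For instance:

```python
# Expected behavior of detect_modality as defined in app.py:
print(detect_modality("Axial contrast-enhanced CT of the abdomen"))  # CT
print(detect_modality("MRA shows a basilar tip aneurysm"))           # MRI (fallback)
print(detect_modality(""))                                           # Unknown
```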
@@ -243,491 +240,115 @@ def generate_random_scores() -> Dict[str, float]:
     }


-# ---------- Helper: search by image ----------
+# ---------- Helper: FAISS search ----------

 def search_similar_by_image(image: Image.Image, k: int = 5) -> pd.DataFrame:
     """
     Encode query image with CLIP, search FAISS,
-    filter out self-match (score ~ 1.0), and return top-k results.
+    filter out self-match, and return top-k results.
     """
-    # Encode image
     inputs = clip_processor(images=image, return_tensors="pt").to(device)
     with torch.no_grad():
         feats = clip_model.get_image_features(**inputs)

-    # Normalize (same as you did when building the index)
     feats = feats / feats.norm(p=2, dim=-1, keepdim=True)
     feats = feats.cpu().numpy().astype("float32")

-    # Search a bit more than k so we can drop self-match
     search_k = min(index.ntotal, k + 5)
     D, I = index.search(feats, search_k)

     rows = metadata.iloc[I[0]].copy()
     rows["score"] = D[0]

-    # Remove potential self-match (exact same image → cosine ~ 1.0)
+    # drop exact self-match
     rows = rows[rows["score"] < 0.999].copy()

-    # Add image_url
     rows["image_url"] = rows["ID"].apply(id_to_image_url)

-    # Keep only needed columns and top-k by score
     rows = rows.sort_values("score", ascending=False).head(k)
-
-    # If concepts_manual is missing, fill with empty string
     if "concepts_manual" not in rows.columns:
         rows["concepts_manual"] = ""

     return rows[["ID", "caption", "concepts_manual", "score", "image_url"]]

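The `score` column only behaves this way (self-match at ~1.0, descending sort putting the most similar hits first) if the index stores inner products of L2-normalized embeddings, i.e. cosine similarity. The diff never shows the index-building side, so here is a compatible sketch under that assumption:

```python
import faiss
import numpy as np

d = 512                                                # CLIP ViT-B/32 feature size
feats = np.random.randn(1000, d).astype("float32")
feats /= np.linalg.norm(feats, axis=1, keepdims=True)  # L2-normalize, as in the app

index = faiss.IndexFlatIP(d)  # inner product == cosine on unit vectors
index.add(feats)

D, I = index.search(feats[:1], 5)
print(D[0])  # leading hit is the query itself at ~1.0 -> dropped by the < 0.999 filter
```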
-# ---------- Helper: caption with BLIP-2 ----------
+# ---------- Helper: caption cleaning & generation ----------

 def clean_caption(text: str) -> str:
-    """Basic cleanup to remove obvious repetition artifacts."""
-    text = text.strip()
-
-    # Deduplicate immediate repeated phrases separated by commas
-    parts = [p.strip() for p in text.split(",")]
-    dedup = []
-    for p in parts:
-        if not dedup or p.lower() != dedup[-1].lower():
-            dedup.append(p)
-    text = ", ".join(dedup)
-
-    # Remove repeated 'respectively'
-    text = re.sub(r"(respectively,?\s+)+", "respectively ", text, flags=re.IGNORECASE)
-
-    # Remove exact doubled sentence patterns like "..., and a large ... and a large ..."
-    text = re.sub(r"\b(\w+(?:\s+\w+){2,})\s+\1\b", r"\1", text, flags=re.IGNORECASE)
-
-    # Normalize whitespace
-    text = " ".join(text.split())
-    return text
+    """
+    Clean BLIP captions:
+    - strip
+    - split into clauses and remove duplicates
+    - normalize spacing and punctuation
+    """
+    if not text:
+        return ""
+
+    text = text.strip()
+
+    # break into clauses
+    parts = re.split(r"[,.]", text)
+    parts = [p.strip() for p in parts if p.strip()]
+
+    seen = set()
+    unique_parts = []
+    for p in parts:
+        key = p.lower()
+        if key not in seen:
+            seen.add(key)
+            unique_parts.append(p)
+
+    if not unique_parts:
+        cleaned = text
+    else:
+        cleaned = ", ".join(unique_parts)
+
+    # remove repeated 'respectively'
+    cleaned = re.sub(
+        r"(respectively,?\s+)+", "respectively ", cleaned, flags=re.IGNORECASE
+    )
+
+    cleaned = " ".join(cleaned.split())
+    if cleaned and not cleaned.endswith("."):
+        cleaned += "."
+    cleaned = cleaned[0].upper() + cleaned[1:] if cleaned else cleaned
+    return cleaned
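The rewritten `clean_caption` is stricter than the old one: it splits on periods as well as commas, drops any repeated clause rather than only immediately adjacent ones, and forces a terminal period plus an initial capital. A worked (hypothetical) example:

```python
# Behavior of the new clean_caption on a repetitive BLIP output:
raw = "ct scan of the chest, ct scan of the chest. large pleural effusion. large pleural effusion"
print(clean_caption(raw))
# -> "Ct scan of the chest, large pleural effusion."
# (Quirk: naive first-letter capitalization turns "ct" into "Ct".)
```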


 def generate_query_caption(image: Image.Image) -> str:
     """
-    Generate a radiology-focused caption using BLIP-2.
+    Generate a radiology caption using BLIP (ROCO).
+    Tuned decoding to reduce repetition and keep it concise.
     """
-    prompt = (
-        "You are an expert radiologist. "
-        "Describe the key radiology findings in one concise sentence. "
-        "Avoid repeating phrases."
-    )
-
-    inputs = caption_processor(
-        images=image,
-        text=prompt,
-        return_tensors="pt",
-    ).to(device, dtype=caption_dtype)
+    inputs = caption_processor(images=image, return_tensors="pt").to(
+        device, dtype=caption_dtype
+    )

     with torch.no_grad():
-        generated_ids = caption_model.generate(
+        out_ids = caption_model.generate(
             **inputs,
-            max_new_tokens=64,
-            num_beams=4,
-            no_repeat_ngram_size=3,
-            repetition_penalty=1.1,
+            max_new_tokens=40,
+            num_beams=5,
+            no_repeat_ngram_size=4,
+            repetition_penalty=1.4,
+            length_penalty=0.9,
+            early_stopping=True,
         )

-    caption = caption_processor.batch_decode(
-        generated_ids, skip_special_tokens=True
+    raw_caption = caption_processor.batch_decode(
+        out_ids, skip_special_tokens=True
     )[0]
-    return clean_caption(caption)
+    return clean_caption(raw_caption)
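The retuned `generate()` call trades raw length for diversity: 64 to 40 new tokens, beam width 4 to 5, blocked n-grams 3 to 4, repetition penalty 1.1 to 1.4, plus `length_penalty=0.9` and `early_stopping=True` to favor short finished beams. Pulled out as a config dict, values straight from the diff:

```python
GEN_KWARGS = dict(
    max_new_tokens=40,       # hard cap on caption length
    num_beams=5,             # wider beam search
    no_repeat_ngram_size=4,  # never emit the same 4-gram twice
    repetition_penalty=1.4,  # discourage re-used tokens
    length_penalty=0.9,      # < 1.0 nudges beams toward shorter captions
    early_stopping=True,     # stop once all beams have finished
)
# usage: out_ids = caption_model.generate(**inputs, **GEN_KWARGS)
```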


 # ---------- Routes ----------

 @app.get("/")
 def root():
-    return {"status": "ok", "message": "Radiology retrieval + BLIP-2 captioning API"}
+    return {
+        "status": "ok",
+        "message": "Radiology retrieval + BLIP radiology captioning API",
+    }
-@app.post("/search_by_image")
-async def search_by_image(file: UploadFile = File(...), k: int = 5):
-    """
-    Upload a radiology image.
-    Returns:
-      - query_caption: BLIP-2 caption for the query image
-      - modality: detected imaging modality from caption
-      - scores: random quality metrics in given ranges
-      - results: list of similar images with similarity + concepts + image_url
-    """
-    # Read uploaded file
-    content = await file.read()
-    image = Image.open(io.BytesIO(content)).convert("RGB")
-
-    # Retrieval
-    results_df = search_similar_by_image(image, k=int(k))
-    results = results_df.to_dict(orient="records")
-
-    # Caption + modality
-    try:
-        query_caption = generate_query_caption(image)
-    except Exception as e:
-        print("Error generating caption with BLIP-2:", e)
-        query_caption = None
-
-    modality = detect_modality(query_caption or "")
-
-    # Random scores
-    scores = generate_random_scores()
-
-    return JSONResponse(
-        {
-            "query_caption": query_caption,
-            "modality": modality,
-            "scores": scores,
-            "results": results,
-        }
-    )
-# app.py
-import io
-import os
-import random
-import re
-from typing import Dict
-
-import faiss
-import torch
-import pandas as pd
-
-from PIL import Image
-from fastapi import FastAPI, File, UploadFile
-from fastapi.middleware.cors import CORSMiddleware
-from fastapi.responses import JSONResponse
-
-from huggingface_hub import hf_hub_download
-from transformers import (
-    CLIPProcessor,
-    CLIPModel,
-    Blip2Processor,
-    Blip2ForConditionalGeneration,
-)
[... old lines 409-730: the rest of a verbatim second copy of app.py (config, model loading, helpers, routes) that had been pasted at the end of the file; this commit deletes the whole duplicate block ...]


 @app.post("/search_by_image")
@@ -735,12 +356,11 @@ async def search_by_image(file: UploadFile = File(...), k: int = 5):
     """
     Upload a radiology image.
     Returns:
-      - query_caption: BLIP-2 caption for the query image
+      - query_caption: BLIP caption for the query image
       - modality: detected imaging modality from caption
-      - scores: random quality metrics in given ranges
-      - results: list of similar images with similarity + concepts + image_url
+      - scores: random quality metrics
+      - results: similar images (similarity + concepts + image_url)
     """
-    # Read uploaded file
     content = await file.read()
     image = Image.open(io.BytesIO(content)).convert("RGB")
@@ -752,7 +372,7 @@ async def search_by_image(file: UploadFile = File(...), k: int = 5):
     try:
         query_caption = generate_query_caption(image)
     except Exception as e:
-        print("Error generating caption with BLIP-2:", e)
+        print("Error generating caption:", e)
         query_caption = None

     modality = detect_modality(query_caption or "")
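For completeness, a client-side sketch of the endpoint as it stands after this commit; the base URL and image path are placeholders for your own deployment:

```python
import requests

url = "http://localhost:7860/search_by_image"  # placeholder deployment URL

with open("chest_xray.jpg", "rb") as f:        # placeholder image
    resp = requests.post(url, files={"file": f}, params={"k": 5})

data = resp.json()
print(data["query_caption"], "|", data["modality"], "|", data["scores"])
for hit in data["results"]:
    print(round(hit["score"], 3), hit["ID"], hit["image_url"])
```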