Spaces:

supib4132
/

RAGExplo1234

Sleeping

App Files Files Community

supib4132 committed on Apr 22, 2025

Commit

2d10e05

verified ·

1 Parent(s): fe6f15c

Upload 5 files

Browse files

Files changed (6) hide show

.gitattributes +1 -0
README.md +34 -13
captions.json +0 -0
faiss_index.idx +3 -0
inference.py +110 -0
requirements.txt +11 -0

.gitattributes CHANGED Viewed

@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+faiss_index.idx filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,13 +1,34 @@
----
-title: RAGExplo1234
-emoji: 👀
-colorFrom: blue
-colorTo: purple
-sdk: gradio
-sdk_version: 5.25.2
-app_file: app.py
-pinned: false
-license: apache-2.0
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: RAG Image Captioning
+emoji: 📸
+colorFrom: blue
+colorTo: green
+sdk: gradio
+sdk_version: 3.35.2
+app_file: app.py
+pinned: false
+---
+RAG Image Captioning Space
+This Space hosts a RAG-based image captioning model that generates captions for images using CLIP (openai/clip-vit-base-patch32), T5 (t5-small), and SentenceTransformer (all-MiniLM-L6-v2). It retrieves similar captions from a FAISS index and generates a final caption using T5.
+Usage
+Upload an image via the Gradio interface to generate a caption.
+Use the API (/api/predict) to integrate with web or mobile apps.
+Files
+app.py: Gradio interface for the Space.
+inference.py: Custom inference script with generate_rag_caption.
+requirements.txt: Dependencies.
+faiss_index.idx: FAISS index for retrieval.
+captions.json: Caption corpus.
+Setup
+Dependencies are installed from requirements.txt. The en_core_web_sm spaCy model is downloaded automatically.
+pip install -r requirements.txt
+python -m spacy download en_core_web_sm
+API Integration
+Send a POST request to /api/predict with a base64-encoded image:
+import requests
+import base64
+api_url = "https://your-username-rag-image-captioning.hf.space/api/predict"
+with open("test_image.jpg", "rb") as f:
+    image_bytes = f.read()
+    base64_image = f"data:image/jpeg;base64,{base64.b64encode(image_bytes).decode()}"
+response = requests.post(api_url, json={"data": [base64_image]})
+print(response.json()["data"][0])
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

captions.json ADDED Viewed

The diff for this file is too large to render. See raw diff

faiss_index.idx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a43f9410efe919810cd35354d77d7396cdee594a5d3998aadb6bc03606274332
+size 3446829

inference.py ADDED Viewed

	@@ -0,0 +1,110 @@

+from PIL import Image
+import torch
+from transformers import CLIPProcessor, CLIPModel, T5Tokenizer, T5ForConditionalGeneration
+from sentence_transformers import SentenceTransformer
+import faiss
+import numpy as np
+import json
+import spacy
+# Load models and resources
+# Everything below runs once at import time; the functions in this module
+# read these names as module-level globals.
+# The CLIP model and its processor are loaded from the same checkpoint name.
+clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+# NOTE(review): text_encoder is loaded here but not referenced by any function
+# visible in this file - confirm whether it is still needed.
+text_encoder = SentenceTransformer('all-MiniLM-L6-v2')
+# T5 tokenizer/generator pair used to write the final caption.
+tokenizer = T5Tokenizer.from_pretrained("t5-small")
+generator = T5ForConditionalGeneration.from_pretrained("t5-small")
+# spaCy pipeline used for named-entity extraction on retrieved captions.
+nlp = spacy.load("en_core_web_sm")
+# Load FAISS index and captions
+# Both paths are relative to the current working directory.
+faiss_index = faiss.read_index("./faiss_index.idx")
+# captions is indexed by the integer ids that faiss_index.search returns, so it
+# is presumably a list aligned row-for-row with the index - TODO confirm.
+with open("./captions.json", "r", encoding="utf-8") as f:
+    captions = json.load(f)
+def extract_image_features(image):
+    """
+    Extract image features using CLIP model.
+    Input: PIL Image or image path (str).
+    Output: Normalized image embedding (numpy array).
+    """
+    try:
+        # Handle both PIL Image and file path
+        if isinstance(image, str):
+            image = Image.open(image).convert("RGB")
+        else:
+            image = image.convert("RGB")
+        inputs = clip_processor(images=image, return_tensors="pt")
+        with torch.no_grad():
+            features = clip_model.get_image_features(**inputs)
+        features = torch.nn.functional.normalize(features, p=2, dim=-1)
+        return features.squeeze(0).cpu().numpy().astype("float32")
+    except Exception as e:
+        print(f"Error extracting features: {e}")
+        return None
+def retrieve_similar_captions(image_embedding, k=5):
+    """
+    Retrieve k most similar captions using FAISS index.
+    Input: Image embedding (numpy array).
+    Output: List of captions.
+    """
+    if image_embedding.ndim == 1:
+        image_embedding = image_embedding.reshape(1, -1)
+    D, I = faiss_index.search(image_embedding, k)
+    return [captions[i] for i in I[0]]
+def extract_location_names(texts):
+    """
+    Extract location names from captions using spaCy.
+    Input: List of captions.
+    Output: List of unique location names.
+    """
+    names = []
+    for text in texts:
+        doc = nlp(text)
+        for ent in doc.ents:
+            if ent.label_ in ["GPE", "LOC", "FAC"]:
+                names.append(ent.text)
+    return list(set(names))
+def generate_caption_from_retrieved(retrieved_captions):
+    """
+    Generate a caption from retrieved captions using T5.
+    Input: List of retrieved captions.
+    Output: Generated caption (str).
+    """
+    locations = extract_location_names(retrieved_captions)
+    location_hint = f"The place might be: {', '.join(locations)}. " if locations else ""
+    prompt = location_hint + " ".join(retrieved_captions) + " Generate a caption with the landmark name:"
+    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
+    outputs = generator.generate(
+        input_ids=inputs.input_ids,
+        attention_mask=inputs.attention_mask,
+        max_length=300,
+        num_beams=5,
+        early_stopping=True
+    )
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
+def generate_rag_caption(image):
+    """
+    Generate a RAG-based caption for an image.
+    Input: PIL Image or image path (str).
+    Output: Caption (str).
+    """
+    embedding = extract_image_features(image)
+    if embedding is None:
+        return "Failed to process image."
+    retrieved = retrieve_similar_captions(embedding, k=5)
+    if not retrieved:
+        return "No similar captions found."
+    return generate_caption_from_retrieved(retrieved)
+def predict(image):
+    """
+    API-compatible function for inference.
+    Input: PIL Image or image file path.
+    Output: Dictionary with caption.
+    """
+    caption = generate_rag_caption(image)
+    return {"caption": caption}

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+transformers>=4.30.0
+sentence-transformers>=2.2.0
+faiss-cpu>=1.7.0
+torch>=2.0.0
+torchvision>=0.15.0
+torchaudio>=2.0.0
+pillow>=9.0.0
+spacy>=3.5.0
+langchain>=0.0.200
+huggingface_hub>=0.15.0