# Multi_modal / app.py — Hugging Face Space source
# Uploaded by Sameer-Handsome173 ("Update app.py", commit e1dae85, verified)
import os

# ✅ Writable cache dir for Hugging Face Spaces.
# NOTE: this must be set BEFORE `transformers` is imported — the library
# resolves its cache paths at import time, so setting HF_HOME afterwards
# (as the original code did) has no effect.
os.environ.setdefault("HF_HOME", "/tmp/huggingface_cache")

import io

from fastapi import FastAPI, File, Form, UploadFile
from PIL import Image
from transformers import pipeline

# 🧩 LangSmith Integration (Optional)
# SECURITY: the API key must come from the environment / Space secrets —
# never hard-code credentials in source control. Tracing is only enabled
# when a key is actually configured.
if os.environ.get("LANGCHAIN_API_KEY"):
    os.environ.setdefault("LANGCHAIN_TRACING_V2", "true")
    os.environ.setdefault("LANGCHAIN_PROJECT", "Multimodal_RAG_App")
from langsmith import traceable

app = FastAPI(title="🚀 Multimodal RAG API (CPU-friendly)")
# ------------------ Load Pipelines ------------------
# Both pipelines are built once at import time and pinned to CPU
# (device=-1 in the pipeline API).

# 🧠 Text generation: lightweight Qwen instruct model (~0.5B params),
# small enough to run comfortably without a GPU.
text_pipeline = pipeline(
    task="text-generation",
    model="Qwen/Qwen2.5-0.5B-Instruct",  # instruct variant for better responses
    device=-1,
    trust_remote_code=True,
)

# 👁️ Image captioning (image-to-text) with BLIP.
image_pipeline = pipeline(
    task="image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=-1,
)
@app.get("/")
def home():
    """Health-check endpoint confirming the API is up."""
    status_message = "✅ Multimodal RAG API is running successfully!"
    return {"message": status_message}
# ------------------ 1️⃣ Summarize Text ------------------
# ------------------ 1️⃣ Summarize Text ------------------
# BUGFIX: `@app.post` must be the OUTERMOST decorator. Decorators apply
# bottom-up, so with `@traceable` on top the route was registered with the
# unwrapped function and LangSmith tracing never ran on requests.
@app.post("/summarize_qwen")
@traceable(name="Qwen_Text_Summarization")
def summarize_text(prompt: str = Form(...)):
    """Generate a response for *prompt* with the Qwen text pipeline.

    Returns a JSON object with a "response" key on success, or an
    "error"/"details" pair when generation fails.
    """
    try:
        result = text_pipeline(
            prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            # Return only the newly generated text — without this the
            # pipeline echoes the prompt back in front of the completion.
            return_full_text=False,
        )
        return {"response": result[0]["generated_text"]}
    except Exception as e:  # boundary handler: surface failure as JSON
        return {"error": str(e), "details": "Text summarization failed"}
# ------------------ 2️⃣ Image + Text Summarizer ------------------
# BUGFIX: `@app.post` must be the OUTERMOST decorator so the route is
# registered with the traceable-wrapped handler (decorators apply bottom-up).
@app.post("/summarize_smol")
@traceable(name="BLIP_Image_Text_Summarization")
async def summarize_smol(text: str = Form(...), image: UploadFile = File(None)):
    """Caption an optional uploaded image and append it to *text*.

    When *image* is provided, its BLIP caption is appended to the text;
    otherwise the text is returned unchanged. Returns a JSON object with a
    "response" key, or an "error"/"details" pair on failure.
    """
    try:
        if image:
            image_bytes = await image.read()
            # Force RGB — BLIP expects 3-channel input (uploads may be RGBA/L).
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            img_result = image_pipeline(pil_image)
            response = f"{text}\n\n🖼️ Image Summary: {img_result[0]['generated_text']}"
        else:
            response = text
        return {"response": response}
    except Exception as e:  # boundary handler: surface failure as JSON
        return {"error": str(e), "details": "Image processing failed"}
# ------------------ 3️⃣ Final Answer (Text + Multiple Images) ------------------
# BUGFIX: `@app.post` must be the OUTERMOST decorator so the route is
# registered with the traceable-wrapped handler (decorators apply bottom-up).
@app.post("/final_answer")
@traceable(name="Final_Multimodal_Answer")
async def final_answer(
    context: str = Form(...),
    question: str = Form(...),
    images: list[UploadFile] = File(None)
):
    """Answer *question* from *context* plus captions of any uploaded images.

    Each image is captioned with BLIP and the captions are appended to the
    context before prompting the Qwen pipeline. Returns a JSON object with
    a "response" key, or an "error"/"details" pair on failure.
    """
    try:
        combined_context = context
        # Enrich the context with a caption per uploaded image, if any.
        if images:
            for img_file in images:
                img_bytes = await img_file.read()
                # Force RGB — BLIP expects 3-channel input.
                pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                img_result = image_pipeline(pil_image)
                combined_context += f"\n\n🖼️ Image Info: {img_result[0]['generated_text']}"
        # Simple QA prompt for the Qwen instruct model.
        final_prompt = f"Context: {combined_context}\n\nQuestion: {question}\n\nAnswer:"
        result = text_pipeline(
            final_prompt,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
            # Return only the answer — without this the pipeline echoes the
            # full prompt (context + question) back in front of it.
            return_full_text=False,
        )
        return {"response": result[0]["generated_text"]}
    except Exception as e:  # boundary handler: surface failure as JSON
        return {"error": str(e), "details": "Final answer generation failed"}