"""Multimodal RAG FastAPI service.

CPU-friendly endpoints combining Qwen text generation with BLIP image
captioning, optionally traced via LangSmith.
"""

import io
import os

# ✅ Always use a writable cache dir in Hugging Face Spaces.
# BUGFIX: this must be set BEFORE importing transformers/huggingface_hub —
# the cache path is resolved at import time, so setting it after the import
# (as the original code did) had no effect.
os.environ["HF_HOME"] = "/tmp/huggingface_cache"

# 🧩 LangSmith Integration (Optional)
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "Multimodal_RAG_App"
# SECURITY: hardcoded API key committed to source — this key is now leaked and
# should be rotated; load it from the environment or a secrets manager instead.
os.environ["LANGCHAIN_API_KEY"] = "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"

from fastapi import FastAPI, File, Form, UploadFile
from langsmith import traceable
from PIL import Image
from transformers import pipeline

app = FastAPI(title="🚀 Multimodal RAG API (CPU-friendly)")

# ------------------ Load Pipelines ------------------

# 🧠 Lightweight Qwen model (0.5B params - perfect for CPU!)
text_pipeline = pipeline(
    "text-generation",
    model="Qwen/Qwen2.5-0.5B-Instruct",  # instruct version for better responses
    device=-1,  # -1 means CPU in the pipeline API
    trust_remote_code=True,
)

# 👁️ Image-to-text (multimodal captioning)
image_pipeline = pipeline(
    "image-to-text",
    model="Salesforce/blip-image-captioning-base",
    device=-1,  # Force CPU
)


@app.get("/")
def home():
    """Health-check endpoint."""
    return {"message": "✅ Multimodal RAG API is running successfully!"}


# ------------------ 1️⃣ Summarize Text ------------------
# BUGFIX (all three endpoints): the route decorator must be applied LAST
# (i.e. written on top). Decorators apply bottom-up, so with @traceable above
# @app.post the route registered the *untraced* function and LangSmith never
# saw any request. With this order, traceable wraps the function first and
# the traced wrapper is what gets registered.
@app.post("/summarize_qwen")
@traceable(name="Qwen_Text_Summarization")
def summarize_text(prompt: str = Form(...)):
    """Generate a Qwen completion for *prompt*.

    Returns {"response": <generated text>} on success, or an
    {"error", "details"} dict on failure (boundary handler — the API
    reports errors rather than crashing).
    """
    try:
        result = text_pipeline(
            prompt,
            max_new_tokens=200,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        return {"response": result[0]["generated_text"]}
    except Exception as e:
        return {"error": str(e), "details": "Text summarization failed"}


# ------------------ 2️⃣ Image + Text Summarizer ------------------
@app.post("/summarize_smol")
@traceable(name="BLIP_Image_Text_Summarization")
async def summarize_smol(text: str = Form(...), image: UploadFile = File(None)):
    """Echo *text*, appending a BLIP caption when an image is uploaded."""
    try:
        if image:
            image_bytes = await image.read()
            pil_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
            img_result = image_pipeline(pil_image)
            response = f"{text}\n\n🖼️ Image Summary: {img_result[0]['generated_text']}"
        else:
            response = text
        return {"response": response}
    except Exception as e:
        return {"error": str(e), "details": "Image processing failed"}


# ------------------ 3️⃣ Final Answer (Text + Multiple Images) ------------------
@app.post("/final_answer")
@traceable(name="Final_Multimodal_Answer")
async def final_answer(
    context: str = Form(...),
    question: str = Form(...),
    images: list[UploadFile] = File(None),
):
    """Answer *question* from *context*, enriched with captions of any uploaded images."""
    try:
        combined_context = context

        # Append a BLIP caption per uploaded image so Qwen can reason over them.
        if images:
            for img_file in images:
                img_bytes = await img_file.read()
                pil_image = Image.open(io.BytesIO(img_bytes)).convert("RGB")
                img_result = image_pipeline(pil_image)
                combined_context += f"\n\n🖼️ Image Info: {img_result[0]['generated_text']}"

        # Create prompt for Qwen
        final_prompt = f"Context: {combined_context}\n\nQuestion: {question}\n\nAnswer:"
        result = text_pipeline(
            final_prompt,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.7,
            top_p=0.9,
        )
        return {"response": result[0]["generated_text"]}
    except Exception as e:
        return {"error": str(e), "details": "Final answer generation failed"}