# app.py (complete and updated)
#
# Slide-pack extractor: returns every text fragment and a BLIP caption for
# every image in an uploaded .pptx or .pdf, as JSON. Exposed both as a
# Gradio UI (with MCP server) and as a FastAPI tool endpoint.
import io
import json
import os
import tempfile
from types import SimpleNamespace
from typing import Any, Dict, List

import gradio as gr
import pdfplumber
import pytesseract
import torch
import uvicorn
from fastapi import FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
from pptx import Presentation
from pptx.enum.shapes import MSO_SHAPE_TYPE
from transformers import BlipProcessor, BlipForConditionalGeneration

# --------- Image Caption Model (BLIP base) -----------
# Pick the device once. NOTE: the original loaded fp16 weights when CUDA was
# available but never moved the model off the CPU — half precision on CPU is
# unsupported/extremely slow, so generate() would have failed. Load, move,
# then freeze in eval mode.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = (
    BlipForConditionalGeneration.from_pretrained(
        "Salesforce/blip-image-captioning-base",
        torch_dtype=torch.float16 if _DEVICE == "cuda" else torch.float32,
    )
    .to(_DEVICE)
    .eval()
)


def _caption_image(img: Image.Image) -> str:
    """Run BLIP on a PIL image and return the decoded caption string."""
    inputs = processor(img.convert("RGB"), return_tensors="pt")
    with torch.no_grad():
        out = blip_model.generate(
            # Move each input tensor to wherever the model lives.
            **{k: v.to(blip_model.device) for k, v in inputs.items()}
        )
    return processor.decode(out[0], skip_special_tokens=True)


# --------- Core analysis helpers -----------
def _extract_pptx(path: str) -> List[Dict[str, Any]]:
    """Extract text blocks and image captions from every slide of a .pptx."""
    slides_out: List[Dict[str, Any]] = []
    pres = Presentation(path)
    for idx, slide in enumerate(pres.slides, start=1):
        texts: List[str] = []
        caps: List[str] = []
        for shape in slide.shapes:
            if hasattr(shape, "text"):
                text = shape.text.strip()
                if text:
                    texts.append(text)
            # Named enum instead of the magic constant 13.
            if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                try:
                    img = Image.open(io.BytesIO(shape.image.blob))
                    caps.append(_caption_image(img))
                except Exception:
                    # Some embedded formats (e.g. WMF/EMF) cannot be decoded
                    # by Pillow; skip that caption rather than abort the run.
                    continue
        slides_out.append(
            {"slide_index": idx, "textBlocks": texts, "imageCaptions": caps}
        )
    return slides_out


def _extract_pdf(path: str) -> List[Dict[str, Any]]:
    """Extract embedded text, OCR text, and a page-render caption per PDF page."""
    slides_out: List[Dict[str, Any]] = []
    with pdfplumber.open(path) as pdf:
        for idx, page in enumerate(pdf.pages, start=1):
            texts = [page.extract_text() or ""]
            # Render the page once and reuse the raster for caption + OCR.
            img = page.to_image(resolution=200).original
            caps = [_caption_image(img)]
            ocr_text = pytesseract.image_to_string(img)
            if ocr_text.strip():
                texts.append(ocr_text)
            slides_out.append(
                {
                    "slide_index": idx,
                    "textBlocks": [t for t in texts if t.strip()],
                    "imageCaptions": caps,
                }
            )
    return slides_out


def analyze_slidepack(file: Any) -> Dict[str, Any]:
    """Dispatch on file extension and return the full extraction payload.

    ``file`` is any object exposing a ``.name`` attribute that holds a
    filesystem path (Gradio's upload object, or the SimpleNamespace built
    by the FastAPI route below).

    Returns ``{"file_name": ..., "slides": [...]}``.
    Raises ``gr.Error`` for unsupported extensions.
    """
    path = file.name
    fname = os.path.basename(path)
    lower = fname.lower()
    if lower.endswith(".pptx"):
        slides = _extract_pptx(path)
    elif lower.endswith(".pdf"):
        slides = _extract_pdf(path)
    else:
        raise gr.Error("Unsupported file type. Upload a .pptx or .pdf.")
    return {"file_name": fname, "slides": slides}


# --------- Gradio Interface -----------
demo = gr.Interface(
    fn=analyze_slidepack,
    inputs=gr.File(label="Upload PPTX or PDF"),
    outputs=gr.JSON(),
    title="Slide-Pack Full Extractor",
    description=(
        "Returns **every** text fragment and BLIP-generated image caption in JSON. "
        "No summarisation – perfect for downstream quiz agents."
    ),
)

# --------- FastAPI Tool Endpoint -----------
api = FastAPI()
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)


@api.post("/extract_slidepack")
async def extract_slidepack(file: UploadFile):
    """Save the upload to a private temp file and run the extractor.

    basename() strips any directory components from the client-supplied
    filename (path-traversal hardening — the original interpolated the raw
    filename into ``/tmp/...``); NamedTemporaryFile avoids collisions
    between concurrent uploads. The temp file is always removed.
    """
    safe_name = os.path.basename(file.filename or "upload")
    suffix = os.path.splitext(safe_name)[1]
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp:
        tmp.write(await file.read())
        path = tmp.name
    try:
        result = analyze_slidepack(SimpleNamespace(name=path))
        # Report the client's (sanitised) name, not the temp-file name.
        result["file_name"] = safe_name
        return result
    finally:
        os.remove(path)


if __name__ == "__main__":
    demo.launch(mcp_server=True, server_name="0.0.0.0", server_port=7860)