# mcp-slidedeck / app.py
# (Hugging Face Spaces page header, preserved as a comment:
#  author Jorick-python — "Update app.py with latest MCP-compatible changes",
#  commit c99a3fd, raw / history / blame, 3.79 kB)
# app.py (complete and updated)
import io, os, json
from typing import Dict, List, Any
import gradio as gr
from fastapi import FastAPI, UploadFile
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
import pytesseract
import pdfplumber
from pptx import Presentation
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch
import uvicorn
# --------- Image Caption Model (BLIP base) -----------
# Pick the device once; fp16 only makes sense on CUDA.
_DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
# Fix: the original requested float16 weights when CUDA was available but never
# moved the model off the CPU, leaving fp16 weights on a device where fp16
# generate is slow or unsupported. .to(_DEVICE) places the model explicitly.
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16 if _DEVICE == "cuda" else torch.float32
).to(_DEVICE).eval()
def _caption_image(img: Image.Image) -> str:
    """Run BLIP on a PIL image and return the generated caption string.

    The image is converted to RGB first (BLIP's processor expects 3-channel
    input). Inference runs under ``torch.no_grad()``.
    """
    inputs = processor(img.convert("RGB"), return_tensors="pt")
    with torch.no_grad():
        # Move every tensor to the model's device, and additionally cast
        # floating-point tensors (pixel_values) to the model's dtype.
        # Fix: the original only moved device; fp32 inputs against an fp16
        # model raise a dtype-mismatch error on GPU.
        batch = {
            k: (v.to(blip_model.device, dtype=blip_model.dtype)
                if torch.is_floating_point(v)
                else v.to(blip_model.device))
            for k, v in inputs.items()
        }
        out = blip_model.generate(**batch)
    return processor.decode(out[0], skip_special_tokens=True)
# --------- Core analysis function -----------
# Value of python-pptx MSO_SHAPE_TYPE.PICTURE; named to avoid a magic number.
_PICTURE_SHAPE_TYPE = 13


def analyze_slidepack(file: Any) -> Dict[str, Any]:
    """Extract every text fragment and image caption from a slide pack.

    Parameters
    ----------
    file:
        Any object exposing a ``.name`` attribute that points at a local
        ``.pptx`` or ``.pdf`` file (e.g. a Gradio upload).

    Returns
    -------
    ``{"file_name": ..., "slides": [...]}`` where each slide entry carries a
    1-based ``slide_index``, a ``textBlocks`` list and an ``imageCaptions``
    list.

    Raises
    ------
    gr.Error
        If the file extension is neither ``.pptx`` nor ``.pdf``.
    """
    fname = os.path.basename(file.name)
    slides_out: List[Dict[str, Any]] = []
    # ---------- PPTX ----------
    if fname.lower().endswith(".pptx"):
        pres = Presentation(file.name)
        for idx, slide in enumerate(pres.slides, start=1):
            texts, caps = [], []
            for shape in slide.shapes:
                # Text frames: titles, body placeholders, text boxes.
                if hasattr(shape, "text"):
                    text = shape.text.strip()
                    if text:
                        texts.append(text)
                # Embedded pictures -> BLIP caption.
                if shape.shape_type == _PICTURE_SHAPE_TYPE:
                    img = Image.open(io.BytesIO(shape.image.blob))
                    caps.append(_caption_image(img))
            slides_out.append({
                "slide_index": idx,
                "textBlocks": texts,
                "imageCaptions": caps
            })
    # ---------- PDF ----------
    elif fname.lower().endswith(".pdf"):
        with pdfplumber.open(file.name) as pdf:
            for idx, page in enumerate(pdf.pages, start=1):
                texts = [page.extract_text() or ""]
                caps = []
                # Render the whole page to an image, caption it, and OCR it
                # to pick up text embedded in figures or scanned pages.
                img = page.to_image(resolution=200).original
                caps.append(_caption_image(img))
                ocr_text = pytesseract.image_to_string(img)
                if ocr_text.strip():
                    texts.append(ocr_text)
                slides_out.append({
                    "slide_index": idx,
                    "textBlocks": [t for t in texts if t.strip()],
                    "imageCaptions": caps
                })
    else:
        raise gr.Error("Unsupported file type. Upload a .pptx or .pdf.")
    return {"file_name": fname, "slides": slides_out}
# --------- Gradio Interface -----------
# Single-input UI: upload one slide deck, receive the full extraction as JSON.
demo = gr.Interface(
    fn=analyze_slidepack,
    inputs=gr.File(label="Upload PPTX or PDF"),
    outputs=gr.JSON(),
    title="Slide-Pack Full Extractor",
    description=(
        "Returns **every** text fragment and BLIP-generated image caption in JSON. "
        "No summarisation – perfect for downstream quiz agents."
    )
)
# --------- FastAPI Tool Endpoint -----------
api = FastAPI()
# NOTE(review): the CORS spec forbids a wildcard origin together with
# credentials; Starlette's CORSMiddleware will not echo "*" for credentialed
# requests. Confirm whether allow_credentials=True is actually needed here,
# or list explicit origins instead.
api.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
@api.post("/extract_slidepack")
async def extract_slidepack(file: UploadFile):
    """Persist the uploaded deck to /tmp and run ``analyze_slidepack`` on it.

    Returns the same JSON structure as ``analyze_slidepack``.
    """
    from types import SimpleNamespace  # stdlib; local import keeps top-of-file untouched

    # Fix: file.filename is client-controlled; basename() strips any directory
    # components so a name like "../../etc/passwd" cannot escape /tmp.
    path = os.path.join("/tmp", os.path.basename(file.filename))
    with open(path, "wb") as f:
        f.write(await file.read())
    # analyze_slidepack only needs an object with a .name attribute; pass a
    # proper instance instead of the original's dynamically built class object.
    return analyze_slidepack(SimpleNamespace(name=path))
if __name__ == "__main__":
    # Launches only the Gradio app (with its MCP server enabled) on port 7860.
    # NOTE(review): the FastAPI `api` defined above is never mounted or served
    # here, and the `uvicorn` import is unused — confirm whether /extract_slidepack
    # was meant to be exposed (e.g. via uvicorn.run(api, ...) or by mounting).
    demo.launch(mcp_server=True, server_name="0.0.0.0", server_port=7860)