Commit 1a1604b
Parent(s): d19baf9

Add slide extraction server

Files changed:
- app.py +107 -0
- requirements.txt +8 -0
app.py (ADDED)
@@ -0,0 +1,107 @@
import io, os
from typing import Dict, List, Any
import gradio as gr
from PIL import Image
import pytesseract
import pdfplumber
from pptx import Presentation  # pip: python-pptx
from pptx.enum.shapes import MSO_SHAPE_TYPE
from transformers import BlipProcessor, BlipForConditionalGeneration
import torch

# --------- Image Caption Model (BLIP base) -----------
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
blip_model = BlipForConditionalGeneration.from_pretrained(
    "Salesforce/blip-image-captioning-base",
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device).eval()

def _caption_image(img: Image.Image) -> str:
    """Run BLIP to caption a PIL image."""
    inputs = processor(img.convert("RGB"), return_tensors="pt")
    # Match the model's device and dtype (fp16 on GPU, fp32 on CPU).
    pixel_values = inputs["pixel_values"].to(device=device, dtype=blip_model.dtype)
    with torch.no_grad():
        out = blip_model.generate(pixel_values=pixel_values)
    return processor.decode(out[0], skip_special_tokens=True)

# --------- Core analysis function -----------
def analyze_slidepack(file: str) -> Dict[str, Any]:
    """
    Extract **all** text + AI-generated image captions from a PPTX or PDF.

    Args:
        file (str): Path to any `.pptx` or `.pdf` uploaded by the user/agent.

    Returns:
        dict: {
            "file_name": str,
            "slides": [
                {
                    "slide_index": int,
                    "textBlocks": List[str],
                    "imageCaptions": List[str]
                }, ...
            ]
        }
    """
    fname = os.path.basename(file)
    slides_out: List[Dict[str, Any]] = []

    # ---------- PPTX ----------
    if fname.lower().endswith(".pptx"):
        pres = Presentation(file)
        for idx, slide in enumerate(pres.slides, start=1):
            texts, caps = [], []
            for shape in slide.shapes:
                # Collect text
                if hasattr(shape, "text"):
                    text = shape.text.strip()
                    if text:
                        texts.append(text)
                # Collect images
                if shape.shape_type == MSO_SHAPE_TYPE.PICTURE:
                    img = Image.open(io.BytesIO(shape.image.blob))
                    caps.append(_caption_image(img))
            slides_out.append({
                "slide_index": idx,
                "textBlocks": texts,
                "imageCaptions": caps
            })

    # ---------- PDF ----------
    elif fname.lower().endswith(".pdf"):
        with pdfplumber.open(file) as pdf:
            for idx, page in enumerate(pdf.pages, start=1):
                texts = [page.extract_text() or ""]
                caps = []
                # Render page to image for captioning & OCR
                img = page.to_image(resolution=200).original
                caps.append(_caption_image(img))
                # OCR any text that extract_text missed (diagrams)
                ocr_text = pytesseract.image_to_string(img)
                if ocr_text.strip():
                    texts.append(ocr_text)
                slides_out.append({
                    "slide_index": idx,
                    "textBlocks": [t for t in texts if t.strip()],
                    "imageCaptions": caps
                })

    else:
        raise gr.Error("Unsupported file type. Upload a .pptx or .pdf.")

    return {"file_name": fname, "slides": slides_out}

# --------- Gradio Interface -----------
demo = gr.Interface(
    fn=analyze_slidepack,
    inputs=gr.File(label="Upload PPTX or PDF", type="filepath"),
    outputs=gr.JSON(),
    title="Slide-Pack Full Extractor",
    description=(
        "Returns **every** text fragment and BLIP-generated image caption in JSON. "
        "No summarisation; perfect for downstream quiz agents."
    )
)

if __name__ == "__main__":
    demo.launch(mcp_server=True)
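For reference, a downstream agent can call the endpoint above with the gradio_client package once the Space (or a local `python app.py`) is running. The sketch below is illustrative only; the local URL and the deck filename are assumptions, not part of this commit.

from gradio_client import Client, handle_file

# Point the client at the running server (assumed local default port here;
# use "<user>/<space-name>" for a deployed Space instead).
client = Client("http://127.0.0.1:7860")

# "/predict" is the default api_name exposed by gr.Interface.
result = client.predict(handle_file("my_deck.pptx"), api_name="/predict")
print(result)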
requirements.txt (ADDED)
@@ -0,0 +1,8 @@
gradio[mcp]
python-pptx
pdfplumber
pillow
pytesseract
torch>=2.2,<3.0
torchvision
transformers
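Note: pytesseract is only a wrapper; the Tesseract OCR binary itself is not installed by pip. On a Hugging Face Space the usual fix is an apt packages.txt next to requirements.txt. The file below is a suggested addition, not part of this commit.

packages.txt (suggested)
tesseract-ocr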