File size: 3,203 Bytes
cc8abd5
 
 
 
 
4d921db
cc8abd5
 
4d921db
 
 
 
 
 
 
 
 
 
 
 
 
cc8abd5
 
 
4d921db
 
cc8abd5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4d921db
cc8abd5
 
4d921db
cc8abd5
 
4d921db
cc8abd5
4d921db
cc8abd5
 
4d921db
 
 
 
cc8abd5
4d921db
 
 
 
 
cc8abd5
 
 
 
 
 
 
 
4d921db
 
 
 
 
 
 
 
cc8abd5
 
 
 
4d921db
 
cc8abd5
 
 
4d921db
 
cc8abd5
 
4d921db
 
 
 
 
cc8abd5
 
 
4d921db
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
import os
import gradio as gr
import pandas as pd
from PIL import Image
import fitz  # PyMuPDF
import torch
from transformers import pipeline

# Model checkpoints, overridable via environment (e.g. HF Space variables).
# GPU default: DiT base fine-tuned on RVL-CDIP (16 document classes).
GPU_MODEL_ID = os.getenv("GPU_MODEL_ID", "microsoft/dit-base-finetuned-rvlcdip")
# CPU default: a tiny ViT variant — lighter weight for CPU-only Spaces.
CPU_MODEL_ID = os.getenv("CPU_MODEL_ID", "HAMMALE/vit-tiny-classifier-rvlcdip")

# Optional override: set FORCE_CPU=1 in Space variables
# (any other value, or unset, means "use GPU when available").
FORCE_CPU = os.getenv("FORCE_CPU", "0") == "1"

def pick_device_and_model():
    """Choose the inference backend.

    Returns:
        tuple: ``(device_index, model_id, backend_name)`` —
        ``(0, GPU_MODEL_ID, "cuda")`` when CUDA is usable and FORCE_CPU is
        not set, otherwise ``(-1, CPU_MODEL_ID, "cpu")``. The device index
        follows the transformers `pipeline` convention (-1 == CPU).
    """
    if FORCE_CPU or not torch.cuda.is_available():
        return -1, CPU_MODEL_ID, "cpu"
    return 0, GPU_MODEL_ID, "cuda"

# Resolve the runtime once at import; exposed so the UI can report it.
DEVICE, ACTIVE_MODEL_ID, ACTIVE_BACKEND = pick_device_and_model()

# Single shared classifier pipeline; loading happens at startup so the
# first user request doesn't pay the model-download/initialization cost.
clf = pipeline(
    task="image-classification",
    model=ACTIVE_MODEL_ID,
    device=DEVICE,
)

def pdf_to_images(pdf_path: str, max_pages: int = 6, dpi: int = 150) -> list:
    """Render the first pages of a PDF to RGB PIL images.

    Args:
        pdf_path: Filesystem path to the PDF.
        max_pages: Upper bound on the number of pages rendered.
        dpi: Render resolution; PDF native scale is 72 dpi, so the zoom
            factor is ``dpi / 72``.

    Returns:
        list[Image.Image]: One RGB image per rendered page (possibly
        empty for a zero-page document).
    """
    doc = fitz.open(pdf_path)
    try:
        zoom = dpi / 72.0
        mat = fitz.Matrix(zoom, zoom)
        images = []
        for i in range(min(len(doc), max_pages)):
            page = doc.load_page(i)
            # alpha=False gives packed RGB samples, which is what
            # Image.frombytes("RGB", ...) expects.
            pix = page.get_pixmap(matrix=mat, alpha=False)
            # PIL's size argument is documented as a (width, height) tuple.
            images.append(
                Image.frombytes("RGB", (pix.width, pix.height), pix.samples)
            )
        return images
    finally:
        # Release the document handle even if rendering raises mid-loop;
        # the original leaked it on any get_pixmap/frombytes failure.
        doc.close()

def run_infer(file_obj, max_pages: int = 6, top_k: int = 5):
    """Classify an uploaded document and return three result frames.

    Args:
        file_obj: Gradio upload — a tempfile-like object with ``.name``,
            or (with some Gradio versions/configs) a plain path string.
        max_pages: Max PDF pages to render and classify.
        top_k: Number of labels to keep per page and in the aggregate.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        (runtime metadata, per-item predictions sorted by item then score,
        top-k labels by summed score across pages).
    """
    # Gradio may hand us either a tempfile wrapper or a bare path string.
    path = file_obj if isinstance(file_obj, str) else file_obj.name
    # Slider values can arrive as floats; downstream APIs expect ints.
    max_pages = int(max_pages)
    top_k = int(top_k)

    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        images = pdf_to_images(path, max_pages=max_pages)
        items = [f"page_{i+1}" for i in range(len(images))]
    else:
        images = [Image.open(path).convert("RGB")]
        items = ["image"]

    rows = []
    agg = {}  # sum scores by label across pages

    for item, img in zip(items, images):
        preds = clf(img, top_k=top_k)
        for p in preds:
            lab = p["label"]
            sc = float(p["score"])
            rows.append({"item": item, "label": lab, "score": sc})
            agg[lab] = agg.get(lab, 0.0) + sc

    if rows:
        per_item = (
            pd.DataFrame(rows)
            .sort_values(["item", "score"], ascending=[True, False])
            .reset_index(drop=True)
        )
        agg_df = (
            pd.DataFrame([{"label": k, "score_sum": v} for k, v in agg.items()])
            .sort_values("score_sum", ascending=False)
            .head(top_k)
            .reset_index(drop=True)
        )
    else:
        # A zero-page PDF yields no rows; pd.DataFrame([]) would have no
        # columns and sort_values(["item", "score"]) would raise KeyError,
        # so return empty frames with the expected schema instead.
        per_item = pd.DataFrame(columns=["item", "label", "score"])
        agg_df = pd.DataFrame(columns=["label", "score_sum"])

    meta = pd.DataFrame([{
        "backend": ACTIVE_BACKEND,
        "model_id": ACTIVE_MODEL_ID,
        "torch_cuda_available": torch.cuda.is_available(),
        "force_cpu": FORCE_CPU,
    }])

    return meta, per_item, agg_df

# Gradio UI: one file upload plus two sliders mapping positionally onto
# run_infer(file_obj, max_pages, top_k); three dataframe outputs mapping
# onto its (meta, per_item, agg_df) return tuple.
demo = gr.Interface(
    fn=run_infer,
    inputs=[
        gr.File(label="Upload PDF / PNG / JPG"),
        gr.Slider(1, 50, value=6, step=1, label="Max PDF pages"),
        gr.Slider(1, 20, value=5, step=1, label="Top-K labels"),
    ],
    outputs=[
        gr.Dataframe(label="Runtime (device/model)"),
        gr.Dataframe(label="Per-page / per-image predictions"),
        gr.Dataframe(label="Aggregated across pages (sum of scores)"),
    ],
    title="Document Type Classifier (GPU-first, CPU fallback)",
    description=(
        "GPU model if available; otherwise CPU model. "
        "Set GPU_MODEL_ID / CPU_MODEL_ID / FORCE_CPU=1 as Space variables."
    ),
)

# Launch only when run as a script; Spaces can also import `demo` directly.
if __name__ == "__main__":
    demo.launch()