Spaces:

raahinaez
/

doc_classifictaion_tinybert

Runtime error

App Files Files Community

raahinaez committed on Feb 2

Commit

7c1270e

verified ·

1 Parent(s): 783ec1e

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -0

app.py CHANGED Viewed

	@@ -0,0 +1,81 @@

+import streamlit as st
+import fitz  # PyMuPDF
+import torch
+from transformers import AutoTokenizer, AutoModelForSequenceClassification
+from peft import PeftModel
+import json
+# -----------------------
+# CONFIG
+# -----------------------
+MODEL_NAME = "prajjwal1/bert-tiny"
+LORA_PATH = "./lora_adapter"
+LABEL_FILE = "./lora_adapter/label_map.json"
+# -----------------------
+# LOAD LABEL MAP
+# -----------------------
+with open(LABEL_FILE, "r") as f:
+    label_map = json.load(f)
+id2label = {int(k): v for k, v in label_map.items()}
+# -----------------------
+# LOAD MODEL
+# -----------------------
+@st.cache_resource
+def load_model():
+    base_model = AutoModelForSequenceClassification.from_pretrained(
+        MODEL_NAME,
+        num_labels=len(id2label),
+        id2label=id2label
+    )
+    model = PeftModel.from_pretrained(base_model, LORA_PATH)
+    tokenizer = AutoTokenizer.from_pretrained(LORA_PATH)
+    model.eval()
+    return model, tokenizer
+model, tokenizer = load_model()
+# -----------------------
+# PDF TEXT EXTRACTION
+# -----------------------
+def extract_text_from_pdf(uploaded_file):
+    doc = fitz.open(stream=uploaded_file.read(), filetype="pdf")
+    text = ""
+    for page in doc:
+        text += page.get_text()
+    return text.strip()
+# -----------------------
+# STREAMLIT UI
+# -----------------------
+st.set_page_config(page_title="Document Classifier", layout="centered")
+st.title("📄 Document Classification App")
+st.write("Upload a PDF and classify the document type.")
+uploaded_file = st.file_uploader("Upload PDF", type=["pdf"])
+if uploaded_file:
+    with st.spinner("Extracting text..."):
+        text = extract_text_from_pdf(uploaded_file)
+    if len(text) < 20:
+        st.error("Not enough text extracted from PDF.")
+    else:
+        with st.spinner("Classifying document..."):
+            inputs = tokenizer(
+                text,
+                return_tensors="pt",
+                truncation=True,
+                max_length=256
+            )
+            with torch.no_grad():
+                outputs = model(**inputs)
+            pred_id = torch.argmax(outputs.logits, dim=-1).item()
+            prediction = model.config.id2label[pred_id]
+        st.success(f"✅ Predicted Document Type: **{prediction.upper()}**")