"""PathOra: Gradio demo that predicts career categories from a document or text.

On import the script downloads a scikit-learn classifier and its label encoder
from the Hugging Face Hub, loads a sentence-embedding model, builds the UI and
launches it.
"""

import os
import re

import chardet
import docx
import fitz
import gradio as gr
import joblib
import numpy as np
from huggingface_hub import hf_hub_download
from sentence_transformers import SentenceTransformer

MODEL_REPO = "kauzan25/pathora-bert-classifier"
EMBEDDER_NAME = "all-MiniLM-L6-v2"
TOP_K = 5  # number of ranked predictions shown to the user
MIN_TEXT_LENGTH = 20  # minimum number of characters required after cleaning

# Compiled once; both patterns are applied on every request.
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")

print("Loading...")
model_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename="artifacts/sklearn_model.joblib",
    repo_type="model",
)
le_path = hf_hub_download(
    repo_id=MODEL_REPO,
    filename="artifacts/label_encoder.joblib",
    repo_type="model",
)
# SECURITY NOTE(review): joblib.load unpickles the downloaded artifacts, which
# can execute arbitrary code. Only safe while MODEL_REPO is fully trusted.
clf = joblib.load(model_path)
le = joblib.load(le_path)
st = SentenceTransformer(EMBEDDER_NAME)
print("Loaded!")


def _read_file(path, ext):
    """Return the raw text stored in the file at *path*.

    Args:
        path: Filesystem path of the uploaded file.
        ext: Lower-cased file extension including the leading dot.

    Returns:
        The extracted text, or ``None`` when *ext* is not a supported type
        (no file access happens in that case).
    """
    if ext == ".pdf":
        pdf = fitz.open(path)
        try:
            return "".join(page.get_text() for page in pdf)
        finally:
            # Close the document even when text extraction fails.
            pdf.close()
    if ext in (".doc", ".docx"):
        # NOTE(review): both extensions go through python-docx; legacy binary
        # .doc files are presumably rejected by it -- the caller reports that
        # failure to the user instead of crashing.
        document = docx.Document(path)
        return "\n".join(paragraph.text for paragraph in document.paragraphs)
    if ext == ".txt":
        with open(path, "rb") as handle:
            raw = handle.read()
        # chardet may report no encoding at all; fall back to UTF-8 then.
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        return raw.decode(encoding, errors="ignore")
    return None


def process(file, text_input):
    """Classify an uploaded file or pasted text and format the top predictions.

    An uploaded file takes precedence over *text_input*.

    Args:
        file: Upload handed over by the Gradio ``File`` component, or ``None``.
            Either a path string or an object exposing the path as ``.name``.
        text_input: Text pasted into the textbox; may be empty or ``None``.

    Returns:
        A ``(markdown, payload)`` tuple. On success ``markdown`` is the
        formatted ranking and ``payload`` is ``{"predictions": [...]}`` with
        one ``{"category", "confidence"}`` dict per ranked class. On any
        input problem ``markdown`` is an error message and ``payload`` is
        ``None``.
    """
    if file is not None:
        # Accept both a plain path string and a wrapper exposing `.name`.
        path = getattr(file, "name", file)
        ext = os.path.splitext(path)[1].lower()
        try:
            text = _read_file(path, ext)
        except Exception as exc:  # UI boundary: report instead of crashing
            return "Could not read file: {}".format(exc), None
        if text is None:
            return "Unsupported.", None
    elif text_input and text_input.strip():
        text = text_input
    else:
        return "Upload file or enter text.", None

    # Strip markup and collapse whitespace BEFORE validating the length, so
    # input that is mostly tags or blanks is rejected rather than embedded.
    text = _TAG_RE.sub(" ", text)
    text = _WHITESPACE_RE.sub(" ", text).strip()
    if len(text) < MIN_TEXT_LENGTH:
        return "Text too short.", None

    embedding = st.encode([text], convert_to_numpy=True)
    probs = clf.predict_proba(embedding)[0]
    # Indices of the highest probabilities, best first.
    top = np.argsort(probs)[::-1][:TOP_K]
    results = [
        {"category": le.classes_[i], "confidence": float(probs[i])}
        for i in top
    ]

    best = results[0]
    lines = [
        "### Hasil: **{}** ({:.1f}%)".format(
            best["category"], best["confidence"] * 100
        )
    ]
    lines.extend(
        "{}. {} ({:.1f}%)".format(rank, item["category"], item["confidence"] * 100)
        for rank, item in enumerate(results, start=1)
    )
    return "\n".join(lines), {"predictions": results}


with gr.Blocks(title="PathOra") as demo:
    gr.Markdown("# PathOra - Career Prediction")
    f_in = gr.File(label="File", file_types=[".pdf", ".doc", ".docx", ".txt"])
    t_in = gr.Textbox(label="Or paste text", lines=3)
    btn = gr.Button("Analyze")
    out_md = gr.Markdown()
    out_json = gr.JSON()
    btn.click(process, [f_in, t_in], [out_md, out_json])

demo.launch()