pathora-app / app.py
kauzan25's picture
Upload app.py with huggingface_hub
382d48f verified
import gradio as gr
import numpy as np, re, os, joblib
from sentence_transformers import SentenceTransformer
from huggingface_hub import hf_hub_download
import fitz, docx, chardet
# Hugging Face Hub repository holding the serialized classifier artifacts.
MODEL_REPO = "kauzan25/pathora-bert-classifier"

print("Loading...")

# Fetch both artifact files from the model repo before deserializing anything.
model_path, le_path = (
    hf_hub_download(repo_id=MODEL_REPO, filename=artifact, repo_type="model")
    for artifact in (
        "artifacts/sklearn_model.joblib",
        "artifacts/label_encoder.joblib",
    )
)

# NOTE(review): joblib.load deserializes arbitrary Python objects; this is
# presumably acceptable only because MODEL_REPO is a trusted repository.
clf = joblib.load(model_path)  # classifier exposing predict_proba
le = joblib.load(le_path)  # label encoder exposing classes_

# Sentence embedding model used to turn input text into feature vectors.
st = SentenceTransformer("all-MiniLM-L6-v2")

print("Loaded!")
# Compiled once at import time; both are applied on every request.
_TAG_RE = re.compile(r"<[^>]+>")
_WHITESPACE_RE = re.compile(r"\s+")


def _extract_text(path):
    """Return the raw text of the document at *path*, or None if unsupported.

    Supported extensions are .pdf, .doc, .docx and .txt (case-insensitive).
    Parsing errors from the underlying libraries propagate to the caller.
    """
    ext = os.path.splitext(path)[1].lower()
    if ext == ".pdf":
        pdf = fitz.open(path)
        try:
            return "".join(page.get_text() for page in pdf)
        finally:
            # Release the file handle even when a page fails to parse.
            pdf.close()
    if ext in (".doc", ".docx"):
        # NOTE(review): python-docx is documented for .docx; legacy binary
        # .doc files will presumably raise here and surface as a read error.
        return "\n".join(par.text for par in docx.Document(path).paragraphs)
    if ext == ".txt":
        with open(path, "rb") as handle:
            raw = handle.read()
        # Fall back to UTF-8 when chardet cannot guess an encoding.
        encoding = chardet.detect(raw)["encoding"] or "utf-8"
        return raw.decode(encoding, errors="ignore")
    return None


def _clean_text(text):
    """Replace HTML-like tags with spaces and collapse whitespace runs."""
    without_tags = _TAG_RE.sub(" ", text)
    return _WHITESPACE_RE.sub(" ", without_tags).strip()


def process(file, text_input):
    """Classify an uploaded document or pasted text and rank the top labels.

    Args:
        file: Uploaded file, or None. Either an object whose ``name``
            attribute is the path on disk, or the path itself.
        text_input: Text pasted by the user; used only when *file* is None.

    Returns:
        A ``(markdown, payload)`` tuple. On success, ``markdown`` lists the
        best label followed by up to five ranked labels with confidences,
        and ``payload`` is ``{"predictions": [...]}``. On any input problem,
        ``markdown`` is a short message and ``payload`` is None.
    """
    if file is not None:
        # Accept both a file wrapper exposing .name and a bare path.
        path = getattr(file, "name", file)
        try:
            text = _extract_text(path)
        except Exception as exc:  # UI boundary: report instead of crashing.
            print("Failed to read {!r}: {}".format(path, exc))
            return "Could not read file.", None
        if text is None:
            return "Unsupported.", None
    elif text_input and text_input.strip():
        text = text_input
    else:
        return "Upload file or enter text.", None

    # Clean first, then validate: markup or whitespace must not count
    # towards the minimum length.
    text = _clean_text(text)
    if len(text) < 20:
        return "Text too short.", None

    embedding = st.encode([text], convert_to_numpy=True)
    probs = clf.predict_proba(embedding)[0]
    # Indices of the five highest probabilities, best first.
    top = np.argsort(probs)[::-1][:5]
    # NOTE(review): assumes probability column i corresponds to
    # le.classes_[i] -- confirm against how the classifier was trained.
    results = [
        {"category": le.classes_[i], "confidence": float(probs[i])}
        for i in top
    ]

    best = results[0]
    lines = [
        "### Hasil: **{}** ({:.1f}%)".format(
            best["category"], best["confidence"] * 100
        )
    ]
    lines.extend(
        "{}. {} ({:.1f}%)".format(rank, item["category"], item["confidence"] * 100)
        for rank, item in enumerate(results, start=1)
    )
    return "\n".join(lines), {"predictions": results}
# Single-page UI: a file upload or a text box feeds `process`, whose two
# return values are rendered as Markdown and JSON respectively.
with gr.Blocks(title="PathOra") as demo:
    gr.Markdown("# PathOra - Career Prediction")

    # Inputs: the file takes precedence over pasted text inside `process`.
    f_in = gr.File(
        label="File",
        file_types=[".pdf", ".doc", ".docx", ".txt"],
    )
    t_in = gr.Textbox(label="Or paste text", lines=3)

    btn = gr.Button("Analyze")

    # Outputs: human-readable summary and the raw prediction payload.
    out_md = gr.Markdown()
    out_json = gr.JSON()

    btn.click(
        fn=process,
        inputs=[f_in, t_in],
        outputs=[out_md, out_json],
    )

demo.launch()