# cv-matcher-app / main.py
# lilcoderi's picture
# Update main.py
# 4f67c97 verified
import asyncio
import io
import os
import re
import unicodedata

import PyPDF2
import torch
from fastapi import FastAPI, UploadFile, File, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from sentence_transformers import SentenceTransformer, util
# The ASGI application object; the route decorators below register on it.
app = FastAPI()

# ==============================
# CORS (allow everything, for testing)
# ==============================
# Wide-open policy: every origin, HTTP method and request header is accepted.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# ==============================
# LOAD MODEL
# ==============================
# Identifier handed to SentenceTransformer to locate the embedding model.
REPO_ID = "lilcoderi/cv-matcher-model"

# Minimum cosine similarity for a CV to be reported as "Cocok" (a match).
THRESHOLD = 0.59

# Load the model once at import time; abort start-up if it cannot be loaded.
# The error message is Indonesian for "Failed to load model".
try:
    model = SentenceTransformer(REPO_ID)
    model.eval()  # switch the model to evaluation mode
except Exception as exc:
    raise RuntimeError(f"Gagal load model: {str(exc)}")
# ==============================
# REGEX OPTIMIZED
# ==============================
# Bullet / list-marker glyphs (plus '-' and '*'); clean_text replaces each with a space.
RE_CLEAN = re.compile(r'[•\-*●▪◦☑]')
# Any run of whitespace; clean_text collapses each run to a single space.
RE_SPACES = re.compile(r'\s+')
# Any character that is neither a word character nor whitespace, i.e. punctuation.
RE_NON_ALPHA = re.compile(r'[^\w\s]')
# ==============================
# TEXT PREPROCESSING
# ==============================
def clean_text(text: str) -> str:
    """Normalize raw text into lowercase ASCII words separated by single spaces.

    Steps, in order: Unicode-normalize and lowercase, replace bullet glyphs
    with spaces, drop whatever is still non-ASCII, replace punctuation with
    spaces, then collapse whitespace and trim.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text; an empty string if nothing usable remains.
    """
    # NFKD splits compatibility characters and accented letters into an ASCII
    # base plus combining marks.  Doing this *before* the ASCII fold keeps the
    # base letters (e.g. the "fi" ligature U+FB01 -> "fi", "e-acute" -> "e")
    # instead of deleting the whole character and corrupting the word.
    text = unicodedata.normalize("NFKD", text).lower()
    # Bullets become spaces (not nothing) so adjacent words are not glued together.
    text = RE_CLEAN.sub(' ', text)
    # Anything still outside ASCII (combining marks, emoji, ...) is dropped.
    text = text.encode("ascii", "ignore").decode()
    text = RE_NON_ALPHA.sub(' ', text)
    return RE_SPACES.sub(' ', text).strip()
def standardize_education(text: str) -> str:
    """Collapse education-level wording into canonical tokens.

    Whole-word matches are rewritten to one of three labels: ``s1``
    (bachelor-level terms), ``d3`` (three-year diploma terms) and ``sma_smk``
    (secondary-school terms).  The rules are applied in that order and
    matching is case-sensitive.

    Args:
        text: Text to normalize.

    Returns:
        The text with every matched term replaced by its label.
    """
    # (pattern, label) pairs; order matters because each rule sees the
    # output of the previous one.
    rules = (
        (r'\b(sarjana|s1|strata 1|universitas|politeknik|institut)\b', 's1'),
        (r'\b(diploma 3|d3|ahli madya)\b', 'd3'),
        (r'\b(sma|smk|stm|smu|ma|sekolah menengah)\b', 'sma_smk'),
    )
    normalized = text
    for regex, label in rules:
        normalized = re.compile(regex).sub(label, normalized)
    return normalized
def clean_job_description(text: str) -> str:
    """Strip recruiting boilerplate from a job posting.

    Each phrase below is removed case-insensitively, one after another.  The
    patterns ending in ``.*`` also remove the rest of the line they appear on.

    Args:
        text: Raw job-description text.

    Returns:
        The text with every boilerplate match deleted; surrounding whitespace
        is left untouched.
    """
    boilerplate = (
        r'we are hiring',
        r'send us your cv',
        r'kirim cv anda',
        r'subjek:.*',
        r'lowongan ini dibuka sampai.*',
        r'format pdf',
    )
    stripped = text
    for phrase in boilerplate:
        stripped = re.compile(phrase, re.IGNORECASE).sub('', stripped)
    return stripped
# ==============================
# PDF READER
# ==============================
def extract_text_from_pdf(file_bytes, max_pages=3):
    """Extract the text of the first pages of a PDF.

    Args:
        file_bytes: Raw bytes of the PDF file.
        max_pages: Upper bound on the number of leading pages to read.

    Returns:
        The text of the pages read, joined with single spaces and stripped.
        Pages that yield no text are skipped.

    Raises:
        HTTPException: Status 400 if anything goes wrong while opening or
            reading the PDF.
    """
    try:
        reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        page_limit = min(len(reader.pages), max_pages)
        extracted = (
            reader.pages[idx].extract_text() for idx in range(page_limit)
        )
        # The generator is consumed here, still inside the try block.
        return " ".join(chunk for chunk in extracted if chunk).strip()
    except Exception:
        raise HTTPException(status_code=400, detail="Gagal membaca file PDF")
# ==============================
# HEALTH CHECK (important for HF)
# ==============================
@app.get("/")
def root():
    """Health-check endpoint: return a fixed status payload."""
    status_payload = {"status": "CV Matcher API Running"}
    return status_payload
# ==============================
# MAIN ENDPOINT
# ==============================
def _prepare_job_text(pdf_bytes):
    """Extract and normalize the job-description text from PDF bytes."""
    raw = extract_text_from_pdf(pdf_bytes, max_pages=5)
    return standardize_education(clean_text(clean_job_description(raw)))


def _prepare_cv_text(pdf_bytes):
    """Extract and normalize the text of one CV from PDF bytes."""
    raw = extract_text_from_pdf(pdf_bytes, max_pages=3)
    return standardize_education(clean_text(raw))


def _score_cvs(job_text, cv_texts, filenames):
    """Embed the job and CV texts and return result rows, best match first."""
    # Grad mode is per-thread in PyTorch, so no_grad is entered here, inside
    # the function that actually runs on the worker thread.
    with torch.no_grad():
        job_embedding = model.encode(
            job_text,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        cv_embeddings = model.encode(
            cv_texts,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        # Row 0 holds the similarity of the single job text to every CV.
        scores = util.cos_sim(job_embedding, cv_embeddings)[0].tolist()

    results = [
        {
            "filename": filename,
            "score": round(score, 4),
            "percentage": round(score * 100, 2),
            # "Cocok" = match, "Tidak Cocok" = no match.
            "status": "Cocok" if score >= THRESHOLD else "Tidak Cocok",
        }
        for filename, score in zip(filenames, scores)
    ]
    results.sort(key=lambda row: row["score"], reverse=True)
    return results


@app.post("/match")
async def match_cvs(
    job_file: UploadFile = File(...),
    cv_files: list[UploadFile] = File(...)
):
    """Score uploaded CV PDFs against an uploaded job-description PDF.

    PDF parsing, text cleaning and model inference are blocking work, so they
    run in worker threads via ``asyncio.to_thread``.  Running them directly in
    this coroutine would stall the event loop, and with it every other
    request (including the health check), for the duration of a match.

    Args:
        job_file: The job description as a PDF (first 5 pages are read).
        cv_files: One or more CVs as PDFs (first 3 pages of each are read).

    Returns:
        ``{"results": [...]}`` where each entry has ``filename``, ``score``
        (cosine similarity, 4 decimals), ``percentage`` (score * 100,
        2 decimals) and ``status``; entries are sorted by score, descending.
        CVs whose cleaned text is empty are left out of the results.

    Raises:
        HTTPException: Status 400 if a PDF cannot be read, if the job
            description is empty after cleaning, or if no CV yields any text.
    """
    # ---------- JOB DESCRIPTION ----------
    job_final = await asyncio.to_thread(_prepare_job_text, await job_file.read())
    if not job_final:
        raise HTTPException(status_code=400, detail="Job description kosong")

    # ---------- CV PROCESS ----------
    cv_texts_processed = []
    filenames = []
    for cv in cv_files:
        # One file at a time, so only a single upload's bytes are held here.
        processed_text = await asyncio.to_thread(_prepare_cv_text, await cv.read())
        if processed_text:
            cv_texts_processed.append(processed_text)
            filenames.append(cv.filename)
    if not cv_texts_processed:
        raise HTTPException(status_code=400, detail="Tidak ada CV yang valid")

    # ---------- EMBEDDING + RESULT ----------
    results = await asyncio.to_thread(
        _score_cvs, job_final, cv_texts_processed, filenames
    )
    return {"results": results}