Spaces:
Sleeping
Sleeping
| import io | |
| import re | |
| import os | |
| import torch | |
| import PyPDF2 | |
| from fastapi import FastAPI, UploadFile, File, HTTPException | |
| from fastapi.middleware.cors import CORSMiddleware | |
| from sentence_transformers import SentenceTransformer, util | |
app = FastAPI()

# ==============================
# CORS — wide open, for testing only.
# ==============================
# Any origin may call the API with any method/header; tighten the
# allow-lists before production use.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_origins=["*"],
)
# ==============================
# LOAD MODEL
# ==============================
# Hugging Face Hub repo holding the fine-tuned sentence-transformer.
REPO_ID = "lilcoderi/cv-matcher-model"

try:
    model = SentenceTransformer(REPO_ID)
    model.eval()  # inference mode (disables dropout etc.)
except Exception as e:
    # Chain the original exception so the real download/load failure
    # is preserved in the traceback instead of being flattened to a string.
    raise RuntimeError(f"Gagal load model: {str(e)}") from e

# Minimum cosine similarity for a CV to be labeled "Cocok" (a match).
THRESHOLD = 0.59
# ==============================
# PRECOMPILED REGEXES
# ==============================
RE_CLEAN = re.compile(r'[•\-*●▪◦☑]')   # bullet / list-marker characters
RE_SPACES = re.compile(r'\s+')          # any run of whitespace
RE_NON_ALPHA = re.compile(r'[^\w\s]')   # punctuation and symbols


# ==============================
# TEXT PREPROCESSING
# ==============================
def clean_text(text: str) -> str:
    """Normalize raw extracted text for embedding.

    Lowercases, replaces bullet markers and punctuation with spaces,
    drops non-ASCII characters, and collapses runs of whitespace.
    """
    lowered = text.lower()
    no_bullets = RE_CLEAN.sub(' ', lowered)
    ascii_only = no_bullets.encode("ascii", "ignore").decode()
    no_punct = RE_NON_ALPHA.sub(' ', ascii_only)
    return RE_SPACES.sub(' ', no_punct).strip()
# Education-level aliases, compiled ONCE at import time — the original
# rebuilt the dict and recompiled every pattern on each call.
# Order matters: patterns are applied top to bottom, as before.
_EDU_PATTERNS = [
    (re.compile(r'\b(sarjana|s1|strata 1|universitas|politeknik|institut)\b'), 's1'),
    (re.compile(r'\b(diploma 3|d3|ahli madya)\b'), 'd3'),
    (re.compile(r'\b(sma|smk|stm|smu|ma|sekolah menengah)\b'), 'sma_smk'),
]


def standardize_education(text: str) -> str:
    """Map Indonesian education keywords to canonical tokens.

    Replaces degree/institution words with 's1', 'd3' or 'sma_smk' so the
    job description and CVs use the same vocabulary before embedding.
    Expects lowercased input (see clean_text).
    """
    for pattern, replacement in _EDU_PATTERNS:
        text = pattern.sub(replacement, text)
    return text
# Recruiting boilerplate that carries no matching signal. Compiled once
# with IGNORECASE instead of calling re.sub with a raw string (and the
# flag lookup) on every invocation.
_JOB_NOISE_PATTERNS = [
    re.compile(p, flags=re.IGNORECASE)
    for p in (
        r'we are hiring',
        r'send us your cv',
        r'kirim cv anda',
        r'subjek:.*',
        r'lowongan ini dibuka sampai.*',
        r'format pdf',
    )
]


def clean_job_description(text: str) -> str:
    """Strip recruiting boilerplate phrases from a job description.

    Removes hiring slogans, CV-submission instructions, email-subject
    lines and deadline sentences (case-insensitively) before the text is
    normalized and embedded.
    """
    for pattern in _JOB_NOISE_PATTERNS:
        text = pattern.sub('', text)
    return text
# ==============================
# PDF READER
# ==============================
def extract_text_from_pdf(file_bytes: bytes, max_pages: int = 3) -> str:
    """Extract plain text from the first *max_pages* pages of a PDF.

    Args:
        file_bytes: Raw PDF file content.
        max_pages: Page-count cap, bounding work on long documents.

    Returns:
        The concatenated page text, stripped. May be empty for
        image-only / scanned PDFs.

    Raises:
        HTTPException: 400 when the bytes cannot be parsed as a PDF.
    """
    try:
        pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
        pages_to_read = min(len(pdf_reader.pages), max_pages)
        parts = []
        for i in range(pages_to_read):
            content = pdf_reader.pages[i].extract_text()
            if content:  # pages without extractable text yield None/""
                parts.append(content)
        return " ".join(parts).strip()
    except Exception as exc:
        # Chain the parser error so logs show the real cause instead of
        # silently discarding it.
        raise HTTPException(status_code=400, detail="Gagal membaca file PDF") from exc
# ==============================
# HEALTH CHECK (important for HF Spaces)
# ==============================
# NOTE(review): the route decorator was missing in the pasted source —
# without it this endpoint is never registered. Restored here.
@app.get("/")
def root():
    """Liveness probe so Hugging Face Spaces can see the service is up."""
    return {"status": "CV Matcher API Running"}
# ==============================
# MAIN ENDPOINT
# ==============================
# NOTE(review): the route decorator was missing in the pasted source;
# restored — TODO confirm the route path against the frontend.
@app.post("/match")
async def match_cvs(
    job_file: UploadFile = File(...),
    cv_files: list[UploadFile] = File(...)
):
    """Score each uploaded CV PDF against a job-description PDF.

    Args:
        job_file: Job-description PDF (first 5 pages read).
        cv_files: One or more candidate CV PDFs (first 3 pages each).

    Returns:
        ``{"results": [...]}`` sorted by cosine similarity, descending.
        Each entry has filename, score, percentage and a
        "Cocok"/"Tidak Cocok" status based on THRESHOLD.

    Raises:
        HTTPException: 400 when the job description is empty, no CV
            yields text, or a PDF cannot be parsed.
    """
    # ---------- JOB DESCRIPTION ----------
    job_raw = extract_text_from_pdf(await job_file.read(), max_pages=5)
    job_cleaned = clean_job_description(job_raw)
    job_final = standardize_education(clean_text(job_cleaned))
    if not job_final:
        raise HTTPException(status_code=400, detail="Job description kosong")

    # ---------- CV PROCESSING ----------
    cv_texts_processed = []
    filenames = []
    for cv in cv_files:
        content = await cv.read()
        raw_text = extract_text_from_pdf(content, max_pages=3)
        processed_text = standardize_education(clean_text(raw_text))
        if processed_text:  # skip image-only / empty PDFs
            cv_texts_processed.append(processed_text)
            filenames.append(cv.filename)

    if not cv_texts_processed:
        raise HTTPException(status_code=400, detail="Tidak ada CV yang valid")

    # ---------- EMBEDDING ----------
    # no_grad: inference only, skip autograd bookkeeping.
    with torch.no_grad():
        job_embedding = model.encode(
            job_final,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )
        cv_embeddings = model.encode(
            cv_texts_processed,
            convert_to_tensor=True,
            normalize_embeddings=True,
        )

    scores = util.cos_sim(job_embedding, cv_embeddings)[0]

    # ---------- RESULT ----------
    results = []
    for filename, score in zip(filenames, scores):
        score_val = float(score)
        results.append({
            "filename": filename,
            "score": round(score_val, 4),
            "percentage": round(score_val * 100, 2),
            "status": "Cocok" if score_val >= THRESHOLD else "Tidak Cocok",
        })

    results.sort(key=lambda x: x['score'], reverse=True)
    return {"results": results}