lilcoderi committed
Commit f4c9841 · verified · 1 Parent(s): 95aef29

Upload 3 files

Files changed (3)
  1. Dockerfile +17 -0
  2. main.py +122 -0
  3. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,17 @@
+ # Use a lightweight Python image
+ FROM python:3.9-slim
+
+ # Set the working directory
+ WORKDIR /code
+
+ # Copy the requirements file
+ COPY ./requirements.txt /code/requirements.txt
+
+ # Install dependencies
+ RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+
+ # Copy all project files into the container
+ COPY . .
+
+ # Run Uvicorn (FastAPI) on port 7860 (the standard port on HF Spaces)
+ CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
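
For local testing without Docker, the same server can be started directly from Python. A minimal sketch (a hypothetical helper script, not part of this commit, assuming main.py and the dependencies from requirements.txt are available in the current environment):

# run_local.py -- hypothetical helper, not part of this commit
import uvicorn

if __name__ == "__main__":
    # Mirror the container CMD: serve main:app on port 7860
    uvicorn.run("main:app", host="0.0.0.0", port=7860)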
main.py ADDED
@@ -0,0 +1,122 @@
+ import io
+ import re
+ import os
+ import torch
+ import PyPDF2
+ from fastapi import FastAPI, UploadFile, File, HTTPException
+ from fastapi.middleware.cors import CORSMiddleware
+ from sentence_transformers import SentenceTransformer, util
+
+ app = FastAPI()
+
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # --- LOAD THE MODEL FROM HUGGING FACE ---
+ # Read the token from the Secret you configure later in HF Spaces
+ HF_TOKEN = os.getenv("HF_TOKEN")
+ REPO_ID = "lilcoderi/cv-matcher-model"
+
+ # Load the model directly from the Hub
+ model = SentenceTransformer(REPO_ID, use_auth_token=HF_TOKEN)
+
+ THRESHOLD = 0.59
+
+ # Pre-compile regexes for faster execution
+ RE_CLEAN = re.compile(r'[•\-*●▪◦☑]')
+ RE_SPACES = re.compile(r'\s+')
+ RE_NON_ALPHA = re.compile(r'[^\w\s]')
+
+ # --- OPTIMIZED PREPROCESSING FUNCTIONS ---
+
+ def clean_text(text: str) -> str:
+     text = text.lower()
+     text = RE_CLEAN.sub(' ', text)
+     text = text.encode("ascii", "ignore").decode()
+     text = RE_NON_ALPHA.sub(' ', text)
+     return RE_SPACES.sub(' ', text).strip()
+
+ def standardize_education(text: str) -> str:
+     edu_map = {
+         r'\b(sarjana|s1|strata 1|universitas|politeknik|institut)\b': 's1',
+         r'\b(diploma 3|d3|ahli madya)\b': 'd3',
+         r'\b(sma|smk|stm|smu|ma|sekolah menengah)\b': 'sma_smk',
+     }
+     for pattern, replacement in edu_map.items():
+         text = re.sub(pattern, replacement, text)
+     return text
+
+ def clean_job_description(text: str) -> str:
+     noise_patterns = [
+         r'we are hiring', r'send us your cv', r'kirim cv anda',
+         r'hrdptoba@gmail\.com', r'subjek:.*', r'lowongan ini dibuka sampai.*',
+         r'posisi_nama_domisili', r'format pdf'
+     ]
+     for pattern in noise_patterns:
+         text = re.sub(pattern, '', text, flags=re.IGNORECASE)
+     return text
+
+ def extract_text_from_pdf(file_bytes, max_pages=3):
+     try:
+         pdf_reader = PyPDF2.PdfReader(io.BytesIO(file_bytes))
+         text = ""
+         pages_to_read = min(len(pdf_reader.pages), max_pages)
+         for i in range(pages_to_read):
+             content = pdf_reader.pages[i].extract_text()
+             if content:
+                 text += content + " "
+         return text
+     except Exception:
+         raise HTTPException(status_code=400, detail="Gagal membaca file PDF")
+
+ # --- MAIN ENDPOINT ---
+
+ @app.post("/match")
+ async def match_cvs(
+     job_file: UploadFile = File(...),
+     cv_files: list[UploadFile] = File(...)
+ ):
+     # 1. Process the job description
+     job_raw = extract_text_from_pdf(await job_file.read(), max_pages=5)
+     job_cleaned = clean_job_description(job_raw)
+     job_final = standardize_education(clean_text(job_cleaned))
+
+     # 2. Extract text from each uploaded CV
+     cv_texts_processed = []
+     filenames = []
+
+     for cv in cv_files:
+         content = await cv.read()
+         raw_text = extract_text_from_pdf(content, max_pages=3)
+         processed_text = standardize_education(clean_text(raw_text))
+
+         cv_texts_processed.append(processed_text)
+         filenames.append(cv.filename)
+
+     if not cv_texts_processed:
+         raise HTTPException(status_code=400, detail="Tidak ada CV yang valid")
+
+     # 3. Compute embeddings & cosine similarity
+     with torch.no_grad():
+         job_embedding = model.encode(job_final, convert_to_tensor=True, normalize_embeddings=True)
+         cv_embeddings = model.encode(cv_texts_processed, convert_to_tensor=True, normalize_embeddings=True)
+
+     scores = util.cos_sim(job_embedding, cv_embeddings)[0]
+
+     # 4. Build the results & ranking
+     results = []
+     for i in range(len(filenames)):
+         score_val = float(scores[i])
+         results.append({
+             "filename": filenames[i],
+             "score": round(score_val, 4),
+             "percentage": round(score_val * 100, 2),
+             "status": "Cocok" if score_val >= THRESHOLD else "Tidak Cocok"
+         })
+
+     results.sort(key=lambda x: x['score'], reverse=True)
+     return {"results": results}
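
With the Space (or a local container) running, /match expects one job-description PDF in the job_file field and one or more CV PDFs in repeated cv_files fields, sent as multipart form data. A minimal client sketch, assuming hypothetical file names and a server listening locally on port 7860:

# match_client.py -- illustration only; URL and file names are placeholders
import requests

URL = "http://localhost:7860/match"

with open("job_description.pdf", "rb") as job, \
     open("cv_alice.pdf", "rb") as cv_a, \
     open("cv_bob.pdf", "rb") as cv_b:
    files = [
        ("job_file", ("job_description.pdf", job, "application/pdf")),
        ("cv_files", ("cv_alice.pdf", cv_a, "application/pdf")),
        ("cv_files", ("cv_bob.pdf", cv_b, "application/pdf")),
    ]
    response = requests.post(URL, files=files)

response.raise_for_status()
# Results come back sorted by similarity score, highest first
for item in response.json()["results"]:
    print(item["filename"], item["percentage"], item["status"])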
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ fastapi
+ uvicorn
+ python-multipart
+ sentence-transformers
+ PyPDF2
+ torch --index-url https://download.pytorch.org/whl/cpu