Adedoyinjames committed on
Commit
2f017f7
·
verified ·
1 Parent(s): 3ddad06

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +220 -0
app.py ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ # Proof-of-concept CV screening API using a small embedding model (all-MiniLM-L6-v2)
3
+ # Supports: upload CV (PDF/DOCX/TXT) with name/email, stores embedding in SQLite,
4
+ # and ranking endpoint to return top candidates for a job description.
5
+
6
+ import io
7
+ import json
8
+ import os
9
+ import sqlite3
10
+ import typing as t
11
+ from fastapi import FastAPI, UploadFile, File, Form, HTTPException
12
+ from fastapi.middleware.cors import CORSMiddleware
13
+ from pydantic import BaseModel
14
+ from sentence_transformers import SentenceTransformer
15
+ import numpy as np
16
+ import pdfplumber
17
+ import docx
18
+
19
# Path of the SQLite file that stores candidates and their embeddings.
DB_PATH = "candidates.db"
MODEL_NAME = "all-MiniLM-L6-v2"  # small, CPU-friendly sentence-transformers model

app = FastAPI(title="CV Screening PoC")

# Allow CORS from any origin for testing/demo.
# Credentials are disabled on purpose: wildcard origins/methods/headers must
# not be combined with allow_credentials=True, and none of the endpoints in
# this file read cookies or authorization headers.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=False,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Load the embedding model once at startup so every request reuses it.
model = SentenceTransformer(MODEL_NAME)
35
+
36
+ # Initialize SQLite DB
37
def init_db() -> None:
    """Create the ``candidates`` table in the SQLite file at DB_PATH if absent.

    The table holds one row per uploaded CV: an auto-incremented ``id``, the
    candidate's ``name`` and ``email``, the extracted CV ``text`` and the
    ``embedding`` column (written as JSON text by ``store_candidate``).
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute(
            """
            CREATE TABLE IF NOT EXISTS candidates (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                email TEXT,
                text TEXT,
                embedding TEXT
            )
            """
        )
        conn.commit()
    finally:
        # Close even when execute/commit raises so the handle is not leaked.
        conn.close()


init_db()
55
+
56
+ # Utility: Extract text from uploaded file
57
async def extract_text_from_file(upload: UploadFile) -> str:
    """Extract plain text from an uploaded CV.

    The format is chosen from the file name extension:

    * ``.pdf``  - text of every page (via pdfplumber), joined with newlines.
    * ``.docx`` / ``.doc`` - non-empty paragraphs (via python-docx), joined
      with newlines.
    * anything else - the bytes decoded as UTF-8, dropping undecodable bytes.

    Parsing is best-effort: if the PDF/DOCX parser raises, the content is
    accepted only when it is strictly valid UTF-8 (e.g. a text file with the
    wrong extension). Otherwise an empty string is returned so the caller can
    reject the upload instead of storing decoded binary junk as CV text.

    Args:
        upload: The uploaded file; its ``filename`` and content are read.

    Returns:
        The extracted text, or ``""`` when nothing usable could be extracted.
    """
    filename = upload.filename or "file"
    name_lower = filename.lower()
    content = await upload.read()

    is_pdf = name_lower.endswith(".pdf")
    is_word = name_lower.endswith((".docx", ".doc"))

    # PDF
    if is_pdf:
        try:
            with pdfplumber.open(io.BytesIO(content)) as pdf:
                page_texts = [page.extract_text() for page in pdf.pages]
            # Pages without a text layer yield None/"" and are skipped.
            return "\n".join(p for p in page_texts if p).strip()
        except Exception:
            # Deliberate best-effort: fall through to the guarded fallback.
            pass

    # DOCX
    if is_word:
        try:
            document = docx.Document(io.BytesIO(content))
            return "\n".join(p.text for p in document.paragraphs if p.text).strip()
        except Exception:
            # Deliberate best-effort: fall through to the guarded fallback.
            pass

    if is_pdf or is_word:
        # The dedicated parser failed. Lossy decoding of a binary container
        # would produce garbage, so only accept content that is valid UTF-8.
        try:
            return content.decode("utf-8")
        except UnicodeDecodeError:
            return ""

    # TXT or unknown extension: lossy decode, never raises.
    return content.decode("utf-8", errors="ignore")
92
+
93
+ # Utility: compute embedding and return numpy array
94
def get_embedding(text: str) -> np.ndarray:
    """Encode *text* with the module-level sentence-transformers model.

    Args:
        text: The text to embed.

    Returns:
        A float NumPy array with the model's encoding of *text*. For falsy
        input (empty string or ``None``) an all-zero vector of the model's
        embedding dimension is returned and the model is not invoked.
    """
    if text:
        vector = model.encode(text, show_progress_bar=False)
        return np.array(vector, dtype=float)

    # Nothing to encode: hand back a zero vector of the right size.
    dimension = model.get_sentence_embedding_dimension()
    return np.zeros(dimension, dtype=float)
99
+
100
+ # Utility: store candidate
101
def store_candidate(name: str, email: str, text: str, embedding: np.ndarray) -> int:
    """Insert one candidate row into the ``candidates`` table.

    Args:
        name: Candidate name.
        email: Candidate e-mail address.
        text: Extracted CV text.
        embedding: Embedding vector; stored as a JSON list in the
            ``embedding`` column.

    Returns:
        The row id of the newly inserted candidate.
    """
    # Serialize before opening the connection so a bad embedding cannot
    # leave a connection open.
    emb_json = json.dumps(embedding.tolist())

    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute(
            "INSERT INTO candidates (name, email, text, embedding) VALUES (?, ?, ?, ?)",
            (name, email, text, emb_json),
        )
        cid = cur.lastrowid
        conn.commit()
        return cid
    finally:
        # Close even when the insert/commit raises so the handle is not leaked.
        conn.close()
113
+
114
+ # Utility: retrieve all candidates
115
def load_all_candidates() -> t.List[dict]:
    """Load every stored candidate together with its decoded embedding.

    Returns:
        A list of dicts with keys ``id``, ``name``, ``email``, ``text`` and
        ``embedding`` (a float NumPy array). A row whose stored embedding
        cannot be decoded gets an all-zero vector of the model's embedding
        dimension instead of failing the whole load.
    """
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name, email, text, embedding FROM candidates")
        rows = cur.fetchall()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()

    candidates = []
    for cid, name, email, text, emb_json in rows:
        try:
            emb = np.array(json.loads(emb_json), dtype=float)
        except (TypeError, ValueError, OverflowError):
            # NULL column (TypeError), malformed JSON or non-numeric content
            # (ValueError, which includes JSONDecodeError), or a value too
            # large for a float (OverflowError).
            emb = np.zeros(model.get_sentence_embedding_dimension(), dtype=float)
        candidates.append(
            {"id": cid, "name": name, "email": email, "text": text, "embedding": emb}
        )
    return candidates
130
+
131
+ # Utility: cosine similarity
132
def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
    """Return the cosine similarity of two vectors.

    Args:
        a: First vector.
        b: Second vector.

    Returns:
        ``dot(a, b) / (|a| * |b|)`` as a Python float, or ``0.0`` when either
        argument is ``None`` or the product of the norms is zero.
    """
    if any(vec is None for vec in (a, b)):
        return 0.0

    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    # A zero norm would mean dividing by zero; define the similarity as 0.
    return float(np.dot(a, b) / norm_product) if norm_product != 0 else 0.0
139
+
140
+ # Simple summary and matched-skills extractor for PoC
141
def summarize_and_match(cv_text: str, job_text: str) -> dict:
    """Build a short CV summary and list the job keywords it mentions.

    Job keywords are the lower-cased whitespace-separated words of
    *job_text* (with ``/`` treated as a separator) longer than two
    characters. The CV is split into non-empty lines; a line matches when it
    contains any keyword as a substring (case-insensitive).

    Args:
        cv_text: Extracted CV text; ``None`` is treated as empty.
        job_text: Job description; ``None`` is treated as empty.

    Returns:
        A dict with:
          * ``summary`` - up to the first three matching lines joined by
            spaces, each line included once; if nothing matches, the first
            200 characters of the CV (with ``...`` appended when truncated).
          * ``matched_skills`` - sorted list of the keywords found in the
            lines that were scanned.
    """
    cv_text = cv_text or ""
    job_text = job_text or ""

    job_tokens = {
        word.lower() for word in job_text.replace("/", " ").split() if len(word) > 2
    }
    sentences = [
        line.strip() for line in cv_text.replace("\r", "\n").split("\n") if line.strip()
    ]

    matched_sentences = []
    matched_skills = set()
    for sentence in sentences:
        lowered = sentence.lower()
        hits = {token for token in job_tokens if token in lowered}
        if hits:
            # Record the sentence once, no matter how many keywords it hits.
            matched_sentences.append(sentence)
            matched_skills |= hits
        if len(matched_sentences) >= 3:
            break

    summary = " ".join(matched_sentences).strip()
    if not summary:
        summary = (cv_text[:200] + "...") if len(cv_text) > 200 else cv_text

    # Sorted so the response is deterministic across runs.
    return {"summary": summary, "matched_skills": sorted(matched_skills)}
160
+
161
+ # Pydantic model for rank request
162
# Request body accepted by the /rank_candidates endpoint.
class RankRequest(BaseModel):
    # Job description that stored candidates are ranked against.
    job_description: str
    # Number of best-scoring candidates to return; defaults to 5.
    top_n: int = 5
165
+
166
@app.post("/upload_cv")
async def upload_cv(name: str = Form(...), email: str = Form(...), file: UploadFile = File(...)):
    """Upload CV with name and email. Supports PDF, DOCX, TXT. Returns candidate id."""
    # Treat whitespace-only values as missing, the same way rank_candidates
    # treats a blank job_description.
    name = name.strip()
    email = email.strip()
    if not name or not email:
        raise HTTPException(status_code=400, detail="name and email are required")

    text = await extract_text_from_file(file)
    # A file that yields only whitespace has no content worth embedding.
    if not text or not text.strip():
        raise HTTPException(status_code=400, detail="Could not extract text from the uploaded file")

    emb = get_embedding(text)
    cid = store_candidate(name, email, text, emb)
    return {"status": "ok", "candidate_id": cid}
179
+
180
+ @app.post("/rank_candidates")
181
+ async def rank_candidates(req: RankRequest):
182
+ """Rank stored candidates for a provided job description. Returns top N matches."""
183
+ if not req.job_description or not req.job_description.strip():
184
+ raise HTTPException(status_code=400, detail="job_description is required")
185
+
186
+ job_emb = get_embedding(req.job_description)
187
+ candidates = load_all_candidates()
188
+
189
+ scored = []
190
+ for c in candidates:
191
+ score = cosine_sim(job_emb, c.get("embedding")) if c.get("embedding") is not None else 0.0
192
+ extras = summarize_and_match(c.get("text", ""), req.job_description)
193
+ scored.append({
194
+ "id": c["id"],
195
+ "name": c["name"],
196
+ "email": c["email"],
197
+ "score": round(score, 4),
198
+ "summary": extras["summary"],
199
+ "matched_skills": extras["matched_skills"],
200
+ })
201
+
202
+ scored_sorted = sorted(scored, key=lambda x: x["score"], reverse=True)
203
+ top = scored_sorted[: req.top_n]
204
+ return {"status": "ok", "results": top}
205
+
206
+ @app.get("/candidate/{candidate_id}")
207
async def get_candidate(candidate_id: int):
    """Return id, name, email and CV text of one stored candidate, or 404."""
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.cursor()
        cur.execute("SELECT id, name, email, text FROM candidates WHERE id = ?", (candidate_id,))
        row = cur.fetchone()
    finally:
        # Close even when the query raises so the handle is not leaked.
        conn.close()

    if not row:
        raise HTTPException(status_code=404, detail="candidate not found")
    cid, name, email, text = row
    return {"id": cid, "name": name, "email": email, "text": text}
217
+
218
# Script entry point: serve the app with uvicorn when run directly.
if __name__ == "__main__":
    import uvicorn

    # The listening port comes from the PORT environment variable, default 8000.
    listen_port = int(os.environ.get("PORT", 8000))
    uvicorn.run("app:app", host="0.0.0.0", port=listen_port, reload=False)