Spaces:

Adedoyinjames
/

CVbot

Running

App Files Files Community

Adedoyinjames committed on Dec 5, 2025

Commit

2f017f7

verified ·

1 Parent(s): 3ddad06

Create app.py

Browse files

Files changed (1) hide show

app.py +220 -0

app.py ADDED Viewed

	@@ -0,0 +1,220 @@

+# app.py
+# Proof-of-concept CV screening API using a small embedding model (all-MiniLM-L6-v2)
+# Supports: upload CV (PDF/DOCX/TXT) with name/email, stores embedding in SQLite,
+# and ranking endpoint to return top candidates for a job description.
+import io
+import json
+import os
+import sqlite3
+import typing as t
+from fastapi import FastAPI, UploadFile, File, Form, HTTPException
+from fastapi.middleware.cors import CORSMiddleware
+from pydantic import BaseModel
+from sentence_transformers import SentenceTransformer
+import numpy as np
+import pdfplumber
+import docx
+DB_PATH = "candidates.db"
+MODEL_NAME = "all-MiniLM-L6-v2"  # small, CPU-friendly sentence-transformers model
+app = FastAPI(title="CV Screening PoC")
+# Allow CORS for testing/demo
+app.add_middleware(
+    CORSMiddleware,
+    allow_origins=["*"],
+    allow_credentials=True,
+    allow_methods=["*"],
+    allow_headers=["*"],
+)
+# Load the embedding model once at startup
+model = SentenceTransformer(MODEL_NAME)
+# Initialize SQLite DB
+def init_db():
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    cur.execute(
+        """
+        CREATE TABLE IF NOT EXISTS candidates (
+            id INTEGER PRIMARY KEY AUTOINCREMENT,
+            name TEXT,
+            email TEXT,
+            text TEXT,
+            embedding TEXT
+        )
+        """
+    )
+    conn.commit()
+    conn.close()
+init_db()
+# Utility: Extract text from uploaded file
+async def extract_text_from_file(upload: UploadFile) -> str:
+    filename = upload.filename or "file"
+    name_lower = filename.lower()
+    content = await upload.read()
+    # PDF
+    if name_lower.endswith(".pdf"):
+        try:
+            text = ""
+            with pdfplumber.open(io.BytesIO(content)) as pdf:
+                for page in pdf.pages:
+                    page_text = page.extract_text()
+                    if page_text:
+                        text += page_text + "\n"
+            return text.strip()
+        except Exception:
+            pass
+    # DOCX
+    if name_lower.endswith(".docx") or name_lower.endswith(".doc"):
+        try:
+            doc = docx.Document(io.BytesIO(content))
+            full_text = []
+            for para in doc.paragraphs:
+                if para.text:
+                    full_text.append(para.text)
+            return "\n".join(full_text).strip()
+        except Exception:
+            pass
+    # TXT or fallback
+    try:
+        return content.decode("utf-8", errors="ignore")
+    except Exception:
+        return ""
+# Utility: compute embedding and return numpy array
+def get_embedding(text: str) -> np.ndarray:
+    if not text:
+        return np.zeros(model.get_sentence_embedding_dimension(), dtype=float)
+    emb = model.encode(text, show_progress_bar=False)
+    return np.array(emb, dtype=float)
+# Utility: store candidate
+def store_candidate(name: str, email: str, text: str, embedding: np.ndarray) -> int:
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    emb_json = json.dumps(embedding.tolist())
+    cur.execute(
+        "INSERT INTO candidates (name, email, text, embedding) VALUES (?, ?, ?, ?)",
+        (name, email, text, emb_json),
+    )
+    cid = cur.lastrowid
+    conn.commit()
+    conn.close()
+    return cid
+# Utility: retrieve all candidates
+def load_all_candidates() -> t.List[dict]:
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    cur.execute("SELECT id, name, email, text, embedding FROM candidates")
+    rows = cur.fetchall()
+    conn.close()
+    candidates = []
+    for r in rows:
+        cid, name, email, text, emb_json = r
+        try:
+            emb = np.array(json.loads(emb_json), dtype=float)
+        except Exception:
+            emb = np.zeros(model.get_sentence_embedding_dimension(), dtype=float)
+        candidates.append({"id": cid, "name": name, "email": email, "text": text, "embedding": emb})
+    return candidates
+# Utility: cosine similarity
+def cosine_sim(a: np.ndarray, b: np.ndarray) -> float:
+    if a is None or b is None:
+        return 0.0
+    denom = (np.linalg.norm(a) * np.linalg.norm(b))
+    if denom == 0:
+        return 0.0
+    return float(np.dot(a, b) / denom)
+# Simple summary and matched-skills extractor for PoC
+def summarize_and_match(cv_text: str, job_text: str) -> dict:
+    job_tokens = set([t.lower() for t in job_text.replace("/", " ").split() if len(t) > 2])
+    sentences = [s.strip() for s in cv_text.replace('\r', '\n').split('\n') if s.strip()]
+    matched_sentences = []
+    matched_skills = set()
+    for s in sentences:
+        s_lower = s.lower()
+        for token in job_tokens:
+            if token in s_lower:
+                matched_sentences.append(s)
+                matched_skills.add(token)
+        if len(matched_sentences) >= 3:
+            break
+    summary = " ".join(matched_sentences).strip()
+    if not summary:
+        summary = (cv_text[:200] + "...") if len(cv_text) > 200 else cv_text
+    return {"summary": summary, "matched_skills": list(matched_skills)}
+# Pydantic model for rank request
+class RankRequest(BaseModel):
+    """Request body accepted by the POST /rank_candidates endpoint."""
+    # Free-text job description that stored CVs are scored against.
+    job_description: str
+    # Maximum number of ranked candidates to return; 5 when omitted.
+    top_n: int = 5
+@app.post("/upload_cv")
+async def upload_cv(name: str = Form(...), email: str = Form(...), file: UploadFile = File(...)):
+    """Upload CV with name and email. Supports PDF, DOCX, TXT. Returns candidate id."""
+    if not name or not email:
+        raise HTTPException(status_code=400, detail="name and email are required")
+    text = await extract_text_from_file(file)
+    if not text:
+        raise HTTPException(status_code=400, detail="Could not extract text from the uploaded file")
+    emb = get_embedding(text)
+    cid = store_candidate(name, email, text, emb)
+    return {"status": "ok", "candidate_id": cid}
+@app.post("/rank_candidates")
+async def rank_candidates(req: RankRequest):
+    """Rank stored candidates for a provided job description. Returns top N matches."""
+    if not req.job_description or not req.job_description.strip():
+        raise HTTPException(status_code=400, detail="job_description is required")
+    job_emb = get_embedding(req.job_description)
+    candidates = load_all_candidates()
+    scored = []
+    for c in candidates:
+        score = cosine_sim(job_emb, c.get("embedding")) if c.get("embedding") is not None else 0.0
+        extras = summarize_and_match(c.get("text", ""), req.job_description)
+        scored.append({
+            "id": c["id"],
+            "name": c["name"],
+            "email": c["email"],
+            "score": round(score, 4),
+            "summary": extras["summary"],
+            "matched_skills": extras["matched_skills"],
+        })
+    scored_sorted = sorted(scored, key=lambda x: x["score"], reverse=True)
+    top = scored_sorted[: req.top_n]
+    return {"status": "ok", "results": top}
+@app.get("/candidate/{candidate_id}")
+async def get_candidate(candidate_id: int):
+    conn = sqlite3.connect(DB_PATH)
+    cur = conn.cursor()
+    cur.execute("SELECT id, name, email, text FROM candidates WHERE id = ?", (candidate_id,))
+    row = cur.fetchone()
+    conn.close()
+    if not row:
+        raise HTTPException(status_code=404, detail="candidate not found")
+    cid, name, email, text = row
+    return {"id": cid, "name": name, "email": email, "text": text}
+if __name__ == "__main__":
+    import uvicorn
+    uvicorn.run("app:app", host="0.0.0.0", port=int(os.environ.get("PORT", 8000)), reload=False)