# cv_match3 / app.py — JD→CV semantic matcher (Hugging Face Space entry point)
import os
import fitz
import chromadb
import gradio as gr
from docx import Document
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from chromadb.utils.embedding_functions import EmbeddingFunction
# ---------------------------------------------------------------
# 1️⃣ Fetch the CV dataset snapshot from the Hugging Face Hub.
# ---------------------------------------------------------------
print("➡️ Downloading dataset snapshot (fetching all LFS files)...")
local_dir = snapshot_download(
    repo_id="gigswar/cv_files",
    repo_type="dataset",
    local_dir="data_repo",
    # NOTE(review): local_dir_use_symlinks is deprecated in recent
    # huggingface_hub releases — confirm the pinned version still accepts it.
    local_dir_use_symlinks="auto",
    # NOTE(review): force_download=True re-fetches every binary on each
    # startup; consider removing it if startup time becomes a problem.
    force_download=True,  # always fetch binaries
    allow_patterns=["*"],  # include all files
    ignore_patterns=[]  # don't skip anything
)
print(f"✔️ Dataset ready at: {local_dir}")
# -----------------------------
# 2️⃣ Collect all PDF/DOCX/CSV
# -----------------------------
# Walk the downloaded snapshot and keep every supported CV file.
cv_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(local_dir)
    for filename in filenames
    if filename.lower().endswith((".pdf", ".docx", ".csv"))
]
print(f"🔍 Found {len(cv_paths)} CV files (PDF/DOCX/CSV).")
if not cv_paths:
    raise FileNotFoundError("❌ No valid CV files found in dataset.")
# 3️⃣ Initialize ChromaDB (store inside repo)
# -----------------------------
chroma_path = os.path.join(local_dir, "chroma_db")  # DB inside dataset repo
os.makedirs(chroma_path, exist_ok=True)
# Fix: chromadb.Client(Settings(persist_directory=...)) creates an *in-memory*
# client in chromadb 0.4+ unless is_persistent=True is also set, which would
# defeat the "index only once" check in load_cvs(). PersistentClient always
# writes the collection to disk at the given path.
client = chromadb.PersistentClient(path=chroma_path)
# SentenceTransformer wrapper (Chroma v0.4+ compatible)
class SBERTEmbeddingFunction(EmbeddingFunction):
    """Adapts a SentenceTransformer model to Chroma's EmbeddingFunction API."""

    def __init__(self, model):
        # model: any object exposing encode(list[str]) -> array-like with .tolist().
        self.model = model

    def __call__(self, input):
        # Fix: chromadb >= 0.4.16 validates that __call__'s parameter is
        # literally named "input"; older versions invoke it positionally,
        # so the rename is backward-compatible.
        return self.model.encode(input).tolist()

    def name(self):
        # Identifier Chroma stores alongside the collection configuration.
        return "sbert-embedder"
# Load the sentence-embedding model and bind it to the Chroma collection.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
embedder = SBERTEmbeddingFunction(sbert_model)
collection = client.get_or_create_collection(
    name="cv_collection",
    embedding_function=embedder
)
# -----------------------------
# 4️⃣ Text extraction functions
# -----------------------------
def extract_pdf(path):
    """Extract plain text from every page of a PDF.

    Args:
        path: Filesystem path to the PDF file.
    Returns:
        All page texts joined with newlines.
    Raises:
        RuntimeError: If PyMuPDF cannot open or read the file.
    """
    try:
        with fitz.open(path) as doc:
            return "\n".join(page.get_text("text") or "" for page in doc)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"PDF error: {e}") from e
def extract_docx(path):
    """Extract the paragraph text of a DOCX document.

    Args:
        path: Filesystem path to the DOCX file.
    Returns:
        Paragraph texts joined with newlines.
    Raises:
        RuntimeError: If python-docx cannot open or parse the file.
    """
    try:
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"DOCX error: {e}") from e
def extract_csv(path):
    """Read a CSV (or any text file) as one raw string.

    Undecodable bytes are silently dropped (errors="ignore"), so partially
    corrupt files still yield their readable content.

    Args:
        path: Filesystem path to the file.
    Returns:
        The file's decoded contents.
    Raises:
        RuntimeError: If the file cannot be opened or read at all.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"CSV error: {e}") from e
def extract_text(path):
    """Dispatch to the extractor matching the file's extension.

    PDF and DOCX get dedicated parsers; anything else is read as raw text
    (the CSV path).
    """
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return extract_pdf(path)
    if lowered.endswith(".docx"):
        return extract_docx(path)
    return extract_csv(path)
# -----------------------------
# 5️⃣ Index CVs into ChromaDB (only if needed)
# -----------------------------
def load_cvs(paths):
    """Index every CV file into the Chroma collection (idempotent).

    Skips entirely when the collection already holds documents, so the
    expensive extraction/embedding pass runs only on the first startup.
    Files that yield no text or raise during extraction are counted as
    skipped rather than aborting the run.
    """
    existing = collection.count()
    if existing > 0:
        print(f"ℹ️ Skipping indexing (ChromaDB already has {existing} CVs).")
        return

    print("➡️ Indexing CVs into ChromaDB (this will run only once)...")
    indexed = 0
    skipped = 0
    for processed, cv_path in enumerate(paths, start=1):
        try:
            body = extract_text(cv_path).strip()
            if not body:
                skipped += 1
                continue
            # Use the repo-relative path as a stable, unique document id.
            doc_id = os.path.relpath(cv_path, local_dir)
            collection.add(
                ids=[doc_id],
                documents=[body],
                metadatas=[{"name": os.path.basename(cv_path), "path": cv_path}],
            )
            indexed += 1
        except Exception as e:
            print(f"⚠️ Skipped {cv_path}: {e}")
            skipped += 1
        if processed % 100 == 0:
            print(f"Progress: Indexed {indexed}/{processed} processed...")
    print(f"✅ Finished: Indexed {indexed}/{len(paths)} CVs, skipped {skipped} (corrupt/encrypted).")
# Build the index at startup; no-op when the collection is already populated.
load_cvs(cv_paths)
# -----------------------------
# 6️⃣ Matching and Stats
# -----------------------------
def find_matching(jd, top_n):
    """Return the CVs semantically closest to a job description.

    Args:
        jd: Job-description text from the UI textbox.
        top_n: Number of matches requested (Gradio sliders may pass a float).
    Returns:
        Tuple of (list of CV file paths for the Files output,
        human-readable status string for the status textbox).
    """
    jd = jd.strip()
    if not jd:
        return [], "⚠️ Please enter a job description."
    # Fix: gr.Slider can deliver a float (e.g. 5.0); Chroma's n_results
    # must be an integer.
    res = collection.query(query_texts=[jd], n_results=int(top_n))
    md, ds = res["metadatas"][0], res["distances"][0]
    if not md:
        return [], "❌ No matches found."
    files, scores = [], []
    for meta, dist in zip(md, ds):
        # Skip results whose source file disappeared from disk (stale index).
        if os.path.exists(meta["path"]):
            sim = 1 / (1 + dist)  # map distance [0, inf) onto similarity (0, 1]
            files.append(meta["path"])
            scores.append(f"{meta['name']}: {sim:.3f}")
    # Fix: if every hit was stale, say so instead of emitting an empty
    # "✅ Matches:" message with no files.
    if not files:
        return [], "❌ No matches found."
    return files, "✅ Matches:\n" + "\n".join(scores)
def show_stats():
    """Report how many CVs are indexed and where the DB directory lives."""
    total = collection.count()
    return f"📊 Indexed {total} CV(s) (stored in {chroma_path})"
# -----------------------------
# 7️⃣ Gradio Web App
# -----------------------------
# Build the Gradio UI: a JD textbox, a top-N slider, and two actions
# (semantic search and index stats).
with gr.Blocks(title="JD→CV Semantic Matcher (Persistent DB)") as app:
    gr.Markdown("# 🎯 JD→CV Semantic Matcher\nHandles all CVs (PDF, DOCX, CSV, Git LFS) with persistent DB")
    jd = gr.Textbox(lines=8, placeholder="Paste your job description...")
    top_n = gr.Slider(1, 20, value=5, label="Top N CVs")
    search_btn = gr.Button("🔍 Search")
    stats_btn = gr.Button("📊 Stats")
    files_out = gr.Files()  # downloadable matched CV files
    status_out = gr.Textbox(lines=6, interactive=False)  # scores / status messages
    search_btn.click(find_matching, [jd, top_n], [files_out, status_out])
    stats_btn.click(show_stats, outputs=status_out)
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables / Space secrets before publishing this repo.
app.launch(
    auth=("gigaswar", "gigaswarai"),  # will prompt user before app loads
    server_name="0.0.0.0",  # avoid localhost
    server_port=7860,  # default port for Spaces
    share=True,  # create a public link (required)
    ssr_mode=False  # fix Gradio locale error
)