# cv_match3 / app.py — JD→CV semantic matcher (Hugging Face Space entry point)
import os
import fitz
import chromadb
import gradio as gr
from docx import Document
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer
from chromadb.config import Settings
from chromadb.utils.embedding_functions import EmbeddingFunction
# ---------------------------------------------------------------
# 1️⃣ Fetch the CV dataset snapshot from the Hugging Face Hub.
# ---------------------------------------------------------------
print("➡️ Downloading dataset snapshot (fetching all LFS files)...")
local_dir = snapshot_download(
    repo_id="gigswar/cv_files",
    repo_type="dataset",
    local_dir="data_repo",
    # NOTE(review): local_dir_use_symlinks is deprecated in recent
    # huggingface_hub releases — confirm the pinned version still accepts it.
    local_dir_use_symlinks="auto",
    # NOTE(review): force_download=True re-fetches every binary on each
    # startup; consider removing it if startup time becomes a problem.
    force_download=True,  # always fetch binaries
    allow_patterns=["*"],  # include all files
    ignore_patterns=[]  # don't skip anything
)
print(f"✔️ Dataset ready at: {local_dir}")
# -----------------------------
# 2️⃣ Collect all PDF/DOCX/CSV
# -----------------------------
# Walk the downloaded snapshot and keep every supported CV file.
cv_paths = [
    os.path.join(dirpath, filename)
    for dirpath, _, filenames in os.walk(local_dir)
    for filename in filenames
    if filename.lower().endswith((".pdf", ".docx", ".csv"))
]
print(f"🔍 Found {len(cv_paths)} CV files (PDF/DOCX/CSV).")
if not cv_paths:
    raise FileNotFoundError("❌ No valid CV files found in dataset.")
# 3️⃣ Initialize ChromaDB (store inside repo)
# -----------------------------
chroma_path = os.path.join(local_dir, "chroma_db")  # DB inside dataset repo
os.makedirs(chroma_path, exist_ok=True)
# Fix: chromadb.Client(Settings(persist_directory=...)) creates an *in-memory*
# client in chromadb 0.4+ unless is_persistent=True is also set, which would
# defeat the "index only once" check in load_cvs(). PersistentClient always
# writes the collection to disk at the given path.
client = chromadb.PersistentClient(path=chroma_path)
# SentenceTransformer wrapper (Chroma v0.4+ compatible)
class SBERTEmbeddingFunction(EmbeddingFunction):
    """Adapts a SentenceTransformer model to Chroma's EmbeddingFunction API."""

    def __init__(self, model):
        # model: any object exposing encode(list[str]) -> array-like with .tolist().
        self.model = model

    def __call__(self, input):
        # Fix: chromadb >= 0.4.16 validates that __call__'s parameter is
        # literally named "input"; older versions invoke it positionally,
        # so the rename is backward-compatible.
        return self.model.encode(input).tolist()

    def name(self):
        # Identifier Chroma stores alongside the collection configuration.
        return "sbert-embedder"
# Load the sentence-embedding model and bind it to the Chroma collection.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
embedder = SBERTEmbeddingFunction(sbert_model)
collection = client.get_or_create_collection(
    name="cv_collection",
    embedding_function=embedder
)
# -----------------------------
# 4️⃣ Text extraction functions
# -----------------------------
def extract_pdf(path):
    """Extract plain text from every page of a PDF.

    Args:
        path: Filesystem path to the PDF file.
    Returns:
        All page texts joined with newlines.
    Raises:
        RuntimeError: If PyMuPDF cannot open or read the file.
    """
    try:
        with fitz.open(path) as doc:
            return "\n".join(page.get_text("text") or "" for page in doc)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"PDF error: {e}") from e
def extract_docx(path):
    """Extract the paragraph text of a DOCX document.

    Args:
        path: Filesystem path to the DOCX file.
    Returns:
        Paragraph texts joined with newlines.
    Raises:
        RuntimeError: If python-docx cannot open or parse the file.
    """
    try:
        doc = Document(path)
        return "\n".join(para.text for para in doc.paragraphs)
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"DOCX error: {e}") from e
def extract_csv(path):
    """Read a CSV (or any text file) as one raw string.

    Undecodable bytes are silently dropped (errors="ignore"), so partially
    corrupt files still yield their readable content.

    Args:
        path: Filesystem path to the file.
    Returns:
        The file's decoded contents.
    Raises:
        RuntimeError: If the file cannot be opened or read at all.
    """
    try:
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            return f.read()
    except Exception as e:
        # Chain the original exception so the root cause stays visible.
        raise RuntimeError(f"CSV error: {e}") from e
def extract_text(path):
    """Dispatch to the extractor matching the file's extension.

    PDF and DOCX get dedicated parsers; anything else is read as raw text
    (the CSV path).
    """
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return extract_pdf(path)
    if lowered.endswith(".docx"):
        return extract_docx(path)
    return extract_csv(path)
# -----------------------------
# 5️⃣ Index CVs into ChromaDB (only if needed)
# -----------------------------
def load_cvs(paths):
    """Index every CV file into the Chroma collection (idempotent).

    Skips entirely when the collection already holds documents, so the
    expensive extraction/embedding pass runs only on the first startup.
    Files that yield no text or raise during extraction are counted as
    skipped rather than aborting the run.
    """
    existing = collection.count()
    if existing > 0:
        print(f"ℹ️ Skipping indexing (ChromaDB already has {existing} CVs).")
        return

    print("➡️ Indexing CVs into ChromaDB (this will run only once)...")
    indexed = 0
    skipped = 0
    for processed, cv_path in enumerate(paths, start=1):
        try:
            body = extract_text(cv_path).strip()
            if not body:
                skipped += 1
                continue
            # Use the repo-relative path as a stable, unique document id.
            doc_id = os.path.relpath(cv_path, local_dir)
            collection.add(
                ids=[doc_id],
                documents=[body],
                metadatas=[{"name": os.path.basename(cv_path), "path": cv_path}],
            )
            indexed += 1
        except Exception as e:
            print(f"⚠️ Skipped {cv_path}: {e}")
            skipped += 1
        if processed % 100 == 0:
            print(f"Progress: Indexed {indexed}/{processed} processed...")
    print(f"✅ Finished: Indexed {indexed}/{len(paths)} CVs, skipped {skipped} (corrupt/encrypted).")
# Build the index at startup; no-op when the collection is already populated.
load_cvs(cv_paths)
# -----------------------------
# 6️⃣ Matching and Stats
# -----------------------------
def find_matching(jd, top_n):
    """Return the CVs semantically closest to a job description.

    Args:
        jd: Job-description text from the UI textbox.
        top_n: Number of matches requested (Gradio sliders may pass a float).
    Returns:
        Tuple of (list of CV file paths for the Files output,
        human-readable status string for the status textbox).
    """
    jd = jd.strip()
    if not jd:
        return [], "⚠️ Please enter a job description."
    # Fix: gr.Slider can deliver a float (e.g. 5.0); Chroma's n_results
    # must be an integer.
    res = collection.query(query_texts=[jd], n_results=int(top_n))
    md, ds = res["metadatas"][0], res["distances"][0]
    if not md:
        return [], "❌ No matches found."
    files, scores = [], []
    for meta, dist in zip(md, ds):
        # Skip results whose source file disappeared from disk (stale index).
        if os.path.exists(meta["path"]):
            sim = 1 / (1 + dist)  # map distance [0, inf) onto similarity (0, 1]
            files.append(meta["path"])
            scores.append(f"{meta['name']}: {sim:.3f}")
    # Fix: if every hit was stale, say so instead of emitting an empty
    # "✅ Matches:" message with no files.
    if not files:
        return [], "❌ No matches found."
    return files, "✅ Matches:\n" + "\n".join(scores)
def show_stats():
    """Report how many CVs are indexed and where the DB directory lives."""
    total = collection.count()
    return f"📊 Indexed {total} CV(s) (stored in {chroma_path})"
# -----------------------------
# 7️⃣ Gradio Web App
# -----------------------------
# Build the Gradio UI: a JD textbox, a top-N slider, and two actions
# (semantic search and index stats).
with gr.Blocks(title="JD→CV Semantic Matcher (Persistent DB)") as app:
    gr.Markdown("# 🎯 JD→CV Semantic Matcher\nHandles all CVs (PDF, DOCX, CSV, Git LFS) with persistent DB")
    jd = gr.Textbox(lines=8, placeholder="Paste your job description...")
    top_n = gr.Slider(1, 20, value=5, label="Top N CVs")
    search_btn = gr.Button("🔍 Search")
    stats_btn = gr.Button("📊 Stats")
    files_out = gr.Files()  # downloadable matched CV files
    status_out = gr.Textbox(lines=6, interactive=False)  # scores / status messages
    search_btn.click(find_matching, [jd, top_n], [files_out, status_out])
    stats_btn.click(show_stats, outputs=status_out)
# NOTE(review): credentials are hard-coded in source — move them to
# environment variables / Space secrets before publishing this repo.
app.launch(
    auth=("gigaswar", "gigaswarai"),  # will prompt user before app loads
    server_name="0.0.0.0",  # avoid localhost
    server_port=7860,  # default port for Spaces
    share=True,  # create a public link (required)
    ssr_mode=False  # fix Gradio locale error
)