| import os |
| import fitz |
| import chromadb |
| import gradio as gr |
| from docx import Document |
| from huggingface_hub import snapshot_download |
| from sentence_transformers import SentenceTransformer |
| from chromadb.config import Settings |
| from chromadb.utils.embedding_functions import EmbeddingFunction |
|
|
# Pull the CV dataset from the Hugging Face Hub, including all Git-LFS payloads.
print("➡️ Downloading dataset snapshot (fetching all LFS files)...")
local_dir = snapshot_download(
    repo_id="gigswar/cv_files",
    repo_type="dataset",
    local_dir="data_repo",
    # NOTE(review): `local_dir_use_symlinks` is deprecated in recent
    # huggingface_hub releases — confirm the installed version still accepts it.
    local_dir_use_symlinks="auto",
    # NOTE(review): force_download=True re-fetches the whole dataset on every
    # start; consider dropping it if startup bandwidth/time matters.
    force_download=True,
    allow_patterns=["*"],
    ignore_patterns=[]
)
print(f"✔️ Dataset ready at: {local_dir}")
|
|
| |
| |
| |
# Collect every CV file (PDF/DOCX/CSV) anywhere under the downloaded snapshot.
cv_paths = [
    os.path.join(root, fname)
    for root, _, files in os.walk(local_dir)
    for fname in files
    if fname.lower().endswith((".pdf", ".docx", ".csv"))
]

print(f"🔍 Found {len(cv_paths)} CV files (PDF/DOCX/CSV).")
if not cv_paths:
    raise FileNotFoundError("❌ No valid CV files found in dataset.")
|
|
| |
| |
# Create (or reuse) an on-disk ChromaDB store next to the dataset.
chroma_path = os.path.join(local_dir, "chroma_db")
os.makedirs(chroma_path, exist_ok=True)
# BUG FIX: on chromadb >= 0.4, `chromadb.Client(Settings(persist_directory=...))`
# without `is_persistent=True` yields an *ephemeral* in-memory client, so the
# index was silently rebuilt every restart despite the "persistent DB" claim.
# PersistentClient actually writes to `chroma_path`.
client = chromadb.PersistentClient(path=chroma_path)
|
|
| |
class SBERTEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function backed by a SentenceTransformer model.

    Wraps ``model.encode`` so Chroma can embed documents and queries with
    the same SBERT model.
    """

    def __init__(self, model):
        # model: a SentenceTransformer (anything with an .encode(list[str]) method).
        self.model = model

    def __call__(self, input):
        # BUG FIX: chromadb >= 0.4.16 validates that __call__'s parameter is
        # literally named `input` (shadowing the builtin is required by the
        # EmbeddingFunction protocol); the previous name `texts` fails that check.
        return self.model.encode(input).tolist()

    def name(self):
        # Identifier Chroma records for this embedding function.
        return "sbert-embedder"
|
|
# Load the sentence-embedding model (downloads weights on first run) and wrap
# it in the Chroma-compatible embedding function defined above.
sbert_model = SentenceTransformer("all-MiniLM-L6-v2")
embedder = SBERTEmbeddingFunction(sbert_model)
|
|
# Reuse the existing collection if present, otherwise create it; all adds and
# queries go through the SBERT embedder.
collection = client.get_or_create_collection(
    name="cv_collection",
    embedding_function=embedder
)
|
|
| |
| |
| |
def extract_pdf(path):
    """Return all text from a PDF via PyMuPDF, pages joined by newlines.

    Raises RuntimeError (wrapping the original error) on any failure.
    """
    try:
        with fitz.open(path) as doc:
            pages = [page.get_text("text") or "" for page in doc]
        return "\n".join(pages)
    except Exception as e:
        raise RuntimeError(f"PDF error: {e}")
|
|
def extract_docx(path):
    """Return the paragraph text of a .docx file, newline separated.

    Raises RuntimeError (wrapping the original error) on any failure.
    """
    try:
        paragraphs = Document(path).paragraphs
        return "\n".join(p.text for p in paragraphs)
    except Exception as e:
        raise RuntimeError(f"DOCX error: {e}")
|
|
def extract_csv(path):
    """Read a CSV (or any text file) verbatim, dropping undecodable bytes.

    Raises RuntimeError (wrapping the original error) on any failure.
    """
    try:
        handle = open(path, "r", encoding="utf-8", errors="ignore")
        with handle:
            return handle.read()
    except Exception as e:
        raise RuntimeError(f"CSV error: {e}")
|
|
def extract_text(path):
    """Dispatch to the extractor matching the file extension.

    Anything that is neither .pdf nor .docx is treated as plain text/CSV.
    """
    lowered = path.lower()
    if lowered.endswith(".pdf"):
        return extract_pdf(path)
    if lowered.endswith(".docx"):
        return extract_docx(path)
    return extract_csv(path)
|
|
| |
| |
| |
def load_cvs(paths):
    """Index every CV in *paths* into the Chroma collection.

    Idempotent: if the collection already holds documents, indexing is
    skipped entirely. Unreadable or empty files are counted as skipped.
    """
    existing = collection.count()
    if existing > 0:
        print(f"ℹ️ Skipping indexing (ChromaDB already has {existing} CVs).")
        return

    print("➡️ Indexing CVs into ChromaDB (this will run only once)...")
    indexed = 0
    skipped = 0
    for position, cv_path in enumerate(paths, start=1):
        try:
            content = extract_text(cv_path).strip()
            if not content:
                skipped += 1
                continue
            # IDs are repo-relative paths, stable across re-downloads.
            collection.add(
                ids=[os.path.relpath(cv_path, local_dir)],
                documents=[content],
                metadatas=[{"name": os.path.basename(cv_path), "path": cv_path}],
            )
            indexed += 1
        except Exception as e:
            print(f"⚠️ Skipped {cv_path}: {e}")
            skipped += 1
        if position % 100 == 0:
            print(f"Progress: Indexed {indexed}/{position} processed...")
    print(f"✅ Finished: Indexed {indexed}/{len(paths)} CVs, skipped {skipped} (corrupt/encrypted).")


load_cvs(cv_paths)
|
|
| |
| |
| |
def find_matching(jd, top_n):
    """Return the CVs most similar to a job description.

    Args:
        jd: the job-description text from the textbox.
        top_n: number of results requested (Gradio sliders deliver floats).

    Returns:
        (file_paths, status_message) — file paths for the Files widget and a
        human-readable score summary for the status box.
    """
    jd = jd.strip()
    if not jd:
        return [], "⚠️ Please enter a job description."
    # BUG FIX: gr.Slider hands back a float; Chroma's n_results must be an int.
    res = collection.query(query_texts=[jd], n_results=int(top_n))
    md, ds = res["metadatas"][0], res["distances"][0]
    if not md:
        return [], "❌ No matches found."
    files, scores = [], []
    for meta, dist in zip(md, ds):
        # Only surface hits whose file still exists on disk.
        if os.path.exists(meta["path"]):
            sim = 1 / (1 + dist)  # map distance to a (0, 1] similarity score
            files.append(meta["path"])
            scores.append(f"{meta['name']}: {sim:.3f}")
    if not files:
        # BUG FIX: previously returned a "✅ Matches:" header with an empty
        # list when every hit's file was missing from disk.
        return [], "❌ No matches found."
    return files, "✅ Matches:\n" + "\n".join(scores)
|
|
def show_stats():
    """One-line summary of how many CVs the collection currently holds."""
    count = collection.count()
    return f"📊 Indexed {count} CV(s) (stored in {chroma_path})"
|
|
| |
| |
| |
# --- Gradio UI: textbox + slider in, matched files + status text out. ---
with gr.Blocks(title="JD→CV Semantic Matcher (Persistent DB)") as app:
    gr.Markdown("# 🎯 JD→CV Semantic Matcher\nHandles all CVs (PDF, DOCX, CSV, Git LFS) with persistent DB")
    jd = gr.Textbox(lines=8, placeholder="Paste your job description...")
    top_n = gr.Slider(1, 20, value=5, label="Top N CVs")
    search_btn = gr.Button("🔍 Search")
    stats_btn = gr.Button("📊 Stats")
    files_out = gr.Files()  # downloadable matched CV files
    status_out = gr.Textbox(lines=6, interactive=False)

    # Wire buttons: search returns (files, status); stats only updates status.
    search_btn.click(find_matching, [jd, top_n], [files_out, status_out])
    stats_btn.click(show_stats, outputs=status_out)

# NOTE(review): credentials are hardcoded in source — move them to environment
# variables/secrets before sharing this code or deploying publicly.
app.launch(
    auth=("gigaswar", "gigaswarai"),
    server_name="0.0.0.0",  # listen on all interfaces (container/Space friendly)
    server_port=7860,
    share=True,
    ssr_mode=False
)
|
|