"""Pipeline: take tech-stack input -> normalize tokens -> format -> embed -> query FAISS -> output JSON
Usage:
python backend/src/pipeline_tech_match.py --tech "Python, AWS, Docker"
python backend/src/pipeline_tech_match.py --tech-file path\to\tech.txt
Defaults: looks for FAISS files at ../Vector_db/faiss_index.faiss and ../Vector_db/faiss_metadata.json
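Programmatic use (equivalent to the CLI; assumes you import from backend/src):
    from pipeline_tech_match import pipeline_match_from_tech
    result = pipeline_match_from_tech("Python, AWS, Docker", top_k=3)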
"""
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List

import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except Exception:  # optional heavy dependency; checked at call time
    SentenceTransformer = None  # type: ignore

try:
    import faiss
except Exception:  # optional dependency; checked at call time
    faiss = None  # type: ignore

def _token_clean(tok: str) -> str:
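    """Trim whitespace and strip surrounding punctuation, keeping characters
    common in tech names (e.g. C#, C++, Node.js)."""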
t = tok.strip()
t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", "", t)
return t
def tokens_from_text(text: str) -> List[str]:
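    """Split free-form tech text on commas, slashes, semicolons, pipes, or
    bullets, then dedupe case-insensitively while preserving input order."""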
if not text:
return []
parts = re.split(r"[,/;|\u2022]+", text)
seen = set()
out = []
for p in parts:
tok = _token_clean(p)
if not tok:
continue
key = tok.lower()
if key in seen:
continue
seen.add(key)
out.append(tok)
return out
def pipeline_match_from_tech(
tech_text: str,
index_path: str = "../Vector_db/faiss_index.faiss",
metadata_path: str = "../Vector_db/faiss_metadata.json",
model_name: str = "all-MiniLM-L6-v2",
top_k: int = 5,
) -> Dict[str, Any]:
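    """Embed the parsed tech stack and return the top_k nearest entries from
    the FAISS index as a dict of {"skills", "skills_text", "matches"}."""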
if SentenceTransformer is None:
raise ImportError("sentence-transformers is required. Install it into your environment (pip install sentence-transformers).")
if faiss is None:
raise ImportError("faiss (faiss-cpu) is required. Install it into your environment (pip install faiss-cpu).")
# parse tokens and dedupe
tokens = tokens_from_text(tech_text)
unique_tokens = sorted(set(tokens), key=lambda s: s.lower())
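    # Sorting makes the embedded text deterministic regardless of input order.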
    skills_text = ("Skills required for this job: " + ", ".join(unique_tokens)) if unique_tokens else ""
    if not skills_text:
        # Nothing to embed; return early without loading the model.
        return {"skills": unique_tokens, "skills_text": skills_text, "matches": []}
    model = SentenceTransformer(model_name)
    emb = model.encode([skills_text], convert_to_numpy=True)
    vec = np.asarray(emb, dtype=np.float32)
    try:
        # L2-normalize so inner-product search behaves like cosine similarity
        # (matches indexes built on normalized vectors).
        faiss.normalize_L2(vec)
    except Exception:
        pass
    def _resolve(path: Path, label: str) -> Path:
        # Try the given path first, then a few likely locations relative to
        # backend/, this file's directory, and the current working directory.
        if path.exists():
            return path
        base = Path(__file__).resolve().parent.parent  # backend/
        candidates = [
            base / "Vector_db" / path.name,
            base / path,
            Path(__file__).resolve().parent / path,
            Path.cwd() / "backend" / "Vector_db" / path.name,
            Path.cwd() / "backend" / path,
            Path.cwd() / "Vector_db" / path.name,
        ]
        for c in candidates:
            if c.exists():
                return c
        tried = [str(path)] + [str(c) for c in candidates]
        raise FileNotFoundError(f"FAISS {label} not found at {path}. Tried: {tried}")

    idx_path = _resolve(Path(index_path), "index")
    meta_path = _resolve(Path(metadata_path), "metadata")
index = faiss.read_index(str(idx_path))
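    # search returns (scores, ids) arrays of shape (1, top_k); FAISS pads with
    # id -1 when the index holds fewer than top_k vectors.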
D, I = index.search(vec, top_k)
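    # The metadata file is assumed to map stringified row ids to job titles,
    # matching how the lookup below reads it.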
meta = json.loads(meta_path.read_text(encoding="utf-8"))
matches = []
for score, iid in zip(D[0].tolist(), I[0].tolist()):
if iid < 0:
continue
key = str(iid)
title = meta.get(key, None)
matches.append({"id": iid, "title": title, "score": float(score)})
return {"skills": unique_tokens, "skills_text": skills_text, "matches": matches}
def main(argv=None):
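    """CLI entry point: parse arguments, run the pipeline, emit JSON."""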
p = argparse.ArgumentParser(description="Tech-stack -> embed -> FAISS matching pipeline")
group = p.add_mutually_exclusive_group(required=True)
group.add_argument("--tech", help="Tech stack string, e.g. 'Python, AWS, Docker'")
group.add_argument("--tech-file", help="Path to text file containing tech stack string")
p.add_argument("--index", help="Path to FAISS index file", default="../Vector_db/faiss_index.faiss")
p.add_argument("--metadata", help="Path to FAISS metadata json", default="../Vector_db/faiss_metadata.json")
p.add_argument("--model", help="SentenceTransformer model name", default="all-MiniLM-L6-v2")
p.add_argument("--topk", type=int, help="Number of top matches to return", default=5)
p.add_argument("--out", help="Path to save JSON output (if not provided prints to stdout)")
args = p.parse_args(argv)
    if args.tech is not None:
        tech_text = args.tech
    else:
        tech_text = Path(args.tech_file).read_text(encoding="utf-8")
out = pipeline_match_from_tech(tech_text, index_path=args.index, metadata_path=args.metadata, model_name=args.model, top_k=args.topk)
j = json.dumps(out, ensure_ascii=False, indent=2)
if args.out:
Path(args.out).write_text(j, encoding="utf-8")
else:
print(j)
if __name__ == "__main__":
main()