"""Pipeline: take tech-stack input -> normalize tokens -> format -> embed -> query FAISS -> output JSON
Usage:
python backend/src/pipeline_tech_match.py --tech "Python, AWS, Docker"
python backend/src/pipeline_tech_match.py --tech-file path\to\tech.txt
Defaults: looks for FAISS files at ../Vector_db/faiss_index.faiss and ../Vector_db/faiss_metadata.json
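Programmatic use (equivalent to the CLI; assumes you import from backend/src):
    from pipeline_tech_match import pipeline_match_from_tech
    result = pipeline_match_from_tech("Python, AWS, Docker", top_k=3)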
"""
from __future__ import annotations

import argparse
import json
import re
from pathlib import Path
from typing import Any, Dict, List

import numpy as np

try:
    from sentence_transformers import SentenceTransformer
except Exception:  # optional heavy dependency; checked at call time
    SentenceTransformer = None  # type: ignore

try:
    import faiss
except Exception:  # optional dependency; checked at call time
    faiss = None  # type: ignore

def _token_clean(tok: str) -> str:
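    """Trim whitespace and strip surrounding punctuation, keeping characters
    common in tech names (e.g. C#, C++, Node.js)."""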
t = tok.strip()
t = re.sub(r"^[^A-Za-z0-9#+./-]+|[^A-Za-z0-9#+./-]+$", "", t)
return t
def tokens_from_text(text: str) -> List[str]:
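    """Split free-form tech text on commas, slashes, semicolons, pipes, or
    bullets, then dedupe case-insensitively while preserving input order."""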
if not text:
return []
parts = re.split(r"[,/;|\u2022]+", text)
seen = set()
out = []
for p in parts:
tok = _token_clean(p)
if not tok:
continue
key = tok.lower()
if key in seen:
continue
seen.add(key)
out.append(tok)
return out
def pipeline_match_from_tech(
tech_text: str,
index_path: str = "../Vector_db/faiss_index.faiss",
metadata_path: str = "../Vector_db/faiss_metadata.json",
model_name: str = "all-MiniLM-L6-v2",
top_k: int = 5,
) -> Dict[str, Any]:
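    """Embed the parsed tech stack and return the top_k nearest entries from
    the FAISS index as a dict of {"skills", "skills_text", "matches"}."""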
if SentenceTransformer is None:
raise ImportError("sentence-transformers is required. Install it into your environment (pip install sentence-transformers).")
if faiss is None:
raise ImportError("faiss (faiss-cpu) is required. Install it into your environment (pip install faiss-cpu).")
# parse tokens and dedupe
tokens = tokens_from_text(tech_text)
unique_tokens = sorted(set(tokens), key=lambda s: s.lower())
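    # Sorting makes the embedded text deterministic regardless of input order.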
    skills_text = ("Skills required for this job: " + ", ".join(unique_tokens)) if unique_tokens else ""
    if not skills_text:
        # Nothing to embed; return early without loading the model.
        return {"skills": unique_tokens, "skills_text": skills_text, "matches": []}
    model = SentenceTransformer(model_name)
    emb = model.encode([skills_text], convert_to_numpy=True)
    vec = np.asarray(emb, dtype=np.float32)
    try:
        # L2-normalize so inner-product search behaves like cosine similarity
        # (matches indexes built on normalized vectors).
        faiss.normalize_L2(vec)
    except Exception:
        pass
    def _resolve(path: Path, label: str) -> Path:
        # Try the given path first, then a few likely locations relative to
        # backend/, this file's directory, and the current working directory.
        if path.exists():
            return path
        base = Path(__file__).resolve().parent.parent  # backend/
        candidates = [
            base / "Vector_db" / path.name,
            base / path,
            Path(__file__).resolve().parent / path,
            Path.cwd() / "backend" / "Vector_db" / path.name,
            Path.cwd() / "backend" / path,
            Path.cwd() / "Vector_db" / path.name,
        ]
        for c in candidates:
            if c.exists():
                return c
        tried = [str(path)] + [str(c) for c in candidates]
        raise FileNotFoundError(f"FAISS {label} not found at {path}. Tried: {tried}")

    idx_path = _resolve(Path(index_path), "index")
    meta_path = _resolve(Path(metadata_path), "metadata")
index = faiss.read_index(str(idx_path))
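    # search returns (scores, ids) arrays of shape (1, top_k); FAISS pads with
    # id -1 when the index holds fewer than top_k vectors.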
D, I = index.search(vec, top_k)
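    # The metadata file is assumed to map stringified row ids to job titles,
    # matching how the lookup below reads it.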
meta = json.loads(meta_path.read_text(encoding="utf-8"))
matches = []
for score, iid in zip(D[0].tolist(), I[0].tolist()):
if iid < 0:
continue
key = str(iid)
title = meta.get(key, None)
matches.append({"id": iid, "title": title, "score": float(score)})
return {"skills": unique_tokens, "skills_text": skills_text, "matches": matches}
def main(argv=None):
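    """CLI entry point: parse arguments, run the pipeline, emit JSON."""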
p = argparse.ArgumentParser(description="Tech-stack -> embed -> FAISS matching pipeline")
group = p.add_mutually_exclusive_group(required=True)
group.add_argument("--tech", help="Tech stack string, e.g. 'Python, AWS, Docker'")
group.add_argument("--tech-file", help="Path to text file containing tech stack string")
p.add_argument("--index", help="Path to FAISS index file", default="../Vector_db/faiss_index.faiss")
p.add_argument("--metadata", help="Path to FAISS metadata json", default="../Vector_db/faiss_metadata.json")
p.add_argument("--model", help="SentenceTransformer model name", default="all-MiniLM-L6-v2")
p.add_argument("--topk", type=int, help="Number of top matches to return", default=5)
p.add_argument("--out", help="Path to save JSON output (if not provided prints to stdout)")
args = p.parse_args(argv)
    if args.tech is not None:
        tech_text = args.tech
    else:
        tech_text = Path(args.tech_file).read_text(encoding="utf-8")
out = pipeline_match_from_tech(tech_text, index_path=args.index, metadata_path=args.metadata, model_name=args.model, top_k=args.topk)
j = json.dumps(out, ensure_ascii=False, indent=2)
if args.out:
Path(args.out).write_text(j, encoding="utf-8")
else:
print(j)
if __name__ == "__main__":
main()