Spaces:

jayansh21
/

skillsync-backend

Sleeping

skillsync-backend / app /services /jd_parser.py

GitHub Actions

sync: github commit e4109213b5cedf256d6e30f65518976b7d530541 to HF Space

19dc325 21 days ago

2.81 kB

	import requests
	from bs4 import BeautifulSoup
	from PIL import Image
	import io
	import fitz # PyMuPDF
	# OR standard file handling if using simple libraries

	class JDParser:
	    """Extract plain job-description text from raw text, a URL, or PDF bytes.

	    All extractors are static methods; the class holds no instance state.
	    """

	    # Browser-like User-Agent: some sites answer 403 to the default
	    # python-requests agent.
	    _USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	    # Seconds to wait for the remote server before giving up.
	    _URL_TIMEOUT_SECONDS = 10
	    # Upper bound on the number of characters returned for a fetched page.
	    _MAX_URL_TEXT_CHARS = 10000
	    # Tags whose content is page chrome or code rather than description text.
	    _NON_CONTENT_TAGS = ("script", "style", "nav", "footer", "header")
	    # Two consecutive spaces: get_text() output commonly separates inline
	    # phrases this way, while single spaces separate ordinary words.
	    _PHRASE_GAP = "  "

	    @staticmethod
	    def extract_from_text(text: str) -> str:
	        """Return ``text`` with leading and trailing whitespace removed."""
	        return text.strip()

	    @staticmethod
	    def _normalize_text(text: str) -> str:
	        """Collapse extracted page text to one non-empty phrase per line.

	        Each line is stripped, split wherever two consecutive spaces occur,
	        and empty fragments are dropped. Words separated by a single space
	        stay together on the same line.
	        """
	        stripped_lines = (line.strip() for line in text.splitlines())
	        phrases = (
	            phrase.strip()
	            for line in stripped_lines
	            for phrase in line.split(JDParser._PHRASE_GAP)
	        )
	        return "\n".join(phrase for phrase in phrases if phrase)

	    @staticmethod
	    def extract_from_url(url: str) -> str:
	        """Fetch ``url`` and return its visible text, truncated.

	        Script, style, nav, footer and header elements are removed before
	        the text is extracted and normalized. At most
	        ``_MAX_URL_TEXT_CHARS`` characters are returned.

	        Raises:
	            Exception: on any failure (network error, HTTP error status,
	                parse error). The original exception is attached as
	                ``__cause__``.
	        """
	        # NOTE(review): if `url` is user-supplied, this issues a server-side
	        # request to an arbitrary host -- confirm callers validate it.
	        try:
	            response = requests.get(
	                url,
	                headers={'User-Agent': JDParser._USER_AGENT},
	                timeout=JDParser._URL_TIMEOUT_SECONDS,
	            )
	            response.raise_for_status()

	            soup = BeautifulSoup(response.content, 'html.parser')
	            for tag in soup(list(JDParser._NON_CONTENT_TAGS)):
	                tag.decompose()

	            text = JDParser._normalize_text(soup.get_text())
	            return text[:JDParser._MAX_URL_TEXT_CHARS]
	        except Exception as e:
	            raise Exception(f"Failed to fetch content from URL: {e}") from e

	    @staticmethod
	    def extract_from_pdf(file_bytes: bytes) -> str:
	        """Return the concatenated text of every page of an in-memory PDF.

	        PyMuPDF (``fitz``) is tried first; if it cannot be imported,
	        ``pdfminer`` is used instead.

	        Raises:
	            Exception: if neither backend is importable or extraction fails.
	                The original exception is attached as ``__cause__``.
	        """
	        try:
	            try:
	                import fitz
	            except ImportError:
	                # Fallback backend. Failures here are wrapped by the outer
	                # handler too, so both backends report errors the same way.
	                from pdfminer.high_level import extract_text
	                return extract_text(io.BytesIO(file_bytes))

	            doc = fitz.open(stream=file_bytes, filetype="pdf")
	            try:
	                return "".join(page.get_text() for page in doc)
	            finally:
	                # Release the document even if a page fails to render.
	                doc.close()
	        except Exception as e:
	            raise Exception(f"Failed to extract text from PDF: {e}") from e

	    @staticmethod
	    def extract_from_image(file_bytes: bytes) -> str:
	        """Always raise: image/OCR input was deliberately removed.

	        Raises:
	            ValueError: unconditionally; ``file_bytes`` is not inspected.
	        """
	        raise ValueError("Image inputs and OCR are no longer supported. Please provide text or PDF.")