Spaces:

Zeqhx
/

Automated-CV-Parser

Running

App Files Files Community

Automated-CV-Parser / lib /extract.py

Zeqhx

Deploy CV parser dashboard with dataset 2 model

c59578d verified 1 day ago

raw

history blame contribute delete

2.91 kB

	"""Extract plain text from uploaded CV files (PDF / DOCX / TXT).

	Extraction libs are imported lazily so the app still loads if one is missing;
	the caller gets a clear error string instead of a crash.
	"""
	from __future__ import annotations

	import os
	import re


	def _clean(text: str) -> str:
	"""Light normalisation mirroring the project's preprocessing."""
	text = text.replace("\x00", " ")
	text = re.sub(r"\(cid:\d+\)", " ", text) # unmapped PDF glyphs (icons/ligatures)
	text = re.sub(r"[ \t]+", " ", text)
	text = re.sub(r"\n{3,}", "\n\n", text)
	return "\n".join(line.strip() for line in text.splitlines()).strip()


	def _space_ratio(text: str) -> float:
	"""Fraction of characters that are spaces. Normal prose ~0.12-0.18;
	PDFs with glued words ('UniversityofMalaya') drop near ~0.0."""
	t = text.strip()
	return (t.count(" ") / len(t)) if t else 0.0


	def _from_pdf(file) -> str:
	    """Extract the text of every page of a PDF, joined with newlines.

	    pdfplumber is imported lazily so a missing install surfaces as a
	    ``ModuleNotFoundError`` at call time rather than at module import.
	    """
	    import pdfplumber

	    def page_text(page_list, **options):
	        # Pages with no extractable text contribute an empty string.
	        chunks = []
	        for page in page_list:
	            chunks.append(page.extract_text(**options) or "")
	        return "\n".join(chunks)

	    with pdfplumber.open(file) as pdf:
	        all_pages = pdf.pages
	        result = page_text(all_pages)
	        # Some PDFs encode inter-word spaces as gaps smaller than pdfplumber's
	        # default x_tolerance (3), so words come out glued together. A very low
	        # space ratio signals that; retry with a tighter tolerance and keep the
	        # retry only when it genuinely yields more spaces.
	        baseline = _space_ratio(result)
	        if baseline < 0.08:
	            retry = page_text(all_pages, x_tolerance=1)
	            if _space_ratio(retry) > baseline:
	                result = retry
	    return result


	def _from_docx(file) -> str:
	    """Return the text of a DOCX file, one paragraph per line.

	    python-docx is imported lazily so a missing install surfaces as a
	    ``ModuleNotFoundError`` at call time rather than at module import.
	    """
	    import docx

	    # NOTE(review): only `paragraphs` is read here; text placed inside tables
	    # is presumably not included - confirm against the python-docx docs.
	    paragraphs = docx.Document(file).paragraphs
	    lines = [para.text for para in paragraphs]
	    return "\n".join(lines)


	def _from_txt(file) -> str:
	raw = file.read()
	if isinstance(raw, bytes):
	return raw.decode("utf-8", errors="ignore")
	return raw


	def extract_text(file, filename: str \| None = None):
	"""Return (text, error). Exactly one is non-empty.

	`file` is a file-like object (e.g. a Streamlit UploadedFile).
	"""
	name = filename or getattr(file, "name", "") or ""
	ext = os.path.splitext(name)[1].lower()
	try:
	if ext == ".pdf":
	text = _from_pdf(file)
	elif ext == ".docx":
	text = _from_docx(file)
	elif ext == ".txt":
	text = _from_txt(file)
	else:
	return "", f"Unsupported file type: {ext or '(none)'}"
	except ModuleNotFoundError as e:
	return "", (f"Missing library for {ext} files ({e.name}). "
	f"Install dashboard/requirements.txt.")
	except Exception as e: # noqa: BLE001 - surface any parse error to the UI
	return "", f"Could not read {name}: {e}"

	text = _clean(text)
	if not text:
	return "", f"No extractable text in {name} (scanned/image PDF?)."
	return text, ""