Spaces:
Paused
Paused
Commit ·
947d727
1
Parent(s): 6d286f1
updated
Browse files
backend/services/resume_parser.py
CHANGED
|
@@ -1,10 +1,11 @@
|
|
| 1 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
| 2 |
import zipfile, re, os
|
|
|
|
| 3 |
|
| 4 |
# ===============================
|
| 5 |
# Load Model & Tokenizer
|
| 6 |
# ===============================
|
| 7 |
-
MODEL_NAME = "sravya-abburi/ResumeParserBERT" #
|
| 8 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 9 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
| 10 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
|
@@ -16,16 +17,18 @@ def extract_text(file_path: str) -> str:
|
|
| 16 |
"""Extract text from PDF or DOCX without external dependencies."""
|
| 17 |
file_path_lower = file_path.lower()
|
| 18 |
|
| 19 |
-
# PDF reading using PyMuPDF (fitz)
|
| 20 |
if file_path_lower.endswith(".pdf"):
|
| 21 |
-
import fitz # PyMuPDF
|
| 22 |
text = ""
|
| 23 |
-
with
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
| 26 |
return text
|
| 27 |
|
| 28 |
-
# DOCX reading by extracting XML content
|
| 29 |
elif file_path_lower.endswith(".docx"):
|
| 30 |
with zipfile.ZipFile(file_path) as zf:
|
| 31 |
with zf.open("word/document.xml") as docx_xml:
|
|
|
|
| 1 |
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline
|
| 2 |
import zipfile, re, os
|
| 3 |
+
from PyPDF2 import PdfReader # Lightweight & already in Spaces
|
| 4 |
|
| 5 |
# ===============================
|
| 6 |
# Load Model & Tokenizer
|
| 7 |
# ===============================
|
| 8 |
+
MODEL_NAME = "sravya-abburi/ResumeParserBERT" # Swap to Kiet model if needed
|
| 9 |
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
| 10 |
model = AutoModelForTokenClassification.from_pretrained(MODEL_NAME)
|
| 11 |
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple")
|
|
|
|
| 17 |
"""Extract text from a PDF (via PyPDF2) or a DOCX (via the standard-library zipfile module)."""
|
| 18 |
file_path_lower = file_path.lower()
|
| 19 |
|
| 20 |
+
# ✅ PDF reading using PyPDF2 (no fitz, no installs needed)
|
| 21 |
if file_path_lower.endswith(".pdf"):
|
|
|
|
| 22 |
text = ""
|
| 23 |
+
with open(file_path, "rb") as f:
|
| 24 |
+
reader = PdfReader(f)
|
| 25 |
+
for page in reader.pages:
|
| 26 |
+
page_text = page.extract_text()
|
| 27 |
+
if page_text:
|
| 28 |
+
text += page_text + "\n"
|
| 29 |
return text
|
| 30 |
|
| 31 |
+
# ✅ DOCX reading by extracting XML content
|
| 32 |
elif file_path_lower.endswith(".docx"):
|
| 33 |
with zipfile.ZipFile(file_path) as zf:
|
| 34 |
with zf.open("word/document.xml") as docx_xml:
|